Esempi in PHP per UrlParser::cleanRedundantLinks

Linguaggio di programmazione: PHP
Classe/tipologia: UrlParser
Metodo/funzione: cleanRedundantLinks
Esempi su hotexamples.com: 1
UrlParser::cleanRedundantLinks in PHP: 1 esempio trovato. Questo è il miglior esempio reale in PHP per UrlParser::cleanRedundantLinks, estratto da progetti open source. Lo puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.
Metodi utilizzati di frequente
Mostra Nascondi
canonicalLink(12)
getHost(11)
getDocumentType(7)
checkRecursiveUrl(6)
getDocumentFilename(5)
getPath(4)
urlMemberSiteArray(4)
getScheme(4)
getWordsLastPathPartUrl(3)
getWordsIfHostUrl(3)
getPathArray(3)
isLocalhostUrl(3)
isPathMemberRegexPaths(3)
simplifyUrl(3)
getHostAndPath(2)
pruneLinks(2)
guessMimeTypeFromFileName(2)
getHostSubdomains(2)
parse(1)
parseUrl(1)
isVideoUrl(1)
getPort(1)
isFollowUrl(1)
getLang(1)
getHostPaths(1)
getCourseDirName(1)
defaultFilter(1)
cleanRedundantLinks(1)
urlParse(1)
Esempio n. 1
Mostra file
File: fetcher.php Progetto: yakar/yioop
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
  * Summary data is extracted from each non robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     $stored_site_pages = array();
     $summarized_site_pages = array();
     $num_items = $this->web_archive->count;
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
             /* we print out errors to std output. We still go ahead and
                   process the page. Maybe it is a cool error page, also
                   this makes sure we don't crawl it again
                */
         }
         // text/robot is my made up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                     /*
                        disallow crawling if robots.txt was any error other
                        that not found
                     */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             if (isset($site[self::FILE_NAME])) {
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                 if ($extension == $this->programming_language_extension['java']) {
                     $type = "text/java";
                 } else {
                     if ($extension == $this->programming_language_extension['py']) {
                         $type = "text/py";
                     } else {
                         $type = $site[self::TYPE];
                     }
                 }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
         /*deals with short URLs and directs them to the original link
           for robots.txt don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway) */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
             $site[self::LOCATION] = array_push($site[self::LOCATION], $tmp_loc);
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
         } else {
             if (isset($PAGE_PROCESSORS[$type])) {
                 $page_processor = $PAGE_PROCESSORS[$type];
                 if (generalIsA($page_processor, "TextProcessor")) {
                     $text_data = true;
                 } else {
                     $text_data = false;
                 }
             } else {
                 crawlLog("No page processor for mime type: " . $type);
                 crawlLog("Not processing: " . $site[self::URL]);
                 continue;
             }
         }
         if (!$handled) {
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             //if not UTF-8 convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                 //if HEBREW WINDOWS-1255 use ISO-8859 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
         } else {
             if (!$handled) {
                 $doc_info = false;
             }
         }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                     $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were dsallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 $stored_site_pages[$i] = false;
             }
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
     }
     // end for
     $num_pages = count($stored_site_pages);
     $filter_stored = array_filter($stored_site_pages);
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
     } else {
         if ($num_pages > 0) {
             $this->web_archive->addCount(count($filter_stored));
         }
     }
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }