示例#1
0
 /**
  * Determines the language of the html document by looking at the root
  * language attribute. If that fails $sample_text is used to try to guess
  * the language
  *
  * @param object $dom  a document object to check the language of
  * @param string $sample_text sample text to try to guess the language from
  * @param string $url url of the web page; as a fallback, look at its
  *     country to figure out the language
  *
  * @return string language tag for guessed language
  */
 static function lang($dom, $sample_text = NULL, $url = NULL)
 {
     // First choice: an explicit lang attribute on an <html> element.
     $htmls = $dom->getElementsByTagName("html");
     foreach ($htmls as $html) {
         $lang = $html->getAttribute('lang');
         if ($lang != NULL) {
             return $lang;
         }
     }
     /* No usable lang attribute was found (baidu, for example, doesn't
        have one but does say its encoding). Look for a meta tag whose
        http-equiv attribute contains "content-type" and map the charset
        it declares to a language. translate() upper-cases the attribute
        value so the comparison is case-insensitive.
      */
     $xpath = new DOMXPath($dom);
     $charset_check = "contains(translate(@http-equiv," .
         "'abcdefghijklmnopqrstuvwxyz'," .
         " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')";
     $metas = $xpath->evaluate("/html/head//meta[{$charset_check}]");
     if ($metas === false) {
         // evaluate() failed; behave as if no meta tags were present
         $metas = array();
     }
     foreach ($metas as $meta) {
         $content = $meta->getAttribute('content');
         // content looks like "text/html; charset=XYZ"; take what follows "="
         $charset_metas = explode("=", $content);
         if (!isset($charset_metas[1])) {
             continue;
         }
         /* keep only the charset token itself: drop any parameters that
            follow it as well as surrounding white-space and quotes
          */
         $charset_parts = explode(";", $charset_metas[1]);
         $charset = strtoupper(trim($charset_parts[0], " \t\n\r\0\x0B\"'"));
         if ($charset == "") {
             continue;
         }
         return guessLangEncoding($charset);
     }
     // Last resort: guess from the sample text and the url.
     return self::calculateLang($sample_text, $url);
 }
示例#2
0
文件: fetcher.php 项目: yakar/yioop
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
  * Summary data is extracted from each non robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     $stored_site_pages = array();
     $summarized_site_pages = array();
     // archive count before this batch; used at the end to number summaries
     $num_items = $this->web_archive->count;
     // $i counts the pages that actually yielded document info
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
             /* we print out errors to std output. We still go ahead and
                   process the page. Maybe it is a cool error page, also
                   this makes sure we don't crawl it again
                */
         }
         // text/robot is my made up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                     /*
                        disallow crawling if robots.txt was any error other
                        than not found
                     */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             // source files with a known extension get a made up mimetype
             if (isset($site[self::FILE_NAME])) {
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                 if ($extension == $this->programming_language_extension['java']) {
                     $type = "text/java";
                 } else {
                     if ($extension == $this->programming_language_extension['py']) {
                         $type = "text/py";
                     } else {
                         $type = $site[self::TYPE];
                     }
                 }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
         /*deals with short URLs and directs them to the original link
           for robots.txt don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway) */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
             /* put the canonicalized final location back on the end of the
                list. (array_push() returns the new element count, so its
                result must not be assigned back to the list.)
              */
             $site[self::LOCATION][] = $tmp_loc;
             // synthesize the document info for a redirect page
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
         } else {
             if (isset($PAGE_PROCESSORS[$type])) {
                 $page_processor = $PAGE_PROCESSORS[$type];
                 if (generalIsA($page_processor, "TextProcessor")) {
                     $text_data = true;
                 } else {
                     $text_data = false;
                 }
             } else {
                 crawlLog("No page processor for mime type: " . $type);
                 crawlLog("Not processing: " . $site[self::URL]);
                 continue;
             }
         }
         if (!$handled) {
             // instantiate the processor, with its plugins if it has any
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             //if not UTF-8 convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                 //if HEBREW WINDOWS-1255 use ISO-8859 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             // for git repository items the processor is given the file name
             // in place of the url; the url is restored right afterwards
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
         } else {
             if (!$handled) {
                 // nothing was downloaded, so there is nothing to summarize
                 $doc_info = false;
             }
         }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 // redirect pages are hashed from their url, not their text
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                     $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were dsallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 // an 'en' result on a non UTF-8 page is re-guessed from the
                 // page's encoding
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 /* discard the stored copy of this page; after the INDEX
                    assignment below its entry holds nothing but INDEX.
                    An explicit empty array is used rather than false so
                    the assignment below does not rely on the automatic
                    false-to-array conversion (deprecated as of PHP 8.1);
                    the resulting entry is the same.
                  */
                 $stored_site_pages[$i] = array();
             }
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
     }
     // end for
     $num_pages = count($stored_site_pages);
     $filter_stored = array_filter($stored_site_pages);
     // only assigned by addPages(); initialized so the read below is defined
     $cache_page_partition = NULL;
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
     } else {
         if ($num_pages > 0) {
             $this->web_archive->addCount(count($filter_stored));
         }
     }
     // number each summary relative to the archive count before this batch
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     // copy any archive offsets over to the corresponding summaries
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }