/**
 * Determines the language of the html document by looking at the lang
 * attribute of its html element(s). If that fails, the charset declared in
 * a Content-Type meta tag (if any) is handed to guessLangEncoding(); if
 * there is no such declaration either, $sample_text and $url are handed to
 * self::calculateLang() to try to guess the language.
 *
 * @param object $dom a document object to check the language of
 * @param string $sample_text sample text to try guess the language from
 * @param string $url url of web-page as a fallback look at the country
 *     to figure out language
 *
 * @return string language tag for guessed language
 */
static function lang($dom, $sample_text = NULL, $url = NULL)
{
    $htmls = $dom->getElementsByTagName("html");
    foreach ($htmls as $html) {
        $lang = $html->getAttribute('lang');
        // a missing attribute comes back as "", which is == NULL
        if ($lang != NULL) {
            return $lang;
        }
    }
    /* Reaching this point means no html element carried a non-empty lang
       attribute. baidu doesn't have a lang attribute but does say
       encoding, so look for a meta whose http-equiv contains
       content-type (compared case-insensitively via translate()). */
    $xpath = new DOMXPath($dom);
    $charset_check = "contains(translate(@http-equiv," .
        "'abcdefghijklmnopqrstuvwxyz'," .
        " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')";
    $metas = $xpath->evaluate("/html/head//meta[{$charset_check}]");
    foreach ($metas as $meta) {
        $content = $meta->getAttribute('content');
        // e.g. "text/html; charset=gb2312" => the piece after the first "="
        $charset_metas = explode("=", $content);
        if (isset($charset_metas[1])) {
            /* Strip surrounding whitespace, quotes and a trailing ";" so
               that declarations such as 'charset = "gb2312";' yield the
               bare charset name before it is upper-cased. */
            $charset = strtoupper(
                trim($charset_metas[1], " \t\n\r\0\x0B\"';"));
            // the first meta with a "=" in its content decides the result
            return guessLangEncoding($charset);
        }
    }
    return self::calculateLang($sample_text, $url);
}
/**
 * Processes an array of downloaded web pages with the appropriate page
 * processor.
 *
 * Summary data is extracted from each non robots.txt file in the array.
 * Disallowed paths and crawl-delays are extracted from robots.txt files.
 *
 * For each page the method: works out a mime type, either builds a
 * synthetic "location" document for redirects or runs the page processor
 * registered for that type, copies the results into $summarized_site_pages
 * and $stored_site_pages, and finally hands the stored pages to
 * $this->web_archive and records index/offset information in the
 * summaries. Pages with no registered processor, or whose processing
 * yields no data, produce no summary entry.
 *
 * @param array $site_pages a collection of web pages to process
 * @return array summary data extracted from these pages
 */
function processFetchPages($site_pages)
{
    $PAGE_PROCESSORS = $this->page_processors;
    crawlLog("Start process pages... Current Memory:" . memory_get_usage());
    $start_time = microtime();
    $prefix = $this->fetcher_num . "-";
    $stored_site_pages = array();
    $summarized_site_pages = array();
    // archive count before this batch; used as base for summary indexes
    $num_items = $this->web_archive->count;
    // $i only advances for pages that actually produced a $doc_info
    $i = 0;
    foreach ($site_pages as $site) {
        $response_code = $site[self::HTTP_CODE];
        $was_error = false;
        if ($response_code < 200 || $response_code >= 300) {
            crawlLog($site[self::URL] . " response code {$response_code}");
            $host = UrlParser::getHost($site[self::URL]);
            if (!isset($this->hosts_with_errors[$host])) {
                $this->hosts_with_errors[$host] = 0;
            }
            if ($response_code >= 400 || $response_code < 100) {
                // < 100 will capture failures to connect which are returned
                // as strings
                $was_error = true;
                $this->hosts_with_errors[$host]++;
            }
            /* we print out errors to std output. We still go ahead and
               process the page. Maybe it is a cool error page, also this
               makes sure we don't crawl it again */
        }
        // text/robot is my made up mimetype for robots.txt files
        $was_robot_error = false;
        if (isset($site[self::ROBOT_PATHS])) {
            if (!$was_error) {
                $type = "text/robot";
            } else {
                $type = $site[self::TYPE];
                if ($response_code != 404) {
                    /* disallow crawling if robots.txt was any error other
                       than not found */
                    $was_robot_error = true;
                    $site[self::ROBOT_PATHS][] = "/";
                }
            }
        } else {
            if (isset($site[self::FILE_NAME])) {
                // files with a java/py extension get a source-code type
                $extension =
                    UrlParser::getDocumentType($site[self::FILE_NAME]);
                if ($extension ==
                    $this->programming_language_extension['java']) {
                    $type = "text/java";
                } else {
                    if ($extension ==
                        $this->programming_language_extension['py']) {
                        $type = "text/py";
                    } else {
                        $type = $site[self::TYPE];
                    }
                }
            } else {
                $type = $site[self::TYPE];
            }
        }
        $handled = false;
        /* deals with short URLs and directs them to the original link
           for robots.txt don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway) */
        if (isset($site[self::LOCATION]) &&
            count($site[self::LOCATION]) > 0 &&
            strcmp($type, "text/robot") != 0) {
            array_unshift($site[self::LOCATION], $site[self::URL]);
            // canonicalize the last entry of the chain against the URL
            $tmp_loc = array_pop($site[self::LOCATION]);
            $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
            /* Append the canonical target back onto the chain. The result
               of array_push() must not be assigned here: it returns the
               new element count, which would replace the chain with an
               integer. */
            $site[self::LOCATION][] = $tmp_loc;
            // synthetic document describing the redirect
            $doc_info = array();
            $doc_info[self::LINKS][$tmp_loc] =
                "location:" . $site[self::URL];
            $doc_info[self::LOCATION] = true;
            $doc_info[self::DESCRIPTION] =
                $site[self::URL] . " => " . $tmp_loc;
            $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
            $doc_info[self::TITLE] = $site[self::URL];
            $text_data = true;
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            $handled = true;
        } else {
            if (isset($PAGE_PROCESSORS[$type])) {
                $page_processor = $PAGE_PROCESSORS[$type];
                if (generalIsA($page_processor, "TextProcessor")) {
                    $text_data = true;
                } else {
                    $text_data = false;
                }
            } else {
                crawlLog("No page processor for mime type: " . $type);
                crawlLog("Not processing: " . $site[self::URL]);
                continue;
            }
        }
        if (!$handled) {
            // $page_processor is a class name; plugins are passed if any
            if (isset($this->plugin_processors[$page_processor])) {
                $processor = new $page_processor(
                    $this->plugin_processors[$page_processor],
                    $this->max_description_len,
                    $this->summarizer_option);
            } else {
                $processor = new $page_processor(array(),
                    $this->max_description_len,
                    $this->summarizer_option);
            }
        }
        if (isset($site[self::PAGE]) && !$handled) {
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            //if not UTF-8 convert before doing anything else
            if (isset($site[self::ENCODING]) &&
                $site[self::ENCODING] != "UTF-8" &&
                $site[self::ENCODING] != "" &&
                generalIsA($page_processor, "TextProcessor")) {
                if (!@mb_check_encoding($site[self::PAGE],
                    $site[self::ENCODING])) {
                    crawlLog(" MB_CHECK_ENCODING FAILED!!");
                }
                crawlLog(" Converting from encoding " .
                    $site[self::ENCODING] . "...");
                //if HEBREW WINDOWS-1255 use ISO-8859 instead
                if (stristr($site[self::ENCODING], "1255")) {
                    $site[self::ENCODING] = "ISO-8859-8";
                    crawlLog(" using encoding " .
                        $site[self::ENCODING] . "...");
                }
                if (stristr($site[self::ENCODING], "1256")) {
                    $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                    crawlLog(" using Yioop hack encoding ...");
                } else {
                    $site[self::PAGE] = @mb_convert_encoding(
                        $site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                }
            }
            crawlLog(" Using Processor..." . $page_processor);
            // for git repositories the processor is given the file name
            // in place of the url; the url is restored right afterwards
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $tmp_url_store = $site[self::URL];
                $site[self::URL] = $site[self::FILE_NAME];
            }
            $doc_info = $processor->handle($site[self::PAGE],
                $site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $site[self::URL] = $tmp_url_store;
            }
            if (!$doc_info) {
                crawlLog(" Processing Yielded No Data For: " .
                    $site[self::URL]);
            }
            if ($page_processor != "RobotProcessor" &&
                !isset($doc_info[self::JUST_METAS])) {
                $this->pruneLinks($doc_info, CrawlConstants::LINKS,
                    $start_time);
            }
        } else {
            if (!$handled) {
                // no page body to process
                $doc_info = false;
            }
        }
        $not_loc = true;
        if ($doc_info) {
            $site[self::DOC_INFO] = $doc_info;
            if (isset($doc_info[self::LOCATION])) {
                // redirect documents are hashed from their url
                $site[self::HASH] = crawlHash(
                    crawlHash($site[self::URL], true) . "LOCATION", true);
                $not_loc = false;
            }
            $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
            if (!is_dir(CRAWL_DIR . "/cache")) {
                mkdir(CRAWL_DIR . "/cache");
                $htaccess = "Options None\nphp_flag engine off\n";
                file_put_contents(CRAWL_DIR . "/cache/.htaccess",
                    $htaccess);
            }
            if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                $site[self::PAGE] = $doc_info[self::PAGE];
            }
            if ($text_data) {
                if (isset($doc_info[self::PAGE])) {
                    $site[self::PAGE] = $doc_info[self::PAGE];
                } else {
                    $site[self::PAGE] = NULL;
                }
                if ($not_loc) {
                    // text pages are hashed from their description
                    $content = $doc_info[self::DESCRIPTION];
                    $site[self::HASH] =
                        FetchUrl::computePageHash($content);
                }
            } else {
                $site[self::HASH] =
                    FetchUrl::computePageHash($site[self::PAGE]);
            }
            if (isset($doc_info[self::WORD_CLOUD])) {
                $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
            } else {
                $site[self::WORD_CLOUD] = NULL;
            }
            if (isset($doc_info[self::CRAWL_DELAY])) {
                $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
            }
            if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
            }
            if (!isset($site[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array();
            }
            if (isset($doc_info[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array_merge(
                    $site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
            }
            //here's where we enforce NOFOLLOW
            if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
                in_array("NONE", $site[self::ROBOT_METAS])) {
                $site[self::DOC_INFO][self::LINKS] = array();
            }
            if (isset($doc_info[self::AGENT_LIST])) {
                $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
            }
            // NOTE(review): copySiteFields is defined elsewhere; presumably
            // it fills entry $i of both output arrays from $site -- confirm
            $this->copySiteFields($i, $site, $summarized_site_pages,
                $stored_site_pages);
            $summarized_site_pages[$i][self::URL] =
                strip_tags($site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $summarized_site_pages[$i][self::TITLE] =
                    $site[self::FILE_NAME];
            } else {
                // stripping html to be on the safe side
                $summarized_site_pages[$i][self::TITLE] =
                    strip_tags($site[self::DOC_INFO][self::TITLE]);
            }
            if (!isset($site[self::REPOSITORY_TYPE])) {
                if ($was_robot_error) {
                    // runtime text kept exactly as it was (including its
                    // spelling) since it ends up in the summary data
                    $site[self::DOC_INFO][self::DESCRIPTION] =
                        "There was an HTTP error in trying to download " .
                        "this robots.txt file, so all paths to this site " .
                        "were dsallowed by Yioop.\n" .
                        $site[self::DOC_INFO][self::DESCRIPTION];
                }
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
            } else {
                // repository descriptions are copied without strip_tags
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    $site[self::DOC_INFO][self::DESCRIPTION];
            }
            if (isset($site[self::DOC_INFO][self::JUST_METAS]) ||
                isset($site[self::ROBOT_PATHS])) {
                $summarized_site_pages[$i][self::JUST_METAS] = true;
            }
            if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        $site[self::DOC_INFO][self::META_WORDS];
                } else {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        array_merge(
                            $summarized_site_pages[$i][self::META_WORDS],
                            $site[self::DOC_INFO][self::META_WORDS]);
                }
            }
            if (isset($site[self::DOC_INFO][self::LANG])) {
                // an 'en' result on a non UTF-8 page is re-guessed from
                // the page's encoding
                if ($site[self::DOC_INFO][self::LANG] == 'en' &&
                    $site[self::ENCODING] != "UTF-8") {
                    $site[self::DOC_INFO][self::LANG] =
                        guessLangEncoding($site[self::ENCODING]);
                }
                $summarized_site_pages[$i][self::LANG] =
                    $site[self::DOC_INFO][self::LANG];
            }
            if (isset($site[self::DOC_INFO][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    $site[self::DOC_INFO][self::LINKS];
            }
            if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                $summarized_site_pages[$i][self::WORD_CLOUD] =
                    $site[self::DOC_INFO][self::WORD_CLOUD];
            }
            if (isset($site[self::DOC_INFO][self::THUMB])) {
                $summarized_site_pages[$i][self::THUMB] =
                    $site[self::DOC_INFO][self::THUMB];
            }
            if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                $this->processSubdocs($i, $site, $summarized_site_pages,
                    $stored_site_pages);
            }
            if (isset($summarized_site_pages[$i][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    UrlParser::cleanRedundantLinks(
                        $summarized_site_pages[$i][self::LINKS],
                        $summarized_site_pages[$i][self::URL]);
            }
            if (!empty($this->classifiers)) {
                Classifier::labelPage($summarized_site_pages[$i],
                    $this->classifiers, $this->active_classifiers,
                    $this->active_rankers);
            }
            if ($this->page_rule_parser != NULL) {
                $this->page_rule_parser->executeRuleTrees(
                    $summarized_site_pages[$i]);
            }
            $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ?
                $summarized_site_pages[$i][self::ROBOT_METAS] : array();
            if (array_intersect($metas, array("NOARCHIVE", "NOINDEX",
                "JUSTFOLLOW", "NONE")) != array()) {
                /* Drop the stored copy of the page. Together with the
                   INDEX assignment below this leaves a stub record that
                   holds only INDEX. An empty array is used instead of
                   false because writing an offset into false relies on
                   auto-vivification, which PHP 8.1 deprecates; the
                   resulting record is the same.
                   NOTE(review): the stub is non-empty, so array_filter()
                   further down keeps it -- confirm this is intended. */
                $stored_site_pages[$i] = array();
            }
            $stored_site_pages[$i][self::INDEX] = $i;
            $i++;
        }
    } // end for
    $num_pages = count($stored_site_pages);
    $filter_stored = array_filter($stored_site_pages);
    if ($num_pages > 0 && $this->cache_pages) {
        // NOTE(review): the OFFSET values read below are presumably filled
        // in by addPages -- confirm against the web archive class
        $cache_page_partition = $this->web_archive->addPages(
            self::OFFSET, $filter_stored);
    } else {
        if ($num_pages > 0) {
            $this->web_archive->addCount(count($filter_stored));
        }
    }
    // summary indexes continue on from the pre-batch archive count
    for ($i = 0; $i < $num_pages; $i++) {
        $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
    }
    foreach ($filter_stored as $stored) {
        $i = $stored[self::INDEX];
        if (isset($stored[self::OFFSET])) {
            $summarized_site_pages[$i][self::OFFSET] =
                $stored[self::OFFSET];
            $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
                $cache_page_partition;
        }
    }
    crawlLog(" Process pages time: " . changeInMicrotime($start_time) .
        " Current Memory: " . memory_get_usage());
    return $summarized_site_pages;
}