/**
 * Checks to see whether there are more pages to extract from the current
 * archive, and if so returns the next batch to the requesting fetcher.
 * If there is no current archive to iterate over, or the iterator has
 * reached the end of the archive, then indicate that there is no more
 * data by setting the status to NO_DATA_STATE.
 *
 * The crawl to serve is chosen by $_REQUEST['crawl_time']; the crawl
 * parameters are read from the serialized name server messages file.
 * A lock file holding the request start time is written while the
 * iterator is being advanced and removed afterwards; a lock younger than
 * max_execution_time makes this request skip the fetch and answer with
 * NO_DATA_STATE.
 *
 * The response (serialized $info with the web-encoded, gz-compressed
 * pages under self::DATA) is emitted through the "fetch" view.
 */
function archiveSchedule()
{
    $view = "fetch";
    $request_start = time();
    if (isset($_REQUEST['crawl_time'])) {
        $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
            0, TIMESTAMP_LEN);
    } else {
        $crawl_time = 0;
    }
    $messages_filename = CRAWL_DIR . '/schedules/name_server_messages.txt';
    $lock_filename = WORK_DIRECTORY . "/schedules/name_server_lock.txt";

    $fetch_pages = false;
    $info = array();
    if ($crawl_time > 0 && file_exists($messages_filename)) {
        $info = unserialize(file_get_contents($messages_filename));
        if (!is_array($info)) {
            /* The messages file could not be read or decoded (e.g. it
               was only partially written). Treat this as "nothing to
               fetch" rather than indexing into a non-array value. */
            $info = array();
        } else if (isset($info[self::STATUS]) &&
            $info[self::STATUS] == 'STOP_CRAWL') {
            /* The stop crawl message gets created by the
               admin_controller when the "stop crawl" button is
               pressed. */
            if (file_exists($messages_filename)) {
                unlink($messages_filename);
            }
            if (file_exists($lock_filename)) {
                unlink($lock_filename);
            }
            $info = array();
        } else {
            $fetch_pages = true;
        }
        $this->checkRestart(self::ARCHIVE_CRAWL);
    }

    $pages = array();
    // Another request holds the lock if its timestamp is recent enough
    $got_lock = true;
    if (file_exists($lock_filename)) {
        $lock_time = unserialize(file_get_contents($lock_filename));
        if ($request_start - $lock_time < ini_get('max_execution_time')) {
            $got_lock = false;
        }
    }

    $chunk = false;
    $archive_iterator = NULL;
    if ($fetch_pages && $got_lock) {
        file_put_contents($lock_filename, serialize($request_start));
        if (isset($info[self::ARC_DIR]) &&
            ($info[self::ARC_DIR] == "MIX" ||
            file_exists($info[self::ARC_DIR]))) {
            $iterate_timestamp = $info[self::CRAWL_INDEX];
            $result_timestamp = $crawl_time;
            $result_dir = WORK_DIRECTORY . "/schedules/" .
                self::name_archive_iterator . $crawl_time;
            $arctype = $info[self::ARC_TYPE];
            $iterator_name = $arctype . "Iterator";
            if (!class_exists($iterator_name)) {
                $info['ARCHIVE_BUNDLE_ERROR'] =
                    "Invalid bundle iterator: '{$iterator_name}'";
            } else {
                if ($info[self::ARC_DIR] == "MIX") {
                    //recrawl of crawl mix case
                    $archive_iterator = new $iterator_name(
                        $iterate_timestamp, $result_timestamp);
                } else {
                    //any other archive crawl except web archive recrawls
                    $archive_iterator = new $iterator_name(
                        $iterate_timestamp, $info[self::ARC_DIR],
                        $result_timestamp, $result_dir);
                }
            }
        }
        $pages = false;
        if ($archive_iterator && !$archive_iterator->end_of_iterator) {
            if (generalIsA($archive_iterator,
                "TextArchiveBundleIterator")) {
                $pages = $archive_iterator->nextChunk();
                $chunk = true;
            } else {
                $pages = $archive_iterator->nextPages(ARCHIVE_BATCH_SIZE);
            }
        }
        if (file_exists($lock_filename)) {
            unlink($lock_filename);
        }
    }

    if ($archive_iterator && $archive_iterator->end_of_iterator) {
        $info[self::END_ITERATOR] = true;
    }
    /* The original test ($chunk && $pages || $pages && !empty($pages))
       is true exactly when $pages is non-empty. */
    if (!empty($pages)) {
        $pages_string = webencode(gzcompress(serialize($pages)));
    } else {
        $info[self::STATUS] = self::NO_DATA_STATE;
        $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
        $pages = array();
        $pages_string = webencode(gzcompress(serialize($pages)));
    }
    $info[self::DATA] = $pages_string;
    $info_string = serialize($info);
    $data = array();
    $data['MESSAGE'] = $info_string;
    $this->displayView($view, $data);
}
/**
 * Processes an array of downloaded web pages with the appropriate page
 * processor.
 *
 * Summary data is extracted from each non robots.txt file in the array.
 * Disallowed paths and crawl-delays are extracted from robots.txt files.
 * Pages that carry a redirect chain in self::LOCATION (and are not
 * robots.txt files) are not run through a page processor; instead a
 * small "location" document linking the url to its final target is
 * synthesized. Pages whose mime type has no registered processor are
 * logged and skipped.
 *
 * As side effects the method updates $this->hosts_with_errors, creates
 * CRAWL_DIR/cache (with a restrictive .htaccess) if it is missing, and
 * adds the stored pages (or just their count, when page caching is off)
 * to $this->web_archive.
 *
 * @param array $site_pages a collection of web pages to process
 * @return array summary data extracted from these pages
 */
function processFetchPages($site_pages)
{
    $PAGE_PROCESSORS = $this->page_processors;
    crawlLog("Start process pages... Current Memory:" . memory_get_usage());
    $start_time = microtime();
    $prefix = $this->fetcher_num . "-";
    $stored_site_pages = array();
    $summarized_site_pages = array();
    $num_items = $this->web_archive->count;
    $i = 0;
    foreach ($site_pages as $site) {
        $response_code = $site[self::HTTP_CODE];
        $was_error = false;
        if ($response_code < 200 || $response_code >= 300) {
            crawlLog($site[self::URL] . " response code {$response_code}");
            $host = UrlParser::getHost($site[self::URL]);
            if (!isset($this->hosts_with_errors[$host])) {
                $this->hosts_with_errors[$host] = 0;
            }
            if ($response_code >= 400 || $response_code < 100) {
                // < 100 will capture failures to connect which are returned
                // as strings
                $was_error = true;
                $this->hosts_with_errors[$host]++;
            }
            /* we print out errors to std output. We still go ahead and
               process the page. Maybe it is a cool error page, also
               this makes sure we don't crawl it again */
        }
        // text/robot is my made up mimetype for robots.txt files
        $was_robot_error = false;
        if (isset($site[self::ROBOT_PATHS])) {
            if (!$was_error) {
                $type = "text/robot";
            } else {
                $type = $site[self::TYPE];
                if ($response_code != 404) {
                    /* disallow crawling if robots.txt was any error other
                       that not found */
                    $was_robot_error = true;
                    $site[self::ROBOT_PATHS][] = "/";
                }
            }
        } else {
            if (isset($site[self::FILE_NAME])) {
                $extension =
                    UrlParser::getDocumentType($site[self::FILE_NAME]);
                if ($extension ==
                    $this->programming_language_extension['java']) {
                    $type = "text/java";
                } else {
                    if ($extension ==
                        $this->programming_language_extension['py']) {
                        $type = "text/py";
                    } else {
                        $type = $site[self::TYPE];
                    }
                }
            } else {
                $type = $site[self::TYPE];
            }
        }
        $handled = false;
        /* deals with short URLs and directs them to the original link
           for robots.txt don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway) */
        if (isset($site[self::LOCATION]) &&
            count($site[self::LOCATION]) > 0 &&
            strcmp($type, "text/robot") != 0) {
            array_unshift($site[self::LOCATION], $site[self::URL]);
            $tmp_loc = array_pop($site[self::LOCATION]);
            $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
            /* Put the canonicalized final target back on the chain.
               array_push() returns the new element count, so its result
               must not be assigned back to the LOCATION field. */
            $site[self::LOCATION][] = $tmp_loc;
            $doc_info = array();
            $doc_info[self::LINKS][$tmp_loc] =
                "location:" . $site[self::URL];
            $doc_info[self::LOCATION] = true;
            $doc_info[self::DESCRIPTION] =
                $site[self::URL] . " => " . $tmp_loc;
            $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
            $doc_info[self::TITLE] = $site[self::URL];
            $text_data = true;
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            $handled = true;
        } else {
            if (isset($PAGE_PROCESSORS[$type])) {
                $page_processor = $PAGE_PROCESSORS[$type];
                if (generalIsA($page_processor, "TextProcessor")) {
                    $text_data = true;
                } else {
                    $text_data = false;
                }
            } else {
                crawlLog("No page processor for mime type: " . $type);
                crawlLog("Not processing: " . $site[self::URL]);
                continue;
            }
        }
        if (!$handled) {
            if (isset($this->plugin_processors[$page_processor])) {
                $processor = new $page_processor(
                    $this->plugin_processors[$page_processor],
                    $this->max_description_len,
                    $this->summarizer_option);
            } else {
                $processor = new $page_processor(array(),
                    $this->max_description_len,
                    $this->summarizer_option);
            }
        }
        if (isset($site[self::PAGE]) && !$handled) {
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            //if not UTF-8 convert before doing anything else
            if (isset($site[self::ENCODING]) &&
                $site[self::ENCODING] != "UTF-8" &&
                $site[self::ENCODING] != "" &&
                generalIsA($page_processor, "TextProcessor")) {
                if (!@mb_check_encoding($site[self::PAGE],
                    $site[self::ENCODING])) {
                    crawlLog(" MB_CHECK_ENCODING FAILED!!");
                }
                crawlLog(" Converting from encoding " .
                    $site[self::ENCODING] . "...");
                //if HEBREW WINDOWS-1255 use ISO-8859 instead
                if (stristr($site[self::ENCODING], "1255")) {
                    $site[self::ENCODING] = "ISO-8859-8";
                    crawlLog(" using encoding " .
                        $site[self::ENCODING] . "...");
                }
                if (stristr($site[self::ENCODING], "1256")) {
                    $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                    crawlLog(" using Yioop hack encoding ...");
                } else {
                    $site[self::PAGE] =
                        @mb_convert_encoding($site[self::PAGE],
                        "UTF-8", $site[self::ENCODING]);
                }
            }
            crawlLog(" Using Processor..." . $page_processor);
            /* For git repository items the processor is handed the file
               name in place of the url; the url is restored afterwards */
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $tmp_url_store = $site[self::URL];
                $site[self::URL] = $site[self::FILE_NAME];
            }
            $doc_info = $processor->handle($site[self::PAGE],
                $site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $site[self::URL] = $tmp_url_store;
            }
            if (!$doc_info) {
                crawlLog(" Processing Yielded No Data For: " .
                    $site[self::URL]);
            }
            if ($page_processor != "RobotProcessor" &&
                !isset($doc_info[self::JUST_METAS])) {
                $this->pruneLinks($doc_info, CrawlConstants::LINKS,
                    $start_time);
            }
        } else {
            if (!$handled) {
                $doc_info = false;
            }
        }
        $not_loc = true;
        if ($doc_info) {
            $site[self::DOC_INFO] = $doc_info;
            if (isset($doc_info[self::LOCATION])) {
                $site[self::HASH] = crawlHash(
                    crawlHash($site[self::URL], true) . "LOCATION", true);
                $not_loc = false;
            }
            $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
            if (!is_dir(CRAWL_DIR . "/cache")) {
                mkdir(CRAWL_DIR . "/cache");
                $htaccess = "Options None\nphp_flag engine off\n";
                file_put_contents(CRAWL_DIR . "/cache/.htaccess",
                    $htaccess);
            }
            if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                $site[self::PAGE] = $doc_info[self::PAGE];
            }
            if ($text_data) {
                if (isset($doc_info[self::PAGE])) {
                    $site[self::PAGE] = $doc_info[self::PAGE];
                } else {
                    $site[self::PAGE] = NULL;
                }
                if ($not_loc) {
                    $content = $doc_info[self::DESCRIPTION];
                    $site[self::HASH] = FetchUrl::computePageHash($content);
                }
            } else {
                $site[self::HASH] =
                    FetchUrl::computePageHash($site[self::PAGE]);
            }
            if (isset($doc_info[self::WORD_CLOUD])) {
                $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
            } else {
                $site[self::WORD_CLOUD] = NULL;
            }
            if (isset($doc_info[self::CRAWL_DELAY])) {
                $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
            }
            if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
            }
            if (!isset($site[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array();
            }
            if (isset($doc_info[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array_merge(
                    $site[self::ROBOT_METAS],
                    $doc_info[self::ROBOT_METAS]);
            }
            //here's where we enforce NOFOLLOW
            if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
                in_array("NONE", $site[self::ROBOT_METAS])) {
                $site[self::DOC_INFO][self::LINKS] = array();
            }
            if (isset($doc_info[self::AGENT_LIST])) {
                $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
            }
            $this->copySiteFields($i, $site, $summarized_site_pages,
                $stored_site_pages);
            $summarized_site_pages[$i][self::URL] =
                strip_tags($site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $summarized_site_pages[$i][self::TITLE] =
                    $site[self::FILE_NAME];
            } else {
                $summarized_site_pages[$i][self::TITLE] =
                    strip_tags($site[self::DOC_INFO][self::TITLE]);
                // stripping html to be on the safe side
            }
            if (!isset($site[self::REPOSITORY_TYPE])) {
                if ($was_robot_error) {
                    $site[self::DOC_INFO][self::DESCRIPTION] =
                        "There was an HTTP error in trying to download " .
                        "this robots.txt file, so all paths to this site " .
                        "were dsallowed by Yioop.\n" .
                        $site[self::DOC_INFO][self::DESCRIPTION];
                }
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
            } else {
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    $site[self::DOC_INFO][self::DESCRIPTION];
            }
            if (isset($site[self::DOC_INFO][self::JUST_METAS]) ||
                isset($site[self::ROBOT_PATHS])) {
                $summarized_site_pages[$i][self::JUST_METAS] = true;
            }
            if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        $site[self::DOC_INFO][self::META_WORDS];
                } else {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        array_merge(
                        $summarized_site_pages[$i][self::META_WORDS],
                        $site[self::DOC_INFO][self::META_WORDS]);
                }
            }
            if (isset($site[self::DOC_INFO][self::LANG])) {
                if ($site[self::DOC_INFO][self::LANG] == 'en' &&
                    $site[self::ENCODING] != "UTF-8") {
                    $site[self::DOC_INFO][self::LANG] =
                        guessLangEncoding($site[self::ENCODING]);
                }
                $summarized_site_pages[$i][self::LANG] =
                    $site[self::DOC_INFO][self::LANG];
            }
            if (isset($site[self::DOC_INFO][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    $site[self::DOC_INFO][self::LINKS];
            }
            if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                $summarized_site_pages[$i][self::WORD_CLOUD] =
                    $site[self::DOC_INFO][self::WORD_CLOUD];
            }
            if (isset($site[self::DOC_INFO][self::THUMB])) {
                $summarized_site_pages[$i][self::THUMB] =
                    $site[self::DOC_INFO][self::THUMB];
            }
            if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                $this->processSubdocs($i, $site, $summarized_site_pages,
                    $stored_site_pages);
            }
            if (isset($summarized_site_pages[$i][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    UrlParser::cleanRedundantLinks(
                    $summarized_site_pages[$i][self::LINKS],
                    $summarized_site_pages[$i][self::URL]);
            }
            if (!empty($this->classifiers)) {
                Classifier::labelPage($summarized_site_pages[$i],
                    $this->classifiers, $this->active_classifiers,
                    $this->active_rankers);
            }
            if ($this->page_rule_parser != NULL) {
                $this->page_rule_parser->executeRuleTrees(
                    $summarized_site_pages[$i]);
            }
            $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ?
                $summarized_site_pages[$i][self::ROBOT_METAS] : array();
            if (array_intersect($metas, array("NOARCHIVE", "NOINDEX",
                "JUSTFOLLOW", "NONE")) != array()) {
                /* Drop everything copied for storage and keep only the
                   index stub. This yields the same record the previous
                   "assign false, then set INDEX" sequence produced, but
                   without auto-vivifying an array from false (deprecated
                   as of PHP 8.1). */
                $stored_site_pages[$i] = array(self::INDEX => $i);
            } else {
                $stored_site_pages[$i][self::INDEX] = $i;
            }
            $i++;
        }
    } // end for
    $num_pages = count($stored_site_pages);
    $filter_stored = array_filter($stored_site_pages);
    if ($num_pages > 0 && $this->cache_pages) {
        $cache_page_partition = $this->web_archive->addPages(
            self::OFFSET, $filter_stored);
    } else {
        if ($num_pages > 0) {
            $this->web_archive->addCount(count($filter_stored));
        }
    }
    // summary indexes continue from the archive's count before this batch
    for ($i = 0; $i < $num_pages; $i++) {
        $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
    }
    foreach ($filter_stored as $stored) {
        $i = $stored[self::INDEX];
        if (isset($stored[self::OFFSET])) {
            $summarized_site_pages[$i][self::OFFSET] =
                $stored[self::OFFSET];
            $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
                $cache_page_partition;
        }
    }
    crawlLog(" Process pages time: " . changeInMicrotime($start_time) .
        " Current Memory: " . memory_get_usage());
    return $summarized_site_pages;
}