/**
 * Gets the next doc from the iterator
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    static $minimal_regexes = false;
    static $first_call = true;
    if ($first_call) {
        $this->initializeSubstitutions($this->header['base_address']);
    }
    $page_info = $this->getNextTagData("page");
    if ($no_process) {
        return $page_info;
    }
    $dom = new DOMDocument();
    @$dom->loadXML($page_info);
    $site = array();
    $pre_url = $this->getTextContent($dom, "/page/title");
    $pre_url = str_replace(" ", "_", $pre_url);
    $site[self::URL] = $this->header['base_address'] . $pre_url;
    $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
    $pre_timestamp = $this->getTextContent($dom, "/page/revision/timestamp");
    $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
    $site[self::TIMESTAMP] = time();
    $site[self::TYPE] = "text/html";
    $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = "UTF-8";
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $site[self::PAGE] = "<html lang='" . $this->header['lang'] . "' >\n" .
        "<head><title>{$pre_url}</title>\n" .
        WIKI_PAGE_STYLES . "\n</head>\n" .
        "<body><h1>{$pre_url}</h1>\n";
    $pre_page = $this->getTextContent($dom, "/page/revision/text");
    $current_hash = crawlHash($pre_page);
    if ($first_call) {
        $this->saveCheckPoint(); //ensure we remember to advance one on fail
        $first_call = false;
    }
    $pre_page = $this->parser->parse($pre_page, false, true);
    $pre_page = preg_replace("/{{Other uses}}/i",
        "<div class='indent'>\"\$1\". (<a href='" . $site[self::URL] .
        "_(disambiguation)'>{$pre_url}</a>)</div>", $pre_page);
    $site[self::PAGE] .= $pre_page;
    $site[self::PAGE] .= "\n</body>\n</html>";
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = ceil(max(
        log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
    return $site;
}
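/*
 * Illustrative sketch (not part of the iterator): the WEIGHT assigned above is
 * ceil(max(log2(strlen(page) + 1) - 10, 1)), so pages shorter than roughly 2KB
 * get weight 1 and the weight then grows by about 1 for every doubling of page
 * length. The standalone helper below just re-derives that number for a raw
 * page string; the name pageWeight is made up for this example.
 */
function pageWeight($page) {
    // log($x, 2) is log base 2; subtracting 10 discounts the first ~1024 bytes
    return ceil(max(log(strlen($page) + 1, 2) - 10, 1));
}
// e.g. pageWeight(str_repeat("a", 1000)) == 1, pageWeight(str_repeat("a", 100000)) == 7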
/**
 * Gets the next doc from the iterator
 * @param bool $no_process if true then just return page string found
 *     not any additional meta data.
 * @return mixed associative array for doc or just string of doc
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $matches = array();
    while (preg_match($this->delimiter, $this->buffer, $matches,
        PREG_OFFSET_CAPTURE) != 1) {
        crawlTimeoutLog("..still looking for a page in local buffer");
        $block = $this->getFileBlock();
        if (!$block || !$this->checkFileHandle() || $this->checkEof()) {
            return NULL;
        }
        $this->buffer .= $block;
    }
    $delim_len = strlen($matches[0][0]);
    $pos = $matches[0][1] + $delim_len;
    $page_pos = $this->start_delimiter == "" ? $pos : $pos - $delim_len;
    $page = substr($this->buffer, 0, $page_pos);
    if ($this->end_delimiter == "") {
        $page = $this->remainder . $page;
        $this->remainder = $matches[0][0];
    }
    $this->buffer = substr($this->buffer, $pos + $delim_len);
    if ($this->start_delimiter != "") {
        $matches = array();
        if (preg_match($this->start_delimiter, $this->buffer, $matches,
            PREG_OFFSET_CAPTURE) != 1) {
            if (isset($matches[0][1])) {
                $page = substr($page, $matches[0][1]);
            }
        }
    }
    if ($no_process == true) {
        return $page;
    }
    $site = array();
    $site[self::HEADER] = "text_archive_bundle_iterator extractor";
    $site[self::IP_ADDRESSES] = array("0.0.0.0");
    $site[self::TIMESTAMP] = date("U", time());
    $site[self::TYPE] = "text/plain";
    $site[self::PAGE] = $page;
    $site[self::HASH] = FetchUrl::computePageHash($page);
    $site[self::URL] = "record:" . webencode($site[self::HASH]);
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = $this->encoding;
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $site[self::WEIGHT] = 1;
    return $site;
}
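/*
 * Illustrative sketch (assumptions labeled): the loop above grows $this->buffer
 * with file blocks until the record delimiter matches, then cuts the record off
 * the front of the buffer. The standalone function below shows the same idea
 * for the simple case of an end delimiter only; nextRecord and its
 * buffer-by-reference signature are invented for this example and are not part
 * of the iterator's API.
 */
function nextRecord(&$buffer, $delimiter) {
    $matches = array();
    if (preg_match($delimiter, $buffer, $matches, PREG_OFFSET_CAPTURE) != 1) {
        return NULL; // caller would append another file block and retry
    }
    $pos = $matches[0][1] + strlen($matches[0][0]);
    $record = substr($buffer, 0, $pos); // record including its end delimiter
    $buffer = substr($buffer, $pos);    // advance the buffer past the record
    return $record;
}
// e.g. $buffer = "alpha\n---\nbeta\n---\n"; nextRecord($buffer, "/---\n/") == "alpha\n---\n"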
/**
 * Gets the next doc from the iterator
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $tag_data = $this->getNextTagsData(array("Topic", "ExternalPage"));
    if (!$tag_data) {
        return false;
    }
    list($page_info, $tag) = $tag_data;
    if ($no_process) {
        return $page_info;
    }
    $page_info = str_replace("r:id", "id", $page_info);
    $page_info = str_replace("r:resource", "resource", $page_info);
    $page_info = str_replace("d:Title", "Title", $page_info);
    $page_info = str_replace("d:Description", "Description", $page_info);
    $dom = new DOMDocument();
    $dom->loadXML($page_info);
    $processMethod = "process" . $tag;
    $site = array();
    $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
    $site[self::MODIFIED] = time();
    $site[self::TIMESTAMP] = time();
    $site[self::TYPE] = "text/html";
    $site[self::HEADER] = "odp_rdf_bundle_iterator extractor";
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = "UTF-8";
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $this->{$processMethod}($dom, $site);
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    return $site;
}
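/*
 * Illustrative sketch: the str_replace calls above strip the r: and d:
 * namespace prefixes so a Topic/ExternalPage fragment can be loaded and queried
 * without registering namespaces, and "process" . $tag then dispatches to a
 * per-tag handler (processTopic or processExternalPage). The snippet below
 * shows the same strip-then-load step on a made-up ExternalPage fragment; the
 * XML content is invented for this example.
 */
$fragment = '<ExternalPage about="http://www.php.net/">' .
    '<d:Title>PHP</d:Title><d:Description>Scripting language.</d:Description>' .
    '</ExternalPage>';
$fragment = str_replace(array("d:Title", "d:Description"),
    array("Title", "Description"), $fragment);
$dom = new DOMDocument();
$dom->loadXML($fragment);
echo $dom->getElementsByTagName("Title")->item(0)->textContent; // outputs: PHP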
/**
 * Processes an array of downloaded web pages with the appropriate page
 * processor.
 *
 * Summary data is extracted from each non-robots.txt file in the array.
 * Disallowed paths and crawl-delays are extracted from robots.txt files.
 *
 * @param array $site_pages a collection of web pages to process
 * @return array summary data extracted from these pages
 */
function processFetchPages($site_pages) {
    $PAGE_PROCESSORS = $this->page_processors;
    crawlLog("Start process pages... Current Memory:" . memory_get_usage());
    $start_time = microtime();
    $prefix = $this->fetcher_num . "-";
    $stored_site_pages = array();
    $summarized_site_pages = array();
    $num_items = $this->web_archive->count;
    $i = 0;
    foreach ($site_pages as $site) {
        $response_code = $site[self::HTTP_CODE];
        $was_error = false;
        if ($response_code < 200 || $response_code >= 300) {
            crawlLog($site[self::URL] . " response code {$response_code}");
            $host = UrlParser::getHost($site[self::URL]);
            if (!isset($this->hosts_with_errors[$host])) {
                $this->hosts_with_errors[$host] = 0;
            }
            if ($response_code >= 400 || $response_code < 100) {
                // < 100 will capture failures to connect which are returned
                // as strings
                $was_error = true;
                $this->hosts_with_errors[$host]++;
            }
            /* we print out errors to std output. We still go ahead and
               process the page. Maybe it is a cool error page, also this
               makes sure we don't crawl it again
             */
        }
        // text/robot is my made up mimetype for robots.txt files
        $was_robot_error = false;
        if (isset($site[self::ROBOT_PATHS])) {
            if (!$was_error) {
                $type = "text/robot";
            } else {
                $type = $site[self::TYPE];
                if ($response_code != 404) {
                    /* disallow crawling if robots.txt was any error other
                       than not found
                     */
                    $was_robot_error = true;
                    $site[self::ROBOT_PATHS][] = "/";
                }
            }
        } else {
            if (isset($site[self::FILE_NAME])) {
                $extension = UrlParser::getDocumentType(
                    $site[self::FILE_NAME]);
                if ($extension ==
                    $this->programming_language_extension['java']) {
                    $type = "text/java";
                } else {
                    if ($extension ==
                        $this->programming_language_extension['py']) {
                        $type = "text/py";
                    } else {
                        $type = $site[self::TYPE];
                    }
                }
            } else {
                $type = $site[self::TYPE];
            }
        }
        $handled = false;
        /* deals with short URLs and directs them to the original link;
           for robots.txt don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway)
         */
        if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0
            && strcmp($type, "text/robot") != 0) {
            array_unshift($site[self::LOCATION], $site[self::URL]);
            $tmp_loc = array_pop($site[self::LOCATION]);
            $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
            // put the canonicalized redirect target back on the location list
            array_push($site[self::LOCATION], $tmp_loc);
            $doc_info = array();
            $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
            $doc_info[self::LOCATION] = true;
            $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " .
                $tmp_loc;
            $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
            $doc_info[self::TITLE] = $site[self::URL];
            $text_data = true;
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            $handled = true;
        } else {
            if (isset($PAGE_PROCESSORS[$type])) {
                $page_processor = $PAGE_PROCESSORS[$type];
                if (generalIsA($page_processor, "TextProcessor")) {
                    $text_data = true;
                } else {
                    $text_data = false;
                }
            } else {
                crawlLog("No page processor for mime type: " . $type);
                crawlLog("Not processing: " . $site[self::URL]);
                continue;
            }
        }
        if (!$handled) {
            if (isset($this->plugin_processors[$page_processor])) {
                $processor = new $page_processor(
                    $this->plugin_processors[$page_processor],
                    $this->max_description_len, $this->summarizer_option);
            } else {
                $processor = new $page_processor(array(),
                    $this->max_description_len, $this->summarizer_option);
            }
        }
        if (isset($site[self::PAGE]) && !$handled) {
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            //if not UTF-8 convert before doing anything else
            if (isset($site[self::ENCODING]) &&
                $site[self::ENCODING] != "UTF-8" &&
                $site[self::ENCODING] != "" &&
                generalIsA($page_processor, "TextProcessor")) {
                if (!@mb_check_encoding($site[self::PAGE],
                    $site[self::ENCODING])) {
                    crawlLog(" MB_CHECK_ENCODING FAILED!!");
                }
                crawlLog(" Converting from encoding " .
                    $site[self::ENCODING] . "...");
                //if HEBREW WINDOWS-1255 use ISO-8859 instead
                if (stristr($site[self::ENCODING], "1255")) {
                    $site[self::ENCODING] = "ISO-8859-8";
                    crawlLog(" using encoding " .
                        $site[self::ENCODING] . "...");
                }
                if (stristr($site[self::ENCODING], "1256")) {
                    $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                    crawlLog(" using Yioop hack encoding ...");
                } else {
                    $site[self::PAGE] = @mb_convert_encoding(
                        $site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                }
            }
            crawlLog(" Using Processor..." . $page_processor);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $tmp_url_store = $site[self::URL];
                $site[self::URL] = $site[self::FILE_NAME];
            }
            $doc_info = $processor->handle($site[self::PAGE],
                $site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $site[self::URL] = $tmp_url_store;
            }
            if (!$doc_info) {
                crawlLog(" Processing Yielded No Data For: " .
                    $site[self::URL]);
            }
            if ($page_processor != "RobotProcessor" &&
                !isset($doc_info[self::JUST_METAS])) {
                $this->pruneLinks($doc_info, CrawlConstants::LINKS,
                    $start_time);
            }
        } else {
            if (!$handled) {
                $doc_info = false;
            }
        }
        $not_loc = true;
        if ($doc_info) {
            $site[self::DOC_INFO] = $doc_info;
            if (isset($doc_info[self::LOCATION])) {
                $site[self::HASH] = crawlHash(
                    crawlHash($site[self::URL], true) . "LOCATION", true);
                $not_loc = false;
            }
            $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
            if (!is_dir(CRAWL_DIR . "/cache")) {
                mkdir(CRAWL_DIR . "/cache");
                $htaccess = "Options None\nphp_flag engine off\n";
                file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
            }
            if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                $site[self::PAGE] = $doc_info[self::PAGE];
            }
            if ($text_data) {
                if (isset($doc_info[self::PAGE])) {
                    $site[self::PAGE] = $doc_info[self::PAGE];
                } else {
                    $site[self::PAGE] = NULL;
                }
                if ($not_loc) {
                    $content = $doc_info[self::DESCRIPTION];
                    $site[self::HASH] = FetchUrl::computePageHash($content);
                }
            } else {
                $site[self::HASH] = FetchUrl::computePageHash(
                    $site[self::PAGE]);
            }
            if (isset($doc_info[self::WORD_CLOUD])) {
                $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
            } else {
                $site[self::WORD_CLOUD] = NULL;
            }
            if (isset($doc_info[self::CRAWL_DELAY])) {
                $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
            }
            if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
            }
            if (!isset($site[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array();
            }
            if (isset($doc_info[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array_merge(
                    $site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
            }
            //here's where we enforce NOFOLLOW
            if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
                in_array("NONE", $site[self::ROBOT_METAS])) {
                $site[self::DOC_INFO][self::LINKS] = array();
            }
            if (isset($doc_info[self::AGENT_LIST])) {
                $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
            }
            $this->copySiteFields($i, $site, $summarized_site_pages,
                $stored_site_pages);
            $summarized_site_pages[$i][self::URL] =
                strip_tags($site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $summarized_site_pages[$i][self::TITLE] =
                    $site[self::FILE_NAME];
            } else {
                // stripping html to be on the safe side
                $summarized_site_pages[$i][self::TITLE] =
                    strip_tags($site[self::DOC_INFO][self::TITLE]);
            }
            if (!isset($site[self::REPOSITORY_TYPE])) {
                if ($was_robot_error) {
                    $site[self::DOC_INFO][self::DESCRIPTION] =
                        "There was an HTTP error in trying to download " .
                        "this robots.txt file, so all paths to this site " .
                        "were disallowed by Yioop.\n" .
                        $site[self::DOC_INFO][self::DESCRIPTION];
                }
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
            } else {
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    $site[self::DOC_INFO][self::DESCRIPTION];
            }
            if (isset($site[self::DOC_INFO][self::JUST_METAS]) ||
                isset($site[self::ROBOT_PATHS])) {
                $summarized_site_pages[$i][self::JUST_METAS] = true;
            }
            if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        $site[self::DOC_INFO][self::META_WORDS];
                } else {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        array_merge(
                            $summarized_site_pages[$i][self::META_WORDS],
                            $site[self::DOC_INFO][self::META_WORDS]);
                }
            }
            if (isset($site[self::DOC_INFO][self::LANG])) {
                if ($site[self::DOC_INFO][self::LANG] == 'en' &&
                    $site[self::ENCODING] != "UTF-8") {
                    $site[self::DOC_INFO][self::LANG] =
                        guessLangEncoding($site[self::ENCODING]);
                }
                $summarized_site_pages[$i][self::LANG] =
                    $site[self::DOC_INFO][self::LANG];
            }
            if (isset($site[self::DOC_INFO][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    $site[self::DOC_INFO][self::LINKS];
            }
            if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                $summarized_site_pages[$i][self::WORD_CLOUD] =
                    $site[self::DOC_INFO][self::WORD_CLOUD];
            }
            if (isset($site[self::DOC_INFO][self::THUMB])) {
                $summarized_site_pages[$i][self::THUMB] =
                    $site[self::DOC_INFO][self::THUMB];
            }
            if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                $this->processSubdocs($i, $site, $summarized_site_pages,
                    $stored_site_pages);
            }
            if (isset($summarized_site_pages[$i][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    UrlParser::cleanRedundantLinks(
                        $summarized_site_pages[$i][self::LINKS],
                        $summarized_site_pages[$i][self::URL]);
            }
            if (!empty($this->classifiers)) {
                Classifier::labelPage($summarized_site_pages[$i],
                    $this->classifiers, $this->active_classifiers,
                    $this->active_rankers);
            }
            if ($this->page_rule_parser != NULL) {
                $this->page_rule_parser->executeRuleTrees(
                    $summarized_site_pages[$i]);
            }
            $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ?
                $summarized_site_pages[$i][self::ROBOT_METAS] : array();
            if (array_intersect($metas, array("NOARCHIVE", "NOINDEX",
                "JUSTFOLLOW", "NONE")) != array()) {
                $stored_site_pages[$i] = false;
            }
            $stored_site_pages[$i][self::INDEX] = $i;
            $i++;
        }
    } // end foreach
    $num_pages = count($stored_site_pages);
    $filter_stored = array_filter($stored_site_pages);
    if ($num_pages > 0 && $this->cache_pages) {
        $cache_page_partition = $this->web_archive->addPages(
            self::OFFSET, $filter_stored);
    } else {
        if ($num_pages > 0) {
            $this->web_archive->addCount(count($filter_stored));
        }
    }
    for ($i = 0; $i < $num_pages; $i++) {
        $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
    }
    foreach ($filter_stored as $stored) {
        $i = $stored[self::INDEX];
        if (isset($stored[self::OFFSET])) {
            $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
            $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
                $cache_page_partition;
        }
    }
    crawlLog(" Process pages time: " . changeInMicrotime($start_time) .
        " Current Memory: " . memory_get_usage());
    return $summarized_site_pages;
}
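/*
 * Illustrative sketch: two of the robot-meta decisions made above, pulled out
 * as standalone predicates so the policy is easy to see. NOFOLLOW or NONE
 * empties the page's out-links; NOARCHIVE, NOINDEX, JUSTFOLLOW, or NONE keeps
 * the page out of the cache archive. The helper names are invented for this
 * example and are not methods of the fetcher.
 */
function shouldDropLinks($robot_metas) {
    return in_array("NOFOLLOW", $robot_metas) ||
        in_array("NONE", $robot_metas);
}
function shouldSkipCache($robot_metas) {
    return array_intersect($robot_metas,
        array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array();
}
// e.g. shouldDropLinks(array("NOINDEX")) === false, shouldSkipCache(array("NOINDEX")) === true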
/**
 * Gets the next at most $num many docs from the iterator. It might return
 * less than $num many documents if the partition changes or the end of the
 * bundle is reached.
 *
 * @param int $num number of docs to get
 * @param bool $no_process do not do any processing on page data
 * @return array associative arrays for $num pages
 */
function nextPages($num, $no_process = false) {
    $pages = array();
    $page_count = 0;
    $db = $this->db;
    $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
    $result = $db->execute($query);
    $i = 0;
    while ($row = $db->fetchArray($result)) {
        crawlTimeoutLog("..Still getting pages from archive iterator. At %s" .
            " of %s", $i, $num);
        $page = "";
        foreach ($row as $key => $value) {
            $page .= "{$key}{$this->field_value_separator}" .
                "{$value}{$this->column_separator}";
        }
        if ($no_process) {
            $pages[] = $page;
        } else {
            $site = array();
            $site[self::HEADER] = "database_bundle_iterator extractor";
            $site[self::IP_ADDRESSES] = array("0.0.0.0");
            $site[self::TIMESTAMP] = date("U", time());
            $site[self::TYPE] = "text/plain";
            $site[self::PAGE] = $page;
            $site[self::HASH] = FetchUrl::computePageHash($page);
            $site[self::URL] = "record:" . webencode($site[self::HASH]);
            $site[self::HTTP_CODE] = 200;
            $site[self::ENCODING] = $this->encoding;
            $site[self::SERVER] = "unknown";
            $site[self::SERVER_VERSION] = "unknown";
            $site[self::OPERATING_SYSTEM] = "unknown";
            $site[self::WEIGHT] = 1;
            $pages[] = $site;
        }
        $page_count++;
    }
    $this->limit += $page_count;
    if ($page_count < $num) {
        $this->end_of_iterator = true;
    }
    $this->saveCheckpoint();
    return $pages;
}
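/*
 * Illustrative sketch: each database row above is flattened into one text
 * "page" by concatenating key, field/value separator, value, and column
 * separator for every column. The standalone helper below does the same
 * flattening for a plain associative array; flattenRow and the default
 * separators are invented for this example (the real separators come from the
 * iterator's configuration).
 */
function flattenRow($row, $field_value_separator = ": ",
    $column_separator = "\n") {
    $page = "";
    foreach ($row as $key => $value) {
        $page .= "{$key}{$field_value_separator}{$value}{$column_separator}";
    }
    return $page;
}
// e.g. flattenRow(array("id" => 7, "title" => "Test")) == "id: 7\ntitle: Test\n"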
/**
 * Gets the next doc from the iterator
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $indexable_records = array('response', 'resource');
    do {
        $this->getRecordStart();
        $page_info = $this->getWarcHeaders();
        if ($page_info == NULL || !isset($page_info[self::SIZE])) {
            return NULL;
        }
        $length = intval($page_info[self::SIZE]);
        $page_info[self::SIZE] = $length;
        $header_and_page = ltrim($this->fileRead($length + 2));
        $this->fileGets();
        $this->fileGets();
        if (!$header_and_page) {
            return NULL;
        }
    } while (!in_array($page_info['warc-type'], $indexable_records) ||
        substr($page_info[self::URL], 0, 4) == 'dns:');
        //ignore warcinfo, request, metadata, revisit, etc. records
    if ($no_process) {
        return $header_and_page;
    }
    unset($page_info['line']);
    unset($page_info['warc-type']);
    $site = $page_info;
    $site_contents = FetchUrl::parseHeaderPage($header_and_page);
    $site = array_merge($site, $site_contents);
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = 1;
    if (!isset($site[self::TYPE])) {
        $site[self::TYPE] = "text/plain";
    }
    return $site;
}
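/*
 * Illustrative sketch: the do/while loop above keeps reading WARC records until
 * it finds one worth indexing, namely a 'response' or 'resource' record whose
 * target is not a dns: lookup (warcinfo, request, metadata, and revisit records
 * are skipped). The predicate below restates that filter on its own;
 * isIndexableWarcRecord and its plain-string parameters are invented for this
 * example.
 */
function isIndexableWarcRecord($warc_type, $target_url) {
    $indexable_records = array('response', 'resource');
    return in_array($warc_type, $indexable_records) &&
        substr($target_url, 0, 4) != 'dns:';
}
// e.g. isIndexableWarcRecord('response', 'http://example.com/') === true
// e.g. isIndexableWarcRecord('request', 'http://example.com/') === false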
/**
 * Gets the next doc from the iterator
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    do {
        $page_info = $this->fileGets();
        if (trim($page_info) == "") {
            return NULL;
        }
        $info_parts = explode(" ", $page_info);
        $num_parts = count($info_parts);
        $length = intval($info_parts[$num_parts - 1]);
        $header_and_page = $this->fileRead($length + 1);
        if (!$header_and_page) {
            return NULL;
        }
    } while (substr($page_info, 0, 3) == 'dns' ||
        substr($page_info, 0, 8) == 'filedesc');
        //ignore dns entries in arc and ignore first record
    if ($no_process) {
        return $header_and_page;
    }
    $site = array();
    $site[self::URL] = $info_parts[0];
    $site[self::IP_ADDRESSES] = array($info_parts[1]);
    $site[self::TIMESTAMP] = date("U", strtotime($info_parts[2]));
    $site[self::TYPE] = $info_parts[3];
    $site_contents = FetchUrl::parseHeaderPage($header_and_page);
    $site = array_merge($site, $site_contents);
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = 1;
    return $site;
}
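/*
 * Illustrative sketch: an ARC record begins with a single space-separated
 * header line of the form "url ip-address archive-date content-type length",
 * which is what the explode(" ", ...) above picks apart (URL, IP, timestamp,
 * mime type, and the record length used for the following fileRead). The
 * helper below parses such a line into a labeled array; parseArcHeaderLine and
 * the sample line are made up for this example.
 */
function parseArcHeaderLine($line) {
    $parts = explode(" ", trim($line));
    return array(
        'url' => $parts[0],
        'ip' => $parts[1],
        'timestamp' => date("U", strtotime($parts[2])),
        'type' => $parts[3],
        'length' => intval($parts[count($parts) - 1]),
    );
}
// e.g. parseArcHeaderLine("http://example.com/ 93.184.216.34 20090101000000 text/html 1234")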