/**
 * Used to extract data between two tags for the first tag found
 * amongst the array of tags $tags. After the operation $this->buffer
 * contains what came after the close tag.
 *
 * @param array $tags array of tagnames to look for
 *
 * @return array of two elements: the first element is a string consisting
 *     of the start tag, contents, and close tag of the first tag found;
 *     the second is the name of the tag amongst $tags that was found.
 *     Returns false if no complete tag could be found before the end of
 *     the file.
 */
function getNextTagsData($tags)
{
    $close_regex = '@</(' . implode('|', $tags) . ')[^>]*?>@';
    $offset = 0;
    while (!preg_match($close_regex, $this->buffer, $matches,
        PREG_OFFSET_CAPTURE, $offset)) {
        if (!$this->checkFileHandle() || $this->checkEof()) {
            return false;
        }
        /*
            Get the next block; the block iterator can very occasionally
            return a bad block if a block header pattern happens to show up
            in compressed data, in which case decompression will fail. We
            want to skip over these false blocks and get back to real
            blocks.
        */
        while (!is_string($block = $this->getFileBlock())) {
            crawlTimeoutLog("..still getting next tags data..");
            if ($this->checkEof()) {
                return false;
            }
        }
        $this->buffer .= $block;
    }
    $tag = $matches[1][0];
    $start_info = strpos($this->buffer, "<{$tag}");
    $this->remainder = substr($this->buffer, 0, $start_info);
    $pre_end_info = strpos($this->buffer, "</{$tag}", $start_info);
    $end_info = strpos($this->buffer, ">", $pre_end_info) + 1;
    $tag_info = substr($this->buffer, $start_info,
        $end_info - $start_info);
    $this->buffer = substr($this->buffer, $end_info);
    return array($tag_info, $tag);
}
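/* Illustrative sketch (not part of Yioop): the same close-tag-first idea
   getNextTagsData() uses, applied to a plain string instead of a buffered
   archive so it can run standalone. The tag names and sample text are
   invented for the example. */
function firstTagData($text, $tags)
{
    $close_regex = '@</(' . implode('|', $tags) . ')[^>]*?>@';
    if (!preg_match($close_regex, $text, $matches)) {
        return false; // no complete open/close pair present yet
    }
    $tag = $matches[1];
    $start = strpos($text, "<{$tag}");
    $end = strpos($text, ">", strpos($text, "</{$tag}", $start)) + 1;
    return array(substr($text, $start, $end - $start), $tag);
}
// firstTagData("junk<doc id='1'>hello</doc>rest", array("doc", "page"))
// returns array("<doc id='1'>hello</doc>", "doc")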
/**
 * Used to remove from the queue urls that are no longer crawlable
 * because the allowed and disallowed sites have changed.
 */
function cullNoncrawlableSites()
{
    $count = $this->web_queue->to_crawl_queue->count;
    crawlLog("Scheduler: " .
        " Culling noncrawlable urls after change in crawl parameters;" .
        " Queue Size {$count}");
    $start_time = microtime();
    $fh = $this->web_queue->openUrlArchive();
    $delete_urls = array();
    $i = 1;
    while ($i < $count) {
        crawlTimeoutLog("..Scheduler: " .
            "still culling noncrawlable urls. Examining " .
            "location %s in queue of %s.", $i, $count);
        $tmp = $this->web_queue->peekQueue($i, $fh);
        list($url, $weight, $flag, $probe) = $tmp;
        if (!$this->allowedToCrawlSite($url) ||
            $this->disallowedToCrawlSite($url)) {
            $delete_urls[] = $url;
        }
        $i++;
    }
    $this->web_queue->closeUrlArchive($fh);
    $new_time = microtime();
    crawlLog("...Scheduler: Done selecting cullable URLS, time so far:" .
        changeInMicrotime($start_time));
    $num_deletes = count($delete_urls);
    $k = 0;
    foreach ($delete_urls as $delete_url) {
        $k++;
        crawlTimeoutLog("..Scheduler: Removing selected url %s of %s " .
            "from queue.", $k, $num_deletes);
        if ($delete_url) {
            $this->web_queue->removeQueue($delete_url);
        } else {
            /* if there was a hash table look up error still get rid of
               index from priority queue */
            $this->web_queue->to_crawl_queue->poll($k);
        }
    }
    crawlLog("...Scheduler: Removed {$k} cullable URLS from queue in time: " .
        changeInMicrotime($new_time));
}
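/* Self-contained sketch (plain arrays instead of the web queue) of the
   two-pass pattern cullNoncrawlableSites() relies on: first scan and
   record what should go, then delete in a second pass, so queue positions
   are not shifting underneath the scan. The urls and the "allowed" test
   below are invented for the example. */
$queue = array("http://a.example.com/", "http://b.test/",
    "http://c.example.com/");
$delete_urls = array();
foreach ($queue as $url) {              // pass 1: select only
    if (strpos($url, "example.com") === false) {
        $delete_urls[] = $url;
    }
}
foreach ($delete_urls as $delete_url) { // pass 2: actually remove
    $position = array_search($delete_url, $queue);
    if ($position !== false) {
        unset($queue[$position]);
    }
}
$queue = array_values($queue); // keeps the two example.com urls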
/**
 * Used to make a reference list for a wiki page based on the
 * cite tags on that page.
 *
 * @param string $page a wiki document
 * @return array a pair ($page, $references): the processed wiki page and
 *     an HTML reference list to be inserted after the wiki page has been
 *     processed
 */
function makeReferences($page)
{
    $base_address = $this->base_address;
    $references = "\n";
    $matches = array();
    preg_match_all('/{{v?cite(.+?)}}/si', $page, $matches);
    citeCallback(NULL, 1);
    $page = preg_replace_callback('/{{v?cite?a?t?i?o?n?(.+?)}}/si',
        "citeCallback", $page);
    if (isset($matches[1])) {
        $i = 1;
        $wiki_fields = array("title", "publisher", "author", "journal",
            "book", "quote");
        foreach ($matches[1] as $reference) {
            $ref_parts = explode("|", $reference);
            $references .= "<div id=\"ref_{$i}\">{$i}." .
                "<a href=\"#cite_{$i}\">^</a>.";
            crawlTimeoutLog("..Making wiki references outer..");
            if (count($ref_parts) > 0) {
                $ref_data = array();
                $type = trim(strtolower($ref_parts[0]));
                array_shift($ref_parts);
                foreach ($ref_parts as $part) {
                    crawlTimeoutLog("..Making wiki references inner..");
                    $part_parts = explode("=", $part);
                    if (isset($part_parts[1])) {
                        $field = strtolower(trim($part_parts[0]));
                        $value = trim($part_parts[1]);
                        if (in_array($field, $wiki_fields)) {
                            $value = preg_replace($this->matches,
                                $this->replaces, $value);
                            $value = strip_tags($value,
                                '<a><b><i><span><img>');
                        }
                        $ref_data[$field] = $value;
                    }
                }
                if (!isset($ref_data['author']) && isset($ref_data['last'])
                    && isset($ref_data['first'])) {
                    $ref_data['author'] = $ref_data['last'] . ", " .
                        $ref_data['first'];
                }
                if (isset($ref_data['authorlink'])) {
                    if (!isset($ref_data['author'])) {
                        $ref_data['author'] = $ref_data['authorlink'];
                    }
                    $ref_data['author'] = "<a href=\"{$base_address}" .
                        $ref_data['author'] . "\">{$ref_data['author']}</a>";
                }
                /* additional authors may be given as last2/first2 ..
                   last5/first5 pairs; use $j here so the reference
                   counter $i is not clobbered */
                for ($j = 2; $j < 6; $j++) {
                    if (!isset($ref_data["author{$j}"]) &&
                        isset($ref_data["last{$j}"]) &&
                        isset($ref_data["first{$j}"])) {
                        $ref_data["author{$j}"] = $ref_data["last{$j}"] .
                            ", " . $ref_data["first{$j}"];
                    }
                    if (!isset($ref_data["author{$j}"])) {
                        break;
                    }
                    if (isset($ref_data["authorlink{$j}"])) {
                        if (!isset($ref_data["author{$j}"])) {
                            $ref_data["author{$j}"] =
                                $ref_data["authorlink{$j}"];
                        }
                        $ref_data["author{$j}"] =
                            "<a href=\"{$base_address}" .
                            $ref_data["author{$j}"] . "\">" .
                            $ref_data["author{$j}"] . "</a>";
                    }
                    $ref_data["author"] .= " and " . $ref_data["author{$j}"];
                }
                if (!isset($ref_data['title']) && isset($ref_data['url'])) {
                    $ref_data['title'] = $ref_data['url'];
                }
                if (isset($ref_data['title']) && isset($ref_data['url'])) {
                    $ref_data['title'] = "<a href=\"{$ref_data['url']}\">" .
                        "{$ref_data['title']}</a>";
                }
                if (isset($ref_data['quote'])) {
                    $references .= '"' . $ref_data['quote'] . '". ';
                }
                if (isset($ref_data['author'])) {
                    $references .= $ref_data['author'] . ". ";
                }
                if (isset($ref_data['title'])) {
                    $references .= '"' . $ref_data['title'] . '". ';
                }
                if (isset($ref_data['accessdate']) &&
                    !isset($ref_data['archivedate'])) {
                    $references .= '(' . $ref_data['accessdate'] . ') ';
                }
                if (isset($ref_data['archivedate'])) {
                    if (isset($ref_data['archiveurl'])) {
                        $ref_data['archivedate'] = "<a href=\"" .
                            $ref_data['archiveurl'] . "\">" .
                            $ref_data['archivedate'] . "</a>";
                    }
                    $references .= '(' . $ref_data['archivedate'] . ') ';
                }
                if (isset($ref_data['journal'])) {
                    $references .= "<i>{$ref_data['journal']}</i> ";
                }
                if (isset($ref_data['location'])) {
                    $references .= $ref_data['location'] . ". ";
                }
                if (isset($ref_data['publisher'])) {
                    $references .= $ref_data['publisher'] . ". ";
                }
                if (isset($ref_data['doi'])) {
                    $references .= "doi:" . $ref_data['doi'] . ". ";
                }
                if (isset($ref_data['isbn'])) {
                    $references .= "ISBN:" . $ref_data['isbn'] . ". ";
                }
                if (isset($ref_data['jstor'])) {
                    $references .= "JSTOR:" . $ref_data['jstor'] . ". ";
                }
                if (isset($ref_data['oclc'])) {
                    $references .= "OCLC:" . $ref_data['oclc'] . ". ";
                }
                if (isset($ref_data['volume'])) {
                    $references .= "<b>" . $ref_data['volume'] . "</b> ";
                }
                if (isset($ref_data['issue'])) {
                    $references .= "#" . $ref_data['issue'] . ". ";
                }
                if (isset($ref_data['date'])) {
                    $references .= $ref_data['date'] . ". ";
                }
                if (isset($ref_data['year'])) {
                    $references .= $ref_data['year'] . ". ";
                }
                if (isset($ref_data['page'])) {
                    $references .= "p." . $ref_data['page'] . ". ";
                }
                if (isset($ref_data['pages'])) {
                    $references .= "pp." . $ref_data['pages'] . ". ";
                }
            }
            $references .= "</div>\n";
            $i++;
        }
    }
    return array($page, $references);
}
/**
 * Gets the next at most $num many docs from the iterator. It might return
 * fewer than $num many documents if the partition changes or the end of
 * the bundle is reached.
 *
 * @param int $num number of docs to get
 * @param bool $no_process do not do any processing on page data
 * @return array associative arrays for $num pages
 */
function nextPages($num, $no_process = false)
{
    $pages = array();
    $page_count = 0;
    $db = $this->db;
    $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
    $result = $db->execute($query);
    $i = 0;
    while ($row = $db->fetchArray($result)) {
        crawlTimeoutLog("..Still getting pages from archive iterator. At %s" .
            " of %s", $i, $num);
        $page = "";
        foreach ($row as $key => $value) {
            $page .= "{$key}{$this->field_value_separator}" .
                "{$value}{$this->column_separator}";
        }
        if ($no_process) {
            $pages[] = $page;
        } else {
            $site = array();
            $site[self::HEADER] = "database_bundle_iterator extractor";
            $site[self::IP_ADDRESSES] = array("0.0.0.0");
            $site[self::TIMESTAMP] = date("U", time());
            $site[self::TYPE] = "text/plain";
            $site[self::PAGE] = $page;
            $site[self::HASH] = FetchUrl::computePageHash($page);
            $site[self::URL] = "record:" . webencode($site[self::HASH]);
            $site[self::HTTP_CODE] = 200;
            $site[self::ENCODING] = $this->encoding;
            $site[self::SERVER] = "unknown";
            $site[self::SERVER_VERSION] = "unknown";
            $site[self::OPERATING_SYSTEM] = "unknown";
            $site[self::WEIGHT] = 1;
            $pages[] = $site;
        }
        $page_count++;
    }
    $this->limit += $page_count;
    if ($page_count < $num) {
        $this->end_of_iterator = true;
    }
    $this->saveCheckpoint();
    return $pages;
}
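/* Sketch of the row serialization performed above, with stand-in separator
   strings (the real separators come from the iterator's configuration, so
   the particular characters used here are assumptions). */
$field_value_separator = ":";
$column_separator = "\n";
$row = array("ID" => 7, "TITLE" => "A Page", "BODY" => "Some text");
$page = "";
foreach ($row as $key => $value) {
    $page .= "{$key}{$field_value_separator}{$value}{$column_separator}";
}
// $page == "ID:7\nTITLE:A Page\nBODY:Some text\n"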
/**
 * Builds an inverted index shard (word --> {docs it appears in})
 * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
 * This inverted index shard is then merged by a queue_server
 * into the inverted index of the current generation of the crawl.
 * The complete inverted index for the whole crawl is built out of these
 * inverted indexes for generations. The point of computing a partial
 * inverted index on the fetcher is to reduce some of the computational
 * burden on the queue server. The resulting mini index computed by
 * buildMiniInvertedIndex() is stored in
 * $this->found_sites[self::INVERTED_INDEX]
 */
function buildMiniInvertedIndex()
{
    $start_time = microtime();
    $keypad = "";
    crawlLog(" Start building mini inverted index ... Current Memory:" .
        memory_get_usage());
    $num_seen = count($this->found_sites[self::SEEN_URLS]);
    $this->num_seen_sites += $num_seen;
    /*
        for the fetcher we are not saving the index shards so name doesn't
        matter.
    */
    if (!isset($this->found_sites[self::INVERTED_INDEX][
        $this->current_server])) {
        $this->found_sites[self::INVERTED_INDEX][$this->current_server] =
            new IndexShard("fetcher_shard_{$this->current_server}");
    }
    for ($i = 0; $i < $num_seen; $i++) {
        $interim_time = microtime();
        $site = $this->found_sites[self::SEEN_URLS][$i];
        if (!isset($site[self::HASH]) || (isset($site[self::ROBOT_METAS]) &&
            in_array("JUSTFOLLOW", $site[self::ROBOT_METAS]))) {
            continue;
        }
        $doc_rank = false;
        if ($this->crawl_type == self::ARCHIVE_CRAWL &&
            isset($this->archive_iterator)) {
            $doc_rank = $this->archive_iterator->weight($site);
        }
        if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
            $is_link = true;
            $doc_keys = $site[self::HTTP_CODE];
            $site_url = $site[self::TITLE];
            $host = UrlParser::getHost($site_url);
            $link_parts = explode('|', $site[self::HASH]);
            if (isset($link_parts[5])) {
                $link_origin = $link_parts[5];
            } else {
                $link_origin = $site_url;
            }
            $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host,
                $site[self::DESCRIPTION], $link_origin);
        } else {
            $is_link = false;
            $site_url = str_replace('|', "%7C", $site[self::URL]);
            $host = UrlParser::getHost($site_url);
            $doc_keys = crawlHash($site_url, true) .
                $site[self::HASH] . "d" . substr(crawlHash(
                $host . "/", true), 1);
            $meta_ids = PhraseParser::calculateMetas($site,
                $this->video_sources);
        }
        $word_lists = array();
        /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
        */
        $lang = NULL;
        if (!isset($site[self::JUST_METAS])) {
            $host_words = UrlParser::getWordsIfHostUrl($site_url);
            $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
            if ($is_link) {
                $phrase_string = $site[self::DESCRIPTION];
            } else {
                if (isset($site[self::LANG])) {
                    if (isset($this->programming_language_extension[
                        $site[self::LANG]])) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words . " " .
                            $site[self::DESCRIPTION];
                    }
                } else {
                    $phrase_string = $host_words . " " . $site[self::TITLE] .
                        " " . $path_words . " " . $site[self::DESCRIPTION];
                }
            }
            if (isset($site[self::LANG])) {
                $lang = guessLocaleFromString(
                    mb_substr($site[self::DESCRIPTION], 0,
                    AD_HOC_TITLE_LENGTH), $site[self::LANG]);
            }
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $len = strlen($phrase_string);
            if (isset($this->programming_language_extension[$lang]) ||
                PhraseParser::computeSafeSearchScore($word_lists, $len) <
                0.012) {
                $meta_ids[] = "safe:true";
                $safe = true;
            } else {
                $meta_ids[] = "safe:false";
                $safe = false;
            }
        }
        if (!$is_link) {
            //store inlinks so they can be searched by link text
            $num_links = count($site[self::LINKS]);
            if ($num_links > 0) {
                $link_rank = false;
                if ($doc_rank !== false) {
                    $link_rank = max($doc_rank - 1, 1);
                }
            } else {
                $link_rank = false;
            }
        }
        $num_queue_servers = count($this->queue_servers);
        if (isset($site[self::USER_RANKS]) &&
            count($site[self::USER_RANKS]) > 0) {
            $score_keys = "";
            foreach ($site[self::USER_RANKS] as $label => $score) {
                $score_keys .= packInt($score);
            }
            if (strlen($score_keys) % 8 != 0) {
                $score_keys .= $keypad;
            }
            $doc_keys .= $score_keys;
        }
        $this->found_sites[self::INVERTED_INDEX][$this->current_server]->
            addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
            $word_lists, $meta_ids, PhraseParser::$materialized_metas,
            true, $doc_rank);
        /*
            $this->no_process_links is set when doing things like mix
            recrawls. In this case links likely already will appear in
            what is being indexed, so don't index them again.
            $site[self::JUST_METAS] is set when we have a sitemap or
            robots.txt (this case is set later). In this case link info
            is not particularly useful for indexing and can greatly slow
            building the inverted index.
        */
        if (!$this->no_process_links && !isset($site[self::JUST_METAS]) &&
            !isset($this->programming_language_extension[$lang])) {
            foreach ($site[self::LINKS] as $url => $link_text) {
                /*
                    this mysterious check means won't index links from
                    robots.txt. Sitemap will still be in TO_CRAWL, but
                    that's done elsewhere
                */
                if (strlen($url) == 0 || is_numeric($url)) {
                    continue;
                }
                $link_host = UrlParser::getHost($url);
                if (strlen($link_host) == 0) {
                    continue;
                }
                $part_num = calculatePartition($link_host,
                    $num_queue_servers);
                $summary = array();
                if (!isset($this->found_sites[
                    self::LINK_SEEN_URLS][$part_num])) {
                    $this->found_sites[self::LINK_SEEN_URLS][$part_num] =
                        array();
                }
                $elink_flag = ($link_host != $host) ? true : false;
                $link_text = strip_tags($link_text);
                $ref = $elink_flag ? "eref" : "iref";
                $url = str_replace('|', "%7C", $url);
                $link_id = "url|" . $url . "|text|" . urlencode($link_text) .
                    "|{$ref}|" . $site_url;
                $elink_flag_string = $elink_flag ? "e" : "i";
                $link_keys = crawlHash($url, true) .
                    crawlHash($link_id, true) .
                    $elink_flag_string .
                    substr(crawlHash($host . "/", true), 1);
                $summary[self::URL] = $link_id;
                $summary[self::TITLE] = $url;
                // stripping html to be on the safe side
                $summary[self::DESCRIPTION] = $link_text;
                $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                $summary[self::ENCODING] = $site[self::ENCODING];
                $summary[self::HASH] = $link_id;
                $summary[self::TYPE] = "link";
                $summary[self::HTTP_CODE] = $link_keys;
                $summary[self::LANG] = $lang;
                $this->found_sites[self::LINK_SEEN_URLS][$part_num][] =
                    $summary;
                $link_word_lists = PhraseParser::extractPhrasesInLists(
                    $link_text, $lang);
                $link_meta_ids = PhraseParser::calculateLinkMetas($url,
                    $link_host, $link_text, $site_url);
                if (!isset($this->found_sites[
                    self::INVERTED_INDEX][$part_num])) {
                    $this->found_sites[self::INVERTED_INDEX][$part_num] =
                        new IndexShard("fetcher_shard_{$part_num}");
                }
                $this->found_sites[self::INVERTED_INDEX][$part_num]->
                    addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG,
                    $link_word_lists, $link_meta_ids,
                    PhraseParser::$materialized_metas, false, $link_rank);
            }
        }
        $interim_elapse = changeInMicrotime($interim_time);
        if ($interim_elapse > 5) {
            crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
        }
        crawlTimeoutLog("..Still building inverted index. Have processed " .
            "%s of %s documents.\nLast url processed was %s.",
            $i, $num_seen, $site[self::URL]);
    }
    if ($this->crawl_type == self::ARCHIVE_CRAWL) {
        $this->recrawl_check_scheduler = true;
    }
    crawlLog(" Build mini inverted index time " .
        changeInMicrotime($start_time));
}
/**
 * Makes a new HashTable without deleted rows
 *
 * The hash table in Yioop is implemented using open addressing, i.e.,
 * we store each key-value pair in the table itself and, if there is a
 * collision, we look for the next available slot. Two codes are used to
 * indicate space available in the table: one to indicate a slot that has
 * never been used, the other to indicate a slot that was used but whose
 * entry has since been deleted. Two codes are needed because if an item B
 * hashes to the same value as an existing item A, B gets stored in the
 * next empty slot; if A is later deleted, we must still be able to look
 * up B. The problem is that as the table gets reused a lot, it tends to
 * fill up with deleted entries, making lookup times more and more linear
 * in the hash table size. By rebuilding the table we mitigate this
 * problem. By choosing the rebuild frequency appropriately, the amortized
 * cost of this operation is only O(1).
 */
function rebuildHashTable()
{
    crawlLog("Rebuilding Hash table");
    $num_values = $this->to_crawl_table->num_values;
    $tmp_table = $this->constructHashTable(
        $this->dir_name . "/tmp_table.dat", $num_values);
    $null = $this->to_crawl_table->null;
    $deleted = $this->to_crawl_table->deleted;
    for ($i = 0; $i < $num_values; $i++) {
        crawlTimeoutLog("..still rebuilding hash table. At entry %s of %s",
            $i, $num_values);
        list($key, $value) = $this->to_crawl_table->getEntry($i);
        if (strcmp($key, $null) != 0 && strcmp($key, $deleted) != 0) {
            $tmp_table->insert($key, $value);
        }
    }
    $this->to_crawl_table = NULL;
    gc_collect_cycles();
    if (file_exists($this->dir_name . "/hash_table.dat")) {
        unlink($this->dir_name . "/hash_table.dat");
        if (file_exists($this->dir_name . "/tmp_table.dat")) {
            rename($this->dir_name . "/tmp_table.dat",
                $this->dir_name . "/hash_table.dat");
        }
    }
    $tmp_table->filename = $this->dir_name . "/hash_table.dat";
    $this->to_crawl_table = $tmp_table;
}
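/* Minimal self-contained sketch (not the Yioop HashTable class) of why
   open addressing needs separate "never used" and "deleted" markers:
   lookup may stop probing at a never-used slot, but must keep probing
   past a deleted one, otherwise entries stored after a collision become
   unreachable once the colliding key is removed. */
class ToyOpenAddressTable
{
    const NEVER_USED = 0;
    const DELETED = 1;
    private $slots;
    private $size;
    function __construct($size)
    {
        $this->size = $size;
        $this->slots = array_fill(0, $size, self::NEVER_USED);
    }
    function insert($key, $value)
    {
        for ($p = 0; $p < $this->size; $p++) {
            $i = (abs(crc32($key)) + $p) % $this->size;
            if (!is_array($this->slots[$i])) { // never-used or deleted slot
                $this->slots[$i] = array($key, $value);
                return true;
            }
        }
        return false; // table full
    }
    function lookup($key)
    {
        for ($p = 0; $p < $this->size; $p++) {
            $i = (abs(crc32($key)) + $p) % $this->size;
            if ($this->slots[$i] === self::NEVER_USED) {
                return false; // key cannot be stored further along
            }
            if (is_array($this->slots[$i]) && $this->slots[$i][0] === $key) {
                return $this->slots[$i][1];
            }
            // DELETED slot or other key: keep probing
        }
        return false;
    }
    function delete($key)
    {
        for ($p = 0; $p < $this->size; $p++) {
            $i = (abs(crc32($key)) + $p) % $this->size;
            if ($this->slots[$i] === self::NEVER_USED) {
                return;
            }
            if (is_array($this->slots[$i]) && $this->slots[$i][0] === $key) {
                $this->slots[$i] = self::DELETED; // tombstone, not NEVER_USED
                return;
            }
        }
    }
}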
/**
 * Used to recompute both the index shards and the dictionary
 * of an index archive. The first step involves re-extracting the
 * words into an inverted index from the summaries' web_archives.
 * Then a reindex is done.
 *
 * @param string $archive_path file path to a IndexArchiveBundle
 */
function rebuildIndexArchive($archive_path)
{
    $archive_type = $this->getArchiveKind($archive_path);
    if ($archive_type != "IndexArchiveBundle") {
        $this->badFormatMessageAndExit($archive_path);
    }
    $info = $archive_type::getArchiveInfo($archive_path);
    $tmp = unserialize($info["DESCRIPTION"]);
    $video_sources = $tmp[self::VIDEO_SOURCES];
    $generation_info = unserialize(
        file_get_contents("{$archive_path}/generation.txt"));
    $num_generations = $generation_info['ACTIVE'] + 1;
    $archive = new WebArchiveBundle($archive_path . "/summaries");
    $seen = 0;
    $generation = 0;
    $keypad = "";
    while ($generation < $num_generations) {
        $partition = $archive->getPartition($generation, false);
        $shard_name = $archive_path .
            "/posting_doc_shards/index{$generation}";
        crawlLog("Processing partition {$generation}");
        if (file_exists($shard_name)) {
            crawlLog("..Unlinking old shard {$generation}");
            @unlink($shard_name);
        }
        $shard = new IndexShard($shard_name, $generation,
            NUM_DOCS_PER_GENERATION, true);
        $seen_partition = 0;
        while ($seen_partition < $partition->count) {
            $num_to_get = min($partition->count - $seen_partition, 8000);
            $offset = $partition->iterator_pos;
            $objects = $partition->nextObjects($num_to_get);
            $cnt = 0;
            foreach ($objects as $object) {
                $cnt++;
                $site = $object[1];
                if (isset($site[self::TYPE]) &&
                    $site[self::TYPE] == "link") {
                    $is_link = true;
                    $doc_keys = $site[self::HTTP_CODE];
                    $site_url = $site[self::TITLE];
                    $host = UrlParser::getHost($site_url);
                    $link_parts = explode('|', $site[self::HASH]);
                    if (isset($link_parts[5])) {
                        $link_origin = $link_parts[5];
                    } else {
                        $link_origin = $site_url;
                    }
                    $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                        $host, $site[self::DESCRIPTION], $link_origin);
                    $link_to = "LINK TO:";
                } else {
                    $is_link = false;
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $doc_keys = crawlHash($site_url, true) .
                        $site[self::HASH] . "d" . substr(crawlHash(
                        $host . "/", true), 1);
                    $meta_ids = PhraseParser::calculateMetas($site,
                        $video_sources);
                    $link_to = "";
                }
                $so_far_cnt = $seen_partition + $cnt;
                $time_out_message = "..still processing {$so_far_cnt} " .
                    "of {$partition->count} in partition {$generation}." .
                    "\n..Last processed was: " .
                    ($seen + 1) . ". {$link_to}{$site_url}. ";
                crawlTimeoutLog($time_out_message);
                $seen++;
                $word_lists = array();
                /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                */
                $lang = NULL;
                if (!isset($site[self::JUST_METAS])) {
                    $host_words = UrlParser::getWordsIfHostUrl($site_url);
                    $path_words = UrlParser::getWordsLastPathPartUrl(
                        $site_url);
                    if ($is_link) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words . " " .
                            $site[self::DESCRIPTION];
                    }
                    if (isset($site[self::LANG])) {
                        $lang = guessLocaleFromString(
                            mb_substr($site[self::DESCRIPTION], 0,
                            AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                    }
                    $word_lists = PhraseParser::extractPhrasesInLists(
                        $phrase_string, $lang);
                    $len = strlen($phrase_string);
                    if (PhraseParser::computeSafeSearchScore($word_lists,
                        $len) < 0.012) {
                        $meta_ids[] = "safe:true";
                        $safe = true;
                    } else {
                        $meta_ids[] = "safe:false";
                        $safe = false;
                    }
                }
                if (isset($site[self::USER_RANKS]) &&
                    count($site[self::USER_RANKS]) > 0) {
                    $score_keys = "";
                    foreach ($site[self::USER_RANKS] as $label => $score) {
                        $score_keys .= packInt($score);
                    }
                    if (strlen($score_keys) % 8 != 0) {
                        $score_keys .= $keypad;
                    }
                    $doc_keys .= $score_keys;
                }
                $shard->addDocumentWords($doc_keys, $offset, $word_lists,
                    $meta_ids, PhraseParser::$materialized_metas,
                    true, false);
                $offset = $object[0];
            }
            $seen_partition += $num_to_get;
        }
        $shard->save(false, true);
        $generation++;
    }
    $this->reindexIndexArchive($archive_path);
}
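/* Hypothetical invocation sketch: $arc_tool and the archive path below are
   made-up names for whatever object and index archive bundle are being
   rebuilt. The directory layout referenced by the code above is:
   summaries/ holds the web archive partitions, generation.txt records the
   ACTIVE generation, and posting_doc_shards/index<N> is the shard rebuilt
   for generation N. */
$archive_path = "/work_directory/cache/IndexData1396025309";
$arc_tool->rebuildIndexArchive($archive_path);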
/**
 * Copies all feed items newer than $age to a new shard, then deletes the
 * old index shard and database entries older than $age. Finally, sets the
 * copied shard to be active. If this method is going to take more than
 * max_execution_time/2 it returns false, so an additional job can be
 * scheduled; otherwise it returns true.
 *
 * @param int $age how many seconds old records should be deleted
 * @return bool whether the job executed to completion
 */
function rebuildFeedShard($age)
{
    $time = time();
    $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
    $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
    $prune_shard = new IndexShard($prune_shard_name);
    $too_old = $time - $age;
    if (!$prune_shard) {
        return false;
    }
    $pre_feeds = $this->getNewsSources();
    if (!$pre_feeds) {
        return false;
    }
    $feeds = array();
    foreach ($pre_feeds as $pre_feed) {
        if (!isset($pre_feed['NAME'])) {
            continue;
        }
        $feeds[$pre_feed['NAME']] = $pre_feed;
    }
    $db = $this->db;
    // we now rebuild the inverted index with the remaining items
    $sql = "SELECT * FROM FEED_ITEM " .
        "WHERE PUBDATE >= ? " .
        "ORDER BY PUBDATE DESC";
    $result = $db->execute($sql, array($too_old));
    if ($result) {
        $completed = true;
        crawlLog("..still deleting. Making new index of non-pruned items.");
        $i = 0;
        while ($item = $db->fetchArray($result)) {
            crawlTimeoutLog("..have added %s non-pruned items to index.",
                $i);
            $i++;
            if (!isset($item['SOURCE_NAME'])) {
                continue;
            }
            $source_name = $item['SOURCE_NAME'];
            if (isset($feeds[$source_name])) {
                $lang = $feeds[$source_name]['LANGUAGE'];
            } else {
                $lang = "";
            }
            $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $raw_guid = unbase64Hash($item["GUID"]);
            $doc_keys = crawlHash($item["LINK"], true) .
                $raw_guid . "d" . substr(crawlHash(
                UrlParser::getHost($item["LINK"]) . "/", true), 1);
            $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
                $source_name, $item["GUID"]);
            $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'],
                $word_lists, $meta_ids,
                PhraseParser::$materialized_metas, true, false);
        }
    }
    $prune_shard->save();
    @chmod($prune_shard_name, 0777);
    @chmod($feed_shard_name, 0777);
    @rename($prune_shard_name, $feed_shard_name);
    @chmod($feed_shard_name, 0777);
    $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
    $db->execute($sql, array($too_old));
}
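/* Self-contained sketch of the prune-and-swap idea used above: build the
   pruned copy under a temporary name, then rename() it over the live file
   so readers always see a complete index. The file names, items, and the
   "too old" test are invented for the example. */
$live = sys_get_temp_dir() . "/feeds_index_example";
$prune = sys_get_temp_dir() . "/feeds_prune_index_example";
file_put_contents($live, "old-item\nrecent-item\n");
$keep = array();
foreach (file($live, FILE_IGNORE_NEW_LINES) as $line) {
    if ($line != "old-item") { // stand-in for the PUBDATE >= $too_old check
        $keep[] = $line;
    }
}
file_put_contents($prune, implode("\n", $keep) . "\n");
rename($prune, $live); // atomic swap when both are on the same filesystem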
/**
 * Make multi_curl requests for an array of sites with urls or onion urls
 *
 * @param array $sites an array containing urls of pages to request
 * @param bool $timer flag, true means print timing statistics to log
 * @param int $page_range_request maximum number of bytes to download/page
 *     0 means download all
 * @param string $temp_dir folder to store temporary ip header info
 * @param string $key the component of $sites[$i] that has the value of
 *     a url to get; defaults to CrawlConstants::URL
 * @param string $value component of $sites[$i] in which to store the
 *     page that was gotten
 * @param bool $minimal if true do a faster request of pages by not
 *     doing things like extracting the HTTP headers sent, etc.
 * @param array $post_data data to be POST'd to each site
 * @param bool $follow whether to follow redirects or not
 * @param string $tor_proxy url of a proxy that knows how to download
 *     .onion urls
 * @param array $proxy_servers if not array(), then an array of proxy
 *     servers to use rather than to directly download web pages from
 *     the current machine
 *
 * @return array an updated array with the contents of those pages
 */
static function getPages($sites, $timer = false,
    $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
    $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
    $minimal = false, $post_data = NULL, $follow = false,
    $tor_proxy = "", $proxy_servers = array())
{
    $agent_handler = curl_multi_init();
    $active = NULL;
    $start_time = microtime();
    if (!$minimal && $temp_dir == NULL) {
        $temp_dir = CRAWL_DIR . "/temp";
        if (!file_exists($temp_dir)) {
            mkdir($temp_dir);
        }
    }
    //Set-up requests
    $num_sites = count($sites);
    for ($i = 0; $i < $num_sites; $i++) {
        $is_gopher = false;
        $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
        if (isset($sites[$i][$key])) {
            list($sites[$i][$key], $url, $headers) =
                self::prepareUrlHeaders($sites[$i][$key], $minimal,
                $proxy_servers);
            if ($headers == "gopher") {
                $is_gopher = true;
                $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                $headers = array();
            }
            $sites[$i][0] = curl_init();
            if (!$minimal) {
                $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
            }
            curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
            curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
                CURL_IPRESOLVE_WHATEVER);
            curl_setopt($sites[$i][0], CURLOPT_URL, $url);
            if (strcmp(substr($url, -10), "robots.txt") == 0) {
                $sites[$i]['ROBOT'] = true;
                $follow = true;
                /* wikipedia redirects their robot page.
                   grr want to force this for robots pages */
            }
            curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
            curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
            curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
            curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
            curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
            curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
            if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                //CURLPROXY_SOCKS5_HOSTNAME = 7
                curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                if ($timer) {
                    crawlLog("Using Tor proxy for {$url}..");
                }
            } elseif ($proxy_servers != array() && !$is_gopher) {
                $select_proxy = rand(0, count($proxy_servers) - 1);
                $proxy_server = $proxy_servers[$select_proxy];
                $proxy_parts = explode(":", $proxy_server);
                $proxy_ip = $proxy_parts[0];
                if (!isset($proxy_parts[2]) ||
                    strtolower($proxy_parts[2]) == 'http') {
                    $proxy_type = CURLPROXY_HTTP;
                } elseif (strtolower($proxy_parts[2]) == 'socks5') {
                    $proxy_type = CURLPROXY_SOCKS5;
                } else {
                    $proxy_type = $proxy_parts[2];
                }
                if (isset($proxy_parts[1])) {
                    $proxy_port = $proxy_parts[1];
                } else {
                    $proxy_port = "80";
                }
                curl_setopt($sites[$i][0], CURLOPT_PROXY,
                    "{$proxy_ip}:{$proxy_port}");
                curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                if ($timer) {
                    crawlLog("Selecting proxy {$select_proxy} for {$url}");
                }
            }
            if (!$minimal) {
                curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
            }
            //make lighttpd happier
            if (!$is_gopher) {
                curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
            }
            curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
            // ^ need to set for sites like att that use gzip
            if ($page_range_request > 0) {
                curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" .
                    $page_range_request);
            }
            if ($post_data != NULL) {
                curl_setopt($sites[$i][0], CURLOPT_POST, true);
                curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
                    $post_data[$i]);
            }
            curl_multi_add_handle($agent_handler, $sites[$i][0]);
        }
    }
    if ($timer) {
        crawlLog(" Init Get Pages " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    $start = time();
    //Wait for responses
    $running = NULL;
    $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
    do {
        $mrc = curl_multi_exec($agent_handler, $running);
        $ready = curl_multi_select($agent_handler, 0.005);
    } while (memory_get_usage() < $memory_limit &&
        time() - $start < PAGE_TIMEOUT && $running > 0);
    if (time() - $start > PAGE_TIMEOUT && $timer) {
        crawlLog(" TIMED OUT!!!");
    }
    if ($timer) {
        crawlLog(" Page Request time " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    //Process returned pages
    for ($i = 0; $i < $num_sites; $i++) {
        if ($timer) {
            crawlTimeoutLog("fetch_url initial processing of page %s of %s",
                $i, $num_sites);
        }
        if (!$minimal && isset($ip_holder[$i])) {
            rewind($ip_holder[$i]);
            $header = fread($ip_holder[$i], 8192);
            $ip_addresses = self::getCurlIp($header);
            fclose($ip_holder[$i]);
        }
        $is_gopher = false;
        if (isset($sites[$i][0]) && $sites[$i][0]) {
            // Get Data and Message Code
            $content = @curl_multi_getcontent($sites[$i][0]);
            $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
            /*
                If the Transfer-encoding was chunked then the Range header
                we sent was ignored. So we manually truncate the data here
            */
            if ($page_range_request > 0) {
                $content = substr($content, 0, $page_range_request);
            }
            if (isset($content) && !$minimal && !$is_gopher) {
                $site = self::parseHeaderPage($content, $value);
                $sites[$i] = array_merge($sites[$i], $site);
                if (isset($header)) {
                    $header = substr($header, 0,
                        strpos($header, "\r\n\r\n") + 4);
                } else {
                    $header = "";
                }
                $sites[$i][CrawlConstants::HEADER] =
                    $header . $sites[$i][CrawlConstants::HEADER];
                unset($header);
            } elseif (isset($content) && !$minimal && $is_gopher) {
                $sites[$i][CrawlConstants::HEADER] = $header;
                $sites[$i][$value] = $content;
                unset($header);
            } else {
                $sites[$i][$value] = $content;
            }
            if (!$minimal) {
                $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                    CURLINFO_SIZE_DOWNLOAD);
                $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
                    CURLINFO_NAMELOOKUP_TIME);
                $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
                    CURLINFO_TOTAL_TIME);
                $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0],
                    CURLINFO_HTTP_CODE);
                if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                    $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                } elseif ($is_gopher) {
                    // gopher responses have no HTTP status line, so treat
                    // a completed response as a 200
                    $sites[$i][self::HTTP_CODE] = 200;
                }
                if ($ip_addresses) {
                    $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                } else {
                    $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                }
                //Get Time, Mime type and Character encoding
                $sites[$i][self::TIMESTAMP] = time();
                if ($is_gopher) {
                    $path = UrlParser::getPath($sites[$i][self::URL]);
                    $filename = UrlParser::getDocumentFilename(
                        $sites[$i][self::URL]);
                    if (isset($path[1])) {
                        $gopher_type = $path[1];
                    } else {
                        $gopher_type = 1;
                    }
                    if ($gopher_type == 1) {
                        $sites[$i][self::TYPE] = "text/gopher";
                    } elseif (in_array($gopher_type, array(0, 3, 6))) {
                        $sites[$i][self::TYPE] = "text/plain";
                        if ($gopher_type == 6) {
                            $sites[$i][$value] = convert_uudecode($content);
                        }
                    } elseif ($gopher_type == 'h') {
                        $sites[$i][self::TYPE] = "text/html";
                    } elseif ($gopher_type == 'g') {
                        $sites[$i][self::TYPE] = "image/gif";
                    }
                    $path_info = pathinfo($filename);
                    if (!isset($sites[$i][self::TYPE]) &&
                        isset($path_info['extension'])) {
                        $sites[$i][self::TYPE] =
                            UrlParser::guessMimeTypeFromFileName($filename);
                    } elseif (!isset($sites[$i][self::TYPE])) {
                        $sites[$i][self::TYPE] = "unknown";
                    }
                } else {
                    $type_parts = explode(";", curl_getinfo($sites[$i][0],
                        CURLINFO_CONTENT_TYPE));
                    $sites[$i][self::TYPE] =
                        strtolower(trim($type_parts[0]));
                }
            }
            //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
            curl_close($sites[$i][0]);
            if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                if (isset($sites[$i][self::TYPE]) &&
                    $sites[$i][self::TYPE] != "text/plain" &&
                    isset($sites[$i][CrawlConstants::LOCATION]) &&
                    count($sites[$i][CrawlConstants::LOCATION]) > 0) {
                    $sites[$i][self::TYPE] = "text/plain";
                    $sites[$i][self::HTTP_CODE] = "200";
                    $tmp = wordwrap($sites[$i][$value], 80);
                    $tmp_parts = explode("\n", $tmp);
                    $tmp = "# Suspect server misconfiguration\n";
                    $tmp .= "# Assume shouldn't crawl this site.\n";
                    $tmp .= "# Pretending got following robots.txt.\n";
                    $tmp .= "User-agent: *\n";
                    $tmp .= "Disallow: /\n";
                    $tmp .= "# Original error code: " .
                        $sites[$i][self::HTTP_CODE] . "\n";
                    $tmp .= "# Original content:\n";
                    foreach ($tmp_parts as $part) {
                        $tmp .= "#" . $part . "\n";
                    }
                    $sites[$i][$value] = $tmp;
                    $sites[$i][self::HTTP_CODE] = "200";
                    unset($sites[$i][CrawlConstants::LOCATION]);
                }
            }
        } //end big if
    } //end for
    if ($timer) {
        crawlLog(" Get Page Content time " . changeInMicrotime($start_time));
    }
    curl_multi_close($agent_handler);
    return $sites;
}
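/* Hypothetical call sketch: assumes the Yioop defines (USER_AGENT,
   PAGE_TIMEOUT, PAGE_RANGE_REQUEST, CRAWL_DIR) are loaded and that this
   method lives on the FetchUrl class (FetchUrl appears elsewhere in this
   code); the urls are invented. Each element of $site_pages comes back
   with the downloaded page under CrawlConstants::PAGE plus HTTP_CODE,
   TYPE, IP_ADDRESSES, and timing fields filled in. */
$sites = array(
    array(CrawlConstants::URL => "http://www.example.com/"),
    array(CrawlConstants::URL => "http://www.example.org/robots.txt"),
);
$site_pages = FetchUrl::getPages($sites, true);
foreach ($site_pages as $site) {
    crawlLog($site[CrawlConstants::URL] . " returned HTTP code " .
        $site[CrawlConstants::HTTP_CODE]);
}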
/**
 * Gets the next $num many docs from the iterator
 *
 * @param int $num number of docs to get
 * @param bool $no_process this flag is inherited from base class but
 *     does not do anything in this case
 * @return array associative arrays for $num pages
 */
function nextPages($num, $no_process = false)
{
    if ($num + $this->overall_index >= $this->count) {
        $num = max($this->count - $this->overall_index, 0);
    }
    $num_to_get = 1;
    $objects = array();
    for ($i = 0; $i < $num; $i += $num_to_get) {
        crawlTimeoutLog("..Still getting pages from archive iterator. At %s" .
            " of %s", $i, $num);
        $num_to_get = min($num,
            $this->partition->count - $this->partition_index);
        $pre_new_objects = $this->partition->nextObjects($num_to_get);
        foreach ($pre_new_objects as $object) {
            $objects[] = $object[1];
        }
        $this->overall_index += $num_to_get;
        $this->partition_index += $num_to_get;
        if ($num_to_get <= 0) {
            $this->current_partition_num++;
            $this->partition = $this->archive->getPartition(
                $this->current_partition_num, false);
            $this->partition_index = 0;
        }
        if ($this->current_partition_num > $this->num_partitions) {
            break;
        }
    }
    $this->end_of_iterator =
        ($this->overall_index >= $this->count) ? true : false;
    $this->saveCheckpoint();
    return $objects;
}