/**
  * Used to extract the data between the open and close tags of the first
  * tag found amongst the array of tags $tags. After the operation,
  * $this->buffer holds the contents after the close tag.
  *
  * @param array $tags array of tag names to look for
  *
  * @return array|false two elements: the first is a string consisting of
  *     the start tag, contents, and close tag of the first tag found; the
  *     second is the name of the tag amongst $tags that was found. Returns
  *     false if end of file is reached before any such tag is found.
  */
 function getNextTagsData($tags)
 {
     $close_regex = '@</(' . implode('|', $tags) . ')[^>]*?>@';
     $offset = 0;
     while (!preg_match($close_regex, $this->buffer, $matches, PREG_OFFSET_CAPTURE, $offset)) {
         if (!$this->checkFileHandle() || $this->checkEof()) {
             return false;
         }
         /*
            Get the next block; the block iterator can very occasionally
            return a bad block if a block header pattern happens to show up
            in compressed data, in which case decompression will fail. We
            want to skip over these false blocks and get back to real
            blocks.
         */
         while (!is_string($block = $this->getFileBlock())) {
             crawlTimeoutLog("..still getting next tags data..");
             if ($this->checkEof()) {
                 return false;
             }
         }
         $this->buffer .= $block;
     }
     $tag = $matches[1][0];
     $start_info = strpos($this->buffer, "<{$tag}");
     $this->remainder = substr($this->buffer, 0, $start_info);
     $pre_end_info = strpos($this->buffer, "</{$tag}", $start_info);
     $end_info = strpos($this->buffer, ">", $pre_end_info) + 1;
     $tag_info = substr($this->buffer, $start_info, $end_info - $start_info);
     $this->buffer = substr($this->buffer, $end_info);
     return array($tag_info, $tag);
 }
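
A minimal usage sketch of getNextTagsData(): the caller supplies the candidate tag names and keeps reading records until the method signals end of file. The $extractor object and the processRecord() helper below are hypothetical stand-ins for the class this method belongs to and for whatever the caller does with each record.

// Pull successive <doc>...</doc> or <page>...</page> records out of the
// extractor's buffer (and the underlying file) until end of file.
while (($tag_data = $extractor->getNextTagsData(array("doc", "page"))) !== false) {
    list($record, $tag_name) = $tag_data;
    // $record runs from the start tag through the matching close tag
    processRecord($record, $tag_name);   // hypothetical consumer
}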
Example #2
 /**
  * Used to remove from the queue urls that are no longer crawlable
  * because the allowed and disallowed sites have changed.
  */
 function cullNoncrawlableSites()
 {
     $count = $this->web_queue->to_crawl_queue->count;
     crawlLog("Scheduler: " . " Culling noncrawlable urls after change in crawl parameters;" . " Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
     $delete_urls = array();
     $i = 1;
     while ($i < $count) {
         crawlTimeoutLog("..Scheduler: " . "still culling noncrawlable urls. Examining " . "location %s in queue of %s.", $i, $count);
         $tmp = $this->web_queue->peekQueue($i, $fh);
         list($url, $weight, $flag, $probe) = $tmp;
         if (!$this->allowedToCrawlSite($url) || $this->disallowedToCrawlSite($url)) {
             $delete_urls[] = $url;
         }
         $i++;
     }
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     crawlLog("...Scheduler: Done selecting cullable URLS, time so far:" . changeInMicrotime($start_time));
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Scheduler: Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
              /*  if there was a hash table lookup error, still get rid of
                  the index from the priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} cullable URLS  from queue in time: " . changeInMicrotime($new_time));
 }
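
The allowedToCrawlSite() and disallowedToCrawlSite() checks used above are defined elsewhere; as a rough illustration only, a prefix-based test of the kind such checks could perform might look like the sketch below. The helper name and the $allowed_sites / $disallowed_sites arrays are assumptions for this sketch, not Yioop's actual implementation.

// Sketch only: true if $url begins with any of the given site prefixes.
function urlMatchesSitePrefixes($url, $site_prefixes)
{
    foreach ($site_prefixes as $prefix) {
        if (strncmp($url, $prefix, strlen($prefix)) == 0) {
            return true;
        }
    }
    return false;
}
// A queued url is culled when it is not allowed, or is explicitly disallowed,
// mirroring the condition used in cullNoncrawlableSites() above.
$cull = !urlMatchesSitePrefixes($url, $allowed_sites) ||
    urlMatchesSitePrefixes($url, $disallowed_sites);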
Example #3
 /**
  * Used to make a reference list for a wiki page based on the
  * cite tags on that page.
  *
  * @param string $page a wiki document
  * @return array a pair consisting of the page with its cite tags replaced
  *     by numbered links, and an HTML reference list to be inserted after
  *     the processed wiki page
  */
 function makeReferences($page)
 {
     $base_address = $this->base_address;
     $references = "\n";
     $matches = array();
     preg_match_all('/{{v?cite(.+?)}}/si', $page, $matches);
     citeCallback(NULL, 1);
     $page = preg_replace_callback('/{{v?cite?a?t?i?o?n?(.+?)}}/si', "citeCallback", $page);
     if (isset($matches[1])) {
         $i = 1;
         $wiki_fields = array("title", "publisher", "author", "journal", "book", "quote");
         foreach ($matches[1] as $reference) {
             $ref_parts = explode("|", $reference);
             $references .= "<div id=\"ref_{$i}\">{$i}." . "<a href=\"#cite_{$i}\">^</a>.";
             crawlTimeoutLog("..Making wiki references outer..");
             if (count($ref_parts) > 0) {
                 $ref_data = array();
                 $type = trim(strtolower($ref_parts[0]));
                 array_shift($ref_parts);
                 foreach ($ref_parts as $part) {
                     crawlTimeoutLog("..Making wiki references inner..");
                     $part_parts = explode("=", $part);
                     if (isset($part_parts[1])) {
                         $field = strtolower(trim($part_parts[0]));
                         $value = trim($part_parts[1]);
                         if (in_array($field, $wiki_fields)) {
                             $value = preg_replace($this->matches, $this->replaces, $value);
                             $value = strip_tags($value, '<a><b><i><span><img>');
                         }
                         $ref_data[$field] = $value;
                     }
                 }
                 if (!isset($ref_data['author']) && isset($ref_data['last']) && isset($ref_data['first'])) {
                     $ref_data['author'] = $ref_data['last'] . ", " . $ref_data['first'];
                 }
                 if (isset($ref_data['authorlink'])) {
                     if (!isset($ref_data['author'])) {
                         $ref_data['author'] = $ref_data['authorlink'];
                     }
                     $ref_data['author'] = "<a href=\"{$base_address}" . $ref_data['author'] . "\">{$ref_data['author']}</a>";
                 }
                  // use $j here so the reference counter $i is not clobbered
                  for ($j = 2; $j < 6; $j++) {
                      if (!isset($ref_data["author{$j}"]) && isset($ref_data["last{$j}"]) && isset($ref_data["first{$j}"])) {
                          $ref_data["author{$j}"] = $ref_data["last{$j}"] . ", " . $ref_data["first{$j}"];
                      }
                      if (!isset($ref_data["author{$j}"])) {
                          break;
                      }
                      if (isset($ref_data["authorlink{$j}"])) {
                          if (!isset($ref_data["author{$j}"])) {
                              $ref_data["author{$j}"] = $ref_data["authorlink{$j}"];
                          }
                          $ref_data["author{$j}"] = "<a href=\"{$base_address}" . $ref_data["author{$j}"] . "\">" . $ref_data["author{$j}"] . "</a>";
                      }
                      $ref_data["author"] .= " and " . $ref_data["author{$j}"];
                  }
                 if (!isset($ref_data['title']) && isset($ref_data['url'])) {
                     $ref_data['title'] = $ref_data['url'];
                 }
                 if (isset($ref_data['title']) && isset($ref_data['url'])) {
                     $ref_data['title'] = "<a href=\"{$ref_data['url']}\">" . "{$ref_data['title']}</a>";
                 }
                 if (isset($ref_data['quote'])) {
                     $references .= '"' . $ref_data['quote'] . '". ';
                 }
                 if (isset($ref_data['author'])) {
                     $references .= $ref_data['author'] . ". ";
                 }
                 if (isset($ref_data['title'])) {
                     $references .= '"' . $ref_data['title'] . '". ';
                 }
                 if (isset($ref_data['accessdate']) && !isset($ref_data['archivedate'])) {
                     $references .= '(' . $ref_data['accessdate'] . ') ';
                 }
                 if (isset($ref_data['archivedate'])) {
                     if (isset($ref_data['archiveurl'])) {
                         $ref_data['archivedate'] = "<a href=\"" . $ref_data['archiveurl'] . "\">" . $ref_data['archivedate'] . "</a>";
                     }
                     $references .= '(' . $ref_data['archivedate'] . ') ';
                 }
                 if (isset($ref_data['journal'])) {
                     $references .= "<i>{$ref_data['journal']}</i> ";
                 }
                 if (isset($ref_data['location'])) {
                     $references .= $ref_data['location'] . ". ";
                 }
                 if (isset($ref_data['publisher'])) {
                     $references .= $ref_data['publisher'] . ". ";
                 }
                 if (isset($ref_data['doi'])) {
                     $references .= "doi:" . $ref_data['doi'] . ". ";
                 }
                 if (isset($ref_data['isbn'])) {
                     $references .= "ISBN:" . $ref_data['isbn'] . ". ";
                 }
                 if (isset($ref_data['jstor'])) {
                     $references .= "JSTOR:" . $ref_data['jstor'] . ". ";
                 }
                 if (isset($ref_data['oclc'])) {
                     $references .= "OCLC:" . $ref_data['oclc'] . ". ";
                 }
                 if (isset($ref_data['volume'])) {
                     $references .= "<b>" . $ref_data['volume'] . "</b> ";
                 }
                 if (isset($ref_data['issue'])) {
                     $references .= "#" . $ref_data['issue'] . ". ";
                 }
                 if (isset($ref_data['date'])) {
                     $references .= $ref_data['date'] . ". ";
                 }
                 if (isset($ref_data['year'])) {
                     $references .= $ref_data['year'] . ". ";
                 }
                 if (isset($ref_data['page'])) {
                     $references .= "p." . $ref_data['page'] . ". ";
                 }
                 if (isset($ref_data['pages'])) {
                     $references .= "pp." . $ref_data['pages'] . ". ";
                 }
             }
             $references .= "</div>\n";
             $i++;
         }
     }
     return array($page, $references);
 }
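
A small usage sketch of makeReferences(), assuming a $parser object of the class this method belongs to (with base_address, matches, and replaces already set up) and the {{cite ...}} syntax matched by the regular expressions above:

// Hypothetical wiki text containing one cite tag.
$wiki_page = 'Search has a long history.{{cite book|title=Search Engines|' .
    'author=Smith, J.|year=2010|publisher=Example Press}} More text...';
list($wiki_page, $references) = $parser->makeReferences($wiki_page);
// citeCallback() has replaced the {{cite ...}} tag in $wiki_page with a
// numbered link; $references holds the matching <div id="ref_1">...</div>
// entry, ready to be appended after the processed page.
$wiki_page .= $references;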
 /**
  * Gets the next at most $num many docs from the iterator. It might return
  * fewer than $num many documents if the partition changes or the end of the
  * bundle is reached.
  *
  * @param int $num number of docs to get
  * @param bool $no_process do not do any processing on page data
  * @return array associative arrays for $num pages
  */
 function nextPages($num, $no_process = false)
 {
     $pages = array();
     $page_count = 0;
     $db = $this->db;
     $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
     $result = $db->execute($query);
     $i = 0;
      while ($row = $db->fetchArray($result)) {
          $i++;
          crawlTimeoutLog("..Still getting pages from archive iterator. At %s" . " of %s", $i, $num);
         $page = "";
         foreach ($row as $key => $value) {
             $page .= "{$key}{$this->field_value_separator}" . "{$value}{$this->column_separator}";
         }
         if ($no_process) {
             $pages[] = $page;
         } else {
             $site = array();
             $site[self::HEADER] = "database_bundle_iterator extractor";
             $site[self::IP_ADDRESSES] = array("0.0.0.0");
             $site[self::TIMESTAMP] = date("U", time());
             $site[self::TYPE] = "text/plain";
             $site[self::PAGE] = $page;
             $site[self::HASH] = FetchUrl::computePageHash($page);
             $site[self::URL] = "record:" . webencode($site[self::HASH]);
             $site[self::HTTP_CODE] = 200;
             $site[self::ENCODING] = $this->encoding;
             $site[self::SERVER] = "unknown";
             $site[self::SERVER_VERSION] = "unknown";
             $site[self::OPERATING_SYSTEM] = "unknown";
             $site[self::WEIGHT] = 1;
             $pages[] = $site;
         }
         $page_count++;
     }
     $this->limit += $page_count;
     if ($page_count < $num) {
         $this->end_of_iterator = true;
     }
     $this->saveCheckpoint();
     return $pages;
 }
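
A usage sketch for this iterator method, assuming an $iterator instance of the containing class that has already been initialized (the indexPage() consumer is hypothetical):

// Drain the iterator in batches of 100 rows.
while (!$iterator->end_of_iterator) {
    $pages = $iterator->nextPages(100);
    foreach ($pages as $page) {
        // With $no_process false, each $page is an associative array keyed
        // on CrawlConstants fields (self::URL, self::PAGE, self::HASH, ...).
        indexPage($page);
    }
}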
Example #5
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
      $keypad = "\x00\x00\x00\x00"; // used to pad score keys to an 8-byte boundary
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
              // compute a rank to assign this document's outgoing links,
              // derived from the doc rank when one is available
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
          /*
             $this->no_process_links is set when doing things like
             mix recrawls. In that case, links will likely already appear
             in what is being indexed, so don't index them again.
             $site[self::JUST_METAS] is set when we have a sitemap or a
             robots.txt file (set later in those cases). For such pages link
             info is not particularly useful for indexing and can greatly
             slow building the inverted index.
          */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                  /* this mysterious check means we won't index links coming
                      from robots.txt files. The sitemap will still be in
                      TO_CRAWL, but that's handled elsewhere
                     */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
          $interim_elapse = changeInMicrotime($interim_time);
          if ($interim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
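
The $doc_keys strings built above are concatenations of fixed-width raw hashes. As a sketch of the layout used for a regular (non-link) page, with $content_hash standing in for $site[self::HASH] and assuming crawlHash() returns an 8-byte raw hash when its second argument is true:

// Doc key layout: url hash, content hash, then a host key whose first byte
// is overwritten with 'd' to mark a document (links use 'e'/'i' instead).
$site_url = "http://www.example.com/some/page.html";
$host = UrlParser::getHost($site_url);
$doc_keys = crawlHash($site_url, true) .
    $content_hash .
    "d" . substr(crawlHash($host . "/", true), 1);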
Example #6
 /**
  * Makes a new HashTable without deleted rows
  *
  * The hash table in Yioop is implemented using open addressing, i.e.,
  * we store the key-value pair in the table itself and, if there is a
  * collision, we look for the next available slot. Two codes are used to
  * indicate available space in the table: one means empty and never used,
  * the other means empty but previously used and deleted. The reason two
  * codes are needed is to ensure that if an item B was inserted that hashes
  * to the same value as an item A, so that B was stored in the next empty
  * slot, then after deleting A we can still look up B. The problem is that
  * as the table gets reused a lot, it tends to fill up with deleted entries,
  * making lookup times more and more linear in the hash table size. By
  * rebuilding the table we mitigate this problem. By choosing the rebuild
  * frequency appropriately, the amortized cost of this operation is
  * only O(1).
  */
 function rebuildHashTable()
 {
     crawlLog("Rebuilding Hash table");
     $num_values = $this->to_crawl_table->num_values;
     $tmp_table = $this->constructHashTable($this->dir_name . "/tmp_table.dat", $num_values);
     $null = $this->to_crawl_table->null;
     $deleted = $this->to_crawl_table->deleted;
     for ($i = 0; $i < $num_values; $i++) {
         crawlTimeoutLog("..still rebuilding hash table. At entry %s of %s", $i, $num_values);
         list($key, $value) = $this->to_crawl_table->getEntry($i);
         if (strcmp($key, $null) != 0 && strcmp($key, $deleted) != 0) {
             $tmp_table->insert($key, $value);
         }
     }
     $this->to_crawl_table = NULL;
     gc_collect_cycles();
     if (file_exists($this->dir_name . "/hash_table.dat")) {
         unlink($this->dir_name . "/hash_table.dat");
         if (file_exists($this->dir_name . "/tmp_table.dat")) {
             rename($this->dir_name . "/tmp_table.dat", $this->dir_name . "/hash_table.dat");
         }
     }
     $tmp_table->filename = $this->dir_name . "/hash_table.dat";
     $this->to_crawl_table = $tmp_table;
 }
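
As background for the two-sentinel scheme the docblock above describes, here is a minimal array-based lookup sketch (not Yioop's disk-backed HashTable; "NULL_CODE" and "DELETED_CODE" stand in for the $null and $deleted values used above):

// Probe slots until a never-used slot is hit. A deleted slot must NOT stop
// the probe; otherwise an item stored past a later-deleted collision
// (B stored after A, then A deleted) would become unreachable.
function sketchLookup($table, $size, $key)
{
    $slot = abs(crc32($key)) % $size;
    for ($probe = 0; $probe < $size; $probe++) {
        $entry = $table[($slot + $probe) % $size];
        if ($entry === "NULL_CODE") {
            return false;   // empty, never used: the key cannot be further on
        }
        if ($entry !== "DELETED_CODE" && $entry[0] === $key) {
            return $entry[1];   // found array($key, $value) pair
        }
        // deleted slot or a different key: keep probing
    }
    return false;
}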
Example #7
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
  * words from the summaries' web archives into an inverted index.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to an IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
      $keypad = "\x00\x00\x00\x00"; // used to pad score keys to an 8-byte boundary
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
Example #8
 /**
  * Copies all feed items newer than $age to a new shard, then deletes the
  * old index shard and the database entries older than $age. Finally, it
  * sets the copied shard to be the active one. If this method is going to
  * take more than max_execution_time/2 it returns false, so that an
  * additional job can be scheduled; otherwise it returns true.
  *
  * @param int $age records older than this many seconds are deleted
  * @return bool whether the job ran to completion
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     $prune_shard = new IndexShard($prune_shard_name);
     $too_old = $time - $age;
     if (!$prune_shard) {
         return false;
     }
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if ($result) {
         $completed = true;
         crawlLog("..still deleting. Making new index of non-pruned items.");
         $i = 0;
         while ($item = $db->fetchArray($result)) {
             crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
             $i++;
             if (!isset($item['SOURCE_NAME'])) {
                 continue;
             }
             $source_name = $item['SOURCE_NAME'];
             if (isset($feeds[$source_name])) {
                 $lang = $feeds[$source_name]['LANGUAGE'];
             } else {
                 $lang = "";
             }
             $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $raw_guid = unbase64Hash($item["GUID"]);
             $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
             $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
             $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
         }
     }
     $prune_shard->save();
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
 }
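
A hypothetical call site for rebuildFeedShard(); the one-week figure is just an example age in seconds. The method writes the surviving items into the prune_index shard and then renames it over the live feeds/index shard, so readers always see a complete shard.

// Prune feed items older than one week and swap in the rebuilt shard.
$one_week = 7 * 24 * 3600;
$this->rebuildFeedShard($one_week);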
Example #9
 /**
  * Make multi_curl requests for an array of sites with urls or onion urls
  *
  * @param array $sites  an array containing urls of pages to request
  * @param bool $timer  flag, true means print timing statistics to log
  * @param int $page_range_request maximum number of bytes to download/page
  *     0 means download all
  * @param string $temp_dir folder to store temporary ip header info
  * @param string $key  the component of $sites[$i] that has the value of
  *     the url to get; defaults to URL
  * @param string $value component of $sites[$i] in which to store the
  *     page that was gotten
  * @param bool $minimal if true, do a faster request of pages by not
  *     doing things like extracting the HTTP headers sent, etc.
  * @param array $post_data data to be POST'd to each site
  * @param bool $follow whether to follow redirects or not
  * @param string $tor_proxy url of a proxy that knows how to download
  *     .onion urls
  * @param array $proxy_servers if not array(), then an array of proxy
  *     servers to use rather than downloading web pages directly from
  *     the current machine
  *
  * @return array an updated array with the contents of those pages
  */
 static function getPages($sites, $timer = false, $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL, $key = CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "", $proxy_servers = array())
 {
     $agent_handler = curl_multi_init();
     $active = NULL;
     $start_time = microtime();
     if (!$minimal && $temp_dir == NULL) {
         $temp_dir = CRAWL_DIR . "/temp";
         if (!file_exists($temp_dir)) {
             mkdir($temp_dir);
         }
     }
     //Set-up requests
     $num_sites = count($sites);
     for ($i = 0; $i < $num_sites; $i++) {
         $is_gopher = false;
         $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
         if (isset($sites[$i][$key])) {
             list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders($sites[$i][$key], $minimal, $proxy_servers);
             if ($headers == "gopher") {
                 $is_gopher = true;
                 $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                 $headers = array();
             }
             $sites[$i][0] = curl_init();
             if (!$minimal) {
                 $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
             curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
             curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
             curl_setopt($sites[$i][0], CURLOPT_URL, $url);
             if (strcmp(substr($url, -10), "robots.txt") == 0) {
                 $sites[$i]['ROBOT'] = true;
                 $follow = true;
                  /* Wikipedia redirects its robots.txt page, so force
                      following redirects for robots pages
                    */
             }
             curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
             curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
             curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
             curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
             curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
             curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
             if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                 curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                 //CURLPROXY_SOCKS5_HOSTNAME = 7
                 curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                 if ($timer) {
                     crawlLog("Using Tor proxy for {$url}..");
                 }
             } else {
                 if ($proxy_servers != array() && !$is_gopher) {
                     $select_proxy = rand(0, count($proxy_servers) - 1);
                     $proxy_server = $proxy_servers[$select_proxy];
                     $proxy_parts = explode(":", $proxy_server);
                     $proxy_ip = $proxy_parts[0];
                     if (!isset($proxy_parts[2]) || strtolower($proxy_parts[2]) == 'http') {
                         $proxy_type = CURLPROXY_HTTP;
                     } else {
                         if (strtolower($proxy_parts[2]) == 'socks5') {
                             $proxy_type = CURLPROXY_SOCKS5;
                         } else {
                             $proxy_type = $proxy_parts[2];
                         }
                     }
                     if (isset($proxy_parts[1])) {
                         $proxy_port = $proxy_parts[1];
                     } else {
                         $proxy_port = "80";
                     }
                     curl_setopt($sites[$i][0], CURLOPT_PROXY, "{$proxy_ip}:{$proxy_port}");
                     curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                     if ($timer) {
                         crawlLog("Selecting proxy {$select_proxy} for {$url}");
                     }
                 }
             }
             if (!$minimal) {
                 curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
             }
             //make lighttpd happier
             if (!$is_gopher) {
                 curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
             }
             curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
             // ^ need to set for sites like att that use gzip
             if ($page_range_request > 0) {
                 curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request);
             }
             if ($post_data != NULL) {
                 curl_setopt($sites[$i][0], CURLOPT_POST, true);
                 curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
             }
             curl_multi_add_handle($agent_handler, $sites[$i][0]);
         }
     }
     if ($timer) {
         crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     $start = time();
     //Wait for responses
     $running = NULL;
     $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
     do {
         $mrc = curl_multi_exec($agent_handler, $running);
         $ready = curl_multi_select($agent_handler, 0.005);
     } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0);
     if (time() - $start > PAGE_TIMEOUT && $timer) {
         crawlLog("  TIMED OUT!!!");
     }
     if ($timer) {
         crawlLog("  Page Request time " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     //Process returned pages
     for ($i = 0; $i < $num_sites; $i++) {
         if ($timer) {
             crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites);
         }
         if (!$minimal && isset($ip_holder[$i])) {
             rewind($ip_holder[$i]);
             $header = fread($ip_holder[$i], 8192);
             $ip_addresses = self::getCurlIp($header);
             fclose($ip_holder[$i]);
         }
         $is_gopher = false;
         if (isset($sites[$i][0]) && $sites[$i][0]) {
             // Get Data and Message Code
             $content = @curl_multi_getcontent($sites[$i][0]);
             $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
             /*
                If the Transfer-encoding was chunked then the Range header
                we sent was ignored. So we manually truncate the data
                here
             */
             if ($page_range_request > 0) {
                 $content = substr($content, 0, $page_range_request);
             }
             if (isset($content) && !$minimal && !$is_gopher) {
                 $site = self::parseHeaderPage($content, $value);
                 $sites[$i] = array_merge($sites[$i], $site);
                 if (isset($header)) {
                     $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4);
                 } else {
                     $header = "";
                 }
                 $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER];
                 unset($header);
             } else {
                 if (isset($content) && !$minimal && $is_gopher) {
                     $sites[$i][CrawlConstants::HEADER] = $header;
                     $sites[$i][$value] = $content;
                     unset($header);
                 } else {
                     $sites[$i][$value] = $content;
                 }
             }
             if (!$minimal) {
                 $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD);
                 $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME);
                 $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME);
                 $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                  if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                      $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                  } else if ($is_gopher) {
                      // gopher responses carry no HTTP status, so record a 200
                      $sites[$i][self::HTTP_CODE] = 200;
                  }
                 if ($ip_addresses) {
                     $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                 } else {
                     $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                 }
                 //Get Time, Mime type and Character encoding
                 $sites[$i][self::TIMESTAMP] = time();
                 if ($is_gopher) {
                     $path = UrlParser::getPath($sites[$i][self::URL]);
                     $filename = UrlParser::getDocumentFilename($sites[$i][self::URL]);
                     if (isset($path[1])) {
                         $gopher_type = $path[1];
                     } else {
                         $gopher_type = 1;
                     }
                     if ($gopher_type == 1) {
                         $sites[$i][self::TYPE] = "text/gopher";
                     } else {
                         if (in_array($gopher_type, array(0, 3, 6))) {
                             $sites[$i][self::TYPE] = "text/plain";
                             if ($gopher_type == 6) {
                                 $sites[$i][$value] = convert_uudecode($content);
                             }
                         } else {
                             if ($gopher_type == 'h') {
                                 $sites[$i][self::TYPE] = "text/html";
                             } else {
                                 if ($gopher_type == 'g') {
                                     $sites[$i][self::TYPE] = "image/gif";
                                 }
                             }
                         }
                     }
                     $path_info = pathinfo($filename);
                     if (!isset($sites[$i][self::TYPE]) && isset($path_info['extension'])) {
                         $sites[$i][self::TYPE] = UrlParser::guessMimeTypeFromFileName($filename);
                     } else {
                         if (!isset($sites[$i][self::TYPE])) {
                             $sites[$i][self::TYPE] = "unknown";
                         }
                     }
                 } else {
                     $type_parts = explode(";", curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                     $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
                 }
             }
             //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
             curl_close($sites[$i][0]);
             if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                 if (isset($sites[$i][self::TYPE]) && $sites[$i][self::TYPE] != "text/plain" && isset($sites[$i][CrawlConstants::LOCATION]) && count($site[CrawlConstants::LOCATION]) > 0) {
                     $sites[$i][self::TYPE] = "text/plain";
                     $sites[$i][self::HTTP_CODE] = "200";
                     $tmp = wordwrap($sites[$i][$value], 80);
                     $tmp_parts = explode("\n", $tmp);
                     $tmp = "# Suspect server misconfiguration\n";
                     $tmp .= "# Assume shouldn't crawl this site.\n";
                     $tmp .= "# Pretending got following robots.txt.\n";
                     $tmp .= "User-agent: *\n";
                     $tmp .= "Disallow: /\n";
                     $tmp .= "# Original error code: " . $sites[$i][self::HTTP_CODE] . "\n";
                     $tmp .= "# Original content:\n";
                     foreach ($tmp_parts as $part) {
                          $tmp .= "#" . $part . "\n";
                     }
                     $sites[$i][$value] = $tmp;
                     $sites[$i][self::HTTP_CODE] = "200";
                     unset($site[CrawlConstants::LOCATION]);
                 }
             }
         }
         //end big if
     }
     //end for
     if ($timer) {
         crawlLog("  Get Page Content time " . changeInMicrotime($start_time));
     }
     curl_multi_close($agent_handler);
     return $sites;
 }
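
A minimal usage sketch of getPages(), assuming it is the static method of the FetchUrl class suggested by the FetchUrl::computePageHash() call earlier, and that constants such as PAGE_RANGE_REQUEST and USER_AGENT are configured:

// Download two pages and read back their contents and status codes.
$sites = array(
    array(CrawlConstants::URL => "http://www.example.com/"),
    array(CrawlConstants::URL => "http://www.example.org/robots.txt"),
);
$sites = FetchUrl::getPages($sites, true);   // true => log timing statistics
foreach ($sites as $site) {
    crawlLog("Fetched " . $site[CrawlConstants::URL] . " HTTP code " .
        $site[CrawlConstants::HTTP_CODE] . " size " . $site[CrawlConstants::SIZE]);
    $page_body = $site[CrawlConstants::PAGE];  // downloaded (possibly truncated) body
}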
 /**
  * Gets the next $num many docs from the iterator
  *
  * @param int $num number of docs to get
  * @param bool $no_process this flag is inherited from base class but
  *     does not do anything in this case
  * @return array associative arrays for $num pages
  */
 function nextPages($num, $no_process = false)
 {
     if ($num + $this->overall_index >= $this->count) {
         $num = max($this->count - $this->overall_index, 0);
     }
     $num_to_get = 1;
     $objects = array();
     for ($i = 0; $i < $num; $i += $num_to_get) {
         crawlTimeoutLog("..Still getting pages from archive iterator. At %s" . " of %s", $i, $num);
         $num_to_get = min($num, $this->partition->count - $this->partition_index);
         $pre_new_objects = $this->partition->nextObjects($num_to_get);
         foreach ($pre_new_objects as $object) {
             $objects[] = $object[1];
         }
         $this->overall_index += $num_to_get;
         $this->partition_index += $num_to_get;
         if ($num_to_get <= 0) {
             $this->current_partition_num++;
             $this->partition = $this->archive->getPartition($this->current_partition_num, false);
             $this->partition_index = 0;
         }
         if ($this->current_partition_num > $this->num_partitions) {
             break;
         }
     }
     $this->end_of_iterator = $this->overall_index >= $this->count ? true : false;
     $this->saveCheckpoint();
     return $objects;
 }