Example #1
 /**
  * Produces a schedule.txt file of url data for a fetcher to crawl next.
  *
  * The hard part of scheduling is to make sure that the overall crawl
  * process obeys robots.txt files. This involves checking that each url
  * is in an allowed path for its host and that the Crawl-delay directive
  * is respected. The first fetcher that contacts the server requesting
  * data to crawl will get the schedule.txt produced by
  * produceFetchBatch(), at which point it will be unlinked (this
  * unlinking is handled in FetchController).
  *
  * @see FetchController
  */
 function produceFetchBatch()
 {
     $i = 1;
     // array implementation of priority queue starts at 1 not 0
     $fetch_size = 0;
     crawlLog("Scheduler: Start Produce Fetch Batch Memory usage" . memory_get_usage());
     $count = $this->web_queue->to_crawl_queue->count;
     $schedule_time = time();
     $first_line = $this->calculateScheduleMetaInfo($schedule_time);
     $sites = array();
     $delete_urls = array();
     $crawl_delay_hosts = array();
     $time_per_request_guess = MINIMUM_FETCH_LOOP_TIME;
      // it would be impressive if we could achieve this speed
     $current_crawl_index = -1;
     crawlLog("Scheduler: Trying to Produce Fetch Batch; Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
      /*
          $delete_urls - array of items we will delete from the queue after
              we have selected all of the items for the fetch batch
          $sites - array of urls for the fetch batch; indices in this array
              we'll call slots. Crawl-delayed host urls are spaced by a
              certain number of slots
      */
     while ($i <= $count && $fetch_size < MAX_FETCH_SIZE) {
         crawlTimeoutLog("..Scheduler: still producing fetch batch. " . "Examining location %s in queue of %s.", $i, $count);
         //look in queue for url and its weight
         $tmp = $this->web_queue->peekQueue($i, $fh);
         list($url, $weight, $flag, $probe) = $tmp;
          // if there was a queue error, remove the entry and continue the loop
         if ($tmp === false || strcmp($url, "LOOKUP ERROR") == 0) {
             $delete_urls[$i] = false;
             crawlLog("Scheduler: Removing lookup error at" . " {$i} during produce fetch");
             $i++;
             continue;
         }
          $no_flags = false;
          $hard_coded = false;
          $delay = 0; // reset so a stale crawl delay from a previous url isn't reused
         $host_url = UrlParser::getHost($url);
         if ($flag == WebQueueBundle::NO_FLAGS) {
             $hard_coded_pos = strpos($url, "###!");
             if ($hard_coded_pos > 0) {
                 $has_robots = true;
                 $hard_coded = true;
                 $is_robot = false;
             } else {
                 $has_robots = $this->web_queue->containsGotRobotTxt($host_url);
                 $scheme = UrlParser::getScheme($host_url);
                 if ($scheme == "gopher") {
                     $is_robot = strcmp($host_url . "/0/robots.txt", $url) == 0;
                 } else {
                     $is_robot = strcmp($host_url . "/robots.txt", $url) == 0;
                 }
             }
             $no_flags = true;
         } else {
             $is_robot = $flag == WebQueueBundle::ROBOT;
             if ($flag >= WebQueueBundle::SCHEDULABLE) {
                 $has_robots = true;
                 if ($flag > WebQueueBundle::SCHEDULABLE) {
                     $delay = $flag - WebQueueBundle::SCHEDULABLE;
                 }
             }
         }
          // if $url is a robots.txt url, see if we need to schedule it or not
         if ($is_robot) {
             if ($has_robots) {
                 $delete_urls[$i] = $url;
                 $i++;
             } else {
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                      /* note: don't add to the seen url filter,
                           since we check robots.txt every 24 hours as needed
                         */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                     $i++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::ROBOT);
                     }
                 }
             }
             continue;
         }
         //Now handle the non-robots.txt url case
         $robots_okay = true;
         if ($has_robots) {
             if ($no_flags) {
                 if (!isset($hard_coded) || !$hard_coded) {
                     $robots_okay = $this->web_queue->checkRobotOkay($url);
                 } else {
                     $robots_okay = true;
                 }
                 if (!$robots_okay) {
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     $i++;
                     continue;
                 }
                 $delay = $this->web_queue->getCrawlDelay($host_url);
             }
             if (!$this->withinQuota($url)) {
                  //we're not allowed to schedule $url till next hour
                 $delete_urls[$i] = $url;
                 //delete from queue (so no clog) but don't mark seen
                 $i++;
                 continue;
             }
             //each host has two entries in $this->waiting_hosts
             $num_waiting = floor(count($this->waiting_hosts) / 2);
             if ($delay > 0) {
                 // handle adding a url if there is a crawl delay
                 $hash_host = crawlHash($host_url);
                 $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
                  if ((!$is_waiting_host && $num_waiting < MAX_WAITING_HOSTS) ||
                      ($is_waiting_host &&
                      $this->waiting_hosts[$hash_host] == $schedule_time)) {
                     $this->waiting_hosts[$hash_host] = $schedule_time;
                     $this->waiting_hosts[$schedule_time][] = $hash_host;
                     $request_batches_per_delay = ceil($delay / $time_per_request_guess);
                     if (!isset($crawl_delay_hosts[$hash_host])) {
                         $next_earliest_slot = $current_crawl_index;
                         $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
                     } else {
                         $next_earliest_slot = $crawl_delay_hosts[$hash_host] + $request_batches_per_delay * NUM_MULTI_CURL_PAGES;
                     }
                     if (($next_slot = $this->getEarliestSlot($next_earliest_slot, $sites)) < MAX_FETCH_SIZE) {
                         $crawl_delay_hosts[$hash_host] = $next_slot;
                         $delete_urls[$i] = $url;
                         $sites[$next_slot] = array($url, $weight, $delay);
                         $this->web_queue->addSeenUrlFilter($url);
                         /* we might miss some sites by marking them
                              seen after only scheduling them
                            */
                         $fetch_size++;
                     } else {
                         if ($no_flags) {
                             $this->web_queue->setQueueFlag($url, $delay + WebQueueBundle::SCHEDULABLE);
                         }
                     }
                 } else {
                     if (!$is_waiting_host) {
                         // has crawl delay but too many already waiting
                         $delete_urls[$i] = $url;
                         //delete from queue (so no clog) but don't mark seen
                         $i++;
                         continue;
                     }
                 }
             } else {
                  // add a url with no crawl delay
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     /* we might miss some sites by marking them
                          seen after only scheduling them
                        */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::SCHEDULABLE);
                     }
                 }
             }
              // end if ($delay > 0) ... else
          }
          // end if ($has_robots)
         $i++;
     }
     //end while
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
      crawlLog("...Scheduler: Done selecting URLS for fetch batch; time " . "so far: " . changeInMicrotime($start_time));
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
              /* if there was a hash table lookup error, still get rid of
                 the index from the priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} URLS for fetch batch from " . "queue in time: " . changeInMicrotime($new_time));
     $new_time = microtime();
     if (isset($sites) && count($sites) > 0) {
         $dummy_slot = array(self::DUMMY, 0.0, 0);
          /* dummies are used to fill crawl-delay gaps for sites with longer
               delays when we don't have much else to crawl.
             */
         $cnt = 0;
         for ($j = 0; $j < MAX_FETCH_SIZE; $j++) {
             if (isset($sites[$j])) {
                 $cnt++;
                 if ($cnt == $fetch_size) {
                     break;
                 }
             } else {
                 if ($j % NUM_MULTI_CURL_PAGES == 0) {
                     $sites[$j] = $dummy_slot;
                 }
             }
         }
         ksort($sites);
         //write schedule to disk
         $fh = fopen(CRAWL_DIR . "/schedules/" . self::schedule_name . $this->crawl_time . ".txt", "wb");
         fwrite($fh, $first_line);
         $num_sites = count($sites);
         $k = 0;
         foreach ($sites as $site) {
             crawlTimeoutLog("..Scheduler: Still Writing fetch schedule %s" . " of %s.", $k, $num_sites);
             $k++;
             $extracted_etag = NULL;
             list($url, $weight, $delay) = $site;
             $key = crawlHash($url, true);
             if (USE_ETAG_EXPIRES) {
                 /*check if we have cache validation data for a URL. If both
                    ETag and Expires timestamp are found or only an expires
                    timestamp is found, the timestamp is compared with the current
                    time. If the current time is less than the expires timestamp,
                    the URL is not added to the fetch batch. If only an ETag is
                    found, the ETag is appended to the URL so that it can be
                    processed by the fetcher.
                   */
                 $value = $this->web_queue->etag_btree->findValue($key);
                  if ($value !== NULL) {
                      $cache_validation_data = $value[1];
                      if ($cache_validation_data['etag'] !== -1 &&
                          $cache_validation_data['expires'] !== -1) {
                          $expires_timestamp = $cache_validation_data['expires'];
                          $current_time = time();
                          if ($current_time < $expires_timestamp) {
                              continue;
                          }
                          $etag = $cache_validation_data['etag'];
                          $extracted_etag = "ETag: " . $etag;
                      } else if ($cache_validation_data['etag'] !== -1) {
                          $etag = $cache_validation_data['etag'];
                          $extracted_etag = "ETag: " . $etag;
                      } else if ($cache_validation_data['expires'] !== -1) {
                          $expires_timestamp = $cache_validation_data['expires'];
                          $current_time = time();
                          if ($current_time < $expires_timestamp) {
                              continue;
                          }
                      }
                  }
             }
             $host_url = UrlParser::getHost($url);
             $dns_lookup = $this->web_queue->dnsLookup($host_url);
             if ($dns_lookup) {
                 $url .= "###" . urlencode($dns_lookup);
             }
             if ($extracted_etag !== NULL) {
                 $url .= $extracted_etag;
             }
             $out_string = base64_encode(packFloat($weight) . packInt($delay) . $url) . "\n";
             fwrite($fh, $out_string);
         }
         fclose($fh);
         crawlLog("...Scheduler: Sort URLS and write schedule time: " . changeInMicrotime($new_time));
         crawlLog("Scheduler: End Produce Fetch Batch Memory usage" . memory_get_usage());
         crawlLog("Scheduler: Created fetch batch of size {$num_sites}." . " {$num_deletes} urls were deleted." . " Queue size is now " . $this->web_queue->to_crawl_queue->count . "...Total Time to create batch: " . changeInMicrotime($start_time));
     } else {
         crawlLog("Scheduler: No fetch batch created!! " . "Time failing to make a fetch batch:" . changeInMicrotime($start_time) . ". Loop properties:{$i} {$count}" . " {$num_deletes} urls were deleted in failed attempt.");
         $max_links = max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP);
         if ($num_deletes < 5 && $i >= $count && $count >= NUM_URLS_QUEUE_RAM - SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             crawlLog("Scheduler: Queue Full and Couldn't produce Fetch " . "Batch!! Or Delete any URLS!!!");
             crawlLog("Scheduler: Rescheduling Queue Contents " . "(not marking seen) to try to unjam!");
             $this->dumpQueueToSchedules(true);
             $this->clearWebQueue();
         }
     }
 }
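For reference, each line written to schedule.txt above is the base64 encoding of a packed weight, a packed crawl delay, and the url. The sketch below shows how a consumer could decode one such line; it assumes packFloat()/packInt() produce 4-byte strings and that matching unpackFloat()/unpackInt() helpers are available, as in this codebase's utility functions. The function name is illustrative only.
 /**
  * Illustrative sketch only (not part of the class): decode one line of a
  * schedule.txt written by produceFetchBatch() above.
  */
 function exampleDecodeScheduleLine($line)
 {
     $raw = base64_decode(trim($line));
     $weight = unpackFloat(substr($raw, 0, 4)); // 4-byte packed float weight
     $delay = unpackInt(substr($raw, 4, 4)); // 4-byte packed crawl delay
     $url = substr($raw, 8); // rest is the url (may carry ###dns and ETag data)
     return array($url, $weight, $delay);
 }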
Example #2
 /**
  * Save the StringArray to its filename
  */
 function save()
 {
     $fh = fopen($this->filename, "wb");
     $tmp =& $this->string_array;
     fwrite($fh, packInt($this->string_array_size));
     fwrite($fh, $this->string_array);
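      /* temporarily detach the large raw string so serialize($this) below
         only records the object's lightweight metadata; the reference is
         restored right after the write */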
     unset($this->string_array);
     fwrite($fh, serialize($this));
     $this->string_array =& $tmp;
     fclose($fh);
 }
Example #3
 /**
  * Used to compress an int as a fixed length string in the format of
  * the compression algorithm underlying the compressor. Since this
  * compressor doesn't compress, we just use packInt()
  *
  * @param int $my_int the integer to compress as a fixed length string
  * @return string the fixed length string containing the packed int
  */
 function compressInt($my_int)
 {
     return packInt($my_int);
 }
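As a quick illustration of the symmetry (assuming packInt()/unpackInt() are the 4-byte integer helpers used elsewhere in this codebase):
 // illustrative only: since compressInt() just packs, unpackInt() reverses it
 $packed = packInt(2014); // same string compressInt(2014) would return
 $restored = unpackInt($packed); // back to 2014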
Example #4
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
          for the fetcher we are not saving the index shards, so the
          name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
          if (!isset($site[self::HASH]) || (isset($site[self::ROBOT_METAS]) &&
              in_array("JUSTFOLLOW", $site[self::ROBOT_METAS]))) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
              // store inlinks so they can be searched
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
          /*
             $this->no_process_links is set when doing things like
             mix recrawls. In this case, the links will likely already
             appear in what we are indexing, so don't index them again.
             $site[self::JUST_METAS] is set when we have a sitemap or
             robots.txt (that case is set later). In this case, link info
             is not particularly useful for indexing and can greatly slow
             building the inverted index.
          */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                  /* this mysterious check means we won't index links from
                      robots.txt files. Sitemaps will still be in TO_CRAWL,
                      but that's handled elsewhere
                     */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                  $elink_flag = ($link_host != $host);
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
          $interim_elapse = changeInMicrotime($interim_time);
          if ($interim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
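For reference, the $doc_keys value assembled above for a non-link page concatenates three pieces: the hash of the url, the page's content hash, and a host hash whose first byte is replaced by "d". A small hypothetical helper mirroring that construction:
 /**
  * Illustrative only (not part of this class): builds doc keys the same
  * way as the non-link branch of buildMiniInvertedIndex() above
  */
 function exampleDocKeys($site_url, $content_hash)
 {
     $host = UrlParser::getHost($site_url);
     return crawlHash($site_url, true) . $content_hash . "d" .
         substr(crawlHash($host . "/", true), 1);
 }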
Example #5
 /**
  * Test how fast insertions and deletions can be done
  */
 function timingTestCase()
 {
     $start_time = microtime();
     for ($i = 0; $i < 10000; $i++) {
         $this->test_objects['FILE1']->insert(crawlHash("hi{$i}", true), "0000" . packInt($i));
     }
     $this->assertTrue(changeInMicrotime($start_time) < 2, "Insert 10000 into table of size 20000 takes less than 2 seconds");
     $start_time = microtime();
     for ($i = 0; $i < 10000; $i++) {
         $this->test_objects['FILE1']->delete(crawlHash("hi{$i}", true));
     }
     $this->assertTrue(changeInMicrotime($start_time) < 2, "Delete 10000 from table of size 20000 takes less than 2 seconds");
 }
Example #6
 /**
  * Used to flush to this bundle's hash table the changes to hash_url
  * indexes caused by adjusting weights in the bundle's priority queue.
  */
 function notifyFlush()
 {
     foreach ($this->notify_buffer as $hash_url => $index) {
         $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH);
         if ($both !== false) {
             list($probe, $value) = $both;
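              /* value layout implied by the offsets used below:
                 bytes 0-3 = archive offset, 4-7 = queue index, 8-11 = flag */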
             $packed_offset = substr($value, 0, 4);
             $packed_flag = substr($value, 8, 4);
             $new_data = $packed_offset . packInt($index) . $packed_flag;
             $this->insertHashTable($hash_url, $new_data, $probe);
         } else {
             crawlLog("NOTIFY LOOKUP FAILED. INDEX WAS {$index}. DATA WAS " . bin2hex($hash_url));
         }
     }
     $this->notify_buffer = array();
 }
Example #7
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
  * words of the summaries' web_archives into an inverted index.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to a IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
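A minimal invocation sketch, assuming this method lives on the command-line archive tool class that defines it; the class name, timestamp, and bundle path below are illustrative, not taken from the code above.
 // hypothetical usage: rebuild the shards and dictionary of one index bundle
 $tool = new ArcTool(); // assumed: the CLI archive tool class with this method
 $timestamp = "1234567890"; // illustrative crawl timestamp
 $tool->rebuildIndexArchive(CRAWL_DIR . "/cache/IndexData" . $timestamp);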