Exemple #1
0
 /**
  * Retrieves the requested page from the WebArchiveBundle and echo it page,
  * base64 encoded
  */
 function cache()
 {
     $offset = $this->clean($_REQUEST['offset'], "int");
     $partition = $this->clean($_REQUEST['partition'], "int");
     $crawl_time = $this->clean($_REQUEST['crawl_time'], "string");
     $prefix = "";
     if (isset($_REQUEST['instance_num'])) {
         $prefix = $this->clean($_REQUEST['instance_num'], "int") . "-";
     }
     if (file_exists(CRAWL_DIR . '/cache/' . $prefix . self::archive_base_name . $crawl_time)) {
         $web_archive = new WebArchiveBundle(CRAWL_DIR . '/cache/' . $prefix . self::archive_base_name . $crawl_time);
         $page = $web_archive->getPage($offset, $partition);
         echo base64_encode(serialize($page));
     } else {
         echo base64_encode(serialize(false));
     }
 }
Exemple #2
0
 /**
  * Returns the mast time the archive info of the bundle was modified.
  *
  * @param string $dir_name folder with archive bundle
  */
 static function getParamModifiedTime($dir_name)
 {
     return WebArchiveBundle::getParamModifiedTime($dir_name . "/summaries");
 }
Exemple #3
0
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
  * word into an inverted index from the summaries' web_archives.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to a IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }