Example #1
 /**
  * Implements post processing of recipes: recipes are extracted,
  * ingredients are scrubbed, and recipes are clustered. The clustered
  * recipes are added back to the index.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         $num_results = 0;
         if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
     // only cluster if doing so would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper',
             'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic',
             'cream', 'soda', 'honey', 'powder', 'water', 'vanilla',
             'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning',
             'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil',
             'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo',
             'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine',
             'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies',
             'lime', 'chillies', 'cilantro', 'rosemary', 'vinegar',
             'shallots', 'wine', 'cornmeal', 'nonstickspray');
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             $num_in_cluster = count($cluster) - 1; // last entry is the cluster's "ingredient" label
             for ($i = 0; $i < $num_in_cluster; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         if (count($recipe_sites) > 0) {
             crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
             $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         }
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] .
                 "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) .
                 "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
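The pairwise weights computed above are Euclidean distances between binary ingredient vectors, scaled by the number of non-shared ingredients and by a factor of 1000 when the two recipes' main ingredients differ, so that kruskalClustering tends to keep recipes built around the same main ingredient together. Below is a minimal standalone sketch of that metric, assuming plain PHP arrays of scrubbed ingredient names (the function name is illustrative, not part of the plugin):

 /**
  * Sketch of the edge weight used by the clustering step above:
  * Euclidean distance between binary ingredient vectors, scaled by
  * the count of non-shared ingredients and a penalty when the main
  * ingredients differ.
  */
 function recipeEdgeWeight($ingredients1, $ingredients2, $main1, $main2)
 {
     $all = array_unique(array_merge($ingredients1, $ingredients2));
     $distance_squared = 0;
     $non_shared = 1; // mirrors the $matches = 1 initialization above
     foreach ($all as $ingredient) {
         $in1 = in_array($ingredient, $ingredients1) ? 1 : 0;
         $in2 = in_array($ingredient, $ingredients2) ? 1 : 0;
         $diff = $in1 - $in2;
         $distance_squared += $diff * $diff;
         if (abs($diff) == 1) {
             $non_shared++; // ingredient appears in exactly one recipe
         }
     }
     $penalty = ($main1 != $main2) ? 1000 : 1;
     return sqrt($distance_squared) * $non_shared * $penalty;
 }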
Example #2
 /**
  * Adds the summary and index data in $file to summary bundle and word index
  *
  * @param string $file containing web pages summaries and a mini-inverted
  *     index for their content
  * @param bool $blocking this method might be called by the indexer
  *     subcomponent when a merge tier phase is ongoing, to allow for
  *     other processing to occur. If so, we don't want a regress
  *     where the indexer calls this code, which calls the indexer,
  *     etc. If the blocking flag is set, the indexer subcomponent
  *     won't be called.
  */
 function processIndexArchive($file, $blocking)
 {
     static $blocked = false;
     if ($blocking && $blocked) {
         crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. B");
         return;
     }
     if (!$blocking) {
         $blocked = false;
     }
     crawlLog("{$this->server_name} is starting to process index data," . " memory usage: " . memory_get_usage() . "...");
     crawlLog("Indexer: Processing index data in {$file}...");
     $start_time = microtime();
     $start_total_time = microtime();
     $pre_sites = webdecode(file_get_contents($file));
     $len_urls = unpackInt(substr($pre_sites, 0, 4));
     $seen_urls_string = substr($pre_sites, 4, $len_urls);
     $pre_sites = substr($pre_sites, 4 + $len_urls);
     $sites[self::SEEN_URLS] = array();
     $pos = 0;
     $num = 0;
     $bad = false;
     $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER * (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
     while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
         crawlTimeoutLog("..Indexer still processing index data at position" . " %s of out of %s", $pos, $len_urls);
         $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
         if ($len_site > 2 * $this->page_range_request) {
             crawlLog("Indexer: Site string too long, {$len_site}," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $pos += 4;
         $site_string = substr($seen_urls_string, $pos, $len_site);
         $pos += strlen($site_string);
         $tmp = unserialize(gzuncompress($site_string));
         if (!$tmp || !is_array($tmp)) {
             crawlLog("Compressed array null," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $sites[self::SEEN_URLS][] = $tmp;
         $num++;
     }
     if ($num > $max_batch_sites_and_links * SEEN_URLS_BEFORE_UPDATE_SCHEDULER || $bad) {
         crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " . "may be corrupt so skipping this file.");
         crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
         unlink($file);
         return;
     }
     crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard", $pre_sites);
     unset($pre_sites);
     crawlLog("B. Indexer Load Sent shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $start_time = microtime();
     //do deduplication of summaries
     if (isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) {
         $seen_sites = $sites[self::SEEN_URLS];
         $seen_sites = array_values($seen_sites);
         unset($sites[self::SEEN_URLS]);
         $num_seen = count($seen_sites);
         crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
     } else {
         $num_seen = 0;
     }
     $visited_urls_count = 0;
     $recent_urls_count = 0;
     $recent_urls = array();
     for ($i = 0; $i < $num_seen; $i++) {
         $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL], true);
         $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
         if (strcmp("url", $link_url_parts[0]) == 0) {
             $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
             $seen_sites[$i][self::HASH_URL] = crawlHash($link_url_parts[1], true) . crawlHash($seen_sites[$i][self::URL], true) . $reftype . substr(crawlHash(UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
             $seen_sites[$i][self::IS_DOC] = false;
         } else {
             $seen_sites[$i][self::IS_DOC] = true;
             $visited_urls_count++;
             array_push($recent_urls, $seen_sites[$i][self::URL]);
             if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                 array_shift($recent_urls);
             }
             $recent_urls_count++;
         }
     }
     if (isset($sites[self::INVERTED_INDEX])) {
         $index_shard =& $sites[self::INVERTED_INDEX];
         $generation = $this->index_archive->initGenerationToAdd($index_shard->num_docs, $this, $blocking);
         if ($generation == -1) {
             crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. A");
             $blocked = true;
             return;
         }
         $summary_offsets = array();
         if (isset($seen_sites)) {
             $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count);
             foreach ($seen_sites as $site) {
                 if ($site[self::IS_DOC]) {
                     // an actual document, not a link record
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $hash = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                 } else {
                     $hash = $site[self::HASH_URL];
                 }
                 $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
             }
             unset($seen_sites);
         }
         crawlLog("C. Indexer init local shard, store " . "Summaries memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         // add summary offset info to inverted index data
         $index_shard->changeDocumentOffsets($summary_offsets);
         crawlLog("D. Indexer Update shard offsets. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         $this->index_archive->addIndexData($index_shard);
         $this->index_dirty = true;
     }
     crawlLog("E. Indexer Add index shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
     if (isset($recent_urls)) {
         $sites[self::RECENT_URLS] =& $recent_urls;
         $this->writeCrawlStatus($sites);
     }
     if (file_exists($file)) {
         // Haven't tracked down why yet, but without this check the file
         // can be deleted twice, producing a warning
         unlink($file);
     }
 }
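For reference, the seen-URLs block decoded at the top of processIndexArchive is a 4-byte total length followed by per-site records, each a length-prefixed gzcompressed serialized summary array. A sketch of how a fetcher-side producer could assemble such a block, assuming packInt/unpackInt are 4-byte big-endian helpers (pack("N", ...) stands in for packInt here, and the outer webencode transport wrapper is omitted):

 /**
  * Sketch: build a seen-urls block in the layout processIndexArchive
  * parses -- a 4-byte total length, then for each site a 4-byte
  * record length followed by gzcompress(serialize($site)).
  */
 function packSeenUrls($sites)
 {
     $records = "";
     foreach ($sites as $site) {
         $record = gzcompress(serialize($site));
         $records .= pack("N", strlen($record)) . $record; // per-record length prefix
     }
     return pack("N", strlen($records)) . $records; // total length prefix
 }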
Example #3
 /**
  * Returns the shard which is currently being used to read word-document
  * data from the bundle. If one wants to write data to the bundle use
  * getActiveShard() instead. The point of this method is to allow
  * for lazy reading of the file associated with the shard.
  *
  * @return object the index shard currently being used for reading
  */
 function getCurrentShard()
 {
     if (!isset($this->current_shard)) {
         if (!isset($this->generation_info['CURRENT'])) {
             $this->generation_info['CURRENT'] = $this->generation_info['ACTIVE'];
         }
         $current_index_shard_file = $this->dir_name . "/posting_doc_shards/index" . $this->generation_info['CURRENT'];
         if (file_exists($current_index_shard_file)) {
             if (isset($this->generation_info['DISK_BASED']) && $this->generation_info['DISK_BASED'] == true) {
                 $this->current_shard = new IndexShard($current_index_shard_file, $this->generation_info['CURRENT'], $this->num_docs_per_generation, true);
                 $this->current_shard->getShardHeader();
                 $this->current_shard->read_only_from_disk = true;
             } else if (filesize($current_index_shard_file) >
                 self::NO_LOAD_SIZE) {
                 $this->addAdvanceGeneration();
             } else {
                 $this->current_shard =
                     IndexShard::load($current_index_shard_file);
             }
         } else {
             $this->current_shard = new IndexShard($current_index_shard_file, $this->generation_info['CURRENT'], $this->num_docs_per_generation);
         }
     }
     return $this->current_shard;
 }
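Callers can therefore construct a bundle cheaply and pay the file-read cost only when word-document data is first requested. A minimal read-path sketch (the bundle path and word are illustrative; constructor arguments mirror the call in Example #1):

 // The shard file backing the current generation is only opened on
 // the first getCurrentShard() call.
 $bundle = new IndexArchiveBundle($dir_name, false);
 $shard = $bundle->getCurrentShard();
 $postings = $shard->getPostingsSliceById(crawlHashWord('recipe', true), 10);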
Example #4
 /**
  * Check that save and load work
  */
 function saveLoadTestCase()
 {
     $docid = "AAAAAAAABBBBBBBBCCCCCCCC";
     $offset = 5;
     $word_counts = array('BBBBBBBB' => array(1), 'CCCCCCCC' => array(2), 'DDDDDDDD' => array(6));
     $meta_ids = array("EEEEEEEE", "FFFFFFFF");
     //test saving and loading to a file
     $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids, array(), true);
     $this->test_objects['shard']->save();
     $this->test_objects['shard2'] = IndexShard::load(WORK_DIRECTORY . "/shard.txt");
     $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3, "Len All Docs Correctly Counts Length of First Doc");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('BBBBBBBB', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('CCCCCCCC', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 2 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('DDDDDDDD', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 3 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('EEEEEEEE', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 4 by meta id works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('FFFFFFFF', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 5 by meta id works");
     // test saving and loading from a string
     $out_string = $this->test_objects['shard']->save(true);
     $this->test_objects['shard2'] = IndexShard::load("shard.txt", $out_string);
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('BBBBBBBB', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('CCCCCCCC', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 2 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('DDDDDDDD', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 3 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('EEEEEEEE', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 4 by meta id works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('FFFFFFFF', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 5 by meta id works");
 }
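A companion negative check fits the same pattern: a word that was never added should not find the document. Sketched below under the same fixtures, as lines that could be appended to saveLoadTestCase (assertFalse is assumed to exist alongside assertTrue in this test framework):

     // Sketch: lookup of a word that was never indexed finds nothing
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('GGGGGGGG', true), 5);
     $this->assertFalse(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup of unindexed word finds nothing");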
Example #5
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step re-extracts the words from
  * the summaries' web archives into an inverted index. Then a
  * reindex is done.
  *
  * @param string $archive_path file path to an IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     // for link records the doc key and url are stored in
                     // the HTTP_CODE and TITLE fields, respectively
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                     } else {
                         $meta_ids[] = "safe:false";
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
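The $doc_keys string assembled above follows a fixed layout that recurs throughout these examples: the hash of the URL, the stored page hash, then a one-character type marker ('d' for documents here, 'r' for recipes in Example #1, 'e'/'i' for link records in Example #2) spliced onto the tail of the host hash. A sketch of that composition using the document's own crawlHash and UrlParser helpers (the function name is hypothetical):

 /**
  * Sketch: compose a document key as in the examples above -- hash of
  * the url, the page hash, then a type marker prepended to the tail
  * of the host hash.
  */
 function makeDocKey($url, $page_hash, $marker = "d")
 {
     $host_hash = crawlHash(UrlParser::getHost($url) . "/", true);
     return crawlHash($url, true) . $page_hash . $marker . substr($host_hash, 1);
 }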
Example #6
 /**
  * Copies all feed items newer than $age to a new shard, then deletes
  * the old index shard and database entries older than $age. Finally,
  * sets the copied shard to be the active one. If this method is going
  * to take more than max_execution_time/2, it returns false so an
  * additional job can be scheduled; otherwise it returns true.
  *
  * @param int $age records older than this many seconds are deleted
  * @return bool whether the job ran to completion
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     $prune_shard = new IndexShard($prune_shard_name);
     $too_old = $time - $age;
     if (!$prune_shard) {
         return false;
     }
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if ($result) {
         crawlLog("..still deleting. Making new index of non-pruned items.");
         $i = 0;
         while ($item = $db->fetchArray($result)) {
             crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
             $i++;
             if (!isset($item['SOURCE_NAME'])) {
                 continue;
             }
             $source_name = $item['SOURCE_NAME'];
             if (isset($feeds[$source_name])) {
                 $lang = $feeds[$source_name]['LANGUAGE'];
             } else {
                 $lang = "";
             }
             $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $raw_guid = unbase64Hash($item["GUID"]);
             $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
             $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
             $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
         }
     }
     $prune_shard->save();
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
     return true;
 }
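A usage sketch for the method above: a caller in the feed-update loop might prune week-old items and schedule a follow-up job when the rebuild reports it could not finish (the constant and the scheduling mechanism are illustrative assumptions):

 $one_week = 7 * 24 * 3600; // seconds; constant name is illustrative
 if (!$this->rebuildFeedShard($one_week)) {
     // rebuild did not run to completion; schedule an additional job here
 }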