/**
 * Used to recompute both the index shards and the dictionary
 * of an index archive. The first step re-extracts the words of each
 * summary in the bundle's web archives into an inverted index.
 * Then a reindex is done.
 *
 * @param string $archive_path file path to an IndexArchiveBundle
 */
function rebuildIndexArchive($archive_path)
{
    $archive_type = $this->getArchiveKind($archive_path);
    if ($archive_type != "IndexArchiveBundle") {
        $this->badFormatMessageAndExit($archive_path);
    }
    $info = $archive_type::getArchiveInfo($archive_path);
    $tmp = unserialize($info["DESCRIPTION"]);
    $video_sources = $tmp[self::VIDEO_SOURCES];
    $generation_info = unserialize(
        file_get_contents("{$archive_path}/generation.txt"));
    $num_generations = $generation_info['ACTIVE'] + 1;
    $archive = new WebArchiveBundle($archive_path . "/summaries");
    $seen = 0;
    $generation = 0;
    /* pads user-rank score keys out to an 8-byte boundary; each packInt()
       below produces 4 bytes, so 4 null bytes suffice (with an empty
       string the padding step would be a no-op) */
    $keypad = "\x00\x00\x00\x00";
    while ($generation < $num_generations) {
        $partition = $archive->getPartition($generation, false);
        $shard_name = $archive_path .
            "/posting_doc_shards/index{$generation}";
        crawlLog("Processing partition {$generation}");
        if (file_exists($shard_name)) {
            crawlLog("..Unlinking old shard {$generation}");
            @unlink($shard_name);
        }
        $shard = new IndexShard($shard_name, $generation,
            NUM_DOCS_PER_GENERATION, true);
        $seen_partition = 0;
        while ($seen_partition < $partition->count) {
            $num_to_get = min($partition->count - $seen_partition, 8000);
            $offset = $partition->iterator_pos;
            $objects = $partition->nextObjects($num_to_get);
            $cnt = 0;
            foreach ($objects as $object) {
                $cnt++;
                $site = $object[1];
                if (isset($site[self::TYPE]) &&
                    $site[self::TYPE] == "link") {
                    // link items carry their doc keys in the
                    // HTTP_CODE field and their url in TITLE
                    $is_link = true;
                    $doc_keys = $site[self::HTTP_CODE];
                    $site_url = $site[self::TITLE];
                    $host = UrlParser::getHost($site_url);
                    $link_parts = explode('|', $site[self::HASH]);
                    if (isset($link_parts[5])) {
                        $link_origin = $link_parts[5];
                    } else {
                        $link_origin = $site_url;
                    }
                    $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                        $host, $site[self::DESCRIPTION], $link_origin);
                    $link_to = "LINK TO:";
                } else {
                    $is_link = false;
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $doc_keys = crawlHash($site_url, true) .
                        $site[self::HASH] . "d" .
                        substr(crawlHash($host . "/", true), 1);
                    $meta_ids = PhraseParser::calculateMetas($site,
                        $video_sources);
                    $link_to = "";
                }
                $so_far_cnt = $seen_partition + $cnt;
                $time_out_message = "..still processing {$so_far_cnt} " .
                    "of {$partition->count} in partition {$generation}." .
                    "\n..Last processed was: " .
                    ($seen + 1) . ". {$link_to}{$site_url}. ";
                crawlTimeoutLog($time_out_message);
                $seen++;
                $word_lists = array();
                /* self::JUST_METAS check to avoid getting sitemaps
                   in results for popular words */
                $lang = NULL;
                if (!isset($site[self::JUST_METAS])) {
                    $host_words = UrlParser::getWordsIfHostUrl($site_url);
                    $path_words = UrlParser::getWordsLastPathPartUrl(
                        $site_url);
                    if ($is_link) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words . " " .
                            $site[self::DESCRIPTION];
                    }
                    if (isset($site[self::LANG])) {
                        $lang = guessLocaleFromString(
                            mb_substr($site[self::DESCRIPTION], 0,
                            AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                    }
                    $word_lists = PhraseParser::extractPhrasesInLists(
                        $phrase_string, $lang);
                    $len = strlen($phrase_string);
                    if (PhraseParser::computeSafeSearchScore($word_lists,
                        $len) < 0.012) {
                        $meta_ids[] = "safe:true";
                        $safe = true;
                    } else {
                        $meta_ids[] = "safe:false";
                        $safe = false;
                    }
                }
                if (isset($site[self::USER_RANKS]) &&
                    count($site[self::USER_RANKS]) > 0) {
                    $score_keys = "";
                    foreach ($site[self::USER_RANKS] as $label => $score) {
                        $score_keys .= packInt($score);
                    }
                    if (strlen($score_keys) % 8 != 0) {
                        $score_keys .= $keypad;
                    }
                    $doc_keys .= $score_keys;
                }
                $shard->addDocumentWords($doc_keys, $offset, $word_lists,
                    $meta_ids, PhraseParser::$materialized_metas,
                    true, false);
                // advance to the offset stored with this object
                $offset = $object[0];
            }
            $seen_partition += $num_to_get;
        }
        $shard->save(false, true);
        $generation++;
    }
    $this->reindexIndexArchive($archive_path);
}
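/*
 * A minimal usage sketch for the method above. The class name ArcTool
 * and the archive path are hypothetical placeholders, not confirmed by
 * the code above; only the rebuildIndexArchive() call itself comes
 * from it.
 */
$archive_path = "/some/work_directory/cache/IndexData1234567890";
$arc_tool = new ArcTool(); // hypothetical maintenance-tool class
// rebuild every generation's shard from the stored summaries, then reindex
$arc_tool->rebuildIndexArchive($archive_path);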
/**
 * Implements post processing of recipes. Recipes are extracted,
 * ingredients are scrubbed, and recipes are clustered. The clustered
 * recipes are added back to the index.
 *
 * @param string $index_name index name of the current crawl
 */
function postProcessing($index_name)
{
    global $INDEXING_PLUGINS;
    if (!class_exists("SplHeap")) {
        crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
        crawlLog("...Aborting plugin");
        return;
    }
    $locale_tag = guessLocale();
    setLocaleObject($locale_tag);
    $search_controller = new SearchController($INDEXING_PLUGINS);
    $query = "recipe:all i:{$index_name}";
    crawlLog("...Running Recipe Plugin!");
    crawlLog("...Finding docs tagged as recipes.");
    $more_docs = true;
    $raw_recipes = array();
    $limit = 0;
    $num = 100;
    while ($more_docs) {
        $num_results = 0; // avoid reusing a stale count when no pages return
        $results = @$search_controller->queryRequest($query, $num, $limit,
            1, $index_name);
        if (isset($results["PAGES"]) &&
            ($num_results = count($results["PAGES"])) > 0) {
            $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
        }
        crawlLog("Scanning recipes {$limit} through " .
            ($limit + $num_results) . ".");
        $limit += $num_results;
        if (isset($results["SAVE_POINT"])) {
            $end = true;
            foreach ($results["SAVE_POINT"] as $save_point) {
                if ($save_point != -1) {
                    $end = false;
                }
            }
            if ($end) {
                $more_docs = false;
            }
        } else {
            $more_docs = false;
        }
    }
    crawlLog("...Clustering.");
    // only cluster if it would make more than one cluster
    if (count($raw_recipes) * CLUSTER_RATIO > 1) {
        $recipes = array();
        $i = 0;
        foreach ($raw_recipes as $raw_recipe) {
            $description = $raw_recipe[self::DESCRIPTION];
            $ingredients = explode("||", $description);
            if (is_array($ingredients) && count($ingredients) > 1) {
                $recipes[$i][0] = $raw_recipe[self::TITLE];
                $recipes[$i][1] = $ingredients;
                $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                $recipes[$i][3] = $raw_recipe;
                $i++;
            }
        }
        $recipes_ingredients = array();
        $count = count($recipes);
        // scrub each ingredient line down to its main ingredient,
        // dropping empty lines and section headers (lines ending in ":")
        foreach ($recipes as $key => $recipe) {
            foreach ($recipe[1] as $index => $ingredient) {
                if (strlen($ingredient) != 0 &&
                    substr($ingredient, strlen($ingredient) - 1) != ":") {
                    $mainIngredient = $this->getIngredientName(
                        (string)$ingredient);
                    if (strlen($mainIngredient) != 0) {
                        $recipe[1][$index] = $mainIngredient;
                    } else {
                        unset($recipe[1][$index]);
                    }
                } else {
                    unset($recipe[1][$index]);
                }
            }
            $recipes[$key] = $recipe;
        }
        $count = count($recipes);
        $k = 0;
        $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper',
            'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic',
            'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla',
            'pepper', 'bread', 'sugar', 'vanillaextract', 'celery',
            'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger',
            'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo',
            'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine',
            'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies',
            'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract',
            'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
        // compute a weighted edge between every pair of recipes
        for ($i = 0; $i < $count; $i++) {
            $recipe1_main_ingredient = "";
            $recipe1 = $recipes[$i][1];
            $recipe_name = $recipes[$i][0];
            $recipe1_title = strtolower($recipes[$i][0]);
            $distinct_ingredients[$recipe_name] = $recipes[$i][1];
            $doc_keys[$recipe_name] = $recipes[$i][2];
            $recipes_summary[$recipe_name] = $recipes[$i][3];
            for ($j = $i + 1; $j < $count; $j++) {
                $recipe2_main_ingredient = "";
                $recipe2 = $recipes[$j][1];
                $recipe2_title = strtolower($recipes[$j][0]);
                $weights[$k][0] = $recipes[$i][0];
                $weights[$k][1] = $recipes[$j][0];
                $merge_array = array_merge($recipe1, $recipe2);
                $vector_array = array_unique($merge_array);
                sort($vector_array);
                $recipe1_vector = array_fill_keys($vector_array, 0);
                $recipe2_vector = array_fill_keys($vector_array, 0);
                foreach ($recipe1 as $ingredient) {
                    if ($ingredient != "" &&
                        !in_array($ingredient, $basic_ingredients)) {
                        if (strstr($recipe1_title, $ingredient)) {
                            $recipe1_main_ingredient = $ingredient;
                        }
                    }
                    $recipe1_vector[$ingredient] = 1;
                }
                foreach ($recipe2 as $ingredient) {
                    if ($ingredient != "" &&
                        !in_array($ingredient, $basic_ingredients)) {
                        if (strstr($recipe2_title, $ingredient)) {
                            $recipe2_main_ingredient = $ingredient;
                        }
                    }
                    $recipe2_vector[$ingredient] = 1;
                }
                $edge_weight = 0;
                $matches = 1;
                foreach ($vector_array as $vector) {
                    $diff = $recipe1_vector[$vector] -
                        $recipe2_vector[$vector];
                    $vector_diff[$vector] = pow($diff, 2);
                    if (abs($diff) == 1) {
                        $matches += 1;
                    }
                    $edge_weight += $vector_diff[$vector];
                }
                $main_ingredient_match = 1;
                if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                    $main_ingredient_match = 1000;
                }
                $edge_weight = sqrt($edge_weight) * $matches *
                    $main_ingredient_match;
                $weights[$k][2] = $edge_weight;
                $k++;
            }
        }
        crawlLog("...Making new shard with clustered recipes as docs.");
        $clusters = kruskalClustering($weights, $count,
            $distinct_ingredients);
        $index_shard = new IndexShard("cluster_shard");
        $word_lists = array();
        $recipe_sites = array();
        foreach ($clusters as $cluster) {
            $count = count($cluster);
            // last element of $cluster is the "ingredient" label,
            // so only the first $count - 1 entries are recipes
            for ($i = 0; $i < $count - 1; $i++) {
                $meta_ids = array();
                $summary = array();
                $recipe = $cluster[$i];
                $summary[self::URL] =
                    $recipes_summary[$recipe][self::URL];
                $summary[self::TITLE] =
                    $recipes_summary[$recipe][self::TITLE];
                $summary[self::DESCRIPTION] =
                    $recipes_summary[$recipe][self::DESCRIPTION];
                $summary[self::TIMESTAMP] =
                    $recipes_summary[$recipe][self::TIMESTAMP];
                $summary[self::ENCODING] =
                    $recipes_summary[$recipe][self::ENCODING];
                $summary[self::HASH] =
                    $recipes_summary[$recipe][self::HASH];
                $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                $hash_rhost = "r" . substr(crawlHash(
                    UrlParser::getHost($summary[self::URL]) . "/",
                    true), 1);
                $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                $summary[self::TYPE] =
                    $recipes_summary[$recipe][self::TYPE];
                $summary[self::HTTP_CODE] =
                    $recipes_summary[$recipe][self::HTTP_CODE];
                $recipe_sites[] = $summary;
                $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                crawlLog("ingredient:" . $cluster["ingredient"]);
                if (!$index_shard->addDocumentWords($doc_keys[$recipe],
                    self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids,
                    PhraseParser::$materialized_metas, true, false)) {
                    crawlLog("Problem inserting recipe: " .
                        $summary[self::TITLE]);
                }
            }
        }
        $shard_string = $index_shard->save(true);
        $index_shard = IndexShard::load("cluster_shard", $shard_string);
        unset($shard_string);
        crawlLog("...Adding recipe shard to index archive bundle");
        $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name .
            $index_name;
        $index_archive = new IndexArchiveBundle($dir, false);
        if ($index_shard->word_docs_packed) {
            $index_shard->unpackWordDocs();
        }
        $generation = $index_archive->initGenerationToAdd($index_shard);
        if (isset($recipe_sites)) {
            crawlLog("... Adding " . count($recipe_sites) .
                " recipe docs.");
            $index_archive->addPages($generation, self::SUMMARY_OFFSET,
                $recipe_sites, 0);
        }
        $k = 0;
        foreach ($recipe_sites as $site) {
            $recipe = $site[self::TITLE];
            $hash = crawlHash($site[self::URL], true) .
                $site[self::HASH] . "r" .
                substr(crawlHash(UrlParser::getHost($site[self::URL]) .
                "/", true), 1);
            $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
        }
        $index_shard->changeDocumentOffsets($summary_offsets);
        $index_archive->addIndexData($index_shard);
        $index_archive->saveAndAddCurrentShardDictionary();
        $index_archive->dictionary->mergeAllTiers();
        $this->db->setWorldPermissionsRecursive(
            CRAWL_DIR . '/cache/' .
            self::index_data_base_name . $index_name);
        crawlLog("...Recipe plugin finished.");
    }
}
/**
 * Copies all feed items newer than $age to a new shard, then deletes
 * the old index shard and the database entries older than $age. Finally,
 * it sets the copied shard to be the active one. If this method is going
 * to take more than max_execution_time/2, it returns false so that an
 * additional job can be scheduled; otherwise, it returns true.
 *
 * @param int $age how many seconds old records should be to be deleted
 * @return bool whether the job ran to completion
 */
function rebuildFeedShard($age)
{
    $time = time();
    $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
    $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
    $prune_shard = new IndexShard($prune_shard_name);
    $too_old = $time - $age;
    if (!$prune_shard) {
        return false;
    }
    $pre_feeds = $this->getNewsSources();
    if (!$pre_feeds) {
        return false;
    }
    $feeds = array();
    foreach ($pre_feeds as $pre_feed) {
        if (!isset($pre_feed['NAME'])) {
            continue;
        }
        $feeds[$pre_feed['NAME']] = $pre_feed;
    }
    $db = $this->db;
    // we now rebuild the inverted index with the remaining items
    $sql = "SELECT * FROM FEED_ITEM " .
        "WHERE PUBDATE >= ? " .
        "ORDER BY PUBDATE DESC";
    $result = $db->execute($sql, array($too_old));
    if ($result) {
        $completed = true;
        crawlLog("..still deleting. Making new index of non-pruned items.");
        $i = 0;
        while ($item = $db->fetchArray($result)) {
            crawlTimeoutLog("..have added %s non-pruned items to index.",
                $i);
            $i++;
            if (!isset($item['SOURCE_NAME'])) {
                continue;
            }
            $source_name = $item['SOURCE_NAME'];
            if (isset($feeds[$source_name])) {
                $lang = $feeds[$source_name]['LANGUAGE'];
            } else {
                $lang = "";
            }
            $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $raw_guid = unbase64Hash($item["GUID"]);
            $doc_keys = crawlHash($item["LINK"], true) . $raw_guid .
                "d" . substr(crawlHash(
                UrlParser::getHost($item["LINK"]) . "/", true), 1);
            $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
                $source_name, $item["GUID"]);
            $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'],
                $word_lists, $meta_ids, PhraseParser::$materialized_metas,
                true, false);
        }
    }
    $prune_shard->save();
    @chmod($prune_shard_name, 0777);
    @chmod($feed_shard_name, 0777);
    // atomically swap the pruned shard in as the active feed index
    @rename($prune_shard_name, $feed_shard_name);
    @chmod($feed_shard_name, 0777);
    $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
    $db->execute($sql, array($too_old));
    return true;
}
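/*
 * A minimal sketch of how a periodic news-update job might call the
 * method above. The $source_model variable is a hypothetical instance
 * of whatever model class holds rebuildFeedShard(); the one-week age
 * and the re-scheduling behavior on a false return follow the docblock,
 * not code confirmed here.
 */
$age = 7 * 86400; // prune feed items older than one week
if (!$source_model->rebuildFeedShard($age)) {
    // did not finish within its time budget; run another pass later
    crawlLog("Feed shard rebuild incomplete; scheduling another job.");
}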