/**
 * Implements post processing of recipes. Recipes are extracted,
 * ingredients are scrubbed and recipes are clustered. The clustered
 * recipes are added back to the index under "ingredient:" meta words.
 *
 * Outline: (1) page through the index for docs tagged recipe:all,
 * (2) split each description into ingredient strings and scrub them,
 * (3) compute pairwise edge weights between recipes from ingredient
 * vectors, (4) cluster via kruskalClustering, (5) build a new
 * IndexShard of the clustered recipes and merge it into the bundle.
 *
 * @param string $index_name index name (timestamp) of the current crawl
 */
function postProcessing($index_name)
{
    global $INDEXING_PLUGINS;
    // clustering relies on SPL's SplHeap; bail out if SPL is unavailable
    if (!class_exists("SplHeap")) {
        crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
        crawlLog("...Aborting plugin");
        return;
    }
    $locale_tag = guessLocale();
    setLocaleObject($locale_tag);
    $search_controller = new SearchController($INDEXING_PLUGINS);
    // query for every doc the crawl tagged as a recipe in this index
    $query = "recipe:all i:{$index_name}";
    crawlLog("...Running Recipe Plugin!");
    crawlLog("...Finding docs tagged as recipes.");
    $more_docs = true;
    $raw_recipes = array();
    $limit = 0;
    $num = 100; // page size for queryRequest
    while ($more_docs) {
        // errors from queryRequest are deliberately suppressed (@)
        $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
        if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
            $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
        }
        // NOTE(review): if PAGES is never set, $num_results is undefined
        // here on the first iteration -- presumably queryRequest always
        // returns PAGES; verify against SearchController
        crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
        $limit += $num_results;
        // a SAVE_POINT entry of -1 means that iterator is exhausted;
        // stop only when every save point reports -1
        if (isset($results["SAVE_POINT"])) {
            $end = true;
            foreach ($results["SAVE_POINT"] as $save_point) {
                if ($save_point != -1) {
                    $end = false;
                }
            }
            if ($end) {
                $more_docs = false;
            }
        } else {
            $more_docs = false;
        }
    }
    crawlLog("...Clustering.");
    // only cluster if would make more than one cluster
    if (count($raw_recipes) * CLUSTER_RATIO > 1) {
        $recipes = array();
        $i = 0;
        // descriptions were stored as "||"-separated ingredient lists by
        // the page-processing stage; keep only docs with > 1 ingredient
        foreach ($raw_recipes as $raw_recipe) {
            $description = $raw_recipe[self::DESCRIPTION];
            $ingredients = explode("||", $description);
            if (is_array($ingredients) && count($ingredients) > 1) {
                $recipes[$i][0] = $raw_recipe[self::TITLE];
                $recipes[$i][1] = $ingredients;
                $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                $recipes[$i][3] = $raw_recipe;
                $i++;
            }
        }
        $recipes_ingredients = array(); // NOTE(review): never used below
        $count = count($recipes);
        // scrub each ingredient string down to its main ingredient name;
        // drop empties and section headers (strings ending in ":")
        foreach ($recipes as $key => $recipe) {
            foreach ($recipe[1] as $index => $ingredient) {
                if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                    $mainIngredient = $this->getIngredientName((string) $ingredient);
                    if (strlen($mainIngredient) != 0) {
                        $recipe[1][$index] = $mainIngredient;
                    } else {
                        unset($recipe[1][$index]);
                    }
                } else {
                    unset($recipe[1][$index]);
                }
            }
            $recipes[$key] = $recipe;
        }
        $count = count($recipes); // recomputed; same value as above
        $k = 0;
        // ingredients too common to be a recipe's "main" ingredient
        // NOTE(review): contains duplicates ('sauce', 'pepper',
        // 'vanillaextract') and the misspelling 'cinammon' -- left as-is
        $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
        // O(n^2) pass: compute an edge weight between every recipe pair
        for ($i = 0; $i < $count; $i++) {
            $recipe1_main_ingredient = "";
            $recipe1 = $recipes[$i][1];
            $recipe_name = $recipes[$i][0];
            $recipe1_title = strtolower($recipes[$i][0]);
            $distinct_ingredients[$recipe_name] = $recipes[$i][1];
            $doc_keys[$recipe_name] = $recipes[$i][2];
            $recipes_summary[$recipe_name] = $recipes[$i][3];
            for ($j = $i + 1; $j < $count; $j++) {
                $recipe2_main_ingredient = "";
                $recipe2 = $recipes[$j][1];
                $recipe2_title = strtolower($recipes[$j][0]);
                $weights[$k][0] = $recipes[$i][0];
                $weights[$k][1] = $recipes[$j][0];
                // binary ingredient-presence vectors over the union of
                // both recipes' ingredients
                $merge_array = array_merge($recipe1, $recipe2);
                $vector_array = array_unique($merge_array);
                sort($vector_array);
                $recipe1_vector = array_fill_keys($vector_array, 0);
                $recipe2_vector = array_fill_keys($vector_array, 0);
                // a non-basic ingredient appearing in the recipe's own
                // title is treated as that recipe's main ingredient
                foreach ($recipe1 as $ingredient) {
                    if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                        if (strstr($recipe1_title, $ingredient)) {
                            $recipe1_main_ingredient = $ingredient;
                        }
                    }
                    $recipe1_vector[$ingredient] = 1;
                }
                foreach ($recipe2 as $ingredient) {
                    if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                        if (strstr($recipe2_title, $ingredient)) {
                            $recipe2_main_ingredient = $ingredient;
                        }
                    }
                    $recipe2_vector[$ingredient] = 1;
                }
                // Euclidean distance between the presence vectors,
                // scaled by the count of non-shared ingredients
                $edge_weight = 0;
                $matches = 1;
                foreach ($vector_array as $vector) {
                    $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                    $vector_diff[$vector] = pow($diff, 2);
                    if (abs($diff) == 1) {
                        $matches += 1;
                    }
                    $edge_weight += $vector_diff[$vector];
                }
                // heavily penalize pairs with different main ingredients
                // so clustering keeps like-with-like
                $main_ingredient_match = 1;
                if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                    $main_ingredient_match = 1000;
                }
                $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                $weights[$k][2] = $edge_weight;
                $k++;
            }
        }
        crawlLog("...Making new shard with clustered recipes as docs.");
        $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
        $index_shard = new IndexShard("cluster_shard");
        $word_lists = array();
        $recipe_sites = array();
        foreach ($clusters as $cluster) {
            $count = count($cluster);
            // last slot of $cluster holds the "ingredient" label, hence
            // the loop runs to $count - 1
            for ($i = 0; $i < $count - 1; $i++) {
                $meta_ids = array();
                $summary = array();
                $recipe = $cluster[$i];
                // rebuild a summary record from the stored raw recipe
                $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                // doc key = hash(url) . hash(page) . "r" . hash(host)[1:]
                $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                $recipe_sites[] = $summary;
                // index each recipe under its cluster's ingredient label
                $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                crawlLog("ingredient:" . $cluster["ingredient"]);
                if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                    crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                }
            }
        }
        // round-trip through save/load to normalize the shard's state
        $shard_string = $index_shard->save(true);
        $index_shard = IndexShard::load("cluster_shard", $shard_string);
        unset($shard_string);
        crawlLog("...Adding recipe shard to index archive bundle");
        $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
        $index_archive = new IndexArchiveBundle($dir, false);
        if ($index_shard->word_docs_packed) {
            $index_shard->unpackWordDocs();
        }
        $generation = $index_archive->initGenerationToAdd($index_shard);
        if (isset($recipe_sites)) {
            crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
            $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
        }
        $k = 0; // NOTE(review): unused after this point
        // map each doc key back to the summary offset addPages assigned
        foreach ($recipe_sites as $site) {
            $recipe = $site[self::TITLE];
            $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
            $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
        }
        $index_shard->changeDocumentOffsets($summary_offsets);
        $index_archive->addIndexData($index_shard);
        $index_archive->saveAndAddCurrentShardDictionary();
        $index_archive->dictionary->mergeAllTiers();
        // make the new cache files world-accessible for other processes
        $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
        crawlLog("...Recipe plugin finished.");
    }
}
/**
 * Returns the shard which is currently being used to read word-document
 * data from the bundle. If one wants to write data to the bundle use
 * getActiveShard() instead. Loading of the shard file is deferred until
 * the first call; after that the shard is cached in $this->current_shard.
 *
 * @return object the index shard currently used for reads
 */
function getCurrentShard()
{
    // already materialized on an earlier call -- reuse the cached shard
    if (isset($this->current_shard)) {
        return $this->current_shard;
    }
    // default the read generation to the active (write) generation
    if (!isset($this->generation_info['CURRENT'])) {
        $this->generation_info['CURRENT'] = $this->generation_info['ACTIVE'];
    }
    $generation = $this->generation_info['CURRENT'];
    $shard_file = $this->dir_name . "/posting_doc_shards/index" . $generation;
    if (!file_exists($shard_file)) {
        // no shard on disk yet for this generation -- start a fresh one
        $this->current_shard = new IndexShard($shard_file, $generation,
            $this->num_docs_per_generation);
    } else if (isset($this->generation_info['DISK_BASED']) &&
        $this->generation_info['DISK_BASED'] == true) {
        // disk-based mode: open read-only and pull in just the header
        $this->current_shard = new IndexShard($shard_file, $generation,
            $this->num_docs_per_generation, true);
        $this->current_shard->getShardHeader();
        $this->current_shard->read_only_from_disk = true;
    } else if (filesize($shard_file) > self::NO_LOAD_SIZE) {
        // too big to load into memory; move on to a new generation
        // NOTE(review): $this->current_shard stays unset on this path, so
        // the return below yields no shard -- matches original behavior
        $this->addAdvanceGeneration();
    } else {
        $this->current_shard = IndexShard::load($shard_file);
    }
    return $this->current_shard;
}
/**
 * Adds the summary and index data in $file to summary bundle and word index
 *
 * The file layout is: 4-byte length of the summaries section, then that
 * many bytes of length-prefixed, gzcompress'd, serialized site summaries,
 * then a serialized fetcher IndexShard for the remainder.
 *
 * @param string $file containing web pages summaries and a mini-inverted
 *     index for their content
 * @param bool $blocking this method might be called by the indexer
 *     subcomponent when a merge tier phase is ongoing to allow for
 *     other processing to occur. If so, we don't want a regress
 *     where the indexer calls this code calls the indexer etc. If
 *     the blocking flag is set then the indexer subcomponent won't
 *     be called
 */
function processIndexArchive($file, $blocking)
{
    // re-entrancy latch: remembers across calls that a previous attempt
    // was deferred by an ongoing dictionary tier merge
    static $blocked = false;
    if ($blocking && $blocked) {
        crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. B");
        return;
    }
    if (!$blocking) {
        $blocked = false;
    }
    crawlLog("{$this->server_name} is starting to process index data," . " memory usage: " . memory_get_usage() . "...");
    crawlLog("Indexer: Processing index data in {$file}...");
    $start_time = microtime();
    $start_total_time = microtime();
    $pre_sites = webdecode(file_get_contents($file));
    // first 4 bytes: byte length of the packed summaries section
    $len_urls = unpackInt(substr($pre_sites, 0, 4));
    $seen_urls_string = substr($pre_sites, 4, $len_urls);
    // the rest of the payload is the serialized fetcher shard
    $pre_sites = substr($pre_sites, 4 + $len_urls);
    $sites[self::SEEN_URLS] = array();
    $pos = 0;
    $num = 0;
    $bad = false;
    // upper bound on summaries one batch can legitimately contain
    $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER * (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
    // walk the length-prefixed records in the summaries section
    while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
        crawlTimeoutLog("..Indexer still processing index data at position" . " %s of out of %s", $pos, $len_urls);
        $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
        // a record longer than any page we'd have fetched means the data
        // file is corrupt; stop parsing
        if ($len_site > 2 * $this->page_range_request) {
            crawlLog("Indexer: Site string too long, {$len_site}," . " data file may be corrupted? Skip rest.");
            $bad = true;
            break;
        }
        $pos += 4;
        $site_string = substr($seen_urls_string, $pos, $len_site);
        $pos += strlen($site_string);
        // NOTE(review): unserialize on data read from a local work file;
        // assumed trusted as it was written by this crawler's fetchers
        $tmp = unserialize(gzuncompress($site_string));
        if (!$tmp || !is_array($tmp)) {
            crawlLog("Compressed array null," . " data file may be corrupted? Skip rest.");
            $bad = true;
            break;
        }
        $sites[self::SEEN_URLS][] = $tmp;
        $num++;
    }
    // NOTE(review): this multiplies by SEEN_URLS_BEFORE_UPDATE_SCHEDULER
    // a second time (the loop above capped $num at
    // $max_batch_sites_and_links), so the $num half of this check looks
    // unreachable -- verify intent before changing
    if ($num > $max_batch_sites_and_links * SEEN_URLS_BEFORE_UPDATE_SCHEDULER || $bad) {
        crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " . "may be corrupt so skipping this file.");
        crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
        unlink($file);
        return;
    }
    crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
    $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard", $pre_sites);
    unset($pre_sites);
    crawlLog("B. Indexer Load Sent shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
    $start_time = microtime();
    //do deduplication of summaries
    if (isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) {
        $seen_sites = $sites[self::SEEN_URLS];
        $seen_sites = array_values($seen_sites);
        unset($sites[self::SEEN_URLS]);
        $num_seen = count($seen_sites);
        crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
    } else {
        $num_seen = 0;
    }
    $visited_urls_count = 0;
    $recent_urls_count = 0;
    $recent_urls = array();
    for ($i = 0; $i < $num_seen; $i++) {
        $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL], true);
        // link records encode their fields "|"-separated with a leading
        // "url" marker; real documents have ordinary URLs
        $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
        if (strcmp("url", $link_url_parts[0]) == 0) {
            // "e" = external reference, "i" = internal
            $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
            $seen_sites[$i][self::HASH_URL] = crawlHash($link_url_parts[1], true) . crawlHash($seen_sites[$i][self::URL], true) . $reftype . substr(crawlHash(UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
            $seen_sites[$i][self::IS_DOC] = false;
        } else {
            $seen_sites[$i][self::IS_DOC] = true;
            $visited_urls_count++;
            // keep a bounded FIFO of recently crawled doc URLs for the
            // status display
            array_push($recent_urls, $seen_sites[$i][self::URL]);
            if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                array_shift($recent_urls);
            }
            $recent_urls_count++;
        }
    }
    if (isset($sites[self::INVERTED_INDEX])) {
        $index_shard =& $sites[self::INVERTED_INDEX];
        $generation = $this->index_archive->initGenerationToAdd($index_shard->num_docs, $this, $blocking);
        // -1 signals a tier merge is in progress; defer and remember via
        // the static latch so the next blocking call also defers
        if ($generation == -1) {
            crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. A");
            $blocked = true;
            return;
        }
        $summary_offsets = array();
        if (isset($seen_sites)) {
            $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count);
            foreach ($seen_sites as $site) {
                if ($site[self::IS_DOC]) { // so not link
                    // "|" is the link-record separator, so escape it in
                    // document URLs before hashing
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $hash = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                } else {
                    $hash = $site[self::HASH_URL];
                }
                $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
            }
            unset($seen_sites);
        }
        crawlLog("C. Indexer init local shard, store " . "Summaries memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
        $start_time = microtime();
        // added summary offset info to inverted index data
        $index_shard->changeDocumentOffsets($summary_offsets);
        crawlLog("D. Indexer Update shard offsets. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
        $start_time = microtime();
        $this->index_archive->addIndexData($index_shard);
        $this->index_dirty = true;
    }
    crawlLog("E. Indexer Add index shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
    crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
    if (isset($recent_urls)) {
        $sites[self::RECENT_URLS] =& $recent_urls;
        $this->writeCrawlStatus($sites);
    }
    if (file_exists($file)) {
        //Haven't tracked down yet, but can try to delete twice giving warn
        unlink($file);
    }
}
/**
 * Check that save and load work.
 *
 * Adds a single document (three words plus two meta ids) to a shard,
 * then verifies that the document can be looked up by every word after
 * (a) saving to and loading from a file and (b) saving to and loading
 * from an in-memory string.
 *
 * Fix: the original copy-pasted the assertion message "Doc lookup 2 by
 * word works" for four different words in each phase, making failures
 * ambiguous; the lookups are now driven by a loop with a per-word,
 * per-phase message.
 */
function saveLoadTestCase()
{
    $docid = "AAAAAAAABBBBBBBBCCCCCCCC";
    $offset = 5;
    $word_counts = array('BBBBBBBB' => array(1), 'CCCCCCCC' => array(2), 'DDDDDDDD' => array(6));
    $meta_ids = array("EEEEEEEE", "FFFFFFFF");
    // every term (content words and meta ids) the doc should be found under
    $lookup_words = array_merge(array_keys($word_counts), $meta_ids);
    //test saving and loading to a file
    $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids, array(), true);
    $this->test_objects['shard']->save();
    $this->test_objects['shard2'] = IndexShard::load(WORK_DIRECTORY . "/shard.txt");
    $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3,
        "Len All Docs Correctly Counts Length of First Doc");
    foreach ($lookup_words as $word) {
        $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord($word, true), 5);
        $this->assertTrue(isset($c_data[$docid]),
            "File Load Doc lookup by word {$word} works");
    }
    // test saving and loading from a string
    $out_string = $this->test_objects['shard']->save(true);
    $this->test_objects['shard2'] = IndexShard::load("shard.txt", $out_string);
    foreach ($lookup_words as $word) {
        $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord($word, true), 5);
        $this->assertTrue(isset($c_data[$docid]),
            "String Load Doc lookup by word {$word} works");
    }
}