Example #1
 /**
  * Implements post processing of recipes: recipes are extracted,
  * ingredients are scrubbed, and recipes are clustered. The clustered
  * recipes are added back to the index.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         $num_results = 0;
         if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
     // only cluster if doing so would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper',
             'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic',
             'cream', 'soda', 'honey', 'powder', 'water', 'vanilla',
             'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning',
             'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil',
             'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo',
             'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine',
             'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies',
             'lime', 'chillies', 'cilantro', 'rosemary', 'vinegar',
             'shallots', 'wine', 'cornmeal', 'nonstickspray');
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             $num_in_cluster = count($cluster) - 1; // last entry is the cluster's "ingredient" label
             for ($i = 0; $i < $num_in_cluster; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         if (count($recipe_sites) > 0) {
             crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
             $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         }
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] .
                 "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) .
                 "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
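The pairwise weights computed above are Euclidean distances between binary ingredient vectors, scaled by the number of non-shared ingredients and by a factor of 1000 when the two recipes' main ingredients differ, so that kruskalClustering tends to keep recipes built around the same main ingredient together. Below is a minimal standalone sketch of that metric, assuming plain PHP arrays of scrubbed ingredient names (the function name is illustrative, not part of the plugin):

 /**
  * Sketch of the edge weight used by the clustering step above:
  * Euclidean distance between binary ingredient vectors, scaled by
  * the count of non-shared ingredients and a penalty when the main
  * ingredients differ.
  */
 function recipeEdgeWeight($ingredients1, $ingredients2, $main1, $main2)
 {
     $all = array_unique(array_merge($ingredients1, $ingredients2));
     $distance_squared = 0;
     $non_shared = 1; // mirrors the $matches = 1 initialization above
     foreach ($all as $ingredient) {
         $in1 = in_array($ingredient, $ingredients1) ? 1 : 0;
         $in2 = in_array($ingredient, $ingredients2) ? 1 : 0;
         $diff = $in1 - $in2;
         $distance_squared += $diff * $diff;
         if (abs($diff) == 1) {
             $non_shared++; // ingredient appears in exactly one recipe
         }
     }
     $penalty = ($main1 != $main2) ? 1000 : 1;
     return sqrt($distance_squared) * $non_shared * $penalty;
 }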
Example #2
 /**
  * Adds the summary and index data in $file to summary bundle and word index
  *
  * @param string $file containing web pages summaries and a mini-inverted
  *     index for their content
  * @param bool $blocking this method might be called by the indexer
  *     subcomponent when a merge tier phase is ongoing, to allow for
  *     other processing to occur. If so, we don't want a regress
  *     where the indexer calls this code, which calls the indexer,
  *     etc. If the blocking flag is set, the indexer subcomponent
  *     won't be called.
  */
 function processIndexArchive($file, $blocking)
 {
     static $blocked = false;
     if ($blocking && $blocked) {
         crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. B");
         return;
     }
     if (!$blocking) {
         $blocked = false;
     }
     crawlLog("{$this->server_name} is starting to process index data," . " memory usage: " . memory_get_usage() . "...");
     crawlLog("Indexer: Processing index data in {$file}...");
     $start_time = microtime();
     $start_total_time = microtime();
     $pre_sites = webdecode(file_get_contents($file));
     $len_urls = unpackInt(substr($pre_sites, 0, 4));
     $seen_urls_string = substr($pre_sites, 4, $len_urls);
     $pre_sites = substr($pre_sites, 4 + $len_urls);
     $sites[self::SEEN_URLS] = array();
     $pos = 0;
     $num = 0;
     $bad = false;
     $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER * (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
     while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
         crawlTimeoutLog("..Indexer still processing index data at position" . " %s of out of %s", $pos, $len_urls);
         $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
         if ($len_site > 2 * $this->page_range_request) {
             crawlLog("Indexer: Site string too long, {$len_site}," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $pos += 4;
         $site_string = substr($seen_urls_string, $pos, $len_site);
         $pos += strlen($site_string);
         $tmp = unserialize(gzuncompress($site_string));
         if (!$tmp || !is_array($tmp)) {
             crawlLog("Compressed array null," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $sites[self::SEEN_URLS][] = $tmp;
         $num++;
     }
     if ($num > $max_batch_sites_and_links * SEEN_URLS_BEFORE_UPDATE_SCHEDULER || $bad) {
         crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " . "may be corrupt so skipping this file.");
         crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
         unlink($file);
         return;
     }
     crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard", $pre_sites);
     unset($pre_sites);
     crawlLog("B. Indexer Load Sent shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $start_time = microtime();
     //do deduplication of summaries
     if (isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) {
         $seen_sites = $sites[self::SEEN_URLS];
         $seen_sites = array_values($seen_sites);
         unset($sites[self::SEEN_URLS]);
         $num_seen = count($seen_sites);
         crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
     } else {
         $num_seen = 0;
     }
     $visited_urls_count = 0;
     $recent_urls_count = 0;
     $recent_urls = array();
     for ($i = 0; $i < $num_seen; $i++) {
         $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL], true);
         $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
         if (strcmp("url", $link_url_parts[0]) == 0) {
             $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
             $seen_sites[$i][self::HASH_URL] = crawlHash($link_url_parts[1], true) . crawlHash($seen_sites[$i][self::URL], true) . $reftype . substr(crawlHash(UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
             $seen_sites[$i][self::IS_DOC] = false;
         } else {
             $seen_sites[$i][self::IS_DOC] = true;
             $visited_urls_count++;
             array_push($recent_urls, $seen_sites[$i][self::URL]);
             if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                 array_shift($recent_urls);
             }
             $recent_urls_count++;
         }
     }
     if (isset($sites[self::INVERTED_INDEX])) {
         $index_shard =& $sites[self::INVERTED_INDEX];
         $generation = $this->index_archive->initGenerationToAdd($index_shard->num_docs, $this, $blocking);
         if ($generation == -1) {
             crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. A");
             $blocked = true;
             return;
         }
         $summary_offsets = array();
         if (isset($seen_sites)) {
             $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count);
             foreach ($seen_sites as $site) {
                 if ($site[self::IS_DOC]) {
                     // an actual document, not a link record
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $hash = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                 } else {
                     $hash = $site[self::HASH_URL];
                 }
                 $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
             }
             unset($seen_sites);
         }
         crawlLog("C. Indexer init local shard, store " . "Summaries memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         // add summary offset info to inverted index data
         $index_shard->changeDocumentOffsets($summary_offsets);
         crawlLog("D. Indexer Update shard offsets. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         $this->index_archive->addIndexData($index_shard);
         $this->index_dirty = true;
     }
     crawlLog("E. Indexer Add index shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
     if (isset($recent_urls)) {
         $sites[self::RECENT_URLS] =& $recent_urls;
         $this->writeCrawlStatus($sites);
     }
     if (file_exists($file)) {
         // Haven't tracked down why yet, but without this check the file
         // can be deleted twice, producing a warning
         unlink($file);
     }
 }
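For reference, the seen-URLs block decoded at the top of processIndexArchive is a 4-byte total length followed by per-site records, each a length-prefixed gzcompressed serialized summary array. A sketch of how a fetcher-side producer could assemble such a block, assuming packInt/unpackInt are 4-byte big-endian helpers (pack("N", ...) stands in for packInt here, and the outer webencode transport wrapper is omitted):

 /**
  * Sketch: build a seen-urls block in the layout processIndexArchive
  * parses -- a 4-byte total length, then for each site a 4-byte
  * record length followed by gzcompress(serialize($site)).
  */
 function packSeenUrls($sites)
 {
     $records = "";
     foreach ($sites as $site) {
         $record = gzcompress(serialize($site));
         $records .= pack("N", strlen($record)) . $record; // per-record length prefix
     }
     return pack("N", strlen($records)) . $records; // total length prefix
 }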
Example #3
 /**
  * Returns the shard which is currently being used to read word-document
  * data from the bundle. If one wants to write data to the bundle use
  * getActiveShard() instead. The point of this method is to allow
  * for lazy reading of the file associated with the shard.
  *
  * @return object the index shard currently being used for reading
  */
 function getCurrentShard()
 {
     if (!isset($this->current_shard)) {
         if (!isset($this->generation_info['CURRENT'])) {
             $this->generation_info['CURRENT'] = $this->generation_info['ACTIVE'];
         }
         $current_index_shard_file = $this->dir_name . "/posting_doc_shards/index" . $this->generation_info['CURRENT'];
         if (file_exists($current_index_shard_file)) {
             if (isset($this->generation_info['DISK_BASED']) && $this->generation_info['DISK_BASED'] == true) {
                 $this->current_shard = new IndexShard($current_index_shard_file, $this->generation_info['CURRENT'], $this->num_docs_per_generation, true);
                 $this->current_shard->getShardHeader();
                 $this->current_shard->read_only_from_disk = true;
             } else if (filesize($current_index_shard_file) >
                 self::NO_LOAD_SIZE) {
                 $this->addAdvanceGeneration();
             } else {
                 $this->current_shard =
                     IndexShard::load($current_index_shard_file);
             }
         } else {
             $this->current_shard = new IndexShard($current_index_shard_file, $this->generation_info['CURRENT'], $this->num_docs_per_generation);
         }
     }
     return $this->current_shard;
 }
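Callers can therefore construct a bundle cheaply and pay the file-read cost only when word-document data is first requested. A minimal read-path sketch (the bundle path and word are illustrative; constructor arguments mirror the call in Example #1):

 // The shard file backing the current generation is only opened on
 // the first getCurrentShard() call.
 $bundle = new IndexArchiveBundle($dir_name, false);
 $shard = $bundle->getCurrentShard();
 $postings = $shard->getPostingsSliceById(crawlHashWord('recipe', true), 10);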
Example #4
 /**
  * Check that save and load work
  */
 function saveLoadTestCase()
 {
     $docid = "AAAAAAAABBBBBBBBCCCCCCCC";
     $offset = 5;
     $word_counts = array('BBBBBBBB' => array(1), 'CCCCCCCC' => array(2), 'DDDDDDDD' => array(6));
     $meta_ids = array("EEEEEEEE", "FFFFFFFF");
     //test saving and loading to a file
     $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids, array(), true);
     $this->test_objects['shard']->save();
     $this->test_objects['shard2'] = IndexShard::load(WORK_DIRECTORY . "/shard.txt");
     $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3, "Len All Docs Correctly Counts Length of First Doc");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('BBBBBBBB', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('CCCCCCCC', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 2 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('DDDDDDDD', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 3 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('EEEEEEEE', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 4 by meta id works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('FFFFFFFF', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup 5 by meta id works");
     // test saving and loading from a string
     $out_string = $this->test_objects['shard']->save(true);
     $this->test_objects['shard2'] = IndexShard::load("shard.txt", $out_string);
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('BBBBBBBB', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('CCCCCCCC', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 2 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('DDDDDDDD', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 3 by word works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('EEEEEEEE', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 4 by meta id works");
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('FFFFFFFF', true), 5);
     $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "String Load Doc lookup 5 by meta id works");
 }
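A companion negative check fits the same pattern: a word that was never added should not find the document. Sketched below under the same fixtures, as lines that could be appended to saveLoadTestCase (assertFalse is assumed to exist alongside assertTrue in this test framework):

     // Sketch: lookup of a word that was never indexed finds nothing
     $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord('GGGGGGGG', true), 5);
     $this->assertFalse(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]), "Doc lookup of unindexed word finds nothing");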
Example #5
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step re-extracts the words from
  * the summaries' web archives into an inverted index. Then a
  * reindex is done.
  *
  * @param string $archive_path file path to an IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     // for link records the doc key and url are stored in
                     // the HTTP_CODE and TITLE fields, respectively
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                     } else {
                         $meta_ids[] = "safe:false";
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
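The $doc_keys string assembled above follows a fixed layout that recurs throughout these examples: the hash of the URL, the stored page hash, then a one-character type marker ('d' for documents here, 'r' for recipes in Example #1, 'e'/'i' for link records in Example #2) spliced onto the tail of the host hash. A sketch of that composition using the document's own crawlHash and UrlParser helpers (the function name is hypothetical):

 /**
  * Sketch: compose a document key as in the examples above -- hash of
  * the url, the page hash, then a type marker prepended to the tail
  * of the host hash.
  */
 function makeDocKey($url, $page_hash, $marker = "d")
 {
     $host_hash = crawlHash(UrlParser::getHost($url) . "/", true);
     return crawlHash($url, true) . $page_hash . $marker . substr($host_hash, 1);
 }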
Example #6
 /**
  * Copies all feed items newer than $age to a new shard, then deletes
  * the old index shard and database entries older than $age. Finally,
  * sets the copied shard to be the active one. If this method is going
  * to take more than max_execution_time/2, it returns false so an
  * additional job can be scheduled; otherwise it returns true.
  *
  * @param int $age records older than this many seconds are deleted
  * @return bool whether the job ran to completion
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     $prune_shard = new IndexShard($prune_shard_name);
     $too_old = $time - $age;
     if (!$prune_shard) {
         return false;
     }
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if ($result) {
         crawlLog("..still deleting. Making new index of non-pruned items.");
         $i = 0;
         while ($item = $db->fetchArray($result)) {
             crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
             $i++;
             if (!isset($item['SOURCE_NAME'])) {
                 continue;
             }
             $source_name = $item['SOURCE_NAME'];
             if (isset($feeds[$source_name])) {
                 $lang = $feeds[$source_name]['LANGUAGE'];
             } else {
                 $lang = "";
             }
             $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $raw_guid = unbase64Hash($item["GUID"]);
             $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
             $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
             $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
         }
     }
     $prune_shard->save();
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
     return true;
 }
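A usage sketch for the method above: a caller in the feed-update loop might prune week-old items and schedule a follow-up job when the rebuild reports it could not finish (the constant and the scheduling mechanism are illustrative assumptions):

 $one_week = 7 * 24 * 3600; // seconds; constant name is illustrative
 if (!$this->rebuildFeedShard($one_week)) {
     // rebuild did not run to completion; schedule an additional job here
 }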