Example #1
0
 /**
  * Using the supplied $word_structs, constructs an iterator for getting
  * results to a query
  *
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_KEYS -- an array of word keys the document must not
  *         contain
  *     WEIGHT -- a weight to multiply scores returned from this iterator by
  *     INDEX_NAME -- an index timestamp to get results from
  * @param array& $filter an array of hashes of domains to filter from
  *     results and then potentially restored in cache
  * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
  *     no grouping done on data also no summaries returned (only lookup
  *     info), $raw > 1 return summaries but no grouping
  * @param int& $to_retrieve number of items to retrieve from location in
  *     iterator; passed by reference -- it may be reset to -1 below when
  *     results will be grouped in small blocks
  * @param array $queue_servers a list of urls of yioop machines which might
  *     be used during lookup
  * @param string $original_query if set, the original query that corresponds
  *     to $word_structs
  * @param string $save_timestamp_name if this timestamp is non empty, then
  *     when making iterator get sub-iterators to advance to gen doc_offset
  *     stored with respect to save_timestamp if exists.
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return object an iterator for iterating through results to the
  *     query, or NULL if no iterator could be constructed
  */
 function getQueryIterator($word_structs, &$filter, $raw, &$to_retrieve, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     $iterators = array();
     $total_iterators = 0;
     $network_flag = false;
     $min_group_flag = false;
     $min_group_override = false;
     // Network case: several machines (or one non-local machine) serve the
     // index, so a single NetworkIterator is built to query them all
     if ($queue_servers != array() && !$this->isSingleLocalhost($queue_servers)) {
         $network_flag = true;
         $total_iterators = 1;
         if (!in_array(NAME_SERVER, $queue_servers)) {
             $queue_servers[] = NAME_SERVER;
             //name server might still have news
         }
         $num_servers = count($queue_servers);
         // use the controller's index name if set, otherwise fall back to
         // the index recorded in the first word struct
         if ((!isset($this->index_name) || !$this->index_name) && isset($word_structs[0]["INDEX_NAME"])) {
             $index_name = $word_structs[0]["INDEX_NAME"];
         } else {
             $index_name = $this->index_name;
         }
         $iterators[0] = new NetworkIterator($original_query, $queue_servers, $index_name, $filter, $save_timestamp_name, $limit_news);
     }
     if (!$network_flag) {
         // 9-byte hash prefixes for the site:any / site:doc meta words
         // (both current and legacy hash forms); keys matching one of
         // these iterate over all documents via a DocIterator
         $doc_iterate_hashes = array(substr(crawlHashWord("site:any"), 0, 9), substr(crawlHash("site:any"), 0, 9), substr(crawlHashWord("site:doc"), 0, 9), substr(crawlHash("site:doc"), 0, 9));
         if ($save_timestamp_name != "") {
             // used for archive crawls of crawl mixes: restore the saved
             // resume positions for each sub-iterator, if a save file exists
             $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
             if (file_exists($save_file)) {
                 $save_point = unserialize(file_get_contents($save_file));
             }
             $save_count = 0;
         }
         foreach ($word_structs as $word_struct) {
             if (!is_array($word_struct)) {
                 continue;
             }
             $word_keys = $word_struct["KEYS"];
             $distinct_word_keys = array();
             $seen_keys = array();
             // de-duplicate word keys (keys may be plain strings or arrays
             // whose first element is the key string) so only one iterator
             // is built per distinct key
             foreach ($word_keys as $wkey) {
                 if (is_string($wkey) || is_string($wkey[0])) {
                     $tmp_key = is_string($wkey) ? $wkey : $wkey[0];
                     if (!isset($seen_keys[$tmp_key])) {
                         $seen_keys[$tmp_key] = true;
                         $distinct_word_keys[] = $wkey;
                     }
                 } else {
                     $distinct_word_keys[] = $wkey;
                 }
             }
             $quote_positions = $word_struct["QUOTE_POSITIONS"];
             $disallow_keys = $word_struct["DISALLOW_KEYS"];
             $index_name = $word_struct["INDEX_NAME"];
             $weight = $word_struct["WEIGHT"];
             $num_word_keys = count($word_keys);
             $total_iterators = count($distinct_word_keys);
             $word_iterators = array();
             $word_iterator_map = array();
             if ($num_word_keys < 1) {
                 continue;
             }
             $sum = 0;
             // build one iterator per distinct key
             for ($i = 0; $i < $total_iterators; $i++) {
                 // unwrap the (possibly nested) key structure down to the
                 // underlying key string
                 $current_key = is_string($distinct_word_keys[$i]) ? $distinct_word_keys[$i] : (is_string($distinct_word_keys[$i][0]) ? $distinct_word_keys[$i][0] : $distinct_word_keys[$i][0][0]);
                 if (!is_string($current_key)) {
                     $current_key = $current_key[0];
                 }
                 if (in_array(substr($current_key, 0, 9), $doc_iterate_hashes)) {
                     // site:any / site:doc match every document
                     $word_iterators[$i] = new DocIterator($index_name, $filter, $to_retrieve);
                     $min_group_override = true;
                 } else {
                     //can happen if exact phrase search suffix approach used
                     if (isset($distinct_word_keys[$i][0][0]) && is_array($distinct_word_keys[$i][0][0])) {
                         $distinct_keys = array($distinct_word_keys[$i][0][1]);
                     } else {
                         if (isset($distinct_word_keys[$i][0]) && is_array($distinct_word_keys[$i][0])) {
                             $distinct_keys = $distinct_word_keys[$i];
                         } else {
                             $distinct_keys = array($distinct_word_keys[$i]);
                         }
                     }
                     $out_keys = array();
                     $old_distinct_key_id = "";
                     // look up dictionary info for each candidate key,
                     // stopping once enough postings have been found
                     foreach ($distinct_keys as $distinct_key) {
                         if (is_array($distinct_key)) {
                             if (!isset($distinct_key[2]) && isset($distinct_key[1])) {
                                 $distinct_keys[] = $distinct_key[1];
                             }
                             $shift = isset($distinct_key[1]) ? $distinct_key[1] : 0;
                             $mask = isset($distinct_key[2]) ? $distinct_key[2] : "" . "";
                             if (isset($distinct_key[3])) {
                                 // element 3, when present, is a legacy
                                 // (old-format) hash to also look up
                                 $old_distinct_key_id = unbase64Hash($distinct_key[3]);
                             }
                             $distinct_key_id = unbase64Hash($distinct_key[0]);
                         } else {
                             $shift = 0;
                             $mask = "" . "";
                             $distinct_key_id = unbase64Hash($distinct_key);
                         }
                         $lookup_cutoff = max(MIN_RESULTS_TO_GROUP, $to_retrieve);
                         $info = IndexManager::getWordInfo($index_name, $distinct_key_id, $shift, $mask);
                         if ($old_distinct_key_id != "") {
                             // merge in any postings found under the
                             // legacy hash as well
                             $old_info = IndexManager::getWordInfo($index_name, $old_distinct_key_id, $shift, $mask);
                             if ($info !== false && $old_info !== false) {
                                 $info = array_merge($info, $old_info);
                             } else {
                                 if ($old_info !== false) {
                                     $info = $old_info;
                                 }
                             }
                         }
                         if ($info != array()) {
                             $tmp_keys = arrayColumnCount($info, 4, 3);
                             $sum += array_sum($tmp_keys);
                             $out_keys = array_merge($out_keys, $tmp_keys);
                         }
                         if ($sum > $lookup_cutoff) {
                             break;
                         }
                     }
                     // keep only the 50 keys with the most postings
                     arsort($out_keys);
                     $out_keys = array_keys(array_slice($out_keys, 0, 50));
                     $tmp_word_iterators = array();
                     $m = 0;
                     foreach ($out_keys as $distinct_key) {
                         $tmp_word_iterators[$m] = new WordIterator($distinct_key, $index_name, true, $filter, $to_retrieve, $limit_news);
                         if ($tmp_word_iterators[$m]->dictionary_info != array() || $tmp_word_iterators[$m]->feed_count > 0) {
                             $min_group_override = true;
                             $m++;
                         } else {
                             // discard iterators with no postings and no feed items
                             unset($tmp_word_iterators[$m]);
                         }
                     }
                     if ($m == 1) {
                         $word_iterators[$i] = $tmp_word_iterators[0];
                     } else {
                         $word_iterators[$i] = new DisjointIterator($tmp_word_iterators);
                     }
                 }
                 // map every original key position to the index of the
                 // iterator that handles its distinct key
                 foreach ($word_keys as $index => $key) {
                     if (isset($distinct_word_keys[$i]) && $key == $distinct_word_keys[$i]) {
                         $word_iterator_map[$index] = $i;
                     }
                 }
             }
             // disallowed words are expressed as negated word iterators
             $num_disallow_keys = count($disallow_keys);
             if ($num_disallow_keys > 0) {
                 for ($i = 0; $i < $num_disallow_keys; $i++) {
                     $disallow_iterator = new WordIterator($disallow_keys[$i], $index_name, false, $filter);
                     $word_iterators[$num_word_keys + $i] = new NegationIterator($disallow_iterator);
                 }
             }
             $num_word_keys += $num_disallow_keys;
             // a single unit-weight word needs no intersection wrapper
             if ($num_word_keys == 1 && $weight == 1) {
                 $base_iterator = $word_iterators[0];
             } else {
                 $base_iterator = new IntersectIterator($word_iterators, $word_iterator_map, $quote_positions, $weight);
                 $min_group_flag = true;
                 if ($save_timestamp_name == "") {
                     $base_iterator->sync_timer_on = true;
                 } else {
                     $base_iterator->sync_timer_on = false;
                 }
             }
             if ($save_timestamp_name != "") {
                 // resume this sub-iterator from its saved position
                 if (isset($save_point[$save_count]) && $save_point[$save_count] != -1) {
                     $base_iterator->advance($save_point[$save_count]);
                 }
                 $save_count++;
             }
             $iterators[] = $base_iterator;
         }
     }
     $num_iterators = count($iterators);
     //in the network case $num_iterators should be 1 (the NetworkIterator)
     if ($num_iterators < 1) {
         return NULL;
     } else {
         if ($num_iterators == 1) {
             $union_iterator = $iterators[0];
         } else {
             $union_iterator = new UnionIterator($iterators);
         }
     }
     $raw = intval($raw);
     // $raw > 0 means no grouping of results
     if ($raw > 0) {
         $group_iterator = $union_iterator;
     } else {
         $group_iterator = new GroupIterator($union_iterator, $total_iterators, $this->current_machine, $network_flag);
     }
     if ($network_flag) {
         // spread the per-block work across the queue servers
         $union_iterator->results_per_block = ceil(SERVER_ALPHA * $group_iterator->results_per_block / $num_servers);
     } else {
         if ($save_timestamp_name != "") {
             // remember sub-iterators so their positions can be saved later
             $group_iterator->save_iterators = $iterators;
         } else {
             if ($min_group_flag && !$min_group_override) {
                 // intersection queries with sparse postings: group in
                 // small blocks and let iterators run unbounded
                 $group_iterator->results_per_block = max(MIN_RESULTS_TO_GROUP / 20, 1);
                 $to_retrieve = -1;
             }
         }
     }
     return $group_iterator;
 }
Example #2
0
 /**
  * Determines the offset into the summaries WebArchiveBundle and generation
  * of the provided url (or hash_url) so that the info:url
  * (info:base64_hash_url) summary can be retrieved. This assumes of course
  * that the info:url meta word has been stored.
  *
  * @param string $url_or_key either info:base64_hash_url or just a url to
  *     lookup
  * @param string $index_name index into which to do the lookup; when empty,
  *     $this->index_name is used
  * @param bool $is_key whether the string is info:base64_hash_url or just a
  *     url
  * @return array|bool (offset, generation) into the web archive bundle, or
  *     false if the index, dictionary entry, or posting cannot be found
  */
 function lookupSummaryOffsetGeneration($url_or_key, $index_name = "", $is_key = false)
 {
     if ($index_name == "") {
         $index_name = $this->index_name;
     }
     $index_archive = IndexManager::getIndex($index_name);
     if (!$index_archive) {
         return false;
     }
     if (!isset($index_archive->generation_info['ACTIVE'])) {
         return false;
     }
     // $mask is presumably filled by crawlHashWord as an out-parameter and
     // reused in the dictionary lookup -- same pattern as outputDictInfo
     $mask = "";
     $hash_key = $is_key ? crawlHashWord($url_or_key, true, $mask) : crawlHashWord("info:{$url_or_key}", true, $mask);
     $info = IndexManager::getWordInfo($index_name, $hash_key, 0, $mask, 1);
     if (!isset($info[0][4])) {
         return false;
     }
     $word_iterator = new WordIterator($info[0][4], $index_name, true);
     $next_docs = $word_iterator->nextDocsWithWord();
     if (!is_array($next_docs)) {
         return false;
     }
     // only the first posting is needed; an empty array falls through to
     // the failure return below
     foreach ($next_docs as $doc_info) {
         $summary_offset = $doc_info[CrawlConstants::SUMMARY_OFFSET];
         $generation = $doc_info[CrawlConstants::GENERATION];
         $index_archive->setCurrentShard($generation, true);
         // NOTE(review): getPage's return value was unused in the original
         // code; the call is kept in case it primes shard/page state --
         // confirm before removing
         @$index_archive->getPage($summary_offset);
         return array($summary_offset, $generation);
     }
     return false;
 }
Example #3
0
 /**
  * Check that an IndexShard can be saved and re-loaded, both via a file
  * on disk and via an in-memory string, and that every word and meta id
  * added to it is still findable afterwards.
  */
 function saveLoadTestCase()
 {
     $docid = "AAAAAAAABBBBBBBBCCCCCCCC";
     $offset = 5;
     $word_counts = array('BBBBBBBB' => array(1), 'CCCCCCCC' => array(2), 'DDDDDDDD' => array(6));
     $meta_ids = array("EEEEEEEE", "FFFFFFFF");
     // every word and meta id that should be retrievable from the shard
     $all_terms = array_merge(array_keys($word_counts), $meta_ids);
     //test saving and loading to a file
     $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids, array(), true);
     $this->test_objects['shard']->save();
     $this->test_objects['shard2'] = IndexShard::load(WORK_DIRECTORY . "/shard.txt");
     $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3, "Len All Docs Correctly Counts Length of First Doc");
     // distinct message per term so a failing lookup is identifiable
     foreach ($all_terms as $term) {
         $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord($term, true), 5);
         $this->assertTrue(isset($c_data[$docid]), "File Load Doc lookup by word {$term} works");
     }
     // test saving and loading from a string
     $out_string = $this->test_objects['shard']->save(true);
     $this->test_objects['shard2'] = IndexShard::load("shard.txt", $out_string);
     foreach ($all_terms as $term) {
         $c_data = $this->test_objects['shard2']->getPostingsSliceById(crawlHashWord($term, true), 5);
         $this->assertTrue(isset($c_data[$docid]), "String Load Doc lookup by word {$term} works");
     }
 }
Example #4
0
File: arc_tool.php  Project: yakar/yioop
 /**
  * Prints the IndexDictionary records for a word in an IndexArchiveBundle
  *
  * Output includes the bundle's name and type, the active dictionary
  * tiers, and for each dictionary record its generation, first/last word
  * offsets, and number of postings. Exits if the archive is not an
  * IndexArchiveBundle or the word is absent from the bundle.
  *
  * @param string $archive_path the path of a directory that holds
  *     an IndexArchiveBundle
  * @param string $word to look up dictionary record for
  */
 function outputDictInfo($archive_path, $word)
 {
     $bundle_name = $this->getArchiveName($archive_path);
     echo "\nBundle Name: {$bundle_name}\n";
     $archive_type = $this->getArchiveKind($archive_path);
     echo "Bundle Type: {$archive_type}\n";
     if (strcmp($archive_type, "IndexArchiveBundle") != 0) {
         $this->badFormatMessageAndExit($archive_path, "index");
     }
     // the index timestamp is whatever follows the index-data prefix in
     // the supplied path
     $timestamp_start = strpos($archive_path, self::index_data_base_name) + strlen(self::index_data_base_name);
     $index_timestamp = substr($archive_path, $timestamp_start);
     $mask = "";
     $hash_key = crawlHashWord($word, true, $mask);
     $info = IndexManager::getWordInfo($index_timestamp, $hash_key, 0, $mask, 1);
     if (!$info) {
         //fallback to old word hashes
         $info = IndexManager::getWordInfo($index_timestamp, crawlHash($word, true), 0, "", 1);
         if (!$info) {
             echo "\n{$word} does not appear in bundle!\n\n";
             exit;
         }
     }
     echo "Dictionary Tiers: ";
     $index = IndexManager::getIndex($index_timestamp);
     foreach ($index->dictionary->active_tiers as $tier) {
         echo " {$tier}";
     }
     echo "\nBundle Dictionary Entries for '{$word}':\n";
     echo "====================================\n";
     // records are numbered starting from 1 in the printout
     $record_num = 0;
     foreach ($info as $record) {
         $record_num++;
         echo "RECORD: {$record_num}\n";
         echo "GENERATION: {$record[0]}\n";
         echo "FIRST WORD OFFSET: {$record[1]}\n";
         echo "LAST WORD OFFSET: {$record[2]}\n";
         echo "NUMBER OF POSTINGS: {$record[3]}\n\n";
     }
 }