/**
 * Using the supplied $word_structs, constructs an iterator for getting
 * results to a query
 *
 * @param array $word_structs an array of word_structs. Here a word_struct
 *     is an associative array with at least the following fields
 *     KEYS -- an array of word keys
 *     QUOTE_POSITIONS -- an array of positions of words that appeared in
 *         quotes (so need to be matched exactly)
 *     DISALLOW_KEYS -- an array of word keys of words the document must
 *         not contain
 *     WEIGHT -- a weight to multiply scores returned from this iterator by
 *     INDEX_NAME -- an index timestamp to get results from
 * @param array& $filter an array of hashes of domains to filter from
 *     results and then potentially restore in cache
 * @param int $raw ($raw == 0) normal grouping; ($raw == 1) no grouping
 *     done on data and no summaries returned (only lookup info);
 *     ($raw > 1) summaries returned but no grouping
 * @param int& $to_retrieve number of items to retrieve from the current
 *     location in the iterator
 * @param array $queue_servers a list of urls of yioop machines which might
 *     be used during lookup
 * @param string $original_query if set, the original query that
 *     corresponds to $word_structs
 * @param string $save_timestamp_name if this timestamp is nonempty, then
 *     when making the iterator, have sub-iterators advance to the
 *     generation and doc_offset stored with respect to this timestamp,
 *     if such a save point exists.
 * @param bool $limit_news if true the number of media:news items to
 *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
 * @return object an iterator for iterating through results to the query,
 *     or NULL if no iterators could be built
 */
function getQueryIterator($word_structs, &$filter, $raw, &$to_retrieve,
    $queue_servers = array(), $original_query = "",
    $save_timestamp_name = "", $limit_news = true)
{
    $iterators = array();
    $total_iterators = 0;
    $network_flag = false;
    $min_group_flag = false;
    $min_group_override = false;
    if ($queue_servers != array() &&
        !$this->isSingleLocalhost($queue_servers)) {
        $network_flag = true;
        $total_iterators = 1;
        if (!in_array(NAME_SERVER, $queue_servers)) {
            $queue_servers[] = NAME_SERVER; //name server might still have news
        }
        $num_servers = count($queue_servers);
        if ((!isset($this->index_name) || !$this->index_name) &&
            isset($word_structs[0]["INDEX_NAME"])) {
            $index_name = $word_structs[0]["INDEX_NAME"];
        } else {
            $index_name = $this->index_name;
        }
        $iterators[0] = new NetworkIterator($original_query, $queue_servers,
            $index_name, $filter, $save_timestamp_name, $limit_news);
    }
    if (!$network_flag) {
        $doc_iterate_hashes = array(
            substr(crawlHashWord("site:any"), 0, 9),
            substr(crawlHash("site:any"), 0, 9),
            substr(crawlHashWord("site:doc"), 0, 9),
            substr(crawlHash("site:doc"), 0, 9));
        if ($save_timestamp_name != "") {
            // used for archive crawls of crawl mixes
            $save_file = CRAWL_DIR . '/schedules/' . self::save_point .
                $save_timestamp_name . ".txt";
            if (file_exists($save_file)) {
                $save_point = unserialize(file_get_contents($save_file));
            }
            $save_count = 0;
        }
        foreach ($word_structs as $word_struct) {
            if (!is_array($word_struct)) {
                continue;
            }
            $word_keys = $word_struct["KEYS"];
            $distinct_word_keys = array();
            $seen_keys = array();
            foreach ($word_keys as $wkey) {
                if (is_string($wkey) || is_string($wkey[0])) {
                    $tmp_key = is_string($wkey) ? $wkey : $wkey[0];
                    if (!isset($seen_keys[$tmp_key])) {
                        $seen_keys[$tmp_key] = true;
                        $distinct_word_keys[] = $wkey;
                    }
                } else {
                    $distinct_word_keys[] = $wkey;
                }
            }
            $quote_positions = $word_struct["QUOTE_POSITIONS"];
            $disallow_keys = $word_struct["DISALLOW_KEYS"];
            $index_name = $word_struct["INDEX_NAME"];
            $weight = $word_struct["WEIGHT"];
            $num_word_keys = count($word_keys);
            $total_iterators = count($distinct_word_keys);
            $word_iterators = array();
            $word_iterator_map = array();
            if ($num_word_keys < 1) {
                continue;
            }
            $sum = 0;
            for ($i = 0; $i < $total_iterators; $i++) {
                $current_key = is_string($distinct_word_keys[$i]) ?
                    $distinct_word_keys[$i] :
                    (is_string($distinct_word_keys[$i][0]) ?
                    $distinct_word_keys[$i][0] :
                    $distinct_word_keys[$i][0][0]);
                if (!is_string($current_key)) {
                    $current_key = $current_key[0];
                }
                if (in_array(substr($current_key, 0, 9),
                    $doc_iterate_hashes)) {
                    $word_iterators[$i] = new DocIterator($index_name,
                        $filter, $to_retrieve);
                    $min_group_override = true;
                } else {
                    //can happen if exact phrase search suffix approach used
                    if (isset($distinct_word_keys[$i][0][0]) &&
                        is_array($distinct_word_keys[$i][0][0])) {
                        $distinct_keys = array($distinct_word_keys[$i][0][1]);
                    } else if (isset($distinct_word_keys[$i][0]) &&
                        is_array($distinct_word_keys[$i][0])) {
                        $distinct_keys = $distinct_word_keys[$i];
                    } else {
                        $distinct_keys = array($distinct_word_keys[$i]);
                    }
                    $out_keys = array();
                    $old_distinct_key_id = "";
                    foreach ($distinct_keys as $distinct_key) {
                        if (is_array($distinct_key)) {
                            if (!isset($distinct_key[2]) &&
                                isset($distinct_key[1])) {
                                $distinct_keys[] = $distinct_key[1];
                            }
                            $shift = isset($distinct_key[1]) ?
                                $distinct_key[1] : 0;
                            $mask = isset($distinct_key[2]) ?
                                $distinct_key[2] : "";
                            if (isset($distinct_key[3])) {
                                $old_distinct_key_id =
                                    unbase64Hash($distinct_key[3]);
                            }
                            $distinct_key_id =
                                unbase64Hash($distinct_key[0]);
                        } else {
                            $shift = 0;
                            $mask = "";
                            $distinct_key_id = unbase64Hash($distinct_key);
                        }
                        $lookup_cutoff = max(MIN_RESULTS_TO_GROUP,
                            $to_retrieve);
                        $info = IndexManager::getWordInfo($index_name,
                            $distinct_key_id, $shift, $mask);
                        if ($old_distinct_key_id != "") {
                            $old_info = IndexManager::getWordInfo(
                                $index_name, $old_distinct_key_id, $shift,
                                $mask);
                            if ($info !== false && $old_info !== false) {
                                $info = array_merge($info, $old_info);
                            } else if ($old_info !== false) {
                                $info = $old_info;
                            }
                        }
                        if ($info != array()) {
                            $tmp_keys = arrayColumnCount($info, 4, 3);
                            $sum += array_sum($tmp_keys);
                            $out_keys = array_merge($out_keys, $tmp_keys);
                        }
                        if ($sum > $lookup_cutoff) {
                            break;
                        }
                    }
                    arsort($out_keys);
                    $out_keys = array_keys(array_slice($out_keys, 0, 50));
                    $tmp_word_iterators = array();
                    $m = 0;
                    foreach ($out_keys as $distinct_key) {
                        $tmp_word_iterators[$m] = new WordIterator(
                            $distinct_key, $index_name, true, $filter,
                            $to_retrieve, $limit_news);
                        if ($tmp_word_iterators[$m]->dictionary_info !=
                            array() ||
                            $tmp_word_iterators[$m]->feed_count > 0) {
                            $min_group_override = true;
                            $m++;
                        } else {
                            unset($tmp_word_iterators[$m]);
                        }
                    }
                    if ($m == 1) {
                        $word_iterators[$i] = $tmp_word_iterators[0];
                    } else {
                        $word_iterators[$i] =
                            new DisjointIterator($tmp_word_iterators);
                    }
                }
                foreach ($word_keys as $index => $key) {
                    if (isset($distinct_word_keys[$i]) &&
                        $key == $distinct_word_keys[$i]) {
                        $word_iterator_map[$index] = $i;
                    }
                }
            }
            $num_disallow_keys = count($disallow_keys);
            if ($num_disallow_keys > 0) {
                for ($i = 0; $i < $num_disallow_keys; $i++) {
                    $disallow_iterator = new WordIterator(
                        $disallow_keys[$i], $index_name, false, $filter);
                    $word_iterators[$num_word_keys + $i] =
                        new NegationIterator($disallow_iterator);
                }
            }
            $num_word_keys += $num_disallow_keys;
            if ($num_word_keys == 1 && $weight == 1) {
                $base_iterator = $word_iterators[0];
            } else {
                $base_iterator = new IntersectIterator($word_iterators,
                    $word_iterator_map, $quote_positions, $weight);
                $min_group_flag = true;
                if ($save_timestamp_name == "") {
                    $base_iterator->sync_timer_on = true;
                } else {
                    $base_iterator->sync_timer_on = false;
                }
            }
            if ($save_timestamp_name != "") {
                if (isset($save_point[$save_count]) &&
                    $save_point[$save_count] != -1) {
                    $base_iterator->advance($save_point[$save_count]);
                }
                $save_count++;
            }
            $iterators[] = $base_iterator;
        }
    }
    $num_iterators = count($iterators); //if $network_flag set, should be 1
    if ($num_iterators < 1) {
        return NULL;
    } else if ($num_iterators == 1) {
        $union_iterator = $iterators[0];
    } else {
        $union_iterator = new UnionIterator($iterators);
    }
    $raw = intval($raw);
    if ($raw > 0) {
        $group_iterator = $union_iterator;
    } else {
        $group_iterator = new GroupIterator($union_iterator,
            $total_iterators, $this->current_machine, $network_flag);
    }
    if ($network_flag) {
        $union_iterator->results_per_block = ceil(SERVER_ALPHA *
            $group_iterator->results_per_block / $num_servers);
    } else if ($save_timestamp_name != "") {
        $group_iterator->save_iterators = $iterators;
    } else if ($min_group_flag && !$min_group_override) {
        $group_iterator->results_per_block =
            max(MIN_RESULTS_TO_GROUP / 20, 1);
        $to_retrieve = -1;
    }
    return $group_iterator;
}
/**
 * Determines the offset into the summaries WebArchiveBundle and generation
 * of the provided url (or hash_url) so that the info:url
 * (info:base64_hash_url) summary can be retrieved. This assumes of course
 * that the info:url meta word has been stored.
 *
 * @param string $url_or_key either info:base64_hash_url or just a url to
 *     look up
 * @param string $index_name index into which to do the lookup
 * @param bool $is_key whether the string is info:base64_hash_url or just a
 *     url
 * @return array (offset, generation) into the web archive bundle, or
 *     false if the lookup fails
 */
function lookupSummaryOffsetGeneration($url_or_key, $index_name = "",
    $is_key = false)
{
    if ($index_name == "") {
        $index_name = $this->index_name;
    }
    $index_archive = IndexManager::getIndex($index_name);
    if (!$index_archive) {
        return false;
    }
    $num_retrieved = 0;
    $pages = array();
    $summary_offset = NULL;
    if (!isset($index_archive->generation_info['ACTIVE'])) {
        return false;
    }
    $mask = "";
    $num_generations = $index_archive->generation_info['ACTIVE'];
    $hash_key = $is_key ? crawlHashWord($url_or_key, true, $mask) :
        crawlHashWord("info:{$url_or_key}", true, $mask);
    $info = IndexManager::getWordInfo($index_name, $hash_key, 0, $mask, 1);
    if (!isset($info[0][4])) {
        return false;
    }
    $word_iterator = new WordIterator($info[0][4], $index_name, true);
    if (is_array($next_docs = $word_iterator->nextDocsWithWord())) {
        foreach ($next_docs as $doc_key => $doc_info) {
            $summary_offset = $doc_info[CrawlConstants::SUMMARY_OFFSET];
            $generation = $doc_info[CrawlConstants::GENERATION];
            $index_archive->setCurrentShard($generation, true);
            $page = @$index_archive->getPage($summary_offset);
            $num_retrieved++;
            if ($num_retrieved >= 1) {
                break;
            }
        }
        if ($num_retrieved == 0) {
            return false;
        }
    } else {
        return false;
    }
    return array($summary_offset, $generation);
}
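/*
 * Usage sketch (the url is hypothetical and assumes the crawl stored an
 * info: meta word for it):
 *
 *     $result = $model->lookupSummaryOffsetGeneration(
 *         "http://www.example.com/", $index_name);
 *     if ($result !== false) {
 *         list($summary_offset, $generation) = $result;
 *         $index_archive->setCurrentShard($generation, true);
 *         $summary = $index_archive->getPage($summary_offset);
 *     }
 */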
/**
 * Check that save and load work
 */
function saveLoadTestCase()
{
    $docid = "AAAAAAAABBBBBBBBCCCCCCCC";
    $offset = 5;
    $word_counts = array(
        'BBBBBBBB' => array(1),
        'CCCCCCCC' => array(2),
        'DDDDDDDD' => array(6)
    );
    $meta_ids = array("EEEEEEEE", "FFFFFFFF");
    //test saving and loading to a file
    $this->test_objects['shard']->addDocumentWords($docid, $offset,
        $word_counts, $meta_ids, array(), true);
    $this->test_objects['shard']->save();
    $this->test_objects['shard2'] = IndexShard::load(WORK_DIRECTORY .
        "/shard.txt");
    $this->assertEqual($this->test_objects['shard2']->len_all_docs, 3,
        "Len All Docs Correctly Counts Length of First Doc");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('BBBBBBBB', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "Doc lookup by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('CCCCCCCC', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "Doc lookup 2 by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('DDDDDDDD', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "Doc lookup 3 by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('EEEEEEEE', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "Doc lookup 4 by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('FFFFFFFF', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "Doc lookup 5 by word works");
    // test saving and loading from a string
    $out_string = $this->test_objects['shard']->save(true);
    $this->test_objects['shard2'] = IndexShard::load("shard.txt",
        $out_string);
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('BBBBBBBB', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "String Load Doc lookup by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('CCCCCCCC', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "String Load Doc lookup 2 by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('DDDDDDDD', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "String Load Doc lookup 3 by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('EEEEEEEE', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "String Load Doc lookup 4 by word works");
    $c_data = $this->test_objects['shard2']->getPostingsSliceById(
        crawlHashWord('FFFFFFFF', true), 5);
    $this->assertTrue(isset($c_data["AAAAAAAABBBBBBBBCCCCCCCC"]),
        "String Load Doc lookup 5 by word works");
}
/**
 * Prints the IndexDictionary records for a word in an IndexArchiveBundle
 *
 * @param string $archive_path the path of a directory that holds
 *     an IndexArchiveBundle
 * @param string $word word to look up the dictionary record for
 */
function outputDictInfo($archive_path, $word)
{
    $bundle_name = $this->getArchiveName($archive_path);
    echo "\nBundle Name: {$bundle_name}\n";
    $archive_type = $this->getArchiveKind($archive_path);
    echo "Bundle Type: {$archive_type}\n";
    if (strcmp($archive_type, "IndexArchiveBundle") != 0) {
        $this->badFormatMessageAndExit($archive_path, "index");
    }
    $index_timestamp = substr($archive_path,
        strpos($archive_path, self::index_data_base_name) +
        strlen(self::index_data_base_name));
    $mask = "";
    $hash_key = crawlHashWord($word, true, $mask);
    $info = IndexManager::getWordInfo($index_timestamp, $hash_key, 0,
        $mask, 1);
    if (!$info) {
        //fall back to old word hashes
        $info = IndexManager::getWordInfo($index_timestamp,
            crawlHash($word, true), 0, "", 1);
        if (!$info) {
            echo "\n{$word} does not appear in bundle!\n\n";
            exit;
        }
    }
    echo "Dictionary Tiers: ";
    $index = IndexManager::getIndex($index_timestamp);
    $tiers = $index->dictionary->active_tiers;
    foreach ($tiers as $tier) {
        echo " {$tier}";
    }
    echo "\nBundle Dictionary Entries for '{$word}':\n";
    echo "====================================\n";
    $i = 1;
    foreach ($info as $record) {
        echo "RECORD: {$i}\n";
        echo "GENERATION: {$record[0]}\n";
        echo "FIRST WORD OFFSET: {$record[1]}\n";
        echo "LAST WORD OFFSET: {$record[2]}\n";
        echo "NUMBER OF POSTINGS: {$record[3]}\n\n";
        $i++;
    }
}
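/*
 * Example invocation (path and timestamp are hypothetical):
 *
 *     $tool->outputDictInfo("/some/path/cache/IndexData1334468745",
 *         "hello");
 *
 * After listing the bundle's name, type, and active dictionary tiers,
 * this prints, for each dictionary record of "hello", its generation,
 * its first and last word offsets, and its number of postings.
 */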