/**
 * Hook function used by currentDocsWithWord to return the current block
 * of docs if it is not cached.
 *
 * Walks the current shard starting at $this->current_offset, collecting at
 * most $this->results_per_block items, then drops every item whose host
 * key appears in $this->filter. As side effects it updates
 * $this->next_offset, $this->count_block and $this->pages.
 *
 * @return mixed -1 if the iterator is already past the last generation
 *     (or past the last offset of the final generation); NULL if this is
 *     the final generation and no docs survived filtering; otherwise an
 *     array of doc key => doc info
 */
function findDocsWithWord()
{
    // && binds tighter than ||; the parentheses only make that explicit
    if ($this->current_generation >= $this->num_generations
        || ($this->current_generation == $this->num_generations - 1
            && $this->current_offset > $this->last_offset)) {
        return -1;
    }
    $this->next_offset = $this->current_offset;
    $index = IndexManager::getIndex($this->index_name);
    $index->setCurrentShard($this->current_generation, true);
    //the next call also updates next offset
    $shard = $index->getCurrentShard();
    $this->getShardInfo($this->current_generation);
    $doc_key_len = IndexShard::DOC_KEY_LEN;
    $num_docs_or_links = $shard->num_docs + $shard->num_link_docs;

    $pre_results = array();
    $num_docs_so_far = 0;
    do {
        if ($this->next_offset >= $this->last_offset) {
            break;
        }
        $posting = packPosting($this->next_offset >> 4, array(1));
        list($doc_id, $num_keys, $item) =
            $shard->makeItem($posting, $num_docs_or_links);
        // an even key count is bumped to the next odd value before the
        // offset is advanced by ($num_keys + 1) keys
        if ($num_keys % 2 == 0) {
            $num_keys++;
        }
        $this->next_offset += ($num_keys + 1) * $doc_key_len;
        $pre_results[$doc_id] = $item;
        $num_docs_so_far++;
    } while ($num_docs_so_far < $this->results_per_block);

    $results = array();
    $filter = $this->filter == NULL ? array() : $this->filter;
    foreach ($pre_results as $keys => $data) {
        $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
        // strict comparison: host keys are raw byte strings, and loose
        // comparison would treat numeric-looking keys such as "1e3" and
        // "1000" as equal, wrongly filtering unrelated hosts
        if (in_array($host_key, $filter, true)) {
            continue;
        }
        $data[self::KEY] = $keys;
        // the doc key splits into three parts; the first is not needed
        // here, the third (inlinks) is the domain of the inlink
        list(, $data[self::HASH], $data[self::INLINKS]) =
            str_split($keys, $doc_key_len);
        $data[self::CRAWL_TIME] = $this->index_name;
        $results[$keys] = $data;
    }
    $this->count_block = count($results);
    if ($this->current_generation == $this->num_generations - 1
        && $results == array()) {
        $results = NULL;
    }
    $this->pages = $results;
    return $results;
}
/**
 * Determines the offset into the summaries WebArchiveBundle and generation
 * of the provided url (or hash_url) so that the info:url
 * (info:base64_hash_url) summary can be retrieved. This assumes of course
 * that the info:url meta word has been stored.
 *
 * Only the first document returned by the word iterator is consulted.
 *
 * @param string $url_or_key either info:base64_hash_url or just a url to
 *     lookup
 * @param string $index_name index into which to do the lookup; when empty
 *     $this->index_name is used
 * @param bool $is_key whether the string is info:base64_hash_url or just a
 *     url
 * @return mixed array (offset, generation) into the web archive bundle, or
 *     false if the index, its active generation info, the dictionary entry
 *     or a matching document could not be found
 */
function lookupSummaryOffsetGeneration($url_or_key, $index_name = "",
    $is_key = false)
{
    if ($index_name == "") {
        $index_name = $this->index_name;
    }
    $index_archive = IndexManager::getIndex($index_name);
    if (!$index_archive
        || !isset($index_archive->generation_info['ACTIVE'])) {
        return false;
    }

    $mask = "";
    $lookup_word = $is_key ? $url_or_key : "info:{$url_or_key}";
    $hash_key = crawlHashWord($lookup_word, true, $mask);
    $info = IndexManager::getWordInfo($index_name, $hash_key, 0, $mask, 1);
    if (!isset($info[0][4])) {
        return false;
    }

    $word_iterator = new WordIterator($info[0][4], $index_name, true);
    $next_docs = $word_iterator->nextDocsWithWord();
    if (!is_array($next_docs)) {
        return false;
    }
    foreach ($next_docs as $doc_info) {
        $summary_offset = $doc_info[CrawlConstants::SUMMARY_OFFSET];
        $generation = $doc_info[CrawlConstants::GENERATION];
        $index_archive->setCurrentShard($generation, true);
        // the page itself is not used; the call is kept in case callers
        // depend on any side effect it has
        @$index_archive->getPage($summary_offset);
        // the first document is all that is needed
        return array($summary_offset, $generation);
    }
    // the iterator returned an empty array
    return false;
}
/**
 * Prints information about $num many postings beginning at the
 * provided $generation and $offset
 *
 * Output goes to standard output via echo/printf. The script exits if no
 * documents are found for the requested postings.
 *
 * @param string $archive_path the path of a directory that holds
 *     an IndexArchiveBundle
 * @param int $generation which index shard to use
 * @param int $offset offset into posting lists for that shard
 * @param int $num how many postings to print info for
 */
function outputPostingInfo($archive_path, $generation, $offset, $num = 1)
{
    $bundle_name = $this->getArchiveName($archive_path);
    echo "\nBundle Name: {$bundle_name}\n";
    $archive_type = $this->getArchiveKind($archive_path);
    echo "Bundle Type: {$archive_type}\n",
        "Generation: {$generation}\n",
        "Offset: {$offset}\n";
    if (strcmp($archive_type, "IndexArchiveBundle") != 0) {
        $this->badFormatMessageAndExit($archive_path, "index");
    }

    // the index timestamp is whatever follows the base name in the path
    $base_name = self::index_data_base_name;
    $timestamp_start = strpos($archive_path, $base_name) +
        strlen($base_name);
    $index_timestamp = substr($archive_path, $timestamp_start);
    $index = IndexManager::getIndex($index_timestamp);
    $index->setCurrentShard($generation, true);
    $shard = $index->getCurrentShard();

    $documents = array();
    $raw_postings = array();
    $doc_indexes = array();
    // NOTE(review): the >> 2 / << 2 shifts presumably convert between byte
    // offsets and 4-byte word indexes -- confirm against IndexShard
    $next = $offset >> 2;
    for ($fetched = 0; $fetched < $num; $fetched++) {
        // separate variables are kept for every argument because the shard
        // methods may modify the variables they are handed
        $slice_offset = $next << 2;
        $slice_start = $next << 2;
        $slice_end = $next << 2;
        $posting_start = $next;
        $posting_end = $next;
        $posting = $shard->getPostingAtOffset($next, $posting_start,
            $posting_end);
        // advanced even when no posting was found so that the end offset
        // reported below matches
        $next = $posting_end + 1;
        if (!$posting) {
            break;
        }
        $slice = $shard->getPostingsSlice($slice_offset, $slice_start,
            $slice_end, 1);
        $documents = array_merge($documents, $slice);
        $raw_postings[] = $posting;
        $unpack_offset = 0;
        $unpacked = unpackPosting($posting, $unpack_offset);
        $doc_indexes[] = $unpacked[0];
    }
    $end_offset = $next << 2;
    echo "Offset After Returned Results: {$end_offset}\n\n";

    $count = $documents ? count($documents) : 0;
    if ($count < 1) {
        echo "No documents correspond to generation and offset given\n\n";
        exit;
    }
    if ($count == 1) {
        $document_word = "Document";
    } else {
        $document_word = "Documents";
    }
    echo "{$count} {$document_word} Found:\n";
    echo str_pad("", $count + 1, "=") . "================\n";

    // $posting_index walks $doc_indexes / $raw_postings in step with
    // $documents
    $posting_index = 0;
    foreach ($documents as $key => $document) {
        $summary_offset = $document[self::SUMMARY_OFFSET];
        echo "\nDOC ID: " . toHexString($key),
            "\nTYPE: " . ($document[self::IS_DOC] ? "Document" : "Link"),
            "\nDOC INDEX: " . $doc_indexes[$posting_index],
            "\nSUMMARY OFFSET: " . $summary_offset,
            "\nSCORE: " . $document[self::SCORE],
            "\nDOC RANK: " . $document[self::DOC_RANK],
            "\nRELEVANCE: " . $document[self::RELEVANCE],
            "\nPROXIMITY: " . $document[self::PROXIMITY],
            "\nHEX POSTING:\n",
            "------------\n",
            wordwrap(toHexString($raw_postings[$posting_index]), 80);
        if (isset($document[self::POSITION_LIST])) {
            echo "\nTERM OCCURRENCES IN DOCUMENT (Count starts at title):";
            echo "\n-------------------------" .
                "----------------------------\n";
            // positions are printed five to a line
            $printed = 0;
            foreach ($document[self::POSITION_LIST] as $position) {
                printf("%09d ", $position);
                $printed++;
                if ($printed % 5 == 0) {
                    echo "\n";
                }
            }
            if ($printed % 5 != 0) {
                echo "\n";
            }
        }
        $page = @$index->getPage($summary_offset);
        if (isset($page[self::TITLE])) {
            echo "SUMMARY TITLE:\n",
                "--------------\n",
                wordwrap($page[self::TITLE], 80) . "\n";
        }
        if (isset($page[self::DESCRIPTION])) {
            echo "SUMMARY DESCRIPTION:\n",
                "--------------\n",
                $page[self::DESCRIPTION] . "\n";
        }
        $posting_index++;
    }
}
/**
 * Returns the number of document that a given term or phrase appears in
 * in the given index
 *
 * @param string $term_or_phrase what to look up in the indexes dictionary
 *     no mask is used for this look up
 * @param string $index_name index to look up term or phrase in
 * @param int $threshold if set and positive then once the running total
 *     exceeds threshold many documents the search for more documents to
 *     add to the total is stopped
 * @return mixed number of documents, or false if the index could not be
 *     obtained or has no dictionary
 */
static function numDocsTerm($term_or_phrase, $index_name, $threshold = -1)
{
    $index = IndexManager::getIndex($index_name);
    if (!$index || !$index->dictionary) {
        return false;
    }
    $total_num_docs = 0;
    $hashes = allCrawlHashPaths($term_or_phrase, array(), array(), true);
    if (!is_array($hashes)) {
        $hashes = array($hashes);
    }
    foreach ($hashes as $hash) {
        if (is_array($hash)) {
            $dictionary_info = IndexManager::getWordInfo($index_name,
                $hash[0], $hash[1], $hash[2], $threshold);
        } else {
            $dictionary_info =
                IndexManager::getWordInfo($index_name, $hash);
        }
        // NOTE(review): getWordInfo is assumed to possibly return a
        // non-array on failure; count() on such a value throws a TypeError
        // on PHP 8, so such results contribute nothing to the total
        if (!is_array($dictionary_info)) {
            continue;
        }
        // rows are indexed contiguously starting at -1 when a row with key
        // -1 is present, and at 0 otherwise
        $start = isset($dictionary_info[-1]) ? -1 : 0;
        $end = $start + count($dictionary_info);
        for ($i = $start; $i < $end; $i++) {
            // the document count is the fourth entry of a row
            if (!isset($dictionary_info[$i][3])) {
                continue;
            }
            $total_num_docs += $dictionary_info[$i][3];
            if ($threshold > 0 && $total_num_docs > $threshold) {
                return $total_num_docs;
            }
        }
    }
    return $total_num_docs;
}
/**
 * Computes for each word in an array of words a count of the total number
 * of times it occurs in this crawl model's default index.
 *
 * When $machine_urls names machines other than a single localhost, the
 * request is forwarded to those machines and their per-word counts are
 * summed; otherwise the counts come from the local index.
 *
 * @param array $words words to find the counts for
 * @param array $machine_urls machines to invoke this command on
 * @return array associative array of word => counts
 */
function countWords($words, $machine_urls = NULL)
{
    if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
        $count_strings = $this->execMachines("countWords", $machine_urls,
            serialize(array($words, $this->index_name)));
        $word_counts = array();
        foreach ($count_strings as $count_string) {
            // a response without a page payload has nothing to add
            if (!isset($count_string[self::PAGE])) {
                continue;
            }
            // NOTE(review): unserialize() is applied to data received from
            // other machines; it is only safe if those machines are
            // trusted -- confirm, or move to a safer wire format
            $a_word_counts =
                unserialize(webdecode($count_string[self::PAGE]));
            if (!is_array($a_word_counts)) {
                continue;
            }
            foreach ($a_word_counts as $word => $count) {
                $word_counts[$word] = isset($word_counts[$word]) ?
                    $word_counts[$word] + $count : $count;
            }
        }
        return $word_counts;
    }

    $index_archive = IndexManager::getIndex($this->index_name);
    // $lookup maps each word's hash back to the word
    $hashes = array();
    $lookup = array();
    foreach ($words as $word) {
        $word_hash = crawlHash($word);
        $hashes[] = $word_hash;
        $lookup[$word_hash] = $word;
    }
    $word_key_counts = $index_archive->countWordKeys($hashes);
    $word_counts = array();
    if (is_array($word_key_counts)) {
        foreach ($word_key_counts as $word_key => $count) {
            // skip keys that were not asked for, rather than recording
            // their count under an empty word
            if (isset($lookup[$word_key])) {
                $word_counts[$lookup[$word_key]] = $count;
            }
        }
    }
    return $word_counts;
}