コード例 #1
0
ファイル: doc_iterator.php プロジェクト: yakar/yioop
 /**
  * Hook function used by currentDocsWithWord to compute the current block
  * of docs when it is not cached.
  *
  * Starting at the current offset of the current generation's shard, up to
  * results_per_block items are built, entries whose host key appears in
  * the filter are dropped, and the surviving entries are stored in
  * $this->pages (their number is stored in $this->count_block).
  *
  * @return mixed array of doc key => doc info for the block; -1 if the
  *     iterator is already past its last generation/offset; NULL if this
  *     is the last generation and the block came out empty
  */
 function findDocsWithWord()
 {
     // Nothing left to read: past the final generation...
     if ($this->current_generation >= $this->num_generations) {
         return -1;
     }
     // ...or on the final generation but beyond its last offset.
     if ($this->current_generation == $this->num_generations - 1
         && $this->current_offset > $this->last_offset) {
         return -1;
     }
     $this->next_offset = $this->current_offset;
     $index = IndexManager::getIndex($this->index_name);
     $index->setCurrentShard($this->current_generation, true);
     $shard = $index->getCurrentShard();
     // NOTE(review): an earlier comment here said a call in this sequence
     // "also updates next offset"; presumably getShardInfo -- confirm.
     $this->getShardInfo($this->current_generation);
     $doc_key_len = IndexShard::DOC_KEY_LEN;
     $num_docs_or_links = $shard->num_docs + $shard->num_link_docs;

     // Gather the raw items for this block, keyed by doc id.
     $raw_items = array();
     $items_read = 0;
     while ($this->next_offset < $this->last_offset) {
         $posting = packPosting($this->next_offset >> 4, array(1));
         list($doc_id, $num_keys, $item) =
             $shard->makeItem($posting, $num_docs_or_links);
         // an even key count is bumped to the next odd number before the
         // offset is advanced
         if ($num_keys % 2 == 0) {
             $num_keys++;
         }
         $this->next_offset += ($num_keys + 1) * $doc_key_len;
         $raw_items[$doc_id] = $item;
         $items_read++;
         if ($items_read >= $this->results_per_block) {
             break;
         }
     }

     // Drop filtered hosts and annotate what remains.
     $excluded_hosts = $this->filter == NULL ? array() : $this->filter;
     $block = array();
     foreach ($raw_items as $doc_key => $doc_info) {
         $host_key = substr($doc_key, self::HOST_KEY_POS, self::KEY_LEN);
         if (in_array($host_key, $excluded_hosts)) {
             continue;
         }
         $doc_info[self::KEY] = $doc_key;
         // inlinks is the domain of the inlink
         list($url_hash, $doc_info[self::HASH], $doc_info[self::INLINKS]) = str_split($doc_key, $doc_key_len);
         $doc_info[self::CRAWL_TIME] = $this->index_name;
         $block[$doc_key] = $doc_info;
     }
     $this->count_block = count($block);
     // An empty block on the final generation is reported as NULL.
     if ($this->current_generation == $this->num_generations - 1
         && empty($block)) {
         $block = NULL;
     }
     $this->pages = $block;
     return $block;
 }
コード例 #2
0
ファイル: parallel_model.php プロジェクト: yakar/yioop
 /**
  * Finds the offset into the summaries WebArchiveBundle, and the
  * generation, of the summary belonging to the provided url (or hash_url)
  * so that the info:url (info:base64_hash_url) summary can be retrieved.
  * This only works if the info:url meta word has been stored.
  *
  * @param string $url_or_key either info:base64_hash_url or just a url to
  *     lookup
  * @param string $index_name index into which to do the lookup; when empty
  *     this object's index_name is used
  * @param bool $is_key whether the string is info:base64_hash_url or just a
  *     url
  * @return mixed array (offset, generation) into the web archive bundle,
  *     or false if the index, its ACTIVE generation info, the word entry
  *     or a matching document cannot be found
  */
 function lookupSummaryOffsetGeneration($url_or_key, $index_name = "", $is_key = false)
 {
     if ($index_name == "") {
         $index_name = $this->index_name;
     }
     $index_archive = IndexManager::getIndex($index_name);
     if (!$index_archive
         || !isset($index_archive->generation_info['ACTIVE'])) {
         return false;
     }
     $mask = "";
     // A bare url is looked up through its info: meta word.
     $lookup_word = $is_key ? $url_or_key : "info:{$url_or_key}";
     $hash_key = crawlHashWord($lookup_word, true, $mask);
     $info = IndexManager::getWordInfo($index_name, $hash_key, 0, $mask, 1);
     if (!isset($info[0][4])) {
         return false;
     }
     $word_iterator = new WordIterator($info[0][4], $index_name, true);
     $next_docs = $word_iterator->nextDocsWithWord();
     if (!is_array($next_docs)) {
         return false;
     }
     // Only the first document of the block is used.
     foreach ($next_docs as $doc_info) {
         $summary_offset = $doc_info[CrawlConstants::SUMMARY_OFFSET];
         $generation = $doc_info[CrawlConstants::GENERATION];
         $index_archive->setCurrentShard($generation, true);
         // The fetched page itself is not used in the result.
         $page = @$index_archive->getPage($summary_offset);
         return array($summary_offset, $generation);
     }
     // The block was an empty array.
     return false;
 }
コード例 #3
0
ファイル: arc_tool.php プロジェクト: yakar/yioop
 /**
  * Prints information about up to $num postings beginning at the
  * provided $generation and $offset. For every posting found its doc id,
  * type, doc index, summary offset, scores, raw posting in hex, term
  * positions (when present) and summary title/description (when present)
  * are echoed. Exits if no documents are found.
  *
  * @param string $archive_path the path of a directory that holds
  *     an IndexArchiveBundle
  * @param int $generation which index shard to use
  * @param int $offset offset into posting lists for that shard
  * @param int $num how many postings to print info for
  */
 function outputPostingInfo($archive_path, $generation, $offset, $num = 1)
 {
     $bundle_name = $this->getArchiveName($archive_path);
     echo "\nBundle Name: " . $bundle_name . "\n";
     $archive_type = $this->getArchiveKind($archive_path);
     echo "Bundle Type: " . $archive_type . "\n";
     echo "Generation: " . $generation . "\n";
     echo "Offset: " . $offset . "\n";
     if (strcmp($archive_type, "IndexArchiveBundle") != 0) {
         $this->badFormatMessageAndExit($archive_path, "index");
     }
     // The index timestamp is whatever follows the index base name in
     // the archive path.
     $base_name = self::index_data_base_name;
     $timestamp_start = strpos($archive_path, $base_name) + strlen($base_name);
     $index_timestamp = substr($archive_path, $timestamp_start);
     $index = IndexManager::getIndex($index_timestamp);
     $index->setCurrentShard($generation, true);
     $shard = $index->getCurrentShard();

     // $next is tracked as $offset >> 2; shifting left by 2 again gives
     // the values handed to getPostingsSlice and printed as offsets.
     $next = $offset >> 2;
     $raw_postings = array();
     $doc_indexes = array();
     $documents = array();
     for ($posting_num = 0; $posting_num < $num; $posting_num++) {
         $dummy_offset = 0;
         $posting_start = $next;
         $posting_end = $next;
         $old_offset = $next << 2;
         $old_start = $next << 2;
         $old_end = $next << 2;
         $raw_posting = $shard->getPostingAtOffset($next, $posting_start, $posting_end);
         // $next advances even when no posting came back, so the end
         // offset reported below reflects this last attempt
         $next = $posting_end + 1;
         if (!$raw_posting) {
             break;
         }
         $slice = $shard->getPostingsSlice($old_offset, $old_start, $old_end, 1);
         $documents = array_merge($documents, $slice);
         $raw_postings[] = $raw_posting;
         $unpacked = unpackPosting($raw_posting, $dummy_offset);
         $doc_indexes[] = $unpacked[0];
     }
     $end_offset = $next << 2;
     echo "Offset After Returned Results: " . $end_offset . "\n\n";

     $count = $documents ? count($documents) : 0;
     if ($count < 1) {
         echo "No documents correspond to generation and offset given\n\n";
         exit;
     }
     if ($count == 1) {
         $document_word = "Document";
     } else {
         $document_word = "Documents";
     }
     echo $count . " " . $document_word . " Found:\n";
     echo str_repeat("=", $count + 1) . "================\n";

     // Score-like fields printed one per line, in this order.
     $score_fields = array(
         "SCORE" => self::SCORE,
         "DOC RANK" => self::DOC_RANK,
         "RELEVANCE" => self::RELEVANCE,
         "PROXIMITY" => self::PROXIMITY
     );
     $doc_num = 0;
     foreach ($documents as $doc_key => $document) {
         echo "\nDOC ID: " . toHexString($doc_key);
         if ($document[self::IS_DOC]) {
             echo "\nTYPE: " . "Document";
         } else {
             echo "\nTYPE: " . "Link";
         }
         echo "\nDOC INDEX: " . $doc_indexes[$doc_num];
         $summary_offset = $document[self::SUMMARY_OFFSET];
         echo "\nSUMMARY OFFSET: " . $summary_offset;
         foreach ($score_fields as $label => $field) {
             echo "\n" . $label . ": " . $document[$field];
         }
         echo "\nHEX POSTING:\n";
         echo "------------\n";
         echo wordwrap(toHexString($raw_postings[$doc_num]), 80);
         if (isset($document[self::POSITION_LIST])) {
             echo "\nTERM OCCURRENCES IN DOCUMENT (Count starts at title):";
             echo "\n-------------------------" . "----------------------------\n";
             // positions are printed five to a row
             $printed = 0;
             foreach ($document[self::POSITION_LIST] as $position) {
                 printf("%09d ", $position);
                 $printed++;
                 if ($printed % 5 == 0) {
                     echo "\n";
                 }
             }
             // terminate a partially filled final row
             if ($printed % 5 != 0) {
                 echo "\n";
             }
         }
         $page = @$index->getPage($summary_offset);
         if (isset($page[self::TITLE])) {
             echo "SUMMARY TITLE:\n";
             echo "--------------\n";
             echo wordwrap($page[self::TITLE], 80) . "\n";
         }
         if (isset($page[self::DESCRIPTION])) {
             echo "SUMMARY DESCRIPTION:\n";
             echo "--------------\n";
             echo $page[self::DESCRIPTION] . "\n";
         }
         $doc_num++;
     }
 }
コード例 #4
0
ファイル: index_manager.php プロジェクト: yakar/yioop
 /**
  * Returns the number of documents that a given term or phrase appears in
  * in the given index
  *
  * @param string $term_or_phrase what to look up in the indexes dictionary
  *     no  mask is used for this look up
  * @param string $index_name index to look up term or phrase in
  * @param int $threshold if set and positive then once more than threshold
  *     many documents are found the search for more documents to add to
  *     the total is stopped
  * @return mixed int number of documents, or false if the index or its
  *     dictionary is not available
  */
 static function numDocsTerm($term_or_phrase, $index_name, $threshold = -1)
 {
     $index = IndexManager::getIndex($index_name);
     // Guard against a missing index as well as a missing dictionary so
     // that a failed lookup does not dereference a non-object.
     if (!$index || !$index->dictionary) {
         return false;
     }
     $total_num_docs = 0;
     $hashes = allCrawlHashPaths($term_or_phrase, array(), array(), true);
     if (!is_array($hashes)) {
         $hashes = array($hashes);
     }
     foreach ($hashes as $hash) {
         if (is_array($hash)) {
             $dictionary_info = IndexManager::getWordInfo($index_name, $hash[0], $hash[1], $hash[2], $threshold);
         } else {
             $dictionary_info = IndexManager::getWordInfo($index_name, $hash);
         }
         // A lookup that yields no array contributes no documents;
         // calling count() on a non-array would raise warnings (or a
         // TypeError on PHP 8).
         if (!is_array($dictionary_info)) {
             continue;
         }
         $num_generations = count($dictionary_info);
         // When an entry with key -1 is present the scan starts there and
         // the end bound is shifted down by one to match.
         $start = isset($dictionary_info[-1]) ? -1 : 0;
         $end = $start == -1 ? $num_generations - 1 : $num_generations;
         for ($i = $start; $i < $end; $i++) {
             // position 3 of each entry holds its document count; entries
             // lacking it add nothing to the total
             if (!isset($dictionary_info[$i][3])) {
                 continue;
             }
             $total_num_docs += $dictionary_info[$i][3];
             if ($threshold > 0 && $total_num_docs > $threshold) {
                 return $total_num_docs;
             }
         }
     }
     return $total_num_docs;
 }
コード例 #5
0
ファイル: crawl_model.php プロジェクト: yakar/yioop
 /**
  * For each word in an array of words, finds a count of the total number
  * of times it occurs in this crawl model's default index. When machine
  * urls are given (and they are not just a single localhost) the request
  * is sent to those machines and their per-word counts are summed;
  * otherwise the counts come from the local index archive.
  *
  * @param array $words words to find the counts for
  * @param array $machine_urls machines to invoke this command on
  * @return array associative array of word => counts
  */
 function countWords($words, $machine_urls = NULL)
 {
     $word_counts = array();
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
         $request = serialize(array($words, $this->index_name));
         $responses = $this->execMachines("countWords", $machine_urls, $request);
         foreach ($responses as $response) {
             // NOTE(review): unserialize() is applied to data returned by
             // other machines -- confirm those responses are trusted.
             $machine_counts = unserialize(webdecode($response[self::PAGE]));
             if (!is_array($machine_counts)) {
                 continue;
             }
             foreach ($machine_counts as $word => $count) {
                 if (isset($word_counts[$word])) {
                     $word_counts[$word] += $count;
                 } else {
                     $word_counts[$word] = $count;
                 }
             }
         }
         return $word_counts;
     }
     $index_archive = IndexManager::getIndex($this->index_name);
     // Counting is done on hashes; remember which word each hash stands
     // for so the result can be keyed by word.
     $hashes = array();
     $word_of_hash = array();
     foreach ($words as $word) {
         $word_hash = crawlHash($word);
         $hashes[] = $word_hash;
         $word_of_hash[$word_hash] = $word;
     }
     $word_key_counts = $index_archive->countWordKeys($hashes);
     if (is_array($word_key_counts)) {
         foreach ($word_key_counts as $word_key => $count) {
             $word_counts[$word_of_hash[$word_key]] = $count;
         }
     }
     return $word_counts;
 }