示例#1
0
 /**
  * Picks the phrases of a page summary that are most distinctive relative
  * to an index, so that they can be used to look up related documents.
  *
  * Every phrase extracted from the summary is scored as
  *   (occurrences in the summary) / (occurrences reported by countWords).
  * A phrase for which countWords reports no positive count scores 0. A
  * score of exactly 1 is also reset to 0: such a phrase occurs nowhere
  * but in this summary, so it cannot lead to other documents. The scores
  * are then ordered with the "greaterThan" comparator (largest ratio
  * first, per the selection rule above) and the first $num phrases are
  * returned. Zero-scored phrases are only demoted, not removed, so they
  * can still be returned when fewer than $num phrases score above 0.
  *
  * Side effect: index_name is set on both the phrase model and the crawl
  * model to the index that was used.
  *
  * @param array $crawl_item a page summary; its self::LANG entry gives the
  *     locale used for phrase extraction (DEFAULT_LOCALE if not set)
  * @param int $num number of key phrases to return
  * @param int $crawl_time the timestamp of an index to use, if 0 then
  *     the crawl model's current index database name is used
  * @return array the (at most) $num most selective key phrases
  */
 function getTopPhrases($crawl_item, $num, $crawl_time = 0)
 {
     $crawl_model = $this->model("crawl");
     $queue_servers = $this->model("machine")->getQueueServerUrls();
     if ($crawl_time == 0) {
         $crawl_time = $crawl_model->getCurrentIndexDatabaseName();
     }
     // point both models at the index being consulted
     $this->model("phrase")->index_name = $crawl_time;
     $crawl_model->index_name = $crawl_time;

     // the word string is built from the summary before the locale
     // default below is filled in
     $summary_text =
         PhraseParser::extractWordStringPageSummary($crawl_item);
     if (!isset($crawl_item[self::LANG])) {
         $crawl_item[self::LANG] = DEFAULT_LOCALE;
     }
     $local_counts = PhraseParser::extractPhrasesAndCount($summary_text,
         $crawl_item[self::LANG]);
     $index_counts = $crawl_model->countWords(array_keys($local_counts),
         $queue_servers);

     $selectivity = array();
     foreach ($local_counts as $phrase => $local_count) {
         $ratio = 0;
         if (isset($index_counts[$phrase]) && $index_counts[$phrase] > 0) {
             $ratio = $local_count / $index_counts[$phrase];
         }
         /* a ratio of 1 means the phrase was only counted in this
            summary, so it is useless for finding related documents */
         if ($ratio == 1) {
             $ratio = 0;
         }
         $selectivity[$phrase] = $ratio;
     }
     uasort($selectivity, "greaterThan");

     return array_slice(array_keys($selectivity), 0, $num);
 }