/**
 * Picks the phrases of a page summary that are most distinctive for that
 * page relative to a whole index, so they can be used to look up related
 * documents.
 *
 * Each phrase extracted from the summary is scored as
 *   (occurrences in the summary) / (occurrences reported by the crawl
 *   model's countWords for the index).
 * A phrase scores 0 when countWords reports no positive count for it, and
 * also when its score would be exactly 1 (the summary accounts for every
 * reported occurrence, so the phrase cannot lead to other documents).
 * Zero-scored phrases are not removed; they simply sort with score 0 and
 * can still be returned if $num is large enough.
 *
 * @param array $crawl_item a page summary; self::LANG is read from it and
 *     DEFAULT_LOCALE is used when that field is not set
 * @param int $num maximum number of key phrases to return
 * @param int $crawl_time the timestamp of an index to use, if 0 then
 *     the crawl model's current index is used
 * @return array up to $num phrases, in the order produced by sorting the
 *     scores with the "greaterThan" comparator (defined elsewhere;
 *     presumably largest score first)
 */
function getTopPhrases($crawl_item, $num, $crawl_time = 0)
{
    $crawl_model = $this->model("crawl");
    $queue_servers = $this->model("machine")->getQueueServerUrls();

    // Fall back to whichever index the crawl model considers current.
    if ($crawl_time == 0) {
        $crawl_time = $crawl_model->getCurrentIndexDatabaseName();
    }
    $this->model("phrase")->index_name = $crawl_time;
    $crawl_model->index_name = $crawl_time;

    $phrase_string =
        PhraseParser::extractWordStringPageSummary($crawl_item);

    // The locale is only defaulted after the word string was extracted.
    if (!isset($crawl_item[self::LANG])) {
        $crawl_item[self::LANG] = DEFAULT_LOCALE;
    }
    $locale = $crawl_item[self::LANG];

    $page_word_counts =
        PhraseParser::extractPhrasesAndCount($phrase_string, $locale);
    $index_counts = $crawl_model->countWords(
        array_keys($page_word_counts), $queue_servers);

    $scores = array();
    foreach ($page_word_counts as $phrase => $page_count) {
        $score = 0;
        if (isset($index_counts[$phrase]) && $index_counts[$phrase] > 0) {
            $score = $page_count / $index_counts[$phrase];
        }
        /* a score of 1 means this summary holds every counted occurrence,
           so the phrase is useless for finding related documents */
        if ($score == 1) {
            $score = 0;
        }
        $scores[$phrase] = $score;
    }

    uasort($scores, "greaterThan");

    return array_slice(array_keys($scores), 0, $num);
}