コード例 #1
0
ファイル: tokenizer.php プロジェクト: yakar/yioop
 /**
  * Computes similar words and scores from WordNet output based on word type.
  *
  * Runs the program named by WORDNET_EXEC with the "-over" option for $term,
  * extracts the overview section for the requested part of speech, and for
  * each numbered definition in it collects the comma separated thesaurus
  * phrases listed before the dash. Each definition is scored by comparing
  * its quoted example sentences against $whole_query using
  * PhraseParser::getCosineRank, falling back to
  * PhraseParser::getIntersection when the cosine rank is zero.
  *
  * @param string $term term to find related thesaurus terms
  * @param string $word_type is the type of word such as "NN" (noun),
  *     "VB" (verb), "AJ" (adjective), or "AV" (adverb)
  *     (all other types will be ignored)
  * @param string $whole_query the original query $term came from
  * @return array|null a sequence of
  *     (score => array of thesaurus terms) associations sorted by
  *     descending score, each score representing one word sense of term.
  *     NULL is returned if $term is empty, $word_type is not one of the
  *     supported types, the executable produced no output, or the output
  *     has no section for the requested word type
  */
 static function scoredThesaurusMatches($term, $word_type, $whole_query)
 {
     $word_map = array("VB" => "verb", "NN" => "noun", "AJ" => "adj",
         "AV" => "adv");
     $term = trim((string) $term);
     // Reject unusable input before paying for an external process
     if ($term === "" || !isset($word_map[$word_type])) {
         return NULL;
     }
     /* Gets overview of senses of term into data. The term comes from a
        user query, so it must be escaped before being handed to the shell
      */
     $data = array();
     exec(WORDNET_EXEC . " " . escapeshellarg($term) . " -over", $data);
     if (!$data) {
         return NULL;
     }
     $full_name = $word_map[$word_type];
     $lexicon_output = implode("\n", $data);
     $sense_parts = preg_split("/\\bThe\\s{$full_name}" . '[^\\n]*\\n\\n/',
         $lexicon_output);
     if (!isset($sense_parts[1])) {
         return NULL;
     }
     // Keep only the text up to the next "Overview of ..." section
     list($sense, ) = preg_split("/\\bOverview\\sof\\s/", $sense_parts[1]);
     $definitions_for_sense = preg_split("/\\d+\\.\\s/", $sense, -1,
         PREG_SPLIT_NO_EMPTY);
     // Thesaurus phrases are lower-cased below, so compare in lower case
     $lower_term = strtolower($term);
     // The query does not change per definition, so split it only once
     $query_parts = explode(' ', strtolower($whole_query));
     $similar_phrases = array();
     foreach ($definitions_for_sense as $i => $definition) {
         //get sentence fragments examples of using that definition
         preg_match_all('/\\"(.*?)\\"/', $definition, $matches);
         // to separate out the words
         if (!preg_match('/[\\w+\\s\\,\\.\']+\\s\\-+/', $definition,
             $match_word)) {
             // no "word, word -- " prefix, so no thesaurus phrases here
             continue;
         }
         $thesaurus_phrases = preg_split("/\\s*\\,\\s*/",
             strtolower(rtrim(trim($match_word[0]), "-")));
         //remove original term from thesaurus phrases if present
         foreach ($thesaurus_phrases as $key => $thesaurus_phrase) {
             if (trim($thesaurus_phrase) === $lower_term) {
                 unset($thesaurus_phrases[$key]);
             }
         }
         // drop empty phrases; array keys are left as they were
         $thesaurus_phrases = array_filter($thesaurus_phrases);
         if ($thesaurus_phrases == array()) {
             continue;
         }
         $num_example_sentences = count($matches[1]);
         $score = array();
         for ($j = 0; $j < $num_example_sentences; $j++) {
             $example_sentence_parts = explode(' ',
                 strtolower($matches[1][$j]));
             $score[$j] = PhraseParser::getCosineRank($query_parts,
                 $example_sentence_parts);
             /*  If Cosine similarity is zero then go for
              * intersection similarity ranking
              */
             if ($score[$j] == 0) {
                 $score[$j] = PhraseParser::getIntersection($query_parts,
                     $example_sentence_parts);
             }
         }
         /*  We use the rounded average of the above times 100 as a
             score for a definition. To avoid ties we store in the low
             order digits 99 - the index of the definition. The cast
             keeps the array key an integer rather than a float
          */
         if ($num_example_sentences > 0) {
             $definition_score = (int) (100 * round(100 *
                 (array_sum($score) / $num_example_sentences))) + (99 - $i);
         } else {
             $definition_score = 99 - $i;
         }
         $similar_phrases[$definition_score] = $thesaurus_phrases;
     }
     krsort($similar_phrases);
     return $similar_phrases;
 }