/**
 * Computes similar words and scores from WordNet output based on word type.
 *
 * Runs the executable named by WORDNET_EXEC with the "-over" option for
 * $term, isolates the overview section for the requested part of speech,
 * and for each numbered definition in that section extracts the list of
 * synonym phrases together with any quoted example sentences. Each
 * definition is scored by how similar its example sentences are to
 * $whole_query.
 *
 * @param string $term term to find related thesaurus terms
 * @param string $word_type is the type of word such as "NN" (noun),
 *     "VB" (verb), "AJ" (adjective), or "AV" (adverb)
 *     (all other types will be ignored)
 * @param string $whole_query the original query $term came from
 * @return array|null a sequence of
 *     (score => array of thesaurus terms) associations sorted by
 *     descending score, each score representing one word sense of term.
 *     The inner arrays keep the position keys the phrases had in the
 *     WordNet synonym list, so they are not necessarily 0-based.
 *     null is returned if $word_type is not one of the supported types,
 *     if $term is blank, if the WordNet command produced no output, or if
 *     the output has no section for the requested word type.
 */
static function scoredThesaurusMatches($term, $word_type, $whole_query)
{
    $word_map = array("VB" => "verb", "NN" => "noun", "AJ" => "adj",
        "AV" => "adv");
    $term = trim((string)$term);
    // Validate the inputs before paying for an external process
    if ($term === '' || !isset($word_map[$word_type])) {
        return null;
    }
    /* Gets overview of senses of term into data. The term originates from
       a user query, so it must be quoted before it reaches the shell.
     */
    $data = array();
    exec(WORDNET_EXEC . " " . escapeshellarg($term) . " -over", $data);
    if (!$data) {
        return null;
    }
    $full_name = $word_map[$word_type];
    $lexicon_output = implode("\n", $data);
    /* Split on the header line "The <type> ..." that is followed by a blank
       line; index 1 then begins with the senses for the requested type
     */
    $sense_parts = preg_split("/\\bThe\\s{$full_name}" . '[^\\n]*\\n\\n/',
        $lexicon_output);
    if (!isset($sense_parts[1])) {
        return null;
    }
    // Drop everything from the next "Overview of ..." section onwards
    list($sense, ) = preg_split("/\\bOverview\\sof\\s/", $sense_parts[1]);
    // Each definition is introduced by its number, e.g. "1. "
    $definitions_for_sense = preg_split("/\\d+\\.\\s/", $sense, -1,
        PREG_SPLIT_NO_EMPTY);
    if (!$definitions_for_sense) {
        return array();
    }
    // These do not vary per definition, so compute them only once
    $query_parts = explode(' ', strtolower($whole_query));
    $lower_term = strtolower($term);
    $similar_phrases = array();
    foreach ($definitions_for_sense as $i => $definition) {
        //get sentence fragments examples of using that definition
        preg_match_all('/\\"(.*?)\\"/', $definition, $matches);
        /* to separate out the words: the comma separated synonym list is
           the text that precedes the run of dashes
         */
        if (!preg_match('/[\\w+\\s\\,\\.\']+\\s\\-+/', $definition,
            $match_word)) {
            // no synonym list could be found for this definition
            continue;
        }
        $thesaurus_phrases = preg_split("/\\s*\\,\\s*/",
            strtolower(rtrim(trim($match_word[0]), "-")));
        /* remove original term from thesaurus phrases if present. The
           phrases were lowercased above, so compare against the lowercased
           term
         */
        foreach ($thesaurus_phrases as $key => $thesaurus_phrase) {
            if (trim($thesaurus_phrase) == $lower_term) {
                unset($thesaurus_phrases[$key]);
            }
        }
        // discard empty entries
        $thesaurus_phrases = array_filter($thesaurus_phrases);
        if ($thesaurus_phrases == array()) {
            continue;
        }
        $num_example_sentences = count($matches[1]);
        $score_total = 0;
        foreach ($matches[1] as $example_sentence) {
            $example_sentence_parts = explode(' ',
                strtolower($example_sentence));
            $sentence_score = PhraseParser::getCosineRank($query_parts,
                $example_sentence_parts);
            /* If Cosine similarity is zero then go for
             * intersection similarity ranking
             */
            if ($sentence_score == 0) {
                $sentence_score = PhraseParser::getIntersection(
                    $query_parts, $example_sentence_parts);
            }
            $score_total += $sentence_score;
        }
        /* We use the rounded average of the above times 100 as a score
           for a definition. To avoid ties we store in the low order
           digits 99 - the definition it was. (This tie-break only
           stays within two digits for fewer than 100 definitions.)
         */
        $definition_score = 99 - $i;
        if ($num_example_sentences > 0) {
            $definition_score += (int)(100 *
                round(100 * ($score_total / $num_example_sentences)));
        }
        $similar_phrases[$definition_score] = $thesaurus_phrases;
    }
    krsort($similar_phrases);
    return $similar_phrases;
}