コード例 #1
0
ファイル: phrase_parser.php プロジェクト: yakar/yioop
 /**
  * Extracts all phrases (sequences of adjacent words) from $string. Does
  * not extract the terms within those phrases. The array key indicates the
  * position of the phrase.
  *
  * @param string $string subject to extract phrases from
  * @param string $lang locale tag for stemming
  * @param string $index_name name of index to be used as a reference
  *     when extracting phrases
  * @param bool $exact_match whether the match has to be exact or not
  * @param int $threshold roughly, stops the extraction of further phrases
  *  once $threshold is exceeded (more than $threshold may still come back,
  *  because the cutoff is only applied at the points where it is checked)
  * @return array of phrases
  */
 static function extractPhrases($string, $lang = NULL, $index_name = NULL, $exact_match = false, $threshold = 10)
 {
     if (isset(self::$programming_language_map[$lang])) {
         // $lang names a programming language: drop the first
         // strlen($control_word) + 1 bytes of the query (presumably the
         // control word plus one separator character -- confirm with caller)
         $control_word = self::$programming_language_map[$lang] . self::CONTROL_WORD_INDICATOR;
         $string = trim(substr($string, strlen($control_word) + 1));
     } else {
         // return value is not used, so this presumably rewrites $string
         // by reference
         self::canonicalizePunctuatedTerms($string, $lang);
     }
     $terms = self::stemCharGramSegment($string, $lang);
     $num = count($terms);
     /* Nothing to look up without an index or with at most one term; for
        exact phrase search the suffix tree stuff is not used for now.
        In all three cases the plain terms are the answer.
      */
     if ($index_name == NULL || $num <= 1 || $exact_match) {
         return $terms;
     }
     // Only the first MAX_QUERY_TERMS terms take part in phrase lookups.
     // A plain copy is enough here (arrays are copy-on-write), so no
     // reference alias to $terms is needed.
     $first_terms = ($num > MAX_QUERY_TERMS) ? array_slice($terms, 0, MAX_QUERY_TERMS) : $terms;
     $whole_phrase = implode(" ", $first_terms);
     $count_whole_phrase = IndexManager::numDocsTerm($whole_phrase, $index_name, $threshold);
     // Whole phrase is common enough, or the query has more than
     // SUFFIX_TREE_THRESHOLD terms: use the phrase plus the leading term.
     if ($count_whole_phrase >= $threshold || $num > SUFFIX_TREE_THRESHOLD) {
         return array($whole_phrase, $terms[0]);
     }
     if ($count_whole_phrase > 0) {
         // The phrase occurs, but rarely. If any single term's count is
         // more than 50 times the phrase's count, still search by the
         // whole phrase.
         foreach ($terms as $term) {
             $count_term = IndexManager::numDocsTerm($term, $index_name, 5 * $threshold);
             if ($count_term > 50 * $count_whole_phrase) {
                 return array($whole_phrase, $terms[0]);
             }
         }
         return $terms;
     }
     // The whole phrase was not found: with three or more terms, try the
     // phrase minus its last term, then the phrase minus its first term.
     if ($num > 2) {
         $start_terms = $first_terms;
         $last_term = array_pop($start_terms);
         $start_phrase = implode(" ", $start_terms);
         $count_start = IndexManager::numDocsTerm($start_phrase, $index_name, $threshold);
         if ($count_start >= $threshold) {
             return array($start_phrase, $last_term, $terms[0]);
         }
         $end_terms = $first_terms;
         $first_term = array_shift($end_terms);
         $end_phrase = implode(" ", $end_terms);
         $count_end = IndexManager::numDocsTerm($end_phrase, $index_name, $threshold);
         if ($count_end >= $threshold) {
             return array($first_term, $end_phrase);
         }
     }
     // The index-version check that used to sit here returned $terms on
     // both of its paths, so the individual terms are returned directly.
     return $terms;
 }
コード例 #2
0
ファイル: thesaurus.php プロジェクト: yakar/yioop
 /**
  * Returns the number of documents in an index that a phrase occurs in.
  * If it occurs in more than threshold documents then cut off search.
  *
  * @param string $phrase to look up in index
  * @param int $threshold once the count in the posting list for any word
  *     reaches the threshold, return that number
  * @param string $index_name selected index for search engine
  * @param string $lang locale tag for the query
  * @return int number of documents phrase occurs in
  */
 static function numDocsIndex($phrase, $threshold, $index_name, $lang)
 {
     // Without an index there is nothing to count, so answer before doing
     // any canonicalization or stemming work.
     if ($index_name == NULL) {
         return 0;
     }
     // return value is not used, so this presumably rewrites $phrase by
     // reference
     PhraseParser::canonicalizePunctuatedTerms($phrase, $lang);
     $terms = PhraseParser::stemCharGramSegment($phrase, $lang);
     // Only the first MAX_QUERY_TERMS terms are used to build the lookup
     // phrase.
     if (count($terms) > MAX_QUERY_TERMS) {
         $terms = array_slice($terms, 0, MAX_QUERY_TERMS);
     }
     // $threshold is handed to the index lookup as its cutoff.
     return IndexManager::numDocsTerm(implode(" ", $terms), $index_name, $threshold);
 }