/**
 * Extracts phrases (sequences of adjacent words) from $string. Terms that
 * occur inside an extracted phrase are not additionally extracted on their
 * own. Array key indicates position of phrase.
 *
 * The terms produced by stemCharGramSegment() are returned unchanged when
 * no index is given, when there is at most one term, or when an exact match
 * is requested. Otherwise the index is consulted through
 * IndexManager::numDocsTerm() to decide whether the terms can be replaced
 * by a longer phrase (the whole phrase, or the phrase with its last or
 * first term split off).
 *
 * @param string $string subject to extract phrases from
 * @param string $lang locale tag for stemming
 * @param string $index_name name of index to be used as a reference
 *     when extracting phrases
 * @param bool $exact_match whether the match has to be exact or not
 * @param int $threshold document count at which a candidate phrase is
 *     considered frequent enough to be used in place of its separate
 *     terms; it is also passed to IndexManager::numDocsTerm() as the
 *     cut-off for counting
 * @return array of phrases
 */
static function extractPhrases($string, $lang = NULL, $index_name = NULL,
    $exact_match = false, $threshold = 10)
{
    if (isset(self::$programming_language_map[$lang])) {
        /* skip the first strlen($control_word) + 1 bytes of $string
           (presumably the control word plus one separator character --
           confirm against the callers that prepend it)
         */
        $control_word = self::$programming_language_map[$lang] .
            self::CONTROL_WORD_INDICATOR;
        $string = trim(substr($string, strlen($control_word) + 1));
    } else {
        self::canonicalizePunctuatedTerms($string, $lang);
    }
    $terms = self::stemCharGramSegment($string, $lang);
    $num = count($terms);
    /* Loose comparison with NULL is deliberate: an empty-string index name
       is treated the same as no index.
       For exact phrase search do not use suffix tree stuff for now.
     */
    if ($index_name == NULL || $num <= 1 || $exact_match) {
        return $terms;
    }
    // only the first MAX_QUERY_TERMS terms take part in phrase look-ups
    $first_terms = ($num > MAX_QUERY_TERMS) ?
        array_slice($terms, 0, MAX_QUERY_TERMS) : $terms;
    $whole_phrase = implode(" ", $first_terms);
    $count_whole_phrase = IndexManager::numDocsTerm($whole_phrase,
        $index_name, $threshold);
    if ($count_whole_phrase >= $threshold ||
        $num > SUFFIX_TREE_THRESHOLD) {
        return array($whole_phrase, $terms[0]);
    }
    if ($count_whole_phrase > 0) {
        /* the whole phrase occurs, but in fewer than $threshold documents:
           still use it if some single term occurs in more than 50 times as
           many documents as the whole phrase does
         */
        foreach ($terms as $term) {
            $count_term = IndexManager::numDocsTerm($term, $index_name,
                5 * $threshold);
            if ($count_term > 50 * $count_whole_phrase) {
                return array($whole_phrase, $terms[0]);
            }
        }
        return $terms;
    }
    if ($num > 2) {
        // whole phrase not found: try everything except the last term ...
        $start_terms = $first_terms;
        $last_term = array_pop($start_terms);
        $start_phrase = implode(" ", $start_terms);
        $count_start = IndexManager::numDocsTerm($start_phrase,
            $index_name, $threshold);
        if ($count_start >= $threshold) {
            return array($start_phrase, $last_term, $terms[0]);
        }
        // ... and then everything except the first term
        $end_terms = $first_terms;
        $first_term = array_shift($end_terms);
        $end_phrase = implode(" ", $end_terms);
        $count_end = IndexManager::numDocsTerm($end_phrase, $index_name,
            $threshold);
        if ($count_end >= $threshold) {
            return array($first_term, $end_phrase);
        }
    }
    /* No usable phrase was found. The earlier index-version check at this
       point returned $terms on both of its branches, so it is dropped.
     */
    return $terms;
}
/**
 * Returns the number of documents in an index that a phrase occurs in.
 * If it occurs in more than threshold documents then the search is cut
 * off.
 *
 * The phrase is canonicalized and segmented into terms, truncated to at
 * most MAX_QUERY_TERMS terms, re-joined with single spaces, and then looked
 * up with IndexManager::numDocsTerm().
 *
 * @param string $phrase to look up in index
 * @param int $threshold once the count in the posting list reaches this
 *     threshold, the count is returned (passed through to
 *     IndexManager::numDocsTerm())
 * @param string $index_name selected index for search engine
 * @param string $lang locale tag for the query
 * @return int number of documents phrase occurs in; 0 if no index name
 *     was supplied
 */
static function numDocsIndex($phrase, $threshold, $index_name, $lang)
{
    /* Without an index there is nothing to count, so bail out before doing
       any canonicalization or stemming work. Loose comparison with NULL is
       deliberate: an empty-string index name is treated as no index.
     */
    if ($index_name == NULL) {
        return 0;
    }
    PhraseParser::canonicalizePunctuatedTerms($phrase, $lang);
    $terms = PhraseParser::stemCharGramSegment($phrase, $lang);
    if (count($terms) > MAX_QUERY_TERMS) {
        $terms = array_slice($terms, 0, MAX_QUERY_TERMS);
    }
    $whole_phrase = implode(" ", $terms);
    return IndexManager::numDocsTerm($whole_phrase, $index_name,
        $threshold);
}