/**
 * Extracts all phrases (sequences of adjacent words) from $string. Does
 * not extract terms within those phrases. Array key indicates position
 * of phrase.
 *
 * @param string $string subject to extract phrases from
 * @param string $lang locale tag for stemming; if it is a key of
 *     self::$programming_language_map the leading control word is cut off
 *     $string instead of canonicalizing punctuated terms
 * @param string $index_name name of index to be used as a reference
 *     when extracting phrases; when empty no index lookups are made
 * @param bool $exact_match whether the match has to be exact or not; when
 *     true the individual terms are returned and no index lookups are made
 * @param int $threshold count passed to IndexManager::numDocsTerm and
 *     compared with its result: as soon as a candidate phrase reaches this
 *     count it is accepted and no shorter candidates are tried
 * @return array one of:
 *     - the stemmed/segmented terms (no index name, at most one term,
 *       exact match requested, or no candidate phrase was accepted);
 *     - array(whole phrase, first term);
 *     - array(phrase without its last term, last term, first term);
 *     - array(first term, phrase without its first term)
 */
static function extractPhrases($string, $lang = NULL, $index_name = NULL,
    $exact_match = false, $threshold = 10)
{
    if (isset(self::$programming_language_map[$lang])) {
        /* assumes $string starts with the control word followed by one
           separator character -- both are skipped */
        $control_word = self::$programming_language_map[$lang] .
            self::CONTROL_WORD_INDICATOR;
        $string = trim(substr($string, strlen($control_word) + 1));
    } else {
        self::canonicalizePunctuatedTerms($string, $lang);
    }
    $terms = self::stemCharGramSegment($string, $lang);
    $num = count($terms);
    if ($index_name == NULL || $num <= 1) {
        return $terms;
    }
    if ($exact_match) {
        /* for exact phrase search do not use suffix tree stuff for now */
        return $terms;
    }
    // only the first MAX_QUERY_TERMS terms take part in candidate phrases
    $first_terms = ($num > MAX_QUERY_TERMS) ?
        array_slice($terms, 0, MAX_QUERY_TERMS) : $terms;
    $whole_phrase = implode(" ", $first_terms);
    $count_whole_phrase = IndexManager::numDocsTerm($whole_phrase,
        $index_name, $threshold);
    if ($count_whole_phrase >= $threshold || $num > SUFFIX_TREE_THRESHOLD) {
        return array($whole_phrase, $terms[0]);
    }
    if ($count_whole_phrase > 0) {
        /* the whole phrase occurs, but below the threshold: still use it
           if some single term has more than 50 times its count */
        foreach ($terms as $term) {
            $count_term = IndexManager::numDocsTerm($term, $index_name,
                5 * $threshold);
            if ($count_term > 50 * $count_whole_phrase) {
                return array($whole_phrase, $terms[0]);
            }
        }
    } else if ($num > 2) {
        // whole phrase has count 0: try it without its last term ...
        $start_terms = $first_terms;
        $last_term = array_pop($start_terms);
        $start_phrase = implode(" ", $start_terms);
        $count_start = IndexManager::numDocsTerm($start_phrase,
            $index_name, $threshold);
        if ($count_start >= $threshold) {
            return array($start_phrase, $last_term, $terms[0]);
        }
        // ... then without its first term
        $end_terms = $first_terms;
        $first_term = array_shift($end_terms);
        $end_phrase = implode(" ", $end_terms);
        $count_end = IndexManager::numDocsTerm($end_phrase,
            $index_name, $threshold);
        if ($count_end >= $threshold) {
            return array($first_term, $end_phrase);
        }
    }
    /* An index-version check used to sit here, but both of its outcomes
       returned $terms, so the result never depended on it. */
    return $terms;
}
/**
 * Parses from a string phrase representing a conjunctive query, a struct
 * consisting of the words keys searched for, the allowed and disallowed
 * phrases, the weight that should be put on these query results, and
 * which archive to use.
 *
 * @param string& $phrase string to extract struct from. A space is
 *     prepended to it, and if the phrase semantics is guessed or an if
 *     condition is processed the value of phrase will be altered further.
 *     (Helps for feeding to network queries)
 * @param bool $guess_semantics whether to do query rewriting before parse
 * @return array pair array($word_struct, $format_words). $word_struct is
 *     NULL when no word keys could be computed, otherwise an array with
 *     the fields "KEYS", "QUOTE_POSITIONS", "DISALLOW_KEYS", "WEIGHT" and
 *     "INDEX_NAME". $format_words holds the unstemmed query words together
 *     with the stemmed/char-grammed words, without empty strings and
 *     without any word that occurs (ignoring case) inside another one
 */
function parseWordStructConjunctiveQuery(&$phrase, $guess_semantics = true)
{
    $query = $phrase;
    // indentation prefixes used when writing query statistics
    $indent = " ";
    $in2 = $indent . $indent;
    $in3 = $in2 . $indent;
    $in4 = $in2 . $in2;
    $phrase = " " . $phrase;
    if ($guess_semantics) {
        $phrase = $this->guessSemantics($phrase);
    }
    $phrase = $this->parseIfConditions($phrase);
    $phrase_string = $phrase;
    list($found_metas, $found_materialized_metas, $disallow_phrases,
        $phrase_string, $query_string, $index_name, $weight) =
        $this->extractMetaWordInfo($phrase);
    /* we search using the stemmed/char-grammed words, but we format
       snippets in the results by bolding either */
    $query_words = explode(" ", $query_string); //not stemmed
    if ($this->program_indicator) {
        // guess the locale from the query exactly as it was passed in
        $query_string = $query;
        $this->program_indicator = false;
    }
    $locale_tag = guessLocaleFromString($query_string);
    /* Splitting on double quotes yields alternating unquoted/quoted
       pieces; $quote_state tracks which kind the current piece is */
    $quote_state = false;
    $phrase_parts = explode('"', $phrase_string);
    $base_words = array();
    $num_words = 0;
    $quote_positions = array();
    foreach ($phrase_parts as $phrase_part) {
        if (trim($phrase_part) == "") {
            $quote_state = !$quote_state;
            continue;
        }
        /* still use original phrase string here to handle acronyms
           abbreviations and the like that use periods */
        if ($quote_state) {
            /* quoted piece: a '*' separates sub-parts; record for each
               extracted word its index in $base_words and its length in
               terms, plus a "*<index>" marker where a '*' occurred */
            $sub_parts = explode('*', $phrase_part);
            $first_part = true;
            $quote_position = array();
            foreach ($sub_parts as $sub_part) {
                if (!$first_part) {
                    $quote_position["*{$num_words}"] = "*";
                }
                $new_words = PhraseParser::extractPhrases($sub_part,
                    $locale_tag, $index_name, true);
                $base_words = array_merge($base_words, $new_words);
                foreach ($new_words as $new_word) {
                    $len = substr_count($new_word, " ") + 1;
                    $quote_position[$num_words] = $len;
                    $num_words++;
                }
                $first_part = false;
            }
            $quote_positions[] = $quote_position;
        } else {
            $new_words = PhraseParser::extractPhrases($phrase_part,
                $locale_tag, $index_name);
            /* extractPhrases may return nothing, so make sure there is a
               first entry before inspecting it. If that entry is a
               multi-term phrase and no materialized metas were found, the
               last returned entry is dropped. */
            if (isset($new_words[0]) && strpos($new_words[0], " ") > 0 &&
                $found_materialized_metas == array()) {
                array_pop($new_words);
            }
            $base_words = array_merge($base_words, $new_words);
        }
        $num_words = count($base_words);
        $quote_state = !$quote_state;
    }
    //stemmed, if have stemmer
    $index_version = IndexManager::getVersion($index_name);
    $add_metas = $found_metas;
    if (count($base_words) > 0 && $index_version > 0) {
        $add_metas = array_diff($found_metas, $found_materialized_metas);
    }
    $words = array_merge($base_words, $add_metas);
    if (count($words) == 0 && count($disallow_phrases) > 0) {
        // only disallowed phrases were given: search on "site:any"
        $words[] = "site:any";
    }
    if (QUERY_STATISTICS) {
        if (!isset($this->query_info['QUERY'])) {
            $this->query_info['QUERY'] = "";
        }
        $this->query_info['QUERY'] .= "{$in3}<i>Index</i>: " .
            $index_name . "<br />";
        $this->query_info['QUERY'] .= "{$in3}<i>LocaleTag</i>: " .
            $locale_tag . "<br />";
        $this->query_info['QUERY'] .=
            "{$in3}<i>Stemmed/Char-grammed Words</i>:<br />";
        foreach ($base_words as $word) {
            $this->query_info['QUERY'] .= "{$in4}{$word}<br />";
        }
        $this->query_info['QUERY'] .= "{$in3}<i>Meta Words</i>:<br />";
        foreach ($found_metas as $word) {
            $this->query_info['QUERY'] .= "{$in4}{$word}<br />";
        }
        $this->query_info['QUERY'] .=
            "{$in3}<i>Quoted Word Locs</i>:<br />";
        foreach ($quote_positions as $quote_position) {
            $this->query_info['QUERY'] .= "{$in4}(";
            $comma = "";
            foreach ($quote_position as $pos => $len) {
                $this->query_info['QUERY'] .= "{$comma} {$pos} => {$len}";
                $comma = ",";
            }
            $this->query_info['QUERY'] .= ")<br />";
        }
    }
    if (count($words) == 1 && count($disallow_phrases) < 1 &&
        !strpos($words[0], " ")) {
        // exactly one search word and nothing disallowed
        $phrase_string = $words[0];
        if ($index_version == 0) {
            $tmp_hash = allCrawlHashPaths($phrase_string);
            $tmp_hash = is_array($tmp_hash) ? $tmp_hash : array($tmp_hash);
            $phrase_hash = array_merge(array($tmp_hash),
                array(crawlHash($phrase_string)));
        } else {
            if ($found_materialized_metas == array()) {
                $phrase_hash = allCrawlHashPaths($phrase_string);
            } else {
                $phrase_hash = allCrawlHashPaths($phrase_string,
                    $found_materialized_metas,
                    PhraseParser::$materialized_metas);
            }
        }
        $word_struct = array("KEYS" => array($phrase_hash),
            "QUOTE_POSITIONS" => NULL, "DISALLOW_KEYS" => array(),
            "WEIGHT" => $weight, "INDEX_NAME" => $index_name);
    } else {
        //get a raw list of words and their hashes
        $word_keys = array();
        $metas_accounted = false;
        $materialized_metas = array();
        foreach ($words as $word) {
            /* from the first single-term, non-meta word onwards the
               materialized metas are passed to the hash computation */
            if (!$metas_accounted && substr_count($word, " ") == 0
                && !in_array($word, $found_metas)) {
                $metas_accounted = true;
                $materialized_metas = $found_materialized_metas;
            }
            $tmp_hash = allCrawlHashPaths($word, $materialized_metas,
                PhraseParser::$materialized_metas);
            if ($index_version == 0) {
                // version 0 indexes additionally get crawlHash of the word
                $tmp_hash = is_array($tmp_hash) ? $tmp_hash :
                    array($tmp_hash);
                $word_keys[] = array_merge($tmp_hash,
                    array(crawlHash($word)));
            } else {
                $word_keys[] = $tmp_hash;
            }
        }
        $disallow_keys = array();
        $num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases));
        if ($num_disallow_keys > 0 && QUERY_STATISTICS) {
            $this->query_info['QUERY'] .= "{$in3}<i>Disallowed Words</i>:" .
                "<br />";
        }
        for ($i = 0; $i < $num_disallow_keys; $i++) {
            // check if disallowed is a meta word and stem or not stem
            if (mb_strstr($disallow_phrases[$i], ':') === false) {
                $disallow_stem = PhraseParser::extractPhrases(
                    $disallow_phrases[$i], getLocaleTag()); //stemmed
            } else {
                $disallow_stem[0] = $disallow_phrases[$i];
            }
            if (QUERY_STATISTICS) {
                $this->query_info['QUERY'] .= "{$in4}{$disallow_stem[0]}" .
                    "<br />";
            }
            $disallow_keys[] = crawlHashWord($disallow_stem[0]);
            if ($index_version == 0) {
                /* hash the disallowed term itself; $word here would be the
                   stale value left by the foreach over $words above, i.e.
                   the last word being searched for */
                $disallow_keys[] = crawlHash($disallow_stem[0]);
            }
        }
        // without any word keys there is nothing to look up
        $word_struct = NULL;
        if (count($word_keys) > 0) {
            $word_struct = array("KEYS" => $word_keys,
                "QUOTE_POSITIONS" => $quote_positions,
                "DISALLOW_KEYS" => $disallow_keys, "WEIGHT" => $weight,
                "INDEX_NAME" => $index_name);
        }
    }
    $pre_format_words = array();
    foreach ($base_words as $base_word) {
        $pre_format_words = array_merge($pre_format_words,
            explode(" * ", $base_word));
    }
    $pre_format_words = array_values(array_unique(
        array_merge($query_words, $pre_format_words)));
    /* Keep a word for formatting only if no other word contains it
       (ignoring case); of words equal up to case the earliest is kept */
    $format_words = array();
    $count = count($pre_format_words);
    for ($i = 0; $i < $count; $i++) {
        if ($pre_format_words[$i] == "") {
            continue;
        }
        $needle = mb_strtolower($pre_format_words[$i]);
        $flag = true;
        for ($j = 0; $j < $count; $j++) {
            if ($j == $i) {
                continue;
            }
            $hay = mb_strtolower($pre_format_words[$j]);
            if ($hay == $needle && $j > $i) {
                continue;
            }
            /* compare against false: mb_strstr returns the matched tail,
               which is falsy when that tail is the string "0" */
            if (mb_strstr($hay, $needle) !== false) {
                $flag = false;
                break;
            }
        }
        if ($flag) {
            $format_words[] = $pre_format_words[$i];
        }
    }
    return array($word_struct, $format_words);
}