コード例 #1
0
ファイル: phrase_parser.php プロジェクト: yakar/yioop
 /**
  * Splits $string into stemmed/char-grammed terms and, when an index is
  * supplied, decides whether those terms should be looked up as one whole
  * phrase, as a phrase plus a single term, or as individual terms.
  *
  * Return shapes produced by this method:
  *  - the plain list of terms from stemCharGramSegment() (no index given,
  *    at most one term, exact matching requested, or no phrase qualified);
  *  - array(whole phrase, first term) when the whole phrase is frequent
  *    enough, the query is longer than SUFFIX_TREE_THRESHOLD, or some
  *    single term is far more common than the whole phrase;
  *  - array(all-but-last phrase, last term, first term) or
  *    array(first term, all-but-first phrase) when one of those shorter
  *    phrases reaches $threshold documents.
  * NOTE(review): in the shapes that start with a multi-word phrase the
  * final element repeats the first term; callers seem to discard it --
  * confirm before relying on it.
  *
  * @param string $string subject to extract phrases from
  * @param string $lang locale tag for stemming; if it is a key of
  *     self::$programming_language_map the leading control word is
  *     stripped from $string instead of canonicalizing punctuation
  * @param string $index_name name of index used to count documents
  *     containing a candidate phrase; when empty only the terms are
  *     returned
  * @param bool $exact_match if true the individual terms are returned and
  *     no phrase statistics are consulted
  * @param int $threshold document count a phrase must reach to be kept as
  *     a phrase; also passed to IndexManager::numDocsTerm() as its third
  *     argument
  * @return array of terms and/or phrases as described above
  */
 static function extractPhrases($string, $lang = NULL, $index_name = NULL, $exact_match = false, $threshold = 10)
 {
     if (isset(self::$programming_language_map[$lang])) {
         // drop the control word (plus one separator character) that
         // prefixes programming-language queries
         $control_word = self::$programming_language_map[$lang] . self::CONTROL_WORD_INDICATOR;
         $string = trim(substr($string, strlen($control_word) + 1));
     } else {
         self::canonicalizePunctuatedTerms($string, $lang);
     }
     $terms = self::stemCharGramSegment($string, $lang);
     $num = count($terms);
     /* Nothing to combine without an index or with fewer than two terms;
        exact phrase search does not use the phrase statistics for now. */
     if ($index_name == NULL || $num <= 1 || $exact_match) {
         return $terms;
     }
     // only the first MAX_QUERY_TERMS terms take part in phrase building
     $first_terms = ($num > MAX_QUERY_TERMS) ? array_slice($terms, 0, MAX_QUERY_TERMS) : $terms;
     $whole_phrase = implode(" ", $first_terms);
     $count_whole_phrase = IndexManager::numDocsTerm($whole_phrase, $index_name, $threshold);
     if ($count_whole_phrase >= $threshold || $num > SUFFIX_TREE_THRESHOLD) {
         return array($whole_phrase, $terms[0]);
     }
     if ($count_whole_phrase > 0) {
         // the phrase is rare; still prefer it if any single term is
         // more than 50 times as common as the phrase
         foreach ($terms as $term) {
             $count_term = IndexManager::numDocsTerm($term, $index_name, 5 * $threshold);
             if ($count_term > 50 * $count_whole_phrase) {
                 return array($whole_phrase, $terms[0]);
             }
         }
         return $terms;
     }
     if ($num > 2) {
         // whole phrase unseen: try it without its last term ...
         $start_terms = $first_terms;
         $last_term = array_pop($start_terms);
         $start_phrase = implode(" ", $start_terms);
         $count_start = IndexManager::numDocsTerm($start_phrase, $index_name, $threshold);
         if ($count_start >= $threshold) {
             return array($start_phrase, $last_term, $terms[0]);
         }
         // ... and then without its first term
         $end_terms = $first_terms;
         $first_term = array_shift($end_terms);
         $end_phrase = implode(" ", $end_terms);
         $count_end = IndexManager::numDocsTerm($end_phrase, $index_name, $threshold);
         if ($count_end >= $threshold) {
             return array($first_term, $end_phrase);
         }
     }
     /* The original code checked the index version here but returned
        $terms in either case, so the check was removed. */
     return $terms;
 }
コード例 #2
0
ファイル: phrase_model.php プロジェクト: yakar/yioop
 /**
  * Parses from a string phrase representing a conjunctive query, a struct
  * consisting of the word keys searched for, the quoted-phrase positions,
  * the disallowed keys, the weight that should be put on these query
  * results, and which index to use.
  *
  * The returned struct is an array with keys "KEYS", "QUOTE_POSITIONS",
  * "DISALLOW_KEYS", "WEIGHT" and "INDEX_NAME", or NULL when the query
  * yields no word keys. When QUERY_STATISTICS is on, a description of the
  * parse is appended to $this->query_info['QUERY'].
  *
  * @param string& $phrase string to extract struct from. It is rewritten
  *     in place: a space is prepended, then it is passed through
  *     guessSemantics() (if requested) and parseIfConditions(). (Helps for
  *     feeding to network queries)
  * @param bool $guess_semantics whether to do query rewriting before parse
  * @return array pair (word struct or NULL, list of words to use when
  *     formatting/bolding result snippets)
  */
 function parseWordStructConjunctiveQuery(&$phrase, $guess_semantics = true)
 {
     $query = $phrase;
     // indentation strings used only for the query statistics output
     $indent = "&nbsp;&nbsp;";
     $in2 = $indent . $indent;
     $in3 = $in2 . $indent;
     $in4 = $in2 . $in2;
     $phrase = " " . $phrase;
     if ($guess_semantics) {
         $phrase = $this->guessSemantics($phrase);
     }
     $phrase = $this->parseIfConditions($phrase);
     $phrase_string = $phrase;
     list($found_metas, $found_materialized_metas, $disallow_phrases, $phrase_string, $query_string, $index_name, $weight) = $this->extractMetaWordInfo($phrase);
     /*
        we search using the stemmed/char-grammed words, but we format
        snippets in the results by bolding either
     */
     $query_words = explode(" ", $query_string);
     //not stemmed
     if ($this->program_indicator) {
         $query_string = $query;
         $this->program_indicator = false;
     }
     $locale_tag = guessLocaleFromString($query_string);
     /* Splitting on '"' yields alternating unquoted/quoted pieces;
        $quote_state tracks which kind the current piece is. */
     $quote_state = false;
     $phrase_parts = explode('"', $phrase_string);
     $base_words = array();
     $num_words = 0;
     $quote_positions = array();
     foreach ($phrase_parts as $phrase_part) {
         if (trim($phrase_part) == "") {
             $quote_state = !$quote_state;
             continue;
         }
         /*still use original phrase string here to handle
           acronyms abbreviations and the like that use periods */
         if ($quote_state) {
             // inside quotes '*' separates sub-phrases; record where each
             // word starts and how many terms it spans
             $sub_parts = explode('*', $phrase_part);
             $first_part = true;
             $quote_position = array();
             foreach ($sub_parts as $sub_part) {
                 if (!$first_part) {
                     $quote_position["*{$num_words}"] = "*";
                 }
                 $new_words = PhraseParser::extractPhrases($sub_part, $locale_tag, $index_name, true);
                 $base_words = array_merge($base_words, $new_words);
                 foreach ($new_words as $new_word) {
                     $len = substr_count($new_word, " ") + 1;
                     $quote_position[$num_words] = $len;
                     $num_words++;
                 }
                 $first_part = false;
             }
             $quote_positions[] = $quote_position;
         } else {
             $new_words = PhraseParser::extractPhrases($phrase_part, $locale_tag, $index_name);
             /* When the first entry is a multi-word phrase and there are
                no materialized metas, the last entry is dropped. The isset
                guards against extractPhrases() returning no words. */
             if (isset($new_words[0]) && strpos($new_words[0], " ") > 0 && $found_materialized_metas == array()) {
                 array_pop($new_words);
             }
             $base_words = array_merge($base_words, $new_words);
         }
         $num_words = count($base_words);
         $quote_state = !$quote_state;
     }
     //stemmed, if have stemmer
     $index_version = IndexManager::getVersion($index_name);
     $add_metas = $found_metas;
     if (count($base_words) > 0 && $index_version > 0) {
         $add_metas = array_diff($found_metas, $found_materialized_metas);
     }
     $words = array_merge($base_words, $add_metas);
     if (count($words) == 0 && count($disallow_phrases) > 0) {
         // a purely negative query still needs something to search for
         $words[] = "site:any";
     }
     if (QUERY_STATISTICS) {
         if (!isset($this->query_info['QUERY'])) {
             $this->query_info['QUERY'] = "";
         }
         $this->query_info['QUERY'] .= "{$in3}<i>Index</i>: " . $index_name . "<br />";
         $this->query_info['QUERY'] .= "{$in3}<i>LocaleTag</i>: " . $locale_tag . "<br />";
         $this->query_info['QUERY'] .= "{$in3}<i>Stemmed/Char-grammed Words</i>:<br />";
         foreach ($base_words as $base_word) {
             $this->query_info['QUERY'] .= "{$in4}{$base_word}<br />";
         }
         $this->query_info['QUERY'] .= "{$in3}<i>Meta Words</i>:<br />";
         foreach ($found_metas as $meta_word) {
             $this->query_info['QUERY'] .= "{$in4}{$meta_word}<br />";
         }
         $this->query_info['QUERY'] .= "{$in3}<i>Quoted Word Locs</i>:<br />";
         foreach ($quote_positions as $quote_position) {
             $this->query_info['QUERY'] .= "{$in4}(";
             $comma = "";
             foreach ($quote_position as $pos => $len) {
                 $this->query_info['QUERY'] .= "{$comma} {$pos} => {$len}";
                 $comma = ",";
             }
             $this->query_info['QUERY'] .= ")<br />";
         }
     }
     if (count($words) == 1 && count($disallow_phrases) < 1 && !strpos($words[0], " ")) {
         // single word, nothing disallowed: build the struct directly
         $phrase_string = $words[0];
         if ($index_version == 0) {
             $tmp_hash = allCrawlHashPaths($phrase_string);
             $tmp_hash = is_array($tmp_hash) ? $tmp_hash : array($tmp_hash);
             /* NOTE(review): $tmp_hash is nested one level deeper here
                than in the multi-word branch below, which merges it flat;
                left unchanged, confirm which shape the consumer expects. */
             $phrase_hash = array_merge(array($tmp_hash), array(crawlHash($phrase_string)));
         } else {
             if ($found_materialized_metas == array()) {
                 $phrase_hash = allCrawlHashPaths($phrase_string);
             } else {
                 $phrase_hash = allCrawlHashPaths($phrase_string, $found_materialized_metas, PhraseParser::$materialized_metas);
             }
         }
         $word_struct = array("KEYS" => array($phrase_hash), "QUOTE_POSITIONS" => NULL, "DISALLOW_KEYS" => array(), "WEIGHT" => $weight, "INDEX_NAME" => $index_name);
     } else {
         //get a raw list of words and their hashes
         $word_keys = array();
         $metas_accounted = false;
         $materialized_metas = array();
         foreach ($words as $word) {
             /* the materialized metas are attached starting from the first
                single-term word that is not itself a meta word */
             if (!$metas_accounted && substr_count($word, " ") == 0 && !in_array($word, $found_metas)) {
                 $metas_accounted = true;
                 $materialized_metas = $found_materialized_metas;
             }
             $tmp_hash = allCrawlHashPaths($word, $materialized_metas, PhraseParser::$materialized_metas);
             if ($index_version == 0) {
                 $tmp_hash = is_array($tmp_hash) ? $tmp_hash : array($tmp_hash);
                 $word_keys[] = array_merge($tmp_hash, array(crawlHash($word)));
             } else {
                 $word_keys[] = $tmp_hash;
             }
         }
         $word_struct = NULL;
         if (count($word_keys) == 0) {
             $word_keys = NULL;
         }
         $disallow_keys = array();
         $num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases));
         if ($num_disallow_keys > 0 && QUERY_STATISTICS) {
             $this->query_info['QUERY'] .= "{$in3}<i>Disallowed Words</i>:" . "<br />";
         }
         for ($i = 0; $i < $num_disallow_keys; $i++) {
             // check if disallowed is a meta word and stem or not stem
             if (mb_strstr($disallow_phrases[$i], ':') === false) {
                 $disallow_stem = PhraseParser::extractPhrases($disallow_phrases[$i], getLocaleTag());
                 //stemmed
                 if (!isset($disallow_stem[0])) {
                     // nothing survived stemming, so there is no key to exclude
                     continue;
                 }
             } else {
                 $disallow_stem = array($disallow_phrases[$i]);
             }
             if (QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .= "{$in4}{$disallow_stem[0]}" . "<br />";
             }
             $disallow_keys[] = crawlHashWord($disallow_stem[0]);
             if ($index_version == 0) {
                 /* hash the disallowed term itself; the previous code
                    hashed the stale $word left over from the loop above */
                 $disallow_keys[] = crawlHash($disallow_stem[0]);
             }
         }
         if ($word_keys !== NULL) {
             $word_struct = array("KEYS" => $word_keys, "QUOTE_POSITIONS" => $quote_positions, "DISALLOW_KEYS" => $disallow_keys, "WEIGHT" => $weight, "INDEX_NAME" => $index_name);
         }
     }
     /* Build the list of words used for formatting snippets: the raw
        query words plus the stemmed words, minus any word that occurs
        (case-insensitively) inside another word of the list. */
     $pre_format_words = array();
     foreach ($base_words as $base_word) {
         $pre_format_words = array_merge($pre_format_words, explode(" * ", $base_word));
     }
     $pre_format_words = array_values(array_unique(array_merge($query_words, $pre_format_words)));
     $format_words = array();
     $count = count($pre_format_words);
     for ($i = 0; $i < $count; $i++) {
         if ($pre_format_words[$i] == "") {
             continue;
         }
         $needle = mb_strtolower($pre_format_words[$i]);
         $keep = true;
         for ($j = 0; $j < $count; $j++) {
             if ($j == $i) {
                 continue;
             }
             $hay = mb_strtolower($pre_format_words[$j]);
             if ($hay === $needle && $j > $i) {
                 // case-insensitive duplicate: the earlier one wins
                 continue;
             }
             /* mb_strpos rather than a truthiness test on mb_strstr, whose
                returned tail can be the falsy string "0" */
             if (mb_strpos($hay, $needle) !== false) {
                 $keep = false;
                 break;
             }
         }
         if ($keep) {
             $format_words[] = $pre_format_words[$i];
         }
     }
     return array($word_struct, $format_words);
 }