コード例 #1
0
ファイル: phrase_model.php プロジェクト: yakar/yioop
 /**
  * Parses from a string phrase representing a conjunctive query, a struct
  * consisting of the word keys searched for, the allowed and disallowed
  * phrases, the weight that should be put on these query results, and
  * which archive to use.
  *
  * @param string& $phrase string to extract struct from. A leading space is
  *     always prepended; if the phrase semantics is guessed or an if
  *     condition is processed the value of phrase will be altered further.
  *     (Helps for feeding to network queries)
  * @param bool $guess_semantics whether to do query rewriting before parse
  * @return array pair array($word_struct, $format_words). $word_struct is
  *     NULL when no word keys could be computed, otherwise an associative
  *     array with fields KEYS, QUOTE_POSITIONS, DISALLOW_KEYS, WEIGHT and
  *     INDEX_NAME. $format_words is the list of query/base words, with any
  *     word that is a case-insensitive substring of another word removed,
  *     to be used when bolding snippets.
  */
 function parseWordStructConjunctiveQuery(&$phrase, $guess_semantics = true)
 {
     // keep the untouched query around; it is restored below when
     // program_indicator is set
     $query = $phrase;
     // indentation strings, only used to lay out the QUERY_STATISTICS output
     $indent = "  ";
     $in2 = $indent . $indent;
     $in3 = $in2 . $indent;
     $in4 = $in2 . $in2;
     $phrase = " " . $phrase;
     if ($guess_semantics) {
         $phrase = $this->guessSemantics($phrase);
     }
     $phrase = $this->parseIfConditions($phrase);
     list($found_metas, $found_materialized_metas, $disallow_phrases,
         $phrase_string, $query_string, $index_name, $weight) =
         $this->extractMetaWordInfo($phrase);
     /*
        we search using the stemmed/char-grammed words, but we format
        snippets in the results by bolding either
     */
     $query_words = explode(" ", $query_string);
     //not stemmed
     if ($this->program_indicator) {
         // NOTE(review): program_indicator is set outside this method
         // (presumably during query rewriting) -- confirm
         $query_string = $query;
         $this->program_indicator = false;
     }
     $locale_tag = guessLocaleFromString($query_string);
     /*
        Splitting on '"' yields parts that alternate between text outside
        quotes and text inside quotes; $quote_state tracks which kind the
        current part is and is flipped after every part (empty or not).
      */
     $quote_state = false;
     $phrase_parts = explode('"', $phrase_string);
     $base_words = array();
     $num_words = 0;
     $quote_positions = array();
     foreach ($phrase_parts as $phrase_part) {
         if (trim($phrase_part) == "") {
             $quote_state = !$quote_state;
             continue;
         }
         /*still use original phrase string here to handle
           acronyms abbreviations and the like that use periods */
         if ($quote_state) {
             // inside quotes: '*' separates sub-phrases; record for each
             // extracted term its index in $base_words and its word length
             $sub_parts = explode('*', $phrase_part);
             $first_part = true;
             $quote_position = array();
             foreach ($sub_parts as $sub_part) {
                 if (!$first_part) {
                     $quote_position["*{$num_words}"] = "*";
                 }
                 $new_words = PhraseParser::extractPhrases($sub_part,
                     $locale_tag, $index_name, true);
                 $base_words = array_merge($base_words, $new_words);
                 foreach ($new_words as $new_word) {
                     $len = substr_count($new_word, " ") + 1;
                     $quote_position[$num_words] = $len;
                     $num_words++;
                 }
                 $first_part = false;
             }
             $quote_positions[] = $quote_position;
         } else {
             $new_words = PhraseParser::extractPhrases($phrase_part,
                 $locale_tag, $index_name);
             // guard against an empty extraction before looking at [0]
             if (isset($new_words[0]) && strpos($new_words[0], " ") > 0
                 && $found_materialized_metas == array()) {
                 array_pop($new_words);
             }
             $base_words = array_merge($base_words, $new_words);
         }
         $num_words = count($base_words);
         $quote_state = !$quote_state;
     }
     //stemmed, if have stemmer
     $index_version = IndexManager::getVersion($index_name);
     $add_metas = $found_metas;
     if (count($base_words) > 0 && $index_version > 0) {
         // materialized metas are passed to allCrawlHashPaths below rather
         // than being searched for as separate words
         $add_metas = array_diff($found_metas, $found_materialized_metas);
     }
     $words = array_merge($base_words, $add_metas);
     if (count($words) == 0 && count($disallow_phrases) > 0) {
         // a query made only of disallowed phrases still needs one term
         $words[] = "site:any";
     }
     if (QUERY_STATISTICS) {
         if (!isset($this->query_info['QUERY'])) {
             $this->query_info['QUERY'] = "";
         }
         $this->query_info['QUERY'] .= "{$in3}<i>Index</i>: " . $index_name . "<br />";
         $this->query_info['QUERY'] .= "{$in3}<i>LocaleTag</i>: " . $locale_tag . "<br />";
         $this->query_info['QUERY'] .= "{$in3}<i>Stemmed/Char-grammed Words</i>:<br />";
         foreach ($base_words as $word) {
             $this->query_info['QUERY'] .= "{$in4}{$word}<br />";
         }
         $this->query_info['QUERY'] .= "{$in3}<i>Meta Words</i>:<br />";
         foreach ($found_metas as $word) {
             $this->query_info['QUERY'] .= "{$in4}{$word}<br />";
         }
         $this->query_info['QUERY'] .= "{$in3}<i>Quoted Word Locs</i>:<br />";
         foreach ($quote_positions as $quote_position) {
             $this->query_info['QUERY'] .= "{$in4}(";
             $comma = "";
             foreach ($quote_position as $pos => $len) {
                 $this->query_info['QUERY'] .= "{$comma} {$pos} => {$len}";
                 $comma = ",";
             }
             $this->query_info['QUERY'] .= ")<br />";
         }
     }
     // NOTE(review): !strpos() also treats a space at offset 0 as "no
     // space"; left unchanged, confirm extracted words never start with one
     if (isset($words) && count($words) == 1 && count($disallow_phrases) < 1
         && !strpos($words[0], " ")) {
         // single term, nothing disallowed: build the struct directly
         $phrase_string = $words[0];
         if ($index_version == 0) {
             $tmp_hash = allCrawlHashPaths($phrase_string);
             $tmp_hash = is_array($tmp_hash) ? $tmp_hash : array($tmp_hash);
             // NOTE(review): $tmp_hash is nested one level deeper here than
             // in the multi-word branch below -- confirm this is intended
             $phrase_hash = array_merge(array($tmp_hash),
                 array(crawlHash($phrase_string)));
         } else {
             if ($found_materialized_metas == array()) {
                 $phrase_hash = allCrawlHashPaths($phrase_string);
             } else {
                 $phrase_hash = allCrawlHashPaths($phrase_string,
                     $found_materialized_metas,
                     PhraseParser::$materialized_metas);
             }
         }
         $word_struct = array("KEYS" => array($phrase_hash),
             "QUOTE_POSITIONS" => NULL, "DISALLOW_KEYS" => array(),
             "WEIGHT" => $weight, "INDEX_NAME" => $index_name);
     } else {
         //get a raw list of words and their hashes
         $word_keys = array();
         $metas_accounted = false;
         $materialized_metas = array();
         foreach ($words as $word) {
             // from the first single-word, non-meta term onwards the
             // materialized metas are supplied to allCrawlHashPaths
             if (!$metas_accounted && substr_count($word, " ") == 0
                 && !in_array($word, $found_metas)) {
                 $metas_accounted = true;
                 $materialized_metas = $found_materialized_metas;
             }
             $tmp_hash = allCrawlHashPaths($word, $materialized_metas,
                 PhraseParser::$materialized_metas);
             if ($index_version == 0) {
                 // version 0 indexes additionally get the crawlHash key
                 $tmp_hash = is_array($tmp_hash) ? $tmp_hash : array($tmp_hash);
                 $word_keys[] = array_merge($tmp_hash, array(crawlHash($word)));
             } else {
                 $word_keys[] = $tmp_hash;
             }
         }
         $word_struct = NULL;
         $disallow_keys = array();
         $num_disallow_keys = min(MAX_QUERY_TERMS, count($disallow_phrases));
         if ($num_disallow_keys > 0 && QUERY_STATISTICS) {
             $this->query_info['QUERY'] .= "{$in3}<i>Disallowed Words</i>:" . "<br />";
         }
         for ($i = 0; $i < $num_disallow_keys; $i++) {
             // check if disallowed is a meta word and stem or not stem
             if (mb_strstr($disallow_phrases[$i], ':') === false) {
                 $disallow_stem = PhraseParser::extractPhrases(
                     $disallow_phrases[$i], getLocaleTag());
                 //stemmed
             } else {
                 $disallow_stem = array($disallow_phrases[$i]);
             }
             if (!isset($disallow_stem[0])) {
                 // phrase produced no terms, so there is nothing to exclude
                 continue;
             }
             if (QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .= "{$in4}{$disallow_stem[0]}" . "<br />";
             }
             $disallow_keys[] = crawlHashWord($disallow_stem[0]);
             if ($index_version == 0) {
                 // hash the disallowed term itself; previously this hashed
                 // $word, a stale variable left over from the loop above
                 $disallow_keys[] = crawlHash($disallow_stem[0]);
             }
         }
         if (count($word_keys) > 0) {
             $word_struct = array("KEYS" => $word_keys,
                 "QUOTE_POSITIONS" => $quote_positions,
                 "DISALLOW_KEYS" => $disallow_keys, "WEIGHT" => $weight,
                 "INDEX_NAME" => $index_name);
         }
     }
     // collect candidate words for snippet formatting
     $pre_format_words = array();
     foreach ($base_words as $base_word) {
         $pre_format_words = array_merge($pre_format_words,
             explode(" * ", $base_word));
     }
     $pre_format_words = array_values(array_unique(
         array_merge($query_words, $pre_format_words)));
     // keep only words that are not contained (case-insensitively) in some
     // other candidate; of case-variant duplicates only the first is kept
     $format_words = array();
     $count = count($pre_format_words);
     for ($i = 0; $i < $count; $i++) {
         if ($pre_format_words[$i] == "") {
             continue;
         }
         $needle = mb_strtolower($pre_format_words[$i]);
         $flag = true;
         for ($j = 0; $j < $count; $j++) {
             if ($j == $i) {
                 continue;
             }
             $hay = mb_strtolower($pre_format_words[$j]);
             if ($hay == $needle && $j > $i) {
                 continue;
             }
             // compare with false: a match whose tail is "0" is falsy
             if (mb_strstr($hay, $needle) !== false) {
                 $flag = false;
                 break;
             }
         }
         if ($flag) {
             $format_words[] = $pre_format_words[$i];
         }
     }
     return array($word_struct, $format_words);
 }
コード例 #2
0
ファイル: index_manager.php プロジェクト: yakar/yioop
 /**
  * Returns the number of document that a given term or phrase appears in
  * in the given index
  *
  * @param string $term_or_phrase what to look up in the indexes dictionary
  *     no  mask is used for this look up
  * @param string $index_name index to look up term or phrase in
  * @param int $threshold if set and positive then once more than threshold
  *     many documents are found the search for more documents to add to the
  *     total is stopped
  * @return mixed int number of documents, or false if the index (or its
  *     dictionary) is not available
  */
 static function numDocsTerm($term_or_phrase, $index_name, $threshold = -1)
 {
     $index = IndexManager::getIndex($index_name);
     if (!$index || !$index->dictionary) {
         return false;
     }
     $total_num_docs = 0;
     $hashes = allCrawlHashPaths($term_or_phrase, array(), array(), true);
     if (!is_array($hashes)) {
         $hashes = array($hashes);
     }
     foreach ($hashes as $hash) {
         if (is_array($hash)) {
             // array hashes carry three components which are forwarded as
             // separate arguments together with the threshold
             $dictionary_info = IndexManager::getWordInfo($index_name,
                 $hash[0], $hash[1], $hash[2], $threshold);
         } else {
             $dictionary_info = IndexManager::getWordInfo($index_name, $hash);
         }
         if (!is_array($dictionary_info)) {
             // failed look up: contributes no documents
             continue;
         }
         // entries are indexed consecutively, optionally starting at -1
         $num_generations = count($dictionary_info);
         $start = isset($dictionary_info[-1]) ? -1 : 0;
         $end = $start == -1 ? $num_generations - 1 : $num_generations;
         for ($i = $start; $i < $end; $i++) {
             if (!isset($dictionary_info[$i][3])) {
                 continue;
             }
             // the fourth component of each entry is its document count
             $total_num_docs += $dictionary_info[$i][3];
             if ($threshold > 0 && $total_num_docs > $threshold) {
                 return $total_num_docs;
             }
         }
     }
     return $total_num_docs;
 }