예제 #1
0
 /**
  * Generates "did you mean" suggestions for a search phrase.
  *
  * The phrase is reduced to letters, digits, "-" and "_" (everything else becomes a
  * single space), broken into ngrams via caNgrams() and matched against
  * ca_did_you_mean_ngrams / ca_did_you_mean_phrases. If the whole phrase yields no
  * hits, the last word is dropped and the lookup is retried until a hit is found or
  * no words remain.
  *
  * @param string $ps_phrase The text entered by the user
  * @param int|array|null $pa_table_nums Table number, or list of table numbers, to restrict suggestions to. [Default is null; no restriction]
  * @param int $pn_max_suggestions Maximum number of suggestions to return. [Default is 1]
  * @param array|null $pa_options Options are:
  *		groupByTableNum = return phrases grouped by table number rather than a phrase => score map. [Default is no]
  * @return array phrase => score map, or table_num => list of phrases when groupByTableNum is set; empty array when nothing matches
  */
 public static function suggest($ps_phrase, $pa_table_nums = null, $pn_max_suggestions = 1, $pa_options = null)
 {
     $o_db = new Db();

     // Normalize the table restriction to a list of integers. Every element is cast
     // because the values are concatenated directly into the SQL below.
     if (!$pa_table_nums) {
         $pa_table_nums = array();
     } elseif (!is_array($pa_table_nums)) {
         $pa_table_nums = array($pa_table_nums);
     }
     $pa_table_nums = array_map('intval', $pa_table_nums);

     $vs_table_sql = '';
     if (sizeof($pa_table_nums)) {
         $vs_table_sql = " AND (p.table_num IN (" . join(', ', $pa_table_nums) . "))";
     }

     $vb_group_by_table = isset($pa_options['groupByTableNum']) && $pa_options['groupByTableNum'];

     // Trim so that leading/trailing punctuation does not produce empty "words",
     // which would skew the phrase length and waste shortening iterations.
     $vs_phrase = trim(preg_replace("![^A-Za-z\\-_0-9]+!", " ", $ps_phrase));
     if ($vs_phrase === '') {
         return array();
     }
     $va_words = preg_split("#[ ]+#", $vs_phrase);

     while (sizeof($va_words)) {
         $vn_len = strlen($vs_phrase);

         // ngram length is the phrase length less 8, but never below 3
         $vn_ngram_len = max(3, $vn_len - 8);

         $va_gen_ngrams = caNgrams($vs_phrase, $vn_ngram_len, false);
         $va_ngrams = array();
         if (is_array($va_gen_ngrams)) {
             foreach ($va_gen_ngrams as $vs_ngram) {
                 if ($vs_ngram) {
                     $va_ngrams[] = "'" . $o_db->escape($vs_ngram) . "'";
                 }
             }
         }

         if (sizeof($va_ngrams)) {
             // Score = number of shared ngrams, with endpoint ngrams counting extra;
             // ties are broken by word count and closeness in length to the input.
             $vs_sql = "\n\t\t\t\t\t\tSELECT p.table_num, p.phrase, (count(*) + (sum(n.endpoint) * 2)) score \n\t\t\t\t\t\tFROM ca_did_you_mean_ngrams n \n\t\t\t\t\t\tINNER JOIN ca_did_you_mean_phrases AS p ON p.phrase_id = n.phrase_id WHERE \n\t\t\t\t\t\t\tn.ngram IN (" . join(',', $va_ngrams) . ") "
                 . $vs_table_sql
                 . "\n\t\t\t\t\t\tGROUP BY p.phrase_id \n\t\t\t\t\t\tORDER BY score DESC, p.num_words DESC, ABS(length(p.phrase) - " . $vn_len . ") ASC\n\t\t\t\t\t\tLIMIT " . intval($pn_max_suggestions);
             $qr_res = $o_db->query($vs_sql);

             if ($qr_res->numRows()) {
                 $va_suggestions = array();
                 while ($qr_res->nextRow()) {
                     if ($vb_group_by_table) {
                         $va_suggestions[$qr_res->get('table_num')][] = $qr_res->get('phrase');
                     } else {
                         $va_suggestions[$qr_res->get('phrase')] = $qr_res->get('score');
                     }
                 }
                 return $va_suggestions;
             }
         }

         // Nothing found: drop the last word and retry with the shorter phrase
         array_pop($va_words);
         $vs_phrase = join(' ', $va_words);
     }
     return array();
 }
예제 #2
0
 /**
  * Returns the word_id for a word, inserting the word (and its ngrams) when it is not yet known.
  *
  * The word is lower-cased and trimmed before lookup. Resolved ids are kept in the
  * static WLPlugSearchEngineSqlSearch::$s_word_cache so repeated lookups skip the database.
  *
  * @param string $ps_word The word to look up
  * @return int|null The word_id, or null if the word is empty or could not be inserted
  */
 public function getWordID($ps_word)
 {
     $ps_word = (string) trim(mb_strtolower($ps_word, "UTF-8"));
     if (!strlen($ps_word)) {
         return null;
     }

     // Cache hit? isset() avoids an undefined-index notice on the (common) miss.
     if (isset(WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word]) && (int) WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word]) {
         return (int) WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word];
     }

     // Already in the database?
     if ($qr_res = $this->opqr_lookup_word->execute($ps_word)) {
         if ($qr_res->nextRow()) {
             return WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word] = (int) $qr_res->get('word_id', array('binary' => true));
         }
     }

     // insert word; fall back to the word itself when the stemmer returns nothing
     if (!($vs_stem = trim($this->opo_stemmer->stem($ps_word)))) {
         $vs_stem = $ps_word;
     }
     $this->opqr_insert_word->execute($ps_word, $vs_stem);
     if ($this->opqr_insert_word->numErrors()) {
         return null;
     }
     if (!($vn_word_id = (int) $this->opqr_insert_word->getLastInsertID())) {
         return null;
     }

     // create ngrams, written with a single multi-row INSERT
     $va_ngrams = caNgrams($ps_word, 4);
     $va_ngram_buf = array();
     if (is_array($va_ngrams)) {
         $vn_seq = 0;
         foreach ($va_ngrams as $vs_ngram) {
             // ngrams come from indexed text and may contain quotes or backslashes,
             // so they must be escaped before being placed in the SQL string
             $va_ngram_buf[] = "({$vn_word_id},'" . $this->opo_db->escape($vs_ngram) . "',{$vn_seq})";
             $vn_seq++;
         }
     }
     if (sizeof($va_ngram_buf)) {
         $vs_sql = $this->ops_insert_ngram_sql . "\n" . join(",", $va_ngram_buf);
         $this->opo_db->query($vs_sql);
     }

     return WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word] = (int) $vn_word_id;
 }
예제 #3
0
 /**
  * Return list of suggested searches that will find something, based upon the specified search expression
  *
  * Each token of the expression is matched against indexed words by shared ngrams;
  * candidates are then ranked by exact match, metaphone match, edit distance,
  * containment of the token and length of the shared prefix (lower score = better).
  * For a single token the three best words are returned. For several tokens, temporary
  * tables are chained to find places in the word index where candidate words for
  * consecutive tokens appear next to each other, and those phrases are returned.
  *
  * @param string $ps_text The search expression
  * @param array $pa_options Options are:
  *		returnAsLink = return suggestions as links to full-text searches. [Default is no]
  *		request = the current request; required if links are to be generated using returnAsLink. [Default is null]
  *		table = the name or number of the table to restrict searches to. If you pass, for example, "ca_objects" search expressions specifically for object searches will be returned. [Default is null]
  * @return array List of suggested searches
  */
 public function suggest($ps_text, $pa_options = null)
 {
     $o_dm = Datamodel::load();
     $va_tokens = $this->_tokenize($ps_text);
     $pm_table = caGetOption('table', $pa_options, null);
     $vn_table_num = $pm_table ? $o_dm->getTableNum($pm_table) : null;

     // Defined up front: it is needed by the phrase-matching queries further down,
     // independently of how the token loop below exits.
     $vs_table_sql = $vn_table_num ? 'AND swi.table_num = ?' : '';

     $va_word_ids = array();
     foreach ($va_tokens as $vn_i => $vs_token) {
         // don't try to match if there are numbers
         if (preg_match("![\\d]+!", $vs_token)) {
             continue;
         }

         // set ngram length based upon length of word
         // shorter words require shorter ngrams to detect similarity
         $vn_token_len = strlen($vs_token);
         if ($vn_token_len <= 8) {
             $vn_ngram_len = 2;
         } elseif ($vn_token_len <= 11) {
             $vn_ngram_len = 3;
         } else {
             $vn_ngram_len = 4;
         }

         $va_ngrams = caNgrams($vs_token, $vn_ngram_len);
         if (!is_array($va_ngrams) || !sizeof($va_ngrams)) {
             continue;
         }
         $vn_num_ngrams = sizeof($va_ngrams);

         // Look for items with the most shared ngrams
         // (this query is not restricted by table; the restriction is applied to the word index later)
         $qr_res = $this->opo_db->query("\n\t\t\t\tSELECT ng.word_id, sw.word, count(*) sc\n\t\t\t\tFROM ca_sql_search_ngrams ng\n\t\t\t\tINNER JOIN ca_sql_search_words AS sw ON sw.word_id = ng.word_id\n\t\t\t\tWHERE\n\t\t\t\t\tng.ngram IN (?)\n\t\t\t\tGROUP BY ng.word_id, sw.word\n\t\t\t\tORDER BY (length(sw.word) - (count(*) * {$vn_ngram_len})), (" . $vn_ngram_len * $vn_num_ngrams . ") - ((count(*) * {$vn_ngram_len}))\n\t\t\t\tLIMIT 250\n\t\t\t", array($va_ngrams));

         $va_word_ids[$vn_i] = array();

         // Check ngram results using various techniques to find most relevant hits
         $vs_token_metaphone = metaphone($vs_token);
         while ($qr_res->nextRow()) {
             $vs_word = $qr_res->get('word');

             // skip anything that is not entirely letters and space
             if (preg_match("![^A-Za-z ]+!", $vs_word)) {
                 continue;
             }
             $vn_word_id = $qr_res->get('word_id');

             // Is it an exact match?
             if ($vs_word == $vs_token) {
                 $va_word_ids[$vn_i][$vn_word_id] = -250;
                 continue;
             }

             // Does it sound like the word we're looking for (in English at least)
             if (metaphone($vs_word) == $vs_token_metaphone) {
                 $va_word_ids[$vn_i][$vn_word_id] = -150;
                 continue;
             }

             // Is it close to what we're looking for distance-wise?
             // The distance is always computed so that the score starts from a defined
             // value for this word rather than whatever the previous row left behind.
             $vn_score = levenshtein($vs_word, $vs_token);
             if (strpos($vs_word, $vs_token) !== false) {
                 // word contains the token outright: strong boost regardless of distance
                 $vn_score -= 150;
             } elseif ($vn_score > 3) {
                 continue;
             }

             // reward every leading character the word shares with the token
             for ($i = 1; $i <= mb_strlen($vs_word); $i++) {
                 if (mb_substr($vs_word, 0, $i) === mb_substr($vs_token, 0, $i)) {
                     $vn_score -= 25;
                 } else {
                     break;
                 }
             }
             $va_word_ids[$vn_i][$vn_word_id] = $vn_score;
         }
     }

     if (!sizeof($va_word_ids)) {
         return array();
     }

     if (sizeof($va_word_ids) > 1) {
         // Look for phrases that use any sequence of matched words in proper order
         //
         // Each token gets a temp table holding (index_id + 1) of every index row using one
         // of its candidate words; the next token's rows must join on that value, i.e. sit
         // directly after the previous word in the index.
         $va_temp_tables = array();
         foreach ($va_word_ids as $vn_i => $va_word_list) {
             if (!sizeof($va_word_list)) {
                 continue;
             }
             // keep the 30 best-scoring (lowest score) candidates
             asort($va_word_list, SORT_NUMERIC);
             $va_word_list = array_keys(array_slice($va_word_list, 0, 30, true));

             $vs_temp_table = 'ca_sql_search_suggest_' . md5("/" . $vn_i . "/" . print_r($va_word_list, true));
             $this->_createTempTable($vs_temp_table);

             $vs_sql = "\n\t\t\t\t\tINSERT INTO {$vs_temp_table}\n\t\t\t\t\tSELECT swi.index_id + 1, 1\n\t\t\t\t\tFROM ca_sql_search_word_index swi\n\t\t\t\t\t" . (sizeof($va_temp_tables) ? " INNER JOIN " . $va_temp_tables[sizeof($va_temp_tables) - 1] . " AS tt ON swi.index_id = tt.row_id" : "") . "\n\t\t\t\t\tWHERE \n\t\t\t\t\t\tswi.word_id IN (?) {$vs_table_sql}\n\t\t\t\t\t\t" . ($this->getOption('omitPrivateIndexing') ? " AND swi.access = 0" : '') . "\n\t\t\t\t";
             $va_params = array($va_word_list);
             if ($vn_table_num) {
                 $va_params[] = $vn_table_num;
             }
             $this->opo_db->query($vs_sql, $va_params);
             $va_temp_tables[] = $vs_temp_table;
         }
         if (!sizeof($va_temp_tables)) {
             return array();
         }

         // Get most relevant phrases from index
         //
         // The last temp table holds the end positions of complete chains.
         $vs_results_table = array_pop($va_temp_tables);
         $qr_result = $this->opo_db->query("SELECT * FROM {$vs_results_table} LIMIT 50");
         $va_phrases = array();
         while ($qr_result->nextRow()) {
             // walk backwards from the last word of the chain, one index row per token
             // NOTE(review): the count uses all tokens, including any that were skipped
             // above; confirm whether it should be the number of chained tables instead.
             $va_indices = array();
             $vn_index_id = $qr_result->get('row_id') - 1;
             for ($i = 0; $i < sizeof($va_tokens); $i++) {
                 $va_indices[] = $vn_index_id;
                 $vn_index_id--;
             }

             // ORDER BY makes the word order within the phrase deterministic; without
             // it the database may return the rows in any order.
             $qr_phrases = $this->opo_db->query("\n\t\t\t\t\tSELECT sw.word, swi.index_id \n\t\t\t\t\tFROM ca_sql_search_words sw\n\t\t\t\t\tINNER JOIN ca_sql_search_word_index AS swi ON sw.word_id = swi.word_id\n\t\t\t\t\tWHERE\n\t\t\t\t\t\t(swi.index_id IN (?))\n\t\t\t\t\tORDER BY swi.index_id\n\t\t\t\t", array($va_indices));
             $va_acc = array();
             while ($qr_phrases->nextRow()) {
                 $va_acc[] = $qr_phrases->get('word');
             }
             $va_phrases[] = join(" ", $va_acc);
         }

         foreach ($va_temp_tables as $vs_temp_table) {
             $this->_dropTempTable($vs_temp_table);
         }
         $this->_dropTempTable($vs_results_table);

         $va_phrases = array_unique($va_phrases);
     } else {
         // handle single word
         // The only entry is not necessarily at key 0: earlier tokens may have been skipped.
         $va_word_list = reset($va_word_ids);
         if (!is_array($va_word_list) || !sizeof($va_word_list)) {
             return array();
         }
         asort($va_word_list, SORT_NUMERIC);
         $va_word_list = array_slice($va_word_list, 0, 3, true);

         $qr_phrases = $this->opo_db->query("\n\t\t\t\tSELECT sw.word\n\t\t\t\tFROM ca_sql_search_words sw\n\t\t\t\tWHERE\n\t\t\t\t\t(sw.word_id IN (?))\n\t\t\t", array(array_keys($va_word_list)));
         $va_phrases = array();
         while ($qr_phrases->nextRow()) {
             $va_phrases[] = $qr_phrases->get('word');
         }
     }

     if (caGetOption('returnAsLink', $pa_options, false) && ($po_request = caGetOption('request', $pa_options, null))) {
         foreach ($va_phrases as $vn_i => $vs_phrase) {
             $va_phrases[$vn_i] = caNavLink($po_request, $vs_phrase, '', '*', '*', 'Index', array('search' => $vs_phrase));
         }
     }
     return $va_phrases;
 }
예제 #4
0
 /**
  * Generates ngrams for every word in ca_sql_search_words and stores them in ca_sql_search_ngrams.
  *
  * Progress is printed via CLIProgressBar and a summary message is added via CLIUtils.
  *
  * @param object|null $po_opts Options object exposing getOption(). Options read are:
  *		clear = truncate ca_sql_search_ngrams first; when not set, words that already have ngrams are skipped. [Default is no]
  *		sizes = comma-separated list of ngram sizes to generate. [Default is 2,3,4]
  * @return bool Always true
  */
 public static function create_ngrams($po_opts = null)
 {
     require_once __CA_LIB_DIR__ . "/core/Db.php";
     $o_db = new Db();

     // $po_opts is optional; fall back to defaults rather than failing on null
     $pb_clear = $po_opts ? (bool) $po_opts->getOption('clear') : false;
     $vs_sizes = $po_opts ? (string) $po_opts->getOption('sizes') : '';

     // keep only positive integer sizes
     $pa_sizes = array();
     foreach (explode(",", $vs_sizes) as $vs_size) {
         $vn_size = (int) $vs_size;
         if ($vn_size > 0) {
             $pa_sizes[] = $vn_size;
         }
     }
     if (!sizeof($pa_sizes)) {
         $pa_sizes = array(2, 3, 4);
     }

     $vs_insert_ngram_sql = "\n\t\t\t\tINSERT  INTO ca_sql_search_ngrams\n\t\t\t\t(word_id, ngram, seq)\n\t\t\t\tVALUES\n\t\t\t";

     if ($pb_clear) {
         $o_db->query("TRUNCATE TABLE ca_sql_search_ngrams");
     }

     //create ngrams
     $qr_res = $o_db->query("SELECT word_id, word FROM ca_sql_search_words");
     print CLIProgressBar::start($qr_res->numRows(), _t('Starting...'));

     $vn_c = 0;
     $vn_ngram_c = 0;
     while ($qr_res->nextRow()) {
         $vn_word_id = (int) $qr_res->get('word_id');
         $vs_word = (string) $qr_res->get('word');

         // advance once per word: the bar was started with the row count as its total
         print CLIProgressBar::next(1, _t('Processing %1', $vs_word));

         if (!$pb_clear) {
             // table was not emptied, so skip words that already have ngrams
             $qr_chk = $o_db->query("SELECT word_id FROM ca_sql_search_ngrams WHERE word_id = ?", array($vn_word_id));
             if ($qr_chk->nextRow()) {
                 continue;
             }
         }

         // seq runs continuously across all sizes for a word
         $vn_seq = 0;
         foreach ($pa_sizes as $vn_size) {
             $va_ngrams = caNgrams($vs_word, $vn_size);
             if (!is_array($va_ngrams)) {
                 continue;
             }
             $va_ngram_buf = array();
             foreach ($va_ngrams as $vs_ngram) {
                 // escape: words may contain quotes or backslashes
                 $va_ngram_buf[] = "({$vn_word_id},'" . $o_db->escape($vs_ngram) . "',{$vn_seq})";
                 $vn_seq++;
                 $vn_ngram_c++;
             }
             if (sizeof($va_ngram_buf)) {
                 // one multi-row INSERT per word and size
                 $o_db->query($vs_insert_ngram_sql . "\n" . join(",", $va_ngram_buf));
             }
         }
         $vn_c++;
     }
     print CLIProgressBar::finish();
     CLIUtils::addMessage(_t('Processed %1 words and created %2 ngrams', $vn_c, $vn_ngram_c));
     return true;
 }