/**
 * Generates suggestions for searches based upon input.
 *
 * The phrase is reduced to letters, digits, hyphens and underscores, converted to ngrams and
 * matched against ca_did_you_mean_ngrams/ca_did_you_mean_phrases. If the phrase yields no rows,
 * its last word is dropped and the shorter phrase is tried, until no words remain.
 *
 * @param string $ps_phrase The text to generate suggestions for
 * @param int|array $pa_table_nums Optional table number, or list of table numbers, to restrict suggestions to. All values are cast to integers. [Default is null]
 * @param int $pn_max_suggestions Maximum number of suggestions to return. [Default is 1]
 * @param array $pa_options Options are:
 *        groupByTableNum = return lists of phrases keyed by table_num rather than phrase => score pairs. [Default is no]
 * @return array Suggestions as phrase => score (or table_num => list of phrases when groupByTableNum is set); empty array if nothing matched
 */
public static function suggest($ps_phrase, $pa_table_nums = null, $pn_max_suggestions = 1, $pa_options = null) {
    $o_db = new Db();

    // Normalize the table restriction to a list of integers. The values are interpolated
    // directly into the SQL below, so nothing but integers may be allowed through.
    if (!$pa_table_nums) {
        $pa_table_nums = array();
    } elseif (!is_array($pa_table_nums)) {
        $pa_table_nums = array($pa_table_nums);
    }
    $pa_table_nums = array_map('intval', $pa_table_nums);

    $va_sql = array();
    if (sizeof($pa_table_nums)) {
        $va_sql[] = "(p.table_num IN (" . join(', ', $pa_table_nums) . "))";
    }
    $vs_restriction_sql = sizeof($va_sql) ? ' AND ' . join(' AND ', $va_sql) : '';

    $vb_group_by_table_num = (isset($pa_options['groupByTableNum']) && $pa_options['groupByTableNum']);

    $vs_phrase = preg_replace("![^A-Za-z\\-_0-9]+!", " ", $ps_phrase);
    $va_words = preg_split("#[ ]+#", $vs_phrase);

    while (sizeof($va_words)) {
        // ngram length scales with the phrase length, but is never shorter than 3
        $vn_len = strlen($vs_phrase);
        $vn_ngram_len = $vn_len - 8;
        if ($vn_ngram_len < 3) { $vn_ngram_len = 3; }

        $va_ngrams = array();
        foreach (caNgrams($vs_phrase, $vn_ngram_len, false) as $vs_ngram) {
            if ($vs_ngram) {
                $va_ngrams[] = "'" . $o_db->escape($vs_ngram) . "'";
            }
        }

        if (sizeof($va_ngrams)) {
            $qr_res = $o_db->query(
                "\n\t\t\t\t\t\tSELECT p.table_num, p.phrase, (count(*) + (sum(n.endpoint) * 2)) score "
                . "\n\t\t\t\t\t\tFROM ca_did_you_mean_ngrams n "
                . "\n\t\t\t\t\t\tINNER JOIN ca_did_you_mean_phrases AS p ON p.phrase_id = n.phrase_id WHERE "
                . "\n\t\t\t\t\t\t\tn.ngram IN (" . join(',', $va_ngrams) . ") " . $vs_restriction_sql
                . "\n\t\t\t\t\t\tGROUP BY p.phrase_id "
                . "\n\t\t\t\t\t\tORDER BY score DESC, p.num_words DESC, ABS(length(p.phrase) - " . $vn_len . ") ASC"
                . "\n\t\t\t\t\t\tLIMIT " . intval($pn_max_suggestions)
            );

            if ($qr_res->numRows()) {
                $va_suggestions = array();
                while ($qr_res->nextRow()) {
                    if ($vb_group_by_table_num) {
                        $va_suggestions[$qr_res->get('table_num')][] = $qr_res->get('phrase');
                    } else {
                        $va_suggestions[$qr_res->get('phrase')] = $qr_res->get('score');
                    }
                }
                return $va_suggestions;
            }
        }

        // Nothing found: drop the last word and retry with the shorter phrase
        array_pop($va_words);
        $vs_phrase = join(' ', $va_words);
    }

    return array();
}
/**
 * Returns the word_id for a word, creating the word record and its ngrams if the word is not yet known.
 *
 * The word is lower-cased and trimmed before lookup. Resolved ids are kept in the static
 * WLPlugSearchEngineSqlSearch::$s_word_cache so repeated lookups skip the database.
 *
 * @param string $ps_word The word to look up
 * @return int|null The word_id, or null if the word is empty or could not be inserted
 */
public function getWordID($ps_word) {
    if (!strlen($ps_word = trim(mb_strtolower($ps_word, "UTF-8")))) { return null; }
    $ps_word = (string)$ps_word;

    // isset() guard avoids an undefined index notice on every cache miss
    if (isset(WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word]) && ((int)WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word])) {
        return (int)WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word];
    }

    // Already in the database?
    if ($qr_res = $this->opqr_lookup_word->execute($ps_word)) {
        if ($qr_res->nextRow()) {
            return WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word] = (int)$qr_res->get('word_id', array('binary' => true));
        }
    }

    // Insert word; fall back to the word itself when the stemmer returns nothing
    if (!($vs_stem = trim($this->opo_stemmer->stem($ps_word)))) { $vs_stem = $ps_word; }

    $this->opqr_insert_word->execute($ps_word, $vs_stem);
    if ($this->opqr_insert_word->numErrors()) { return null; }
    if (!($vn_word_id = (int)$this->opqr_insert_word->getLastInsertID())) { return null; }

    // Create ngrams, inserted with a single multi-row statement
    $va_ngram_buf = array();
    $vn_seq = 0;
    foreach (caNgrams($ps_word, 4) as $vs_ngram) {
        // ngrams are interpolated into the SQL text, so they must be escaped
        $va_ngram_buf[] = "({$vn_word_id},'" . $this->opo_db->escape($vs_ngram) . "',{$vn_seq})";
        $vn_seq++;
    }

    if (sizeof($va_ngram_buf)) {
        $this->opo_db->query($this->ops_insert_ngram_sql . "\n" . join(",", $va_ngram_buf));
    }

    return WLPlugSearchEngineSqlSearch::$s_word_cache[$ps_word] = (int)$vn_word_id;
}
/**
 * Return list of suggested searches that will find something, based upon the specified search expression
 *
 * Each token of the expression is matched against indexed words by shared ngrams; candidates are then
 * scored (lower is better) by exact match, metaphone match, Levenshtein distance, containment of the
 * token and length of the shared prefix. When more than one token produced candidates, the word index
 * is searched for sequences of candidate words in consecutive index positions; otherwise the best
 * candidate words for the single token are returned.
 *
 * @param string $ps_text The search expression
 * @param array $pa_options Options are:
 *        returnAsLink = return suggestions as links to full-text searches. [Default is no]
 *        request = the current request; required if links are to be generated using returnAsLink. [Default is null]
 *        table = the name or number of the table to restrict searches to. If you pass, for example, "ca_objects" search expressions specifically for object searches will be returned. [Default is null]
 * @return array List of suggested searches
 */
public function suggest($ps_text, $pa_options = null) {
    $o_dm = Datamodel::load();

    $va_tokens = $this->_tokenize($ps_text);

    $pm_table = caGetOption('table', $pa_options, null);
    $vn_table_num = $pm_table ? $o_dm->getTableNum($pm_table) : null;

    // The table restriction does not depend on the token, so build it once up front
    $vs_table_sql = $vn_table_num ? 'AND swi.table_num = ?' : '';

    $va_word_ids = array();
    foreach ($va_tokens as $vn_i => $vs_token) {
        if (preg_match("![\\d]+!", $vs_token)) { continue; }    // don't try to match if there are numbers

        // set ngram length based upon length of word
        // shorter words require shorter ngrams to detect similarity
        $vn_token_len = strlen($vs_token);
        if ($vn_token_len <= 8) {
            $vn_ngram_len = 2;
        } elseif ($vn_token_len <= 11) {
            $vn_ngram_len = 3;
        } else {
            $vn_ngram_len = 4;
        }

        $va_ngrams = caNgrams($vs_token, $vn_ngram_len);
        if (!is_array($va_ngrams) || !sizeof($va_ngrams)) { continue; }
        $vn_num_ngrams = sizeof($va_ngrams);

        // Look for items with the most shared ngrams
        $qr_res = $this->opo_db->query(
            "\n\t\t\t\tSELECT ng.word_id, sw.word, count(*) sc"
            . "\n\t\t\t\tFROM ca_sql_search_ngrams ng"
            . "\n\t\t\t\tINNER JOIN ca_sql_search_words AS sw ON sw.word_id = ng.word_id"
            . "\n\t\t\t\tWHERE"
            . "\n\t\t\t\t\tng.ngram IN (?)"
            . "\n\t\t\t\tGROUP BY ng.word_id, sw.word"
            . "\n\t\t\t\tORDER BY (length(sw.word) - (count(*) * {$vn_ngram_len})), (" . ($vn_ngram_len * $vn_num_ngrams) . ") - ((count(*) * {$vn_ngram_len}))"
            . "\n\t\t\t\tLIMIT 250"
            . "\n\t\t\t",
            array($va_ngrams)
        );

        $va_word_ids[$vn_i] = array();

        // Check ngram results using various techniques to find most relevant hits
        $vs_token_metaphone = metaphone($vs_token);
        while ($qr_res->nextRow()) {
            $vs_word = $qr_res->get('word');
            if (preg_match("![^A-Za-z ]+!", $vs_word)) { continue; }    // skip anything that is not entirely letters and space

            $vn_word_id = $qr_res->get('word_id');

            // Is it an exact match?
            if ($vs_word == $vs_token) {
                $va_word_ids[$vn_i][$vn_word_id] = -250;
                continue;
            }

            // Does it sound like the word we're looking for (in English at least)
            if (metaphone($vs_word) == $vs_token_metaphone) {
                $va_word_ids[$vn_i][$vn_word_id] = -150;
                continue;
            }

            // Is it close to what we're looking for distance-wise?
            // The score always starts from the edit distance so that it never carries over
            // a value left behind by a previous row.
            $vn_score = levenshtein($vs_word, $vs_token);
            if (strpos($vs_word, $vs_token) !== false) {
                // word contains the token outright: keep it regardless of distance and boost it
                $vn_score -= 150;
            } elseif ($vn_score > 3) {
                continue;
            }

            // does it begin with the same character(s)? Each shared leading character boosts the score
            $vn_word_len = mb_strlen($vs_word);
            for ($i = 1; $i <= $vn_word_len; $i++) {
                if (mb_substr($vs_word, 0, $i) !== mb_substr($vs_token, 0, $i)) { break; }
                $vn_score -= 25;
            }

            $va_word_ids[$vn_i][$vn_word_id] = $vn_score;
        }
    }

    if (!sizeof($va_word_ids)) { return array(); }

    if (sizeof($va_word_ids) > 1) {
        //
        // Look for phrases that use any sequence of matched words in proper order
        //
        $va_temp_tables = array();
        foreach ($va_word_ids as $vn_i => $va_word_list) {
            if (!sizeof($va_word_list)) { continue; }

            // keep the 30 best (lowest) scoring candidate words for this token
            asort($va_word_list, SORT_NUMERIC);
            $va_word_list = array_keys(array_slice($va_word_list, 0, 30, true));

            $vs_temp_table = 'ca_sql_search_suggest_' . md5("/" . $vn_i . "/" . print_r($va_word_list, true));
            $this->_createTempTable($vs_temp_table);

            // Each temp table stores index_id + 1 of its hits; joining the next token's hits
            // against it therefore only keeps words sitting directly after a previous hit.
            $vs_sql = "\n\t\t\t\t\tINSERT INTO {$vs_temp_table}"
                . "\n\t\t\t\t\tSELECT swi.index_id + 1, 1"
                . "\n\t\t\t\t\tFROM ca_sql_search_word_index swi"
                . "\n\t\t\t\t\t" . (sizeof($va_temp_tables) ? " INNER JOIN " . $va_temp_tables[sizeof($va_temp_tables) - 1] . " AS tt ON swi.index_id = tt.row_id" : "")
                . "\n\t\t\t\t\tWHERE "
                . "\n\t\t\t\t\t\tswi.word_id IN (?) {$vs_table_sql}"
                . "\n\t\t\t\t\t\t" . ($this->getOption('omitPrivateIndexing') ? " AND swi.access = 0" : '')
                . "\n\t\t\t\t";

            $va_params = array($va_word_list);
            if ($vn_table_num) { $va_params[] = $vn_table_num; }
            $this->opo_db->query($vs_sql, $va_params);

            $va_temp_tables[] = $vs_temp_table;
        }

        if (!sizeof($va_temp_tables)) { return array(); }

        //
        // Get most relevant phrases from index
        //
        $vs_results_table = array_pop($va_temp_tables);
        $qr_result = $this->opo_db->query("SELECT * FROM {$vs_results_table} LIMIT 50");

        $vn_num_tokens = sizeof($va_tokens);
        $va_phrases = array();
        while ($qr_result->nextRow()) {
            // row_id is index_id + 1 of the last matched word; walk backwards one index per token
            $va_indices = array();
            $vn_index_id = $qr_result->get('row_id') - 1;
            for ($i = 0; $i < $vn_num_tokens; $i++) {
                $va_indices[] = $vn_index_id;
                $vn_index_id--;
            }

            // ORDER BY makes the word order of the assembled phrase deterministic
            $qr_phrases = $this->opo_db->query(
                "\n\t\t\t\t\tSELECT sw.word, swi.index_id "
                . "\n\t\t\t\t\tFROM ca_sql_search_words sw"
                . "\n\t\t\t\t\tINNER JOIN ca_sql_search_word_index AS swi ON sw.word_id = swi.word_id"
                . "\n\t\t\t\t\tWHERE"
                . "\n\t\t\t\t\t\t(swi.index_id IN (?))"
                . "\n\t\t\t\t\tORDER BY swi.index_id"
                . "\n\t\t\t\t",
                array($va_indices)
            );

            $va_acc = array();
            while ($qr_phrases->nextRow()) {
                $va_acc[] = $qr_phrases->get('word');
            }
            $va_phrases[] = join(" ", $va_acc);
        }

        foreach ($va_temp_tables as $vs_temp_table) {
            $this->_dropTempTable($vs_temp_table);
        }
        $this->_dropTempTable($vs_results_table);

        $va_phrases = array_unique($va_phrases);
    } else {
        // handle single word
        // The only entry is not necessarily at index 0 (earlier tokens may have been skipped),
        // so take the first element whatever its key is.
        $va_word_list = reset($va_word_ids);
        if (!is_array($va_word_list) || !sizeof($va_word_list)) { return array(); }

        // keep the 3 best (lowest) scoring candidate words
        asort($va_word_list, SORT_NUMERIC);
        $va_word_list = array_slice($va_word_list, 0, 3, true);

        $qr_phrases = $this->opo_db->query(
            "\n\t\t\t\tSELECT sw.word"
            . "\n\t\t\t\tFROM ca_sql_search_words sw"
            . "\n\t\t\t\tWHERE"
            . "\n\t\t\t\t\t(sw.word_id IN (?))"
            . "\n\t\t\t",
            array(array_keys($va_word_list))
        );

        $va_phrases = array();
        while ($qr_phrases->nextRow()) {
            $va_phrases[] = $qr_phrases->get('word');
        }
    }

    if (caGetOption('returnAsLink', $pa_options, false) && ($po_request = caGetOption('request', $pa_options, null))) {
        foreach ($va_phrases as $vn_i => $vs_phrase) {
            $va_phrases[$vn_i] = caNavLink($po_request, $vs_phrase, '', '*', '*', 'Index', array('search' => $vs_phrase));
        }
    }

    return $va_phrases;
}
/**
 * Generates ngrams for the words in ca_sql_search_words and stores them in ca_sql_search_ngrams.
 *
 * Prints a CLI progress bar while running and adds a summary message when done.
 *
 * @param object $po_opts Options object exposing getOption(). Options read are:
 *        clear = truncate ca_sql_search_ngrams before generating; when not set, words that already have ngrams are skipped. [Default is no]
 *        sizes = comma-separated list of ngram sizes to generate; non-positive values are ignored. [Default is 2,3,4]
 *    May be null, in which case the defaults apply.
 * @return bool Always true
 */
public static function create_ngrams($po_opts = null) {
    require_once __CA_LIB_DIR__ . "/core/Db.php";

    $o_db = new Db();

    // $po_opts is optional; fall back to defaults rather than calling a method on null
    $pb_clear = $po_opts ? (bool)$po_opts->getOption('clear') : false;
    $vs_sizes = $po_opts ? (string)$po_opts->getOption('sizes') : '';

    // Keep only positive integer sizes
    $pa_sizes = array();
    foreach (explode(",", $vs_sizes) as $vs_size) {
        $vn_size = (int)$vs_size;
        if ($vn_size > 0) { $pa_sizes[] = $vn_size; }
    }
    if (!sizeof($pa_sizes)) { $pa_sizes = array(2, 3, 4); }

    $vs_insert_ngram_sql = "\n\t\t\t\tINSERT INTO ca_sql_search_ngrams\n\t\t\t\t(word_id, ngram, seq)\n\t\t\t\tVALUES\n\t\t\t";

    if ($pb_clear) {
        $o_db->query("TRUNCATE TABLE ca_sql_search_ngrams");
    }

    // create ngrams
    $qr_res = $o_db->query("SELECT word_id, word FROM ca_sql_search_words");

    print CLIProgressBar::start($qr_res->numRows(), _t('Starting...'));

    $vn_c = 0;
    $vn_ngram_c = 0;
    while ($qr_res->nextRow()) {
        // NOTE(review): next() is called twice per row, which looks like it advances the bar
        // twice per word; confirm against CLIProgressBar before changing.
        print CLIProgressBar::next();

        $vn_word_id = (int)$qr_res->get('word_id');
        $vs_word = $qr_res->get('word');
        print CLIProgressBar::next(1, _t('Processing %1', $vs_word));

        if (!$pb_clear) {
            // Table was not cleared, so skip words that already have ngrams
            $qr_chk = $o_db->query("SELECT word_id FROM ca_sql_search_ngrams WHERE word_id = ?", array($vn_word_id));
            if ($qr_chk->nextRow()) { continue; }
        }

        // seq runs continuously across all ngram sizes for a word
        $vn_seq = 0;
        foreach ($pa_sizes as $vn_size) {
            $va_ngram_buf = array();
            foreach (caNgrams((string)$vs_word, $vn_size) as $vs_ngram) {
                // ngrams are interpolated into the SQL text, so they must be escaped
                $va_ngram_buf[] = "({$vn_word_id},'" . $o_db->escape($vs_ngram) . "',{$vn_seq})";
                $vn_seq++;
                $vn_ngram_c++;
            }

            // one multi-row insert per word and size
            if (sizeof($va_ngram_buf)) {
                $o_db->query($vs_insert_ngram_sql . "\n" . join(",", $va_ngram_buf));
            }
        }
        $vn_c++;
    }

    print CLIProgressBar::finish();
    CLIUtils::addMessage(_t('Processed %1 words and created %2 ngrams', $vn_c, $vn_ngram_c));

    return true;
}