/**
 * Converts a keyword dictionary read from STDIN into SQL written to STDOUT.
 *
 * Every input line is expected to be "<keyword> <frequency>" separated by a
 * single space. The output starts with "TRUNCATE suggest;" followed by
 * multi-row INSERT statements for the `suggest` table, at most 10000 rows per
 * statement.
 *
 * A line is skipped when:
 *  - it does not contain both fields, or the frequency is not a plain integer,
 *  - the keyword is shorter than 2 bytes,
 *  - the frequency is below FREQ_THRESHOLD,
 *  - the keyword contains "_" or "'",
 *  - the keyword was already emitted.
 *
 * Side effects: defines the Sphinx tuning constants from the application
 * config (if they are not defined yet) and finally calls $app->stop().
 *
 * @return void
 */
public function buildDictionarySql() {
    $app = \Slim\Slim::getInstance();

    // Expose the sphinx settings as upper-cased constants. The keys are used
    // verbatim as they are read from the config array.
    foreach (['freq_threshold', 'suggest_dubug', 'length_threshold', 'levenshtein_threshold', 'top_count'] as $var) {
        $constant = strtoupper($var);
        // define() raises a warning on redefinition, so a second invocation
        // within the same process must not define the constants again.
        if (!defined($constant)) {
            define($constant, intval($app->config['sphinx'][$var]));
        }
    }

    $in = fopen("php://stdin", "r");
    $out = fopen("php://stdout", "w+");

    // Keywords already written, used as a set (keyword => true).
    $used_keywords = array();

    fwrite($out, "TRUNCATE suggest;\n");

    // Number of rows in the INSERT statement that is currently open.
    $m = 0;

    // Compare against false explicitly so a falsy line does not end the loop early.
    while (($line = fgets($in, 1024)) !== false) {
        $fields = explode(" ", trim($line));
        if (count($fields) < 2) {
            // Blank or malformed line: there is no frequency to work with.
            continue;
        }

        $keyword = trim($fields[0]);
        $freq = $fields[1];

        // The frequency is interpolated into the SQL unquoted, so only accept
        // plain integers and normalise them before they are written out.
        if (preg_match('/^\d+$/D', $freq) !== 1) {
            continue;
        }
        $freq = (int) $freq;

        // NOTE(review): only "'" is filtered below; a backslash inside a
        // keyword would be emitted unescaped - confirm the dictionary can
        // never contain one.
        if (strlen($keyword) < 2
            || $freq < FREQ_THRESHOLD
            || strstr($keyword, "_") !== FALSE
            || strstr($keyword, "'") !== FALSE
            || isset($used_keywords[$keyword])
        ) {
            continue;
        }

        $trigrams = buildTrigrams($keyword);
        $used_keywords[$keyword] = true;

        // The first row opens a new INSERT, later rows are comma-separated.
        fwrite($out, $m === 0 ? "INSERT INTO suggest VALUES\n" : ",\n");
        fwrite($out, "( 0, '{$keyword}', '{$trigrams}', {$freq} )");
        $m++;

        // Close the statement every 10000 rows to keep statements bounded.
        if ($m % 10000 == 0) {
            fwrite($out, ";\n");
            $m = 0;
        }
    }

    // Terminate a statement that is still open.
    if ($m) {
        fwrite($out, ";");
    }
    fwrite($out, "\n");

    fclose($in);
    fclose($out);

    $app->stop();
}
/**
 * Looks up a spelling suggestion for a keyword in the `slimpdsuggest` index.
 *
 * The keyword is converted to trigrams via buildTrigrams() and matched with a
 * quorum query ("..."/1). Candidates are limited to a length within
 * LENGTH_THRESHOLD of the keyword's byte length, ranked by
 * weight + delta - |len difference| and then by frequency, and capped at
 * TOP_COUNT rows. The first candidate whose Levenshtein distance to the
 * keyword is at most LEVENSHTEIN_THRESHOLD wins.
 *
 * @param string $keyword word to find a suggestion for
 * @param object $ln      connection exposing prepare(); the statement it
 *                        returns is used via bindValue(), execute() and
 *                        fetchAll() (presumably a PDO handle - confirm
 *                        against the caller)
 *
 * @return string|false false when the query yields no rows, the suggested
 *                      keyword when a candidate is close enough, otherwise
 *                      the unchanged input keyword
 */
function MakeSuggestion($keyword, $ln) {
    $trigrams = buildTrigrams($keyword);
    $query = "\"{$trigrams}\"/1";
    $len = strlen($keyword);
    $delta = LENGTH_THRESHOLD;

    $stmt = $ln->prepare(
        "\n\t\t\tSELECT *, weight() as w, w+:delta-ABS(len-:len) as myrank"
        . "\n\t\t\tFROM slimpdsuggest"
        . "\n\t\t\tWHERE MATCH(:match) AND len BETWEEN :lowlen AND :highlen"
        . "\n\t\t\tORDER BY myrank DESC, freq DESC"
        . "\n\t\t\tLIMIT 0,:topcount OPTION ranker=wordcount"
    );
    $stmt->bindValue(':match', $query, PDO::PARAM_STR);
    $stmt->bindValue(':len', $len, PDO::PARAM_INT);
    $stmt->bindValue(':delta', $delta, PDO::PARAM_INT);
    $stmt->bindValue(':lowlen', $len - $delta, PDO::PARAM_INT);
    $stmt->bindValue(':highlen', $len + $delta, PDO::PARAM_INT);
    $stmt->bindValue(':topcount', TOP_COUNT, PDO::PARAM_INT);
    $stmt->execute();

    if (!($rows = $stmt->fetchAll())) {
        return false;
    }

    // further restrict trigram matches with a sane Levenshtein distance limit
    foreach ($rows as $match) {
        $suggested = $match["keyword"];
        $distance = levenshtein($keyword, $suggested);
        // Before PHP 8.0 levenshtein() returns -1 when an argument is longer
        // than 255 characters; that must not be mistaken for a close match.
        if ($distance >= 0 && $distance <= LEVENSHTEIN_THRESHOLD) {
            return $suggested;
        }
    }

    // No candidate was close enough: hand the keyword back unchanged.
    return $keyword;
}