/**
 * Finds lexems whose form is close to $cuv, using the n-gram index.
 *
 * NGram::searchLexemIds() yields a map keyed by lexem id; only the ids that
 * share the highest value in that map are considered (presumably the value
 * is an n-gram match count -- confirm in NGram::searchLexemIds). Each
 * candidate is then loaded only if its charLength lies within
 * self::$LENGTH_DIF characters of the query length, and collection stops
 * once self::$MAX_RESULTS lexems have been gathered.
 *
 * @param string $cuv The word to search for.
 * @return array Matching Lexem objects ordered by ascending Levenshtein
 *               distance between $cuv and their formNoAccent; an empty
 *               array when the index returns nothing.
 */
public static function searchNGram($cuv) {
  $wordLength = mb_strlen($cuv);
  $scoresById = NGram::searchLexemIds($cuv);
  if (empty($scoresById)) {
    return array();
  }

  // After the descending sort the first element holds the top value;
  // keep every id that ties with it.
  arsort($scoresById);
  $topScore = current($scoresById);
  $candidateIds = array_keys($scoresById, $topScore);

  // Accepted charLength window around the query length.
  $minLength = $wordLength - self::$LENGTH_DIF;
  $maxLength = $wordLength + self::$LENGTH_DIF;

  $results = array();
  foreach ($candidateIds as $lexemId) {
    $lexem = Model::factory('Lexem')
      ->where('id', $lexemId)
      ->where_gte('charLength', $minLength)
      ->where_lte('charLength', $maxLength)
      ->find_one();
    if (!$lexem) {
      // Either the id does not exist or its length is outside the window.
      continue;
    }

    $results[] = $lexem;
    if (count($results) == self::$MAX_RESULTS) {
      break;
    }
  }

  // Order the lexems by their Levenshtein distance from $cuv.
  $distances = array_map(function ($lexem) use ($cuv) {
    return Levenshtein::dist($cuv, $lexem->formNoAccent);
  }, $results);
  array_multisort($distances, $results);

  return $results;
}
<?php
/**
 * genNGram.php: empties the NGram table and regenerates its contents from
 * every row of the Lexem table.
 *
 * For each lexem, NGram::split() is applied to formNoAccent and one
 * (ngram, key within the split result, lexem id) tuple is queued per n-gram.
 * Queued tuples are handed to dumpValues() whenever at least INSERT_SIZE of
 * them have accumulated, and once more after the loop for the remainder.
 */
require_once __DIR__ . '/../phplib/util.php';

// Threshold of queued tuples that triggers a call to dumpValues().
define('INSERT_SIZE', 10000);
// Allow up to one hour of execution time for this script.
ini_set('max_execution_time', '3600');

log_scriptLog('Running genNGram.php.');
$start = microtime(true);

// Start from an empty table; everything is re-inserted below.
db_execute("truncate table NGram");

// This should be fast
$dbResult = db_execute("select * from Lexem", PDO::FETCH_ASSOC);
$values = array();
foreach ($dbResult as $cnt => $row) {
  // Wrap the raw row in a Lexem model so its fields can be read as properties.
  $lexem = Model::factory('Lexem')->create($row);
  $ngrams = NGram::split($lexem->formNoAccent);
  foreach ($ngrams as $i => $ngram) {
    // $i is the key under which NGram::split() returned this n-gram
    // (presumably its position in the word -- confirm in NGram::split).
    $values[] = array($ngram, $i, $lexem->id);
  }
  // Flush in batches so the queue does not grow without bound.
  if (count($values) >= INSERT_SIZE) {
    dumpValues($values);
    $values = array();
  }
  // Progress report every 1000 rows (this also fires for row 0).
  if ($cnt % 1000 == 0) {
    log_scriptLog(sprintf("%d lexems processed, %0.3f lexems/second.", $cnt, $cnt / (microtime(true) - $start)));
  }
}
// Flush whatever is left after the last full batch.
dumpValues($values);

$end = microtime(true);
log_scriptLog(sprintf("genNGram.php completed in %0.3f seconds\n", $end - $start));

/*********************************************************************/

function dumpValues($values) {
/**
 * Approximate search for $cuv, optionally cached in memcache.
 *
 * The actual lookup is delegated to NGram::searchNGram(). When
 * $useMemcache is set, the result is stored under a key built from
 * $hasDiacritics and $cuv and served from there on later calls.
 *
 * @param string $cuv           The word to search for.
 * @param bool   $hasDiacritics Only distinguishes cache entries; it does not
 *                              influence the search itself.
 * @param bool   $useMemcache   Whether to read from / write to memcache.
 * @return mixed Whatever NGram::searchNGram() returns, or the cached copy.
 */
public static function searchApproximate($cuv, $hasDiacritics, $useMemcache = false) {
  if ($useMemcache) {
    $key = "approx_" . ($hasDiacritics ? '1' : '0') . "_{$cuv}";
    $result = mc_get($key);
    // A falsy cached value (e.g. an empty result) is treated as a miss
    // and recomputed below.
    if ($result) {
      return $result;
    }
  }

  // The timing/method/length locals and the commented-out file logging that
  // used them were dead code and have been removed; the search is unchanged.
  $result = NGram::searchNGram($cuv);

  if ($useMemcache) {
    mc_set($key, $result);
  }
  return $result;
}
/**
 * Runs the n-gram based approximate search for $cuv, with optional
 * memcache caching of the result.
 *
 * @param string $cuv           The word to search for.
 * @param bool   $hasDiacritics Becomes part of the cache key only.
 * @param bool   $useMemcache   When true, a truthy cached value is returned
 *                              without searching, and a fresh search result
 *                              is written back to the cache.
 * @return mixed The value produced by NGram::searchNGram(), or its cached copy.
 */
public static function searchApproximate($cuv, $hasDiacritics, $useMemcache = false) {
  // Uncached path: go straight to the n-gram search.
  if (!$useMemcache) {
    return NGram::searchNGram($cuv);
  }

  $diacriticsFlag = $hasDiacritics ? '1' : '0';
  $cacheKey = "approx_" . $diacriticsFlag . "_{$cuv}";

  $cached = mc_get($cacheKey);
  if ($cached) {
    return $cached;
  }

  // Miss (or a falsy cached value): compute and store.
  $matches = NGram::searchNGram($cuv);
  mc_set($cacheKey, $matches);
  return $matches;
}
$log->save(); } $count = Model::factory('DefinitionSimple')->count(); $chosenDef = rand(0, $count - 1); $answer = rand(1, 4); $maindef = Model::factory('DefinitionSimple')->limit(1)->offset($chosenDef)->find_one(); $word = getWordForDefitionId($maindef->definitionId); $options = array(); $options[$answer] = array(); $options[$answer]['term'] = getWordForDefitionId($maindef->definitionId); $options[$answer]['text'] = $maindef->getDisplayValue(); $used[$maindef->definitionId] = 1; $closestLexemsDefinitionsCount = null; $closestLexemsDefinitions = null; if ($difficulty > 1) { $nearLexemIds = NGram::searchLexemIds($word); arsort($nearLexemIds); $lexemPoolSize = 48 / $difficulty; $closestLexemIds = array_slice($nearLexemIds, 0, $lexemPoolSize, true); $closestLexemIds = array_keys($closestLexemIds); $closestLexemsDefinitions = getSimpleDefinitionsForLexemIds($closestLexemIds); $closestLexemsDefinitionsCount = count($closestLexemsDefinitions); //if there are no close lexem definitions to choose from //then use easier difficulty if ($closestLexemsDefinitionsCount == 0) { $difficulty = 1; } } for ($i = 1; $i <= 4; $i++) { $def = null; if ($i != $answer) {