Example #1
0
 /**
  * Finds lexems that approximately match $cuv and orders them by edit distance.
  *
  * Candidate ids come from NGram::searchLexemIds(); only the ids tied for the
  * highest value in that map are kept (presumably the value is an n-gram
  * match count -- confirm against searchLexemIds()). Each candidate is then
  * loaded only if its charLength is within self::$LENGTH_DIF of the length of
  * $cuv, and collection stops after self::$MAX_RESULTS lexems.
  *
  * @param string $cuv The word to search for.
  * @return array Lexem models sorted ascending by Levenshtein distance between
  *               $cuv and the lexem's formNoAccent; empty when no candidate ids
  *               are found.
  */
 public static function searchNGram($cuv)
 {
     $leng = mb_strlen($cuv);
     $hash = NGram::searchLexemIds($cuv);
     if (empty($hash)) {
         return array();
     }
     // Largest value first, then keep every id that shares that top value.
     arsort($hash);
     $max = current($hash);
     $lexIds = array_keys($hash, $max);
     $results = array();
     foreach ($lexIds as $id) {
         // find_one() yields a falsy value when the lexem falls outside the
         // allowed length window, so such candidates are silently skipped.
         $lexem = Model::factory('Lexem')->where('id', $id)->where_gte('charLength', $leng - self::$LENGTH_DIF)->where_lte('charLength', $leng + self::$LENGTH_DIF)->find_one();
         if ($lexem) {
             $results[] = $lexem;
             if (count($results) == self::$MAX_RESULTS) {
                 break;
             }
         }
     }
     // Sort the lexems by their Levenshtein distance from $cuv
     $distances = array();
     foreach ($results as $lexem) {
         $distances[] = Levenshtein::dist($cuv, $lexem->formNoAccent);
     }
     // array_multisort() breaks ties in one array by comparing the next one.
     // Passing the candidate positions as the second key keeps lexems with
     // equal distances in their original order and guarantees the Lexem
     // objects themselves are never compared with each other.
     $order = array_keys($results);
     array_multisort($distances, $order, $results);
     return $results;
 }
Example #2
0
<?php

// Regenerates the NGram table from scratch: empties it, walks every Lexem row,
// splits each lexem's formNoAccent into n-grams and hands the resulting tuples
// to dumpValues() in batches.
require_once __DIR__ . '/../phplib/util.php';
// Number of accumulated (ngram, position, lexemId) tuples that triggers a flush.
define('INSERT_SIZE', 10000);
// Allow the script to run for up to one hour.
ini_set('max_execution_time', '3600');
log_scriptLog('Running genNGram.php.');
$start = microtime(true);
db_execute("truncate table NGram");
// This should be fast
$dbResult = db_execute("select * from Lexem", PDO::FETCH_ASSOC);
$values = array();
foreach ($dbResult as $cnt => $row) {
    // Build a Lexem model from the raw row rather than querying it again.
    $lexem = Model::factory('Lexem')->create($row);
    $ngrams = NGram::split($lexem->formNoAccent);
    foreach ($ngrams as $i => $ngram) {
        // Tuple layout: n-gram text, its key in the split() result, owning lexem id.
        $values[] = array($ngram, $i, $lexem->id);
    }
    // Flush once the batch is large enough, then start a new batch.
    // dumpValues() is defined below; presumably it inserts into NGram -- confirm.
    if (count($values) >= INSERT_SIZE) {
        dumpValues($values);
        $values = array();
    }
    // Progress report every 1000 rows (this also fires for row key 0).
    if ($cnt % 1000 == 0) {
        log_scriptLog(sprintf("%d lexems processed, %0.3f lexems/second.", $cnt, $cnt / (microtime(true) - $start)));
    }
}
// Flush whatever is left over from the last, possibly partial, batch.
dumpValues($values);
$end = microtime(true);
log_scriptLog(sprintf("genNGram.php completed in %0.3f seconds\n", $end - $start));
/*********************************************************************/
function dumpValues($values)
{
Example #3
0
 /**
  * Runs an approximate (n-gram based) search for $cuv, optionally cached.
  *
  * When $useMemcache is true the result is looked up under a key built from
  * $hasDiacritics and $cuv before searching, and stored under that key after
  * searching. A falsy cached value (for example an empty result list) is
  * treated as a cache miss, so such results are recomputed on every call.
  *
  * @param string $cuv           The word to search for.
  * @param bool   $hasDiacritics Only used to distinguish cache keys; the search
  *                              itself does not depend on it.
  * @param bool   $useMemcache   Whether to read from and write to the cache.
  * @return mixed The cached value, or the result of NGram::searchNGram($cuv).
  */
 public static function searchApproximate($cuv, $hasDiacritics, $useMemcache = false)
 {
     if ($useMemcache) {
         $key = "approx_" . ($hasDiacritics ? '1' : '0') . "_{$cuv}";
         $result = mc_get($key);
         if ($result) {
             return $result;
         }
     }
     $result = NGram::searchNGram($cuv);
     if ($useMemcache) {
         mc_set($key, $result);
     }
     return $result;
 }
Example #4
0
 /**
  * Approximate search for $cuv backed by NGram::searchNGram(), with an
  * optional cache in front of it.
  *
  * @param string $cuv           The word to search for.
  * @param bool   $hasDiacritics Becomes part of the cache key ('1' or '0');
  *                              it has no other effect here.
  * @param bool   $useMemcache   When true, a truthy cached value is returned
  *                              directly and fresh results are stored.
  * @return mixed The truthy cached value if there is one, otherwise the search result.
  */
 public static function searchApproximate($cuv, $hasDiacritics, $useMemcache = false)
 {
     // Without caching this is a plain pass-through to the n-gram search.
     if (!$useMemcache) {
         return NGram::searchNGram($cuv);
     }

     $diacriticsFlag = $hasDiacritics ? '1' : '0';
     $cacheKey = "approx_" . $diacriticsFlag . "_" . $cuv;

     $cached = mc_get($cacheKey);
     if ($cached) {
         return $cached;
     }

     // Cache miss (or a falsy cached value): search and remember the outcome.
     $found = NGram::searchNGram($cuv);
     mc_set($cacheKey, $found);
     return $found;
 }
Example #5
0
    $log->save();
}
// Pick one random DefinitionSimple row as the correct answer and decide which
// of the option slots 1..4 it occupies.
$count = Model::factory('DefinitionSimple')->count();
$chosenDef = rand(0, $count - 1);
$answer = rand(1, 4);
$maindef = Model::factory('DefinitionSimple')->limit(1)->offset($chosenDef)->find_one();
$word = getWordForDefitionId($maindef->definitionId);
$options = array();
$options[$answer] = array();
// Reuse the word resolved above instead of resolving the same id a second time.
$options[$answer]['term'] = $word;
$options[$answer]['text'] = $maindef->getDisplayValue();
// Remember that this definition is already taken.
$used[$maindef->definitionId] = 1;
$closestLexemsDefinitionsCount = null;
$closestLexemsDefinitions = null;
if ($difficulty > 1) {
    // Collect the lexems closest to the answer word; arsort() puts the ids
    // with the largest values first while keeping the id keys.
    $nearLexemIds = NGram::searchLexemIds($word);
    arsort($nearLexemIds);
    // The pool shrinks as difficulty grows. The division can produce a
    // fractional float, while array_slice() expects an integer length, so
    // truncate explicitly instead of relying on implicit conversion.
    $lexemPoolSize = (int) (48 / $difficulty);
    $closestLexemIds = array_slice($nearLexemIds, 0, $lexemPoolSize, true);
    $closestLexemIds = array_keys($closestLexemIds);
    $closestLexemsDefinitions = getSimpleDefinitionsForLexemIds($closestLexemIds);
    $closestLexemsDefinitionsCount = count($closestLexemsDefinitions);
    //if there are no close lexem definitions to choose from
    //then use easier difficulty
    if ($closestLexemsDefinitionsCount == 0) {
        $difficulty = 1;
    }
}
for ($i = 1; $i <= 4; $i++) {
    $def = null;
    if ($i != $answer) {