예제 #1
0
 /**
  * Computes a weighted edit distance between two strings.
  *
  * Both inputs are run through StringUtil::unicodeToLatin() and mb_strtolower()
  * before being compared. The result is the bottom-right cell of a
  * ($len1 + 1) x ($len2 + 1) cost matrix in which every cell takes the cheapest
  * of four moves: delete, insert, modify and transpose.
  *
  * @param string $s1 First string; its characters index the matrix rows.
  * @param string $s2 Second string; its characters index the matrix columns.
  * @return mixed Accumulated cost, expressed in the units of the class cost
  *               constants (presumably numeric; they are defined outside this method).
  */
 public static function dist($s1, $s2)
 {
     $s1 = mb_strtolower(StringUtil::unicodeToLatin($s1));
     $s2 = mb_strtolower(StringUtil::unicodeToLatin($s2));
     $len1 = mb_strlen($s1);
     $len2 = mb_strlen($s2);

     // Split the strings into characters once, to minimize the number of calls to getCharAt().
     $chars1 = array();
     for ($i = 0; $i < $len1; $i++) {
         $chars1[] = StringUtil::getCharAt($s1, $i);
     }
     $chars2 = array();
     for ($j = 0; $j < $len2; $j++) {
         $chars2[] = StringUtil::getCharAt($s2, $j);
     }

     // Initialize the first row and column of the matrix.
     $a = array();
     for ($i = 0; $i <= $len1; $i++) {
         $a[$i][0] = $i * self::$DIST_OTHER;
     }
     for ($j = 0; $j <= $len2; $j++) {
         $a[0][$j] = $j * self::$COST_DEL;
     }

     // Compute the rest of the matrix with the custom Levenshtein algorithm.
     for ($i = 0; $i < $len1; $i++) {
         $mati = $i + 1;

         // The insert cost depends only on the row, so it is computed once per
         // row instead of once per cell. It is never lower than COST_INS, and
         // the first character of $s1 always gets the INFTY constant.
         $costInsert = $i == 0
             ? self::$INFTY
             : max(self::$COST_INS, self::letterDistance($chars1[$i], $chars1[$i - 1]));

         for ($j = 0; $j < $len2; $j++) {
             $matj = $j + 1;

             // Delete
             $best = $a[$mati][$matj - 1] + self::$COST_DEL;

             // Insert
             $best = min($best, $a[$mati - 1][$matj] + $costInsert);

             // Modify (this includes the case where $s1[i] == $s2[j], because
             // the original notes that dist(x, x) returns 0)
             $best = min($best, $a[$mati - 1][$matj - 1] + self::letterDistance($chars1[$i], $chars2[$j]));

             // Transpose: the last two characters of both prefixes are swapped.
             // The $i && $j guard keeps the [$mati - 2][$matj - 2] lookup in range.
             if ($i && $j && $chars1[$i] == $chars2[$j - 1] && $chars1[$i - 1] == $chars2[$j]) {
                 $best = min($best, $a[$mati - 2][$matj - 2] + self::$COST_TRANSPOSE);
             }

             $a[$mati][$matj] = $best;
         }
     }

     return $a[$len1][$len2];
 }
예제 #2
0
 /**
  * Tells whether a character counts as a separator: anything that is neither
  * a hyphen nor alphabetic once passed through StringUtil::unicodeToLatin().
  *
  * @param string $ch Character to classify.
  * @return bool True when $ch is a separator, false otherwise.
  */
 static function isSeparator($ch)
 {
     // Trace the call with the current file, class, function and line.
     $traceMessage = "INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line ' . __LINE__;
     crawlerLog($traceMessage);

     // Letters are never separators.
     if (ctype_alpha(StringUtil::unicodeToLatin($ch))) {
         return false;
     }

     // Of the remaining characters only the hyphen is not a separator.
     return $ch != '-';
 }
예제 #3
0
// AdminStringUtil::shorthandToUnicode(): an ASCII accent prefix (' ` ^ ~ : ,)
// followed by a letter is turned into the corresponding accented letter.
// Two stacked prefixes are accepted in either order.
assertEquals(AdminStringUtil::shorthandToUnicode("'~a'~A~'a~'A"), 'ắẮắẮ');
assertEquals(AdminStringUtil::shorthandToUnicode("~a~A^a^A'a'A"), 'ăĂâÂáÁ');
assertEquals(AdminStringUtil::shorthandToUnicode("`a`A:a:A"), 'àÀäÄ');
assertEquals(AdminStringUtil::shorthandToUnicode(",c,C'c'C~c~C"), 'çÇćĆčČ');
assertEquals(AdminStringUtil::shorthandToUnicode("'e'E`e`E^e^E"), 'éÉèÈêÊ');
assertEquals(AdminStringUtil::shorthandToUnicode(":e:E~e~E~g~G"), 'ëËĕĔğĞ');
assertEquals(AdminStringUtil::shorthandToUnicode("'^i'^I^'i^'I"), 'î́Î́î́Î́');
assertEquals(AdminStringUtil::shorthandToUnicode("'i'I`i`I^i^I"), 'íÍìÌîÎ');
assertEquals(AdminStringUtil::shorthandToUnicode(":i:I~i~I~n~N"), 'ïÏĭĬñÑ');
assertEquals(AdminStringUtil::shorthandToUnicode("'o'O`o`O^o^O"), 'óÓòÒôÔ');
assertEquals(AdminStringUtil::shorthandToUnicode(":o:O~o~O~r~R"), 'öÖõÕřŘ');
assertEquals(AdminStringUtil::shorthandToUnicode("~s~S,s,S,t,T"), 'šŠșȘțȚ');
assertEquals(AdminStringUtil::shorthandToUnicode("'u'U`u`U^u^U"), 'úÚùÙûÛ');
assertEquals(AdminStringUtil::shorthandToUnicode(":u:U~u~U"), 'üÜŭŬ');
assertEquals(AdminStringUtil::shorthandToUnicode("'y'Y:y:Y~z~Z"), 'ýÝÿŸžŽ');

// StringUtil::unicodeToLatin(): accented letters map to their unaccented
// counterparts, with case preserved.
assertEquals('acegyzACEGYZ', StringUtil::unicodeToLatin("ắčèğýžẮČÈĞÝŽ"));

// mb_strtolower() must handle ASCII and accented letters alike.
assertEquals('mama', mb_strtolower('mama'));
assertEquals('mama', mb_strtolower('maMa'));
assertEquals('mama', mb_strtolower('MAmA'));
assertEquals('mamă', mb_strtolower('MAmă'));
assertEquals('mamă', mb_strtolower('MAmĂ'));
assertEquals('abcúùû', mb_strtolower('ABCÚÙÛ'));
assertEquals('ÿ', mb_strtolower('Ÿ'));

// mb_strtoupper(): same inputs as the lowercase group above. The third case
// used to repeat 'MAmA'; it now covers 'maMa' so each assertion is distinct.
assertEquals('MAMA', mb_strtoupper('MAMA'));
assertEquals('MAMA', mb_strtoupper('MAmA'));
assertEquals('MAMA', mb_strtoupper('maMa'));
assertEquals('MAMĂ', mb_strtoupper('MamĂ'));
assertEquals('MAMĂ', mb_strtoupper('maMă'));
assertEquals('ABCÚÙÛ', mb_strtoupper('abcúùû'));
assertEquals('Ÿ', mb_strtoupper('ÿ'));
// Check that we're using the right encoding
예제 #4
0
 /**
  * Rewrites a file in place: its contents are passed through
  * StringUtil::unicodeToLatin(), then sorted with duplicates removed.
  *
  * The converted text is staged in a temporary file under $this->tmpDir, which
  * `sort -u` reads while writing its output back over $fileName.
  *
  * @param string $fileName Path of the file to process; it is overwritten.
  * @return void
  */
 function postProcess($fileName)
 {
     $tmpFile = tempnam($this->tmpDir, 'loc_');

     log_scriptLog('* removing diacritics');
     $s = file_get_contents($fileName);
     $s = StringUtil::unicodeToLatin($s);
     file_put_contents($tmpFile, $s);

     log_scriptLog('* removing duplicates and sorting');
     // Quote both paths so that spaces or shell metacharacters in them cannot
     // split or alter the command (assumes executeAndAssert() runs it through
     // a shell -- confirm against the OS class).
     $command = sprintf('sort -u %s -o %s', escapeshellarg($tmpFile), escapeshellarg($fileName));
     OS::executeAndAssert($command);

     unlink($tmpFile);
 }
예제 #5
0
/**
 * Builds a stripped-down form of an image file name: the name is passed
 * through StringUtil::unicodeToLatin(), hyphens and spaces are removed, the
 * cedilla letters s/t are replaced by plain "s"/"t", and the result is
 * lowercased.
 *
 * @param string $fileName Image file name to strip.
 * @return string The stripped, lowercase name.
 */
function stripImageName($fileName)
{
    // Search => replacement pairs; str_replace() applies them in this order.
    $replacements = array(
        '-' => '',
        ' ' => '',
        'ş' => 's',
        'ţ' => 't',
        'Ş' => 's',
        'Ţ' => 't',
    );

    $latinName = StringUtil::unicodeToLatin($fileName);
    $stripped = str_replace(array_keys($replacements), array_values($replacements), $latinName);

    return mb_strtolower($stripped);
}
예제 #6
0
/**
 * Turns one database row into a Definition, gathers the lexem names that
 * merge_compare() reports as matching it, and hands everything to Smarty.
 *
 * The global lexem cursor ($currentLexem, fed by $lexemDbResult) is advanced
 * as a side effect: rows comparing below the definition are skipped and rows
 * comparing equal are consumed.
 *
 * @param mixed $row Row data passed to Model::factory('Definition')->create().
 * @return void
 */
function fetchNextRow($row)
{
    global $lexemDbResult, $sourceMap, $currentLexem;

    $def = Model::factory('Definition')->create($row);

    // Required markup is always applied; optional markup only with flag 'd'.
    $def->internalRep = AdminStringUtil::xmlizeRequired($def->internalRep);
    if (hasFlag('d')) {
        $def->internalRep = AdminStringUtil::xmlizeOptional($def->internalRep);
    }

    // Skip the lexems that sort before this definition.
    while (merge_compare($def, $currentLexem) < 0) {
        $currentLexem = $lexemDbResult->fetch();
    }

    // Collect column 1 of every lexem row matching this definition, both as
    // stored and after unicodeToLatin() conversion.
    $names = array();
    $latinNames = array();
    while (merge_compare($def, $currentLexem) == 0) {
        $name = $currentLexem[1];
        $names[] = $name;
        $latinNames[] = StringUtil::unicodeToLatin($name);
        $currentLexem = $lexemDbResult->fetch();
    }

    SmartyWrap::assign('def', $def);
    SmartyWrap::assign('lexemNames', $names);
    SmartyWrap::assign('lexemLatinNames', $latinNames);
    SmartyWrap::assign('source', $sourceMap[$def->sourceId]);
    SmartyWrap::assign('user', userCache_get($def->userId));
}
<?php

require_once "../phplib/util.php";
// Prints, as plain text, the sorted list of distinct forms selected by the
// query below, converted with StringUtil::unicodeToLatin() and uppercased.
ini_set('max_execution_time', '3600');
ini_set("memory_limit", "128000000");

// Selects formNoAccent for forms between 2 and 15 characters long whose lexem
// and model description are both flagged isLoc.
$query = "select I.formNoAccent from InflectedForm I, Lexem L, Model M, ModelDescription MD, ModelType MT "
    . "where I.lexemId = L.id and L.modelType = MT.code and MT.canonical = M.modelType and L.modelNumber = M.number and M.id = MD.modelId "
    . "and MD.inflectionId = I.inflectionId and MD.variant = I.variant and MD.applOrder = 0 and L.isLoc and MD.isLoc "
    . "and char_length(I.formNoAccent) between 2 and 15";
$dbResult = db_execute($query);

// Build the text in memory, one CRLF-terminated form per line. The conversion
// below needs the whole text as a single string anyway, so there is no point
// in writing it to disk first and reading it straight back.
$s = '';
foreach ($dbResult as $dbRow) {
    $s .= "{$dbRow[0]}\r\n";
}
$s = StringUtil::unicodeToLatin($s);
$s = strtoupper($s);

// Let sort(1) deduplicate and order the lines, from one temp file into another.
$fileName = tempnam('/tmp', 'unique_');
file_put_contents($fileName, $s);
$fileName2 = tempnam('/tmp', 'unique_');
OS::executeAndAssert('sort -u ' . escapeshellarg($fileName) . ' -o ' . escapeshellarg($fileName2));

header('Content-type: text/plain');
print file_get_contents($fileName2);

// Clean up the temp files directly instead of spawning `rm` through the shell.
unlink($fileName);
unlink($fileName2);