/**
 * Computes a weighted edit distance between two strings.
 *
 * Both inputs are first passed through StringUtil::unicodeToLatin() and
 * lowercased. The distance is then obtained with a Levenshtein-style dynamic
 * programming matrix that supports four operations: delete, insert, modify
 * and transpose of two adjacent characters. Operation costs come from the
 * class statics ($COST_DEL, $COST_INS, $COST_TRANSPOSE, $DIST_OTHER, $INFTY)
 * and from self::letterDistance().
 *
 * @param string $s1 First string.
 * @param string $s2 Second string.
 * @return mixed The value stored in the bottom-right cell of the matrix.
 */
public static function dist($s1, $s2) {
  $s1 = mb_strtolower(StringUtil::unicodeToLatin($s1));
  $s2 = mb_strtolower(StringUtil::unicodeToLatin($s2));
  $rows = mb_strlen($s1);
  $cols = mb_strlen($s2);

  // Extract the characters once up front so the main loop never has to call
  // getCharAt().
  $left = array();
  for ($p = 0; $p < $rows; $p++) {
    $left[] = StringUtil::getCharAt($s1, $p);
  }
  $right = array();
  for ($p = 0; $p < $cols; $p++) {
    $right[] = StringUtil::getCharAt($s2, $p);
  }

  // Seed column 0 and then row 0 of the cost matrix.
  $cost = array();
  for ($r = 0; $r <= $rows; $r++) {
    $cost[$r][0] = $r * self::$DIST_OTHER;
  }
  for ($c = 0; $c <= $cols; $c++) {
    $cost[0][$c] = $c * self::$COST_DEL;
  }

  // Fill the remaining cells. $r and $c are matrix coordinates, so the
  // characters they refer to are $left[$r - 1] and $right[$c - 1].
  for ($r = 1; $r <= $rows; $r++) {
    for ($c = 1; $c <= $cols; $c++) {
      $candidates = array();

      // Delete
      $candidates[] = $cost[$r][$c - 1] + self::$COST_DEL;

      // Insert: impossible on the first row, otherwise at least COST_INS.
      if ($r > 1) {
        $insertCost = max(self::$COST_INS, self::letterDistance($left[$r - 1], $left[$r - 2]));
      } else {
        $insertCost = self::$INFTY;
      }
      $candidates[] = $cost[$r - 1][$c] + $insertCost;

      // Modify (also covers identical characters, where the letter distance is 0)
      $candidates[] = $cost[$r - 1][$c - 1] + self::letterDistance($left[$r - 1], $right[$c - 1]);

      // Transpose two adjacent characters
      if ($r > 1 && $c > 1 && $left[$r - 1] == $right[$c - 2] && $left[$r - 2] == $right[$c - 1]) {
        $candidates[] = $cost[$r - 2][$c - 2] + self::$COST_TRANSPOSE;
      }

      $cost[$r][$c] = min($candidates);
    }
  }

  return $cost[$rows][$cols];
}
/**
 * Tells whether $ch counts as a separator.
 *
 * A value is NOT a separator when its StringUtil::unicodeToLatin() form
 * passes ctype_alpha(), or when it equals '-'. Everything else is a separator.
 * Each call is traced through crawlerLog().
 *
 * @param string $ch Value to classify.
 * @return bool True when $ch is a separator.
 */
static function isSeparator($ch) {
  crawlerLog("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line ' . __LINE__);

  $latin = StringUtil::unicodeToLatin($ch);
  if (ctype_alpha($latin)) {
    return false;
  }

  // Not alphabetic: only the hyphen is still treated as part of a word.
  return $ch != '-';
}
// AdminStringUtil::shorthandToUnicode(): each shorthand sequence on the left
// is checked against the accented characters on the right.
// NOTE(review): these calls pass the computed value first, while the
// assertions further down pass the expected literal first -- confirm which
// parameter order assertEquals() expects.
assertEquals(AdminStringUtil::shorthandToUnicode("'~a'~A~'a~'A"), 'ắẮắẮ');
assertEquals(AdminStringUtil::shorthandToUnicode("~a~A^a^A'a'A"), 'ăĂâÂáÁ');
assertEquals(AdminStringUtil::shorthandToUnicode("`a`A:a:A"), 'àÀäÄ');
assertEquals(AdminStringUtil::shorthandToUnicode(",c,C'c'C~c~C"), 'çÇćĆčČ');
assertEquals(AdminStringUtil::shorthandToUnicode("'e'E`e`E^e^E"), 'éÉèÈêÊ');
assertEquals(AdminStringUtil::shorthandToUnicode(":e:E~e~E~g~G"), 'ëËĕĔğĞ');
assertEquals(AdminStringUtil::shorthandToUnicode("'^i'^I^'i^'I"), 'î́Î́î́Î́');
assertEquals(AdminStringUtil::shorthandToUnicode("'i'I`i`I^i^I"), 'íÍìÌîÎ');
assertEquals(AdminStringUtil::shorthandToUnicode(":i:I~i~I~n~N"), 'ïÏĭĬñÑ');
assertEquals(AdminStringUtil::shorthandToUnicode("'o'O`o`O^o^O"), 'óÓòÒôÔ');
assertEquals(AdminStringUtil::shorthandToUnicode(":o:O~o~O~r~R"), 'öÖõÕřŘ');
assertEquals(AdminStringUtil::shorthandToUnicode("~s~S,s,S,t,T"), 'šŠșȘțȚ');
assertEquals(AdminStringUtil::shorthandToUnicode("'u'U`u`U^u^U"), 'úÚùÙûÛ');
assertEquals(AdminStringUtil::shorthandToUnicode(":u:U~u~U"), 'üÜŭŬ');
assertEquals(AdminStringUtil::shorthandToUnicode("'y'Y:y:Y~z~Z"), 'ýÝÿŸžŽ');

// StringUtil::unicodeToLatin(): accented letters map to their plain Latin
// base letters, with case preserved.
assertEquals('acegyzACEGYZ', StringUtil::unicodeToLatin("ắčèğýžẮČÈĞÝŽ"));

// mb_strtolower() on ASCII and on accented input.
assertEquals('mama', mb_strtolower('mama'));
assertEquals('mama', mb_strtolower('maMa'));
assertEquals('mama', mb_strtolower('MAmA'));
assertEquals('mamă', mb_strtolower('MAmă'));
assertEquals('mamă', mb_strtolower('MAmĂ'));
assertEquals('abcúùû', mb_strtolower('ABCÚÙÛ'));
assertEquals('ÿ', mb_strtolower('Ÿ'));

// mb_strtoupper() on ASCII and on accented input.
// NOTE(review): the second and third assertions below are identical --
// possibly one was meant to use a different input; confirm.
assertEquals('MAMA', mb_strtoupper('MAMA'));
assertEquals('MAMA', mb_strtoupper('MAmA'));
assertEquals('MAMA', mb_strtoupper('MAmA'));
assertEquals('MAMĂ', mb_strtoupper('MamĂ'));
assertEquals('MAMĂ', mb_strtoupper('maMă'));
assertEquals('ABCÚÙÛ', mb_strtoupper('abcúùû'));
assertEquals('Ÿ', mb_strtoupper('ÿ'));
// Check that we're using the right encoding
/**
 * Rewrites $fileName in place: normalizes its contents with
 * StringUtil::unicodeToLatin(), then sorts the lines and drops duplicates.
 *
 * The normalized text is staged in a temporary file created under
 * $this->tmpDir; `sort -u` reads that file and writes its output back over
 * $fileName. The temporary file is removed afterwards.
 *
 * @param string $fileName Path of the file to post-process.
 */
function postProcess($fileName) {
  $tmpFile = tempnam($this->tmpDir, 'loc_');

  log_scriptLog('* removing diacritics');
  $s = file_get_contents($fileName);
  $s = StringUtil::unicodeToLatin($s);
  file_put_contents($tmpFile, $s);

  log_scriptLog('* removing duplicates and sorting');
  // Quote both paths so that spaces or shell metacharacters in a file name
  // cannot split or alter the command.
  // NOTE(review): escapeshellarg() may drop non-ASCII bytes under a C/POSIX
  // locale -- confirm LC_CTYPE if paths can contain such characters.
  $command = sprintf('sort -u %s -o %s', escapeshellarg($tmpFile), escapeshellarg($fileName));
  OS::executeAndAssert($command);

  unlink($tmpFile);
}
/**
 * Reduces an image file name to a simplified, lowercase form.
 *
 * The name is passed through StringUtil::unicodeToLatin(), hyphens and spaces
 * are removed, the cedilla letters listed below are replaced by plain 's' and
 * 't', and the result is lowercased with mb_strtolower().
 *
 * @param string $fileName File name to simplify.
 * @return string The simplified name.
 */
function stripImageName($fileName) {
  // search => replacement, applied in this order by str_replace()
  $substitutions = array(
    '-' => '',
    ' ' => '',
    'ş' => 's',
    'ţ' => 't',
    'Ş' => 's',
    'Ţ' => 't',
  );

  return mb_strtolower(
    str_replace(
      array_keys($substitutions),
      array_values($substitutions),
      StringUtil::unicodeToLatin($fileName)
    )
  );
}
/**
 * Builds a Definition from a result row, collects the names of the lexems
 * that match it, and hands everything to the template engine.
 *
 * The global lexem cursor ($lexemDbResult / $currentLexem) is advanced while
 * merge_compare() reports the definition as lower than the current lexem;
 * then every lexem for which merge_compare() returns 0 is consumed and its
 * name (column 1) recorded, both as is and through
 * StringUtil::unicodeToLatin().
 *
 * Assigns the Smarty variables: def, lexemNames, lexemLatinNames, source, user.
 *
 * @param mixed $row Row data passed to Model::factory('Definition')->create().
 */
function fetchNextRow($row) {
  global $lexemDbResult, $sourceMap, $currentLexem;

  $definition = Model::factory('Definition')->create($row);

  // The required XML conversion always runs; the optional one only with flag 'd'.
  $internalRep = AdminStringUtil::xmlizeRequired($definition->internalRep);
  if (hasFlag('d')) {
    $internalRep = AdminStringUtil::xmlizeOptional($internalRep);
  }
  $definition->internalRep = $internalRep;

  // Advance the lexem cursor until merge_compare() is no longer negative.
  while (true) {
    if (!(merge_compare($definition, $currentLexem) < 0)) {
      break;
    }
    $currentLexem = $lexemDbResult->fetch();
  }

  // Consume every lexem that compares equal to this definition.
  $names = array();
  $latinNames = array();
  while (true) {
    if (merge_compare($definition, $currentLexem) != 0) {
      break;
    }
    $name = $currentLexem[1];
    $names[] = $name;
    $latinNames[] = StringUtil::unicodeToLatin($name);
    $currentLexem = $lexemDbResult->fetch();
  }

  SmartyWrap::assign('def', $definition);
  SmartyWrap::assign('lexemNames', $names);
  SmartyWrap::assign('lexemLatinNames', $latinNames);
  SmartyWrap::assign('source', $sourceMap[$definition->sourceId]);
  SmartyWrap::assign('user', userCache_get($definition->userId));
}
<?php
// Prints, as plain text, the distinct unaccented inflected forms selected by
// the query below (2 to 15 characters long, restricted by the isLoc flags on
// Lexem and ModelDescription). Each form is normalized with
// StringUtil::unicodeToLatin(), upper-cased, and the list is sorted and
// de-duplicated with `sort -u`.

require_once "../phplib/util.php";
ini_set('max_execution_time', '3600');
ini_set("memory_limit", "128000000");

$query = "select I.formNoAccent from InflectedForm I, Lexem L, Model M, ModelDescription MD, ModelType MT " .
  "where I.lexemId = L.id and L.modelType = MT.code and MT.canonical = M.modelType and L.modelNumber = M.number and M.id = MD.modelId " .
  "and MD.inflectionId = I.inflectionId and MD.variant = I.variant and MD.applOrder = 0 and L.isLoc and MD.isLoc " .
  "and char_length(I.formNoAccent) between 2 and 15";
$dbResult = db_execute($query);

// Build the list in memory (one form per CRLF-terminated line) instead of
// writing it to disk only to read it straight back for normalization.
$s = '';
foreach ($dbResult as $dbRow) {
  $s .= "{$dbRow[0]}\r\n";
}
$s = StringUtil::unicodeToLatin($s);
$s = strtoupper($s);

// Unsorted input for sort(1).
$fileName = tempnam('/tmp', 'unique_');
file_put_contents($fileName, $s);

// Sorted, de-duplicated output.
$fileName2 = tempnam('/tmp', 'unique_');
OS::executeAndAssert('sort -u ' . escapeshellarg($fileName) . ' -o ' . escapeshellarg($fileName2));

header('Content-type: text/plain');
print file_get_contents($fileName2);

// Remove both temporary files directly rather than spawning `rm`.
unlink($fileName);
unlink($fileName2);