# Benchmark driver for the UtfNormal routines; refuses to run outside the CLI SAPI.
if (PHP_SAPI != 'cli') {
    die("Run me from the command line please.\n");
}

# With --icu on the command line, load the php_utfnormal.so extension before
# the PHP sources are included.
# NOTE(review): presumably this swaps in a native implementation - confirm.
if (isset($_SERVER['argv']) && in_array('--icu', $_SERVER['argv'])) {
    dl('php_utfnormal.so');
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';

define('BENCH_CYCLES', 1);
# Minimum size in bytes of the input built for each test file.
define('BIGSIZE', 1024 * 1024 * 10); // 10m

# Raise the memory limit to BIGSIZE plus 120 MiB of headroom.
ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024);

# Map of test data file path => human-readable description.
$testfiles = array('testdata/washington.txt' => 'English text', 'testdata/berlin.txt' => 'German text', 'testdata/bulgakov.txt' => 'Russian text', 'testdata/tokyo.txt' => 'Japanese text', 'testdata/young.txt' => 'Korean text');

$normalizer = new UtfNormal();
# Load the data tables up front, before the per-file benchmark runs.
UtfNormal::loadData();

foreach ($testfiles as $file => $desc) {
    benchmarkTest($normalizer, $file, $desc);
}

# -------

/**
 * Runs the benchmark for a single test file.
 *
 * @param $u the normalizer instance created by the script (passed by reference)
 * @param $filename String: path of the test data file to read
 * @param $desc String: description printed next to the file name
 */
function benchmarkTest(&$u, $filename, $desc) {
    print "Testing {$filename} ({$desc})...\n";
    $data = file_get_contents($filename);

    # Keep doubling the text until it is at least BIGSIZE bytes long.
    # NOTE(review): if the file is empty or unreadable the length stays 0 and
    # this loop never terminates - confirm the test data is always present.
    $all = $data;
    while (strlen($all) < BIGSIZE) {
        $all .= $all;
    }
    $data = $all;
    echo "Data is " . strlen($data) . " bytes.\n";

    # Names of the operations to benchmark.
    # NOTE(review): presumably UtfNormal method names - confirm.
    $forms = array('quickIsNFCVerify', 'cleanUp');
/**
 * Produces canonically composed sequences, i.e. normal form C or KC.
 *
 * Walks the input one UTF-8 character at a time while holding back the most
 * recent start character ($startChar) and any combining characters that
 * could not be merged into it ($combining). Both are flushed to the output
 * whenever a new start character takes over. Pairs are composed through the
 * self::$utfCanonicalComp table; Hangul jamo are composed arithmetically.
 *
 * @private
 * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used where possible
 */
static function fastCompose($string) {
    UtfNormal::loadData();
    $len = strlen($string);
    $out = '';
    # Combining class of the previous character; -1 means nothing has been
    # read yet, which keeps the very first character out of the
    # start-char composition block below.
    $lastClass = -1;
    # Set to 1 once a trailing jamo has been merged into $startChar.
    $lastHangul = 0;
    $startChar = '';
    $combining = '';
    # First bytes of the lowest vowel and highest trailing jamo encodings;
    # used as a cheap lead-byte test before the full Hangul string comparisons.
    $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
    $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
    for ($i = 0; $i < $len; $i++) {
        $c = $string[$i];
        $n = ord($c);
        if ($n < 0x80) {
            # ASCII: never a combining character, and no composition is
            # attempted. Flush what is pending and make this the new start char.
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            continue;
        } elseif ($n >= 0xf0) {
            # Lead byte of a four-byte sequence.
            $c = substr($string, $i, 4);
            $i += 3;
        } elseif ($n >= 0xe0) {
            # Lead byte of a three-byte sequence.
            $c = substr($string, $i, 3);
            $i += 2;
        } elseif ($n >= 0xc0) {
            # Lead byte of a two-byte sequence.
            $c = substr($string, $i, 2);
            $i++;
        }
        # Lookup key for the composition table: pending start char + this char.
        $pair = $startChar . $c;
        # ASCII was handled above, so this only excludes a lone 0x80 byte.
        if ($n > 0x80) {
            if (isset(self::$utfCombiningClass[$c])) {
                # A combining char: merge it into the start char when the pair
                # has a composition and the previous character's class is
                # lower than this one's; otherwise keep it pending.
                # NOTE(review): empty() is also true for the string '0', so a
                # start char of '0' is treated like "no start char" - presumably
                # harmless because no composition pair begins with '0'; confirm
                # against the data tables.
                $class = self::$utfCombiningClass[$c];
                if (!empty($startChar) && $lastClass < $class && $class > 0 && isset(self::$utfCanonicalComp[$pair])) {
                    $startChar = self::$utfCanonicalComp[$pair];
                    # The merged result counts as a start char again.
                    $class = 0;
                } else {
                    $combining .= $c;
                }
                $lastClass = $class;
                $lastHangul = 0;
                continue;
            }
        }
        # New start char. It may only compose with the pending start char when
        # the previous character had class 0.
        if ($lastClass == 0) {
            if (isset(self::$utfCanonicalComp[$pair])) {
                $startChar = self::$utfCanonicalComp[$pair];
                $lastHangul = 0;
                continue;
            }
            if ($n >= $x1 && $n <= $x2) {
                # WARNING: the Hangul code is slow, and deliberately inlined:
                # calling out to clean helper functions performed even worse.
                # Lookup tables are marginally faster, but require a lot of space.
                #
                if ($c >= UTF8_HANGUL_VBASE && $c <= UTF8_HANGUL_VEND && $startChar >= UTF8_HANGUL_LBASE && $startChar <= UTF8_HANGUL_LEND) {
                    # Leading jamo followed by a vowel jamo: build the syllable.
                    # Byte-level shortcut for:
                    #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                    #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                    # NOTE(review): assumes the third byte of the first leading
                    # jamo is 0x80 and of the first vowel jamo is 0xa1 - confirm
                    # against the constants in UtfNormalDefines.php.
                    $lIndex = ord($startChar[2]) - 0x80;
                    $vIndex = ord($c[2]) - 0xa1;
                    $hangulPoint = UNICODE_HANGUL_FIRST + UNICODE_HANGUL_TCOUNT * (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
                    # Hardcode the limited-range UTF-8 conversion; the result
                    # is always encoded as three bytes:
                    $startChar = chr($hangulPoint >> 12 & 0xf | 0xe0) . chr($hangulPoint >> 6 & 0x3f | 0x80) . chr($hangulPoint & 0x3f | 0x80);
                    $lastHangul = 0;
                    continue;
                } elseif ($c >= UTF8_HANGUL_TBASE && $c <= UTF8_HANGUL_TEND && $startChar >= UTF8_HANGUL_FIRST && $startChar <= UTF8_HANGUL_LAST && !$lastHangul) {
                    # Syllable followed by a trailing jamo, and no trailing jamo
                    # has been merged into this syllable yet.
                    # Byte-level shortcut for:
                    # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                    $tIndex = ord($c[2]) - 0xa7;
                    if ($tIndex < 0) {
                        # Third byte lies below 0xa7: handled as a jamo at or
                        # above 0x11c0, so the offset restarts from byte 0x80.
                        $tIndex = ord($c[2]) - 0x80 + (0x11c0 - 0x11a7);
                    }
                    # Increment the code point by $tIndex, without
                    # the function overhead of decoding and recoding UTF-8.
                    # Continuation bytes end at 0xbf, so overflow is carried
                    # into the middle byte and, if needed, into the lead byte.
                    $tail = ord($startChar[2]) + $tIndex;
                    if ($tail > 0xbf) {
                        $tail -= 0x40;
                        $mid = ord($startChar[1]) + 1;
                        if ($mid > 0xbf) {
                            $startChar[0] = chr(ord($startChar[0]) + 1);
                            $mid -= 0x40;
                        }
                        $startChar[1] = chr($mid);
                    }
                    $startChar[2] = chr($tail);
                    # If there's another jamo char after this, *don't* try to merge it.
                    $lastHangul = 1;
                    continue;
                }
            }
        }
        # Nothing composed: flush the pending start char and its combining
        # characters, then start over with the current character.
        $out .= $startChar;
        $out .= $combining;
        $startChar = $c;
        $combining = '';
        $lastClass = 0;
        $lastHangul = 0;
    }
    # Flush whatever is still pending at the end of the input.
    $out .= $startChar . $combining;
    return $out;
}