# This benchmark is a console tool: bail out under any other SAPI.
if (PHP_SAPI !== 'cli') {
    exit("Run me from the command line please.\n");
}

# When --icu appears among the arguments, load the php_utfnormal.so
# extension at runtime before pulling in the PHP sources.
if (isset($_SERVER['argv'])) {
    if (in_array('--icu', $_SERVER['argv'])) {
        dl('php_utfnormal.so');
    }
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';

define('BENCH_CYCLES', 1);
define('BIGSIZE', 10 * 1024 * 1024); # 10 MiB

# Allow BIGSIZE bytes plus a further 120 MiB.
ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024);

# Sample file path => description passed along to benchmarkTest().
$testfiles = array(
    'testdata/washington.txt' => 'English text',
    'testdata/berlin.txt' => 'German text',
    'testdata/bulgakov.txt' => 'Russian text',
    'testdata/tokyo.txt' => 'Japanese text',
    'testdata/young.txt' => 'Korean text'
);

$normalizer = new UtfNormal();
UtfNormal::loadData();

# Run the benchmark once per sample file, in the order listed above.
foreach ($testfiles as $file => $desc) {
    benchmarkTest($normalizer, $file, $desc);
}
# -------
function benchmarkTest(&$u, $filename, $desc)
{
    print "Testing {$filename} ({$desc})...\n";
    $data = file_get_contents($filename);
    $all = $data;
    while (strlen($all) < BIGSIZE) {
        $all .= $all;
    }
    $data = $all;
    echo "Data is " . strlen($data) . " bytes.\n";
    $forms = array('quickIsNFCVerify', 'cleanUp');
Example #2
0
 /**
  * Produces canonically composed sequences, i.e. normal form C or KC.
  *
  * @private
  * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
  * @return string a UTF-8 string with canonical precomposed characters used where possible
  */
 static function fastCompose($string)
 {
     # Called before the static tables self::$utfCombiningClass and
     # self::$utfCanonicalComp are read below (presumably this is what
     # populates them -- confirm against loadData()).
     UtfNormal::loadData();
     $len = strlen($string);
     # State carried across characters:
     #   $out        - finished output; segments are appended once no more
     #                 composition with the pending start char is possible
     #   $lastClass  - combining class of the previously handled character
     #                 (0 after a start char or a successful composition;
     #                 -1 before anything has been read)
     #   $lastHangul - set to 1 right after a trailing jamo was merged into
     #                 a syllable, so a second trailing jamo is not merged
     #   $startChar  - the pending start character, possibly already composed
     #   $combining  - combining characters following $startChar that could
     #                 not be composed into it
     $out = '';
     $lastClass = -1;
     $lastHangul = 0;
     $startChar = '';
     $combining = '';
     # Lead bytes of the first vowel jamo and the last trailing jamo; used
     # below as a cheap single-byte pre-filter before the more expensive
     # string range comparisons in the Hangul branch.
     $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
     $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
     # Walk the string byte-wise; $i is advanced past the continuation
     # bytes whenever a multi-byte sequence is extracted into $c.
     for ($i = 0; $i < $len; $i++) {
         $c = $string[$i];
         $n = ord($c);
         if ($n < 0x80) {
             # No combining characters here...
             # A single-byte character is never looked up in the tables:
             # flush the pending segment and make it the new start char.
             $out .= $startChar;
             $out .= $combining;
             $startChar = $c;
             $combining = '';
             $lastClass = 0;
             continue;
         } elseif ($n >= 0xf0) {
             # Lead byte of a 4-byte sequence.
             $c = substr($string, $i, 4);
             $i += 3;
         } elseif ($n >= 0xe0) {
             # Lead byte of a 3-byte sequence.
             $c = substr($string, $i, 3);
             $i += 2;
         } elseif ($n >= 0xc0) {
             # Lead byte of a 2-byte sequence.
             $c = substr($string, $i, 2);
             $i++;
         }
         # Lookup key for the composition table: pending start char
         # immediately followed by the current character.
         $pair = $startChar . $c;
         if ($n > 0x80) {
             if (isset(self::$utfCombiningClass[$c])) {
                 # A combining char; see what we can do with it
                 $class = self::$utfCombiningClass[$c];
                 # Compose only if there is a start char, the previous
                 # character's class is strictly lower than this one's,
                 # the class is non-zero, and the table has an entry for
                 # the pair. Note that empty() also treats the string '0'
                 # as "no start char".
                 if (!empty($startChar) && $lastClass < $class && $class > 0 && isset(self::$utfCanonicalComp[$pair])) {
                     $startChar = self::$utfCanonicalComp[$pair];
                     # The composed result counts as a start char again.
                     $class = 0;
                 } else {
                     # Could not compose: keep the mark to be emitted after
                     # the start char.
                     $combining .= $c;
                 }
                 $lastClass = $class;
                 $lastHangul = 0;
                 continue;
             }
         }
         # New start char
         # Two adjacent start chars may still compose, but only when the
         # previous character had class 0 (nothing uncomposed in between).
         if ($lastClass == 0) {
             if (isset(self::$utfCanonicalComp[$pair])) {
                 $startChar = self::$utfCanonicalComp[$pair];
                 $lastHangul = 0;
                 continue;
             }
             if ($n >= $x1 && $n <= $x2) {
                 # WARNING: Hangul code is painfully slow.
                 # I apologize for this ugly, ugly code; however
                 # performance is even more teh suck if we call
                 # out to nice clean functions. Lookup tables are
                 # marginally faster, but require a lot of space.
                 #
                 # Case 1: pending leading jamo (L) followed by a vowel
                 # jamo (V) -> compose into an LV syllable.
                 if ($c >= UTF8_HANGUL_VBASE && $c <= UTF8_HANGUL_VEND && $startChar >= UTF8_HANGUL_LBASE && $startChar <= UTF8_HANGUL_LEND) {
                     #
                     # Reference (slow) form of the two lines that follow:
                     #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                     #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                     # The indexes are read straight from the third UTF-8
                     # byte; 0x80 and 0xa1 are presumably the third bytes of
                     # the L and V base characters -- confirm against the
                     # UTF8_HANGUL_* definitions.
                     $lIndex = ord($startChar[2]) - 0x80;
                     $vIndex = ord($c[2]) - 0xa1;
                     $hangulPoint = UNICODE_HANGUL_FIRST + UNICODE_HANGUL_TCOUNT * (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
                     # Hardcode the limited-range UTF-8 conversion:
                     # (always emits a 3-byte sequence: 4 + 6 + 6 payload bits)
                     $startChar = chr($hangulPoint >> 12 & 0xf | 0xe0) . chr($hangulPoint >> 6 & 0x3f | 0x80) . chr($hangulPoint & 0x3f | 0x80);
                     $lastHangul = 0;
                     continue;
                 # Case 2: pending precomposed syllable followed by a
                 # trailing jamo (T), and no trailing jamo merged yet.
                 } elseif ($c >= UTF8_HANGUL_TBASE && $c <= UTF8_HANGUL_TEND && $startChar >= UTF8_HANGUL_FIRST && $startChar <= UTF8_HANGUL_LAST && !$lastHangul) {
                     # Reference (slow) form of the computation below:
                     # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                     $tIndex = ord($c[2]) - 0xa7;
                     if ($tIndex < 0) {
                         # Third byte is below 0xa7: the jamo lies in the
                         # next 64-code-point block (from 0x11c0 on), so
                         # rebase the offset from that block's start.
                         $tIndex = ord($c[2]) - 0x80 + (0x11c0 - 0x11a7);
                     }
                     # Increment the code point by $tIndex, without
                     # the function overhead of decoding and recoding UTF-8
                     #
                     # Add to the last continuation byte and propagate the
                     # carry: a continuation byte holds 0x80..0xbf, so an
                     # overflow wraps by 0x40 and bumps the byte before it.
                     $tail = ord($startChar[2]) + $tIndex;
                     if ($tail > 0xbf) {
                         $tail -= 0x40;
                         $mid = ord($startChar[1]) + 1;
                         if ($mid > 0xbf) {
                             # Middle byte overflowed too: carry into the
                             # lead byte.
                             $startChar[0] = chr(ord($startChar[0]) + 1);
                             $mid -= 0x40;
                         }
                         $startChar[1] = chr($mid);
                     }
                     $startChar[2] = chr($tail);
                     # If there's another jamo char after this, *don't* try to merge it.
                     $lastHangul = 1;
                     continue;
                 }
             }
         }
         # Nothing composed: flush the pending segment and start a new one
         # with the current character.
         $out .= $startChar;
         $out .= $combining;
         $startChar = $c;
         $combining = '';
         $lastClass = 0;
         $lastHangul = 0;
     }
     # Flush whatever segment is still pending at end of input.
     $out .= $startChar . $combining;
     return $out;
 }