/**
 * Write the "ml" normalisation table.
 *
 * Converts a fixed table of hex code point sequences to UTF-8 with
 * hexSequenceToUtf8(), serializes the resulting source => destination map
 * to $IP/serialized/normalize-ml.ser and prints how many pairs were stored.
 */
function generateMalayalam()
 {
     global $IP;

     // Keys are the hex sequences to look for, values the hex code point
     // each one maps to. NOTE(review): presumably the Malayalam chillu
     // letters and their older "consonant + virama + ZWJ" spellings --
     // confirm against the Unicode charts.
     $hexTable = array(
         '0D23 0D4D 200D' => '0D7A',
         '0D28 0D4D 200D' => '0D7B',
         '0D30 0D4D 200D' => '0D7C',
         '0D32 0D4D 200D' => '0D7D',
         '0D33 0D4D 200D' => '0D7E',
         '0D15 0D4D 200D' => '0D7F',
     );

     // Same table, with both sides converted to UTF-8 strings.
     $utf8Table = array();
     foreach ($hexTable as $fromHex => $toHex) {
         $fromUtf8 = hexSequenceToUtf8($fromHex);
         $utf8Table[$fromUtf8] = hexSequenceToUtf8($toHex);
     }

     file_put_contents("{$IP}/serialized/normalize-ml.ser", serialize($utf8Table));

     $pairCount = count($utf8Table);
     echo "ml: " . $pairCount . " pairs written.\n";
 }
 /**
  * Build the "ar" normalisation table from UnicodeData.txt.
  *
  * Reads the file named by --unicode-data-file (or ./UnicodeData.txt when
  * the option is not given). For every code point in U+FB50..U+FDFF and
  * U+FE70..U+FEFF (the Arabic Presentation Forms blocks) that has a tagged
  * decomposition, it records the mapping from the character to its
  * decomposition, both as UTF-8. The map is serialized to
  * $IP/serialized/normalize-ar.ser and the number of pairs is printed.
  *
  * Exits with status 1 when the data file cannot be found or opened, or
  * when the output file cannot be written. Lines that are too short or
  * whose decomposition field cannot be parsed are reported and skipped.
  */
 public function execute()
 {
     global $IP;

     if (!$this->hasOption('unicode-data-file')) {
         $dataFile = 'UnicodeData.txt';
         if (!file_exists($dataFile)) {
             $this->error("Unable to find UnicodeData.txt. Please specify " . "its location with --unicode-data-file=<FILE>");
             exit(1);
         }
     } else {
         $dataFile = $this->getOption('unicode-data-file');
         if (!file_exists($dataFile)) {
             $this->error('Unable to find the specified data file.');
             exit(1);
         }
     }
     $file = fopen($dataFile, 'r');
     if (!$file) {
         $this->error('Unable to open the data file.');
         exit(1);
     }
     // For the file format, see http://www.unicode.org/reports/tr44/
     $fieldNames = array('Code', 'Name', 'General_Category', 'Canonical_Combining_Class', 'Bidi_Class', 'Decomposition_Type_Mapping', 'Numeric_Type_Value_6', 'Numeric_Type_Value_7', 'Numeric_Type_Value_8', 'Bidi_Mirrored', 'Unicode_1_Name', 'ISO_Comment', 'Simple_Uppercase_Mapping', 'Simple_Lowercase_Mapping', 'Simple_Titlecase_Mapping');
     $fieldCount = count($fieldNames);
     $pairs = array();
     $lineNum = 0;
     while (false !== ($line = fgets($file))) {
         ++$lineNum;
         # Strip comments
         $line = trim(substr($line, 0, strcspn($line, '#')));
         if ($line === '') {
             continue;
         }
         # Split fields
         $numberedData = explode(';', $line);
         if (count($numberedData) < $fieldCount) {
             // A short line would otherwise yield undefined offsets and
             // null fields further down; report it and move on.
             $this->error("Too few fields on line {$lineNum}");
             $this->error($line);
             continue;
         }
         $data = array_combine($fieldNames, array_slice($numberedData, 0, $fieldCount));
         // hexdec() gives a real number, so the range test below compares
         // numbers with numbers instead of a numeric string with an int.
         $code = hexdec($data['Code']);
         $inRange = ($code >= 0xfb50 && $code <= 0xfdff) || ($code >= 0xfe70 && $code <= 0xfeff);
         if (!$inRange) {
             continue;
         }
         if ($data['Decomposition_Type_Mapping'] === '') {
             // No decomposition
             continue;
         }
         // Expect "<tag> HEX HEX ..."; $m[2] is the hex sequence after the tag.
         if (!preg_match('/^ *(<\\w*>) +([0-9A-F ]*)$/', $data['Decomposition_Type_Mapping'], $m)) {
             $this->error("Can't parse Decomposition_Type/Mapping on line {$lineNum}");
             $this->error($line);
             continue;
         }
         $source = hexSequenceToUtf8($data['Code']);
         $dest = hexSequenceToUtf8($m[2]);
         $pairs[$source] = $dest;
     }
     // Release the handle once the whole file has been consumed.
     fclose($file);

     $outFile = "{$IP}/serialized/normalize-ar.ser";
     if (file_put_contents($outFile, serialize($pairs)) === false) {
         // Do not claim success when nothing was written.
         $this->error("Unable to write {$outFile}");
         exit(1);
     }
     echo "ar: " . count($pairs) . " pairs written.\n";
 }
Example #3
0
        $combiningClass[$source] = intval($canonicalCombiningClass);
    }
    if ($decompositionMapping === '') {
        continue;
    }
    if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) {
        # Compatibility decomposition
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8($decompositionMapping);
    $compatibilityDecomp[$source] = $dest;
    if ($canonical) {
        $canonicalDecomp[$source] = $dest;
        if (empty($exclude[$source])) {
            $canonicalComp[$dest] = $source;
        }
    }
    #print "$codepoint | $canonicalCombiningClasses | $decompositionMapping\n";
}
fclose($in);
print "Recursively expanding canonical mappings...\n";
$changed = 42;
$pass = 1;
while ($changed > 0) {
    print "pass {$pass}\n";
 /**
  * Build the simple upper/lower case tables from UnicodeData.txt.
  *
  * Reads the file named by --unicode-data-file (or ./UnicodeData.txt when
  * the option is not given). For each entry with a non-empty
  * Simple_Uppercase_Mapping or Simple_Lowercase_Mapping field, it records
  * the mapping from the character to its mapped form, both as UTF-8. The
  * result is serialized to $IP/serialized/Utf8Case.ser as an array with the
  * keys 'wikiUpperChars' and 'wikiLowerChars'.
  *
  * Exits with status 1 when the data file cannot be found or opened, or
  * when the output file cannot be written. Lines with too few fields are
  * reported and skipped.
  */
 public function execute()
 {
     global $IP;

     if (!$this->hasOption('unicode-data-file')) {
         $dataFile = 'UnicodeData.txt';
         if (!file_exists($dataFile)) {
             $this->error("Unable to find UnicodeData.txt. Please specify " . "its location with --unicode-data-file=<FILE>");
             exit(1);
         }
     } else {
         $dataFile = $this->getOption('unicode-data-file');
         if (!file_exists($dataFile)) {
             $this->error('Unable to find the specified data file.');
             exit(1);
         }
     }
     $file = fopen($dataFile, 'r');
     if (!$file) {
         $this->error('Unable to open the data file.');
         exit(1);
     }
     // For the file format, see http://www.unicode.org/reports/tr44/
     $fieldNames = array('Code', 'Name', 'General_Category', 'Canonical_Combining_Class', 'Bidi_Class', 'Decomposition_Type_Mapping', 'Numeric_Type_Value_6', 'Numeric_Type_Value_7', 'Numeric_Type_Value_8', 'Bidi_Mirrored', 'Unicode_1_Name', 'ISO_Comment', 'Simple_Uppercase_Mapping', 'Simple_Lowercase_Mapping', 'Simple_Titlecase_Mapping');
     $fieldCount = count($fieldNames);
     $upper = array();
     $lower = array();
     $lineNum = 0;
     while (false !== ($line = fgets($file))) {
         ++$lineNum;
         # Strip comments
         $line = trim(substr($line, 0, strcspn($line, '#')));
         if ($line === '') {
             continue;
         }
         # Split fields
         $numberedData = explode(';', $line);
         if (count($numberedData) < $fieldCount) {
             // A short line would otherwise yield undefined offsets and
             // null fields further down; report it and move on.
             $this->error("Too few fields on line {$lineNum}");
             $this->error($line);
             continue;
         }
         $data = array_combine($fieldNames, array_slice($numberedData, 0, $fieldCount));
         $source = hexSequenceToUtf8($data['Code']);
         // Compare against '' explicitly: a field is either empty or a hex
         // code point, and truthiness would also reject the string '0'.
         if ($data['Simple_Uppercase_Mapping'] !== '') {
             $upper[$source] = hexSequenceToUtf8($data['Simple_Uppercase_Mapping']);
         }
         if ($data['Simple_Lowercase_Mapping'] !== '') {
             $lower[$source] = hexSequenceToUtf8($data['Simple_Lowercase_Mapping']);
         }
     }
     // Release the handle once the whole file has been consumed.
     fclose($file);

     $outFile = "{$IP}/serialized/Utf8Case.ser";
     $tables = array('wikiUpperChars' => $upper, 'wikiLowerChars' => $lower);
     if (file_put_contents($outFile, serialize($tables)) === false) {
         // Surface a failed write instead of finishing silently.
         $this->error("Unable to write {$outFile}");
         exit(1);
     }
 }