/**
 * Write the Malayalam normalisation table.
 *
 * The table maps six three-code-point sequences (a consonant followed by
 * U+0D4D and U+200D) to the single code points U+0D7A..U+0D7F. Both sides
 * are converted with hexSequenceToUtf8() and the resulting map is stored,
 * serialized, in {$IP}/serialized/normalize-ml.ser. A one-line summary is
 * echoed afterwards.
 */
function generateMalayalam() {
    global $IP;

    // Hex code point sequence => hex code point of its replacement.
    $hexTable = array(
        '0D23 0D4D 200D' => '0D7A',
        '0D28 0D4D 200D' => '0D7B',
        '0D30 0D4D 200D' => '0D7C',
        '0D32 0D4D 200D' => '0D7D',
        '0D33 0D4D 200D' => '0D7E',
        '0D15 0D4D 200D' => '0D7F'
    );

    $utf8Map = array();
    foreach (array_keys($hexTable) as $fromHex) {
        $fromUtf8 = hexSequenceToUtf8($fromHex);
        $toUtf8 = hexSequenceToUtf8($hexTable[$fromHex]);
        $utf8Map[$fromUtf8] = $toUtf8;
    }

    $serialized = serialize($utf8Map);
    file_put_contents("{$IP}/serialized/normalize-ml.ser", $serialized);

    $written = count($utf8Map);
    echo "ml: " . $written . " pairs written.\n";
}
/**
 * Generate the Arabic normalisation table from UnicodeData.txt.
 *
 * The data file is taken from the --unicode-data-file option, or from
 * ./UnicodeData.txt when the option is absent. For every code point in
 * U+FB50..U+FDFF and U+FE70..U+FEFF that has a decomposition mapping, the
 * code point (as UTF-8) is mapped to its decomposition (as UTF-8). The
 * resulting map is serialized to {$IP}/serialized/normalize-ar.ser.
 *
 * The process exits with status 1 when the data file cannot be found or
 * opened, or when the output file cannot be written.
 *
 * @return void
 */
public function execute() {
    global $IP;

    if (!$this->hasOption('unicode-data-file')) {
        $dataFile = 'UnicodeData.txt';
        if (!file_exists($dataFile)) {
            $this->error("Unable to find UnicodeData.txt. Please specify "
                . "its location with --unicode-data-file=<FILE>");
            exit(1);
        }
    } else {
        $dataFile = $this->getOption('unicode-data-file');
        if (!file_exists($dataFile)) {
            $this->error('Unable to find the specified data file.');
            exit(1);
        }
    }

    $file = fopen($dataFile, 'r');
    if (!$file) {
        $this->error('Unable to open the data file.');
        exit(1);
    }

    // For the file format, see http://www.unicode.org/reports/tr44/
    $fieldNames = array(
        'Code',
        'Name',
        'General_Category',
        'Canonical_Combining_Class',
        'Bidi_Class',
        'Decomposition_Type_Mapping',
        'Numeric_Type_Value_6',
        'Numeric_Type_Value_7',
        'Numeric_Type_Value_8',
        'Bidi_Mirrored',
        'Unicode_1_Name',
        'ISO_Comment',
        'Simple_Uppercase_Mapping',
        'Simple_Lowercase_Mapping',
        'Simple_Titlecase_Mapping'
    );
    $fieldCount = count($fieldNames);

    $pairs = array();
    $lineNum = 0;
    while (false !== ($line = fgets($file))) {
        ++$lineNum;

        # Strip comments
        $line = trim(substr($line, 0, strcspn($line, '#')));
        if ($line === '') {
            continue;
        }

        # Split fields. Short lines are padded with empty strings so that a
        # truncated record yields empty fields instead of undefined offsets;
        # surplus fields are ignored.
        $numberedData = array_pad(explode(';', $line), $fieldCount, '');
        $data = array_combine($fieldNames, array_slice($numberedData, 0, $fieldCount));

        // Only the two Arabic presentation-form ranges are of interest.
        $code = hexdec($data['Code']);
        $inRange = ($code >= 0xfb50 && $code <= 0xfdff)
            || ($code >= 0xfe70 && $code <= 0xfeff);
        if (!$inRange) {
            continue;
        }

        if ($data['Decomposition_Type_Mapping'] === '') {
            // No decomposition
            continue;
        }

        // Expected shape: "<tag> XXXX XXXX ..."; $m[2] is the hex sequence.
        if (!preg_match('/^ *(<\\w*>) +([0-9A-F ]*)$/', $data['Decomposition_Type_Mapping'], $m)) {
            $this->error("Can't parse Decomposition_Type/Mapping on line {$lineNum}");
            $this->error($line);
            continue;
        }

        $source = hexSequenceToUtf8($data['Code']);
        $dest = hexSequenceToUtf8($m[2]);
        $pairs[$source] = $dest;
    }

    // Release the input handle; the original never closed it.
    fclose($file);

    $outFile = "{$IP}/serialized/normalize-ar.ser";
    if (file_put_contents($outFile, serialize($pairs)) === false) {
        // Do not claim success when nothing was written.
        $this->error("Unable to write {$outFile}.");
        exit(1);
    }
    echo "ar: " . count($pairs) . " pairs written.\n";
}
$combiningClass[$source] = intval($canonicalCombiningClass); } if ($decompositionMapping === '') { continue; } if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) { # Compatibility decomposition $canonical = false; $decompositionMapping = $matches[2]; $compat++; } else { $canonical = true; $canon++; } $total++; $dest = hexSequenceToUtf8($decompositionMapping); $compatibilityDecomp[$source] = $dest; if ($canonical) { $canonicalDecomp[$source] = $dest; if (empty($exclude[$source])) { $canonicalComp[$dest] = $source; } } #print "$codepoint | $canonicalCombiningClasses | $decompositionMapping\n"; } fclose($in); print "Recursively expanding canonical mappings...\n"; $changed = 42; $pass = 1; while ($changed > 0) { print "pass {$pass}\n";
/**
 * Generate the upper/lower case conversion tables from UnicodeData.txt.
 *
 * The data file is taken from the --unicode-data-file option, or from
 * ./UnicodeData.txt when the option is absent. Every record with a
 * non-empty Simple_Uppercase_Mapping or Simple_Lowercase_Mapping field
 * contributes an entry (code point as UTF-8 => mapped code point as UTF-8)
 * to the respective table. Both tables are serialized together, under the
 * keys 'wikiUpperChars' and 'wikiLowerChars', to
 * {$IP}/serialized/Utf8Case.ser.
 *
 * The process exits with status 1 when the data file cannot be found or
 * opened, or when the output file cannot be written.
 *
 * @return void
 */
public function execute() {
    global $IP;

    if (!$this->hasOption('unicode-data-file')) {
        $dataFile = 'UnicodeData.txt';
        if (!file_exists($dataFile)) {
            $this->error("Unable to find UnicodeData.txt. Please specify "
                . "its location with --unicode-data-file=<FILE>");
            exit(1);
        }
    } else {
        $dataFile = $this->getOption('unicode-data-file');
        if (!file_exists($dataFile)) {
            $this->error('Unable to find the specified data file.');
            exit(1);
        }
    }

    $file = fopen($dataFile, 'r');
    if (!$file) {
        $this->error('Unable to open the data file.');
        exit(1);
    }

    // For the file format, see http://www.unicode.org/reports/tr44/
    $fieldNames = array(
        'Code',
        'Name',
        'General_Category',
        'Canonical_Combining_Class',
        'Bidi_Class',
        'Decomposition_Type_Mapping',
        'Numeric_Type_Value_6',
        'Numeric_Type_Value_7',
        'Numeric_Type_Value_8',
        'Bidi_Mirrored',
        'Unicode_1_Name',
        'ISO_Comment',
        'Simple_Uppercase_Mapping',
        'Simple_Lowercase_Mapping',
        'Simple_Titlecase_Mapping'
    );
    $fieldCount = count($fieldNames);

    $upper = array();
    $lower = array();
    $lineNum = 0;
    while (false !== ($line = fgets($file))) {
        ++$lineNum;

        # Strip comments
        $line = trim(substr($line, 0, strcspn($line, '#')));
        if ($line === '') {
            continue;
        }

        # Split fields. Short lines are padded with empty strings so that a
        # truncated record yields empty fields instead of undefined offsets;
        # surplus fields are ignored.
        $numberedData = array_pad(explode(';', $line), $fieldCount, '');
        $data = array_combine($fieldNames, array_slice($numberedData, 0, $fieldCount));

        $source = hexSequenceToUtf8($data['Code']);

        // An empty field means the code point has no mapping of that kind.
        if ($data['Simple_Uppercase_Mapping'] !== '') {
            $upper[$source] = hexSequenceToUtf8($data['Simple_Uppercase_Mapping']);
        }
        if ($data['Simple_Lowercase_Mapping'] !== '') {
            $lower[$source] = hexSequenceToUtf8($data['Simple_Lowercase_Mapping']);
        }
    }

    // Release the input handle; the original never closed it.
    fclose($file);

    $outFile = "{$IP}/serialized/Utf8Case.ser";
    $tables = array(
        'wikiUpperChars' => $upper,
        'wikiLowerChars' => $lower
    );
    if (file_put_contents($outFile, serialize($tables)) === false) {
        // Surface the failure instead of silently leaving a stale or missing file.
        $this->error("Unable to write {$outFile}.");
        exit(1);
    }
}