/**
 * Lazily parse the stored HTML into a DOM tree and return it.
 *
 * The document is built on the first call and cached in $this->doc;
 * later calls return the cached instance.
 *
 * @return DOMDocument DOM to manipulate
 */
public function getDoc() {
    if (!$this->doc) {
        // DOMDocument::loadHTML apparently isn't very good with encodings, so
        // convert input to ASCII by encoding every non-ASCII character
        // (code point 128 and above) as an entity.
        if (function_exists('mb_convert_encoding')) {
            $html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
        } else {
            $html = preg_replace_callback('/[\\x{80}-\\x{10ffff}]/u', function ($m) {
                return '&#' . UtfNormal\Utils::utf8ToCodepoint($m[0]) . ';';
            }, $this->html);
        }

        // Workaround for bug that caused spaces before references
        // to disappear during processing: https://phabricator.wikimedia.org/T55086
        // The space is rewritten as a numeric entity so the parser cannot
        // drop it as insignificant whitespace in front of a tag.
        // TODO: Please replace with a better fix if one can be found.
        $html = str_replace(' <', '&#32;<', $html);

        // Collect libxml parse errors internally instead of emitting warnings,
        // and keep external entity loading disabled while parsing.
        libxml_use_internal_errors(true);
        $loader = libxml_disable_entity_loader();
        $this->doc = new DOMDocument();
        $this->doc->strictErrorChecking = false;
        $this->doc->loadHTML($html);
        // Restore the entity loader setting that was active before parsing.
        libxml_disable_entity_loader($loader);
        libxml_use_internal_errors(false);
        $this->doc->encoding = 'UTF-8';
    }

    return $this->doc;
}
/**
 * Build the "ml" normalization table and serialize it to
 * {$IP}/serialized/normalize-ml.ser.
 *
 * Each entry maps a three-code-point sequence to a single code point; both
 * sides are written as hexadecimal code points and converted to UTF-8
 * before being stored.
 */
public function execute() {
    global $IP;

    // Source sequence (space-separated hex code points) => replacement code point.
    $hexTable = array(
        '0D23 0D4D 200D' => '0D7A',
        '0D28 0D4D 200D' => '0D7B',
        '0D30 0D4D 200D' => '0D7C',
        '0D32 0D4D 200D' => '0D7D',
        '0D33 0D4D 200D' => '0D7E',
        '0D15 0D4D 200D' => '0D7F',
    );

    $pairs = array();
    foreach ($hexTable as $fromHex => $toHex) {
        $from = UtfNormal\Utils::hexSequenceToUtf8($fromHex);
        $to = UtfNormal\Utils::hexSequenceToUtf8($toHex);
        $pairs[$from] = $to;
    }

    $target = "{$IP}/serialized/normalize-ml.ser";
    file_put_contents($target, serialize($pairs));

    echo "ml: " . count($pairs) . " pairs written.\n";
}
/**
 * Get the first character of a string.
 *
 * The leading UTF-8 sequence (1 to 4 bytes) is returned as-is, except for
 * precomposed Hangul syllables, which are replaced by a jamo chosen from
 * the syllable's code point range. An empty string is returned when the
 * input does not start with a recognised sequence.
 *
 * @param string $s
 * @return string
 */
function firstChar($s) {
    $found = array();
    preg_match('/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/', $s, $found);

    if (!isset($found[1])) {
        return '';
    }

    $char = $found[1];

    // Hangul syllables are all three bytes long in UTF-8; anything else
    // can be returned immediately.
    if (strlen($char) != 3) {
        return $char;
    }

    // Break down Hangul syllables to grab the first jamo
    $code = UtfNormal\Utils::utf8ToCodepoint($char);
    if ($code < 0xac00 || 0xd7a4 <= $code) {
        return $char;
    }

    // Exclusive upper bound of each range => jamo returned for that range.
    // The entries must stay in ascending order.
    $jamoByUpperBound = array(
        0xb098 => "ㄱ",
        0xb2e4 => "ㄴ",
        0xb77c => "ㄷ",
        0xb9c8 => "ㄹ",
        0xbc14 => "ㅁ",
        0xc0ac => "ㅂ",
        0xc544 => "ㅅ",
        0xc790 => "ㅇ",
        0xcc28 => "ㅈ",
        0xce74 => "ㅊ",
        0xd0c0 => "ㅋ",
        0xd30c => "ㅌ",
        0xd558 => "ㅍ",
    );

    foreach ($jamoByUpperBound as $upperBound => $jamo) {
        if ($code < $upperBound) {
            return $jamo;
        }
    }

    return "ㅎ";
}
/**
 * Generate the list of group header characters and write it, serialized,
 * to {$IP}/serialized/first-letters-root.ser.
 *
 * Steps:
 *  1. Read allkeys.txt from $this->dataDir and replace the entries in
 *     $this->weights with the primary weights found there.
 *  2. Group code points by identical primary weight into $this->groups.
 *  3. Drop groups whose weight extends another group's weight.
 *  4. Pick one header character per group using ICU collation.
 *
 * Exits with status 1 if either file cannot be opened.
 */
function generateFirstChars() {
    $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
    if (!$file) {
        $this->error("Unable to open allkeys.txt");
        exit(1);
    }
    global $IP;
    $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
    if (!$outFile) {
        $this->error("Unable to open output file first-letters-root.ser");
        exit(1);
    }

    $goodTertiaryChars = array();

    // For each character with an entry in allkeys.txt, overwrite the implicit
    // entry in $this->weights that came from the UCD.
    // Also gather a list of tertiary weights, for use in selecting the group header
    while (false !== ($line = fgets($file))) {
        // We're only interested in single-character weights, pick them out with a regex
        $line = trim($line);
        if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
            continue;
        }

        $cp = hexdec($m[1]);
        $allWeights = trim($m[2]);
        $primary = '';
        $tertiary = '';

        if (!isset($this->weights[$cp])) {
            // Non-printable, ignore
            continue;
        }
        foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
            // $levels[1] holds the hex weights of one collation element:
            // index 0 is the primary weight, index 2 the tertiary weight.
            preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $levels);
            if (!empty($levels[1])) {
                if ($levels[1][0] !== '0000') {
                    $primary .= '.' . $levels[1][0];
                }
                if ($levels[1][2] !== '0000') {
                    $tertiary .= '.' . $levels[1][2];
                }
            }
        }
        $this->weights[$cp] = $primary;
        if ($tertiary === '.0008' || $tertiary === '.000E') {
            $goodTertiaryChars[$cp] = true;
        }
    }
    fclose($file);

    // Identify groups of characters with the same primary weight
    $this->groups = array();
    asort($this->weights, SORT_STRING);
    $prevWeight = reset($this->weights);
    $group = array();
    foreach ($this->weights as $cp => $weight) {
        if ($weight !== $prevWeight) {
            $this->groups[$prevWeight] = $group;
            $prevWeight = $weight;
            if (isset($this->groups[$weight])) {
                $group = $this->groups[$weight];
            } else {
                $group = array();
            }
        }
        $group[] = $cp;
    }
    if ($group) {
        $this->groups[$prevWeight] = $group;
    }

    // If one character has a given primary weight sequence, and a second
    // character has a longer primary weight sequence with an initial
    // portion equal to the first character, then remove the second
    // character. This avoids having characters like U+A732 (double A)
    // polluting the basic latin sort area.
    foreach ($this->groups as $weight => $group) {
        if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
            if (isset($this->groups[$m[1]])) {
                unset($this->groups[$weight]);
            }
        }
    }

    ksort($this->groups, SORT_STRING);

    // Identify the header character in each group
    $headerChars = array();
    $prevChar = "";
    $tertiaryCollator = new Collator('root');
    $primaryCollator = new Collator('root');
    $primaryCollator->setStrength(Collator::PRIMARY);
    $numOutOfOrder = 0;
    foreach ($this->groups as $weight => $group) {
        $uncomposedChars = array();
        $goodChars = array();
        foreach ($group as $cp) {
            if (isset($goodTertiaryChars[$cp])) {
                $goodChars[] = $cp;
            }
            if (!isset($this->mappedChars[$cp])) {
                $uncomposedChars[] = $cp;
            }
        }
        // Prefer characters that are both "good tertiary" and not in
        // $this->mappedChars; fall back to the unmapped ones, then to
        // the whole group.
        $x = array_intersect($goodChars, $uncomposedChars);
        if (!$x) {
            $x = $uncomposedChars;
            if (!$x) {
                $x = $group;
            }
        }

        // Use ICU to pick the lowest sorting character in the selection
        $tertiaryCollator->sort($x);
        $cp = $x[0];

        $char = UtfNormal\Utils::codepointToUtf8($cp);
        $headerChars[] = $char;
        if ($primaryCollator->compare($char, $prevChar) <= 0) {
            $numOutOfOrder++;
            /*
            printf( "Out of order: U+%05X > U+%05X\n",
                utf8ToCodepoint( $prevChar ),
                utf8ToCodepoint( $char ) );
            */
        }
        $prevChar = $char;

        if ($this->debugOutFile) {
            fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('UtfNormal\\Utils::codepointToUtf8', $group))));
        }
    }

    print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";

    fwrite($outFile, serialize($headerChars));
    // Release the output handle explicitly, matching the input handle above.
    fclose($outFile);
}
/**
 * Decode a named entity to its UTF-8 character.
 *
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, the UTF-8
 * encoding of that character is returned. Otherwise the pseudo-entity
 * source (eg "&foo;") is returned. Aliases listed in
 * self::$htmlEntityAliases are resolved first, so the returned
 * pseudo-entity uses the resolved name.
 *
 * @param string $name
 * @return string
 */
static function decodeEntity($name) {
    $name = isset(self::$htmlEntityAliases[$name])
        ? self::$htmlEntityAliases[$name]
        : $name;

    if (!isset(self::$htmlEntities[$name])) {
        return "&{$name};";
    }

    $codepoint = self::$htmlEntities[$name];

    return UtfNormal\Utils::codepointToUtf8($codepoint);
}
/**
 * Build the "ar" normalization table from UnicodeData.txt and serialize it
 * to {$IP}/serialized/normalize-ar.ser.
 *
 * Only code points in U+FB50..U+FDFF and U+FE70..U+FEFF that carry a
 * decomposition mapping are included; each is mapped to the UTF-8 form of
 * its decomposition sequence.
 *
 * The data file is taken from --unicode-data-file, or UnicodeData.txt in
 * the current directory when the option is absent. Exits with status 1 if
 * the file cannot be found or opened.
 */
public function execute() {
    if (!$this->hasOption('unicode-data-file')) {
        $dataFile = 'UnicodeData.txt';
        if (!file_exists($dataFile)) {
            $this->error("Unable to find UnicodeData.txt. Please specify " . "its location with --unicode-data-file=<FILE>");
            exit(1);
        }
    } else {
        $dataFile = $this->getOption('unicode-data-file');
        if (!file_exists($dataFile)) {
            $this->error('Unable to find the specified data file.');
            exit(1);
        }
    }

    $file = fopen($dataFile, 'r');
    if (!$file) {
        $this->error('Unable to open the data file.');
        exit(1);
    }

    // For the file format, see http://www.unicode.org/reports/tr44/
    $fieldNames = array(
        'Code',
        'Name',
        'General_Category',
        'Canonical_Combining_Class',
        'Bidi_Class',
        'Decomposition_Type_Mapping',
        'Numeric_Type_Value_6',
        'Numeric_Type_Value_7',
        'Numeric_Type_Value_8',
        'Bidi_Mirrored',
        'Unicode_1_Name',
        'ISO_Comment',
        'Simple_Uppercase_Mapping',
        'Simple_Lowercase_Mapping',
        'Simple_Titlecase_Mapping',
    );

    $pairs = array();

    $lineNum = 0;
    while (false !== ($line = fgets($file))) {
        ++$lineNum;

        # Strip comments
        $line = trim(substr($line, 0, strcspn($line, '#')));
        if ($line === '') {
            continue;
        }

        # Split fields
        $numberedData = explode(';', $line);
        $data = array();
        foreach ($fieldNames as $number => $name) {
            $data[$name] = $numberedData[$number];
        }

        // hexdec() yields an integer, so the range test below compares
        // numbers rather than a numeric string against integers.
        $code = hexdec($data['Code']);
        if (($code >= 0xfb50 && $code <= 0xfdff) || ($code >= 0xfe70 && $code <= 0xfeff)) {
            if ($data['Decomposition_Type_Mapping'] === '') {
                // No decomposition
                continue;
            }
            // Expected form: "<tag> HHHH HHHH ..."; $m[2] is the hex sequence.
            if (!preg_match('/^ *(<\\w*>) +([0-9A-F ]*)$/', $data['Decomposition_Type_Mapping'], $m)) {
                $this->error("Can't parse Decomposition_Type/Mapping on line {$lineNum}");
                $this->error($line);
                continue;
            }

            $source = UtfNormal\Utils::hexSequenceToUtf8($data['Code']);
            $dest = UtfNormal\Utils::hexSequenceToUtf8($m[2]);
            $pairs[$source] = $dest;
        }
    }
    // The input is fully consumed; release the handle before writing output.
    fclose($file);

    global $IP;
    file_put_contents("{$IP}/serialized/normalize-ar.ser", serialize($pairs));
    echo "ar: " . count($pairs) . " pairs written.\n";
}
/**
 * Determine the letter a string should be sorted under.
 *
 * Returns an empty string for empty input or when the string sorts before
 * the first known letter. A leading CJK character is returned unchanged.
 *
 * @param string $string
 * @return string
 */
function getFirstLetter($string) {
    $string = strval($string);
    if ($string === '') {
        return '';
    }

    // Check for CJK. Only non-ASCII lead bytes need the code point lookup.
    $lead = mb_substr($string, 0, 1, 'UTF-8');
    if (ord($lead) > 0x7f) {
        if (self::isCjk(UtfNormal\Utils::utf8ToCodepoint($lead))) {
            return $lead;
        }
    }

    $key = $this->getPrimarySortKey($string);

    // Do a binary search to find the correct letter to sort under
    $letterCount = $this->getFirstLetterCount();
    $index = ArrayUtils::findLowerBound(
        array($this, 'getSortKeyByLetterIndex'),
        $letterCount,
        'strcmp',
        $key
    );

    // false means the key sorts before the first letter
    return $index === false ? '' : $this->getLetterByIndex($index);
}
/**
 * Determine the letter a string should be sorted under.
 *
 * Returns an empty string for empty input or when the string sorts before
 * the first known letter. A leading CJK character is returned unchanged.
 * When numeric collation is enabled and the sort letter starts with an
 * ASCII digit, the localized 'category-header-numerals' message is
 * returned instead.
 *
 * @param string $string
 * @return string
 */
public function getFirstLetter($string) {
    $string = strval($string);
    if ($string === '') {
        return '';
    }

    // If the first character is a CJK character, just return that character.
    $lead = mb_substr($string, 0, 1, 'UTF-8');
    if (ord($lead) > 0x7f) {
        if (self::isCjk(UtfNormal\Utils::utf8ToCodepoint($lead))) {
            return $lead;
        }
    }

    $key = $this->getPrimarySortKey($string);

    // Do a binary search to find the correct letter to sort under
    $letterCount = $this->getFirstLetterCount();
    $index = ArrayUtils::findLowerBound(
        [$this, 'getSortKeyByLetterIndex'],
        $letterCount,
        'strcmp',
        $key
    );
    if ($index === false) {
        // Before the first letter
        return '';
    }

    $sortLetter = $this->getLetterByIndex($index);
    if (!$this->useNumericCollation) {
        return $sortLetter;
    }

    // If the sort letter is a number, return '0–9' (or localized equivalent).
    // ASCII value of 0 is 48. ASCII value of 9 is 57.
    // Note that this also applies to non-Arabic numerals since they are
    // mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
    $leadByte = ord($sortLetter);
    if ($leadByte < 48 || $leadByte > 57) {
        return $sortLetter;
    }

    return wfMessage('category-header-numerals')->numParams(0, 9)->text();
}
/**
 * Reverse the previously applied transliteration of non-ASCII characters
 * back to UTF-8. Used to protect data from corruption by broken web browsers
 * as listed in $wgBrowserBlackList.
 *
 * Hexadecimal entities of the form "&#xHHHH;" whose first digit is not "0"
 * are decoded to the corresponding UTF-8 character. Sequences starting
 * with "&#x0" are left alone during the scan and turned back into a
 * literal "&#x" at the end.
 *
 * @param string $invalue
 * @return string
 */
private function unmakeSafe($invalue) {
    $result = "";
    $valueLength = strlen($invalue);
    for ($i = 0; $i < $valueLength; $i++) {
        // Check the length before reading the character after "&#x" so a
        // value that ends right after the prefix does not read past the end.
        $startsEntity = substr($invalue, $i, 3) == "&#x"
            && ($i + 3 >= $valueLength || $invalue[$i + 3] != '0');

        if (!$startsEntity) {
            $result .= substr($invalue, $i, 1);
            continue;
        }

        $i += 3;
        $hexstring = "";
        // The first character after the prefix is taken unconditionally;
        // following characters are taken while they are hex digits.
        do {
            if ($i < $valueLength) {
                $hexstring .= $invalue[$i];
            }
            $i++;
        } while ($i < $valueLength && ctype_xdigit($invalue[$i]));

        // Character that follows the collected digits, or '' at end of input.
        $terminator = $i < $valueLength ? $invalue[$i] : '';

        // Do some sanity checks. These aren't needed for reversibility,
        // but should help keep the breakage down if the editor
        // breaks one of the entities whilst editing.
        if ($terminator == ";" && strlen($hexstring) <= 6) {
            $codepoint = hexdec($hexstring);
            $result .= UtfNormal\Utils::codepointToUtf8($codepoint);
        } else {
            $result .= "&#x" . $hexstring . $terminator;
        }
    }

    // reverse the transform that we made for reversibility reasons.
    return strtr($result, array("&#x0" => "&#x"));
}
/**
 * Mangle XML-invalid names to be valid in XML.
 *
 * Names listed in $preserveKeys and names that already match the allowed
 * character classes are returned unchanged. The empty name becomes '_'.
 * Any other name is prefixed with '_', has each '.' replaced by '.2E.',
 * and has every remaining disallowed character replaced by '.HEX.',
 * where HEX is its code point in uppercase hexadecimal.
 *
 * @param string $name
 * @param array $preserveKeys Names to not mangle
 * @return string Mangled name
 */
private static function mangleName($name, $preserveKeys = []) {
    // Character class bodies, built once and reused across calls.
    static $nsc = null, $nc = null;

    if (in_array($name, $preserveKeys, true)) {
        return $name;
    }

    if ($name === '') {
        return '_';
    }

    if ($nsc === null) {
        // Note we omit ':' from $nsc and $nc because it's reserved for XML
        // namespacing, and we omit '_' from $nsc (but not $nc) because we
        // reserve it.
        $nsc = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
        $nc = $nsc . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
    }

    $alreadyValid = preg_match("/^[{$nsc}][{$nc}]*\$/uS", $name);
    if ($alreadyValid) {
        return $name;
    }

    // Escape literal dots first so they cannot be confused with the
    // '.HEX.' escapes produced below.
    $dotEscaped = str_replace('.', '.2E.', $name);

    $encodeChar = function ($match) {
        return sprintf('.%X.', UtfNormal\Utils::utf8ToCodepoint($match[0]));
    };

    return '_' . preg_replace_callback("/[^{$nc}]/uS", $encodeChar, $dotEscaped);
}
/**
 * Build the simple upper/lower case mapping tables from UnicodeData.txt
 * and serialize them to {$IP}/serialized/Utf8Case.ser.
 *
 * The serialized value is an array with two keys, 'wikiUpperChars' and
 * 'wikiLowerChars', each mapping a UTF-8 character to the UTF-8 form of
 * its Simple_Uppercase_Mapping or Simple_Lowercase_Mapping.
 *
 * The data file is taken from --unicode-data-file, or UnicodeData.txt in
 * the current directory when the option is absent. Exits with status 1 if
 * the file cannot be found or opened.
 */
public function execute() {
    if (!$this->hasOption('unicode-data-file')) {
        $dataFile = 'UnicodeData.txt';
        if (!file_exists($dataFile)) {
            $this->error("Unable to find UnicodeData.txt. Please specify " . "its location with --unicode-data-file=<FILE>");
            exit(1);
        }
    } else {
        $dataFile = $this->getOption('unicode-data-file');
        if (!file_exists($dataFile)) {
            $this->error('Unable to find the specified data file.');
            exit(1);
        }
    }

    $file = fopen($dataFile, 'r');
    if (!$file) {
        $this->error('Unable to open the data file.');
        exit(1);
    }

    // For the file format, see http://www.unicode.org/reports/tr44/
    $fieldNames = array(
        'Code',
        'Name',
        'General_Category',
        'Canonical_Combining_Class',
        'Bidi_Class',
        'Decomposition_Type_Mapping',
        'Numeric_Type_Value_6',
        'Numeric_Type_Value_7',
        'Numeric_Type_Value_8',
        'Bidi_Mirrored',
        'Unicode_1_Name',
        'ISO_Comment',
        'Simple_Uppercase_Mapping',
        'Simple_Lowercase_Mapping',
        'Simple_Titlecase_Mapping',
    );

    $upper = array();
    $lower = array();

    $lineNum = 0;
    while (false !== ($line = fgets($file))) {
        ++$lineNum;

        # Strip comments
        $line = trim(substr($line, 0, strcspn($line, '#')));
        if ($line === '') {
            continue;
        }

        # Split fields
        $numberedData = explode(';', $line);
        $data = array();
        foreach ($fieldNames as $number => $name) {
            $data[$name] = $numberedData[$number];
        }

        $source = UtfNormal\Utils::hexSequenceToUtf8($data['Code']);
        // Empty mapping fields mean the character has no such mapping.
        if ($data['Simple_Uppercase_Mapping']) {
            $upper[$source] = UtfNormal\Utils::hexSequenceToUtf8($data['Simple_Uppercase_Mapping']);
        }
        if ($data['Simple_Lowercase_Mapping']) {
            $lower[$source] = UtfNormal\Utils::hexSequenceToUtf8($data['Simple_Lowercase_Mapping']);
        }
    }
    // The input is fully consumed; release the handle before writing output.
    fclose($file);

    global $IP;
    file_put_contents("{$IP}/serialized/Utf8Case.ser", serialize(array('wikiUpperChars' => $upper, 'wikiLowerChars' => $lower)));
}