Пример #1
0
 /**
  * Lazily parse $this->html into a DOMDocument and cache it in $this->doc.
  *
  * The HTML is first converted to pure ASCII (non-ASCII characters become
  * numeric entities) before being handed to DOMDocument::loadHTML(). Parse
  * warnings are collected internally by libxml rather than emitted, and the
  * caller's libxml error-handling mode is restored afterwards.
  *
  * @return DOMDocument DOM to manipulate
  */
 public function getDoc()
 {
     if (!$this->doc) {
         // DOMDocument::loadHTML apparently isn't very good with encodings, so
         // convert input to ASCII by encoding everything above 128 as entities.
         if (function_exists('mb_convert_encoding')) {
             $html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
         } else {
             $html = preg_replace_callback('/[\\x{80}-\\x{10ffff}]/u', function ($m) {
                 return '&#' . UtfNormal\Utils::utf8ToCodepoint($m[0]) . ';';
             }, $this->html);
         }
         // Workaround for bug that caused spaces before references
         // to disappear during processing: https://phabricator.wikimedia.org/T55086
         // TODO: Please replace with a better fix if one can be found.
         $html = str_replace(' <', '&#32;<', $html);

         // Remember the caller's error mode so it can be restored below
         // instead of being forced to "off".
         $previousErrorMode = libxml_use_internal_errors(true);

         // libxml >= 2.9.0 does not load external entities by default, and
         // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only
         // toggle the loader on older libxml builds.
         $toggleEntityLoader = LIBXML_VERSION < 20900;
         if ($toggleEntityLoader) {
             $loader = libxml_disable_entity_loader();
         }

         $this->doc = new DOMDocument();
         $this->doc->strictErrorChecking = false;
         $this->doc->loadHTML($html);

         if ($toggleEntityLoader) {
             libxml_disable_entity_loader($loader);
         }
         // Drop the parse warnings collected for this document so they do not
         // pile up when the caller keeps internal error handling enabled.
         libxml_clear_errors();
         libxml_use_internal_errors($previousErrorMode);
         $this->doc->encoding = 'UTF-8';
     }
     return $this->doc;
 }
 /**
  * Build the "ml" normalization table and store it in serialized form.
  *
  * Each entry maps a three-code-point sequence (ending in U+200D) to a single
  * replacement code point. Both sides are converted from hex notation to
  * UTF-8 and the resulting map is written to serialized/normalize-ml.ser
  * below the global $IP directory.
  */
 public function execute()
 {
     global $IP;

     // Hex code point sequences: source sequence => replacement.
     $hexTable = array(
         '0D23 0D4D 200D' => '0D7A',
         '0D28 0D4D 200D' => '0D7B',
         '0D30 0D4D 200D' => '0D7C',
         '0D32 0D4D 200D' => '0D7D',
         '0D33 0D4D 200D' => '0D7E',
         '0D15 0D4D 200D' => '0D7F',
     );

     $utf8Table = array();
     foreach ($hexTable as $fromHex => $toHex) {
         $from = UtfNormal\Utils::hexSequenceToUtf8($fromHex);
         $utf8Table[$from] = UtfNormal\Utils::hexSequenceToUtf8($toHex);
     }

     $target = "{$IP}/serialized/normalize-ml.ser";
     file_put_contents($target, serialize($utf8Table));

     $total = count($utf8Table);
     echo "ml: " . $total . " pairs written.\n";
 }
Пример #3
0
 /**
  * Get the first character of a string.
  *
  * The leading UTF-8 sequence (1 to 4 bytes) is extracted bytewise. A
  * three-byte character whose code point lies in 0xAC00..0xD7A3 (a Hangul
  * syllable) is replaced by its initial jamo; everything else is returned
  * unchanged. If the string does not start with a recognisable sequence
  * (including the empty string), '' is returned.
  *
  * @param string $s
  * @return string
  */
 function firstChar($s)
 {
     $found = array();
     $pattern = '/^([\\x00-\\x7f]|[\\xc0-\\xdf][\\x80-\\xbf]|' . '[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3})/';
     preg_match($pattern, $s, $found);
     if (!isset($found[1])) {
         return '';
     }

     $char = $found[1];
     // Only three-byte sequences can be Hangul syllables.
     if (strlen($char) != 3) {
         return $char;
     }

     // Break down Hangul syllables to grab the first jamo
     $code = UtfNormal\Utils::utf8ToCodepoint($char);
     if ($code < 0xac00 || $code >= 0xd7a4) {
         return $char;
     }

     // Exclusive upper bound of each initial-consonant range => jamo,
     // in ascending order; the first bound above $code wins.
     $jamoByUpperBound = array(
         0xb098 => "ㄱ",
         0xb2e4 => "ㄴ",
         0xb77c => "ㄷ",
         0xb9c8 => "ㄹ",
         0xbc14 => "ㅁ",
         0xc0ac => "ㅂ",
         0xc544 => "ㅅ",
         0xc790 => "ㅇ",
         0xcc28 => "ㅈ",
         0xce74 => "ㅊ",
         0xd0c0 => "ㅋ",
         0xd30c => "ㅌ",
         0xd558 => "ㅍ",
     );
     foreach ($jamoByUpperBound as $upperBound => $jamo) {
         if ($code < $upperBound) {
             return $jamo;
         }
     }
     return "ㅎ";
 }
 /**
  * Generate the list of group header ("first letter") characters and write
  * it, serialized, to serialized/first-letters-root.ser below $IP.
  *
  * Steps:
  *  1. Read allkeys.txt from $this->dataDir and overwrite the entries in
  *     $this->weights with the primary weight string found there, while
  *     collecting the code points whose tertiary weight is .0008 or .000E.
  *  2. Group code points by identical primary weight into $this->groups and
  *     drop groups whose weight merely extends another group's weight.
  *  3. For each group pick a header character, preferring characters with a
  *     "good" tertiary weight that are not in $this->mappedChars, and using
  *     ICU (Collator) to select the lowest-sorting candidate.
  *
  * Prints the number of headers that are not in strictly ascending primary
  * order, and writes one line per group to $this->debugOutFile if it is set.
  * Terminates the process with exit(1) if either file cannot be opened.
  */
 function generateFirstChars()
 {
     $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
     if (!$file) {
         $this->error("Unable to open allkeys.txt");
         exit(1);
     }
     global $IP;
     $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
     if (!$outFile) {
         $this->error("Unable to open output file first-letters-root.ser");
         exit(1);
     }
     $goodTertiaryChars = array();
     // For each character with an entry in allkeys.txt, overwrite the implicit
     // entry in $this->weights that came from the UCD.
     // Also gather a list of tertiary weights, for use in selecting the group header
     while (false !== ($line = fgets($file))) {
         // We're only interested in single-character weights, pick them out with a regex
         $line = trim($line);
         if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
             continue;
         }
         $cp = hexdec($m[1]);
         $allWeights = trim($m[2]);
         $primary = '';
         $tertiary = '';
         if (!isset($this->weights[$cp])) {
             // Non-printable, ignore
             continue;
         }
         // Each "[...]" collation element contributes its first (primary)
         // and third (tertiary) weight, unless that weight is 0000.
         foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
             // Use a separate match array so the line match in $m is not clobbered.
             preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $parts);
             if (!empty($parts[1])) {
                 if ($parts[1][0] !== '0000') {
                     $primary .= '.' . $parts[1][0];
                 }
                 if ($parts[1][2] !== '0000') {
                     $tertiary .= '.' . $parts[1][2];
                 }
             }
         }
         $this->weights[$cp] = $primary;
         if ($tertiary === '.0008' || $tertiary === '.000E') {
             $goodTertiaryChars[$cp] = true;
         }
     }
     fclose($file);
     // Identify groups of characters with the same primary weight
     $this->groups = array();
     asort($this->weights, SORT_STRING);
     $prevWeight = reset($this->weights);
     $group = array();
     foreach ($this->weights as $cp => $weight) {
         if ($weight !== $prevWeight) {
             // Weight changed: store the finished group and start (or resume) the next one.
             $this->groups[$prevWeight] = $group;
             $prevWeight = $weight;
             if (isset($this->groups[$weight])) {
                 $group = $this->groups[$weight];
             } else {
                 $group = array();
             }
         }
         $group[] = $cp;
     }
     if ($group) {
         $this->groups[$prevWeight] = $group;
     }
     // If one character has a given primary weight sequence, and a second
     // character has a longer primary weight sequence with an initial
     // portion equal to the first character, then remove the second
     // character. This avoids having characters like U+A732 (double A)
     // polluting the basic latin sort area.
     foreach ($this->groups as $weight => $group) {
         if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
             if (isset($this->groups[$m[1]])) {
                 unset($this->groups[$weight]);
             }
         }
     }
     ksort($this->groups, SORT_STRING);
     // Identify the header character in each group
     $headerChars = array();
     $prevChar = "";
     $tertiaryCollator = new Collator('root');
     $primaryCollator = new Collator('root');
     $primaryCollator->setStrength(Collator::PRIMARY);
     $numOutOfOrder = 0;
     foreach ($this->groups as $weight => $group) {
         $uncomposedChars = array();
         $goodChars = array();
         foreach ($group as $cp) {
             if (isset($goodTertiaryChars[$cp])) {
                 $goodChars[] = $cp;
             }
             if (!isset($this->mappedChars[$cp])) {
                 $uncomposedChars[] = $cp;
             }
         }
         // Candidate preference: good tertiary weight AND unmapped, then any
         // unmapped character, then the whole group as a last resort.
         $x = array_intersect($goodChars, $uncomposedChars);
         if (!$x) {
             $x = $uncomposedChars;
             if (!$x) {
                 $x = $group;
             }
         }
         // Use ICU to pick the lowest sorting character in the selection
         $tertiaryCollator->sort($x);
         $cp = $x[0];
         $char = UtfNormal\Utils::codepointToUtf8($cp);
         $headerChars[] = $char;
         // Count headers that do not sort strictly after the previous header
         // at primary strength.
         if ($primaryCollator->compare($char, $prevChar) <= 0) {
             $numOutOfOrder++;
         }
         $prevChar = $char;
         if ($this->debugOutFile) {
             fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('UtfNormal\\Utils::codepointToUtf8', $group))));
         }
     }
     print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";
     fwrite($outFile, serialize($headerChars));
     // Release the output handle so the data is flushed and the descriptor
     // is not leaked.
     fclose($outFile);
 }
Пример #5
0
 /**
  * Decode a named HTML entity to UTF-8.
  *
  * The name is first resolved through self::$htmlEntityAliases. If the
  * resulting name is present in self::$htmlEntities, the UTF-8 encoding of
  * its code point is returned; otherwise the pseudo-entity source
  * (eg "&foo;") built from the resolved name is returned.
  *
  * @param string $name
  * @return string
  */
 static function decodeEntity($name)
 {
     $canonical = isset(self::$htmlEntityAliases[$name])
         ? self::$htmlEntityAliases[$name]
         : $name;

     if (!isset(self::$htmlEntities[$canonical])) {
         // Unknown entity: hand back its textual form.
         return "&{$canonical};";
     }

     $codepoint = self::$htmlEntities[$canonical];
     return UtfNormal\Utils::codepointToUtf8($codepoint);
 }
Пример #6
0
 /**
  * Build the "ar" normalization table from UnicodeData.txt.
  *
  * Reads the data file (option --unicode-data-file, defaulting to
  * UnicodeData.txt in the current directory), and for every code point in
  * U+FB50..U+FDFF or U+FE70..U+FEFF that has a decomposition mapping, records
  * a UTF-8 source => UTF-8 decomposition pair. The pairs are written,
  * serialized, to serialized/normalize-ar.ser below the global $IP directory.
  *
  * Terminates the process with exit(1) when the data file is missing or
  * cannot be opened. Unparseable decomposition fields are reported through
  * $this->error() and skipped.
  */
 public function execute()
 {
     if (!$this->hasOption('unicode-data-file')) {
         $dataFile = 'UnicodeData.txt';
         if (!file_exists($dataFile)) {
             $this->error("Unable to find UnicodeData.txt. Please specify " . "its location with --unicode-data-file=<FILE>");
             exit(1);
         }
     } else {
         $dataFile = $this->getOption('unicode-data-file');
         if (!file_exists($dataFile)) {
             $this->error('Unable to find the specified data file.');
             exit(1);
         }
     }
     $file = fopen($dataFile, 'r');
     if (!$file) {
         $this->error('Unable to open the data file.');
         exit(1);
     }
     // For the file format, see http://www.unicode.org/reports/tr44/
     $fieldNames = array('Code', 'Name', 'General_Category', 'Canonical_Combining_Class', 'Bidi_Class', 'Decomposition_Type_Mapping', 'Numeric_Type_Value_6', 'Numeric_Type_Value_7', 'Numeric_Type_Value_8', 'Bidi_Mirrored', 'Unicode_1_Name', 'ISO_Comment', 'Simple_Uppercase_Mapping', 'Simple_Lowercase_Mapping', 'Simple_Titlecase_Mapping');
     $pairs = array();
     $lineNum = 0;
     while (false !== ($line = fgets($file))) {
         ++$lineNum;
         # Strip comments
         $line = trim(substr($line, 0, strcspn($line, '#')));
         if ($line === '') {
             continue;
         }
         # Split fields
         $numberedData = explode(';', $line);
         $data = array();
         foreach ($fieldNames as $number => $name) {
             $data[$name] = $numberedData[$number];
         }
         // Compare as an integer rather than as a numeric string.
         $code = hexdec($data['Code']);
         if ($code >= 0xfb50 && $code <= 0xfdff || $code >= 0xfe70 && $code <= 0xfeff) {
             if ($data['Decomposition_Type_Mapping'] === '') {
                 // No decomposition
                 continue;
             }
             // Expected shape: "<tag> XXXX XXXX ..."; only the hex sequence is kept.
             if (!preg_match('/^ *(<\\w*>) +([0-9A-F ]*)$/', $data['Decomposition_Type_Mapping'], $m)) {
                 $this->error("Can't parse Decomposition_Type/Mapping on line {$lineNum}");
                 $this->error($line);
                 continue;
             }
             $source = UtfNormal\Utils::hexSequenceToUtf8($data['Code']);
             $dest = UtfNormal\Utils::hexSequenceToUtf8($m[2]);
             $pairs[$source] = $dest;
         }
     }
     // Release the input handle once the whole file has been consumed.
     fclose($file);
     global $IP;
     file_put_contents("{$IP}/serialized/normalize-ar.ser", serialize($pairs));
     echo "ar: " . count($pairs) . " pairs written.\n";
 }
Пример #7
0
 /**
  * Determine the letter under which a string should be sorted.
  *
  * A leading non-ASCII character for which self::isCjk() is true is returned
  * as-is. Otherwise the primary sort key of the string is located among the
  * known first letters with a binary search.
  *
  * @param string $string
  * @return string The first letter, or '' for an empty string or a string
  *   sorting before the first known letter
  */
 function getFirstLetter($string)
 {
     $text = strval($string);
     if ($text === '') {
         return '';
     }

     // Check for CJK
     $lead = mb_substr($text, 0, 1, 'UTF-8');
     $startsWithNonAscii = ord($lead) > 0x7f;
     if ($startsWithNonAscii && self::isCjk(UtfNormal\Utils::utf8ToCodepoint($lead))) {
         return $lead;
     }

     // Do a binary search to find the correct letter to sort under
     $key = $this->getPrimarySortKey($text);
     $keyLookup = array($this, 'getSortKeyByLetterIndex');
     $letterCount = $this->getFirstLetterCount();
     $index = ArrayUtils::findLowerBound($keyLookup, $letterCount, 'strcmp', $key);

     // false means the key sorts before the first letter
     return $index === false ? '' : $this->getLetterByIndex($index);
 }
Пример #8
0
 /**
  * Determine the letter under which a string should be sorted.
  *
  * A leading non-ASCII character for which self::isCjk() is true is returned
  * as-is. Otherwise the primary sort key of the string is located among the
  * known first letters with a binary search. When $this->useNumericCollation
  * is set and the resulting letter starts with an ASCII digit, the
  * 'category-header-numerals' message is returned instead.
  *
  * @param string $string
  * @return string The sort letter, or '' for an empty string or a string
  *   sorting before the first known letter
  */
 public function getFirstLetter($string)
 {
     $text = strval($string);
     if ($text === '') {
         return '';
     }

     // If the first character is a CJK character, just return that character.
     $lead = mb_substr($text, 0, 1, 'UTF-8');
     $startsWithNonAscii = ord($lead) > 0x7f;
     if ($startsWithNonAscii && self::isCjk(UtfNormal\Utils::utf8ToCodepoint($lead))) {
         return $lead;
     }

     // Do a binary search to find the correct letter to sort under
     $key = $this->getPrimarySortKey($text);
     $keyLookup = [$this, 'getSortKeyByLetterIndex'];
     $letterCount = $this->getFirstLetterCount();
     $index = ArrayUtils::findLowerBound($keyLookup, $letterCount, 'strcmp', $key);
     if ($index === false) {
         // Before the first letter
         return '';
     }

     $letter = $this->getLetterByIndex($index);
     if (!$this->useNumericCollation) {
         return $letter;
     }

     // If the sort letter is a number, return '0–9' (or localized equivalent).
     // ASCII value of 0 is 48. ASCII value of 9 is 57.
     // Note that this also applies to non-Arabic numerals since they are
     // mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
     $firstByte = ord($letter);
     if ($firstByte >= 48 && $firstByte <= 57) {
         return wfMessage('category-header-numerals')->numParams(0, 9)->text();
     }
     return $letter;
 }
Пример #9
0
 /**
  * Reverse the previously applied transliteration of non-ASCII characters
  * back to UTF-8. Used to protect data from corruption by broken web browsers
  * as listed in $wgBrowserBlackList.
  *
  * Every "&#x" that is not immediately followed by "0" is treated as the
  * start of a hexadecimal entity: if it consists of at most six hex digits
  * terminated by ";", it is replaced by the UTF-8 encoding of that code
  * point; otherwise the text is copied through unchanged. Finally every
  * "&#x0" is turned back into "&#x".
  *
  * String offsets are never read past the end of the input, so truncated
  * entities (e.g. a trailing "&#x") are passed through without warnings.
  *
  * @param string $invalue
  * @return string
  */
 private function unmakeSafe($invalue)
 {
     $result = "";
     $valueLength = strlen($invalue);
     for ($i = 0; $i < $valueLength; $i++) {
         // A "&#x" at the very end of the input has no following character;
         // treat it like a (broken) entity so it is copied through verbatim.
         $startsEntity = substr($invalue, $i, 3) === "&#x"
             && ($i + 3 >= $valueLength || $invalue[$i + 3] !== '0');
         if (!$startsEntity) {
             $result .= $invalue[$i];
             continue;
         }

         $i += 3;
         $hexstring = "";
         // Always consume the first character after "&#x", then keep going
         // while hex digits follow. Check the bound before touching the offset.
         do {
             if ($i < $valueLength) {
                 $hexstring .= $invalue[$i];
             }
             $i++;
         } while ($i < $valueLength && ctype_xdigit($invalue[$i]));

         // Character that ended the digit run ('' at end of input).
         $terminator = $i < $valueLength ? $invalue[$i] : '';

         // Do some sanity checks. These aren't needed for reversibility,
         // but should help keep the breakage down if the editor
         // breaks one of the entities whilst editing. Requiring real hex
         // digits keeps something like "&#xg;" from decoding to U+0000.
         if ($terminator === ";" && strlen($hexstring) <= 6 && ctype_xdigit($hexstring)) {
             $codepoint = hexdec($hexstring);
             $result .= UtfNormal\Utils::codepointToUtf8($codepoint);
         } else {
             $result .= "&#x" . $hexstring . $terminator;
         }
     }
     // reverse the transform that we made for reversibility reasons.
     return strtr($result, array("&#x0" => "&#x"));
 }
Пример #10
0
 /**
  * Mangle XML-invalid names to be valid in XML.
  *
  * Names listed in $preserveKeys and names that already match the allowed
  * character classes are returned untouched; the empty string becomes '_'.
  * Any other name is prefixed with '_', with every '.' replaced by '.2E.'
  * and every remaining disallowed character replaced by '.HEX.' (its
  * upper-case hexadecimal code point).
  *
  * @param string $name
  * @param array $preserveKeys Names to not mangle
  * @return string Mangled name
  */
 private static function mangleName($name, $preserveKeys = [])
 {
     // Character-class bodies, built once per request.
     static $nsc = null;
     static $nc = null;

     if (in_array($name, $preserveKeys, true)) {
         return $name;
     }
     if ($name === '') {
         return '_';
     }

     if ($nsc === null) {
         // Note we omit ':' from $nsc and $nc because it's reserved for XML
         // namespacing, and we omit '_' from $nsc (but not $nc) because we
         // reserve it.
         $nsc = 'A-Za-z\\x{C0}-\\x{D6}\\x{D8}-\\x{F6}\\x{F8}-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}' . '\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}' . '\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
         $nc = $nsc . '_\\-.0-9\\x{B7}\\x{300}-\\x{36F}\\x{203F}-\\x{2040}';
     }

     // Already a valid name: start character followed by name characters.
     $alreadyValid = preg_match("/^[{$nsc}][{$nc}]*\$/uS", $name);
     if ($alreadyValid) {
         return $name;
     }

     // Escape literal dots first so they cannot be confused with the
     // '.HEX.' escapes produced below.
     $dotsEscaped = str_replace('.', '.2E.', $name);
     $mangled = preg_replace_callback("/[^{$nc}]/uS", static function ($hit) {
         $codepoint = UtfNormal\Utils::utf8ToCodepoint($hit[0]);
         return sprintf('.%X.', $codepoint);
     }, $dotsEscaped);

     return '_' . $mangled;
 }
Пример #11
0
 /**
  * Build the upper/lower case mapping tables from UnicodeData.txt.
  *
  * Reads the data file (option --unicode-data-file, defaulting to
  * UnicodeData.txt in the current directory) and, for every code point that
  * has a simple uppercase or lowercase mapping, records the UTF-8 source
  * character => UTF-8 mapped character. The result is written, serialized,
  * to serialized/Utf8Case.ser below the global $IP directory as an array
  * with the keys 'wikiUpperChars' and 'wikiLowerChars'.
  *
  * Terminates the process with exit(1) when the data file is missing or
  * cannot be opened.
  */
 public function execute()
 {
     if (!$this->hasOption('unicode-data-file')) {
         $dataFile = 'UnicodeData.txt';
         if (!file_exists($dataFile)) {
             $this->error("Unable to find UnicodeData.txt. Please specify " . "its location with --unicode-data-file=<FILE>");
             exit(1);
         }
     } else {
         $dataFile = $this->getOption('unicode-data-file');
         if (!file_exists($dataFile)) {
             $this->error('Unable to find the specified data file.');
             exit(1);
         }
     }
     $file = fopen($dataFile, 'r');
     if (!$file) {
         $this->error('Unable to open the data file.');
         exit(1);
     }
     // For the file format, see http://www.unicode.org/reports/tr44/
     $fieldNames = array('Code', 'Name', 'General_Category', 'Canonical_Combining_Class', 'Bidi_Class', 'Decomposition_Type_Mapping', 'Numeric_Type_Value_6', 'Numeric_Type_Value_7', 'Numeric_Type_Value_8', 'Bidi_Mirrored', 'Unicode_1_Name', 'ISO_Comment', 'Simple_Uppercase_Mapping', 'Simple_Lowercase_Mapping', 'Simple_Titlecase_Mapping');
     $upper = array();
     $lower = array();
     $lineNum = 0;
     while (false !== ($line = fgets($file))) {
         ++$lineNum;
         # Strip comments
         $line = trim(substr($line, 0, strcspn($line, '#')));
         if ($line === '') {
             continue;
         }
         # Split fields
         $numberedData = explode(';', $line);
         $data = array();
         foreach ($fieldNames as $number => $name) {
             // A short line just has empty trailing fields; the mapping
             // checks below treat '' as "no mapping".
             $data[$name] = isset($numberedData[$number]) ? $numberedData[$number] : '';
         }
         $source = UtfNormal\Utils::hexSequenceToUtf8($data['Code']);
         if ($data['Simple_Uppercase_Mapping']) {
             $upper[$source] = UtfNormal\Utils::hexSequenceToUtf8($data['Simple_Uppercase_Mapping']);
         }
         if ($data['Simple_Lowercase_Mapping']) {
             $lower[$source] = UtfNormal\Utils::hexSequenceToUtf8($data['Simple_Lowercase_Mapping']);
         }
     }
     // Release the input handle once the whole file has been consumed.
     fclose($file);
     global $IP;
     file_put_contents("{$IP}/serialized/Utf8Case.ser", serialize(array('wikiUpperChars' => $upper, 'wikiLowerChars' => $lower)));
 }