/**
 * Translate a named character entity into the UTF-8 character it denotes.
 *
 * The name is first mapped through self::$htmlEntityAliases, then looked up
 * in self::$htmlEntities (the HTML 4.0/XHTML 1.0 DTD entities). When the
 * lookup fails, the pseudo-entity source (eg "&foo;") is returned instead;
 * note that it is built from the name *after* alias resolution.
 *
 * @param string $name Entity name without the leading "&" and trailing ";"
 * @return string UTF-8 character, or the "&name;" pseudo-entity source
 */
static function decodeEntity($name) {
    // Resolve an alias to its canonical entity name, if there is one.
    $resolved = isset(self::$htmlEntityAliases[$name])
        ? self::$htmlEntityAliases[$name]
        : $name;

    // Unknown entity: hand back the textual form untouched.
    if (!isset(self::$htmlEntities[$resolved])) {
        return "&{$resolved};";
    }

    $codepoint = self::$htmlEntities[$resolved];
    return UtfNormal\Utils::codepointToUtf8($codepoint);
}
/**
 * Build the list of "first letter" header characters for the root collation
 * and write it, serialized, to {$IP}/serialized/first-letters-root.ser.
 *
 * Steps:
 *  1. Read {$this->dataDir}/allkeys.txt and replace the entries in
 *     $this->weights with the primary weight string found there, while
 *     remembering which characters have a "good" tertiary weight.
 *  2. Group code points by identical primary weight into $this->groups.
 *  3. Drop groups whose weight merely extends another group's weight.
 *  4. Pick one header character per group and serialize the resulting list.
 *
 * Terminates the process with exit(1) if either file cannot be opened.
 * Prints an "Out of order" summary to standard output.
 */
function generateFirstChars() {
    $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
    if (!$file) {
        $this->error("Unable to open allkeys.txt");
        exit(1);
    }
    global $IP;
    $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
    if (!$outFile) {
        // Do not leave the input handle dangling on the failure path.
        fclose($file);
        $this->error("Unable to open output file first-letters-root.ser");
        exit(1);
    }

    $goodTertiaryChars = array();

    // For each character with an entry in allkeys.txt, overwrite the implicit
    // entry in $this->weights that came from the UCD.
    // Also gather a list of tertiary weights, for use in selecting the group header
    while (false !== ($line = fgets($file))) {
        // We're only interested in single-character weights, pick them out with a regex
        $line = trim($line);
        if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
            continue;
        }

        $cp = hexdec($m[1]);
        $allWeights = trim($m[2]);
        $primary = '';
        $tertiary = '';

        if (!isset($this->weights[$cp])) {
            // Non-printable, ignore
            continue;
        }
        foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
            // $m is reused here; $cp and $allWeights were already extracted above.
            preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $m);
            if (!empty($m[1])) {
                if ($m[1][0] !== '0000') {
                    $primary .= '.' . $m[1][0];
                }
                // A collation element with fewer than three fields has no
                // tertiary weight; treat it as empty instead of reading an
                // undefined offset (the resulting string is unchanged).
                $tertiaryWeight = isset($m[1][2]) ? $m[1][2] : '';
                if ($tertiaryWeight !== '0000') {
                    $tertiary .= '.' . $tertiaryWeight;
                }
            }
        }
        $this->weights[$cp] = $primary;
        if ($tertiary === '.0008' || $tertiary === '.000E') {
            $goodTertiaryChars[$cp] = true;
        }
    }
    fclose($file);

    // Identify groups of characters with the same primary weight
    $this->groups = array();
    asort($this->weights, SORT_STRING);
    $prevWeight = reset($this->weights);
    $group = array();
    foreach ($this->weights as $cp => $weight) {
        if ($weight !== $prevWeight) {
            // Weight changed: store the finished group and start (or resume) the next one.
            $this->groups[$prevWeight] = $group;
            $prevWeight = $weight;
            if (isset($this->groups[$weight])) {
                $group = $this->groups[$weight];
            } else {
                $group = array();
            }
        }
        $group[] = $cp;
    }
    if ($group) {
        $this->groups[$prevWeight] = $group;
    }

    // If one character has a given primary weight sequence, and a second
    // character has a longer primary weight sequence with an initial
    // portion equal to the first character, then remove the second
    // character. This avoids having characters like U+A732 (double A)
    // polluting the basic latin sort area.
    foreach ($this->groups as $weight => $group) {
        if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
            if (isset($this->groups[$m[1]])) {
                unset($this->groups[$weight]);
            }
        }
    }

    ksort($this->groups, SORT_STRING);

    // Identify the header character in each group
    $headerChars = array();
    $prevChar = "";
    $tertiaryCollator = new Collator('root');
    $primaryCollator = new Collator('root');
    $primaryCollator->setStrength(Collator::PRIMARY);
    $numOutOfOrder = 0;
    foreach ($this->groups as $weight => $group) {
        $uncomposedChars = array();
        $goodChars = array();
        foreach ($group as $cp) {
            if (isset($goodTertiaryChars[$cp])) {
                $goodChars[] = $cp;
            }
            if (!isset($this->mappedChars[$cp])) {
                $uncomposedChars[] = $cp;
            }
        }
        // Preference order: good tertiary AND uncomposed, then any
        // uncomposed character, then anything in the group.
        $x = array_intersect($goodChars, $uncomposedChars);
        if (!$x) {
            $x = $uncomposedChars;
            if (!$x) {
                $x = $group;
            }
        }

        // Use ICU to pick the lowest sorting character in the selection
        // NOTE(review): $x holds integer code points rather than character
        // strings -- confirm Collator::sort() orders them as intended.
        $tertiaryCollator->sort($x);
        $cp = $x[0];

        $char = UtfNormal\Utils::codepointToUtf8($cp);
        $headerChars[] = $char;
        // Count headers that do not sort strictly after their predecessor.
        if ($primaryCollator->compare($char, $prevChar) <= 0) {
            $numOutOfOrder++;
        }
        $prevChar = $char;

        if ($this->debugOutFile) {
            fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('UtfNormal\\Utils::codepointToUtf8', $group))));
        }
    }

    print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";

    fwrite($outFile, serialize($headerChars));
    // Release the output handle so the data is flushed and the descriptor freed.
    fclose($outFile);
}
/**
 * Reverse the previously applied transliteration of non-ASCII characters
 * back to UTF-8. Used to protect data from corruption by broken web browsers
 * as listed in $wgBrowserBlackList.
 *
 * Two things are undone:
 *  - hexadecimal character references whose first digit is not "0" are
 *    converted to the UTF-8 character they name;
 *  - the escape prefix (ampersand, hash, "x", then a "0") is collapsed back
 *    to the plain three-character prefix it stood for.
 *
 * A reference is only decoded when it consists of one to six hex digits
 * followed by ";". Anything else is copied through verbatim.
 *
 * @param string $invalue
 * @return string
 */
private function unmakeSafe($invalue) {
    $result = "";
    $valueLength = strlen($invalue);
    for ($i = 0; $i < $valueLength; $i++) {
        // A candidate reference needs the prefix plus at least one more
        // character, and that character must not be the "0" escape marker.
        // The length test comes first so no offset past the end is read.
        $startsReference = substr($invalue, $i, 3) === "&#x"
            && $i + 3 < $valueLength
            && $invalue[$i + 3] !== '0';

        if (!$startsReference) {
            $result .= $invalue[$i];
            continue;
        }

        $i += 3;
        // The first character is consumed unconditionally; validity is
        // checked below. Further characters are taken while they are hex.
        $hexstring = $invalue[$i];
        $i++;
        while ($i < $valueLength && ctype_xdigit($invalue[$i])) {
            $hexstring .= $invalue[$i];
            $i++;
        }
        $terminator = $i < $valueLength ? $invalue[$i] : "";

        // Do some sanity checks. These aren't needed for reversibility,
        // but should help keep the breakage down if the editor
        // breaks one of the entities whilst editing.
        if ($terminator === ";" && strlen($hexstring) <= 6 && ctype_xdigit($hexstring)) {
            $codepoint = hexdec($hexstring);
            $result .= UtfNormal\Utils::codepointToUtf8($codepoint);
        } else {
            // Not a well-formed reference: emit exactly what was consumed.
            $result .= "&#x" . $hexstring . $terminator;
        }
    }

    // reverse the transform that we made for reversibility reasons.
    // The key is assembled from two pieces so that tools which unescape
    // HTML cannot collapse the literal into a character reference.
    return strtr($result, array("&#x" . "0" => "&#x"));
}