コード例 #1
0
ファイル: Sanitizer.php プロジェクト: MediaWiki-stable/1.26.1
 /**
  * Translate a named character entity into its UTF-8 character.
  *
  * The name is first mapped through self::$htmlEntityAliases (if it has an
  * entry there) and then looked up in self::$htmlEntities, the table of
  * entities defined by the HTML 4.0/XHTML 1.0 DTD. A known entity yields the
  * UTF-8 encoding of its code point; an unknown one yields the pseudo-entity
  * source built from the (alias-resolved) name, e.g. "&foo;".
  *
  * @param string $name Entity name without the leading "&" and trailing ";"
  * @return string UTF-8 character, or "&name;" when the entity is not known
  */
 static function decodeEntity($name)
 {
     // Resolve an alias to the canonical entity name before the real lookup.
     $canonical = isset(self::$htmlEntityAliases[$name])
         ? self::$htmlEntityAliases[$name]
         : $name;

     // Unknown entity: hand the source text back so nothing is lost.
     if (!isset(self::$htmlEntities[$canonical])) {
         return "&{$canonical};";
     }

     $codepoint = self::$htmlEntities[$canonical];

     return UtfNormal\Utils::codepointToUtf8($codepoint);
 }
コード例 #2
0
 /**
  * Build the list of group header ("first letter") characters for the root
  * collation and write it, serialized, to first-letters-root.ser.
  *
  * Processing steps:
  *  1. Read "{$this->dataDir}/allkeys.txt". For every code point that already
  *     has an entry in $this->weights, replace that entry with the primary
  *     weight string parsed from the file, and remember the code points whose
  *     tertiary weight string is exactly ".0008" or ".000E".
  *  2. Sort $this->weights and collect code points with an identical weight
  *     string into $this->groups (keyed by that weight string).
  *  3. Remove every group whose weight string consists of several elements
  *     when its first element alone is the key of another group.
  *  4. For each remaining group pick one header character with an ICU
  *     collator, and serialize the list of header characters to
  *     "{$IP}/serialized/first-letters-root.ser".
  *
  * Side effects: overwrites $this->weights and $this->groups, prints a summary
  * line, writes one line per group to $this->debugOutFile when that handle is
  * set, and terminates the process with status 1 when a file cannot be opened
  * or the output cannot be written completely.
  */
 function generateFirstChars()
 {
     $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
     if (!$file) {
         $this->error("Unable to open allkeys.txt");
         exit(1);
     }
     global $IP;
     $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
     if (!$outFile) {
         fclose($file);
         $this->error("Unable to open output file first-letters-root.ser");
         exit(1);
     }
     $goodTertiaryChars = array();
     // For each character with an entry in allkeys.txt, overwrite the implicit
     // entry in $this->weights that came from the UCD.
     // Also gather a list of tertiary weights, for use in selecting the group header
     while (false !== ($line = fgets($file))) {
         // We're only interested in single-character weights, pick them out with a regex
         $line = trim($line);
         if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
             continue;
         }
         $cp = hexdec($m[1]);
         $allWeights = trim($m[2]);
         $primary = '';
         $tertiary = '';
         if (!isset($this->weights[$cp])) {
             // Non-printable, ignore
             continue;
         }
         // Each '['-separated piece is one collation element; concatenate the
         // non-zero first (primary) and third (tertiary) weight of every element.
         foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
             preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $fields);
             if (!empty($fields[1])) {
                 if ($fields[1][0] !== '0000') {
                     $primary .= '.' . $fields[1][0];
                 }
                 // NOTE(review): assumes every element carries at least three
                 // weights; a shorter element would raise an undefined-offset
                 // notice here. Confirm against the allkeys.txt format.
                 if ($fields[1][2] !== '0000') {
                     $tertiary .= '.' . $fields[1][2];
                 }
             }
         }
         $this->weights[$cp] = $primary;
         // NOTE(review): the meaning of these two tertiary values comes from
         // the collation data, not from this code; confirm before changing.
         if ($tertiary === '.0008' || $tertiary === '.000E') {
             $goodTertiaryChars[$cp] = true;
         }
     }
     fclose($file);

     // Identify groups of characters with the same primary weight.
     // After the sort, equal weights are adjacent, so a group is flushed
     // whenever the weight changes.
     $this->groups = array();
     asort($this->weights, SORT_STRING);
     $prevWeight = reset($this->weights);
     $group = array();
     foreach ($this->weights as $cp => $weight) {
         if ($weight !== $prevWeight) {
             $this->groups[$prevWeight] = $group;
             $prevWeight = $weight;
             if (isset($this->groups[$weight])) {
                 $group = $this->groups[$weight];
             } else {
                 $group = array();
             }
         }
         $group[] = $cp;
     }
     if ($group) {
         $this->groups[$prevWeight] = $group;
     }

     // If one character has a given primary weight sequence, and a second
     // character has a longer primary weight sequence with an initial
     // portion equal to the first character, then remove the second
     // character. This avoids having characters like U+A732 (double A)
     // polluting the basic latin sort area.
     foreach ($this->groups as $weight => $group) {
         // $m[1] is the first ".XXXX" element of a multi-element weight string.
         if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
             if (isset($this->groups[$m[1]])) {
                 unset($this->groups[$weight]);
             }
         }
     }
     ksort($this->groups, SORT_STRING);

     // Identify the header character in each group
     $headerChars = array();
     $prevChar = "";
     $tertiaryCollator = new Collator('root');
     $primaryCollator = new Collator('root');
     $primaryCollator->setStrength(Collator::PRIMARY);
     $numOutOfOrder = 0;
     foreach ($this->groups as $weight => $group) {
         $uncomposedChars = array();
         $goodChars = array();
         foreach ($group as $cp) {
             if (isset($goodTertiaryChars[$cp])) {
                 $goodChars[] = $cp;
             }
             if (!isset($this->mappedChars[$cp])) {
                 $uncomposedChars[] = $cp;
             }
         }
         // Candidate preference: code points that are both "good tertiary"
         // and absent from $this->mappedChars, then any absent from
         // $this->mappedChars, then the whole group.
         $x = array_intersect($goodChars, $uncomposedChars);
         if (!$x) {
             $x = $uncomposedChars;
             if (!$x) {
                 $x = $group;
             }
         }
         // Use ICU to pick the lowest sorting character in the selection
         $tertiaryCollator->sort($x);
         $cp = $x[0];
         $char = UtfNormal\Utils::codepointToUtf8($cp);
         $headerChars[] = $char;
         // A header that does not sort strictly after the previous one at
         // primary strength is only counted; it is still kept in the output.
         if ($primaryCollator->compare($char, $prevChar) <= 0) {
             $numOutOfOrder++;
         }
         $prevChar = $char;
         if ($this->debugOutFile) {
             fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('UtfNormal\\Utils::codepointToUtf8', $group))));
         }
     }
     print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";

     // Write the result, release the handle, and fail loudly on a failed or
     // short write so that a truncated .ser file does not go unnoticed.
     $serialized = serialize($headerChars);
     $written = fwrite($outFile, $serialized);
     fclose($outFile);
     if ($written === false || $written !== strlen($serialized)) {
         $this->error("Unable to write output file first-letters-root.ser");
         exit(1);
     }
 }
コード例 #3
0
ファイル: EditPage.php プロジェクト: OrBin/mediawiki
 /**
  * Reverse the previously applied transliteration of non-ASCII characters
  * back to UTF-8. Used to protect data from corruption by broken web browsers
  * as listed in $wgBrowserBlackList.
  *
  * Every "&#xHHHH;" sequence whose first digit is not "0" is replaced by the
  * UTF-8 encoding of that code point. Sequences that fail the sanity checks
  * (missing ";", more than six digits, or non-hexadecimal content) are copied
  * through unchanged. Finally every "&#x0" is rewritten to "&#x", undoing the
  * escaping that made the transformation reversible.
  *
  * @param string $invalue
  * @return string
  */
 private function unmakeSafe($invalue)
 {
     $result = "";
     $valueLength = strlen($invalue);
     for ($i = 0; $i < $valueLength; $i++) {
         // Only treat "&#x" as an entity start when a following character
         // exists; reading past the end would be an uninitialized offset.
         $startsEntity = substr($invalue, $i, 3) === "&#x"
             && $i + 3 < $valueLength
             && $invalue[$i + 3] !== '0';

         if (!$startsEntity) {
             $result .= $invalue[$i];
             continue;
         }

         $i += 3;
         $hexstring = "";
         // Collect the first character unconditionally, then every further
         // hex digit. The bounds test must precede the offset read.
         do {
             $hexstring .= $invalue[$i];
             $i++;
         } while ($i < $valueLength && ctype_xdigit($invalue[$i]));

         // Do some sanity checks. These aren't needed for reversibility,
         // but should help keep the breakage down if the editor
         // breaks one of the entities whilst editing.
         // ctype_xdigit() rejects a non-hex first character, which hexdec()
         // would otherwise silently ignore and decode as a wrong code point.
         if (
             substr($invalue, $i, 1) == ";"
             && strlen($hexstring) <= 6
             && ctype_xdigit($hexstring)
         ) {
             $codepoint = hexdec($hexstring);
             $result .= UtfNormal\Utils::codepointToUtf8($codepoint);
         } else {
             // Malformed entity: emit it as it was, including the character
             // that ended the scan (nothing if the scan hit the end).
             $result .= "&#x" . $hexstring . substr($invalue, $i, 1);
         }
     }
     // reverse the transform that we made for reversibility reasons.
     return strtr($result, array("&#x0" => "&#x"));
 }