Ejemplo n.º 1
0
 /**
  * Clean up all illegal character sequences in a title.
  *
  * The title is first passed through
  * SGString::decodeCharReferencesAndNormalize(). Afterwards every byte that
  * is not listed in SGTitle::$legalChars, every URL escape (%XX), every
  * named/decimal/hexadecimal character reference and every Unicode bidi
  * control (U+200E, U+200F, U+202A-U+202E) is removed. The removal is
  * repeated until the string no longer changes, so that deleting one match
  * cannot splice a new illegal sequence together (e.g. "%2%205" -> "%25").
  * Finally runs of whitespace and underscores are collapsed to a single
  * space and the result is trimmed.
  *
  * @param $title String raw title text
  * @return String the cleaned title, or '' if the title is empty after
  *   cleaning, could not be processed by PCRE (e.g. invalid UTF-8),
  *   contains UTF8_REPLACEMENT, contains relative path segments
  *   ("." / "..") or is longer than 255 bytes
  */
 private static function cleanTitle($title)
 {
     # Normalize to Unicode Canonical Form C
     $title = SGString::decodeCharReferencesAndNormalize($title);
     # The pattern is built once and cached for subsequent calls.
     static $rxTc = false;
     if (!$rxTc) {
         # Matching sequences will be held as illegal.
         $rxTc = '/'
             # Any character not allowed in a title
             . '[^' . SGTitle::$legalChars . ']'
             # URL percent-encoded sequences
             . '|%[0-9A-Fa-f]{2}'
             # Named, decimal and hexadecimal character references
             . '|&[A-Za-z0-9\\x80-\\xff]+;'
             . '|&#[0-9]+;'
             . '|&#x[0-9A-Fa-f]+;'
             . '/S';
     }
     # Removing a match can join its neighbours into a new illegal sequence,
     # so both strip passes are repeated until the title is stable. Every
     # effective pass shortens the string, hence the loop terminates.
     do {
         $previous = $title;
         # Replace illegal characters
         $title = preg_replace($rxTc, '', $title);
         if ($title === null) {
             # PCRE failure: treat the title as unusable.
             return '';
         }
         # Strip Unicode bidi override characters.
         # Sometimes they slip into cut-n-pasted page titles, where the
         # override chars get included in list displays.
         $title = preg_replace('/\\xE2\\x80[\\x8E\\x8F\\xAA-\\xAE]/S', '', $title);
         if ($title === null) {
             return '';
         }
     } while ($title !== $previous);
     # Clean up whitespace
     # Note: with the /u option preg_replace() returns null for input that
     # contains invalid UTF-8 sequences, which conveniently disables them.
     $title = preg_replace('/[ _\\xA0\\x{1680}\\x{180E}\\x{2000}-\\x{200A}\\x{2028}\\x{2029}\\x{202F}\\x{205F}\\x{3000}]+/u', ' ', $title);
     if ($title === null) {
         # Explicit check instead of feeding null into trim().
         return '';
     }
     $title = trim($title);
     if ($title === '') {
         return '';
     }
     # TODO: XML allows only [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     # we should check $pattern = '/[^\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u';
     # whether this restriction is ensured.
     # Contained illegal UTF-8 sequences or forbidden Unicode chars.
     if (false !== strpos($title, UTF8_REPLACEMENT)) {
         return '';
     }
     # Pages with "/./" or "/../" appearing in the URLs will often be un-
     # reachable due to the way web browsers deal with 'relative' URLs.
     # The cheap strpos() for '.' short-circuits the detailed checks.
     if (strpos($title, '.') !== false
         && ($title === '.'
             || $title === '..'
             || strpos($title, './') === 0
             || strpos($title, '../') === 0
             || strpos($title, '/./') !== false
             || strpos($title, '/../') !== false
             || substr($title, -2) === '/.'
             || substr($title, -3) === '/..')
     ) {
         return '';
     }
     # Limit the size of titles to 255 bytes.
     if (strlen($title) > 255) {
         return '';
     }
     return $title;
 }
Ejemplo n.º 2
0
 /**
  * Turn a numeric codepoint into its UTF-8 string.
  *
  * The codepoint is checked with SGString::validateCodepoint(); when the
  * check fails, UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER) is
  * returned instead of the encoded character.
  *
  * @param $codepoint Integer codepoint to decode
  * @return String result of codepointToUtf8() for an accepted codepoint,
  *   otherwise UTF8_REPLACEMENT
  */
 static function decodeChar($codepoint)
 {
     # Guard clause: rejected codepoints map to the replacement character.
     if (!SGString::validateCodepoint($codepoint)) {
         return UTF8_REPLACEMENT;
     }

     return codepointToUtf8($codepoint);
 }