/**
 * Clean up all illegal character sequences.
 *
 * The title is normalized, stripped of characters outside
 * SGTitle::$legalChars, of URL escapes and character references, and of
 * Unicode bidi override characters; runs of whitespace/underscores are
 * collapsed to a single space and the result is trimmed.
 *
 * @param $title String
 * @return String the cleaned title, or '' if the title is empty, contains
 *   invalid UTF-8 or the replacement character, contains relative path
 *   components ("." or ".."), or is longer than 255 bytes
 */
private static function cleanTitle($title) {
    # Normalize to Unicode Canonical Form C
    $title = SGString::decodeCharReferencesAndNormalize($title);

    # initialization (the regex is built once and cached for later calls)
    static $rxTc = false;
    if (!$rxTc) {
        # Matching titles will be held as illegal.
        $rxTc = '/'
            . '[^' . SGTitle::$legalChars . ']'
            . '|%[0-9A-Fa-f]{2}'
            . '|&[A-Za-z0-9\\x80-\\xff]+;'
            . '|&#[0-9]+;'
            . '|&#x[0-9A-Fa-f]+;'
            . '/S';
    }

    # Replace illegal characters
    $title = preg_replace($rxTc, '', $title);
    if ($title === null) {
        # PCRE error: treat the title as invalid instead of passing null on.
        return '';
    }

    # Strip Unicode bidi override characters.
    # Sometimes they slip into cut-n-pasted page titles, where the
    # override chars get included in list displays.
    $title = preg_replace('/\\xE2\\x80[\\x8E\\x8F\\xAA-\\xAE]/S', '', $title);
    if ($title === null) {
        return '';
    }

    # FIXME: removal of characters above may lead to disallowed character sequences!

    # Clean up whitespace
    # Note: with the /u option, preg_replace fails (returns null) on input
    # with invalid UTF-8 sequences, conveniently disabling them. The null is
    # handled explicitly below rather than being fed to trim(), which is
    # deprecated for null arguments on PHP 8.1+.
    $title = preg_replace('/[ _\\xA0\\x{1680}\\x{180E}\\x{2000}-\\x{200A}\\x{2028}\\x{2029}\\x{202F}\\x{205F}\\x{3000}]+/u', ' ', $title);
    if ($title === null) {
        return '';
    }

    $title = trim($title);
    if ($title === '') {
        return '';
    }

    # TODO: XML allows only [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    # we should check
    # $pattern = '/[^\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u';
    # whether this restriction is ensured.

    # Contained illegal UTF-8 sequences or forbidden Unicode chars.
    if (false !== strpos($title, UTF8_REPLACEMENT)) {
        return '';
    }

    # Pages with "/./" or "/../" appearing in the URLs will often be un-
    # reachable due to the way web browsers deal with 'relative' URLs.
    # The cheap strpos() for '.' short-circuits the detailed checks.
    if (strpos($title, '.') !== false
        && ($title === '.'
            || $title === '..'
            || strpos($title, './') === 0
            || strpos($title, '../') === 0
            || strpos($title, '/./') !== false
            || strpos($title, '/../') !== false
            || substr($title, -2) === '/.'
            || substr($title, -3) === '/..')
    ) {
        return '';
    }

    # Limit the size of titles to 255 bytes (strlen counts bytes, not characters).
    if (strlen($title) > 255) {
        return '';
    }

    return $title;
}
/**
 * Convert a numeric codepoint into its UTF-8 encoded string.
 *
 * Codepoints that SGString::validateCodepoint() rejects are not converted;
 * the UTF8_REPLACEMENT constant (U+FFFD REPLACEMENT CHARACTER) is returned
 * in their place.
 *
 * @param $codepoint Integer
 * @return String
 */
static function decodeChar($codepoint) {
    # Guard clause: anything that is not a valid character reference
    # is substituted by the replacement character.
    if (!SGString::validateCodepoint($codepoint)) {
        return UTF8_REPLACEMENT;
    }

    return codepointToUtf8($codepoint);
}