/**
 * Reduce a string to printable ASCII.
 *
 * The input is first run through strtr() with the table returned by
 * _gs_utf8_get_map() (kept in a function-static variable once it is an
 * array), then NFD-normalized with UtfNormal::toNFD(), and finally every
 * byte outside the range 0x20-0x7E is removed. Control characters,
 * newlines and any remaining non-ASCII bytes therefore never reach the
 * caller.
 *
 * @param string $str Text to convert.
 * @return string Result of the final preg_replace(): the text restricted
 *                to bytes 0x20-0x7E.
 */
function gs_utf8_decompose_to_ascii($str)
{
    static $replacements = null;

    if (!is_array($replacements)) {
        $replacements = _gs_utf8_get_map();
    }

    # Apply the replacement table before normalizing, as the table is
    # keyed on the un-normalized input.
    $mapped     = strtr($str, $replacements);
    $decomposed = UtfNormal::toNFD($mapped);

    # Keep only "safe" printable ASCII (space through tilde).
    return preg_replace('/[^\\x20-\\x7E]/', '', $decomposed);
}
/**
 * Thin wrapper that NFD-normalizes a value through UtfNormal::toNFD().
 *
 * NOTE(review): the second argument "php" is forwarded verbatim; it
 * presumably selects a pure-PHP code path in the UtfNormal variant this
 * file targets -- confirm against that class.
 *
 * @param string $c Text to normalize (type assumed -- TODO confirm).
 * @return mixed Whatever UtfNormal::toNFD() returns for the input.
 */
function normalize_form_d_php($c)
{
    $implementation = "php";

    return UtfNormal::toNFD($c, $implementation);
}
/**
 * Return the NFD form of a string, wrapped in a one-element array.
 *
 * $this->checkString() is always invoked first with the label 'toNFD'
 * (presumably it validates the argument type -- verify against its
 * definition). If $this->checkEncoding() then rejects the value, the
 * result is array(null); otherwise it is array(<NFD-normalized string>).
 *
 * @param mixed $s Value to normalize.
 * @return array Single-element array holding the normalized string or null.
 */
public function ustringToNFD($s)
{
    $this->checkString('toNFD', $s, false);

    # Only normalize input that passed the encoding check.
    $normalized = $this->checkEncoding($s) ? UtfNormal::toNFD($s) : null;

    return array($normalized);
}
/**
 * Validate a name and reduce it to a canonical string for spoof comparison.
 *
 * Processing order:
 *  1. Reject non-strings, empty strings and strings containing a
 *     blacklisted character.
 *  2. NFD-normalize and check the blacklist a second time.
 *  3. Reject a leading combining mark, then strip all combining marks.
 *  4. Reject unassigned/deprecated scripts, names without letters, and
 *     script mixtures that isAllowedScriptCombination() refuses.
 *  5. Map characters through equivString(), merge "VV" -> "W" and
 *     "RN" -> "M", strip ASCII punctuation and spaces.
 *  6. Reject an empty result; otherwise prefix it with "v2:".
 *
 * TODO: does too much in one routine, refactor...
 *
 * @param $testName
 * @return array array("OK", <canonical name>) on success,
 *               array("ERROR", <message>) on any rejection.
 */
public static function checkUnicodeString($testName)
{
    # --- Sanity checks on the raw input ---------------------------------
    if (!is_string($testName)) {
        return array("ERROR", wfMsg('antispoof-badtype'));
    }
    if (strlen($testName) == 0) {
        return array("ERROR", wfMsg('antispoof-empty'));
    }
    $rawChars = self::stringToList($testName);
    if (array_intersect($rawChars, self::$character_blacklist)) {
        return array("ERROR", wfMsg('antispoof-blacklisted'));
    }

    # --- Unicode normalization ------------------------------------------
    $normalized = UtfNormal::toNFD($testName);
    $chars = self::stringToList($normalized);

    # Paranoia: re-check the blacklist in case normalization ever
    # produces a blacklisted character.
    if (array_intersect($chars, self::$character_blacklist)) {
        return array("ERROR", wfMsg('antispoof-blacklisted'));
    }

    # A valid Unicode string should never start with a combining mark.
    $leadingScript = self::getScriptCode($chars[0]);
    if ($leadingScript == "SCRIPT_COMBINING_MARKS") {
        return array("ERROR", wfMsg('antispoof-combining'));
    }

    # Crude accent stripping: NFD has already split accented characters
    # into base + combining mark, so dropping the marks is enough.
    $chars = self::stripScript($chars, "SCRIPT_COMBINING_MARKS");

    # --- Script analysis --------------------------------------------------
    $scriptCodes = array_map(array('AntiSpoof', 'getScriptCode'), $chars);
    $scripts = array_unique($scriptCodes);

    if (in_array("SCRIPT_UNASSIGNED", $scripts) || in_array("SCRIPT_DEPRECATED", $scripts)) {
        return array("ERROR", wfMsg('antispoof-unassigned'));
    }

    # ASCII punctuation and digits do not count towards script mixing.
    $ignorable = array("SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS");
    $scripts = array_diff($scripts, $ignorable);

    if (!$scripts) {
        return array("ERROR", wfMsg('antispoof-noletters'));
    }
    if (count($scripts) > 1 && !self::isAllowedScriptCombination($scripts)) {
        return array("ERROR", wfMsg('antispoof-mixedscripts'));
    }

    # At this point, we should probably check for BiDi violations if they
    # aren't caught above...

    # --- Canonicalization -------------------------------------------------
    # Replace characters in the confusables set with their equivalence chars.
    $chars = self::equivString($chars);

    # Very simple sequence processing ("vv" -> "w", "rn" -> "m"). Not
    # exhaustive, but ups the ante. This must run _after_ canonicalization:
    # looks weird, but is needed for consistency. The table is ordered;
    # entries are applied top to bottom.
    $lookalikePairs = array(
        "VV" => "W",
        "RN" => "M",
    );
    foreach ($lookalikePairs as $sequence => $replacement) {
        $chars = self::mergePairs(
            $chars,
            self::equivString(self::stringToList($sequence)),
            self::equivString(self::stringToList($replacement))
        );
    }

    # Squeeze out all punctuation chars.
    # TODO: almost the same code occurs twice, refactor into own routine
    $chars = self::stripScript($chars, "SCRIPT_ASCII_PUNCTUATION");
    $canonical = self::listToString($chars);

    # Remove all remaining spaces, just in case any have snuck through...
    $canonical = self::hardjoin(explode(" ", $canonical));

    # Reduce repeated char sequences to single character
    # BUG: TODO: implement this

    if (strlen($canonical) < 1) {
        return array("ERROR", wfMsg('antispoof-tooshort'));
    }

    # Don't ASCIIfy: we assume we are UTF-8 capable on output.
    # The version prefix future-proofs stored values against algorithm changes.
    return array("OK", "v2:" . $canonical);
}
/**
 * Prepare a word so that it can be used with strtr() safely.
 *
 *  1. Decode character references via Sanitizer::decodeCharReferences().
 *  2. Append self::LETTER_END after every unit of the text: with
 *     self::DECOMPOSE set, after each character matched by "." in the
 *     NFD form; otherwise after each \X match (a base character together
 *     with its combining marks).
 *  3. Insert self::WORD_END and self::WORD_START at transitions between
 *     letters and non-letters.
 *
 * @param $word String from user input
 * @param $flags may include self::DECOMPOSE, self::IGNORE_ENDINGS
 * @return String
 */
static function forTransliteration( $word, $flags ) {
    // Boundary patterns are built once and reused on later calls.
    static $boundaryRegexes = null;

    // NOTE: this is very slightly inconsistent with MediaWiki: if an NFD code-point
    // has been HTML escaped it will be converted to NFC if it passes through
    // transliteration unchanged. Considered a WONTFIX.
    $text = Sanitizer::decodeCharReferences( $word );

    // Choose what counts as one "unit" to be followed by LETTER_END.
    if ( $flags & self::DECOMPOSE ) {
        $text = UtfNormal::toNFD( $text );
        $unitPattern = '/./u';
    } else {
        $unitPattern = '/\X/u';
    }
    $text = preg_replace( $unitPattern, '$0' . self::LETTER_END, $text );

    if ( $boundaryRegexes === null ) {
        // A "letter" is a unicode letter followed by some combining characters.
        // A "non-letter" is any other character followed by some combining characters.
        // In 'endings' mode the start and end of the string count as non-letters;
        // in 'ignore-endings' mode only internal transitions are touched.
        $combining = '(?:[\pM]*' . self::LETTER_END . ')';
        $nonletter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';

        $boundaryRegexes = array(
            'endings' => array(
                'start' => "/(^$combining?|$nonletter$combining)([\pL])/u",
                'end' => "/([\pL]$combining)([^\pL]|$)/u",
            ),
            'ignore-endings' => array(
                'start' => "/($nonletter$combining)([\pL])/u",
                'end' => "/([\pL]$combining)([^\pL])/u",
            ),
        );
    }

    if ( $flags & self::IGNORE_ENDINGS ) {
        $mode = 'ignore-endings';
    } else {
        $mode = 'endings';
    }
    $patterns = $boundaryRegexes[$mode];

    // "end" runs first so that the "start" pattern (whose non-letter class
    // excludes WORD_END) sees the word-endings already marked.
    $text = preg_replace( $patterns['end'], '$1' . self::WORD_END . '$2', $text );

    return preg_replace( $patterns['start'], '$1' . self::WORD_START . '$2', $text );
}