/**
 * Validate a name and reduce it to a canonical form for comparison.
 *
 * The checks run in this order: configuration/type/empty sanity checks,
 * blacklist lookup on the raw characters, NFKD decomposition, blacklist
 * lookup on the decomposed characters, leading combining mark, unassigned
 * and deprecated script codes, "no letters" and mixed-script detection.
 * The surviving characters are then mapped through self::equivString(),
 * pair-merged, stripped of ASCII punctuation and spaces, and prefixed
 * with a version tag.
 *
 * TODO: does too much in one routine, refactor...
 *
 * @param $testName
 * @return array array("OK", "v2:" . canonical name) on success,
 *   array("ERROR", message text) for the failures reported directly here,
 *   or whatever self::badCharErr() returns when a single character is rejected
 * @throws MWException when $wgAntiSpoofBlacklist is not an array
 */
public static function checkUnicodeString($testName) {
    global $wgAntiSpoofBlacklist;

    # Sanity checks on the configuration and on the input come first
    if (!is_array($wgAntiSpoofBlacklist)) {
        throw new MWException('$wgAntiSpoofBlacklist should be an array!');
    }
    if (!is_string($testName)) {
        return array("ERROR", wfMessage('antispoof-badtype')->text());
    }
    if ($testName === '') {
        return array("ERROR", wfMessage('antispoof-empty')->text());
    }

    # First blacklist pass: the characters exactly as they were supplied
    $rawChars = self::stringToList($testName);
    foreach ($rawChars as $rawChar) {
        if (in_array($rawChar, $wgAntiSpoofBlacklist)) {
            return self::badCharErr('antispoof-blacklisted', $rawChar);
        }
    }

    # Perform Unicode _compatibility_ decomposition
    $decomposedName = UtfNormal::toNFKD($testName);
    $chars = self::stringToList($decomposedName);

    # Be paranoid: second blacklist pass on the decomposed form, just in case
    # the Unicode normalization code changes...
    foreach ($chars as $decomposedChar) {
        if (in_array($decomposedChar, $wgAntiSpoofBlacklist)) {
            return self::badCharErr('antispoof-blacklisted', $decomposedChar);
        }
    }

    # A leading combining mark should not happen in any valid Unicode string
    if (self::getScriptCode($chars[0]) == "SCRIPT_COMBINING_MARKS") {
        return self::badCharErr('antispoof-combining', $chars[0]);
    }

    # Strip all combining characters in order to crudely strip accents
    # Note: NFKD normalization should have decomposed all accented chars earlier
    $chars = self::stripScript($chars, "SCRIPT_COMBINING_MARKS");

    $scriptCodes = array_map(array('AntiSpoof', 'getScriptCode'), $chars);

    # Script codes that are never acceptable, checked in this order, each
    # paired with the message key reported for the offending character
    $rejectedScripts = array(
        "SCRIPT_UNASSIGNED" => 'antispoof-unassigned',
        "SCRIPT_DEPRECATED" => 'antispoof-deprecated',
    );
    foreach ($rejectedScripts as $scriptCode => $messageKey) {
        $offendingKey = array_search($scriptCode, $scriptCodes);
        if ($offendingKey !== false) {
            return self::badCharErr($messageKey, $chars[$offendingKey]);
        }
    }

    # Collapse to the distinct scripts in use.
    # We don't mind ASCII punctuation or digits
    $distinctScripts = array_diff(
        array_unique($scriptCodes),
        array("SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS")
    );
    if (!$distinctScripts) {
        return array("ERROR", wfMessage('antispoof-noletters')->text());
    }
    if (count($distinctScripts) > 1 && !self::isAllowedScriptCombination($distinctScripts)) {
        return array("ERROR", wfMessage('antispoof-mixedscripts')->text());
    }

    # At this point, we should probably check for BiDi violations if they aren't
    # caught above...

    # Replace characters in confusables set with equivalence chars
    $chars = self::equivString($chars);

    # Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
    # Not exhaustive, but ups the ante...
    # Do this _after_ canonicalization: looks weird, but needed for consistency
    $pairVV = self::equivString(self::stringToList("VV"));
    $singleW = self::equivString(self::stringToList("W"));
    $chars = self::mergePairs($chars, $pairVV, $singleW);

    $pairRN = self::equivString(self::stringToList("RN"));
    $singleM = self::equivString(self::stringToList("M"));
    $chars = self::mergePairs($chars, $pairRN, $singleM);

    # Squeeze out all punctuation chars
    # TODO: almost the same code occurs twice, refactor into own routine
    $chars = self::stripScript($chars, "SCRIPT_ASCII_PUNCTUATION");

    $canonical = self::listToString($chars);

    # Remove all remaining spaces, just in case any have snuck through...
    $canonical = self::hardjoin(explode(" ", $canonical));

    # Reduce repeated char sequences to single character
    # BUG: TODO: implement this

    if (strlen($canonical) < 1) {
        return array("ERROR", wfMessage('antispoof-tooshort')->text());
    }

    # Don't ASCIIfy: we assume we are UTF-8 capable on output
    # Prepend version string, for futureproofing if this algorithm changes,
    # and return the canonical version of the name
    return array("OK", "v2:" . $canonical);
}
/**
 * Thin wrapper that forwards a value to UtfNormal::toNFKD() with the
 * literal "php" as second argument and hands back the result unchanged.
 *
 * NOTE(review): the second argument presumably selects a pure-PHP
 * normalization code path; its meaning is not visible here, so confirm
 * against the UtfNormal implementation in use.
 *
 * @param mixed $c value passed straight through to UtfNormal::toNFKD()
 * @return mixed whatever UtfNormal::toNFKD() returns
 */
function normalize_form_kd_php($c) {
    $normalized = UtfNormal::toNFKD($c, "php");

    return $normalized;
}
/**
 * Reduces a string to the ASCII letters a-z / A-Z and the hyphen.
 *
 * A handful of non-ASCII letters are first replaced by an ASCII spelling
 * (German umlauts, ae ligature, thorn, eth, d with stroke, sharp s); every
 * other byte that is not an ASCII letter or a hyphen is removed, which
 * includes digits, whitespace and all remaining non-ASCII characters.
 *
 * @access private
 * @param string $string A Shibboleth attribute or other string
 * @return string ascii-version of attribute
 */
function toAscii($string) {
    require_once 'include/Unicode/UtfNormal.php';

    // Normalize to NFKD.
    // This separates letters from combining marks.
    // See http://unicode.org/reports/tr15
    $string = UtfNormal::toNFKD($string);

    // Ordered pattern => replacement table. preg_replace() applies array
    // patterns one after another in array order, so this behaves like the
    // equivalent chain of single preg_replace() calls. The patterns match raw
    // UTF-8 byte sequences (no /u modifier), and the final strip must stay last.
    $substitutions = array(
        // German usage of the diaeresis: a/o/u followed by U+0308 gets an "e" appended
        '/([aouAOU])\\xcc\\x88/' => '\\1e',
        // Combined ae character (U+00C6 / U+00E6) becomes separate a and e
        '/\\xc3\\x86/' => 'AE',
        '/\\xc3\\xa6/' => 'ae',
        // Thorn (U+00DE / U+00FE) becomes th
        '/\\xc3\\x9e/' => 'TH',
        '/\\xc3\\xbe/' => 'th',
        // Eth (U+00D0 / U+00F0) becomes d. The lowercase form was previously
        // missing and therefore silently deleted by the final strip.
        '/\\xc3\\x90/' => 'D',
        '/\\xc3\\xb0/' => 'd',
        // d with stroke (U+0111 / U+0110) becomes d
        '/\\xc4\\x91/' => 'd',
        '/\\xc4\\x90/' => 'D',
        // Sharp s (U+00DF) becomes ss
        '/\\xc3\\x9f/' => 'ss',
        // Get rid of everything except the characters a to z and the hyphen
        '/[^a-zA-Z\\-]/i' => '',
    );

    return preg_replace(array_keys($substitutions), array_values($substitutions), $string);
}