/**
  * Validates a name and reduces it to a canonical form for spoof comparison.
  *
  * Steps, in order: type/emptiness checks, blacklist scan of the raw string,
  * NFKD decomposition, second blacklist scan, rejection of a leading combining
  * mark, removal of combining marks, rejection of unassigned/deprecated
  * characters, script-mixing check, confusable folding, pair merging
  * ("VV" -> "W", "RN" -> "M"), punctuation and space removal.
  *
  * TODO: still does a lot in one routine; only the blacklist scan is factored out so far.
  *
  * @param string $testName name to check
  * @return array two elements: array("ERROR", <message text>) on rejection,
  *               or array("OK", "v2:" . <canonical name>) on success
  * @throws MWException if $wgAntiSpoofBlacklist is not an array
  */
 public static function checkUnicodeString($testName)
 {
     global $wgAntiSpoofBlacklist;
     # Start with some sanity checking
     if (!is_array($wgAntiSpoofBlacklist)) {
         throw new MWException('$wgAntiSpoofBlacklist should be an array!');
     }
     if (!is_string($testName)) {
         return array("ERROR", wfMessage('antispoof-badtype')->text());
     }
     if (strlen($testName) == 0) {
         return array("ERROR", wfMessage('antispoof-empty')->text());
     }
     # Reject blacklisted characters in the raw input
     $blacklistError = self::findBlacklistedCharError(self::stringToList($testName), $wgAntiSpoofBlacklist);
     if ($blacklistError !== null) {
         return $blacklistError;
     }
     # Perform Unicode _compatibility_ decomposition
     $testName = UtfNormal::toNFKD($testName);
     $testChars = self::stringToList($testName);
     # Be paranoid: check again, just in case Unicode normalization code changes...
     $blacklistError = self::findBlacklistedCharError($testChars, $wgAntiSpoofBlacklist);
     if ($blacklistError !== null) {
         return $blacklistError;
     }
     # Check for this: should not happen in any valid Unicode string.
     # The isset() guard avoids reading an undefined offset when the character
     # list is empty; such input is then left to the checks further down
     # (NOTE(review): expected to end in 'antispoof-noletters' - confirm against stripScript()).
     if (isset($testChars[0]) && self::getScriptCode($testChars[0]) == "SCRIPT_COMBINING_MARKS") {
         return self::badCharErr('antispoof-combining', $testChars[0]);
     }
     # Strip all combining characters in order to crudely strip accents
     # Note: NFKD normalization should have decomposed all accented chars earlier
     $testChars = self::stripScript($testChars, "SCRIPT_COMBINING_MARKS");
     $testScripts = array_map(array('AntiSpoof', 'getScriptCode'), $testChars);
     # array_search() yields the key of the first offending script code; array_map()
     # preserved the keys of $testChars, so the same key addresses the character.
     $unassigned = array_search("SCRIPT_UNASSIGNED", $testScripts);
     if ($unassigned !== false) {
         return self::badCharErr('antispoof-unassigned', $testChars[$unassigned]);
     }
     $deprecated = array_search("SCRIPT_DEPRECATED", $testScripts);
     if ($deprecated !== false) {
         return self::badCharErr('antispoof-deprecated', $testChars[$deprecated]);
     }
     $testScripts = array_unique($testScripts);
     # We don't mind ASCII punctuation or digits
     $testScripts = array_diff($testScripts, array("SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS"));
     if (!$testScripts) {
         return array("ERROR", wfMessage('antispoof-noletters')->text());
     }
     if (count($testScripts) > 1 && !self::isAllowedScriptCombination($testScripts)) {
         return array("ERROR", wfMessage('antispoof-mixedscripts')->text());
     }
     # At this point, we should probably check for BiDi violations if they aren't
     # caught above...
     # Replace characters in confusables set with equivalence chars
     $testChars = self::equivString($testChars);
     # Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
     # Not exhaustive, but ups the ante...
     # Do this _after_ canonicalization: looks weird, but needed for consistency
     $testChars = self::mergePairs($testChars, self::equivString(self::stringToList("VV")), self::equivString(self::stringToList("W")));
     $testChars = self::mergePairs($testChars, self::equivString(self::stringToList("RN")), self::equivString(self::stringToList("M")));
     # Squeeze out all punctuation chars
     # TODO: almost the same code occurs twice, refactor into own routine
     $testChars = self::stripScript($testChars, "SCRIPT_ASCII_PUNCTUATION");
     $testName = self::listToString($testChars);
     # Remove all remaining spaces, just in case any have snuck through...
     $testName = self::hardjoin(explode(" ", $testName));
     # Reduce repeated char sequences to single character
     # BUG: TODO: implement this
     if (strlen($testName) < 1) {
         return array("ERROR", wfMessage('antispoof-tooshort')->text());
     }
     # Don't ASCIIfy: we assume we are UTF-8 capable on output
     # Prepend version string, for futureproofing if this algorithm changes
     $testName = "v2:" . $testName;
     # And return the canonical version of the name
     return array("OK", $testName);
 }

 /**
  * Scans a character list for the first entry that appears in the blacklist.
  *
  * The comparison is the loose in_array() form used by the original inline
  * loops; it is kept because the element types of the blacklist are not
  * visible from this routine.
  *
  * @param array $chars characters as produced by self::stringToList()
  * @param array $blacklist blacklisted characters
  * @return array|null result of self::badCharErr() for the first hit, or null if none
  */
 private static function findBlacklistedCharError($chars, $blacklist)
 {
     foreach ($chars as $char) {
         if (in_array($char, $blacklist)) {
             return self::badCharErr('antispoof-blacklisted', $char);
         }
     }
     return null;
 }
Example #2
0
/**
 * Thin wrapper around UtfNormal::toNFKD() that always passes "php" as the
 * second argument. What that argument selects is defined by UtfNormal,
 * which is not visible here.
 *
 * @param mixed $c value handed to UtfNormal::toNFKD() unchanged
 * @return mixed whatever UtfNormal::toNFKD() returns
 */
function normalize_form_kd_php($c)
{
    $decomposed = UtfNormal::toNFKD($c, "php");

    return $decomposed;
}
 /**
  * Replaces any non-ASCII character by its linguistically most logical substitution
  *
  * The input is NFKD-normalized, a few letters get explicit ASCII spellings
  * (umlauts -> vowel + "e", AE ligature, thorn, eth, d with stroke, sharp s),
  * and finally every byte that is not A-Z, a-z or a hyphen is removed.
  *
  * @access private
  * @param string A Shibboleth attribute or other string
  * @return string ascii-version of attribute
  */
 function toAscii($string)
 {
     require_once 'include/Unicode/UtfNormal.php';
     // Normalize to NFKD.
     // This separates letters from combining marks.
     // See http://unicode.org/reports/tr15
     $string = UtfNormal::toNFKD($string);
     // Replace german usages of diaeresis by appending an e
     // (\xcc\x88 is the UTF-8 encoding of U+0308 COMBINING DIAERESIS)
     $string = preg_replace('/([aouAOU])\\xcc\\x88/', '\\1e', $string);
     // Letters without a decomposition get an explicit ASCII spelling.
     // Keys are raw UTF-8 byte sequences; every replacement is pure ASCII, so
     // a single strtr() pass gives the same result as replacing one by one.
     $string = strtr($string, array(
         "\xc3\x86" => 'AE', // U+00C6 ligature AE
         "\xc3\xa6" => 'ae', // U+00E6 ligature ae
         "\xc3\x9e" => 'TH', // U+00DE capital thorn
         "\xc3\xbe" => 'th', // U+00FE small thorn
         "\xc3\x90" => 'D',  // U+00D0 capital eth
         "\xc3\xb0" => 'd',  // U+00F0 small eth (was previously unmapped and therefore dropped)
         "\xc4\x91" => 'd',  // U+0111 small d with stroke
         "\xc4\x90" => 'D',  // U+0110 capital D with stroke
         "\xc3\x9f" => 'ss', // U+00DF sharp s
     ));
     // Get rid of everything except the characters a to z and the hyphen
     $string = preg_replace('/[^a-zA-Z\\-]/i', '', $string);
     return $string;
 }