Example #1
0
/**
 * Reduce a string to printable ASCII.
 *
 * The input is first run through strtr() with the replacement table returned
 * by _gs_utf8_get_map(), then through UtfNormal::toNFD(), and finally every
 * byte outside the range 0x20-0x7E is removed.
 *
 * The replacement table is kept in a static variable, so it is fetched again
 * only while the cached value is not an array.
 *
 * @param string $str Text to convert (presumably UTF-8; confirm with callers)
 * @return string Result of the final preg_replace(): only bytes 0x20-0x7E remain
 */
function gs_utf8_decompose_to_ascii($str)
{
    static $replacements = null;
    if (!is_array($replacements)) {
        $replacements = _gs_utf8_get_map();
    }

    $mapped = strtr($str, $replacements);
    $decomposed = UtfNormal::toNFD($mapped);

    # Keep only "safe" ASCII: no control chars, no newlines, nothing above 0x7E.
    return preg_replace('/[^\\x20-\\x7E]/', '', $decomposed);
}
Example #2
0
/**
 * Thin wrapper that forwards $c to UtfNormal::toNFD() together with the
 * literal second argument "php".
 *
 * NOTE(review): the "php" argument presumably selects a pure-PHP code path;
 * confirm against the UtfNormal implementation in use.
 *
 * @param string $c Value handed to UtfNormal::toNFD()
 * @return mixed Whatever UtfNormal::toNFD() returns
 */
function normalize_form_d_php($c)
{
    $decomposed = UtfNormal::toNFD($c, "php");

    return $decomposed;
}
 /**
  * Return the NFD form of $s wrapped in a one-element array.
  *
  * The argument is first passed to checkString() (called with the name
  * 'toNFD' and a trailing false flag). When checkEncoding() rejects the
  * string, the single element of the result is null instead.
  *
  * @param string $s Value to normalize
  * @return array One-element array: UtfNormal::toNFD($s), or null
  */
 public function ustringToNFD($s)
 {
     $this->checkString('toNFD', $s, false);

     if ($this->checkEncoding($s)) {
         return array(UtfNormal::toNFD($s));
     }

     return array(null);
 }
Example #4
0
 /**
  * Validate a name and reduce it to a canonical form for spoof comparison.
  *
  * Processing order: type / emptiness / blacklist checks, NFD normalization,
  * a second blacklist check, combining-mark stripping, script checks,
  * confusable replacement, pair merging ("VV" -> "W", "RN" -> "M"),
  * punctuation and space removal, and finally a "v2:" version prefix.
  *
  * TODO: does too much in one routine, refactor...
  *
  * @param $testName
  * @return array array("ERROR", message) when the name is rejected,
  *               array("OK", canonical name) otherwise
  */
 public static function checkUnicodeString($testName)
 {
     # Sanity checks on the raw input
     if (!is_string($testName)) {
         return array("ERROR", wfMsg('antispoof-badtype'));
     }
     if ($testName === '') {
         return array("ERROR", wfMsg('antispoof-empty'));
     }
     $rawChars = self::stringToList($testName);
     if (array_intersect($rawChars, self::$character_blacklist)) {
         return array("ERROR", wfMsg('antispoof-blacklisted'));
     }

     # Unicode normalization, then split into a character list again
     $charList = self::stringToList(UtfNormal::toNFD($testName));

     # Paranoia: repeat the blacklist check in case the normalization code changes
     if (array_intersect($charList, self::$character_blacklist)) {
         return array("ERROR", wfMsg('antispoof-blacklisted'));
     }

     # A leading combining mark should not occur in any valid Unicode string
     $firstScript = self::getScriptCode($charList[0]);
     if ($firstScript == "SCRIPT_COMBINING_MARKS") {
         return array("ERROR", wfMsg('antispoof-combining'));
     }

     # Crude accent stripping: NFD has already split accented characters,
     # so dropping the combining marks leaves the base characters
     $charList = self::stripScript($charList, "SCRIPT_COMBINING_MARKS");

     $scriptCodes = array_map(array('AntiSpoof', 'getScriptCode'), $charList);
     $scriptCodes = array_unique($scriptCodes);

     $hasUnassigned = in_array("SCRIPT_UNASSIGNED", $scriptCodes);
     if ($hasUnassigned || in_array("SCRIPT_DEPRECATED", $scriptCodes)) {
         return array("ERROR", wfMsg('antispoof-unassigned'));
     }

     # ASCII punctuation and digits do not count as a script of their own
     $ignoredScripts = array("SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS");
     $scriptCodes = array_diff($scriptCodes, $ignoredScripts);
     if (!$scriptCodes) {
         return array("ERROR", wfMsg('antispoof-noletters'));
     }
     if (count($scriptCodes) > 1 && !self::isAllowedScriptCombination($scriptCodes)) {
         return array("ERROR", wfMsg('antispoof-mixedscripts'));
     }

     # At this point, we should probably check for BiDi violations if they
     # aren't caught above...

     # Map every character in the confusables set to its equivalence character
     $charList = self::equivString($charList);

     # Very simple sequence processing: "vv" -> "w", "rn" -> "m"...
     # Not exhaustive, but ups the ante. It runs _after_ canonicalization
     # (looks weird, but needed for consistency), so both sides of each
     # pair go through equivString() as well. Order matters: VV first, then RN.
     foreach (array("VV" => "W", "RN" => "M") as $sequence => $replacement) {
         $charList = self::mergePairs(
             $charList,
             self::equivString(self::stringToList($sequence)),
             self::equivString(self::stringToList($replacement))
         );
     }

     # Squeeze out all punctuation chars
     # TODO: almost the same code occurs twice, refactor into own routine
     $charList = self::stripScript($charList, "SCRIPT_ASCII_PUNCTUATION");

     # Rebuild the string and drop any spaces that have snuck through
     $canonical = self::listToString($charList);
     $canonical = self::hardjoin(explode(" ", $canonical));

     # Reduce repeated char sequences to single character
     # BUG: TODO: implement this
     if (strlen($canonical) < 1) {
         return array("ERROR", wfMsg('antispoof-tooshort'));
     }

     # No ASCIIfication: output is assumed to be UTF-8 capable.
     # The version prefix future-proofs stored values against algorithm changes.
     return array("OK", "v2:" . $canonical);
 }
	/**
	 * Prepare user text so that it can be fed to strtr() safely.
	 *
	 * 1. decode character references (Sanitizer::decodeCharReferences)
	 * 2. append self::LETTER_END after every unit: each character matched by
	 *    /./u of the NFD form when self::DECOMPOSE is set, otherwise each
	 *    \X extended sequence
	 * 3. insert self::WORD_START / self::WORD_END bookends at word boundaries
	 *
	 * @param $word  String from user input
	 * @param $flags  may include self::DECOMPOSE, self::IGNORE_ENDINGS
	 * @return String
	 */
	static function forTransliteration( $word, $flags ) {
		static $boundaryPatterns = null;

		if ( !$boundaryPatterns ) {
			// Built once and reused by later calls.
			// A "letter" is a unicode letter followed by some combining characters.
			// A "non-letter" is any other character followed by some combining characters.
			// With endings enabled, the start and end of the string count as non-letters;
			// with endings ignored, only transitions inside the string are marked.
			$combining = '(?:[\pM]*' . self::LETTER_END . ')';
			$nonletter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';
			$boundaryPatterns = array (
				'endings' => array (
					'start' => "/(^$combining?|$nonletter$combining)([\pL])/u",
					'end' => "/([\pL]$combining)([^\pL]|$)/u",
				),
				'ignore-endings' => array (
					'start' => "/($nonletter$combining)([\pL])/u",
					'end' => "/([\pL]$combining)([^\pL])/u",
				),
			);
		}

		$mode = ( $flags & self::IGNORE_ENDINGS ) ? 'ignore-endings' : 'endings';
		$patterns = $boundaryPatterns[$mode];

		// NOTE: this is very slightly inconsistent with MediaWiki: an NFD code-point
		// that has been HTML escaped will be converted to NFC if it passes through
		// transliteration unchanged. Considered a WONTFIX.
		$word = Sanitizer::decodeCharReferences( $word );

		// Choose what counts as one unit, then terminate every unit with LETTER_END.
		if ( $flags & self::DECOMPOSE ) {
			$word = UtfNormal::toNFD( $word );
			$unitPattern = '/./u';
		} else {
			$unitPattern = '/\X/u';
		}
		$word = preg_replace( $unitPattern, '$0' . self::LETTER_END, $word );

		// "end" runs before "start" so that the "start" patterns, whose non-letter
		// class excludes WORD_END, see the endings that were just inserted.
		$word = preg_replace( $patterns['end'], '$1' . self::WORD_END . '$2', $word );
		$word = preg_replace( $patterns['start'], '$1' . self::WORD_START . '$2', $word );

		return $word;
	}