PHP UtfNormal::toNFD Examples

Programming Language: PHP

Class/Type: UtfNormal

Method/Function: toNFD

Examples at hotexamples.com: 5

PHP UtfNormal::toNFD – 5 examples found. These are the top-rated real-world PHP examples of UtfNormal::toNFD, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

cleanUp(30)

toNFC(7)

quickIsNFCVerify(6)

toNFD(5)

fastCombiningSort(3)

fastDecompose(3)

toNFKD(3)

NFC(2)

NFKC(2)

loadData(2)

quickIsNFC(2)

NFD(1)

NFKD(1)

fastCompose(1)

quickisNFCVerify(1)

toNFKC(1)

Example #1

Show file

File: gs_utf_normal.php Project: rkania/GS3

/**
 * Reduce a string to printable ASCII.
 *
 * Steps, in order:
 *  1. apply the replacement table returned by _gs_utf8_get_map() via strtr()
 *     (the table is fetched on first use and kept in a static for later calls),
 *  2. pass the result through UtfNormal::toNFD(),
 *  3. remove every character outside the range 0x20-0x7E.
 *
 * @param string $str  Input text (presumably UTF-8; confirm with callers)
 * @return string  Whatever the final preg_replace() yields
 */
function gs_utf8_decompose_to_ascii($str)
{
    static $replacements = null;

    // Re-fetch for as long as no array has been cached yet.
    $replacements = is_array($replacements) ? $replacements : _gs_utf8_get_map();

    $normalized = UtfNormal::toNFD(strtr($str, $replacements));

    // Keep only "safe" ASCII: no control characters, newlines etc.
    // A stricter allow-list was considered at some point:
    //   preg_replace('/[^a-z0-9\-_. *#\'"!$()\/]/i', '', $str)
    return preg_replace('/[^\\x20-\\x7E]/', '', $normalized);
}

Example #2

Show file

File: UtfNormalTest2.php Project: seedbank/old-repo

/**
 * Forward $c to UtfNormal::toNFD(), passing "php" as the second argument.
 *
 * @param mixed $c  Value handed to UtfNormal::toNFD() unchanged
 * @return mixed  The return value of UtfNormal::toNFD()
 */
function normalize_form_d_php($c)
{
    $normalized = UtfNormal::toNFD($c, "php");

    return $normalized;
}

Example #3

Show file

File: UstringLibrary.php Project: negati-ve/openshift-mediawiki

 /**
  * Return the NFD form of $s wrapped in a one-element array.
  *
  * $this->checkString() is always invoked first. If $this->checkEncoding()
  * then reports a falsy result, the single element is null instead.
  *
  * @param mixed $s  Value to normalize
  * @return array  array(normalized string) or array(null)
  */
 public function ustringToNFD($s)
 {
     $this->checkString('toNFD', $s, false);

     return $this->checkEncoding($s)
         ? array(UtfNormal::toNFD($s))
         : array(null);
 }

Example #4

Show file

File: AntiSpoof_body.php Project: Tjorriemorrie/app

 /**
  * Check a name and reduce it to a canonical form.
  *
  * The input is validated, normalized with UtfNormal::toNFD(), stripped of
  * characters in the SCRIPT_COMBINING_MARKS class, checked for disallowed
  * script mixtures, mapped through equivString(), has the pairs "VV" -> "W"
  * and "RN" -> "M" merged, and finally has ASCII punctuation and spaces
  * removed. The resulting string is prefixed with "v2:".
  *
  * Every failure path returns early with a message obtained from wfMsg().
  *
  * TODO: does too much in one routine, refactor...
  *
  * @param $testName  The name to check; anything that is not a string is rejected
  * @return array  Two elements: array("OK", canonical name) on success,
  *                array("ERROR", message) on failure
  */
 public static function checkUnicodeString($testName)
 {
     # Start with some sanity checking
     if (!is_string($testName)) {
         return array("ERROR", wfMsg('antispoof-badtype'));
     }
     if (strlen($testName) == 0) {
         return array("ERROR", wfMsg('antispoof-empty'));
     }
     if (array_intersect(self::stringToList($testName), self::$character_blacklist)) {
         return array("ERROR", wfMsg('antispoof-blacklisted'));
     }
     # Perform Unicode normalization
     $testName = UtfNormal::toNFD($testName);
     $testChars = self::stringToList($testName);
     # Be paranoid: check again, just in case Unicode normalization code changes...
     if (array_intersect($testChars, self::$character_blacklist)) {
         return array("ERROR", wfMsg('antispoof-blacklisted'));
     }
     # Check for this: should not happen in any valid Unicode string
     # (the first character is itself classified as a combining mark)
     if (self::getScriptCode($testChars[0]) == "SCRIPT_COMBINING_MARKS") {
         return array("ERROR", wfMsg('antispoof-combining'));
     }
     # Strip all combining characters in order to crudely strip accents
     # Note: NFD normalization should have decomposed all accented chars earlier
     $testChars = self::stripScript($testChars, "SCRIPT_COMBINING_MARKS");
     # Collect the distinct script codes of the remaining characters
     $testScripts = array_unique(array_map(array('AntiSpoof', 'getScriptCode'), $testChars));
     if (in_array("SCRIPT_UNASSIGNED", $testScripts) || in_array("SCRIPT_DEPRECATED", $testScripts)) {
         return array("ERROR", wfMsg('antispoof-unassigned'));
     }
     # We don't mind ASCII punctuation or digits
     $testScripts = array_diff($testScripts, array("SCRIPT_ASCII_PUNCTUATION", "SCRIPT_ASCII_DIGITS"));
     # Nothing left after removing punctuation/digits: the name has no letters
     if (!$testScripts) {
         return array("ERROR", wfMsg('antispoof-noletters'));
     }
     # More than one script is only accepted for explicitly allowed combinations
     if (count($testScripts) > 1 && !self::isAllowedScriptCombination($testScripts)) {
         return array("ERROR", wfMsg('antispoof-mixedscripts'));
     }
     # At this point, we should probably check for BiDi violations if they aren't
     # caught above...
     # Replace characters in confusables set with equivalence chars
     $testChars = self::equivString($testChars);
     # Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
     # Not exhaustive, but ups the ante...
     # Do this _after_ canonicalization: looks weird, but needed for consistency
     $testChars = self::mergePairs($testChars, self::equivString(self::stringToList("VV")), self::equivString(self::stringToList("W")));
     $testChars = self::mergePairs($testChars, self::equivString(self::stringToList("RN")), self::equivString(self::stringToList("M")));
     # Squeeze out all punctuation chars
     # TODO: almost the same code occurs twice, refactor into own routine
     $testChars = self::stripScript($testChars, "SCRIPT_ASCII_PUNCTUATION");
     $testName = self::listToString($testChars);
     # Remove all remaining spaces, just in case any have snuck through...
     $testName = self::hardjoin(explode(" ", $testName));
     # Reduce repeated char sequences to single character
     # BUG: TODO: implement this
     if (strlen($testName) < 1) {
         return array("ERROR", wfMsg('antispoof-tooshort'));
     }
     # Don't ASCIIfy: we assume we are UTF-8 capable on output
     # Prepend version string, for futureproofing if this algorithm changes
     $testName = "v2:" . $testName;
     # And return the canonical version of the name
     return array("OK", $testName);
 }

Example #5

Show file

File: Transliterator_body.php Project: realsoc/mediawiki-extensions

	/**
	 * Prepare text so that it can be fed to strtr() safely.
	 *
	 * Processing order:
	 * 1. decode character references (Sanitizer::decodeCharReferences)
	 * 2. append self::LETTER_END after every unit: after each code point of
	 *    the NFD form when self::DECOMPOSE is set, otherwise after each
	 *    \X match (extended grapheme cluster)
	 * 3. insert self::WORD_END and self::WORD_START at word boundaries
	 *
	 * @param $word  String from user input
	 * @param $flags  may include self::DECOMPOSE, self::IGNORE_ENDINGS
	 * @return String
	 */
	static function forTransliteration( $word, $flags ) {
		// Boundary patterns are assembled on the first call and reused afterwards.
		static $patternSets = null;

		// NOTE: this is very slightly inconsistent with MediaWiki: if an NFD
		// code point has been HTML escaped, it will be converted to NFC when it
		// passes through transliteration unchanged. Probably a WONTFIX.
		$word = Sanitizer::decodeCharReferences( $word );

		// Choose what counts as one "unit" that receives a LETTER_END marker.
		if ( $flags & self::DECOMPOSE ) {
			$word = UtfNormal::toNFD( $word );
			$unitPattern = '/./u';
		} else {
			$unitPattern = '/\X/u';
		}
		$word = preg_replace( $unitPattern, '$0' . self::LETTER_END, $word );

		if ( $patternSets === null ) {
			// "Letter": a Unicode letter followed by some combining characters.
			// "Non-letter": any other character followed by some combining characters.
			// The 'end' pattern is applied before 'start', so 'start' has to
			// tolerate the WORD_END markers already inserted.
			// With endings enabled, the start and end of the string count as
			// non-letters; otherwise only internal transitions are touched.
			$marksThenEnd = '(?:[\pM]*' . self::LETTER_END . ')';
			$nonLetter = '[^\pL' . self::LETTER_END . self::WORD_END . '\pM]';

			$patternSets = array(
				'endings' => array(
					'start' => '/(^' . $marksThenEnd . '?|' . $nonLetter . $marksThenEnd . ')([\pL])/u',
					'end' => '/([\pL]' . $marksThenEnd . ')([^\pL]|$)/u',
				),
				'ignore-endings' => array(
					'start' => '/(' . $nonLetter . $marksThenEnd . ')([\pL])/u',
					'end' => '/([\pL]' . $marksThenEnd . ')([^\pL])/u',
				),
			);
		}

		if ( $flags & self::IGNORE_ENDINGS ) {
			$setName = 'ignore-endings';
		} else {
			$setName = 'endings';
		}
		$patterns = $patternSets[$setName];

		$word = preg_replace( $patterns['end'], '$1' . self::WORD_END . '$2', $word );

		return preg_replace( $patterns['start'], '$1' . self::WORD_START . '$2', $word );
	}