Beispiel #1
0
/**
* Normalize a string, or the strings held in an array, to NFC (Normalization Form Composition).
*
* The utf_normalizer class is included on demand when it has not been declared yet.
* Arrays are walked at most two levels deep: top-level values and the values of
* directly nested arrays are each handed to utf_normalizer::nfc().
* The normalizer's return value is ignored, so it presumably rewrites its argument
* in place (by reference) - confirm against utf_normalizer.
*
* @param	mixed	$strings	a string or an array of strings to normalize
* @return	mixed				the normalized content, preserving array keys if array given;
*							"empty" input (as decided by empty()) is returned untouched.
*/
function utf8_normalize_nfc($strings)
{
	// Nothing worth normalizing: hand the value straight back
	if (empty($strings))
	{
		return $strings;
	}

	// Pull in the normalizer only if nobody has declared it so far
	if (!class_exists('utf_normalizer'))
	{
		include(IP_ROOT_PATH . 'includes/utf/utf_normalizer.' . PHP_EXT);
	}

	// Scalar case
	if (!is_array($strings))
	{
		utf_normalizer::nfc($strings);

		return $strings;
	}

	// Array case: address every element through its key so the normalizer
	// receives the element stored in $strings itself
	foreach (array_keys($strings) as $outer_key)
	{
		if (!is_array($strings[$outer_key]))
		{
			utf_normalizer::nfc($strings[$outer_key]);
			continue;
		}

		foreach (array_keys($strings[$outer_key]) as $inner_key)
		{
			utf_normalizer::nfc($strings[$outer_key][$inner_key]);
		}
	}

	return $strings;
}
 /**
  * Clean up a text to remove non-alphanumeric characters
  *
  * The text is recoded to UTF-8 when another encoding is given, has its NCRs and
  * HTML entities decoded, is normalized to NFC, and then has ASCII letters
  * lowercased and every character that is neither alphanumeric, explicitly
  * allowed, nor mapped by a conversion table replaced with a space.
  * Hangul and CJK characters are kept and surrounded by spaces so that each one
  * can be indexed individually.
  *
  * Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
  *
  * Conversion tables are loaded lazily from
  * includes/utf/data/search_indexer_<block>.<ext> and cached in static variables
  * for the lifetime of the request. The utf_normalizer class, utf8_decode_ncr(),
  * utf8_recode() and the UTF8_* range constants must already be available.
  *
  * @param	string	$text			Text to clean, in UTF-8 (not normalized or sanitized)
  * @param	string	$allowed_chars	String of special chars to allow
  * @param	string	$encoding		Text encoding
  * @return	string					Cleaned up text, only alphanumeric chars are left
  *
  * @todo normalizer::cleanup being able to be used?
  */
 function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
 {
     global $phpbb_root_path, $phpEx;
     static $conv = array(), $conv_loaded = array();
     $allow = array();
     // Convert the text to UTF-8
     $encoding = strtolower($encoding);
     if ($encoding !== 'utf-8') {
         $text = utf8_recode($text, $encoding);
     }
     /**
      * Length of a UTF-8 sequence, keyed by the high nibble of its lead byte.
      *
      * Integer keys are used on purpose: raw byte strings in the source are
      * silently corrupted as soon as the file itself is re-encoded.
      */
     $utf_len_mask = array(0xC0 => 2, 0xD0 => 2, 0xE0 => 3, 0xF0 => 4);
     /**
      * Replace HTML entities and NCRs
      */
     $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
     /**
      * Normalize to NFC; the normalizer class has to be loaded by the caller
      */
     utf_normalizer::nfc($text);
     /**
      * The first thing we do is:
      *
      * - convert ASCII-7 letters to lowercase
      * - remove the ASCII-7 non-alpha characters (punctuation and control
      *   characters; digits are kept)
      * - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
      *   0xC1 and 0xF5-0xFF
      *
      * Everything is written with escapes so the byte values survive any
      * re-encoding of this file. Control characters are listed explicitly
      * because they have no entry in the sequence-length table.
      *
      * @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
      */
     $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ"
         . "\r\n\t!\"#\$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
         . "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F"
         . "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F"
         . "\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
     // 26 lowercase replacements, then one space per remaining byte, so both
     // strings always have the same length and stay index-aligned
     $sb_replace = 'istcpamelrdojbnhfgvwuqkyxz' . str_repeat(' ', strlen($sb_match) - 26);
     /**
      * This is the list of legal ASCII chars, it is automatically extended
      * with ASCII chars from $allowed_chars
      */
     $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';
     /**
      * Prepare an array containing the extra chars to allow
      */
     if (isset($allowed_chars[0])) {
         $pos = 0;
         $len = strlen($allowed_chars);
         while ($pos < $len) {
             $c = $allowed_chars[$pos];
             $byte = ord($c);
             if ($byte < 0x80) {
                 /**
                  * ASCII char
                  */
                 $sb_pos = strpos($sb_match, $c);
                 if ($sb_pos !== false) {
                     /**
                      * Remove the char from $sb_match and its corresponding
                      * replacement in $sb_replace
                      */
                     $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
                     $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
                     $legal_ascii .= $c;
                 }
                 ++$pos;
                 continue;
             }
             /**
              * UTF-8 char; an unexpected lead byte is consumed as a single
              * byte so the position always advances
              */
             $lead = $byte & 0xF0;
             $utf_len = isset($utf_len_mask[$lead]) ? $utf_len_mask[$lead] : 1;
             $allow[substr($allowed_chars, $pos, $utf_len)] = 1;
             $pos += $utf_len;
         }
     }
     $text = strtr($text, $sb_match, $sb_replace);
     $ret = '';
     $pos = 0;
     $len = strlen($text);
     while ($pos < $len) {
         /**
          * Do all consecutive ASCII chars at once
          */
         $spn = strspn($text, $legal_ascii, $pos);
         if ($spn) {
             $ret .= substr($text, $pos, $spn);
             $pos += $spn;
             if ($pos >= $len) {
                 break;
             }
         }
         /**
          * Capture the UTF char. A byte that is not a known lead byte (a stray
          * continuation byte, for instance) is taken on its own; it ends up
          * replaced by a space further down instead of stalling the loop.
          */
         $lead = ord($text[$pos]) & 0xF0;
         $utf_len = isset($utf_len_mask[$lead]) ? $utf_len_mask[$lead] : 1;
         $utf_char = substr($text, $pos, $utf_len);
         $pos += $utf_len;
         if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
             || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
             || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST)) {
             /**
              * All characters within these ranges are valid
              *
              * We separate them with a space in order to index each character
              * individually
              */
             $ret .= ' ' . $utf_char . ' ';
             continue;
         }
         if (isset($allow[$utf_char])) {
             /**
              * The char is explicitly allowed
              */
             $ret .= $utf_char;
             continue;
         }
         if (isset($conv[$utf_char])) {
             /**
              * The char is mapped to something, maybe to itself actually
              */
             $ret .= $conv[$utf_char];
             continue;
         }
         /**
          * The char isn't mapped, but did we load its conversion table?
          *
          * The search indexer table is split into blocks. The block number of
          * each char is equal to its codepoint right-shifted for 11 bits. It
          * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
          * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
          * all UTF chars encoded in 2 bytes are in the same first block.
          */
         if (isset($utf_char[3])) {
             /**
              * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
              * 0000 0111 0011 1111 0010 0000
              */
             $idx = ((ord($utf_char[0]) & 0x7) << 7) | ((ord($utf_char[1]) & 0x3f) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
         } else if (isset($utf_char[2])) {
             /**
              * 1110 nnnn 10nx xxxx 10xx xxxx
              * 0000 0111 0010 0000
              *
              * NOTE(review): the layout above shows four "n" bits in the lead
              * byte while the mask keeps three. Left untouched because the
              * numbering of the data files is not visible from here - confirm.
              */
             $idx = ((ord($utf_char[0]) & 0x7) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
         } else {
             /**
              * 110x xxxx 10xx xxxx
              * 0000 0000 0000 0000
              */
             $idx = 0;
         }
         /**
          * Check if the required conv table has been loaded already
          */
         if (!isset($conv_loaded[$idx])) {
             $conv_loaded[$idx] = 1;
             $file = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;
             if (file_exists($file)) {
                 $conv += (include $file);
             }
         }
         if (isset($conv[$utf_char])) {
             $ret .= $conv[$utf_char];
         } else {
             /**
              * We add an entry to the conversion table so that we
              * don't have to convert to codepoint and perform the checks
              * that are above this block
              */
             $conv[$utf_char] = ' ';
             $ret .= ' ';
         }
     }
     return $ret;
 }
Beispiel #3
0
 /**
  * Normalize a string, or the strings held in an array, to NFC
  * (Normalization Form Composition).
  *
  * The utf_normalizer class is included on demand, using the global
  * $src_root_path and $phpEx, when it has not been declared yet. Arrays are
  * walked at most two levels deep: top-level values and the values of directly
  * nested arrays are each handed to utf_normalizer::nfc(). The normalizer's
  * return value is ignored, so it presumably rewrites its argument in place
  * (by reference) - confirm against utf_normalizer.
  *
  * @param	mixed	$strings	a string or an array of strings to normalize
  * @return	mixed				the normalized content, preserving array keys if array given;
  *							"empty" input (as decided by empty()) is returned untouched.
  */
 function utf8_normalize_nfc($strings)
 {
     // Nothing worth normalizing: hand the value straight back
     if (empty($strings)) {
         return $strings;
     }
     // Pull in the normalizer only if nobody has declared it so far
     if (!class_exists('utf_normalizer')) {
         global $src_root_path, $phpEx;
         include $src_root_path . 'includes/utf/utf_normalizer.' . $phpEx;
     }
     // Scalar case
     if (!is_array($strings)) {
         utf_normalizer::nfc($strings);
         return $strings;
     }
     // Array case: address every element through its key so the normalizer
     // receives the element stored in $strings itself
     foreach (array_keys($strings) as $outer_key) {
         if (!is_array($strings[$outer_key])) {
             utf_normalizer::nfc($strings[$outer_key]);
             continue;
         }
         foreach (array_keys($strings[$outer_key]) as $inner_key) {
             utf_normalizer::nfc($strings[$outer_key][$inner_key]);
         }
     }
     return $strings;
 }