/**
 * A wrapper function for the normalizer which takes care of including the class if required
 * and returns the passed strings in NFC (Normalization Form Composition).
 *
 * Arrays are walked recursively, so nested arrays of any depth are supported.
 * Empty input (as defined by empty()) is returned untouched without loading the normalizer.
 *
 * utf_normalizer::nfc() is called for its effect on the variable passed to it; its return
 * value is not used here.
 *
 * @param mixed $strings a string or a (possibly nested) array of strings to normalize
 * @return mixed the normalized content, preserving array keys if array given.
 */
function utf8_normalize_nfc($strings)
{
    if (empty($strings)) {
        return $strings;
    }

    // Load the normalizer lazily, only when there is something to normalize
    if (!class_exists('utf_normalizer')) {
        include(IP_ROOT_PATH . 'includes/utf/utf_normalizer.' . PHP_EXT);
    }

    if (!is_array($strings)) {
        utf_normalizer::nfc($strings);

        return $strings;
    }

    foreach ($strings as $key => $value) {
        if (is_array($value)) {
            // Recurse so that arrays nested deeper than two levels are never handed to nfc() directly
            $strings[$key] = utf8_normalize_nfc($value);
        } else {
            // Pass the array slot itself so the normalizer updates it in place
            utf_normalizer::nfc($strings[$key]);
        }
    }

    return $strings;
}
/**
 * Clean up a text to remove non-alphanumeric characters
 *
 * This function receives a string, converts it to UTF-8 when another encoding is given,
 * decodes NCRs and HTML entities, normalizes it to NFC, lowercases ASCII letters and
 * replaces every character that is neither alphanumeric nor explicitly allowed with a
 * space. Hangul and CJK characters are kept and surrounded by spaces.
 *
 * Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
 *
 * @param string $text Text to clean, in $encoding (not normalized or sanitized)
 * @param string $allowed_chars String of special chars to allow
 * @param string $encoding Text encoding
 * @return string Cleaned up text, only alphanumeric chars are left
 *
 * @todo normalizer::cleanup being able to be used?
 */
function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
{
    global $phpbb_root_path, $phpEx;
    static $conv = array(), $conv_loaded = array();

    $allow = array();

    // Convert the text to UTF-8
    $encoding = strtolower($encoding);
    if ($encoding != 'utf-8') {
        $text = utf8_recode($text, $encoding);
    }

    /**
     * Length of a UTF-8 sequence, keyed by the high nibble of its lead byte.
     *
     * Written with hex escapes so the keys are single bytes regardless of the
     * encoding this source file is saved in.
     */
    $utf_len_mask = array(
        "\xC0" => 2,
        "\xD0" => 2,
        "\xE0" => 3,
        "\xF0" => 4,
    );

    /**
     * Replace HTML entities and NCRs
     */
    $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);

    /**
     * Normalize to NFC
     *
     * The utf_normalizer class is not loaded here; it must already be available.
     */
    utf_normalizer::nfc($text);

    /**
     * The first thing we do is:
     *
     * - convert ASCII-7 letters to lowercase
     * - remove the ASCII-7 non-alpha characters
     * - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
     *   0xC1 and 0xF5-0xFF
     *
     * @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
     */
    $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#\$%&'()*+,-./:;<=>?@[\\]^_`{|}~\v\f\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

    /**
     * strtr() ignores the surplus characters of the longer argument, so the
     * replacement string is padded with spaces to exactly the length of
     * $sb_match: the 26 letters map to lowercase, everything else to a space.
     */
    $sb_replace = 'istcpamelrdojbnhfgvwuqkyxz' . str_repeat(' ', strlen($sb_match) - 26);

    /**
     * This is the list of legal ASCII chars, it is automatically extended
     * with ASCII chars from $allowed_chars
     */
    $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';

    /**
     * Prepare an array containing the extra chars to allow
     */
    if (isset($allowed_chars[0])) {
        $pos = 0;
        $len = strlen($allowed_chars);

        do {
            $c = $allowed_chars[$pos];

            if ($c < "\x80") {
                /**
                 * ASCII char
                 */
                $sb_pos = strpos($sb_match, $c);

                if (is_int($sb_pos)) {
                    /**
                     * Remove the char from $sb_match and its corresponding
                     * replacement in $sb_replace
                     */
                    $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
                    $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
                    $legal_ascii .= $c;
                }

                ++$pos;
            } else {
                /**
                 * UTF-8 char; an unrecognised lead byte is consumed as a single
                 * byte so that $pos always advances
                 */
                $lead = $c & "\xF0";
                $utf_len = isset($utf_len_mask[$lead]) ? $utf_len_mask[$lead] : 1;

                $allow[substr($allowed_chars, $pos, $utf_len)] = 1;
                $pos += $utf_len;
            }
        } while ($pos < $len);
    }

    $text = strtr($text, $sb_match, $sb_replace);
    $ret = '';

    $pos = 0;
    $len = strlen($text);

    do {
        /**
         * Do all consecutive ASCII chars at once
         */
        if ($spn = strspn($text, $legal_ascii, $pos)) {
            $ret .= substr($text, $pos, $spn);
            $pos += $spn;
        }

        // The only exit of the loop: the whole text has been consumed
        if ($pos >= $len) {
            return $ret;
        }

        /**
         * Capture the UTF char
         *
         * A byte that is not a known multi-byte lead (stray control char,
         * continuation byte, ...) is taken as a one-byte sequence; it ends up
         * replaced by a space below unless a conversion table maps it.
         */
        $lead = $text[$pos] & "\xF0";
        $utf_len = isset($utf_len_mask[$lead]) ? $utf_len_mask[$lead] : 1;

        $utf_char = substr($text, $pos, $utf_len);
        $pos += $utf_len;

        if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
            || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
            || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST)) {
            /**
             * All characters within these ranges are valid
             *
             * We separate them with a space in order to index each character
             * individually
             */
            $ret .= ' ' . $utf_char . ' ';
            continue;
        }

        if (isset($allow[$utf_char])) {
            /**
             * The char is explicitly allowed
             */
            $ret .= $utf_char;
            continue;
        }

        if (isset($conv[$utf_char])) {
            /**
             * The char is mapped to something, maybe to itself actually
             */
            $ret .= $conv[$utf_char];
            continue;
        }

        /**
         * The char isn't mapped, but did we load its conversion table?
         *
         * The search indexer table is split into blocks. The block number of
         * each char is equal to its codepoint right-shifted for 11 bits. It
         * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
         * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
         * all UTF chars encoded in 2 bytes are in the same first block.
         */
        if (isset($utf_char[2])) {
            if (isset($utf_char[3])) {
                /**
                 * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
                 * 0000 0111 0011 1111 0010 0000
                 */
                $idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
            } else {
                /**
                 * 1110 nnnn 10nx xxxx 10xx xxxx
                 * 0000 1111 0010 0000
                 *
                 * All four payload bits of the lead byte are kept: together
                 * with the top bit of the second byte they form the 5 leftmost
                 * bits of the codepoint (codepoint >> 11).
                 * NOTE(review): assumes the data files are named after
                 * codepoint >> 11, as the comment above states - confirm.
                 */
                $idx = ((ord($utf_char[0]) & 0x0F) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
            }
        } else {
            /**
             * 110x xxxx 10xx xxxx
             * 0000 0000 0000 0000
             */
            $idx = 0;
        }

        /**
         * Check if the required conv table has been loaded already
         */
        if (!isset($conv_loaded[$idx])) {
            $conv_loaded[$idx] = 1;
            $file = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;

            if (file_exists($file)) {
                // Only merge a real table; "array + non-array" would be a fatal error
                $table = include $file;
                if (is_array($table)) {
                    $conv += $table;
                }
            }
        }

        if (isset($conv[$utf_char])) {
            $ret .= $conv[$utf_char];
        } else {
            /**
             * We add an entry to the conversion table so that we
             * don't have to convert to codepoint and perform the checks
             * that are above this block
             */
            $conv[$utf_char] = ' ';
            $ret .= ' ';
        }
    } while (true);
}
/**
 * A wrapper function for the normalizer which takes care of including the class if
 * required and returns the passed strings in NFC (Normalization Form Composition).
 *
 * Arrays are walked recursively, so nested arrays of any depth are supported.
 * Empty input (as defined by empty()) is returned untouched without loading the normalizer.
 *
 * utf_normalizer::nfc() is called for its effect on the variable passed to it; its return
 * value is not used here.
 *
 * @param mixed $strings a string or a (possibly nested) array of strings to normalize
 * @return mixed the normalized content, preserving array keys if array given.
 */
function utf8_normalize_nfc($strings)
{
    if (empty($strings)) {
        return $strings;
    }

    // Load the normalizer lazily, only when there is something to normalize
    if (!class_exists('utf_normalizer')) {
        global $src_root_path, $phpEx;
        include $src_root_path . 'includes/utf/utf_normalizer.' . $phpEx;
    }

    if (!is_array($strings)) {
        utf_normalizer::nfc($strings);

        return $strings;
    }

    foreach ($strings as $key => $value) {
        if (is_array($value)) {
            // Recurse so that arrays nested deeper than two levels are never handed to nfc() directly
            $strings[$key] = utf8_normalize_nfc($value);
        } else {
            // Pass the array slot itself so the normalizer updates it in place
            utf_normalizer::nfc($strings[$key]);
        }
    }

    return $strings;
}