/**
 * Render a code point as a UTF-8 character, masking control characters.
 *
 * The code point is converted with utf8_chr(). If the resulting string
 * contains a byte in 0x00-0x1F, the byte 0x7F, or the two-byte sequence
 * 0xC2 followed by 0x80-0xA0, the HTML-escaped placeholder "<control>"
 * is returned instead of the raw character.
 *
 * @param mixed $ascii Value handed to utf8_chr()
 * @return string The character, or the escaped "<control>" placeholder
 */
function print_utf8($ascii)
{
    $char = utf8_chr($ascii);

    $isControl = preg_match("#[\\x00-\\x1F]|\\x7F|(?:\\xC2[\\x80-\\xA0])#", $char);

    return $isControl ? htmlentities("<control>") : $char;
}
/**
 * Builds a UTF-8 string from a sequence of Unicode code points.
 *
 * Each element is converted with utf8_chr() and the results are joined
 * in iteration order.
 *
 * @param array $array array of unicode code points representing a string
 * @return string UTF-8 character string
 */
function utf8_from_unicode($array)
{
    $pieces = array();

    foreach ($array as $value) {
        $pieces[] = utf8_chr($value);
    }

    return implode('', $pieces);
}
/**
 * Replace named HTML entities in $content with their UTF-8 characters.
 *
 * The name => code point pairs are read from the global
 * $html_named_entities_mapping_mine. Every "&name;" occurrence is replaced
 * by utf8_chr(code point); the pairs are applied one after another in
 * mapping order. The content is modified in place.
 *
 * @param string $content Text to filter (passed by reference)
 * @return void
 */
function filter_named_entities(&$content)
{
    global $html_named_entities_mapping_mine;

    $entities = array();
    $characters = array();

    foreach ($html_named_entities_mapping_mine as $name => $value) {
        $entities[] = '&' . $name . ';';
        $characters[] = utf8_chr($value);
    }

    // With array arguments str_replace() applies each pair in turn to the
    // result of the previous one, just like a loop of single replacements.
    $content = str_replace($entities, $characters, $content);

    // Ugly hack
    $content = str_replace('í', 'i', $content);
}
/**
 * Checks that utf8_chr() returns false for every code point supplied by
 * the provider when called with true as its second argument.
 *
 * @dataProvider providerFailingUtf8Chr
 */
public function testFailingUtf8ChrReturnValue($cp)
{
    // The call is silenced with @ so only the return value is examined.
    $result = @utf8_chr($cp, true);

    $this->assertFalse($result);
}
/**
 * Callback for decode_ncr().
 *
 * The function ignores most (but not all) malformed NCRs.
 *
 * A captured reference starting with "x" or "X" is read as hexadecimal
 * (the prefix is dropped before hexdec()); anything else is passed to
 * utf8_chr() unchanged.
 *
 * @param array $m Match array; $m[1] holds the captured reference body
 * @return mixed Whatever utf8_chr() returns for the resulting value
 */
function utf8_from_ncr_callback($m)
{
    $reference = $m[1];

    if (strncasecmp($reference, 'x', 1) === 0) {
        // Hexadecimal form: strip the leading x/X and convert.
        $cp = hexdec(substr($reference, 1));
    } else {
        $cp = $reference;
    }

    return utf8_chr($cp);
}
/**
 * UTF-8 aware replacement for trim().
 *
 * Strip whitespace (or other characters) from the beginning and end of
 * a string.
 *
 * @param string $str        The UTF-8 encoded string
 * @param mixed  $stripchars The stripped characters: null to use the
 *                           characters built from the global
 *                           $unicode_separators_array, an array whose
 *                           entries are each reduced to one character via
 *                           utf8_get_char(), or a string that is split
 *                           with utf8_split()
 * @param int    $striptype  The optional argument $striptype can be
 *                           UTF8_STRIP_BOTH, UTF8_STRIP_LEFT, or UTF8_STRIP_RIGHT.
 *                           If $striptype is not specified it is assumed to be
 *                           UTF8_STRIP_BOTH.
 * @return string The stripped string
 */
function utf8_strip($str, $stripchars = null, $striptype = UTF8_STRIP_BOTH)
{
    static $defaults;
    global $unicode_separators_array;

    if ($stripchars === null) {
        if ($defaults === null) {
            // Start from an array so an empty separator list still yields
            // a valid (empty) set instead of leaving the cache at null.
            $defaults = array();
            foreach ($unicode_separators_array as $cp) {
                $defaults[] = utf8_chr($cp);
            }
        }
        $stripchars = $defaults;
    } elseif (is_array($stripchars)) {
        $chars = array();
        foreach ($stripchars as $char) {
            if (($char = utf8_get_char($char)) !== false) {
                $chars[] = $char;
            }
        }
        $stripchars = $chars;
    } else {
        $stripchars = utf8_split($stripchars, 1);
    }

    $left = $striptype & UTF8_STRIP_LEFT;
    $right = $striptype & UTF8_STRIP_RIGHT;
    $rv = $buffer = '';

    // Compare against false explicitly: a plain truthiness test would end
    // the loop at the first "0" character and truncate the result.
    // NOTE(review): assumes utf8_get_char() signals end of input with
    // false, as the other `!== false` checks on it suggest - confirm.
    while (($char = utf8_get_char($str, $i)) !== false) {
        // Strict comparison so a non-string entry (e.g. false) in the
        // strip set can never loosely match a real character such as "0".
        $state = in_array($char, $stripchars, true);

        if ($left) {
            if ($state) {
                // Still inside the leading run of strippable characters.
                continue;
            }
            $left = false;
        }

        if ($right) {
            if ($state) {
                // Hold back strippable characters; they are only emitted
                // if a non-strippable character follows them.
                $buffer .= $char;
                continue;
            }
            $rv .= $buffer;
            $buffer = '';
        }

        $rv .= $char;
    }

    // Whatever remains in $buffer is a trailing strippable run and is dropped.
    return $rv;
}
} } // do some tests for things that transform into something with the number one if (strpos($temp_hold, utf8_chr(0x31)) !== false) { // any kind of letter L? if (strpos($value[0], 'LETTER L') !== false || strpos($value[0], 'IOTA') !== false || strpos($value[0], 'SMALL L ') !== false || preg_match('/SMALL LIGATURE [^L]*L /', $value[0])) { // replace all of the mappings that transform some sort of letter l to number one instead to some sort of letter l to latin small letter l $temp_hold = str_replace(utf8_chr(0x31), utf8_chr(0x6c), $temp_hold); } } // uppercased chars that were folded do not exist in this universe, // no amount of normalization could ever "trick" this into not working if (in_array($value[1], $casefold_array[1])) { continue; } $uniarray[utf8_chr(hexdec((string) $value[1]))] = $temp_hold; } echo "Writing to confusables.{$phpEx}\n"; $fp = fopen($phpbb_root_path . 'includes/utf/data/confusables.' . $phpEx, 'wb'); fwrite($fp, '<?php return ' . my_var_export($uniarray) . ';'); fclose($fp); /** * Return a parsable string representation of a variable * * This is function is limited to array/strings/integers * * @param mixed $var Variable * @return string PHP code representing the variable */ function my_var_export($var) {
} else { if ($cp > 0x7ff) { return chr(0xe0 | $cp >> 12) . chr(0x80 | $cp >> 6 & 0x3f) . chr(0x80 | $cp & 0x3f); } else { if ($cp > 0x7f) { return chr(0xc0 | $cp >> 6) . chr(0x80 | $cp & 0x3f); } else { return chr($cp); } } } } preg_match_all('/^([0-9A-F]+); ([CFS]); ([0-9A-F]+(?: [0-9A-F]+)*);/im', $unidata, $array, PREG_SET_ORDER); $uniarray = array(); foreach ($array as $value) { $uniarray[$value[2]][utf8_chr(hexdec((string) $value[1]))] = implode(array_map('utf8_chr', array_map('hexdec', explode(' ', $value[3])))); } foreach ($uniarray as $idx => $contents) { echo "Writing to case_fold_{$idx}.{$phpEx}\n"; $fp = fopen($phpbb_root_path . 'includes/utf/data/case_fold_' . strtolower($idx) . '.' . $phpEx, 'wb'); fwrite($fp, '<?php return ' . my_var_export($contents) . ';'); fclose($fp); } /** * Return a parsable string representation of a variable * * This is function is limited to array/strings/integers * * @param mixed $var Variable * @return string PHP code representing the variable */
/**
 * Callback function for utf8_decode_entities.
 *
 * Interprets the first capture group as a hexadecimal number and converts
 * the resulting value with utf8_chr().
 *
 * @param array $matches Match array; $matches[1] holds the hex digits
 * @return string
 */
function utf8_hexchr_callback($matches)
{
    $codePoint = hexdec($matches[1]);

    return utf8_chr($codePoint);
}
/**
 * utf8_strrpos( )
 *
 * Find position of last occurrence of a char in a UTF-8 string
 * @since 1.3
 *
 * @param    string    $haystack    The string to search in
 * @param    string    $needle      The string to search for; a non-negative
 *                                  integer is converted with utf8_chr() first
 * @param    int       $offset      Number of char to ignore from start or end
 * @return   int|false The position of the last occurrence of needle, or
 *                     false when the fallback search finds nothing
 */
function utf8_strrpos($haystack, $needle, $offset = 0)
{
    if ((int) $needle === $needle && $needle >= 0) {
        $needle = utf8_chr($needle);
    }

    $needle = utf8_clean((string) $needle);
    $offset = (int) $offset;
    $haystack = utf8_clean($haystack);

    if (mbstring_loaded()) {
        // mb_strrpos returns wrong position if invalid characters are
        // found in $haystack before $needle
        return mb_strrpos($haystack, $needle, $offset, 'UTF-8');
    }

    if (iconv_loaded() && $offset === 0) {
        // iconv_strrpos is not tolerant to invalid characters
        // iconv_strrpos does not accept $offset
        return iconv_strrpos($haystack, $needle, 'UTF-8');
    }

    // Fallback: trim the haystack according to $offset, search byte-wise,
    // then convert the byte position into a character position.
    if ($offset > 0) {
        $haystack = utf8_substr($haystack, $offset);
    } elseif ($offset < 0) {
        $haystack = utf8_substr($haystack, 0, $offset);
    }

    $bytePos = strrpos($haystack, $needle);
    if ($bytePos === false) {
        return false;
    }

    $prefix = substr($haystack, 0, $bytePos);
    $skipped = $offset > 0 ? $offset : 0;

    return $skipped + utf8_strlen($prefix);
}
/**
 * Returns a string with the first character of each word converted to
 * uppercase and the remainder to lowercase.
 *
 * Characters are read one at a time with utf8_get_char(). A character
 * that follows a separator (or starts the string) is mapped through
 * unicode_upcase(); every other non-separator is mapped through
 * unicode_downcase(). Separators are copied unchanged.
 *
 * @param string $str The UTF-8 encoded string
 * @return string with the first character of each word converted to uppercase
 * and the remainder to lowercase
 */
function utf8_capwords($str)
{
    $result = '';
    $atWordStart = true;

    while (($char = utf8_get_char($str, $i, $cp)) !== false) {
        $isSeparator = unicode_is_separator($cp);

        if (!$isSeparator) {
            $mapped = $atWordStart ? unicode_upcase($cp) : unicode_downcase($cp);

            // Re-encode only when the case mapping changed the code point.
            if ($mapped != $cp) {
                $char = utf8_chr($mapped);
            }
        }

        // The next character starts a word only if this one is a separator.
        $atWordStart = $isSeparator;
        $result .= $char;
    }

    return $result;
}