/**
 * Checks for invalid UTF8 in a string. Borrowed from WordPress.
 *
 * The result is always a string:
 *  - an empty string for empty input;
 *  - the input unchanged when it validates as UTF8, or when the PCRE
 *    installation cannot validate UTF8 at all;
 *  - the iconv() conversion result when $strip is set and the conversion succeeds;
 *  - an empty string in every other case.
 *
 * @since 1.6
 *
 * @param string  $string The text which is to be checked.
 * @param boolean $strip  Optional. Whether to attempt to strip out invalid UTF8. Default is false.
 * @return string The checked text.
 */
function yourls_check_invalid_utf8($string, $strip = false) {
    $string = (string) $string;

    if (0 === strlen($string)) {
        return '';
    }

    // We can't demand utf8 in the PCRE installation, so just return the string in those cases
    if (!yourls_supports_pcre_u()) {
        return $string;
    }

    // preg_match fails when it encounters invalid UTF8 in $string
    if (1 === @preg_match('/^./us', $string)) {
        return $string;
    }

    // Attempt to strip the bad chars if requested (not recommended)
    if ($strip && function_exists('iconv')) {
        // iconv() returns false and raises a notice when it cannot convert the input.
        // The notice is expected here (we already know the input is invalid), so it is
        // silenced, and a failed conversion falls through to the empty-string result
        // instead of leaking a boolean to callers that expect a string.
        $converted = @iconv('utf-8', 'utf-8', $string);
        if (false !== $converted) {
            return $converted;
        }
    }

    return '';
}
/**
 * Counts the characters of a string, treating it as UTF-8 by default.
 *
 * For any encoding other than UTF-8 the byte length from strlen() is returned.
 * For UTF-8, characters are counted with the PCRE unicode modifier when it is
 * available, and otherwise with a byte-level regex describing well-formed
 * UTF-8 sequences (bytes that are not part of such a sequence are not counted).
 *
 * @param string      $str      The string to measure.
 * @param string|null $encoding Optional. Character encoding. Default is null, meaning 'UTF-8'.
 * @return int Number of characters (or bytes for non UTF-8 encodings).
 */
function yourls_mb_strlen($str, $encoding = null) {
    if (null === $encoding) {
        $encoding = 'UTF-8';
    }

    // The solution below works only for UTF-8,
    // so in case of a different charset just use built-in strlen()
    if (!in_array($encoding, array('utf8', 'utf-8', 'UTF8', 'UTF-8'))) {
        return strlen($str);
    }

    if (yourls_supports_pcre_u()) {
        // Use the regex unicode support to separate the UTF-8 characters into an array
        preg_match_all('/./us', $str, $match);
        return count($match[0]);
    }

    $regex = '/(?:
          [\\x00-\\x7F]                    # single-byte sequences   0xxxxxxx
        | [\\xC2-\\xDF][\\x80-\\xBF]         # double-byte sequences   110xxxxx 10xxxxxx
        | \\xE0[\\xA0-\\xBF][\\x80-\\xBF]     # triple-byte sequences   1110xxxx 10xxxxxx * 2
        | [\\xE1-\\xEC][\\x80-\\xBF]{2}
        | \\xED[\\x80-\\x9F][\\x80-\\xBF]
        | [\\xEE-\\xEF][\\x80-\\xBF]{2}
        | \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2}  # four-byte sequences     11110xxx 10xxxxxx * 3
        | [\\xF1-\\xF3][\\x80-\\xBF]{3}
        | \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2}
    )/x';

    // Split by UTF-8 character in chunks, so that at most $limit pieces are held in memory at once.
    $limit = 1000;
    $count = 0;

    do {
        // With a limit, the last array element contains the not yet processed rest of the string
        $pieces = preg_split($regex, $str, $limit);
        if (!is_array($pieces)) {
            // preg_split() failed: keep what has been counted so far
            break;
        }

        // Fencepost: N pieces are separated by N - 1 matched characters
        $matched = count($pieces) - 1;
        $count  += $matched;

        $str = array_pop($pieces);

        // Go on only if the limit was hit (otherwise the whole string has been consumed) and
        // something is left over. Comparing against '' rather than relying on truthiness keeps
        // a remainder of "0" from being dropped, and requiring a full chunk guarantees
        // termination when the remainder holds only bytes the regex cannot match.
    } while ($matched === $limit - 1 && '' !== $str);

    return $count;
}