Ejemplo n.º 1
0
/**
 * Checks a string for invalid UTF-8 and optionally strips it. Ported from WordPress.
 *
 * The input is cast to a string first. An empty string is returned as is.
 * When yourls_supports_pcre_u() reports that PCRE has no UTF-8 support, the
 * string cannot be validated and is returned unchecked.
 *
 * @since 1.6
 *
 * @param string $string The text which is to be checked.
 * @param bool   $strip  Optional. Whether to attempt to strip out invalid UTF-8. Default is false.
 * @return string The text itself when it is valid UTF-8 (or cannot be checked);
 *                otherwise the stripped text when $strip is true and iconv
 *                managed to clean it, or an empty string.
 */
function yourls_check_invalid_utf8($string, $strip = false)
{
    $string = (string) $string;
    if ('' === $string) {
        return '';
    }

    // We can't demand UTF-8 support in the PCRE installation, so just return the string in those cases.
    if (!yourls_supports_pcre_u()) {
        return $string;
    }

    // With the /u modifier preg_match() fails (returns false) on invalid UTF-8,
    // so a result of exactly 1 means the whole subject is well-formed.
    if (1 === @preg_match('/^./us', $string)) {
        return $string;
    }

    // Attempt to strip the bad chars if requested (not recommended).
    if ($strip && function_exists('iconv')) {
        // "//IGNORE" is what makes iconv drop illegal sequences. Without it
        // iconv raises a notice and returns false instead of stripping.
        // The notice is suppressed on purpose: this is a best-effort cleanup
        // and some iconv implementations do not support "//IGNORE".
        $stripped = @iconv('UTF-8', 'UTF-8//IGNORE', $string);
        if (is_string($stripped)) {
            return $stripped;
        }
    }

    // Invalid and not (successfully) stripped: always honour the string return type.
    return '';
}
Ejemplo n.º 2
0
/**
 * Counts the characters of a string, with a UTF-8 aware fallback for mb_strlen().
 *
 * For any encoding other than UTF-8 the byte length from strlen() is returned.
 * For UTF-8 the characters are counted with PCRE's /u mode when
 * yourls_supports_pcre_u() reports it is available, and otherwise with a
 * byte-level regex that recognises well-formed UTF-8 sequences.
 *
 * @param string      $str      The string to measure.
 * @param string|null $encoding Optional. Character encoding. Null means 'UTF-8'. Default null.
 * @return int Number of characters for UTF-8 input, number of bytes for any other encoding.
 */
function yourls_mb_strlen($str, $encoding = null)
{
    if (null === $encoding) {
        $encoding = 'UTF-8';
    }

    // The solution below works only for UTF-8,
    // so in case of a different charset just use built-in strlen()
    if (!in_array($encoding, array('utf8', 'utf-8', 'UTF8', 'UTF-8'), true)) {
        return strlen($str);
    }

    if (yourls_supports_pcre_u()) {
        // Use the regex unicode support to separate the UTF-8 characters into an array
        preg_match_all('/./us', $str, $match);
        return count($match[0]);
    }

    $regex = '/(?:
          [\\x00-\\x7F]                  # single-byte sequences   0xxxxxxx
        | [\\xC2-\\xDF][\\x80-\\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
        | \\xE0[\\xA0-\\xBF][\\x80-\\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
        | [\\xE1-\\xEC][\\x80-\\xBF]{2}
        | \\xED[\\x80-\\x9F][\\x80-\\xBF]
        | [\\xEE-\\xEF][\\x80-\\xBF]{2}
        | \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
        | [\\xF1-\\xF3][\\x80-\\xBF]{3}
        | \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2}
    )/x';

    // Split in rounds of at most 1000 pieces so the pieces array stays small.
    $limit = 1000;
    $count = 0;

    do {
        // Every matched character acts as a delimiter; when the limit is hit,
        // the last array element contains the rest of the string.
        $pieces = preg_split($regex, $str, $limit);
        if (false === $pieces) {
            // PCRE error: stop and report what has been counted so far.
            break;
        }

        // Fencepost: N delimiters (characters) produce N + 1 pieces.
        $count += count($pieces) - 1;

        // Only a split that hit the limit can have left characters uncounted.
        $hit_limit = ($limit === count($pieces));
        $str = array_pop($pieces);

        // Compare the remainder against '' explicitly: a remainder of "0" is
        // falsy but still has to be counted, and a remainder the regex cannot
        // match (invalid bytes) must not be re-split forever.
    } while ($hit_limit && '' !== $str);

    return $count;
}