/**
 * UTF-8 aware replacement for strpos() (and, with $ci, stripos()).
 *
 * Finds the first occurrence of $sub in $str and returns its position
 * counted in characters, not bytes.
 *
 * A negative $offset is counted from the end of the string; this requires
 * measuring $str with utf8_length() before the search can start.
 *
 * The result may be the integer 0 (match at the very beginning), which
 * evaluates to false in a boolean context. Always compare the return value
 * with === false.
 *
 * @param  string   $str    The string to search in
 * @param  string   $sub    The string to search for
 * @param  int      $offset Character position at which to begin the search
 * @param  bool     $ci     Should the search be case-insensitive?
 * @return int|bool         The position of the match, or false if $sub is
 *                          not found, $offset lies beyond the end of $str,
 *                          or $sub is empty (an error is triggered as well)
 */
function utf8_search($str, $sub, $offset = 0, $ci = false)
{
    $start   = 0;       // Candidate match position within $str
    $matched = 0;       // Number of $sub characters matched at $start

    $needle = array();  // Per $sub character: list of acceptable code points
    $window = array();  // Code points of $str read ahead, keyed by position

    // Cursors and output slot handed to utf8_get_char() by reference
    $subCursor = null;
    $strCursor = null;
    $codepoint = null;

    if ($offset < 0) {
        $offset += utf8_length($str);
        $offset = $offset < 0 ? 0 : $offset;
    }

    // Skip the characters in front of the requested offset
    for (; $start < $offset; ++$start) {
        if (utf8_get_char($str, $strCursor) === false) {
            return false;
        }
    }

    for (;;) {
        // Decode the next character of $sub on demand
        if (!isset($needle[$matched])) {
            if (utf8_get_char($sub, $subCursor, $codepoint) === false) {
                if ($matched) {
                    // $sub is exhausted and all of it matched
                    return $start;
                }

                trigger_error('utf8_search: The string to search for is empty');
                return false;
            }

            $alternatives = array($codepoint);
            if ($ci) {
                $swapped = unicode_swapcase($codepoint);
                if ($swapped != $codepoint) {
                    $alternatives[] = $swapped;
                }
            }
            $needle[] = $alternatives;
        }

        // Decode the character of $str to compare against, unless it is
        // still buffered from an earlier, failed attempt
        $index = $start + $matched;
        if (!isset($window[$index])) {
            if (utf8_get_char($str, $strCursor, $codepoint) === false) {
                return false;
            }
            $window[$index] = $codepoint;
        }

        if (in_array($window[$index], $needle[$matched])) {
            ++$matched;
            continue;
        }

        // Mismatch: drop the first buffered character and retry one
        // position further to the right
        unset($window[$start]);
        ++$start;
        $matched = 0;
    }
}
/**
 * Iterates over $str with utf8_get_char() and checks that the characters
 * and code points it yields match the expected lists.
 *
 * @dataProvider providerUtf8GetChar
 *
 * @param string $str      The UTF-8 encoded input string
 * @param array  $rchars   The characters expected, in order
 * @param array  $runicode The code points expected, in order
 */
public function testUtf8GetChar($str, $rchars, $runicode)
{
    $chars = $unicode = array();

    // $i and $cp are filled in by utf8_get_char() through its reference
    // parameters; the loop ends once it reports false.
    while (($char = utf8_get_char($str, $i, $cp)) !== false) {
        $chars[] = $char;
        $unicode[] = $cp;
    }

    // PHPUnit expects (expected, actual); with the arguments the other way
    // round a failure report labels the two values incorrectly.
    $this->assertEquals($rchars, $chars);
    $this->assertEquals($runicode, $unicode);
}
/**
 * UTF-8 aware replacement for substr(), addressed by positions.
 *
 * Returns the characters of $str from position $start up to and including
 * position $end. Positions are counted in characters. A negative position is
 * counted from the end of the string and is clamped to 0 if it would fall
 * before the beginning. Without $end the slice runs to the end of the string.
 *
 * @param  string   $str   The UTF-8 encoded string
 * @param  int      $start The start position
 * @param  int|null $end   The end position (inclusive), or null for
 *                         "until the end of the string"
 * @return string          The portion of the string; an empty string if
 *                         $start lies behind $end
 */
function utf8_slice($str, $start, $end = null)
{
    $bounded = ($end !== null);

    $start = (int) $start;
    if ($bounded) {
        $end = (int) $end;
    }

    // The length is only needed to resolve negative positions
    if ($start < 0 || ($bounded && $end < 0)) {
        $length = utf8_length($str);

        if ($start < 0) {
            $start += $length;
            if ($start < 0) {
                $start = 0;
            }
        }

        if ($bounded && $end < 0) {
            $end += $length;
            if ($end < 0) {
                $end = 0;
            }
        }
    }

    if (!$bounded) {
        // No position ever equals -1, so the loop below runs to the end
        $end = -1;
    } elseif ($start > $end) {
        return '';
    }

    $result = '';
    $position = 0;
    $cursor = null;

    while (($char = utf8_get_char($str, $cursor)) !== false) {
        if ($position >= $start) {
            $result .= $char;
        }

        if ($position == $end) {
            break;
        }

        ++$position;
    }

    return $result;
}
/**
 * UTF-8 aware replacement for trim().
 *
 * Strip whitespace (or other characters) from the beginning and/or end of
 * a string.
 *
 * @param  string $str        The UTF-8 encoded string
 * @param  mixed  $stripchars The characters to strip: null to use the
 *                            characters built from the global
 *                            $unicode_separators_array, an array whose
 *                            elements each contribute their first character,
 *                            or a string that is split into single characters
 * @param  int    $striptype  Bit mask of UTF8_STRIP_LEFT and
 *                            UTF8_STRIP_RIGHT; defaults to UTF8_STRIP_BOTH
 * @return string             The stripped string
 */
function utf8_strip($str, $stripchars = null, $striptype = UTF8_STRIP_BOTH)
{
    static $defaults;
    global $unicode_separators_array;

    if ($stripchars === null) {
        // Build the default character list once and reuse it on later calls
        if ($defaults === null) {
            foreach ($unicode_separators_array as $cp) {
                $defaults[] = utf8_chr($cp);
            }
        }
        $stripchars = $defaults;
    } elseif (is_array($stripchars)) {
        // Keep only the first character of every non-empty element
        $chars = array();
        foreach ($stripchars as $char) {
            if (($char = utf8_get_char($char)) !== false) {
                $chars[] = $char;
            }
        }
        $stripchars = $chars;
    } else {
        $stripchars = utf8_split($stripchars, 1);
    }

    $left = $striptype & UTF8_STRIP_LEFT;
    $right = $striptype & UTF8_STRIP_RIGHT;
    $rv = $buffer = '';

    // Compare against false explicitly: the character "0" is falsy, so a
    // plain truthiness test would end the loop at the first "0" and
    // silently truncate the result.
    while (($char = utf8_get_char($str, $i)) !== false) {
        $state = in_array($char, $stripchars);

        if ($left) {
            if ($state) {
                // Still inside the leading run of strippable characters
                continue;
            } else {
                $left = false;
            }
        }

        if ($right) {
            if ($state) {
                // Possibly trailing: hold back until a keeper follows
                $buffer .= $char;
                continue;
            } else {
                // The held-back characters were interior after all
                $rv .= $buffer;
                $buffer = '';
            }
        }

        $rv .= $char;
    }

    // Whatever is still buffered is the trailing run and gets dropped
    return $rv;
}
/**
 * Capitalizes every word of a UTF-8 string.
 *
 * The first character after a separator (and the very first character of
 * the string) is converted to uppercase; all other non-separator characters
 * are converted to lowercase. Separators are copied through unchanged.
 *
 * @param  string $str The UTF-8 encoded string
 * @return string      The string with the first character of each word in
 *                     uppercase and the remainder in lowercase
 */
function utf8_capwords($str)
{
    $result = '';
    $atWordStart = true;

    // Cursor and code point are maintained by utf8_get_char() by reference
    $cursor = null;
    $codepoint = null;

    while (($char = utf8_get_char($str, $cursor, $codepoint)) !== false) {
        $isSeparator = unicode_is_separator($codepoint);

        if (!$isSeparator) {
            if ($atWordStart) {
                $mapped = unicode_upcase($codepoint);
            } else {
                $mapped = unicode_downcase($codepoint);
            }

            // Only re-encode when the case mapping changed something
            if ($mapped != $codepoint) {
                $char = utf8_chr($mapped);
            }
        }

        // The character following a separator starts a new word
        $atWordStart = $isSeparator;
        $result .= $char;
    }

    return $result;
}