/**
 * Given a string and one or more character indexes into it, expressed as
 * UTF-8 character positions, returns the byte index of each character.
 * Can be useful when you want to use PHP's native (byte-oriented) string
 * functions, but be warned: locating the byte can be expensive.
 *
 * Takes a variable number of parameters - the first must be the search
 * string, followed by 1 to n UTF-8 character positions to obtain byte
 * indexes for. It is more efficient to search the string for multiple
 * characters at once than to make repeated calls to this function.
 *
 * The offsets are sorted ascending before processing, so when several
 * offsets are passed the returned byte indexes are in ascending offset
 * order, not necessarily in the order the caller supplied them.
 *
 * @author Chris Smith<*****@*****.**>
 * @param string string to locate index in
 * @param int (n times)
 * @return mixed - int if exactly one byte index was computed, array
 *         otherwise; FALSE if the first argument is not a string
 * @package utf8
 * @subpackage position
 */
function utf8_byte_position()
{
    $args = func_get_args();

    // Plain assignment: array_shift() does not return by reference, so
    // binding its result with =& only triggers an
    // "Only variables should be assigned by reference" notice.
    $str = array_shift($args);

    if (!is_string($str)) {
        return false;
    }

    $result = array();

    // trivial byte index, character offset pair
    $prev = array(0, 0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    // (utf8_locate_next_chr is defined elsewhere; it is used here to
    // snap a byte index onto a UTF-8 character boundary)
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str
    // (utf8_decode maps every UTF-8 character to one byte, so strlen of
    // the decoded piece is its character count)
    $c = strlen(utf8_decode(substr($str, 0, $i)));

    // deal with arguments from lowest to highest
    sort($args);

    foreach ($args as $offset) {
        // sanity checks FIXME

        // 0 is an easy check
        if ($offset == 0) {
            $result[] = 0;
            continue;
        }

        // ensure no endless looping
        $safety_valve = 50;

        do {
            if ($c - $prev[1] == 0) {
                // Hack: gone past end of string
                // (no characters were covered since the previous sample,
                // so the slope below would divide by zero)
                $error = 0;
                $i = strlen($str);
                break;
            }

            // linear extrapolation from the last two (byte, char) samples
            $j = $i + (int) (($offset - $c) * ($i - $prev[0]) / ($c - $prev[1]));

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            // save the index, offset for use next iteration
            $prev = array($i, $c);

            if ($j > $i) {
                // determine new character offset
                $c += strlen(utf8_decode(substr($str, $i, $j - $i)));
            } else {
                // ditto
                $c -= strlen(utf8_decode(substr($str, $j, $i - $j)));
            }

            $error = abs($c - $offset);

            // ready for next time around
            $i = $j;

            // from 7 it is faster to iterate over the string
        } while ($error > 7 && --$safety_valve);

        // Close enough: walk the remaining characters one at a time.
        // If the safety valve ran out with a larger error, $i is stored
        // below as-is, without this correction.
        if ($error && $error <= 7) {
            if ($c < $offset) {
                // move up
                while ($error--) {
                    $i = utf8_locate_next_chr($str, ++$i);
                }
            } else {
                // move down
                while ($error--) {
                    $i = utf8_locate_current_chr($str, --$i);
                }
            }

            // ready for next arg
            $c = $offset;
        }

        $result[] = $i;
    }

    if (count($result) == 1) {
        return $result[0];
    }

    return $result;
}
/**
 * Bounds checking for the character-boundary locators: a negative byte
 * index is expected to yield 0, and an index far past the end of the
 * sample string is expected to yield 29, for both
 * utf8_locate_current_chr and utf8_locate_next_chr.
 */
function test_bounds()
{
    // bounds checking for utf8_locate_current_chr
    $tests = array();
    $tests[] = array('aaживπά우리をあöä', -2, 0);
    $tests[] = array('aaживπά우리をあöä', 128, 29);

    foreach ($tests as $test) {
        $this->assertIdentical(utf8_locate_current_chr($test[0], $test[1]), $test[2]);
    }

    // bounds checking for utf8_locate_next_chr; start from a fresh list
    // so the cases above are not carried over and asserted twice
    $tests = array();
    $tests[] = array('aaживπά우리をあöä', -2, 0);
    $tests[] = array('aaживπά우리をあöä', 128, 29);

    foreach ($tests as $test) {
        $this->assertIdentical(utf8_locate_next_chr($test[0], $test[1]), $test[2]);
    }
}