/**
 * Runs a multibyte regex search over a string and prints one line per match.
 *
 * Both the subject and the pattern are converted from $in_enc to $test_enc,
 * and $test_enc is installed as the current mbregex encoding before searching.
 * For every match a line of the form "(encoding) (position) groups" is
 * printed, where position is the value reported by mb_ereg_search_getpos()
 * after the match and groups are the captured sub-expressions (the whole
 * match is dropped) joined with '-' and converted back to $in_enc.
 *
 * @param string $test_enc Encoding to run the search in
 * @param string $str      Subject string, encoded in $in_enc
 * @param string $look_for Pattern, encoded in $in_enc
 * @param string $opt      Option string handed to mb_ereg_search_init()
 * @param string $in_enc   Encoding of the supplied strings and of the output
 * @return void
 */
function test_search($test_enc, $str, $look_for, $opt, $in_enc = 'EUC-JP')
{
    mb_regex_encoding($test_enc);

    $subject = mb_convert_encoding($str, $test_enc, $in_enc);
    $pattern = mb_convert_encoding($look_for, $test_enc, $in_enc);
    mb_ereg_search_init($subject, $pattern, $opt);

    while (true) {
        $hit = mb_ereg_search_pos();
        if (!$hit) {
            // No further match: the search is finished.
            break;
        }

        $groups = mb_ereg_search_getregs();
        // Element 0 is the whole match; only the sub-expressions are reported.
        array_shift($groups);

        if (is_array($groups)) {
            $joined = implode('-', $groups);
        } else {
            $joined = '';
        }

        $cursor = mb_ereg_search_getpos();
        $shown = mb_convert_encoding($joined, $in_enc, $test_enc);

        echo sprintf("(%s) (%d) %s\n", $test_enc, $cursor, $shown);
    }
}
/**
 * Regular expression split and return all parts.
 *
 * The mb_ereg_search_*() functions report and accept positions as byte
 * offsets, so every length and slice below is computed in bytes (the '8bit'
 * encoding). Mixing those offsets with character counts would cut multi-byte
 * subjects at the wrong places.
 *
 * @param string $pattern Pattern
 * @param string $subject Subject
 * @param string $option  Option
 * @param int    $limit   Maximum number of parts to return; 1 returns the
 *                        subject unsplit, values below 1 mean no limit
 * @return string[] Array of split parts, array with original string otherwise
 * @throws MbRegexException When compilation error occurs
 * @link http://php.net/function.mb-split.php
 */
public static function split($pattern, $subject, $option = '', $limit = -1)
{
    static::setUp($pattern);

    $limit = (int) $limit;
    $res = array();
    $position = 0;      // byte offset just past the most recent match
    $lastPosition = 0;  // byte offset where the next part starts
    $subjectLen = \mb_strlen($subject, '8bit');

    // A limit of 1 asks for a single part, i.e. no splitting at all.
    if ($limit !== 1) {
        // The subject, pattern and options never change, so register them once.
        \mb_ereg_search_init($subject, $pattern, $option);

        do {
            \mb_ereg_search_setpos($position);
            $matches = \mb_ereg_search_regs();
            if ($matches === false) {
                break;
            }

            $position = \mb_ereg_search_getpos();
            if ($position <= $lastPosition) {
                // Zero-length match at the cursor: the search cannot advance,
                // so stop instead of emitting empty parts forever.
                break;
            }

            $matchStart = $position - \mb_strlen($matches[0], '8bit');
            $res[] = \mb_substr($subject, $lastPosition, $matchStart - $lastPosition, '8bit');
            $lastPosition = $position;
        } while ($position < $subjectLen && --$limit !== 1);
    }

    // Whatever follows the last delimiter (possibly an empty string, or the
    // whole subject when nothing matched) is the final part.
    $res[] = \mb_substr($subject, $lastPosition, $subjectLen - $lastPosition, '8bit');

    static::tearDown();

    return $res;
}
// --- mb_ereg_replace(): literal pattern, integer replacement value ---
$num = 4;
$str = mb_ereg_replace('four', $num, 'This string has four words.');
var_dump($str);

// --- mb_ereg_replace(): POSIX bracket classes and the \0 back-reference ---
$test = mb_ereg_replace(
    '[[:alpha:]]+://[^<>[:space:]]+[[:alnum:]/]',
    '<a href="\\0">\\0</a>',
    'http://test.com/test'
);
var_dump($test);

// --- mb_ereg_search(): first \w+ match in a UTF-8 string, strict compare ---
$str = 'PrÜÝ' . 'fung abc pÜ';
$reg = '\\w+';
mb_regex_encoding('UTF-8');
mb_ereg_search_init($str, $reg);
mb_ereg_search();
$r = mb_ereg_search_getregs(); // registers of the first match
var_dump(array('PrÜÝ' . 'fung') === $r);
// Search position after the first match.
var_dump(mb_ereg_search_getpos());

// --- Same search again, loose comparison of the registers ---
$str = 'PrÜÝ' . 'fung abc pÜ';
$reg = '\\w+';
mb_regex_encoding('UTF-8');
mb_ereg_search_init($str, $reg);
mb_ereg_search();
$r = mb_ereg_search_getregs(); // registers of the first match
var_dump(array('PrÜÝ' . 'fung') == $r);

// --- Same search a third time; the result is not inspected in this section ---
$str = 'PrÜÝ' . 'fung abc pÜ';
$reg = '\\w+';
mb_regex_encoding('UTF-8');
mb_ereg_search_init($str, $reg);
mb_ereg_search();
$r = mb_ereg_search_getregs(); // registers of the first match
/**
 * Break a search text into an ordered list of tokens.
 *
 * The text is scanned from left to right with a single alternation built from
 * four entries of $this->_regex_patterns; every successive match becomes one
 * token. Scanning stops at the end of the text or at the first position from
 * which nothing matches any more.
 *
 * @access private
 * @param string $text 'UTF-8' encoded search text
 * @return array array of search text token
 */
function _split_to_tokens($text)
{
    // Token alternatives, tried in this order:
    //   1. double quoted phrase
    //   2. single byte word contains html entities and latin1 letters
    //   3. multi byte word
    //   4. symbol - !#$%&'()*+,-./:;<=>?@[\]~_`{|}~ and latin1 supplement symbol
    $alternatives = array(
        $this->_regex_patterns['phrase'],
        $this->_regex_patterns['sbword'],
        $this->_regex_patterns['mbword'],
        $this->_regex_patterns['symbol'],
    );
    $pattern = implode('|', $alternatives);

    mb_ereg_search_init($text, $pattern);

    // Search positions are byte offsets, hence the byte length as the bound.
    $byteLength = strlen($text);
    $found = array();
    $offset = 0;

    while ($offset < $byteLength) {
        mb_ereg_search_setpos($offset);
        $match = mb_ereg_search_regs();
        if ($match === false) {
            break;
        }

        // The whole match is stored exactly as returned; no re-encoding here.
        $found[] = $match[0];
        $offset = mb_ereg_search_getpos();
    }

    return $found;
}