/**
 * Split a string on a multibyte regular expression and return all parts.
 *
 * The search cursor exposed by mb_ereg_search_getpos() / mb_ereg_search_setpos()
 * is expressed in bytes, so every offset and length in this method is handled
 * with the byte-wise strlen()/substr() functions. Mixing those offsets with
 * mb_strlen()/mb_substr() (character units) cuts multibyte subjects at the
 * wrong place.
 *
 * setUp() and tearDown() are defined elsewhere in the class and are called
 * around the search exactly once each.
 *
 * @param string $pattern Pattern
 * @param string $subject Subject
 * @param string $option  Option string passed to mb_ereg_search_init()
 * @param int    $limit   Maximum number of parts to return; 1 returns the
 *                        subject unsplit, the default -1 means no limit
 * @return string[] Array of split parts, array with original string otherwise
 * @throws MbRegexException When compilation error occurs
 * @link http://php.net/function.mb-split.php
 */
public static function split($pattern, $subject, $option = '', $limit = -1)
{
    static::setUp($pattern);

    $position = 0;      // byte offset where the next search starts
    $lastPosition = 0;  // byte offset where the current part starts
    $res = array();
    $subjectLen = \strlen($subject);

    // Initialise once: the subject and pattern do not change between
    // iterations. Done even when $limit is 1 so the pattern is still handed
    // to the regex engine.
    \mb_ereg_search_init($subject, $pattern, $option);

    // A limit of 1 means "at most one part", i.e. no splitting at all.
    if ((int) $limit !== 1) {
        do {
            \mb_ereg_search_setpos($position);
            $matches = \mb_ereg_search_regs();
            if ($matches === false) {
                break;
            }

            // The cursor now sits just past the delimiter that was matched.
            $position = \mb_ereg_search_getpos();
            $matchStart = $position - \strlen($matches[0]);

            // Cast: substr() returns false instead of '' on older PHP versions.
            $res[] = (string) \substr($subject, $lastPosition, $matchStart - $lastPosition);
            $lastPosition = $position;
        } while ($position < $subjectLen && --$limit !== 1);
    }

    // Whatever follows the last delimiter (possibly the whole subject).
    if ($lastPosition <= $subjectLen) {
        $res[] = (string) \substr($subject, $lastPosition);
    }

    static::tearDown();

    return $res;
}
<?php
// Runs a \G-anchored whitespace search from a fixed offset of the subject
// and dumps whatever mb_ereg_search_regs() returns.

// Both the regex engine and the mbstring internals are set to UTF-8.
mb_regex_encoding('utf-8');
mb_internal_encoding('utf-8');

$haystack = "foo bar bà€œz baz";
$anchoredPattern = '\\G' . '\\s+';
$startOffset = 13;

mb_ereg_search_init($haystack, $anchoredPattern, 'msi');
mb_ereg_search_setpos($startOffset);

$result = mb_ereg_search_regs();
var_dump($result);
// NOTE(review): this first group relies on a subject/pattern registered by an
// mb_ereg_search_init() call that is not visible in this fragment.
$r = mb_ereg_search();
$r = mb_ereg_search_getregs(); // get first result
var_dump($r === array("PrÜÝ" . "fung"));
$r = mb_ereg_search_regs(); // get next result
var_dump($r);

// Search for words, then jump the search cursor forward with
// mb_ereg_search_setpos() before asking for the next match.
$str = "PrÜÝ" . "fung abc pÜ";
$reg = "\\w+";
mb_regex_encoding("UTF-8");
mb_ereg_search_init($str, $reg);
$r = mb_ereg_search();
$r = mb_ereg_search_getregs(); // get first result
var_dump($r === array("PrÜÝ" . "fung"));
var_dump(mb_ereg_search_setpos(15)); // dumps the return value of the cursor move
$r = mb_ereg_search_regs(); // get next result
var_dump($r == array("pÜ"));

// Initialise with the subject only; the pattern and options are supplied
// directly to mb_ereg_search_regs().
$str = "PrÜÝ" . "fung abc pÜ";
mb_regex_encoding("UTF-8");
mb_ereg_search_init($str);
$r = mb_ereg_search_regs("abc", "ms");
var_dump($r);

// Same set-up as the word search above; the fragment ends after the first
// match has been fetched.
$str = "PrÜÝ" . "fung abc pÜ";
$reg = "\\w+";
mb_regex_encoding("UTF-8");
mb_ereg_search_init($str, $reg);
$r = mb_ereg_search();
$r = mb_ereg_search_getregs(); // get first result
/**
 * Try to match $pattern exactly at a given offset of $this->text.
 *
 * The pattern is prefixed with \G so it can only match at the offset the
 * search starts from, not further along the text.
 *
 * @param string   $pattern  Regular expression, without the \G anchor
 * @param int|null $position Offset handed to mb_ereg_search_setpos();
 *                           defaults to $this->bytepos when null
 * @param string   $options  Option string for mb_ereg_search_init()
 * @return array|false Result of mb_ereg_search_regs(), or false when
 *                     $this->position has already reached $this->length
 */
protected function match($pattern, $position = null, $options = 'msi')
{ /*{{{*/
    // Nothing left to match once the current position has reached the length.
    if ($this->length <= $this->position) {
        return false;
    }

    $startAt = ($position === null) ? $this->bytepos : $position;

    mb_ereg_search_init($this->text, '\\G' . $pattern, $options);
    mb_ereg_search_setpos($startAt);

    return mb_ereg_search_regs();
}
/**
 * Pulls the content of <mtext> elements out of a MathML string and places it
 * as plain text around the remaining MathML.
 *
 * The formula is scanned tag by tag with the mb_ereg_search_* functions while
 * the string itself is cut and measured with the byte-wise substr()/strlen().
 * Whenever the formula is rewritten, the search is re-initialised on the new
 * string and the cursor is repositioned.
 *
 * @param string $formula MathML to process
 * @param bool   $encoded When true, tags are delimited by « and » instead of
 *                        < and >, and extracted text is passed through
 *                        wrsqz_mathmlDecode() (defined elsewhere)
 * @return string The rewritten formula; the input unchanged when it does not
 *                contain the substring 'mtext'
 */
function wrsqz_extractTextFromMathML($formula, $encoded=true){
    //Algorithm: we scan the MathML tag by tag.
    //If a tag is one of the allowed ones (math, mrow) we push it on the stack
    //and continue with the next.
    //If the tag is not allowed (mfenced, mfrac,...) we skip all MathML until its
    //closing tag (</mfenced>, </mfrac>).
    //If the tag is <mtext> we rearrange the formula.
    //If a tag closes an allowed tag, we pop it from the stack.

    //quick return if there is nothing to do.
    if(strpos($formula,'mtext')===false) return $formula;
    //initializations
    $opentag = $encoded ? '«' : '<';
    $closetag = $encoded ? '»' : '>';
    //tags that an <mtext> can live inside.
    $allowedtags = array('math', 'mrow');
    $pattern = $opentag.'([^'.$opentag.$closetag.']*)'.$closetag; //regexp that matches a single tag label
    mb_ereg_search_init($formula, $pattern);
    $stack = array(); //stack of opened tags; each entry is array(tag name, full tag text)
    $omittedcontent=false; //is there math content before the current point?
    $lasttag=null; //name of the tag on top of the stack
    $length = strlen($formula);
    $beginformula = strpos($formula, $opentag); //position of the first character of the last formula (in bytes).
    $pos=array(0,0); //array(offset, length) of the most recently matched tag
    //CAUTION: if you change this function, be very careful with multibyte
    // and non-multibyte functions.
    while(($pos[0]+$pos[1])<$length){
        // NOTE(review): mb_ereg_search_pos() returns false when nothing matches;
        // that case is not checked here before $pos is indexed - confirm the
        // input always ends with a tag.
        $pos = mb_ereg_search_pos($pattern);
        if($pos[0]+$pos[1] < $length){
            //this is always true except on the last iteration
            mb_ereg_search_setpos($pos[0]+$pos[1]);
        }
        $tag = substr($formula, $pos[0],$pos[1]);
        //strip the opening and closing delimiter characters
        $trimmedTag = mb_substr($tag,1,-1);
        //skip self-closed tags
        if(mb_substr($trimmedTag,-1) == '/'){
            continue;
        }
        //discard attributes
        if(($spacepos = mb_strpos($trimmedTag,' '))!==false){
            $trimmedTag=mb_substr($trimmedTag,0,$spacepos);
        }
        if(in_array($trimmedTag,$allowedtags)){
            //allowed tag
            $stack[]=array($trimmedTag,$tag);
            $lasttag = $trimmedTag;
        }else if($trimmedTag == '/'.$lasttag){
            //close allowed tag
            array_pop($stack);
            //end() returns false on an empty stack, which leaves $lasttag as null
            $lasttag = end($stack);
            $lasttag = $lasttag[0];
            //discard empty formulas
            if(empty($stack) && !$omittedcontent){
                $formula1 = substr($formula, 0, $beginformula);
                if($pos[0]+$pos[1]<$length){
                    //this isn't the end.
                    $formula2 = substr($formula, $pos[0]+$pos[1]);
                    $formula = $formula1 . $formula2;
                    $length = strlen($formula);
                    mb_ereg_search_init($formula, $pattern);
                    mb_ereg_search_setpos($beginformula);
                }else{
                    //this is the last iteration. $length and the mb_ereg_search
                    //string and position will be wrong, but it doesn't matter.
                    $formula = $formula1;
                }
            }
        }else if($trimmedTag == 'mtext'){
            // NOTE(review): $pos2 is not checked for false (missing closing
            // mtext tag) before it is indexed below.
            $pos2 = mb_ereg_search_pos($opentag.'/mtext'.$closetag);
            $text = substr($formula, $pos[0]+$pos[1], $pos2[0]-($pos[0]+$pos[1]));
            //Decode some chars in text
            if($encoded) $text=wrsqz_mathmlDecode($text);
            // NOTE(review): the search and replacement literals in the next two
            // statements are identical as they appear here, and the second one is
            // not valid PHP as written. They look like entity names that were
            // decoded when this file was extracted - restore them from the
            // upstream source before relying on this code.
            $text = str_replace('·','·',$text);
            $text = str_replace(''',''',$text);
            $formula1 = substr($formula, 0, $pos[0]); //until <mtext>
            $formula2 = substr($formula, $pos2[0]+$pos2[1]); //from </mtext>
            if($omittedcontent){
                //we have a non-empty formula before the text so we must close it:
                //compute the tail (closing tags) of the formula before the text
                //and the head (opening tags) of the formula after the text.
                $copystack = $stack; //copy stack
                $tail1 = '';
                $head2 = '';
                while($stacktag = array_pop($copystack)){
                    $tail1.= $opentag.'/'.$stacktag[0].$closetag;
                    $head2 = $stacktag[1] . $head2;
                }
                $formula1 = $formula1 . $tail1;
                $formula2 = $head2 . $formula2;
                //update $formula
                $formula = $formula1 . $text . $formula2;
                $beginformula = $pos[0]+strlen($tail1)+strlen($text);
                $position = $beginformula+strlen($head2);
            }else{
                //we have an empty formula before the text so we must skip it.
                $head = substr($formula1, 0, $beginformula); //everything before the empty formula
                $formula1 = substr($formula1, $beginformula);
                $formula = $head . $text . $formula1 . $formula2;
                $beginformula += strlen($text);
                $position = $beginformula +strlen($formula1);
            }
            //update parameters with the new formula.
            $length = strlen($formula);
            $omittedcontent = false;
            mb_ereg_search_init($formula, $pattern);
            mb_ereg_search_setpos($position);
        }else{
            //not allowed tag: go to its closing tag and remember that we omitted content
            $pos = mb_ereg_search_pos($opentag.'/'.$trimmedTag.$closetag);
            if($pos === false){
                return $formula; //this is an error in XML (unclosed tag);
            }
            $omittedcontent=true;
            mb_ereg_search_setpos($pos[0]+$pos[1]);
        }
    }
    return $formula;
}
/**
 * Collect every match of a multibyte regular expression in a subject string.
 *
 * The search is initialised first and the start offset is applied afterwards:
 * mb_ereg_search_init() resets the search position, so setting the position
 * before it has no effect.
 *
 * @param string $pattern Regular expression handed to mb_ereg_search_init()
 * @param string $subject String to search
 * @param array  $matches Receives one entry per match, each entry being the
 *                        array returned by mb_ereg_search_regs(); left
 *                        untouched when the search cannot be initialised
 * @param string $option  Option string handed to mb_ereg_search_init()
 * @param int    $offset  Byte offset at which the search starts; a negative
 *                        value counts back from the end of $subject
 * @return bool True when at least one match was found; false when there is
 *              no match, the search cannot be initialised, or $offset lies
 *              outside $subject
 */
function mb_ereg_match_all($pattern, $subject, array &$matches, $option = 'msr', $offset = 0)
{
    if (!mb_ereg_search_init($subject, $pattern, $option)) {
        return false;
    }

    $matches = array();

    $offset = (int) $offset;
    if ($offset !== 0) {
        // The search position is a byte offset, so validate against strlen().
        $length = strlen($subject);
        if ($offset < 0) {
            $offset += $length;
        }
        if ($offset < 0 || $offset > $length) {
            return false;
        }
        mb_ereg_search_setpos($offset);
    }

    while ($r = mb_ereg_search_regs()) {
        $matches[] = $r;
    }

    return !empty($matches);
}
/**
 * Break a search text into tokens.
 *
 * The four patterns held in $this->_regex_patterns ('phrase', 'sbword',
 * 'mbword', 'symbol') are joined into one alternation, and the text is
 * scanned match by match until no further match is found or the search
 * position reaches the end of the text.
 *
 * @access private
 * @param string $text 'UTF-8' encoded search text
 * @return array array of search text token
 */
function _split_to_tokens($text)
{
    // Token alternatives, in the order they are tried:
    // 1. double quoted phrase
    // 2. single byte word contains html entities and latin1 letters
    // 3. multi byte word
    // 4. symbol - !#$%&'()*+,-./:;<=>?@[\]~_`{|}~ and latin1 supplement symbol
    $alternatives = array(
        $this->_regex_patterns['phrase'],
        $this->_regex_patterns['sbword'],
        $this->_regex_patterns['mbword'],
        $this->_regex_patterns['symbol'],
    );
    $pattern = implode('|', $alternatives);

    mb_ereg_search_init($text, $pattern);

    // The search position is compared against the byte length of the text.
    $byteLength = strlen($text);
    $tokens = array();
    $offset = 0;

    while ($offset < $byteLength) {
        mb_ereg_search_setpos($offset);
        $match = mb_ereg_search_regs();
        if ($match === false) {
            break;
        }

        // Only the full match is kept as the token.
        $tokens[] = $match[0];
        $offset = mb_ereg_search_getpos();
    }

    return $tokens;
}