Beispiel #1
0
 /**
  * Split a string on a regular expression and return all parts.
  *
  * The mb_ereg_search_* functions report positions as byte offsets, so the
  * subject is measured and sliced with the byte-oriented strlen()/substr();
  * mixing those offsets with character-based mb_* functions would cut
  * multibyte subjects at the wrong place.
  *
  * @param string $pattern Pattern
  * @param string $subject Subject
  * @param string $option  Option string handed to mb_ereg_search_init()
  * @param int    $limit   Maximum number of parts; 1 returns the subject
  *                        unsplit, values below 1 (default -1) mean no limit
  * @return string[] Array of split parts, array with original string otherwise
  * @throws MbRegexException When compilation error occurs
  * @link http://php.net/function.mb-split.php
  */
 public static function split($pattern, $subject, $option = '', $limit = -1)
 {
     static::setUp($pattern);
     $position = 0;
     $lastPosition = 0;
     $res = array();
     // Byte length, to stay in the same unit as mb_ereg_search_getpos().
     $subjectLen = \strlen($subject);
     // A limit of exactly 1 asks for a single part, so no split may happen.
     // (The loop below only stops when the decremented limit hits 1, which
     // it would never do when starting from 1.)
     if ($limit !== 1) {
         do {
             \mb_ereg_search_init($subject, $pattern, $option);
             \mb_ereg_search_setpos($position);
             $matches = \mb_ereg_search_regs();
             if ($matches === false) {
                 break;
             }
             $nextPosition = \mb_ereg_search_getpos();
             if ($nextPosition <= $position) {
                 // The match did not move the search position forward (e.g. a
                 // zero-length match); repeating the search would never end.
                 break;
             }
             $position = $nextPosition;
             // The match ends at $position, so it starts strlen(match) bytes earlier;
             // everything between the previous match and that start is one part.
             $res[] = (string) \substr($subject, $lastPosition, $position - \strlen($matches[0]) - $lastPosition);
             $lastPosition = $position;
         } while ($position < $subjectLen && --$limit !== 1);
     }
     if ($lastPosition <= $subjectLen) {
         // Remainder after the last match; the cast keeps this a string on PHP
         // versions where substr() returns false at the very end of the subject.
         $res[] = (string) \substr($subject, $lastPosition);
     }
     static::tearDown();
     return $res;
 }
Beispiel #2
0
<?php

// Use UTF-8 for the regex functions and for mbstring in general.
mb_regex_encoding('utf-8');
mb_internal_encoding('utf-8');

$haystack = "foo bar bà€œz baz";
$whitespace = '\\s+';
$startAt = 13;

// The pattern is prefixed with \G before the search is set up.
mb_ereg_search_init($haystack, '\\G' . $whitespace, 'msi');

// Start the search at the chosen offset and dump whatever is matched there.
mb_ereg_search_setpos($startAt);
var_dump(mb_ereg_search_regs());
Beispiel #3
0
// NOTE(review): this excerpt starts mid-script; the mb_ereg_search_init() call
// that the next statements rely on is not visible here.
// Run a search, then fetch the groups of that last match separately.
$r = mb_ereg_search();
$r = mb_ereg_search_getregs();
// get first result
var_dump($r === array("PrÜÝ" . "fung"));
// Search again from where the previous match ended and fetch the groups directly.
$r = mb_ereg_search_regs();
// get next result
var_dump($r);

// --- Next snippet: first match, then jump to an explicit offset ---
$str = "PrÜÝ" . "fung abc pÜ";
$reg = "\\w+";
mb_regex_encoding("UTF-8");
mb_ereg_search_init($str, $reg);
$r = mb_ereg_search();
$r = mb_ereg_search_getregs();
// get first result
var_dump($r === array("PrÜÝ" . "fung"));
// Move the search start to offset 15 and dump whether that succeeded.
// NOTE(review): presumably the byte offset at which "pÜ" begins -- confirm.
var_dump(mb_ereg_search_setpos(15));
$r = mb_ereg_search_regs();
// get next result
var_dump($r == array("pÜ"));

// --- Next snippet: init without a pattern, pattern supplied at search time ---
$str = "PrÜÝ" . "fung abc pÜ";
mb_regex_encoding("UTF-8");
mb_ereg_search_init($str);
// Pattern "abc" and options "ms" are passed to the search call itself.
$r = mb_ereg_search_regs("abc", "ms");
var_dump($r);

// --- Next snippet: same setup as the second one ---
// NOTE(review): the excerpt is cut off after the first result is fetched.
$str = "PrÜÝ" . "fung abc pÜ";
$reg = "\\w+";
mb_regex_encoding("UTF-8");
mb_ereg_search_init($str, $reg);
$r = mb_ereg_search();
$r = mb_ereg_search_getregs();
// get first result
Beispiel #4
0
 /**
  * Try to match a pattern against $this->text at a given search position.
  *
  * @param string   $pattern  Pattern; it is prefixed with \G before searching
  * @param int|null $position Position handed to mb_ereg_search_setpos();
  *                           $this->bytepos is used when null
  * @param string   $options  Option string for mb_ereg_search_init()
  * @return array|false Result of mb_ereg_search_regs(), or false when
  *                     $this->position has already reached $this->length
  */
 protected function match($pattern, $position = null, $options = 'msi')
 {
     // Fall back to the stored byte position when the caller gave none.
     $startAt = (null === $position) ? $this->bytepos : $position;

     if ($this->position < $this->length) {
         $anchored = '\\G' . $pattern;
         mb_ereg_search_init($this->text, $anchored, $options);
         mb_ereg_search_setpos($startAt);

         return mb_ereg_search_regs();
     }

     // $this->position is at or past $this->length: no search is attempted.
     return false;
 }
/**
 * Rewrites a MathML formula so that the contents of its <mtext> elements are
 * emitted as plain text outside of the math markup.
 *
 * The formula is scanned tag by tag. When an <mtext> is found, its content is
 * placed in the output without the <mtext> tags; if math content precedes it,
 * the enclosing allowed tags are closed before the text and reopened after it.
 * Formulas that end up without any content are removed.
 *
 * @param string $formula MathML markup to process.
 * @param bool   $encoded When true, tags are delimited by '«' and '»' instead
 *                        of '<' and '>', and extracted text is passed through
 *                        wrsqz_mathmlDecode().
 * @return string The rewritten formula; the input is returned unchanged when it
 *                does not contain 'mtext'.
 */
function wrsqz_extractTextFromMathML($formula, $encoded=true){
    //Algorithm: We scan the MathML tag by tag.
    //If a tag is one of the allowed ones (math, mrow) we save it on the stack
    //and continue with the next.
    //If the tag is not allowed (mfenced, mfrac,...) we skip all MathML until its
    //closure (</mfenced>, </mfrac>)
    //If the tag is <mtext> we rearrange the formula
    //If a tag is the closure of an allowed tag, we pop it from the stack.

    //quick return if there is nothing to do.
    if(strpos($formula,'mtext')===false) return $formula;
    //initializations
    $opentag = $encoded ? '«' : '<';
    $closetag = $encoded ? '»' : '>';
    //tags where an <mtext> can live inside.
    $allowedtags = array('math', 'mrow');

    $pattern = $opentag.'([^'.$opentag.$closetag.']*)'.$closetag; //regexp that matches a single tag label
    mb_ereg_search_init($formula, $pattern);
    $stack = array();       //stack of opened tags; each entry is array(tag name, full tag text)
    $omittedcontent=false;  //is there math content before the current point?
    $lasttag=null;          //name of the tag on top of the stack
    $length = strlen($formula);
    $beginformula = strpos($formula, $opentag);   //position of the first character of the last formula (in bytes).
    $pos=array(0,0);        //array(start, length) of the last tag match, as returned by mb_ereg_search_pos
    //CAUTION: If you change this function, be very careful with multibyte
    //         and non-multibyte functions.
    while(($pos[0]+$pos[1])<$length){
        $pos = mb_ereg_search_pos($pattern);
        //NOTE(review): the result is not checked for false here, unlike the
        //closure search in the last branch below.

        if($pos[0]+$pos[1] < $length){
            //this is always true except on the last iteration
            mb_ereg_search_setpos($pos[0]+$pos[1]);
        }
        //full tag text including its delimiters, and the same without them
        $tag = substr($formula, $pos[0],$pos[1]);
        $trimmedTag = mb_substr($tag,1,-1);
        //skip self-closing tags
        if(mb_substr($trimmedTag,-1) == '/'){
            continue;
        }
        //discard attributes
        if(($spacepos = mb_strpos($trimmedTag,' '))!==false){
            $trimmedTag=mb_substr($trimmedTag,0,$spacepos);
        }      
        if(in_array($trimmedTag,$allowedtags)){
        //allowed tag: remember it so it can be closed and reopened around text
            $stack[]=array($trimmedTag,$tag);
            $lasttag = $trimmedTag;
        }else if($trimmedTag == '/'.$lasttag){
        //close allowed tag
            array_pop($stack);
            $lasttag = end($stack);
            $lasttag = $lasttag[0];
            //discard empty formulas
            if(empty($stack) && !$omittedcontent){
                $formula1 = substr($formula, 0, $beginformula);
                if($pos[0]+$pos[1]<$length){
                    //this isn't the end: cut the empty formula out and restart
                    //the search at the point where it used to begin.
                    $formula2 = substr($formula, $pos[0]+$pos[1]);
                    $formula = $formula1 . $formula2;
                    $length = strlen($formula);
                    mb_ereg_search_init($formula, $pattern);
                    mb_ereg_search_setpos($beginformula);
                }else{
                    //this is the last iteration. $length and mb_ereg_search
                    //string and position will be wrong, but it doesn't matter.
                    $formula = $formula1;
                }
                
            }
        }else if($trimmedTag == 'mtext'){
            //locate the matching </mtext> and take everything in between as text
            //NOTE(review): $pos2 is not checked for false (unclosed <mtext>).
            $pos2 = mb_ereg_search_pos($opentag.'/mtext'.$closetag);
            $text = substr($formula, $pos[0]+$pos[1], $pos2[0]-($pos[0]+$pos[1]));
            //Decode some chars in text
            if($encoded) $text=wrsqz_mathmlDecode($text);
            $text = str_replace('&centerdot;','&middot;',$text);
            $text = str_replace('&apos;','&#39;',$text);
            $formula1 = substr($formula, 0, $pos[0]);  //until <mtext>
            $formula2 = substr($formula, $pos2[0]+$pos2[1]); //from </mtext>
            if($omittedcontent){ 
                //we have a non-empty formula before the text so we must close it
                //compute the tail (close tags) of the formula before the text
                //and the head (open tags) of the formula after the text.
                $copystack = $stack; //copy stack
                $tail1 = '';
                $head2 = '';
                //pop from the innermost tag outwards: closing tags are appended
                //in that order, opening tags are prepended so the outermost comes first.
                while($stacktag = array_pop($copystack)){
                    $tail1.= $opentag.'/'.$stacktag[0].$closetag;
                    $head2 = $stacktag[1] . $head2;
                }
                $formula1 = $formula1 . $tail1;
                $formula2 = $head2 . $formula2;
                //update $formula
                $formula = $formula1 . $text . $formula2;
                //the next formula begins right after the text; searching resumes
                //after its re-opened tags.
                $beginformula = $pos[0]+strlen($tail1)+strlen($text);
                $position = $beginformula+strlen($head2);
            }else{
            //we have an empty formula before the text so we must skip it.
                $head = substr($formula1, 0, $beginformula); //all before the empty formula
                $formula1 = substr($formula1, $beginformula);

                //the text is moved in front of the (still empty) formula
                $formula = $head . $text . $formula1 . $formula2;
                $beginformula += strlen($text);
                $position = $beginformula +strlen($formula1);
            }
            //update parameters with the new formula.
            $length = strlen($formula);
            $omittedcontent = false;
            mb_ereg_search_init($formula, $pattern);
            mb_ereg_search_setpos($position);
            
        }else{
        //not allowed tag: go to its closure and remember that we omitted content
            $pos = mb_ereg_search_pos($opentag.'/'.$trimmedTag.$closetag);
            if($pos === false){
                return $formula; //this is an error in XML (unclosed tag);
            }
            $omittedcontent=true;
            mb_ereg_search_setpos($pos[0]+$pos[1]);
        }
    }

    return $formula;

}
Beispiel #6
0
/**
 * Collect every match of a multibyte regular expression in a string.
 *
 * @param string $pattern Regular expression handed to mb_ereg_search_init()
 * @param string $subject String to search
 * @param array  $matches Receives one entry per match, each entry being the
 *                        array returned by mb_ereg_search_regs()
 * @param string $option  Option string handed to mb_ereg_search_init()
 * @param int    $offset  Byte offset at which the search starts; a negative
 *                        value counts back from the end of $subject
 * @return bool True when at least one match was found; false when nothing
 *              matched, the search could not be initialised, or $offset lies
 *              outside of $subject
 */
function mb_ereg_match_all($pattern, $subject, array &$matches, $option = 'msr', $offset = 0)
{
    if (!mb_ereg_search_init($subject, $pattern, $option)) {
        return false;
    }
    $matches = array();

    // Normalise and validate the offset ourselves so the outcome does not
    // depend on how the installed PHP version reports an out-of-range position.
    $length = strlen($subject);
    if ($offset < 0) {
        $offset += $length;
    }
    if ($offset < 0 || $offset > $length) {
        return false;
    }
    // The position must be set AFTER mb_ereg_search_init(): initialising a
    // search starts it from the beginning of the new subject again.
    if ($offset > 0 && !mb_ereg_search_setpos($offset)) {
        return false;
    }

    $position = $offset;
    while ($r = mb_ereg_search_regs()) {
        $matches[] = $r;
        $next = mb_ereg_search_getpos();
        if ($next <= $position) {
            // The search position did not move forward (e.g. a zero-length
            // match); searching again would yield the same match forever.
            break;
        }
        $position = $next;
    }
    return !empty($matches);
}
 /**
  * Break a search text into its individual search tokens.
  *
  * The token pattern is an alternation of the 'phrase', 'sbword', 'mbword'
  * and 'symbol' entries of $this->_regex_patterns, in that order.
  *
  * @access private
  * @param string $text 'UTF-8' encoded search text
  * @return array search text tokens, in the order they were matched
  */
 function _split_to_tokens($text)
 {
     // Alternatives of the token pattern:
     $alternatives = array(
         // 1. double quoted phrase
         $this->_regex_patterns['phrase'],
         // 2. single byte word contains html entities and latin1 letters
         $this->_regex_patterns['sbword'],
         // 3. multi byte word
         $this->_regex_patterns['mbword'],
         // 4. symbol - !#$%&'()*+,-./:;<=>?@[\]~_`{|}~ and latin1 supplement symbol
         $this->_regex_patterns['symbol'],
     );
     $pattern = implode('|', $alternatives);

     mb_ereg_search_init($text, $pattern);

     $found = array();
     $end = strlen($text);
     $offset = 0;
     while ($offset < $end) {
         mb_ereg_search_setpos($offset);
         $regs = mb_ereg_search_regs();
         if (false === $regs) {
             // no further token in the rest of the text
             break;
         }
         // index 0 holds the whole matched token
         $found[] = $regs[0];
         // continue from the position the search engine reports for the next match
         $offset = mb_ereg_search_getpos();
     }

     return $found;
 }