Beispiel #1
0
 /**
  * Locate the first run of consecutive Japanese/Kanji chars.
  *
  * The scan starts at $offset and stops at the first char accepted by
  * $this->mbMustBeJpn. From that anchor the run is widened in both directions
  * for as long as the neighboring chars are accepted by
  * $this->mbMayBeJpnChars. The widening towards the beginning is not limited
  * by $offset, so the returned position may be smaller than $offset.
  *
  * @param  MbString     &$mbText [the haystack]
  * @param  integer      $offset  [index at which the scan for an anchor begins]
  * @return array/false           [[position, length] of the run, false if no anchor was found]
  */
 protected function searchConsecutiveJpnChars(MbString &$mbText, $offset = 0)
 {
     $length = $mbText->strlen();

     // look for the anchor: the first char at/after $offset that must be Japanese
     $anchor = $offset;
     while ($anchor < $length && !$this->mbMustBeJpn->has($mbText[$anchor])) {
         ++$anchor;
     }

     // ran off the end without finding any anchor
     if ($anchor >= $length) {
         return false;
     }

     // widen the region towards the beginning of the text
     $first = $anchor;
     while ($first > 0 && $this->mbMayBeJpnChars->has($mbText[$first - 1])) {
         --$first;
     }

     // widen the region towards the end of the text
     $last = $anchor;
     while ($last + 1 < $length && $this->mbMayBeJpnChars->has($mbText[$last + 1])) {
         ++$last;
     }

     // both bounds are inclusive, hence the +1 for the length
     return [$first, $last - $first + 1];
 }
Beispiel #2
0
 /**
  * Apply same-length replacements and spread them over adjacent repeated chars.
  *
  * $text is split with $this->punctuationRegex (delimiters are captured), the
  * text parts are joined into one symbol-free string, the conversion is done
  * on that string, and the converted chars are finally written back into the
  * original text parts so that the symbols keep their places.
  *
  * For each occurrence of a search string, the chars directly before and
  * after it which also occur in the search string are replaced by the char at
  * the same position in the replacement; then the occurrence itself is
  * replaced. Entries whose search string and replacement differ in length are
  * skipped, so the length of the text never changes.
  *
  * @param  string  &$text     [the text to convert, modified in place]
  * @param  array   $convTable [search string => [replacement, condition regex without delimiters]]
  * @param  integer $context   [number of chars on each side of a match that are shown to the condition regex]
  */
 protected function replaceRepeatPattern(&$text, array $convTable, $context)
 {
     // split text into text parts (even key) and symbol parts (odd key)
     // NOTE(review): the even/odd layout assumes the regex has exactly one capturing group -- confirm
     $textSplit = preg_split($this->punctuationRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE);
     if ($textSplit === false) {
         // the split itself failed, so there is nothing safe to convert; keep $text as it is
         return;
     }
     $textSplitCnt = count($textSplit);
     // get text parts and merge them into a new string to $textNoSymbol
     $textNoSymbol = '';
     for ($i = 0; $i < $textSplitCnt; $i += 2) {
         $textNoSymbol .= $textSplit[$i];
     }
     $mbTextNoSymbol = new MbString($textNoSymbol, $this->encoding);
     // only same-length replacements are performed below, so this length stays valid
     $mbTextNoSymbol_len = $mbTextNoSymbol->strlen();
     // do conversion on the de-symboled text, i.e., $mbTextNoSymbol
     foreach ($convTable as $sr => $repArr) {
         // PHP turns numeric-string array keys into integers; work with a string again
         $sr = (string) $sr;
         list($rep, $conditionRegex) = $repArr;
         $mbSr = new MbString($sr, $this->encoding);
         $mbRep = new MbString($rep, $this->encoding);
         $mbSr_len = $mbSr->strlen();
         $mbRep_len = $mbRep->strlen();
         // skip empty search strings and replacements which will cause different lengths
         if ($mbSr_len < 1 || $mbSr_len != $mbRep_len) {
             continue;
         }
         // start the replacement
         $seek = -1;
         while (true) {
             // find the position of the searched string
             $seek = $mbTextNoSymbol->strpos($sr, $seek + 1);
             if ($seek === false) {
                 break;
             }
             // check the $conditionRegex against the match plus $context chars on each side
             if (!empty($conditionRegex)) {
                 // the window is computed from explicit bounds: the former expression
                 // "$mbSr_len + $context << 1" was parsed as "($mbSr_len + $context) << 1"
                 // and therefore reached too far behind the match
                 $sliceStart = $seek > $context ? $seek - $context : 0;
                 $sliceEnd = $seek + $mbSr_len + $context;
                 $textSlice = $mbTextNoSymbol->substr($sliceStart, $sliceEnd - $sliceStart);
                 if (!preg_match("/{$conditionRegex}/u", $textSlice)) {
                     continue;
                 }
             }
             // replace frontward
             $seekFront = $seek;
             while ($seekFront > 0) {
                 --$seekFront;
                 // check $charToCheck is in $sr or not
                 $charToCheck = $mbTextNoSymbol[$seekFront];
                 $charToCheckPosInSr = $mbSr->strpos($charToCheck);
                 if ($charToCheckPosInSr === false) {
                     break;
                 }
                 // replace $charToCheck with the corresponding one
                 $repChar = $mbRep[$charToCheckPosInSr];
                 $mbTextNoSymbol->substr_replace_i($repChar, $seekFront, 1);
             }
             // replace backward
             $seekBack = $seek + $mbSr_len;
             while ($seekBack < $mbTextNoSymbol_len) {
                 // check $charToCheck is in $sr or not
                 $charToCheck = $mbTextNoSymbol[$seekBack];
                 $charToCheckPosInSr = $mbSr->strpos($charToCheck);
                 if ($charToCheckPosInSr === false) {
                     break;
                 }
                 // replace $charToCheck with the corresponding one
                 $repChar = $mbRep[$charToCheckPosInSr];
                 $mbTextNoSymbol->substr_replace_i($repChar, $seekBack, 1);
                 ++$seekBack;
             }
             // replace the center
             $mbTextNoSymbol->substr_replace_i($rep, $seek, $mbSr_len);
         }
     }
     // patch text parts in $textSplit by using $mbTextNoSymbol
     // (each part keeps its char count, so the parts can be cut back out by length)
     for ($i = $seek = 0; $i < $textSplitCnt; $i += 2) {
         $pieceLength = MbString::static_strlen($textSplit[$i], $this->encoding);
         $textSplit[$i] = $mbTextNoSymbol->substr($seek, $pieceLength);
         $seek += $pieceLength;
     }
     unset($mbTextNoSymbol);
     // re-construct $text by concatenating $textSplit
     $text = implode('', $textSplit);
 }