コード例 #1
0
 /**
  * Splits a string into tokens, optionally pre-processing it with the
  * injected tokenizer, and drops empty tokens as well as tokens that consist
  * of a single hiragana/katakana character.
  *
  * @since 0.1
  *
  * {@inheritDoc}
  *
  * @param string $string
  *
  * @return array list of tokens, indexed from 0
  */
 public function tokenize($string)
 {
     // Let the inner tokenizer (if any) run first and re-join its tokens so
     // the result can be processed as one space-delimited string.
     if ($this->tokenizer !== null) {
         $string = implode(" ", $this->tokenizer->tokenize($string));
     }

     // Compound markers become spaces before the character-group split.
     $segmented = $this->splitByCharacterGroup(str_replace($this->compound, ' ', $string));

     $tokens = array();

     // explode() with a non-empty delimiter always yields an array, so no
     // separate failure branch is needed here.
     foreach (explode(" ", $segmented) as $token) {
         // Consecutive spaces produce empty fragments; skip them.
         if ($token === '') {
             continue;
         }

         // Single katakana/hiragana are exempted
         if (mb_strlen($token) === 1 && CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $token)) {
             continue;
         }

         $tokens[] = $token;
     }

     return $tokens;
 }
コード例 #2
0
ファイル: SanitizerFactory.php プロジェクト: onoi/tesa
 /**
  * Assembles a tokenizer chain for the given text.
  *
  * When CharacterExaminer reports HIRAGANA_KATAKANA characters in the text,
  * the chain starts with the JaTinySegmenter tokenizer; otherwise it starts
  * with the NGram tokenizer. That tokenizer is handed to the
  * CJKSimpleCharacterRegEx tokenizer factory, whose result is in turn handed
  * to the GenericRegEx tokenizer factory.
  *
  * @since 0.1
  *
  * @param string $text
  *
  * @return Tokenizer
  */
 public function newCJKMatchableTokenizer($text)
 {
     $base = null;

     if (!CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $text)) {
         // No kana found: start from the n-gram tokenizer (no inner tokenizer).
         $base = $this->newNGramTokenizer($base);
     } else {
         $base = $this->newJaTinySegmenterTokenizer();
     }

     return $this->newGenericRegExTokenizer(
         $this->newCJKSimpleCharacterRegExTokenizer($base)
     );
 }
コード例 #3
0
ファイル: CharacterExaminerTest.php プロジェクト: onoi/tesa
 /**
  * CharacterExaminer::contains() is expected to return false when called
  * with the type 'Foo' (presumably an unsupported type identifier) and
  * the character '鿩'.
  */
 public function testToContainUnknownCharacters()
 {
     $isContained = CharacterExaminer::contains('Foo', '鿩');

     $this->assertFalse($isContained);
 }