/**
 * Splits the given string into tokens.
 *
 * If an inner tokenizer is set, it runs first and its tokens are re-joined
 * with a space. Compound markers are then replaced by a space, the string is
 * passed through splitByCharacterGroup(), and the result is split on spaces.
 * Empty tokens and single-character hiragana/katakana tokens are discarded.
 *
 * @since 0.1
 *
 * {@inheritDoc}
 */
public function tokenize($string)
{
    if ($this->tokenizer !== null) {
        $string = implode(" ", $this->tokenizer->tokenize($string));
    }

    // explode() with a non-empty separator always yields an array, so no
    // failure check on the result is needed.
    $result = explode(
        " ",
        $this->splitByCharacterGroup(str_replace($this->compound, ' ', $string))
    );

    foreach ($result as $key => $value) {

        if ($value === '') {
            unset($result[$key]);
            continue;
        }

        // Single katakana/hiragana are exempted
        if (mb_strlen($value) === 1 && CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $value)) {
            unset($result[$key]);
        }
    }

    // Re-index so callers receive a gap-free list after the removals above
    return array_values($result);
}
/**
 * Builds the tokenizer chain used for CJK text matching.
 *
 * The innermost tokenizer depends on whether the text contains
 * hiragana/katakana; it is then wrapped by the CJK simple character regex
 * tokenizer and finally by the generic regex tokenizer.
 *
 * @since 0.1
 *
 * @param string $text
 *
 * @return Tokenizer
 */
public function newCJKMatchableTokenizer($text)
{
    $hasKana = CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $text);

    // The n-gram tokenizer is created without an inner tokenizer
    $inner = null;

    $base = $hasKana
        ? $this->newJaTinySegmenterTokenizer()
        : $this->newNGramTokenizer($inner);

    return $this->newGenericRegExTokenizer(
        $this->newCJKSimpleCharacterRegExTokenizer($base)
    );
}
/**
 * Verifies that CharacterExaminer::contains() returns false when called
 * with the 'Foo' identifier and the character '鿩'.
 */
public function testToContainUnknownCharacters()
{
    $isContained = CharacterExaminer::contains('Foo', '鿩');

    $this->assertFalse($isContained);
}