/**
 * Optionally pre-tokenizes the input with the chained tokenizer, re-joins
 * the resulting tokens with single spaces, and hands the text to the
 * segment() method of the object returned by loadModel().
 *
 * @since 0.1
 *
 * {@inheritDoc}
 */
public function tokenize($string) {
    $text = $string;

    // A chained tokenizer (when set) runs first; its tokens are glued back
    // together so that segment() receives one space-delimited string.
    if (null !== $this->tokenizer) {
        $preTokens = $this->tokenizer->tokenize($text);
        $text = implode(" ", $preTokens);
    }

    $model = $this->loadModel();

    return $model->segment($text);
}
/**
 * Splits a string into tokens.
 *
 * When a chained tokenizer is set, the input is first tokenized by it and
 * re-joined with single spaces. If isAvailable() then reports true, the
 * string is passed to createTokens(). Otherwise the method falls back to the
 * chained tokenizer's output for the (already joined) string or, without a
 * chained tokenizer, to a one-element array holding the string.
 *
 * @since 0.1
 *
 * @param string $string
 *
 * @return array|false
 */
public function tokenize($string) {
    if (null !== $this->tokenizer) {
        $preTokens = $this->tokenizer->tokenize($string);
        $string = implode(" ", $preTokens);
    }

    // Preferred path: createTokens() is only used when isAvailable() is true
    if ($this->isAvailable()) {
        return $this->createTokens($string);
    }

    // Fallback without a chained tokenizer: the whole string is one token
    if (null === $this->tokenizer) {
        return array($string);
    }

    // Fallback with a chained tokenizer: tokenize the joined string again
    return $this->tokenizer->tokenize($string);
}
/**
 * Optionally pre-tokenizes the input with the chained tokenizer, replaces
 * occurrences of $this->compound with a space, passes the text through
 * splitByCharacterGroup() and splits the result on spaces.
 *
 * Empty segments are dropped, as are single-character segments for which
 * CharacterExaminer::contains() matches HIRAGANA_KATAKANA. The returned list
 * is re-indexed from 0.
 *
 * @since 0.1
 *
 * {@inheritDoc}
 */
public function tokenize($string) {
    if ($this->tokenizer !== null) {
        $string = implode(" ", $this->tokenizer->tokenize($string));
    }

    $grouped = $this->splitByCharacterGroup(str_replace($this->compound, ' ', $string));

    // explode() with a fixed, non-empty separator always yields an array,
    // so no separate "false" result needs handling here.
    $segments = explode(" ", $grouped);

    $tokens = array();

    foreach ($segments as $segment) {
        // Consecutive spaces produce empty segments
        if ($segment === '') {
            continue;
        }

        // Single katakana/hiragana are exempted
        if (mb_strlen($segment) === 1 && CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $segment)) {
            continue;
        }

        $tokens[] = $segment;
    }

    return $tokens;
}
/**
 * Tokenizes $this->string and rebuilds it as a space-separated string of the
 * tokens that survive filtering.
 *
 * Each token is passed through the synonymizer. It is then dropped when it
 * is not a key of $this->whiteList and is either shorter than the minimum
 * length or reported as a stop word. A token equal to the one kept
 * immediately before it is dropped as well. If the tokenizer returns an
 * empty or non-array result, $this->string is returned unchanged.
 *
 * @since 0.1
 *
 * @param Tokenizer $tokenizer
 * @param StopwordAnalyzer $stopwordAnalyzer
 * @param Synonymizer $synonymizer
 *
 * @return string
 */
public function sanitizeWith(Tokenizer $tokenizer, StopwordAnalyzer $stopwordAnalyzer, Synonymizer $synonymizer) {
    // Treat non-words tokenizers (Ja,Zh*) differently
    $minLength = $tokenizer->isWordTokenizer() ? $this->minLength : 1;

    $words = $tokenizer->tokenize($this->string);

    if (!$words || !is_array($words)) {
        return $this->string;
    }

    $index = array();

    // Last token appended to $index; null until the first token is kept
    $previous = null;

    foreach ($words as $word) {
        $word = $synonymizer->synonymize($word);

        // If it is not an exemption and less than the required minimum length
        // or identified as stop word it is removed
        if (!isset($this->whiteList[$word]) && (mb_strlen($word) < $minLength || $stopwordAnalyzer->isStopWord($word))) {
            continue;
        }

        // Tokens are stored trimmed, so the proximity comparison below has to
        // use the trimmed form as well; otherwise duplicates that differ only
        // in surrounding whitespace would slip through.
        $word = trim($word);

        // Simple proximity, check for same words appearing next to each other
        if ($previous === $word) {
            continue;
        }

        $index[] = $word;
        $previous = $word;
    }

    return implode(' ', $index);
}