Example #1
0
 /**
  * Tokenizes a string by passing it to the segment() method of the object
  * returned by loadModel().
  *
  * When a tokenizer is set on this instance it runs first, and its tokens
  * are joined with single spaces to form the input that gets segmented.
  *
  * @since 0.1
  *
  * {@inheritDoc}
  */
 public function tokenize($string)
 {
     $input = $string;

     // Optional pre-tokenization step; the tokens are glued back together
     // so that segment() still receives a single string
     if (null !== $this->tokenizer) {
         $preTokens = $this->tokenizer->tokenize($input);
         $input = implode(" ", $preTokens);
     }

     $model = $this->loadModel();

     return $model->segment($input);
 }
Example #2
0
 /**
  * Tokenizes a string via createTokens() when isAvailable() reports true,
  * and otherwise falls back to the tokenizer set on this instance (or to
  * the unsplit string when there is none).
  *
  * When a tokenizer is set it always runs first and its tokens are joined
  * with single spaces before any further processing.
  *
  * NOTE(review): in the fallback branch the tokenizer is applied a second
  * time, to the already tokenized and re-joined string. Confirm that this
  * double pass is intended.
  *
  * @since 0.1
  *
  * @param string $string
  *
  * @return array|false
  */
 public function tokenize($string)
 {
     if (null !== $this->tokenizer) {
         $preTokens = $this->tokenizer->tokenize($string);
         $string = implode(" ", $preTokens);
     }

     // Preferred path: the availability check passed
     if ($this->isAvailable()) {
         return $this->createTokens($string);
     }

     // Fallback without any tokenizer: hand back the input as a single token
     if ($this->tokenizer === null) {
         return array($string);
     }

     // Fallback with a tokenizer: tokenize the (already joined) string
     return $this->tokenizer->tokenize($string);
 }
Example #3
0
 /**
  * Tokenizes a string by replacing every occurrence of $this->compound with
  * a space, passing the result through splitByCharacterGroup() and splitting
  * what comes back on single spaces.
  *
  * When a tokenizer is set on this instance it runs first, and its tokens
  * are joined with single spaces to form the input.
  *
  * Empty segments are dropped, as are one-character segments for which
  * CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, ...)
  * is true. The returned array is indexed consecutively from 0.
  *
  * @since 0.1
  *
  * {@inheritDoc}
  */
 public function tokenize($string)
 {
     if ($this->tokenizer !== null) {
         $string = implode(" ", $this->tokenizer->tokenize($string));
     }

     $segments = explode(
         " ",
         $this->splitByCharacterGroup(str_replace($this->compound, ' ', $string))
     );

     // explode() with a non-empty separator always yields an array, so no
     // failure branch is needed; collect the surviving segments directly
     // instead of unsetting entries and re-indexing afterwards
     $tokens = array();

     foreach ($segments as $segment) {
         // Consecutive separators produce empty segments
         if ($segment === '') {
             continue;
         }

         // Single katakana/hiragana are exempted
         if (mb_strlen($segment) === 1 && CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $segment)) {
             continue;
         }

         $tokens[] = $segment;
     }

     return $tokens;
 }
Example #4
0
 /**
  * Rebuilds the current string from its tokens as a space-separated list.
  *
  * Each token is replaced by the result of the synonymizer and then dropped
  * when it is either shorter than the minimum length or reported as a stop
  * word, unless it is a key of $this->whiteList. A token that is identical
  * to the one kept directly before it is dropped as well.
  *
  * If the tokenizer returns nothing usable (an empty or non-array result)
  * the current string is returned unchanged.
  *
  * @since 0.1
  *
  * @param Tokenizer $tokenizer
  * @param StopwordAnalyzer $stopwordAnalyzer
  * @param Synonymizer $synonymizer
  *
  * @return string
  */
 public function sanitizeWith(Tokenizer $tokenizer, StopwordAnalyzer $stopwordAnalyzer, Synonymizer $synonymizer)
 {
     // Treat non-words tokenizers (Ja,Zh*) differently
     $minLength = $tokenizer->isWordTokenizer() ? $this->minLength : 1;
     $words = $tokenizer->tokenize($this->string);

     if (!$words || !is_array($words)) {
         return $this->string;
     }

     $index = array();

     // Last entry added to $index; null until the first word is kept
     $previous = null;

     foreach ($words as $word) {
         $word = $synonymizer->synonymize($word);

         // If it is not an exemption and less than the required minimum length
         // or identified as stop word it is removed
         if (!isset($this->whiteList[$word]) && (mb_strlen($word) < $minLength || $stopwordAnalyzer->isStopWord($word))) {
             continue;
         }

         // Entries are stored trimmed, so the comparison below has to use
         // the trimmed form as well or padded duplicates would slip through
         $word = trim($word);

         // Simple proximity, check for same words appearing next to each other
         if ($previous === $word) {
             continue;
         }

         $index[] = $word;
         $previous = $word;
     }

     return implode(' ', $index);
 }