PHP Onoi\Tesa\Tokenizer Tokenizer Examples

Programming Language: PHP

Namespace/Package Name: Onoi\Tesa\Tokenizer

Class/Type: Tokenizer

Examples at hotexamples.com: 4

PHP Onoi\Tesa\Tokenizer Tokenizer - 4 examples found. These are the top-rated real-world PHP examples of Onoi\Tesa\Tokenizer\Tokenizer, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

tokenize(4)

isWordTokenizer(1)

setOption(1)

Example #1

Show file

File: JaTinySegmenterTokenizer.php Project: onoi/tesa

 /**
  * Segments the given string with the model returned by loadModel().
  *
  * When an inner tokenizer has been set, the input is first run through it
  * and the resulting tokens are joined with single spaces before being
  * handed to the segmenter.
  *
  * @since 0.1
  *
  * {@inheritDoc}
  */
 public function tokenize($string)
 {
     $input = $string;

     // Optional pre-tokenization by the wrapped tokenizer
     if (null !== $this->tokenizer) {
         $preTokens = $this->tokenizer->tokenize($input);
         $input = implode(" ", $preTokens);
     }

     $model = $this->loadModel();

     return $model->segment($input);
 }

Example #2

Show file

File: IcuWordBoundaryTokenizer.php Project: onoi/tesa

 /**
  * Tokenizes a string via createTokens() when isAvailable() reports true.
  *
  * An inner tokenizer, if set, pre-processes the input: its tokens are
  * joined with single spaces. When isAvailable() reports false the method
  * falls back to the inner tokenizer's result for that (pre-processed)
  * string, or to the string itself wrapped in an array if there is no
  * inner tokenizer.
  *
  * @since 0.1
  *
  * @param string $string
  *
  * @return array|false
  */
 public function tokenize($string)
 {
     if (null !== $this->tokenizer) {
         $preTokens = $this->tokenizer->tokenize($string);
         $string = implode(" ", $preTokens);
     }

     if ($this->isAvailable()) {
         return $this->createTokens($string);
     }

     // Fallback path: no inner tokenizer means the text is returned unsplit
     if ($this->tokenizer === null) {
         return array($string);
     }

     return $this->tokenizer->tokenize($string);
 }

Example #3

Show file

File: JaCompoundGroupTokenizer.php Project: onoi/tesa

 /**
  * Splits a string into tokens.
  *
  * Every occurrence of $this->compound is replaced by a space, the result is
  * passed through splitByCharacterGroup() and then split on spaces. Empty
  * fragments and single hiragana/katakana characters are dropped. When an
  * inner tokenizer is set, the input is first tokenized by it and its tokens
  * are joined with single spaces.
  *
  * @since 0.1
  *
  * {@inheritDoc}
  */
 public function tokenize($string)
 {
     if ($this->tokenizer !== null) {
         $string = implode(" ", $this->tokenizer->tokenize($string));
     }

     $segments = explode(" ", $this->splitByCharacterGroup(str_replace($this->compound, ' ', $string)));
     $tokens = array();

     foreach ($segments as $segment) {
         // Consecutive spaces produce empty fragments; they carry no content
         if ($segment === '') {
             continue;
         }

         // A single katakana/hiragana character is not kept as a token
         if (mb_strlen($segment) === 1 && CharacterExaminer::contains(CharacterExaminer::HIRAGANA_KATAKANA, $segment)) {
             continue;
         }

         $tokens[] = $segment;
     }

     // explode() with a non-empty separator always yields an array, so no
     // check against false is required; $tokens is already a 0-based list.
     return $tokens;
 }

Example #4

Show file

File: Sanitizer.php Project: onoi/tesa

 /**
  * Reduces the stored string to a space-separated list of relevant words.
  *
  * The string is tokenized, each token is passed through the synonymizer and
  * trimmed, and a token is then discarded when it is empty, when it is not
  * on the white list and is either shorter than the minimum length or a stop
  * word, or when it equals the word kept immediately before it. If the
  * tokenizer yields nothing usable the original string is returned as-is.
  *
  * @since 0.1
  *
  * @param Tokenizer $tokenizer
  * @param StopwordAnalyzer $stopwordAnalyzer
  * @param Synonymizer $synonymizer
  *
  * @return string
  */
 public function sanitizeWith(Tokenizer $tokenizer, StopwordAnalyzer $stopwordAnalyzer, Synonymizer $synonymizer)
 {
     // Treat non-word tokenizers (Ja,Zh*) differently
     $minLength = $tokenizer->isWordTokenizer() ? $this->minLength : 1;
     $words = $tokenizer->tokenize($this->string);

     if (!$words || !is_array($words)) {
         return $this->string;
     }

     $index = array();
     $previous = null;

     foreach ($words as $word) {
         // Normalize once so that every check below sees the same value
         // that is eventually stored in the index
         $word = trim($synonymizer->synonymize($word));

         // Whitespace-only tokens would otherwise end up as empty entries
         if ($word === '') {
             continue;
         }

         // If it is not an exemption and less than the required minimum length
         // or identified as stop word it is removed
         if (!isset($this->whiteList[$word]) && (mb_strlen($word) < $minLength || $stopwordAnalyzer->isStopWord($word))) {
             continue;
         }

         // Simple proximity, check for same words appearing next to each other
         if ($previous === $word) {
             continue;
         }

         $index[] = $word;
         $previous = $word;
     }

     return implode(' ', $index);
 }