/**
  * Runs a text through the sanitizer pipeline (toLowercase() followed by
  * sanitizeWith() using the generic regex tokenizer, the CDB stopword
  * analyzer for the given language code and the null synonymizer) and
  * compares the outcome with the expected value.
  *
  * @dataProvider textByLanguageProvider
  *
  * @param mixed $languageCode forwarded to newCdbStopwordAnalyzer()
  * @param mixed $text         input handed to newSanitizer()
  * @param mixed $expected     value the sanitized output must equal
  */
 public function testByLanguage($languageCode, $text, $expected)
 {
     $factory = new SanitizerFactory();

     $sanitizer = $factory->newSanitizer($text);
     $sanitizer->toLowercase();

     // Collaborators are created in the same order the pipeline receives them.
     $tokenizer = $factory->newGenericRegExTokenizer();
     $stopwordAnalyzer = $factory->newCdbStopwordAnalyzer($languageCode);
     $synonymizer = $factory->newNullSynonymizer();

     $actual = $sanitizer->sanitizeWith($tokenizer, $stopwordAnalyzer, $synonymizer);

     $this->assertEquals($expected, $actual);
 }
예제 #2
0
 /**
  * Tokenizes a text with the n-gram tokenizer (built on top of the generic
  * regex tokenizer, with withMarker(true) and the requested n-gram size)
  * and compares the resulting tokens with the expected value.
  *
  * @dataProvider textProvider
  *
  * @param mixed $languageCode supplied by the provider; not used by this test
  * @param mixed $ngramSize    forwarded to setNgramSize()
  * @param mixed $text         input passed to tokenize()
  * @param mixed $expected     value the produced tokens must equal
  */
 public function testNgramWithBeginEndMarker($languageCode, $ngramSize, $text, $expected)
 {
     $factory = new SanitizerFactory();

     $ngramTokenizer = $factory->newNGramTokenizer(
         $factory->newGenericRegExTokenizer()
     );
     $ngramTokenizer->withMarker(true);
     $ngramTokenizer->setNgramSize($ngramSize);

     $this->assertEquals($expected, $ngramTokenizer->tokenize($text));
 }
예제 #3
0
 /**
  * Runs language detection on a text.
  *
  * Starts from the null language detector and switches to the TextCat
  * detector when a 'TextCatLanguageDetector' entry is set in the
  * language detection configuration; that entry is handed to
  * setLanguageCandidates().
  *
  * @param mixed $text text to inspect; it is first passed through
  *                    Normalizer::reduceLengthTo($text, 200)
  *
  * @return mixed null when the language detection configuration is an
  *               empty array, otherwise the result of the detector's detect()
  */
 private function predictLanguage($text)
 {
     // Nothing configured: skip detection altogether.
     if ($this->languageDetection === array()) {
         return null;
     }

     $textCatKey = 'TextCatLanguageDetector';
     $detector = $this->sanitizerFactory->newNullLanguageDetector();

     if (isset($this->languageDetection[$textCatKey])) {
         $detector = $this->sanitizerFactory->newTextCatLanguageDetector();
         $detector->setLanguageCandidates($this->languageDetection[$textCatKey]);
     }

     $excerpt = Normalizer::reduceLengthTo($text, 200);

     return $detector->detect($excerpt);
 }
예제 #4
0
 /**
  * Tokenizes a text with the JaTinySegmenter tokenizer, which is built
  * around the punctuation regex tokenizer, and compares the tokens with
  * the expected value.
  *
  * @dataProvider tinyTextProvider
  *
  * @param mixed $text     input passed to tokenize()
  * @param mixed $expected value the produced tokens must equal
  */
 public function testJaTinySegmenterTokenizer($text, $expected)
 {
     $factory = new SanitizerFactory();

     $punctuationTokenizer = $factory->newPunctuationRegExTokenizer();
     $tokenizer = $factory->newJaTinySegmenterTokenizer($punctuationTokenizer);

     $actual = $tokenizer->tokenize($text);

     $this->assertEquals($expected, $actual);
 }