/**
  * @dataProvider textByLanguageProvider
  */
 public function testByLanguage($languageCode, $text, $expected)
 {
     // Each data set supplies a language code, the raw input text and the
     // value the sanitizer is expected to return for it.
     $factory = new SanitizerFactory();

     // Wrap the input and call toLowercase() before any sanitizing happens.
     $sanitizer = $factory->newSanitizer($text);
     $sanitizer->toLowercase();

     // Collaborators handed to sanitizeWith(): a generic regex tokenizer, a
     // stopword analyzer created for the given language code, and the
     // "null" synonymizer (presumably a no-op; confirm in SanitizerFactory).
     $tokenizer = $factory->newGenericRegExTokenizer();
     $stopwordAnalyzer = $factory->newCdbStopwordAnalyzer($languageCode);
     $synonymizer = $factory->newNullSynonymizer();

     $actual = $sanitizer->sanitizeWith($tokenizer, $stopwordAnalyzer, $synonymizer);

     $this->assertEquals($expected, $actual);
 }
Example #2
0
 /**
  * @dataProvider icuTextProvider
  */
 public function testIcuWordBoundaryTokenizer($text, $expected)
 {
     // Each data set supplies the input text and the token list that the
     // ICU word-boundary tokenizer is expected to produce for it.
     $sanitizerFactory = new SanitizerFactory();

     // The ICU tokenizer is built around a generic regex tokenizer.
     $tokenizer = $sanitizerFactory->newIcuWordBoundaryTokenizer(
         $sanitizerFactory->newGenericRegExTokenizer()
     );

     // The expected values are tied to one specific ICU release, so skip on
     // anything else. The availability check comes first so the version
     // constant is only read when the tokenizer reports itself usable.
     // Strict comparison is required: with `!=` PHP compares two numeric
     // strings numerically, so e.g. '54.10' would be treated as equal to '54.1'.
     if (!$tokenizer->isAvailable() || INTL_ICU_VERSION !== '54.1') {
         $this->markTestSkipped('ICU extension is not available or does not match the expected version constraint.');
     }

     $this->assertEquals($expected, $tokenizer->tokenize($text));
 }
Example #3
0
 /**
  * @dataProvider textProvider
  */
 public function testNgramWithBeginEndMarker($languageCode, $ngramSize, $text, $expected)
 {
     // Each data set supplies a language code, the n-gram size, the input
     // text and the expected token list. $languageCode is not used by this
     // test body; it stays in the signature because the data provider
     // passes its values positionally.
     $factory = new SanitizerFactory();

     // The n-gram tokenizer is built around a generic regex tokenizer.
     $baseTokenizer = $factory->newGenericRegExTokenizer();
     $ngramTokenizer = $factory->newNGramTokenizer($baseTokenizer);

     // Enable the marker option and apply the n-gram size from the data set
     // before tokenizing.
     $ngramTokenizer->withMarker(true);
     $ngramTokenizer->setNgramSize($ngramSize);

     $this->assertEquals($expected, $ngramTokenizer->tokenize($text));
 }