/**
 * Lowercases the supplied text, sanitizes it with a generic regex tokenizer,
 * a CDB stopword analyzer for the given language code and a null synonymizer,
 * and asserts that the sanitized result equals the expected value.
 *
 * @dataProvider textByLanguageProvider
 *
 * @param mixed $languageCode language code handed to the CDB stopword analyzer
 * @param mixed $text         raw input text to sanitize
 * @param mixed $expected     expected sanitized output
 */
public function testByLanguage($languageCode, $text, $expected)
{
    $factory = new SanitizerFactory();

    $instance = $factory->newSanitizer($text);
    $instance->toLowercase();

    // Collaborators are created in the same order the sanitizer receives them.
    $tokenizer = $factory->newGenericRegExTokenizer();
    $stopwordAnalyzer = $factory->newCdbStopwordAnalyzer($languageCode);
    $synonymizer = $factory->newNullSynonymizer();

    $actual = $instance->sanitizeWith($tokenizer, $stopwordAnalyzer, $synonymizer);

    $this->assertEquals($expected, $actual);
}
/**
 * Tokenizes the supplied text with the ICU word boundary tokenizer (wrapping
 * a generic regex tokenizer) and asserts the tokens equal the expected value.
 *
 * The test is skipped unless the tokenizer reports itself as available and
 * INTL_ICU_VERSION is exactly '54.1'.
 *
 * @dataProvider icuTextProvider
 *
 * @param mixed $text     input text to tokenize
 * @param mixed $expected expected tokenizer output
 */
public function testIcuWordBoundaryTokenizer($text, $expected)
{
    $sanitizerFactory = new SanitizerFactory();

    $tokenizer = $sanitizerFactory->newIcuWordBoundaryTokenizer(
        $sanitizerFactory->newGenericRegExTokenizer()
    );

    // The availability check comes first so INTL_ICU_VERSION is only read when
    // the tokenizer reports itself as available. The version is compared
    // strictly: a loose comparison of two numeric strings is numeric in PHP,
    // so e.g. '54.10' would wrongly be treated as equal to '54.1'.
    // NOTE(review): the expected values presumably depend on ICU 54.1
    // word-boundary rules — confirm before relaxing the version pin.
    if (!$tokenizer->isAvailable() || INTL_ICU_VERSION !== '54.1') {
        $this->markTestSkipped('ICU extension is not available or does not match the expected version constraint.');
    }

    $this->assertEquals($expected, $tokenizer->tokenize($text));
}
/**
 * Tokenizes the supplied text with an n-gram tokenizer (wrapping a generic
 * regex tokenizer) that has markers enabled and the given n-gram size, and
 * asserts the produced tokens equal the expected value.
 *
 * @dataProvider textProvider
 *
 * @param mixed $languageCode supplied by the data provider; not used in the test body
 * @param mixed $ngramSize    n-gram size passed to the tokenizer
 * @param mixed $text         input text to tokenize
 * @param mixed $expected     expected list of tokens
 */
public function testNgramWithBeginEndMarker($languageCode, $ngramSize, $text, $expected)
{
    $factory = new SanitizerFactory();

    $baseTokenizer = $factory->newGenericRegExTokenizer();
    $ngramTokenizer = $factory->newNGramTokenizer($baseTokenizer);

    // Configure in the same order as before: marker first, then size.
    $ngramTokenizer->withMarker(true);
    $ngramTokenizer->setNgramSize($ngramSize);

    $this->assertEquals($expected, $ngramTokenizer->tokenize($text));
}