/**
 * Lowercases the given text, runs it through the generic regex tokenizer,
 * the CDB stopword analyzer created for the language code, and the null
 * synonymizer, then compares the sanitized result with the expectation.
 *
 * @dataProvider textByLanguageProvider
 *
 * @param mixed $languageCode passed to the CDB stopword analyzer factory method
 * @param mixed $text         raw input handed to the sanitizer
 * @param mixed $expected     value the sanitized output is compared against
 */
public function testByLanguage($languageCode, $text, $expected)
{
    $factory = new SanitizerFactory();

    $instance = $factory->newSanitizer($text);
    $instance->toLowercase();

    // Collaborators are created in the same order as they are passed on.
    $regExTokenizer = $factory->newGenericRegExTokenizer();
    $stopwordAnalyzer = $factory->newCdbStopwordAnalyzer($languageCode);
    $synonymizer = $factory->newNullSynonymizer();

    $sanitized = $instance->sanitizeWith($regExTokenizer, $stopwordAnalyzer, $synonymizer);

    $this->assertEquals($expected, $sanitized);
}
/**
 * Tokenizes the text with an n-gram tokenizer that wraps the generic regex
 * tokenizer, with the marker option switched on and the given n-gram size,
 * and compares the produced tokens with the expectation.
 *
 * @dataProvider textProvider
 *
 * @param mixed $languageCode supplied by the data provider; not used in this test body
 * @param mixed $ngramSize    value forwarded to setNgramSize()
 * @param mixed $text         input to tokenize
 * @param mixed $expected     value the token result is compared against
 */
public function testNgramWithBeginEndMarker($languageCode, $ngramSize, $text, $expected)
{
    $factory = new SanitizerFactory();

    $innerTokenizer = $factory->newGenericRegExTokenizer();

    $ngramTokenizer = $factory->newNGramTokenizer($innerTokenizer);
    $ngramTokenizer->withMarker(true);
    $ngramTokenizer->setNgramSize($ngramSize);

    $actual = $ngramTokenizer->tokenize($text);

    $this->assertEquals($expected, $actual);
}
/**
 * Runs language detection on the given text.
 *
 * Returns null straight away when the languageDetection configuration is an
 * empty array. Otherwise a null language detector is used unless the
 * configuration has a 'TextCatLanguageDetector' entry, in which case a
 * TextCat detector is created and given that entry as language candidates.
 *
 * @param mixed $text text to inspect
 *
 * @return mixed null when detection is not configured, otherwise whatever
 *               the selected detector's detect() returns
 */
private function predictLanguage($text)
{
    if ($this->languageDetection === array()) {
        return null;
    }

    $textCatKey = 'TextCatLanguageDetector';

    // Default detector; replaced below when TextCat is configured.
    $detector = $this->sanitizerFactory->newNullLanguageDetector();

    if (isset($this->languageDetection[$textCatKey])) {
        $detector = $this->sanitizerFactory->newTextCatLanguageDetector();
        $detector->setLanguageCandidates($this->languageDetection[$textCatKey]);
    }

    // NOTE(review): presumably limits the text to 200 characters before
    // detection - confirm against Normalizer::reduceLengthTo.
    $excerpt = Normalizer::reduceLengthTo($text, 200);

    return $detector->detect($excerpt);
}
/**
 * Tokenizes the text with the JaTinySegmenter tokenizer, which is created
 * around a punctuation regex tokenizer, and compares the tokens with the
 * expectation.
 *
 * @dataProvider tinyTextProvider
 *
 * @param mixed $text     input to tokenize
 * @param mixed $expected value the token result is compared against
 */
public function testJaTinySegmenterTokenizer($text, $expected)
{
    $factory = new SanitizerFactory();

    $punctuationTokenizer = $factory->newPunctuationRegExTokenizer();
    $segmenter = $factory->newJaTinySegmenterTokenizer($punctuationTokenizer);

    $actual = $segmenter->tokenize($text);

    $this->assertEquals($expected, $actual);
}