/**
 * Turns raw text into the sanitized token string produced by the
 * sanitizer/tokenizer pipeline.
 *
 * The input is trimmed and URL-decoded, run through the sanitizer's
 * lowercase, transliteration and double-width conversion steps, stripped
 * of URL scheme prefixes, newlines and tabs, and finally passed through the
 * tokenizer, stopword analyzer and synonymizer that the factory selects for
 * the language predicted from the decoded input.
 *
 * @since 2.5
 *
 * @param string $text
 * @param boolean $isSearchTerm when true, characters with a special meaning
 *  in a fulltext match search are handed to the sanitizer and tokenizer as
 *  exemptions
 *
 * @return string
 */
public function sanitize($text, $isSearchTerm = false) {
    $text = rawurldecode(trim($text));

    // Those have special meaning when running a match search against
    // the fulltext index (wildcard, phrase matching markers etc.), so they
    // are only exempted for search terms; otherwise the list stays empty.
    $exemptionList = $isSearchTerm ? array('*', '"', '+', '-', '&', ',', '@') : '';

    $sanitizer = $this->sanitizerFactory->newSanitizer($text);
    $sanitizer->toLowercase();
    $sanitizer->applyTransliteration();
    $sanitizer->convertDoubleWidth();

    // Drop URL scheme prefixes and line/tab breaks, turn underscores into
    // spaces and map '%2A' (the percent-encoded form of '*') to '*'.
    // NOTE(review): the '[' => '[' pair is a no-op as written; the search
    // value was possibly an encoded form of '[' originally -- confirm.
    $sanitizer->replace(
        array('http://', 'https://', 'mailto:', '%2A', '_', '[', "\n", "\t"),
        array('', '', '', '*', ' ', '[', "", "")
    );

    // The language is predicted from the decoded input text held in $text.
    $language = $this->predictLanguage($text);

    $sanitizer->setOption(Sanitizer::WHITELIST, $exemptionList);
    $sanitizer->setOption(Sanitizer::MIN_LENGTH, $this->minTokenSize);

    $tokenizer = $this->sanitizerFactory->newPreferredTokenizerByLanguage($text, $language);
    $tokenizer->setOption(Tokenizer::REGEX_EXEMPTION, $exemptionList);

    $text = $sanitizer->sanitizeWith(
        $tokenizer,
        $this->sanitizerFactory->newStopwordAnalyzerByLanguage($language),
        $this->sanitizerFactory->newSynonymizerByLanguage($language)
    );

    // Remove possible spaces added by the tokenizer around the search
    // markers. str_replace() applies the pairs one after another, so the
    // order of the entries is significant.
    return str_replace(
        array(' *', '* ', ' "', '" ', '+ ', '- ', '@ '),
        array('*', '*', '"', '"', ' +', ' -', '@'),
        $text
    );
}
/**
 * Checks that lowercasing the provided text and sanitizing it with the
 * generic regex tokenizer, the CDB stopword analyzer for the given language
 * code and the null synonymizer yields the expected string.
 *
 * @dataProvider textByLanguageProvider
 */
public function testByLanguage($languageCode, $text, $expected) {
    $factory = new SanitizerFactory();

    $instance = $factory->newSanitizer($text);
    $instance->toLowercase();

    // Collaborators are created in the same order in which they are passed on.
    $tokenizer = $factory->newGenericRegExTokenizer();
    $stopwordAnalyzer = $factory->newCdbStopwordAnalyzer($languageCode);
    $synonymizer = $factory->newNullSynonymizer();

    $actual = $instance->sanitizeWith($tokenizer, $stopwordAnalyzer, $synonymizer);

    $this->assertEquals($expected, $actual);
}