PHP Onoi\Tesa SanitizerFactory::newSanitizer示例

 /**
  * @since 2.5
  *
  * @param string $text
  * @param boolean $isSearchTerm
  *
  * @return string
  */
 public function sanitize($text, $isSearchTerm = false)
 {
     // Strip surrounding whitespace first, then decode percent-encoded
     // sequences so the sanitizer works on the literal characters.
     $text = rawurldecode(trim($text));

     // NOTE(review): the default is an empty string while the search-term
     // case below is an array; presumably both option setters tolerate
     // either form -- confirm against Sanitizer/Tokenizer::setOption.
     $exemptionList = '';

     // Those have special meaning when running a match search against
     // the fulltext index (wildcard, phrase matching markers etc.)
     if ($isSearchTerm) {
         $exemptionList = array('*', '"', '+', '-', '&', ',', '@');
     }

     $sanitizer = $this->sanitizerFactory->newSanitizer($text);
     $sanitizer->toLowercase();
     $sanitizer->applyTransliteration();
     $sanitizer->convertDoubleWidth();

     // Drop URL scheme prefixes and line breaks/tabs, turn underscores into
     // spaces, and map a remaining '%2A' / '&#x005B;' to '*' / '['.
     $sanitizer->replace(
         array('http://', 'https://', 'mailto:', '%2A', '_', '&#x005B;', "\n", "\t"),
         array('', '', '', '*', ' ', '[', "", "")
     );

     // The language is predicted from the decoded input text, not from the
     // sanitizer's transformed copy.
     $language = $this->predictLanguage($text);

     $sanitizer->setOption(Sanitizer::WHITELIST, $exemptionList);
     $sanitizer->setOption(Sanitizer::MIN_LENGTH, $this->minTokenSize);

     $tokenizer = $this->sanitizerFactory->newPreferredTokenizerByLanguage($text, $language);
     $tokenizer->setOption(Tokenizer::REGEX_EXEMPTION, $exemptionList);

     $text = $sanitizer->sanitizeWith(
         $tokenizer,
         $this->sanitizerFactory->newStopwordAnalyzerByLanguage($language),
         $this->sanitizerFactory->newSynonymizerByLanguage($language)
     );

     // Remove possible spaces added by the tokenizer
     return str_replace(
         array(' *', '* ', ' "', '" ', '+ ', '- ', '@ '),
         array('*', '*', '"', '"', ' +', ' -', '@'),
         $text
     );
 }

示例#2

显示文件

文件： CombinedSanitizerTextStopwordTest.php 项目： onoi/tesa

 /**
  * @dataProvider textByLanguageProvider
  */
 public function testByLanguage($languageCode, $text, $expected)
 {
     $factory = new SanitizerFactory();

     // Lowercase the input before it is tokenized and stopword-filtered.
     $instance = $factory->newSanitizer($text);
     $instance->toLowercase();

     // Collaborators: generic regex tokenizer, CDB stopword analyzer for
     // the language under test, and a null synonymizer.
     $tokenizer = $factory->newGenericRegExTokenizer();
     $stopwordAnalyzer = $factory->newCdbStopwordAnalyzer($languageCode);
     $synonymizer = $factory->newNullSynonymizer();

     $this->assertEquals(
         $expected,
         $instance->sanitizeWith($tokenizer, $stopwordAnalyzer, $synonymizer)
     );
 }