/**
 * Runs the token's term text through the configured stemmer and returns the
 * result as a new token.
 *
 * The replacement token reuses the start offset, end offset and position
 * increment of the source token; only the text differs.
 *
 * @see Zend_Search_Lucene_Analysis_TokenFilter
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token whose text is passed to the stemmer.
 * @return Zend_Search_Lucene_Analysis_Token New token carrying the stemmer's output.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $stemmedText = $this->stemmer->doStem($srcToken->getTermText());

    // Rebuild the token around the new text, keeping the original span.
    $stemmedToken = new Zend_Search_Lucene_Analysis_Token(
        $stemmedText,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $stemmedToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $stemmedToken;
}
/**
 * Passes the token's term text through search_invoke_preprocess() and returns
 * a new token built from the resulting text.
 *
 * This implementation always returns a token; it never removes one by
 * returning null.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to preprocess.
 * @return Zend_Search_Lucene_Analysis_Token New token with the same offsets and
 *                                           position increment as the source.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    // The text lives in a local variable because search_invoke_preprocess()
    // is called for its effect on the argument, not for a return value
    // (presumably it takes the text by reference and invokes
    // hook_search_preprocess() -- confirm against its definition).
    $termText = $srcToken->getTermText();
    search_invoke_preprocess($termText);

    // Wrap the processed text in a fresh token covering the same span.
    $processedToken = new Zend_Search_Lucene_Analysis_Token(
        $termText,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $processedToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $processedToken;
}
/**
 * Returns a lower-cased copy of the given token.
 *
 * The text is lower-cased with strtolower(); start offset, end offset and
 * position increment are copied from the source token. This implementation
 * always returns a token; it never removes one by returning null.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to lower-case.
 * @return Zend_Search_Lucene_Analysis_Token New token with lower-cased text.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $loweredText = strtolower($srcToken->getTermText());

    $loweredToken = new Zend_Search_Lucene_Analysis_Token(
        $loweredText,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $loweredToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $loweredToken;
}
/**
 * Returns a lower-cased copy of the given token.
 *
 * When $this->mbString is truthy the text is lower-cased with
 * mb_strtolower() using the 'utf8' encoding; otherwise plain strtolower()
 * is used. Offsets and position increment are copied from the source token.
 * This implementation always returns a token; it never returns null.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to lower-case.
 * @return Zend_Search_Lucene_Analysis_Token New token with lower-cased text.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    // Pick the multibyte-aware routine only when the flag says so.
    $loweredText = $this->mbString
        ? mb_strtolower($srcToken->getTermText(), 'utf8')
        : strtolower($srcToken->getTermText());

    $loweredToken = new Zend_Search_Lucene_Analysis_Token(
        $loweredText,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $loweredToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $loweredToken;
}
/**
 * Detects the language of the token's text and, when a language code is
 * obtained, replaces the text with its stemmed form for that language.
 *
 * When the analyzer yields no (truthy) language code, the text is left
 * unchanged. In both cases a new token is returned with the source token's
 * offsets and position increment; this implementation never returns null.
 *
 * @param Zend_Search_Lucene_Analysis_Token $po_srctoken Token to stem.
 * @return Zend_Search_Lucene_Analysis_Token New token holding the stemmed
 *                                           (or original) text.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $po_srctoken)
{
    $vo_detector = new LanguageDetection();
    $vs_text = $po_srctoken->getTermText();
    $vs_language = $vo_detector->analyze($vs_text);

    // The stemmer is created up front, exactly as before, even though it is
    // only used when a language was detected.
    $vo_snowball = new SnoballStemmer();

    // No detected language means no stemming at all: keep the text as-is.
    $vs_result = $vs_language
        ? $vo_snowball->stem($vs_text, $vs_language)
        : $vs_text;

    // Build the token to return around the resulting text.
    $vo_result_token = new Zend_Search_Lucene_Analysis_Token(
        $vs_result,
        $po_srctoken->getStartOffset(),
        $po_srctoken->getEndOffset()
    );
    $vo_result_token->setPositionIncrement($po_srctoken->getPositionIncrement());

    return $vo_result_token;
}
/**
 * Lower-cases a token and drops it when it is too short or is a stop word.
 *
 * The text is lower-cased with strtolower(). The token is removed (null is
 * returned) when the lower-cased text is shorter than 2 bytes or is a key of
 * $this->_stopSet. Otherwise a new token with the lower-cased text and the
 * source token's offsets and position increment is returned.
 *
 * Design note carried over from the original comments: transliterating to
 * ASCII via iconv("utf-8", "us-ascii//TRANSLIT", ...) or remove_accents()
 * from uri.php was considered, but it maps "ñ" to "n", so only lower-casing
 * is applied.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to filter.
 * @return Zend_Search_Lucene_Analysis_Token|null New token, or null to remove it.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $term = strtolower($srcToken->getTermText());

    // Too short to be worth indexing (length is measured in bytes).
    if (strlen($term) < 2) {
        return null;
    }

    // Stop words are stored as the keys of the stop set.
    if (array_key_exists($term, $this->_stopSet)) {
        return null;
    }

    $keptToken = new Zend_Search_Lucene_Analysis_Token(
        $term,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $keptToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $keptToken;
}