/**
 * Replaces the token's term text with its stemmed form.
 *
 * The stem comes from $this->stemmer->doStem(); start/end offsets and the
 * position increment are copied over from the source token unchanged.
 *
 * @see Zend_Search_Lucene_Analysis_TokenFilter
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to stem.
 * @return Zend_Search_Lucene_Analysis_Token New token carrying the stemmed text.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $stemmedToken = new Zend_Search_Lucene_Analysis_Token(
        $this->stemmer->doStem($srcToken->getTermText()),
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $stemmedToken->setPositionIncrement($srcToken->getPositionIncrement());

    return $stemmedToken;
}
/**
 * Drops tokens whose term text is a key of the stop set.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to check.
 * @return Zend_Search_Lucene_Analysis_Token|null The untouched source token, or
 *         null when its text is present as a key in $this->_stopSet.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $isStopWord = array_key_exists($srcToken->getTermText(), $this->_stopSet);

    return $isStopWord ? null : $srcToken;
}
/**
 * Drops tokens whose term text is shorter than the configured length.
 *
 * The comparison uses strlen(), i.e. it counts bytes rather than characters.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to check.
 * @return Zend_Search_Lucene_Analysis_Token|null The untouched source token, or
 *         null when its text is shorter than $this->length.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $byteLength = strlen($srcToken->getTermText());
    if ($byteLength < $this->length) {
        return null;
    }

    return $srcToken;
}
/**
 * Passes purely numeric tokens through and delegates everything else to the
 * parent filter.
 *
 * "Numeric" means ctype_digit() accepts the term text.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to normalize.
 * @return Zend_Search_Lucene_Analysis_Token The source token itself when it is
 *         numeric, otherwise whatever parent::normalize() returns.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    if (ctype_digit($srcToken->getTermText())) {
        return $srcToken;
    }

    return parent::normalize($srcToken);
}
/**
 * Runs the token text through search_invoke_preprocess() and returns a new
 * token built from the result.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to preprocess.
 * @return Zend_Search_Lucene_Analysis_Token New token with the source token's
 *         offsets and position increment.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    // The text is handed over as a variable; presumably the helper rewrites it
    // by reference (it invokes hook_search_preprocess()) - verify against its
    // definition.
    $text = $srcToken->getTermText();
    search_invoke_preprocess($text);

    $preprocessed = new Zend_Search_Lucene_Analysis_Token(
        $text,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $preprocessed->setPositionIncrement($srcToken->getPositionIncrement());

    return $preprocessed;
}
/**
 * Lower-cases the token text with strtolower().
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to lower-case.
 * @return Zend_Search_Lucene_Analysis_Token New token with the lower-cased text
 *         and the source token's offsets and position increment.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $lowered = strtolower($srcToken->getTermText());

    $result = new Zend_Search_Lucene_Analysis_Token(
        $lowered,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $result->setPositionIncrement($srcToken->getPositionIncrement());

    return $result;
}
/**
 * Lower-cases the token text, using mb_strtolower() when $this->mbString is
 * truthy and plain strtolower() otherwise.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to lower-case.
 * @return Zend_Search_Lucene_Analysis_Token New token with the lower-cased text
 *         and the source token's offsets and position increment.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $lowered = $this->mbString
        ? mb_strtolower($srcToken->getTermText(), 'utf8')
        : strtolower($srcToken->getTermText());

    $result = new Zend_Search_Lucene_Analysis_Token(
        $lowered,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $result->setPositionIncrement($srcToken->getPositionIncrement());

    return $result;
}
/**
 * Stems the token text according to the language detected for it.
 *
 * A fresh LanguageDetection and a fresh SnoballStemmer are created on every
 * call. When no language code is detected, the text is left unstemmed.
 *
 * @param Zend_Search_Lucene_Analysis_Token $po_srctoken Token to stem.
 * @return Zend_Search_Lucene_Analysis_Token New token with the (possibly
 *         stemmed) text and the source token's offsets and position increment.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $po_srctoken)
{
    $detector = new LanguageDetection();
    $termText = $po_srctoken->getTermText();
    $languageCode = $detector->analyze($termText);

    $stemmer = new SnoballStemmer();
    // Without a detected language there is nothing sensible to stem against.
    $stemmedText = $languageCode
        ? $stemmer->stem($termText, $languageCode)
        : $termText;

    $result = new Zend_Search_Lucene_Analysis_Token(
        $stemmedText,
        $po_srctoken->getStartOffset(),
        $po_srctoken->getEndOffset()
    );
    $result->setPositionIncrement($po_srctoken->getPositionIncrement());

    return $result;
}
/**
 * Lower-cases the token text as UTF-8, modifying the given token in place.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to lower-case; its
 *        term text is overwritten via setTermText().
 * @return Zend_Search_Lucene_Analysis_Token The same token instance.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $lowered = mb_strtolower($srcToken->getTermText(), 'UTF-8');
    $srcToken->setTermText($lowered);

    return $srcToken;
}
/**
 * Lower-cases the token text and drops it when it is too short or a stop word.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken Token to normalize.
 * @return Zend_Search_Lucene_Analysis_Token|null New lower-cased token with the
 *         source token's offsets and position increment, or null when the
 *         lower-cased text is under 2 bytes long or is a key of $this->_stopSet.
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    // Original author's note: transliterating with
    // iconv("utf-8", "us-ascii//TRANSLIT", ...) (or remove_accents() from
    // uri.php) was considered, but it folds "ñ" into "n", so only plain
    // lower-casing is applied.
    $term = strtolower($srcToken->getTermText());

    if (strlen($term) < 2) {
        return null;
    }
    if (array_key_exists($term, $this->_stopSet)) {
        return null;
    }

    $result = new Zend_Search_Lucene_Analysis_Token(
        $term,
        $srcToken->getStartOffset(),
        $srcToken->getEndOffset()
    );
    $result->setPositionIncrement($srcToken->getPositionIncrement());

    return $result;
}
<?php
/**
 * This file is part of the sfLucene package.
 * (c) Carl Vondrick <*****@*****.**>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

// Unit test: xfLuceneStemmerTokenFilter::normalize() must return a Zend token
// whose text was produced by the injected stemmer.

require dirname(__FILE__) . '/../../bootstrap/unit.php';
require 'util/xfLuceneZendManager.class.php';
require 'stemmer/xfLuceneStemmerTokenFilter.class.php';
require 'stemmer/xfLuceneStemmer.interface.php';
require 'stemmer/xfLuceneStemmerPorter.class.php';
require 'vendor/PorterStemmer/PorterStemmer.class.php';

// Two assertions are planned below.
$test = new lime_test(2, new lime_output_color());

$stemmer = new xfLuceneStemmerPorter();
$stemFilter = new xfLuceneStemmerTokenFilter($stemmer);

// Source token: the word "nationalize" at offsets 10..21, position increment 0.
$sourceToken = new Zend_Search_Lucene_Analysis_Token('nationalize', 10, 21);
$sourceToken->setPositionIncrement(0);

$normalized = $stemFilter->normalize($sourceToken);

$test->isa_ok(
    $normalized,
    'Zend_Search_Lucene_Analysis_Token',
    '->normalize() returns a Zend_Search_Lucene_Analysis_Token'
);
$test->is(
    $normalized->getTermText(),
    'nation',
    '->normalize() consults the stemmer'
);
/**
 * Tokenization stream API: returns the next token.
 *
 * Tokens queued on $this->token_stack by an earlier call are handed out first
 * (last queued, first returned). Otherwise the input chunks are walked two
 * entries at a time until a word yields at least one filtered term: the first
 * term is returned right away, any further terms are queued for later calls.
 *
 * @return Zend_Search_Lucene_Analysis_Token|null Null at the end of the stream.
 */
public function nextToken()
{
    // Is there anything to read from at all?
    if (!$this->num_chunks) {
        return null;
    }

    // Hand out tokens that are already queued first.
    if (count($this->token_stack) > 0) {
        return array_pop($this->token_stack);
    }

    while ($this->current_chunk < $this->num_chunks) {
        $chunkIndex = $this->current_chunk;
        $term = $this->input_chunks[$chunkIndex + 1];

        // Special case: site and section identifiers such as "sub123", "site5" -
        // the leading digits of the following chunk are glued onto the word.
        if ($this->ignore_numbers
            && in_array($term, array('site', 'sub'))
            && preg_match("/^(\\d+)/", $this->input_chunks[$chunkIndex + 2], $digits)
        ) {
            $term .= $digits[1];
        }

        $termLength = mb_strlen($term, 'UTF-8');
        $delimiterLength = mb_strlen($this->input_chunks[$chunkIndex], 'UTF-8');

        if ($chunkIndex == 1) {
            $startOffset = 0;
        } else {
            $startOffset = $this->char_position + $delimiterLength + 1;
        }
        $endOffset = $startOffset + $termLength;

        // Get ready for the next pass before any early exit below.
        $this->char_position = $endOffset;
        $this->current_chunk += 2;

        // Was the input a string without any significant characters?
        if (!$termLength) {
            continue;
        }

        // Apply the filters; they may yield zero, one or several terms.
        $terms = $this->apply_nc_filters($term);
        $termCount = count($terms);
        if ($termCount < 1) {
            continue;
        }

        // Queue every term after the first one (only runs when $termCount > 1).
        for ($i = 1; $i < $termCount; $i++) {
            $extraToken = new Zend_Search_Lucene_Analysis_Token($terms[$i], $startOffset, $endOffset);
            // "Lucene in Action" advises a position increment of 0 here. The
            // original author saw no practical difference (the ZSL sources carry
            // a "todo: Process $token->getPositionIncrement()" note), but it may
            // start to matter in the future.
            $extraToken->setPositionIncrement(0);
            $this->token_stack[] = $extraToken;
        }

        return new Zend_Search_Lucene_Analysis_Token($terms[0], $startOffset, $endOffset);
    }

    return null;
}