/**
  * Runs the token's term text through $this->stemmer->doStem() and wraps the
  * result in a new token with the same offsets and position increment.
  *
  * @see Zend_Search_Lucene_Analysis_TokenFilter
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken token whose text is stemmed
  * @return Zend_Search_Lucene_Analysis_Token newly built token; $srcToken is not modified here
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     $stemmedToken = new Zend_Search_Lucene_Analysis_Token(
         $this->stemmer->doStem($srcToken->getTermText()),
         $srcToken->getStartOffset(),
         $srcToken->getEndOffset()
     );
     // Carry the original position increment over to the replacement token.
     $stemmedToken->setPositionIncrement($srcToken->getPositionIncrement());

     return $stemmedToken;
 }
Пример #2
0
 /**
  * Removes the token when its term text is a key of $this->_stopSet;
  * otherwise passes it through untouched.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token|null the same token, or null to drop it
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
     $isStopWord = array_key_exists($srcToken->getTermText(), $this->_stopSet);

     return $isStopWord ? null : $srcToken;
 }
Пример #3
0
 /**
  * Removes the token when its term text is shorter than $this->length;
  * otherwise passes it through untouched.
  *
  * The length is measured with strlen(), i.e. in bytes.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token|null the same token, or null to drop it
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
     $byteLength = strlen($srcToken->getTermText());

     return $byteLength < $this->length ? null : $srcToken;
 }
Пример #4
0
 /**
  * Returns digit-only terms unchanged and delegates every other term to
  * the parent filter.
  *
  * "Digit-only" is decided by ctype_digit() on the term text.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token the source token, or whatever parent::normalize() returns
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     // Numeric terms bypass the parent's normalization entirely.
     if (ctype_digit($srcToken->getTermText())) {
         return $srcToken;
     }

     return parent::normalize($srcToken);
 }
Пример #5
0
 /**
  * Passes the token text to search_invoke_preprocess() and returns a new
  * token built from that text with the source offsets and position increment.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token newly built token
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     $text = $srcToken->getTermText();
     // Invokes hook_search_preprocess(). The return value is not used, so the
     // hook presumably rewrites $text by reference - confirm against its signature.
     search_invoke_preprocess($text);

     $result = new Zend_Search_Lucene_Analysis_Token(
         $text,
         $srcToken->getStartOffset(),
         $srcToken->getEndOffset()
     );
     $result->setPositionIncrement($srcToken->getPositionIncrement());

     return $result;
 }
Пример #6
0
    /**
     * Builds a new token whose text is the strtolower() form of the source
     * text, keeping the source offsets and position increment.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token newly built lowercase token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $lowered = strtolower($srcToken->getTermText());

        $result = new Zend_Search_Lucene_Analysis_Token($lowered, $srcToken->getStartOffset(), $srcToken->getEndOffset());
        $result->setPositionIncrement($srcToken->getPositionIncrement());

        return $result;
    }
 /**
  * Builds a lowercase copy of the token, keeping the source offsets and
  * position increment.
  *
  * When $this->mbString is truthy the text is lowered with mb_strtolower()
  * using the 'utf8' encoding name; otherwise plain strtolower() is used.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token newly built lowercase token
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     $text = $srcToken->getTermText();
     $lowered = $this->mbString ? mb_strtolower($text, 'utf8') : strtolower($text);

     $result = new Zend_Search_Lucene_Analysis_Token(
         $lowered,
         $srcToken->getStartOffset(),
         $srcToken->getEndOffset()
     );
     $result->setPositionIncrement($srcToken->getPositionIncrement());

     return $result;
 }
Пример #8
0
 /**
  * Detects the language of the token text and, when one is reported, stems
  * the text for that language; the result is returned as a new token with
  * the source offsets and position increment.
  *
  * A fresh LanguageDetection and SnoballStemmer instance is created on
  * every call.
  *
  * @param Zend_Search_Lucene_Analysis_Token $po_srctoken
  * @return Zend_Search_Lucene_Analysis_Token newly built token
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $po_srctoken)
 {
     $vo_detector = new LanguageDetection();
     $vs_text = $po_srctoken->getTermText();
     $vs_language = $vo_detector->analyze($vs_text);

     /* the stemmer is instantiated regardless of the detection outcome */
     $vo_stemmer = new SnoballStemmer();

     /* without a detected language the text is passed through unstemmed */
     $vs_result = $vs_language ? $vo_stemmer->stem($vs_text, $vs_language) : $vs_text;

     $vo_result_token = new Zend_Search_Lucene_Analysis_Token(
         $vs_result,
         $po_srctoken->getStartOffset(),
         $po_srctoken->getEndOffset()
     );
     $vo_result_token->setPositionIncrement($po_srctoken->getPositionIncrement());

     return $vo_result_token;
 }
Пример #9
0
 /**
  * Lowercases the token's term text in place with mb_strtolower() (UTF-8).
  *
  * The source token itself is modified and returned; no new token is built.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token the same token instance
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     $lowered = mb_strtolower($srcToken->getTermText(), 'UTF-8');
     $srcToken->setTermText($lowered);

     return $srcToken;
 }
Пример #10
0
 /**
  * Lowercases the term and drops it when it is shorter than two bytes or is
  * a key of $this->_stopSet; otherwise returns a new lowercase token with
  * the source offsets and position increment.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token|null new token, or null to drop it
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     // Transliterating to ASCII first (iconv "us-ascii//TRANSLIT", or
     // remove_accents() from uri.php) was considered but is not done here:
     // it would also turn ñ into n.
     $term = strtolower($srcToken->getTermText());

     $tooShort = strlen($term) < 2;
     if ($tooShort || array_key_exists($term, $this->_stopSet)) {
         return null;
     }

     $result = new Zend_Search_Lucene_Analysis_Token(
         $term,
         $srcToken->getStartOffset(),
         $srcToken->getEndOffset()
     );
     $result->setPositionIncrement($srcToken->getPositionIncrement());

     return $result;
 }
<?php

/**
 * This file is part of the sfLucene package.
 * (c) Carl Vondrick <*****@*****.**>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
require dirname(__FILE__) . '/../../bootstrap/unit.php';
require 'util/xfLuceneZendManager.class.php';
require 'stemmer/xfLuceneStemmerTokenFilter.class.php';
require 'stemmer/xfLuceneStemmer.interface.php';
require 'stemmer/xfLuceneStemmerPorter.class.php';
require 'vendor/PorterStemmer/PorterStemmer.class.php';
// Two assertions are planned for this run.
$t = new lime_test(2, new lime_output_color());

// Filter under test, backed by the Porter stemmer.
$filter = new xfLuceneStemmerTokenFilter(new xfLuceneStemmerPorter());

// Source token: the word 'nationalize' at offsets 10..21, position increment 0.
$input = new Zend_Search_Lucene_Analysis_Token('nationalize', 10, 21);
$input->setPositionIncrement(0);

$output = $filter->normalize($input);

$t->isa_ok($output, 'Zend_Search_Lucene_Analysis_Token', '->normalize() returns a Zend_Search_Lucene_Analysis_Token');
$t->is($output->getTermText(), 'nation', '->normalize() consults the stemmer');
Пример #12
0
 /**
  * Tokenization stream API
  * Get next token
  * Returns null at the end of stream
  *
  * Queued tokens in $this->token_stack are handed out first. Otherwise the
  * method walks $this->input_chunks two entries at a time, runs each word
  * through apply_nc_filters() and returns the first resulting form; any
  * further forms are queued with a position increment of 0 and the same
  * offsets.
  *
  * NOTE(review): $this->input_chunks presumably alternates delimiter and
  * word entries (delimiter at current_chunk, word at current_chunk + 1) -
  * confirm against the code that fills it.
  *
  * @return Zend_Search_Lucene_Analysis_Token|null
  */
 public function nextToken()
 {
     // is there anything to read data from?
     if (!$this->num_chunks) {
         return null;
     }
     // first hand out the tokens that are already queued
     // (array_pop takes from the end, so queued forms come out in reverse order)
     if (sizeof($this->token_stack)) {
         return array_pop($this->token_stack);
     }
     while ($this->num_chunks > $this->current_chunk) {
         $word = $this->input_chunks[$this->current_chunk + 1];
         // special case: site and section identifiers such as sub123, site5
         // (leading digits of the following chunk are appended to the word)
         if ($this->ignore_numbers && ($word == 'site' || $word == 'sub') && preg_match("/^(\\d+)/", $this->input_chunks[$this->current_chunk + 2], $matches)) {
             $word .= $matches[1];
         }
         // lengths and offsets are counted in UTF-8 characters, not bytes
         $word_length = mb_strlen($word, 'UTF-8');
         $delimiter_length = mb_strlen($this->input_chunks[$this->current_chunk], 'UTF-8');
         $start_position = $this->current_chunk == 1 ? 0 : $this->char_position + $delimiter_length + 1;
         $end_position = $start_position + $word_length;
         // prepare for the next iteration
         $this->char_position = $end_position;
         $this->current_chunk += 2;
         if (!$word_length) {
             continue;
         }
         // was the input a string without any significant characters?
         // (NOTE(review): this question presumably belongs to the empty-word check above)
         // apply the filters
         $processed = $this->apply_nc_filters($word);
         $count = sizeof($processed);
         if ($count > 0) {
             for ($i = 1; $i < $count; $i++) {
                 // i.e. if $count > 1
                 $token = new Zend_Search_Lucene_Analysis_Token($processed[$i], $start_position, $end_position);
                 // the book "Lucene in Action" advises calling $token->setPositionIncrement(0),
                 // although in the author's opinion it makes no difference (the ZSL sources are
                 // marked "todo: Process $token->getPositionIncrement()" - it may start working in the future)
                 $token->setPositionIncrement(0);
                 $this->token_stack[] = $token;
             }
             // the first form is returned right away; the rest wait in the queue
             return new Zend_Search_Lucene_Analysis_Token($processed[0], $start_position, $end_position);
         }
     }
     return null;
 }