/**
 * Runs the parent constructor, then registers a
 * Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 filter on this
 * instance through addFilter().
 */
public function __construct()
{
    parent::__construct();

    $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8();
    $this->addFilter($lowerCaseFilter);
}
/**
 * Zend Search Lucene makes it awfully hard to have multiple Lucene indexes
 * open at the same time. This method combats that by configuring all the
 * static variables for this instance.
 *
 * Steps, in order:
 *  - loads Zend through sfLuceneToolkit::loadZend();
 *  - notifies the 'lucene.configure.pre' event;
 *  - sets the query parser's default encoding from the 'encoding' parameter;
 *  - builds the analyzer named by the 'analyzer' parameter (case-insensitive:
 *    text, textnum, utf8 / utf-8, utf8num / utf-8num);
 *  - adds the lower-case, stop-word and short-word filters according to the
 *    'case_sensitive', 'mb_string', 'stop_words' and 'short_words' parameters;
 *  - installs the analyzer as the Zend default;
 *  - notifies the 'lucene.configure.post' event.
 *
 * @throws sfLuceneException if the 'analyzer' parameter names no known analyzer
 */
public function configure()
{
    sfLuceneToolkit::loadZend();

    $this->getEventDispatcher()->notify(new sfEvent($this, 'lucene.configure.pre'));

    Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding($this->getParameter('encoding'));

    $analyzerName = $this->getParameter('analyzer');

    switch (strtolower((string) $analyzerName)) {
        case 'text':
            $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text();
            break;

        case 'textnum':
            $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum();
            break;

        case 'utf8':
        case 'utf-8':
            $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8();
            break;

        case 'utf8num':
        case 'utf-8num':
            $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();
            break;

        default:
            // Report the value that was actually configured. The previous code
            // looked up a misspelled parameter name here, so the message never
            // contained the offending analyzer name.
            throw new sfLuceneException('Unknown analyzer: ' . $analyzerName);
    }

    if (!$this->getParameter('case_sensitive', false)) {
        $analyzer->addFilter(new sfLuceneLowerCaseFilter($this->getParameter('mb_string', false)));
    }

    // Only count() real arrays: count() on null or a scalar warns on
    // PHP 7.2+ and throws a TypeError on PHP 8.
    $stopWords = $this->getParameter('stop_words');
    if (is_array($stopWords) && count($stopWords) > 0) {
        $analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWords));
    }

    $shortWords = $this->getParameter('short_words');
    if ($shortWords > 0) {
        $analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords($shortWords));
    }

    Zend_Search_Lucene_Analysis_Analyzer::setDefault($analyzer);

    $this->getEventDispatcher()->notify(new sfEvent($this, 'lucene.configure.post'));
}
/**
 * Checks that the Utf8Num analyzer splits a Cyrillic UTF-8 string containing
 * digits into the expected tokens, verifying each token's term text,
 * start/end offsets and position increment.
 *
 * The test is reported as skipped when PCRE lacks Unicode support.
 */
public function testUtf8Num()
{
    // The "@" is deliberate: this preg_match() call is the capability probe,
    // and it is expected to fail when PCRE was built without Unicode support.
    if (@preg_match('/\\pL/u', 'a') != 1) {
        // Skip explicitly rather than returning, so the run does not count
        // an unexecuted test as a pass.
        $this->markTestSkipped('PCRE unicode support is turned off');
    }

    $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

    // UTF-8 text containing Cyrillic characters
    $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

    // Each row: term text, start offset, end offset.
    $expectedTokens = array(
        array('Слово1', 0, 6),
        array('Слово2', 7, 13),
        array('ДругоеСлово', 14, 25),
    );

    // PHPUnit's signature is assertEquals($expected, $actual); keeping that
    // order makes failure messages report the right value as "expected".
    $this->assertEquals(count($expectedTokens), count($tokenList));

    foreach ($expectedTokens as $index => $expected) {
        list($termText, $startOffset, $endOffset) = $expected;
        $token = $tokenList[$index];

        $this->assertEquals($termText, $token->getTermText(), "term text of token #$index");
        $this->assertEquals($startOffset, $token->getStartOffset(), "start offset of token #$index");
        $this->assertEquals($endOffset, $token->getEndOffset(), "end offset of token #$index");
        $this->assertEquals(1, $token->getPositionIncrement(), "position increment of token #$index");
    }
}
/**
 * Checks that the Utf8Num analyzer splits a Cyrillic UTF-8 string containing
 * digits into the expected tokens, verifying each token's term text,
 * start/end offsets and position increment.
 */
public function testUtf8Num()
{
    $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

    // UTF-8 text containing Cyrillic characters
    $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово');

    // Each row: term text, start offset, end offset.
    // NOTE(review): the last end offset is 24, although the term starts at 14
    // and is 11 characters long (which would give 25, following the pattern of
    // the first two tokens). Kept as-is because it may pin the behavior of the
    // analyzer implementation under test - confirm before changing.
    $expectedTokens = array(
        array('Слово1', 0, 6),
        array('Слово2', 7, 13),
        array('ДругоеСлово', 14, 24),
    );

    // PHPUnit's signature is assertEquals($expected, $actual); keeping that
    // order makes failure messages report the right value as "expected".
    $this->assertEquals(count($expectedTokens), count($tokenList));

    foreach ($expectedTokens as $index => $expected) {
        list($termText, $startOffset, $endOffset) = $expected;
        $token = $tokenList[$index];

        $this->assertEquals($termText, $token->getTermText(), "term text of token #$index");
        $this->assertEquals($startOffset, $token->getStartOffset(), "start offset of token #$index");
        $this->assertEquals($endOffset, $token->getEndOffset(), "end offset of token #$index");
        $this->assertEquals(1, $token->getPositionIncrement(), "position increment of token #$index");
    }
}