/**
 * Runs the parent constructor, then attaches a
 * TokenFilter\LowerCaseUtf8 instance to this analyzer via addFilter().
 */
public function __construct()
{
    parent::__construct();

    $lowerCaseFilter = new TokenFilter\LowerCaseUtf8();
    $this->addFilter($lowerCaseFilter);
}
/**
 * Checks that the Utf8 analyzer tokenizes input supplied in a non-UTF-8
 * encoding (Windows-1251 here) and reports the expected term text,
 * start/end offsets and position increment for each token.
 *
 * The test is skipped on AIX.
 */
public function testEncoding()
{
    if (PHP_OS === 'AIX') {
        $this->markTestSkipped('Test not available on AIX');
    }

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
    $analyzer = new Utf8\Utf8();

    // The literal is UTF-8 text with Cyrillic symbols; it is converted to
    // Windows-1251 and the analyzer is told about that source encoding.
    $tokenList = $analyzer->tokenize(
        iconv('UTF-8', 'Windows-1251', 'Слово1 Слово2 ДругоеСлово'),
        'Windows-1251'
    );

    // PHPUnit convention: expected value first, actual value second, so that
    // failure messages report the two values the right way round.
    $this->assertCount(3, $tokenList);

    // First token: the digit following the word is not part of the term.
    $this->assertEquals('Слово', $tokenList[0]->getTermText());
    $this->assertEquals(0, $tokenList[0]->getStartOffset());
    $this->assertEquals(5, $tokenList[0]->getEndOffset());
    $this->assertEquals(1, $tokenList[0]->getPositionIncrement());

    // Second token.
    $this->assertEquals('Слово', $tokenList[1]->getTermText());
    $this->assertEquals(7, $tokenList[1]->getStartOffset());
    $this->assertEquals(12, $tokenList[1]->getEndOffset());
    $this->assertEquals(1, $tokenList[1]->getPositionIncrement());

    // Third token.
    $this->assertEquals('ДругоеСлово', $tokenList[2]->getTermText());
    $this->assertEquals(14, $tokenList[2]->getStartOffset());
    $this->assertEquals(25, $tokenList[2]->getEndOffset());
    $this->assertEquals(1, $tokenList[2]->getPositionIncrement());
}