/**
 * Verifies that the Utf8Num analyzer splits UTF-8 Cyrillic text containing
 * digits into three tokens and reports the expected term text, start/end
 * offsets and position increment for each of them.
 *
 * The expected offsets count characters, not bytes: 'Слово1' is 6 characters
 * long and is expected to end at offset 6.
 *
 * The test is marked as skipped (instead of silently passing) when PCRE was
 * built without unicode support.
 */
public function testUtf8Num()
{
    // '@' hides the warning preg_match() emits for the /u modifier when PCRE
    // has no unicode support; only the return value matters here.
    if (@preg_match('/\\pL/u', 'a') != 1) {
        // Report the test as skipped so the run does not count it as a pass.
        $this->markTestSkipped('PCRE unicode support is turned off');
    }

    $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

    // UTF-8 text with Cyrillic symbols
    $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово', 'UTF-8');

    // Expected tokens: term text, start offset, end offset.
    $expectedTokens = array(
        array('Слово1',       0,  6),
        array('Слово2',       7, 13),
        array('ДругоеСлово', 14, 25),
    );

    // assertEquals() takes the expected value first, then the actual one.
    $this->assertEquals(count($expectedTokens), count($tokenList));

    foreach ($expectedTokens as $index => $expected) {
        $token = $tokenList[$index];
        $label = 'token #' . $index;

        $this->assertEquals($expected[0], $token->getTermText(), $label . ' term text');
        $this->assertEquals($expected[1], $token->getStartOffset(), $label . ' start offset');
        $this->assertEquals($expected[2], $token->getEndOffset(), $label . ' end offset');
        $this->assertEquals(1, $token->getPositionIncrement(), $label . ' position increment');
    }
}
/**
 * Verifies that the Utf8Num analyzer splits UTF-8 Cyrillic text containing
 * digits into three tokens and reports the expected term text, start/end
 * offsets and position increment for each of them.
 *
 * The expected offsets count characters, not bytes: 'Слово1' is 6 characters
 * long and is expected to end at offset 6.
 *
 * The test is marked as skipped when PCRE was built without unicode support.
 */
public function testUtf8Num()
{
    // '@' hides the warning preg_match() emits for the /u modifier when PCRE
    // has no unicode support; only the return value matters here.
    if (@preg_match('/\\pL/u', 'a') != 1) {
        $this->markTestSkipped('PCRE unicode support is turned off');
    }

    $analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num();

    // UTF-8 text with Cyrillic symbols
    $tokenList = $analyzer->tokenize('Слово1 Слово2 ДругоеСлово');

    // Expected tokens: term text, start offset, end offset.
    // The last token starts at 14 and is 11 characters long, so it ends at 25.
    $expectedTokens = array(
        array('Слово1',       0,  6),
        array('Слово2',       7, 13),
        array('ДругоеСлово', 14, 25),
    );

    // assertEquals() takes the expected value first, then the actual one.
    $this->assertEquals(count($expectedTokens), count($tokenList));

    foreach ($expectedTokens as $index => $expected) {
        $token = $tokenList[$index];
        $label = 'token #' . $index;

        $this->assertEquals($expected[0], $token->getTermText(), $label . ' term text');
        $this->assertEquals($expected[1], $token->getStartOffset(), $label . ' start offset');
        $this->assertEquals($expected[2], $token->getEndOffset(), $label . ' end offset');
        $this->assertEquals(1, $token->getPositionIncrement(), $label . ' position increment');
    }
}