/**
 * Checks that text fields added to a document can be read back through every
 * accessor (field-name list, magic property, getField()->value, getFieldValue())
 * and that a field declared as ISO-8859-1 is returned as UTF-8 by
 * getFieldUtf8Value().
 */
public function testFields()
{
    $bodyText = 'Document body, document body, document body...';

    $document = new Zend_Search_Lucene_Document();
    $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
    $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
    $document->addField(Zend_Search_Lucene_Field::Text('body', $bodyText));

    // Compare sorted name lists so that a missing field fails the test as well as
    // an unexpected one (a one-directional array_diff only catches extra names).
    $fieldNames = $document->getFieldNames();
    sort($fieldNames);
    $this->assertEquals(array('annotation', 'body', 'title'), $fieldNames);

    // PHPUnit's signature is assertEquals($expected, $actual): keeping that order
    // makes failure messages label the two values correctly.
    $this->assertEquals('Title', $document->title);
    $this->assertEquals('Annotation', $document->annotation);
    $this->assertEquals($bodyText, $document->body);

    $this->assertEquals('Title', $document->getField('title')->value);
    $this->assertEquals('Annotation', $document->getField('annotation')->value);
    $this->assertEquals($bodyText, $document->getField('body')->value);

    $this->assertEquals('Title', $document->getFieldValue('title'));
    $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
    $this->assertEquals($bodyText, $document->getFieldValue('body'));

    if (PHP_OS === 'AIX') {
        return; // tests below here not valid on AIX
    }

    // Build the ISO-8859-1 bytes from the UTF-8 literal so the test does not
    // depend on the encoding this source file is saved in.
    $wordsWithUmlautsUtf8     = 'Words with umlauts: åãü...';
    $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', $wordsWithUmlautsUtf8);

    $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));

    // The raw value stays in its declared encoding; only the Utf8 accessor converts.
    $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
    $this->assertEquals($wordsWithUmlautsUtf8, $document->getFieldUtf8Value('description'));
}
/**
 * Checks that text fields added to a document can be read back through every
 * accessor (field-name list, magic property, getField()->value, getFieldValue())
 * and that a field declared as ISO-8859-1 is returned as UTF-8 by
 * getFieldUtf8Value().
 */
public function testFields()
{
    $bodyText = 'Document body, document body, document body...';

    $document = new Zend_Search_Lucene_Document();
    $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
    $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
    $document->addField(Zend_Search_Lucene_Field::Text('body', $bodyText));

    // Compare sorted name lists so that a missing field fails the test as well as
    // an unexpected one (a one-directional array_diff only catches extra names).
    $fieldNames = $document->getFieldNames();
    sort($fieldNames);
    $this->assertEquals(array('annotation', 'body', 'title'), $fieldNames);

    // PHPUnit's signature is assertEquals($expected, $actual).
    $this->assertEquals('Title', $document->title);
    $this->assertEquals('Annotation', $document->annotation);
    $this->assertEquals($bodyText, $document->body);

    $this->assertEquals('Title', $document->getField('title')->value);
    $this->assertEquals('Annotation', $document->getField('annotation')->value);
    $this->assertEquals($bodyText, $document->getField('body')->value);

    $this->assertEquals('Title', $document->getFieldValue('title'));
    $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
    $this->assertEquals($bodyText, $document->getFieldValue('body'));

    // The characters a-ring, a-tilde and u-umlaut are written as explicit byte
    // escapes. Raw non-ASCII literals made this test pass only while the source
    // file itself was stored as ISO-8859-1; any re-encoding of the file corrupted
    // both the input and the expected value.
    $wordsWithUmlautsIso88591 = "Words with umlauts: \xE5\xE3\xFC...";             // ISO-8859-1 bytes
    $wordsWithUmlautsUtf8     = "Words with umlauts: \xC3\xA5\xC3\xA3\xC3\xBC..."; // same text in UTF-8

    $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));

    // The raw value stays in its declared encoding; only the Utf8 accessor converts.
    $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
    $this->assertEquals($wordsWithUmlautsUtf8, $document->getFieldUtf8Value('description'));
}
/**
 * Load a document of the given type, normalise its text and add it to the Lucene index.
 *
 * File-based types ('html', 'docx', 'xlsx', 'pptx') are loaded from $pathFile and get
 * the file's modification time as $indexValues['Key']. The 'page' type is built from
 * the HTML markup supplied in $indexValues['Contents']. For any other type no document
 * is added and the index is returned as received.
 *
 * In every handled case $indexValues['Contents'] is replaced by the document's UTF-8
 * 'body' text after it has been run through the PiStringManager helpers and had the
 * locale's stop words removed.
 *
 * @param \Zend_Search_Lucene_Proxy $Index       The Lucene index object.
 * @param string                    $type        One of 'html', 'docx', 'xlsx', 'pptx', 'page'.
 *                                               The historical spelling 'xsls' is still accepted for 'xlsx'.
 * @param array                     $indexValues Values handed to defaultAddFields(); 'Key' and 'Contents'
 *                                               are set or overwritten here.
 * @param string                    $locale      Locale used to look up stop words and passed to setlocale().
 * @param object                    $obj         Not used by this method.
 * @param string                    $pathFile    Path of the file to load (file-based types only).
 *
 * @return \Zend_Search_Lucene_Proxy
 * @access public
 * @static
 * @author Etienne de Longeaux <*****@*****.**>
 * @since 2012-06-11
 */
public static function index(\Zend_Search_Lucene_Proxy $Index, $type, $indexValues = null, $locale = '', $obj = null, $pathFile = '')
{
    // ignore invalid characters for lucene text search
    \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('utf-8');
    \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());

    self::$_index = $Index;
    self::$_doc   = null;

    // Only documents loaded from disk get their 'Key' from the file's mtime.
    $loadedFromFile = true;

    switch ($type) {
        case 'html':
            self::$_doc = \Zend_Search_Lucene_Document_Html::loadHtmlFile($pathFile, false);
            break;
        case 'docx':
            self::$_doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($pathFile, false);
            break;
        case 'xsls': // misspelling kept so existing callers keep working
        case 'xlsx':
            self::$_doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($pathFile, false);
            break;
        case 'pptx':
            self::$_doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($pathFile, false);
            break;
        case 'page':
            // The markup comes from the caller, not from a file.
            $loadedFromFile = false;
            self::$_doc = \Zend_Search_Lucene_Document_Html::loadHTML($indexValues['Contents'], false);
            break;
        default:
            $loadedFromFile = false;
            break;
    }

    // Unknown type: nothing was loaded, so there is nothing to index.
    if (!(self::$_doc instanceof \Zend_Search_Lucene_Document)) {
        return self::$_index;
    }

    if ($loadedFromFile) {
        $indexValues['Key'] = filemtime($pathFile);
    }

    // Normalise the body text: strip accents / lowercase, drop duplicate words,
    // then clean the content (as implemented by the PiStringManager helpers).
    $contents = self::$_doc->getFieldUtf8Value('body');
    $contents = \Sfynx\ToolBundle\Util\PiStringManager::minusculesSansAccents($contents);
    $contents = \Sfynx\ToolBundle\Util\PiStringManager::uniqueWord($contents);
    $contents = \Sfynx\ToolBundle\Util\PiStringManager::cleanContent($contents);

    // Remove the stop words of the requested locale, when a list is available.
    $stopWords = \Sfynx\ToolBundle\Util\PiStringManager::stopWord(strtolower($locale));
    if ($stopWords) {
        $contents = implode(' ', array_diff(explode(' ', $contents), $stopWords));
    }

    $indexValues['Contents'] = $contents;

    try {
        // NOTE: setlocale() is process-wide and is not restored afterwards.
        setlocale(LC_ALL, $locale);
        self::defaultAddFields($indexValues);
        self::addDocument();
    } catch (\Exception $firstFailure) {
        // Retry once with the 'fr_FR' locale.
        setlocale(LC_ALL, 'fr_FR');
        self::defaultAddFields($indexValues);
        try {
            self::addDocument();
        } catch (\Exception $retryFailure) {
            // Deliberately best-effort: a document that still cannot be added is
            // skipped so that the caller can carry on with the index.
        }
    }

    // Return the Lucene index object.
    return self::$_index;
}