Beispiel #1
0
 /**
  * Checks that text fields added to a document can be read back through
  * getFieldNames(), property access, getField() and getFieldValue(), and
  * that a field supplied as ISO-8859-1 is returned as UTF-8 by
  * getFieldUtf8Value().
  *
  * @return void
  */
 public function testFields()
 {
     $expectedValues = array(
         'title'      => 'Title',
         'annotation' => 'Annotation',
         'body'       => 'Document body, document body, document body...',
     );

     $document = new Zend_Search_Lucene_Document();
     foreach ($expectedValues as $name => $value) {
         $document->addField(Zend_Search_Lucene_Field::Text($name, $value));
     }

     // Compare the name sets in both directions so that neither an extra
     // nor a missing field name goes unnoticed.
     $expectedNames = array_keys($expectedValues);
     $actualNames   = $document->getFieldNames();
     $this->assertTrue(is_array($actualNames));
     $this->assertEquals(array(), array_diff($actualNames, $expectedNames), 'Unexpected extra field names');
     $this->assertEquals(array(), array_diff($expectedNames, $actualNames), 'Expected field names are missing');

     // PHPUnit's signature is assertEquals($expected, $actual): the expected
     // value goes first so failure messages label the two sides correctly.
     foreach ($expectedValues as $name => $value) {
         $this->assertEquals($value, $document->$name, "Property access for field '$name'");
         $this->assertEquals($value, $document->getField($name)->value, "getField() for field '$name'");
         $this->assertEquals($value, $document->getFieldValue($name), "getFieldValue() for field '$name'");
     }

     if (PHP_OS == 'AIX') {
         // tests below here not valid on AIX
         return;
     }

     $utf8Text = 'Words with umlauts: åãü...';
     $wordsWithUmlautsIso88591 = iconv('UTF-8', 'ISO-8859-1', $utf8Text);
     // iconv() returns false on failure; stop here with a clear message
     // instead of feeding false into the field.
     $this->assertTrue($wordsWithUmlautsIso88591 !== false, 'iconv() could not convert the fixture to ISO-8859-1');

     $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
     $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
     $this->assertEquals($utf8Text, $document->getFieldUtf8Value('description'));
 }
 /**
  * Checks that text fields added to a document can be read back through
  * getFieldNames(), property access, getField() and getFieldValue(), and
  * that a field supplied as ISO-8859-1 is returned as UTF-8 by
  * getFieldUtf8Value().
  *
  * The ISO-8859-1 input and the expected UTF-8 output are written as
  * explicit byte escapes so the test does not depend on the encoding this
  * source file happens to be saved in.
  *
  * @return void
  */
 public function testFields()
 {
     $bodyText = 'Document body, document body, document body...';

     $document = new Zend_Search_Lucene_Document();
     $document->addField(Zend_Search_Lucene_Field::Text('title', 'Title'));
     $document->addField(Zend_Search_Lucene_Field::Text('annotation', 'Annotation'));
     $document->addField(Zend_Search_Lucene_Field::Text('body', $bodyText));

     // Compare the name sets in both directions so that neither an extra
     // nor a missing field name goes unnoticed.
     $expectedNames = array('title', 'annotation', 'body');
     $actualNames   = $document->getFieldNames();
     $this->assertTrue(is_array($actualNames));
     $this->assertEquals(array(), array_diff($actualNames, $expectedNames));
     $this->assertEquals(array(), array_diff($expectedNames, $actualNames));

     // Expected value first, actual value second (PHPUnit argument order).
     $this->assertEquals('Title', $document->title);
     $this->assertEquals('Annotation', $document->annotation);
     $this->assertEquals($bodyText, $document->body);

     $this->assertEquals('Title', $document->getField('title')->value);
     $this->assertEquals('Annotation', $document->getField('annotation')->value);
     $this->assertEquals($bodyText, $document->getField('body')->value);

     $this->assertEquals('Title', $document->getFieldValue('title'));
     $this->assertEquals('Annotation', $document->getFieldValue('annotation'));
     $this->assertEquals($bodyText, $document->getFieldValue('body'));

     // "åãü" as raw ISO-8859-1 bytes (E5 E3 FC) ...
     $wordsWithUmlautsIso88591 = "Words with umlauts: \xE5\xE3\xFC...";
     // ... and the same characters as UTF-8 bytes (C3A5 C3A3 C3BC).
     $wordsWithUmlautsUtf8 = "Words with umlauts: \xC3\xA5\xC3\xA3\xC3\xBC...";

     $document->addField(Zend_Search_Lucene_Field::Text('description', $wordsWithUmlautsIso88591, 'ISO-8859-1'));
     $this->assertEquals($wordsWithUmlautsIso88591, $document->description);
     $this->assertEquals($wordsWithUmlautsUtf8, $document->getFieldUtf8Value('description'));
 }
Beispiel #3
0
 /**
  * Load a document of the given type, normalise its text and add it to the
  * Lucene index.
  *
  * For the file types ("html", "docx", "xlsx", "pptx") the document is loaded
  * from $pathFile, $indexValues['Key'] is set to the file's modification time
  * and $indexValues['Contents'] to the document's UTF-8 'body' field.
  * For "page" the HTML held in $indexValues['Contents'] is parsed and replaced
  * by its UTF-8 'body' field; 'Key' is left as supplied by the caller.
  * Any other $type loads nothing and the index is returned without a new
  * document.
  *
  * @param \Zend_Search_Lucene_Proxy $Index       The Lucene index object.
  * @param string                    $type        One of 'html', 'docx', 'xlsx', 'pptx', 'page'.
  *                                               The misspelling 'xsls' is still accepted as an
  *                                               alias of 'xlsx' for existing callers.
  * @param array|null                $indexValues Values handed to defaultAddFields(); 'Contents'
  *                                               must hold the HTML source when $type is 'page'.
  * @param string                    $locale      Locale used for the stop-word lookup and passed
  *                                               to setlocale().
  * @param object|null               $obj         Not used by this method.
  * @param string                    $pathFile    Path of the file to load (file types only).
  *
  * @return \Zend_Search_Lucene_Proxy The index object that was passed in.
  * @access    public
  * @static
  * @author Etienne de Longeaux <*****@*****.**>
  * @since 2012-06-11
  */
 public static function index(\Zend_Search_Lucene_Proxy $Index, $type, $indexValues = null, $locale = '', $obj = null, $pathFile = '')
 {
     // Use UTF-8 as the query parser's default encoding and install the
     // case-insensitive UTF-8 analyzer as the default analyzer.
     \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('utf-8');
     \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
     self::$_index = $Index;
     self::$_doc = null;

     // Only file-based types take their 'Key' from the file's mtime.
     $loadedFromFile = true;
     switch ($type) {
         case "html":
             self::$_doc = \Zend_Search_Lucene_Document_Html::loadHtmlFile($pathFile, false);
             break;
         case "docx":
             self::$_doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($pathFile, false);
             break;
         case "xlsx":
         case "xsls": // historical misspelling, kept so existing callers keep working
             self::$_doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($pathFile, false);
             break;
         case "pptx":
             self::$_doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($pathFile, false);
             break;
         case "page":
             // The HTML source comes from the index values, not from a file.
             $loadedFromFile = false;
             self::$_doc = \Zend_Search_Lucene_Document_Html::loadHTML($indexValues['Contents'], false);
             break;
     }

     // Unknown type (or nothing loaded): leave the index untouched.
     if (!(self::$_doc instanceof \Zend_Search_Lucene_Document)) {
         return self::$_index;
     }

     if ($loadedFromFile) {
         $indexValues['Key'] = filemtime($pathFile);
     }
     $contents = self::$_doc->getFieldUtf8Value('body');

     // Remove all accents
     $contents = \Sfynx\ToolBundle\Util\PiStringManager::minusculesSansAccents($contents);
     // Remove all duplicate words
     $contents = \Sfynx\ToolBundle\Util\PiStringManager::uniqueWord($contents);
     // Clean the content
     $contents = \Sfynx\ToolBundle\Util\PiStringManager::cleanContent($contents);
     // Delete all stop words of the given locale, when a list is available
     $stopWord = \Sfynx\ToolBundle\Util\PiStringManager::stopWord(strtolower($locale));
     if ($stopWord) {
         $contents = implode(' ', array_diff(explode(' ', $contents), $stopWord));
     }
     $indexValues['Contents'] = $contents;

     // NOTE(review): setlocale() changes the locale for the whole process and
     // it is not restored afterwards; confirm that callers rely on this before
     // changing it.
     try {
         setlocale(LC_ALL, $locale);
         self::defaultAddFields($indexValues);
         self::addDocument();
     } catch (\Exception $e) {
         // First attempt failed: retry once under the 'fr_FR' locale.
         setlocale(LC_ALL, 'fr_FR');
         self::defaultAddFields($indexValues);
         try {
             self::addDocument();
         } catch (\Exception $retryError) {
             // Deliberate best effort: a document that still cannot be added
             // is skipped and the index is returned as it is.
         }
     }

     // Return the Lucene index object.
     return self::$_index;
 }