/**
  * Extracts the body text of a Microsoft Word DOCX document.
  *
  * An empty string is returned when the zip extension is not loaded, when
  * the file does not exist, or when the DOCX loader throws.
  *
  * @param   String  $filename   Full filesystem path to the file to process.
  * @return  String  Text extracted from the file, or '' on failure.
  */
 public static function extract($filename)
 {
     // Both preconditions must hold before the loader is even attempted.
     $loadable = extension_loaded('zip') && file_exists($filename);
     if (!$loadable) {
         return '';
     }

     try {
         $document = Zend_Search_Lucene_Document_Docx::loadDocxFile($filename, true);
     } catch (Exception $loadFailure) {
         // Unreadable or malformed document: report "no text" instead of failing.
         return '';
     }

     return $document->body;
 }
예제 #2
0
파일: IndexDocx.php 프로젝트: uniqid/lucene
 /**
  * Indexes every regular file found in the input directory.
  *
  * CLI usage: php index.php docx index
  *
  * The index directory and the input directory are created when they do
  * not exist yet. Progress is echoed to standard output.
  *
  * @return void
  */
 public function index()
 {
     $indexDir = APP_PATH . '/' . self::INDEX_DIR;
     is_dir($indexDir) || mkdir($indexDir, 0777, true);
     $index = self::create($indexDir);

     $inputDir = APP_PATH . '/' . self::INPUT_DIR;
     is_dir($inputDir) || mkdir($inputDir, 0777, true);

     echo sprintf("Create index for %s \n\n", $inputDir);
     foreach (new DirectoryIterator($inputDir) as $fileInfo) {
         // Skip "." / ".." and anything that is not a regular file
         // (a sub-directory cannot be loaded as a DOCX archive).
         if ($fileInfo->isDot() || !$fileInfo->isFile()) {
             continue;
         }
         echo sprintf("File : %s \n", $fileInfo->getFilename());
         // getPathname() joins directory and file name with a separator,
         // so the path is valid whether or not INPUT_DIR ends with a slash.
         $doc = Zend_Search_Lucene_Document_Docx::loadDocxFile($fileInfo->getPathname());
         $index->addDocument($doc);
     }
     echo "\n###Done###\n";
 }
 /**
  * Loads the test.docx fixture and checks its title, description and body
  * fields, then checks that loading dummy.docx raises a document exception
  * whose message contains "is not readable".
  *
  * The test is skipped when the ZipArchive class is not available.
  */
 public function testDocx()
 {
     if (!class_exists('ZipArchive')) {
         $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
     }
     $fixtureDir = dirname(__FILE__) . '/_openXmlDocuments';

     $docxDocument = Zend_Search_Lucene_Document_Docx::loadDocxFile($fixtureDir . '/test.docx', true);
     $this->assertTrue($docxDocument instanceof Zend_Search_Lucene_Document_Docx);
     // assertEquals() takes the expected value first; keeping that order
     // makes PHPUnit's failure messages read correctly.
     $this->assertEquals('Test document', $docxDocument->getFieldValue('title'));
     $this->assertEquals('This is a test document which can be used to demonstrate something.', $docxDocument->getFieldValue('description'));
     $this->assertTrue($docxDocument->getFieldValue('body') != '');

     try {
         Zend_Search_Lucene_Document_Docx::loadDocxFile($fixtureDir . '/dummy.docx', true);
         $this->fail('File not readable exception is expected.');
     } catch (Zend_Search_Lucene_Document_Exception $e) {
         if (strpos($e->getMessage(), 'is not readable') === false) {
             // Passthrough exception
             throw $e;
         }
     }
 }
예제 #4
0
 /**
  * index a file
  *
  * Builds a Lucene document for the file (extracting content for local
  * files whose type is supported) and passes it to Lucene::updateFile().
  *
  * @author Jörn Dreyer <*****@*****.**>
  *
  * @param string      $path the path of the file
  * @param string|null $user user whose files view is used; when null the
  *                          current filesystem view and \OCP\User::getUser()
  *                          are used
  *
  * @return bool true when the document was handed to the index or the file
  *              no longer exists, false otherwise
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         // always answer with a bool, as documented
         return false;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     if (!$view->file_exists($path)) {
         Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
         return true;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (!isset($data['mimetype'])) {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
     $mimeType = $data['mimetype'];
     // initialize plain lucene document
     $doc = new \Zend_Search_Lucene_Document();
     // index content for local files only
     $localFile = $view->getLocalFile($path);
     if ($localFile) {
         //try to use special lucene document types
         if ('text/plain' === $mimeType) {
             $body = $view->file_get_contents($path);
             if ($body != '') {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
             }
         } elseif ('text/html' === $mimeType) {
             //TODO could be indexed, even if not local
             $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
         } elseif ('application/pdf' === $mimeType) {
             $doc = Pdf::loadPdf($view->file_get_contents($path));
         } else {
             // Office formats are detected by file extension instead of mimetype,
             // as the zend classes only understand docx and not doc files.
             // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
             $lowerName = strtolower($data['name']);
             $suffix5 = substr($lowerName, -5);
             $suffix4 = substr($lowerName, -4);
             if ($suffix5 === '.docx') {
                 $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
             } elseif ($suffix5 === '.xlsx') {
                 $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
             } elseif ($suffix5 === '.pptx') {
                 $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
             } elseif ($suffix4 === '.odt') {
                 $doc = Odt::loadOdtFile($localFile);
             } elseif ($suffix4 === '.ods') {
                 $doc = Ods::loadOdsFile($localFile);
             }
         }
     }
     // Store filecache id as unique id to lookup by when deleting
     $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
     // Store filename
     $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
     // Store document path to identify it in the search results
     $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));
     //self::extractMetadata($doc, $path, $view, $mimeType);
     Lucene::updateFile($doc, $path, $user);
     return true;
 }
예제 #5
0
/**
 * Loads the DOCX file at the given path into a Lucene document and adds a
 * "filename" text field holding the base name of that path.
 *
 * @param $path Path to the DOCX document.
 * @return Zend_Search_Lucene_Document
 */
function createDocXDocument($path)
{
    $document = Zend_Search_Lucene_Document_Docx::loadDocxFile($path);

    $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $document->addField($filenameField);

    return $document;
}
 /**
  * Builds the Lucene document for a node, adds it to the index and returns it.
  *
  * When content parsing is enabled and the node is not larger than the
  * PARSE_CONTENT_MAX_SIZE option, HTML, DOCX, PPTX and XLSX nodes are loaded
  * through the matching Zend document loader; any other node starts from an
  * empty document. Node fields, a serialized copy of the node metadata and
  * the configured meta fields are then attached. When the node exposes
  * user-scoped indexable keys and a user is logged in, a second document
  * with scope "user" is added to the index as well.
  *
  * @param AJXP_Node $ajxpNode Node to index.
  * @param Zend_Search_Lucene_Interface $index Index receiving the document(s).
  * @throws Exception When the loader did not yield a document.
  * @return Zend_Search_Lucene_Document The shared-scope document that was indexed.
  */
 public function createIndexedDocument($ajxpNode, &$index)
 {
     if (!empty($this->metaFields)) {
         $ajxpNode->loadNodeInfo(false, false, "all");
     } else {
         $ajxpNode->loadNodeInfo();
     }
     $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));
     $parseContent = $this->indexContent;
     if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
         $parseContent = false;
     }
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
         $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
     } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
     } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
         // This branch used to test for "docx" again, which made it unreachable
         // for presentations.
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
     } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
     } else {
         $doc = new Zend_Search_Lucene_Document();
     }
     if ($doc == null) {
         throw new Exception("Could not load document");
     }
     // The encoding belongs to the field factory, not to addField() (which
     // ignores extra arguments); this matches the private document below.
     $encoding = SystemTextEncoding::getEncoding();
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));
     // Fall back to the raw file extension when the node carries no mime value.
     $ajxpMime = $ajxpNode->ajxp_mime;
     if (empty($ajxpMime)) {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
     } else {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpMime));
     }
     // Store a cached copy of the metadata
     $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
     $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
     if (isset($ajxpNode->indexableMetaKeys["shared"])) {
         foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
             if ($ajxpNode->{$sharedField}) {
                 $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
             }
         }
     }
     foreach ($this->metaFields as $field) {
         if ($ajxpNode->{$field} != null) {
             $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
         }
     }
     // User-scoped metadata goes into a separate document tagged with the user id.
     if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
         $privateDoc = new Zend_Search_Lucene_Document();
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
         foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
             if ($ajxpNode->{$userField}) {
                 $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
             }
         }
         $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
         $index->addDocument($privateDoc);
     }
     if ($parseContent) {
         $body = $this->extractIndexableContent($ajxpNode);
         if (!empty($body)) {
             $doc->addField(Zend_Search_Lucene_Field::unStored("body", $body));
         }
     }
     $index->addDocument($doc);
     return $doc;
 }
 /**
  * Builds the Lucene document for a node, adds it to the index and returns it.
  *
  * When content parsing is enabled and the node is not larger than the
  * PARSE_CONTENT_MAX_SIZE option, HTML, DOCX, PPTX and XLSX nodes are loaded
  * through the matching Zend document loader; any other node starts from an
  * empty document. Node fields, a serialized copy of the node metadata and
  * the configured meta fields are attached. Body text is then added from the
  * raw file (PARSE_CONTENT_TXT extensions), from the command configured in
  * the UNOCONV option (doc, odt, xls, ods) or from the command configured in
  * the PDFTOTEXT option (pdf). When the node exposes user-scoped indexable
  * keys and a user is logged in, a second document with scope "user" is
  * added to the index as well.
  *
  * @param AJXP_Node $ajxpNode Node to index.
  * @param Zend_Search_Lucene_Interface $index Index receiving the document(s).
  * @throws Exception When the loader did not yield a document.
  * @return Zend_Search_Lucene_Document The shared-scope document that was indexed.
  */
 public function createIndexedDocument($ajxpNode, &$index)
 {
     $ajxpNode->loadNodeInfo();
     $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));
     $parseContent = $this->indexContent;
     if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
         $parseContent = false;
     }
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
         $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
     } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
     } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
         // This branch used to test for "docx" again, which made it unreachable
         // for presentations.
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
     } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
     } else {
         $doc = new Zend_Search_Lucene_Document();
     }
     if ($doc == null) {
         throw new Exception("Could not load document");
     }
     // The encoding belongs to the field factory, not to addField() (which
     // ignores extra arguments); this matches the private document below.
     $encoding = SystemTextEncoding::getEncoding();
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));
     // Fall back to the raw file extension when the node carries no mime value.
     $ajxpMime = $ajxpNode->ajxp_mime;
     if (empty($ajxpMime)) {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
     } else {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpMime));
     }
     // Store a cached copy of the metadata
     $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
     $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
     if (isset($ajxpNode->indexableMetaKeys["shared"])) {
         foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
             if ($ajxpNode->{$sharedField}) {
                 $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
             }
         }
     }
     foreach ($this->metaFields as $field) {
         if ($ajxpNode->{$field} != null) {
             $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
         }
     }
     // User-scoped metadata goes into a separate document tagged with the user id.
     if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
         $privateDoc = new Zend_Search_Lucene_Document();
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
         foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
             if ($ajxpNode->{$userField}) {
                 $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
             }
         }
         $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
         $index->addDocument($privateDoc);
     }
     // Plain-text extensions: index the raw file content.
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_TXT")))) {
         $doc->addField(Zend_Search_Lucene_Field::unStored("body", file_get_contents($ajxpNode->getUrl())));
     }
     // Legacy office formats: convert through the configured unoconv command.
     $unoconv = $this->getFilteredOption("UNOCONV");
     $pipe = false;
     if ($parseContent && !empty($unoconv) && in_array($ext, array("doc", "odt", "xls", "ods"))) {
         $targetExt = "txt";
         if (in_array($ext, array("xls", "ods"))) {
             $targetExt = "csv";
         } elseif (in_array($ext, array("odp", "ppt"))) {
             // NOTE(review): unreachable as written - the enclosing condition does
             // not admit "odp"/"ppt", so $pipe never becomes true. Left disabled on
             // purpose; confirm the intent before widening the extension list.
             $targetExt = "pdf";
             $pipe = true;
         }
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $unoconv = "HOME=" . AJXP_Utils::getAjxpTmpDir() . " " . $unoconv . " --stdout -f {$targetExt} " . escapeshellarg($realFile);
         if ($pipe) {
             $newTarget = str_replace(".{$ext}", ".pdf", $realFile);
             // The redirect target is derived from a file name: escape it like the input path.
             $unoconv .= " > " . escapeshellarg($newTarget);
             register_shutdown_function("unlink", $newTarget);
         }
         $output = array();
         exec($unoconv, $output, $return);
         if (!$pipe) {
             $out = implode("\n", $output);
             $asciiString = iconv('ISO-8859-1', 'ASCII//TRANSLIT//IGNORE', $out);
             // iconv() returns false on failure; do not index a bogus body then.
             if ($asciiString !== false) {
                 $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
             }
         } else {
             // Let the pdftotext step below pick up the converted file.
             $ext = "pdf";
         }
     }
     // PDF: extract text through the configured pdftotext command.
     $pdftotext = $this->getFilteredOption("PDFTOTEXT");
     if ($parseContent && !empty($pdftotext) && in_array($ext, array("pdf"))) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         if ($pipe && isset($newTarget) && is_file($newTarget)) {
             $realFile = $newTarget;
         }
         $cmd = $pdftotext . " " . escapeshellarg($realFile) . " -";
         $output = array();
         exec($cmd, $output, $return);
         $out = implode("\n", $output);
         $asciiString = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $out);
         // iconv() returns false on failure; do not index a bogus body then.
         if ($asciiString !== false) {
             $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
         }
     }
     $index->addDocument($doc);
     return $doc;
 }
예제 #8
0
 /**
  * Returns the text-extraction closure for a MIME type.
  *
  * Each closure takes a FileGallery_Wrapper and returns the text of the
  * wrapped file: the raw contents for plain text, or the UTF-8 "body" field
  * of the Lucene document loaded from the file for DOCX, PPTX and XLSX.
  *
  * @param string $type MIME type of the file.
  * @return Closure|null The handler, or null when the type has none.
  */
 function get_native_handler($type)
 {
     if ($type == 'text/plain') {
         return function (FileGallery_Wrapper $file) {
             return $file->getContents();
         };
     }

     if ($type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
         return function (FileGallery_Wrapper $file) {
             $loaded = Zend_Search_Lucene_Document_Docx::loadDocxFile($file->getReadableFile(), true);
             $bodyField = $loaded->getField('body');
             return $bodyField->getUtf8Value();
         };
     }

     if ($type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation') {
         return function (FileGallery_Wrapper $file) {
             $loaded = Zend_Search_Lucene_Document_Pptx::loadPptxFile($file->getReadableFile(), true);
             $bodyField = $loaded->getField('body');
             return $bodyField->getUtf8Value();
         };
     }

     if ($type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet') {
         return function (FileGallery_Wrapper $file) {
             $loaded = Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($file->getReadableFile(), true);
             $bodyField = $loaded->getField('body');
             return $bodyField->getUtf8Value();
         };
     }

     // No native handler for this type.
     return null;
 }
예제 #9
0
 /**
  * Detect whether the file at $ps_filepath is a Word, Excel or PowerPoint XML document.
  *
  * The file must carry the zip signature ("PK"). Its archive entries are
  * scanned in order and the first entry located under "word/", "xl/" or
  * "ppt/" decides the type. On a match the document is loaded and its
  * metadata and body text are stored in $this->opa_metadata and
  * $this->handle['content']; a loader failure is ignored and the type is
  * still reported.
  *
  * @param string $ps_filepath The path to the file to analyze
  * @param string $ps_sig The signature (first bytes) of the file
  * @return string WORD, EXCEL or PPT for a matching document, or boolean false otherwise
  */
 private function isWordExcelorPPTXMLdoc($ps_filepath, $ps_sig)
 {
     if (substr($ps_sig, 0, 2) != 'PK') {
         return false;
     }

     // archive prefix => array(type label, loader class, loader method, metadata fields)
     $va_formats = array(
         'word/' => array('WORD', 'Zend_Search_Lucene_Document_Docx', 'loadDocxFile', array('title', 'subject', 'creator', 'created', 'modified')),
         'xl/' => array('EXCEL', 'Zend_Search_Lucene_Document_Xlsx', 'loadXlsxFile', array('title', 'creator', 'created', 'modified')),
         'ppt/' => array('PPT', 'Zend_Search_Lucene_Document_Pptx', 'loadPptxFile', array('title', 'creator', 'created', 'modified')),
     );

     $o_unzip = new UnZipFile($ps_filepath);
     $va_list = $o_unzip->getFileList();
     if (!is_array($va_list)) {
         return false;
     }

     foreach ($va_list as $vs_file => $vn_size) {
         foreach ($va_formats as $vs_prefix => $va_format) {
             if (substr($vs_file, 0, strlen($vs_prefix)) != $vs_prefix) {
                 continue;
             }
             list($vs_type, $vs_class, $vs_method, $va_fields) = $va_format;
             try {
                 $o_doc = $vs_class::$vs_method($ps_filepath);
                 // Collect every value first so nothing is assigned if a lookup throws.
                 $va_values = array();
                 foreach ($va_fields as $vs_field) {
                     $va_values[$vs_field] = $o_doc->getFieldUtf8Value($vs_field);
                 }
                 $this->opa_metadata = array($vs_type => $va_values);
                 $this->handle['content'] = $o_doc->getFieldUtf8Value('body');
             } catch (Exception $e) {
                 // noop
             }
             return $vs_type;
         }
     }

     return false;
 }
예제 #10
0
 /**
  * Build a Lucene document from a file or from page content and add it to the index.
  *
  * For the file based types the document is loaded from $pathFile, the
  * 'Key' value is set to the file modification time and the 'Contents'
  * value to the document body. For 'page' the document is built from
  * $indexValues['Contents']. The contents are then lower-cased without
  * accents, de-duplicated, cleaned and stripped of the locale's stop words
  * before the document is added. Any other $type indexes nothing.
  *
  * Side effects visible in this method: the query parser default encoding
  * and the default analyzer are set, and setlocale(LC_ALL, ...) is called
  * without being restored.
  *
  * @param \Zend_Search_Lucene_Proxy $Index             The Lucene index object.
  * @param string                    $type            ['html', 'docx', 'xlsx', 'pptx', 'page']; the legacy spelling 'xsls' is still accepted for 'xlsx'
  * @param array                        $indexValues    Values for the document fields ('Key', 'Contents', ...)
  * @param string                    $locale
  * @param object                    $obj            Not used by this method.
  * @param string                     $pathFile        The path to the document (file based types).
  *
  * @return \Zend_Search_Lucene_Proxy
  * @access    public
  * @static
  * @author Etienne de Longeaux <*****@*****.**>
  * @since 2012-06-11
  */
 public static function index(\Zend_Search_Lucene_Proxy $Index, $type, $indexValues = null, $locale = '', $obj = null, $pathFile = '')
 {
     // ignore invalid characters for lucene text search
     \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('utf-8');
     \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
     self::$_index = $Index;
     self::$_doc = null;
     switch ($type) {
         case "html":
             self::$_doc = \Zend_Search_Lucene_Document_Html::loadHtmlFile($pathFile, false);
             $indexValues['Key'] = filemtime($pathFile);
             $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
             break;
         case "docx":
             self::$_doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($pathFile, false);
             $indexValues['Key'] = filemtime($pathFile);
             $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
             break;
         case "xlsx":
             // correct spelling, accepted alongside the historical "xsls" key
         case "xsls":
             self::$_doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($pathFile, false);
             $indexValues['Key'] = filemtime($pathFile);
             $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
             break;
         case "pptx":
             self::$_doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($pathFile, false);
             $indexValues['Key'] = filemtime($pathFile);
             $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
             break;
         case "page":
             // we create a new instance of Zend_Search_Lucene_Document
             self::$_doc = \Zend_Search_Lucene_Document_Html::loadHTML($indexValues['Contents'], false);
             $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
             break;
     }
     if (self::$_doc instanceof \Zend_Search_Lucene_Document) {
         // Remove all accents
         $indexValues['Contents'] = \Sfynx\ToolBundle\Util\PiStringManager::minusculesSansAccents($indexValues['Contents']);
         // Remove all duplicate words
         $indexValues['Contents'] = \Sfynx\ToolBundle\Util\PiStringManager::uniqueWord($indexValues['Contents']);
         // clean the content
         $indexValues['Contents'] = \Sfynx\ToolBundle\Util\PiStringManager::cleanContent($indexValues['Contents']);
         // Delete all stop words
         $stopWord = \Sfynx\ToolBundle\Util\PiStringManager::stopWord(strtolower($locale));
         if ($stopWord) {
             $wordsIndex = explode(' ', $indexValues['Contents']);
             $diff = array_diff($wordsIndex, $stopWord);
             $indexValues['Contents'] = implode(' ', $diff);
         }
         // If the document creation was successful then add it to our index.
         try {
             setlocale(LC_ALL, $locale);
             self::defaultAddFields($indexValues);
             self::addDocument();
         } catch (\Exception $e) {
             // Retry once with the fr_FR locale.
             setlocale(LC_ALL, 'fr_FR');
             self::defaultAddFields($indexValues);
             try {
                 self::addDocument();
             } catch (\Exception $retryFailure) {
                 // Best effort: a document that still cannot be added is skipped
                 // so the caller always gets the index object back.
             }
         }
     }
     // Return the Lucene index object.
     return self::$_index;
 }