/**
 * Asserts that a term is found (case-insensitively) in one field of the
 * "document split.docx" fixture.
 *
 * @dataProvider dataProvider
 *
 * @param string $term                text expected somewhere in the field value
 * @param string $field               name of the document field to read
 * @param string $descriptiveLocation label used only in skip/failure messages
 * @param mixed  $skipped             truthy value marks the case as skipped
 */
function testParseSplit($term, $field, $descriptiveLocation, $skipped)
{
    if ($skipped) {
        $this->markTestSkipped('TODO search ' . $descriptiveLocation . ' in ' . $field);
    }

    $fixturePath = __DIR__ . '/data/libreoffice/document split.docx';
    $fieldValue = Docx::loadDocxFile($fixturePath, true)->getFieldValue($field);

    // stristr() gives back a string on a hit, so is_string() doubles as "found"
    $termFound = is_string(stristr($fieldValue, $term));

    $failureMessage = $field . '/' . $descriptiveLocation . ' does not contain "' . $term . '" in ' . $fieldValue;
    $this->assertTrue($termFound, $failureMessage);
}
/**
 * Creates the Lucene index at the configured location and adds one document
 * per stored upload.
 *
 * For ".xlsx" and ".docx" uploads the document is produced by the matching
 * Lucene loader so that the file content is indexed; every other upload gets
 * an empty document. In all cases the label, owner name and upload id are
 * attached as fields before the document is added to the index.
 *
 * @return mixed the object returned by getResponse(), with its content set to "Index Ok"
 */
public function generateIndexAction()
{
    $searchIndexLocation = $this->getIndexLocation();
    $index = Lucene\Lucene::create($searchIndexLocation);

    $userTable = $this->getServiceLocator()->get('UserTable');
    $uploadTable = $this->getServiceLocator()->get('UploadTable');
    $allUploads = $uploadTable->fetchAll();

    foreach ($allUploads as $fileUpload) {
        $uploadOwner = $userTable->getById($fileUpload->getUserId());

        // Build the Lucene metadata fields
        $fileUploadId = Document\Field::unIndexed('upload_id', $fileUpload->getId());
        $label = Document\Field::Text('label', $fileUpload->getLabel());
        $owner = Document\Field::Text('owner', $uploadOwner->getName());

        $uploadPath = $this->getFileUploadLocation();
        $fileName = $fileUpload->getFilename();
        $filePath = $uploadPath . DIRECTORY_SEPARATOR . $fileName;

        // Pick the document type from the (case-sensitive) file name suffix
        if (substr($fileName, -5) === '.xlsx') {
            // Index an Excel spreadsheet
            $indexDoc = Lucene\Document\Xlsx::loadXlsxFile($filePath);
        } elseif (substr($fileName, -5) === '.docx') {
            // Index a Word document
            $indexDoc = Lucene\Document\Docx::loadDocxFile($filePath);
        } else {
            $indexDoc = new Lucene\Document();
        }

        // Attach the metadata to the document selected above. $indexDoc must
        // not be re-created here: a fresh Document would throw away the
        // content parsed from the spreadsheet / Word file.
        $indexDoc->addField($label);
        $indexDoc->addField($owner);
        $indexDoc->addField($fileUploadId);
        $index->addDocument($indexDoc);
    }

    $index->commit();

    $response = $this->getResponse();
    $response->setContent("Index Ok");

    return $response;
}
/**
 * Index a single file.
 *
 * Content is only extracted when the file lives on local storage; otherwise
 * just the metadata fields (fileId, path, mtime, size, mimetype) are stored
 * on a plain Lucene document.
 *
 * @param File $file   the file to be indexed
 * @param bool $commit forwarded unchanged to the index's updateFile() call
 *
 * @return bool always true in the code visible here
 * @throws NotIndexedException when a local file matches none of the supported mime types / extensions
 */
public function indexFile(File $file, $commit = true)
{
    // The indexing strategy is chosen from the mime type or the file extension
    $mimeType = $file->getMimeType();
    $extension = strtolower(pathinfo($file->getName(), PATHINFO_EXTENSION));

    // Start from a plain Lucene document; specialised loaders may replace it below
    $luceneDoc = new Document();

    // Content is indexed for local files only
    $storage = $file->getStorage();
    if ($storage->isLocal()) {
        $localPath = $storage->getLocalFile($file->getInternalPath());

        // Prefer the specialised Lucene document types where one exists
        if ($mimeType === 'text/html') {
            // TODO could be indexed, even if not local
            $luceneDoc = HTML::loadHTML($file->getContent());
        } elseif (substr($mimeType, 0, 5) === 'text/' || $mimeType === 'application/x-tex') {
            $text = $file->getContent();
            if ($text != '') {
                $luceneDoc->addField(Document\Field::UnStored('body', $text));
            }
        } elseif ($mimeType === 'application/pdf') {
            $luceneDoc = Pdf::loadPdf($file->getContent());
        } elseif ($extension === 'docx') {
            // the zend classes only understand docx and not doc files
            $luceneDoc = Document\Docx::loadDocxFile($localPath);
        } elseif ($extension === 'xlsx') {
            $luceneDoc = Document\Xlsx::loadXlsxFile($localPath);
        } elseif ($extension === 'pptx') {
            $luceneDoc = Document\Pptx::loadPptxFile($localPath);
        } elseif ($extension === 'odt') {
            $luceneDoc = Odt::loadOdtFile($localPath);
        } elseif ($extension === 'ods') {
            $luceneDoc = Ods::loadOdsFile($localPath);
        } else {
            throw new NotIndexedException();
        }
    }

    $fileId = $file->getId();

    // The filecache id is the unique key used to look the document up when deleting
    $luceneDoc->addField(Document\Field::Keyword('fileId', $fileId));

    // The path is kept so search results can point at the file
    $luceneDoc->addField(Document\Field::Text('path', $file->getPath(), 'UTF-8'));
    $luceneDoc->addField(Document\Field::unIndexed('mtime', $file->getMTime()));
    $luceneDoc->addField(Document\Field::unIndexed('size', $file->getSize()));
    $luceneDoc->addField(Document\Field::unIndexed('mimetype', $mimeType));

    $this->index->updateFile($luceneDoc, $file->getId(), $commit);

    return true;
}
/**
 * Handles the upload form submission.
 *
 * When no authenticated identity is stored, an error flash message is added
 * and a "not authorized" response is returned. Otherwise, on a valid POST,
 * the uploaded file is received into the upload directory, an Upload row is
 * saved for the current user, and a Lucene document carrying the label,
 * owner name and upload id (plus the parsed content for ".xlsx" / ".docx"
 * files) is added to the search index. Finally the request is redirected to
 * the uploads index.
 *
 * @return mixed the "not authorized" response, or the redirect to the 'uploads' route
 */
public function processAction()
{
    $userEmail = $this->getAuthService()->getStorage()->read();
    if (!$userEmail) {
        $this->flashMessenger()->addErrorMessage("not authorized");
        return $this->getResponse()->setContent("not authorized");
    }

    $request = $this->getRequest();
    $form = new UploadForm();
    $uploadFile = $this->params()->fromFiles('fileupload');

    if ($request->isPost()) {
        $form->setData($request->getPost());
        if ($form->isValid()) {
            // Read the upload directory from the module configuration
            $uploadPath = $this->getFileUploadLocation();

            // Store the uploaded file
            $adapter = new \Zend\File\Transfer\Adapter\Http();
            $adapter->setDestination($uploadPath);

            if ($adapter->receive($uploadFile['name'])) {
                $userTable = $this->getServiceLocator()->get('UserTable');
                $user = $userTable->getUserByEmail($userEmail);
                $upload = new \Users\Model\Upload();

                // File received successfully: persist its metadata
                $exchange_data = array();
                $exchange_data['label'] = $request->getPost()->get('label');
                $exchange_data['filename'] = $uploadFile['name'];
                $exchange_data['user_id'] = $user->getId();
                $upload->exchangeArray($exchange_data);

                $uploadTable = $this->getServiceLocator()->get('UploadTable');
                $uploadTable->save($upload);
                $upload->setId($uploadTable->getLastInsertValue());

                // Add the upload to the Lucene index.
                // NOTE(review): Lucene::create() is invoked on every upload; if it
                // starts a fresh index each time, earlier entries may be lost —
                // confirm whether an "open existing index" call is intended here.
                $searchIndexLocation = $this->getIndexLocation();
                $index = Lucene\Lucene::create($searchIndexLocation);

                // Build the Lucene metadata fields
                $fileUploadId = Document\Field::unIndexed('upload_id', $upload->getId());
                $label = Document\Field::Text('label', $upload->getLabel());
                $owner = Document\Field::Text('owner', $user->getName());

                $fileName = $upload->getFilename();
                $filePath = $uploadPath . DIRECTORY_SEPARATOR . $fileName;

                // Pick the document type from the (case-sensitive) file name suffix
                if (substr($fileName, -5) === '.xlsx') {
                    // Index an Excel spreadsheet
                    $indexDoc = Lucene\Document\Xlsx::loadXlsxFile($filePath);
                } elseif (substr($fileName, -5) === '.docx') {
                    // Index a Word document
                    $indexDoc = Lucene\Document\Docx::loadDocxFile($filePath);
                } else {
                    $indexDoc = new Lucene\Document();
                }

                // Attach the metadata to the document selected above. $indexDoc
                // must not be re-created here: a fresh Document would throw away
                // the content parsed from the spreadsheet / Word file.
                $indexDoc->addField($label);
                $indexDoc->addField($owner);
                $indexDoc->addField($fileUploadId);
                $index->addDocument($indexDoc);
                $index->commit();
            }
        }
    }

    return $this->redirect()->toRoute('uploads', array('action' => 'index'));
}
/**
 * Loads the test.docx fixture and checks its type, title, description and
 * non-empty body, then verifies that loading dummy.docx raises an
 * InvalidArgumentException whose message contains "is not readable".
 *
 * The test is skipped when the ZipArchive class is unavailable.
 */
public function testDocx()
{
    if (!class_exists('ZipArchive')) {
        $this->markTestSkipped('ZipArchive class (Zip extension) is not loaded');
    }

    $fixtureDir = __DIR__ . '/_openXmlDocuments';

    $parsed = Document\Docx::loadDocxFile($fixtureDir . '/test.docx', true);
    $this->assertTrue($parsed instanceof Document\Docx);

    $title = $parsed->getFieldValue('title');
    $this->assertEquals($title, 'Test document');

    $description = $parsed->getFieldValue('description');
    $this->assertEquals($description, 'This is a test document which can be used to demonstrate something.');

    $body = $parsed->getFieldValue('body');
    $this->assertTrue($body != '');

    try {
        Document\Docx::loadDocxFile($fixtureDir . '/dummy.docx', true);
        $this->fail('File not readable exception is expected.');
    } catch (Document\Exception\InvalidArgumentException $loadError) {
        $isNotReadableError = strpos($loadError->getMessage(), 'is not readable') !== false;
        if (!$isNotReadableError) {
            // Any other invalid-argument failure is unexpected: let it propagate
            throw $loadError;
        }
    }
}
/**
 * Creates the Lucene index at the configured location and adds one document
 * per upload row.
 *
 * ".xlsx" and ".docx" files are loaded through the matching Lucene document
 * loader; any other file gets an empty document. The label, owner name and
 * upload id are then attached as fields and the document is added to the
 * index, which is committed once after the loop.
 */
public function generateIndexAction()
{
    $index = Lucene\Lucene::create($this->getIndexLocation());

    $userTable = $this->getServiceLocator()->get('UserTable');
    $uploadTable = $this->getServiceLocator()->get('UploadTable');

    foreach ($uploadTable->fetchAll() as $fileUpload) {
        $uploadOwner = $userTable->getUser($fileUpload->user_id);

        // Metadata fields: upload id, label and owner name
        $idField = Document\Field::unIndexed('upload_id', $fileUpload->id);
        $labelField = Document\Field::Text('label', $fileUpload->label);
        $ownerField = Document\Field::Text('owner', $uploadOwner->name);

        // Case-sensitive "file name ends with $suffix" check
        $name = $fileUpload->filename;
        $endsWith = function ($suffix) use ($name) {
            return substr_compare($name, $suffix, strlen($name) - strlen($suffix), strlen($suffix)) === 0;
        };

        if ($endsWith(".xlsx")) {
            // Excel sheet: let the Xlsx loader build the document
            $uploadPath = $this->getFileUploadLocation();
            $indexDoc = Lucene\Document\Xlsx::loadXlsxFile($uploadPath . "/" . $fileUpload->filename);
        } elseif ($endsWith(".docx")) {
            // Word document: let the Docx loader build the document
            $uploadPath = $this->getFileUploadLocation();
            $indexDoc = Lucene\Document\Docx::loadDocxFile($uploadPath . "/" . $fileUpload->filename);
        } else {
            $indexDoc = new Lucene\Document();
        }

        $indexDoc->addField($labelField);
        $indexDoc->addField($ownerField);
        $indexDoc->addField($idField);
        $index->addDocument($indexDoc);
    }

    $index->commit();
}
/**
 * Return the body of a .docx file, or null when the file does not exist.
 *
 * The path is resolved through getFilePath() from the 'basePath' entry of
 * $attributeValue and from $value.
 *
 * @param $attributeValue array-like value providing a 'basePath' entry
 * @param $value          second argument handed to getFilePath()
 * @return null|string
 * @throws \Zend_Search_Lucene_Document_Exception
 */
private function readDocx($attributeValue, $value)
{
    $docxPath = $this->getFilePath($attributeValue['basePath'], $value);

    return file_exists($docxPath)
        ? Docx::loadDocxFile($docxPath)->body
        : null;
}