/**
 * Build the document from raw PDF data.
 *
 * The PDF is parsed with Zend_Pdf. Each of the Title, Author, Subject and
 * Keywords properties that is set becomes an UnStored field, and the text
 * extracted from the rendered PDF becomes the "body" field. Any exception
 * raised on the way is logged and not rethrown.
 *
 * @param string  $data         raw PDF data handed to \Zend_Pdf::parse()
 * @param boolean $storeContent when true the body is added as a Text field,
 *                              otherwise as an UnStored field
 */
private function __construct($data, $storeContent)
{
    // PDF property name => Lucene field name, in the order the fields are added.
    $metaFields = array(
        'Title'    => 'title',
        'Author'   => 'author',
        'Subject'  => 'subject',
        'Keywords' => 'keywords',
    );

    try {
        $zendpdf = \Zend_Pdf::parse($data);

        // Store meta data properties
        $properties = $zendpdf->properties;
        foreach ($metaFields as $propertyName => $fieldName) {
            if (isset($properties[$propertyName])) {
                $this->addField(
                    \Zend_Search_Lucene_Field::UnStored($fieldName, $properties[$propertyName])
                );
            }
        }

        //TODO handle PDF 1.6 metadata Zend_Pdf::getMetadata()

        // Content extraction
        $parser = new \App_Search_Helper_PdfParser();
        $body = $parser->pdf2txt($zendpdf->render());

        if ($body == '') {
            // Nothing was extracted, so there is no body field to add.
            return;
        }

        $bodyField = $storeContent
            ? \Zend_Search_Lucene_Field::Text('body', $body, 'UTF-8')
            : \Zend_Search_Lucene_Field::UnStored('body', $body, 'UTF-8');
        $this->addField($bodyField);
    } catch (\Exception $e) {
        Util::writeLog(
            'search_lucene',
            $e->getMessage() . ' Trace:\\n' . $e->getTraceAsString(),
            Util::ERROR
        );
    }
}
/**
 * Extract data from a PDF document and add this to the Lucene index.
 *
 * The document properties (Title, Subject, Author, Keywords, CreationDate,
 * Date) and the text extracted from the rendered PDF are collected into an
 * array, wrapped in an App_Search_Lucene_Document and added to the index.
 *
 * @param string $pdfPath The path to the PDF document.
 * @param Zend_Search_Lucene_Proxy $luceneIndex The Lucene index object.
 * @return Zend_Search_Lucene_Proxy The same index object that was passed in.
 */
public static function index($pdfPath, $luceneIndex)
{
    // Load the PDF document.
    $pdf = Zend_Pdf::load($pdfPath);
    $key = md5($pdfPath);

    /**
     * Set up array to contain the document index data.
     * The Filename will be used to retrieve the document if it is found in
     * the search results.
     * The Key will be used to uniquely identify the document so we can
     * delete it from the search index.
     */
    $indexValues = array(
        'Filename'     => $pdfPath,
        'Key'          => $key,
        'Title'        => '',
        'Author'       => '',
        'Subject'      => '',
        'Keywords'     => '',
        'Creator'      => '',
        'Producer'     => '',
        'CreationDate' => '',
        'ModDate'      => '',
        'Contents'     => '',
    );

    // Go through each meta data item and add to index array.
    foreach ($pdf->properties as $meta => $metaValue) {
        // Cast so that a numeric key can never loosely match a case label.
        switch ((string) $meta) {
            case 'Title':
            case 'Subject':
            case 'Author':
            case 'Keywords':
            case 'Date':
                $indexValues[$meta] = $metaValue;
                break;

            case 'CreationDate':
                // PDF dates look like D:20090731160351+01'00'. Only the
                // calendar day is indexed (as Ymd), so the time of day and
                // the UTC offset are not needed. Month and day are optional
                // in the PDF date format and default to 01.
                // This replaces a mktime() call that passed a 7th (is_dst)
                // argument, which PHP no longer accepts.
                $parts = array();
                if (preg_match('/^(?:D:)?(\d{4})(\d{2})?(\d{2})?/', (string) $metaValue, $parts) === 1) {
                    $year  = (int) $parts[1];
                    $month = isset($parts[2]) ? (int) $parts[2] : 1;
                    $day   = isset($parts[3]) ? (int) $parts[3] : 1;

                    // Leave the field empty rather than index an impossible date.
                    if (checkdate($month, $day, $year)) {
                        $indexValues['CreationDate'] = sprintf('%04d%02d%02d', $year, $month, $day);
                    }
                }
                break;
        }
    }

    /**
     * Parse the contents of the PDF document and pass the text to the
     * contents item in the $indexValues array.
     */
    $pdfParse = new App_Search_Helper_PdfParser();
    $indexValues['Contents'] = $pdfParse->pdf2txt($pdf->render());

    // Create the document using the values and add it to our index.
    // `new` either returns an object or throws, so no false-check is needed.
    $doc = new App_Search_Lucene_Document($indexValues);
    $luceneIndex->addDocument($doc);

    // Return the Lucene index object.
    return $luceneIndex;
}
/**
 * Extract the metadata from a file.
 *
 * Runs getID3 over the local copy of the file and logs a warning when it
 * reports an error. For PDF files ('application/pdf') the Title, Author,
 * Subject and Keywords properties are added as UnStored fields, and the
 * text extracted from the rendered PDF is added as the UnStored "body" field.
 * Exceptions raised while handling the PDF are logged and not rethrown.
 *
 * Hint: use OC\Files\Filesystem::getFileInfo($path) to get metadata for the last param
 *
 * @author Jörn Dreyer <*****@*****.**>
 *
 * @param Zend_Search_Lucene_Document $doc to add the metadata to
 * @param string $path path of the file to extract metadata from
 * @param \OC\Files\View $view view used to resolve and read $path
 * @param string $mimetype depending on the mimetype different extractions are performed
 *
 * @return void
 */
private static function extractMetadata(\Zend_Search_Lucene_Document $doc, $path, \OC\Files\View $view, $mimetype)
{
    $file = $view->getLocalFile($path);
    if (is_dir($file)) {
        // Don't lose time analyzing a directory for file-specific metadata
        return;
    }

    $getID3 = new \getID3();
    $getID3->encoding = 'UTF-8';
    $data = $getID3->analyze($file);

    // TODO index meta information from media files?

    // Must be defined up front: it stays empty for non-PDF files and for
    // PDFs whose parsing fails before any text has been extracted.
    $body = '';

    if ('application/pdf' === $mimetype) {
        // PDF property name => Lucene field name, in the order the fields are added.
        $metaFields = array(
            'Title'    => 'title',
            'Author'   => 'author',
            'Subject'  => 'subject',
            'Keywords' => 'keywords',
        );

        try {
            $zendpdf = \Zend_Pdf::parse($view->file_get_contents($path));

            // we currently only display the filename, so we only index metadata here
            foreach ($metaFields as $propertyName => $fieldName) {
                if (isset($zendpdf->properties[$propertyName])) {
                    $doc->addField(
                        \Zend_Search_Lucene_Field::UnStored($fieldName, $zendpdf->properties[$propertyName])
                    );
                }
            }

            //TODO handle PDF 1.6 metadata Zend_Pdf::getMetadata()

            // do the content extraction
            $pdfParse = new \App_Search_Helper_PdfParser();
            $body = $pdfParse->pdf2txt($zendpdf->render());
        } catch (\Exception $e) {
            // Fully qualified: an unqualified Exception would resolve against
            // the current namespace and never match inside a namespaced file.
            Util::writeLog(
                'search_lucene',
                $e->getMessage() . ' Trace:\\n' . $e->getTraceAsString(),
                Util::ERROR
            );
        }
    }

    if ($body != '') {
        $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
    }

    if (isset($data['error'])) {
        Util::writeLog(
            'search_lucene',
            'failed to extract meta information for ' . $view->getAbsolutePath($path) . ': ' . $data['error']['0'],
            Util::WARN
        );
    }
}