App_Search_Helper_PdfParser PHP Code Examples

Example #1

0

Show file

File: Pdf.php Project: omusico/isle-web-framework

 /**
  * Builds the document from a PDF source string.
  *
  * Parses the PDF with \Zend_Pdf, adds the Title/Author/Subject/Keywords
  * properties that are set as unstored fields, then adds the text returned by
  * \App_Search_Helper_PdfParser::pdf2txt() as the 'body' field. Any exception
  * raised along the way is logged and swallowed, leaving the fields added so far.
  *
  * @param string  $data         PDF source handed to \Zend_Pdf::parse()
  * @param boolean $storeContent when truthy the body is added as a Text field,
  *                              otherwise as an UnStored field
  */
 private function __construct($data, $storeContent)
 {
     // PDF property name => Lucene field name, in the order the fields are added.
     $metaFields = array(
         'Title'    => 'title',
         'Author'   => 'author',
         'Subject'  => 'subject',
         'Keywords' => 'keywords',
     );

     try {
         $pdf = \Zend_Pdf::parse($data);

         // Store meta data properties
         foreach ($metaFields as $property => $fieldName) {
             if (isset($pdf->properties[$property])) {
                 $this->addField(\Zend_Search_Lucene_Field::UnStored($fieldName, $pdf->properties[$property]));
             }
         }

         //TODO handle PDF 1.6 metadata Zend_Pdf::getMetadata()

         // Content extraction works on the re-rendered PDF.
         $parser = new \App_Search_Helper_PdfParser();
         $text = $parser->pdf2txt($pdf->render());

         if ($text != '') {
             $bodyField = $storeContent
                 ? \Zend_Search_Lucene_Field::Text('body', $text, 'UTF-8')
                 : \Zend_Search_Lucene_Field::UnStored('body', $text, 'UTF-8');
             $this->addField($bodyField);
         }
     } catch (\Exception $e) {
         $message = $e->getMessage() . ' Trace:\\n' . $e->getTraceAsString();
         Util::writeLog('search_lucene', $message, Util::ERROR);
     }
 }

Example #2

0

Show file

File: Pdfs.php Project: philipnorton42/PDFSearch

 /**
  * Extract data from a PDF document and add this to the Lucene index.
  *
  * The Title, Subject, Author, Keywords and Date properties are copied as-is
  * when the PDF defines them. CreationDate is converted from the PDF date
  * format (e.g. D:20090731160351+01'00') to a Ymd string; the calendar date
  * is taken exactly as written in the document (the time and UTC offset are
  * ignored). A missing month or day defaults to 01, and an unparseable or
  * invalid date leaves CreationDate empty.
  *
  * @param string $pdfPath                       The path to the PDF document.
  * @param Zend_Search_Lucene_Proxy $luceneIndex The Lucene index object.
  * @return Zend_Search_Lucene_Proxy
  */
 public static function index($pdfPath, $luceneIndex)
 {
     // Load the PDF document.
     $pdf = Zend_Pdf::load($pdfPath);
     $key = md5($pdfPath);
     /**
      * Set up array to contain the document index data.
      * The Filename will be used to retrieve the document if it is found in
      * the search results.
      * The Key will be used to uniquely identify the document so we can
      * delete it from the search index.
      */
     $indexValues = array('Filename' => $pdfPath, 'Key' => $key, 'Title' => '', 'Author' => '', 'Subject' => '', 'Keywords' => '', 'Creator' => '', 'Producer' => '', 'CreationDate' => '', 'ModDate' => '', 'Contents' => '');
     // Copy the plain meta data items that the PDF actually defines.
     foreach (array('Title', 'Subject', 'Author', 'Keywords', 'Date') as $name) {
         if (array_key_exists($name, $pdf->properties)) {
             $indexValues[$name] = $pdf->properties[$name];
         }
     }
     // Convert the creation date from the PDF format of D:20090731160351+01'00'.
     // The components are read straight from the string instead of going
     // through mktime(): the old code passed a 7th (is_dst) argument, which
     // mktime() no longer accepts, and mis-read the offset position.
     if (isset($pdf->properties['CreationDate'])
         && preg_match('/^(?:D:)?(\d{4})(\d{2})?(\d{2})?/', (string) $pdf->properties['CreationDate'], $parts)
     ) {
         $year = (int) $parts[1];
         // Month and day are optional in a PDF date string; both default to 1.
         $month = isset($parts[2]) ? (int) $parts[2] : 1;
         $day = isset($parts[3]) ? (int) $parts[3] : 1;
         if (checkdate($month, $day, $year)) {
             $indexValues['CreationDate'] = sprintf('%04d%02d%02d', $year, $month, $day);
         }
     }
     /**
      * Parse the contents of the PDF document and pass the text to the
      * contents item in the $indexValues array.
      */
     $pdfParse = new App_Search_Helper_PdfParser();
     $indexValues['Contents'] = $pdfParse->pdf2txt($pdf->render());
     // Create the document using the values and add it to our index.
     // (`new` either returns an object or throws, so no false-check is needed.)
     $doc = new App_Search_Lucene_Document($indexValues);
     $luceneIndex->addDocument($doc);
     // Return the Lucene index object.
     return $luceneIndex;
 }

Example #3

0

Show file

File: indexer.php Project: omusico/isle-web-framework

 /**
  * extract the metadata from a file
  *
  * Runs getID3 on the local copy of the file; at present only an analysis
  * error reported by getID3 is used (it is logged as a warning).
  * For PDF files the Title/Author/Subject/Keywords properties and the
  * extracted text content are added to the document as unstored fields.
  * Directories are skipped entirely.
  *
  * hint (from the original author): use OC\Files\Filesystem::getFileInfo($path)
  * to get metadata for the last param
  *
  * @author Jörn Dreyer <*****@*****.**>
  *
  * @param Zend_Search_Lucene_Document $doc      to add the metadata to
  * @param string                      $path     path of the file to extract metadata from
  * @param \OC\Files\View              $view     view used to resolve and read $path
  * @param string                      $mimetype depending on the mimetype different extractions are performed
  *
  * @return void
  */
 private static function extractMetadata(\Zend_Search_Lucene_Document $doc, $path, \OC\Files\View $view, $mimetype)
 {
     $file = $view->getLocalFile($path);
     if (is_dir($file)) {
         // Don't lose time analyzing a directory for file-specific metadata
         return;
     }
     $getID3 = new \getID3();
     $getID3->encoding = 'UTF-8';
     $data = $getID3->analyze($file);
     // TODO index meta information from media files?

     // Start with "no content" so the check below is well-defined for
     // non-PDF files and for PDFs whose parsing fails part-way.
     $body = '';
     if ('application/pdf' === $mimetype) {
         try {
             $zendpdf = \Zend_Pdf::parse($view->file_get_contents($path));
             //we currently only display the filename, so we only index metadata here
             if (isset($zendpdf->properties['Title'])) {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('title', $zendpdf->properties['Title']));
             }
             if (isset($zendpdf->properties['Author'])) {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('author', $zendpdf->properties['Author']));
             }
             if (isset($zendpdf->properties['Subject'])) {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('subject', $zendpdf->properties['Subject']));
             }
             if (isset($zendpdf->properties['Keywords'])) {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('keywords', $zendpdf->properties['Keywords']));
             }
             //TODO handle PDF 1.6 metadata Zend_Pdf::getMetadata()
             //do the content extraction
             $pdfParse = new \App_Search_Helper_PdfParser();
             $body = $pdfParse->pdf2txt($zendpdf->render());
         } catch (\Exception $e) {
             // Fully qualified so the global Exception class is caught even
             // when this file declares a namespace.
             Util::writeLog('search_lucene', $e->getMessage() . ' Trace:\\n' . $e->getTraceAsString(), Util::ERROR);
         }
     }
     if ($body != '') {
         $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
     }
     if (isset($data['error'])) {
         Util::writeLog('search_lucene', 'failed to extract meta information for ' . $view->getAbsolutePath($path) . ': ' . $data['error']['0'], Util::WARN);
     }
 }

PHP App_Search_Helper_PdfParser Examples