Example #1
0
 /**
  * Object constructor.
  *
  * Parses the given PDF data, adds the extracted text as the "body" field and
  * adds one text field per metadata entry reported by the parser. The
  * "author" entry is indexed under the name "creator".
  *
  * @param string  $data         PDF content handed to the parser
  * @param boolean $storeContent When true the body is added with Field::Text,
  *                              otherwise with Field::UnStored
  * @throws NotIndexedException When parsing or field creation throws; the
  *                             original exception is attached as "previous"
  */
 private function __construct($data, $storeContent)
 {
     //TODO check PDF >1.5 metadata extraction
     $parser = new Parser();
     try {
         $pdf = $parser->parseContent($data);
         $body = $pdf->getText();
         // Store contents
         if ($storeContent) {
             $this->addField(Document\Field::Text('body', $body, 'UTF-8'));
         } else {
             $this->addField(Document\Field::UnStored('body', $body, 'UTF-8'));
         }
         $details = $pdf->getDetails();
         // Store meta data properties
         foreach ($details as $key => $value) {
             $key = strtolower($key);
             if ($key === 'author') {
                 $key = 'creator';
             }
             // A metadata entry may hold several values as an array; passing
             // the array on would index the literal string "Array".
             // NOTE(review): assumes Parser is smalot/pdfparser, whose
             // documented usage implodes array values - confirm.
             if (is_array($value)) {
                 $parts = array();
                 array_walk_recursive($value, function ($item) use (&$parts) {
                     if (is_scalar($item)) {
                         $parts[] = $item;
                     }
                 });
                 $value = implode(', ', $parts);
             }
             $this->addField(Document\Field::Text($key, $value, 'UTF-8'));
         }
     } catch (\Exception $ex) {
         // Any failure means the document cannot be indexed; keep the cause.
         throw new NotIndexedException(null, null, $ex);
     }
 }
 /**
  * Build an instance of this class from PDF data.
  *
  * The data is parsed with PdfParser\Parser and the parsed result is passed
  * to the class constructor.
  * NOTE(review): assumes the enclosing class constructor accepts the parsed
  * document as its only argument - confirm.
  *
  * @param string $pdf_data PDF content handed to the parser
  *
  * @return self
  */
 public static function analyze($pdf_data)
 {
     $pdfParser = new PdfParser\Parser();
     $parsedDocument = $pdfParser->parseContent($pdf_data);

     return new self($parsedDocument);
 }
Example #3
0
 /**
  * Handle action related to mime type detection.
  * The action can be "exclude" (reject the url) or "link" (build a custom
  * representation for content such as image, video, pdf, etc.).
  *
  * Every value interpolated into the generated markup is HTML-escaped,
  * because the url, the mime name, the body and the extracted PDF text are
  * not guaranteed to be markup-safe.
  *
  * @param array  $mimeInfo      From getMimeActionInfo() function
  * @param string $effective_url Current content url
  * @param string $body          Content from the response
  *
  * @return array|null Content information for a "link" action, null when no
  *                    action is set or the action is not handled here
  *
  * @throws \Exception When the action is "exclude"
  */
 private function handleMimeAction($mimeInfo, $effective_url, $body = '')
 {
     if (!isset($mimeInfo['action'])) {
         return null;
     }

     // Read optional keys defensively so a partial $mimeInfo does not emit
     // undefined-index warnings; missing keys keep their former null value.
     $name = isset($mimeInfo['name']) ? $mimeInfo['name'] : null;
     $mime = isset($mimeInfo['mime']) ? $mimeInfo['mime'] : null;
     $type = isset($mimeInfo['type']) ? $mimeInfo['type'] : null;

     // ENT_SUBSTITUTE keeps content with invalid UTF-8 sequences instead of
     // letting htmlspecialchars() collapse it to an empty string.
     $escape = function ($text) {
         return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
     };

     $infos = array(
         'status' => 200,
         'title' => $name,
         'language' => '',
         'html' => '',
         'url' => $effective_url,
         'content_type' => $mime,
         'open_graph' => array(),
     );

     switch ($mimeInfo['action']) {
         case 'exclude':
             throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effective_url));
         case 'link':
             $safeUrl = $escape($effective_url);
             $safeName = $escape($name);

             // Default representation: a plain download link.
             $infos['html'] = '<a href="' . $safeUrl . '">Download ' . $safeName . '</a>';

             if ($type === 'image') {
                 $infos['html'] = '<a href="' . $safeUrl . '"><img src="' . $safeUrl . '" alt="' . $safeName . '" /></a>';
             }

             if ($mime === 'application/pdf') {
                 $parser = new PdfParser();
                 $pdf = $parser->parseContent($body);
                 // Escape first, then turn line breaks into <br /> tags.
                 $infos['html'] = nl2br($escape($pdf->getText()));
                 // update title in case of details are present
                 $details = $pdf->getDetails();
                 if (isset($details['Title']) && '' !== trim($details['Title'])) {
                     $infos['title'] = $details['Title'];
                 }
             }

             if ($mime === 'text/plain') {
                 $infos['html'] = '<pre>' . $escape($body) . '</pre>';
             }

             return $infos;
     }

     return null;
 }
 /**
  * Extract content from resource.
  *
  * Reads the body of the resource's response, parses it with the configured
  * PDF parser and returns the parsed text after it has been passed through
  * stripBinaryContent().
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     // NOTE(review): getBody(true) presumably yields the body as a string - confirm against the HTTP client in use.
     $rawBody = $resource->getResponse()->getBody(true);
     $document = $this->pdfParser->parseContent($rawBody);
     $text = $document->getText();

     return $this->stripBinaryContent($text);
 }