Ejemplo n.º 1
0
 /**
  * Extract Metadata from a PDF file
  *
  * Reads the last 16 KB of the file to locate the final "startxref" marker,
  * asks _extract_pdf_trailer() for the trailer dictionaries found from that
  * offset, then:
  *  - parses every instance of the Info dictionary referenced by the first
  *    trailer that has an "Info" entry, decoding "D:" dates and UTF-16BE
  *    (BOM 0xFE 0xFF) strings;
  *  - follows the "Metadata" entry of the document catalog referenced by the
  *    first trailer that has a "Root" entry and parses the XMP packet found
  *    there; when that yields an array it is also merged into the 'pdf'
  *    values. Otherwise XMP parsing is retried from the start of the file.
  *
  * The "PDF_Version" and "PDF_VersionNumber" entries are taken from the
  * "%PDF-n.n" file header when it is present.
  *
  * @since 2.10
  *
  * @param	string	full path to the desired file
  *
  * @return	array	( 'xmp' => array( key => value ), 'pdf' => array( key => value ) ) for each metadata field, in string format.
  *               Both elements are empty arrays when the file is missing or unreadable.
  */
 public static function mla_extract_pdf_metadata($file_name)
 {
     $xmp = array();
     $metadata = array();
     // Reset the class-level indirect object list before processing a new file
     // (presumably a per-file cache used by the lookup helpers; confirm against the class).
     self::$pdf_indirect_objects = NULL;
     $chunksize = 16384;
     if (!file_exists($file_name)) {
         return array('xmp' => $xmp, 'pdf' => $metadata);
     }
     /*
      * Read at most the last $chunksize bytes; the startxref marker sits at the end of the file
      */
     $filesize = filesize($file_name);
     $file_offset = $chunksize < $filesize ? $filesize - $chunksize : 0;
     $tail = file_get_contents($file_name, false, NULL, $file_offset);
     if (false === $tail) {
         // The file exists but cannot be read; there is nothing to parse
         return array('xmp' => $xmp, 'pdf' => $metadata);
     }
     /*
      * The header is the first 128 bytes; reuse the tail when it already starts at offset zero
      */
     if (0 == $file_offset) {
         $header = substr($tail, 0, 128);
     } else {
         $header = (string) file_get_contents($file_name, false, NULL, 0, 128);
     }
     if ('%PDF-' === substr($header, 0, 5)) {
         // e.g., "%PDF-1.4" yields "PDF-1.4" and "1.4"
         $metadata['PDF_Version'] = substr($header, 1, 7);
         $metadata['PDF_VersionNumber'] = substr($header, 5, 3);
     }
     /*
      * Find the xref and (optional) trailer
      */
     $match_count = preg_match_all('/startxref[\\x00-\\x20]+(\\d+)[\\x00-\\x20]+\\%\\%EOF/', $tail, $matches, PREG_OFFSET_CAPTURE);
     // Loose comparison on purpose: preg_match_all() returns false on failure
     if (0 == $match_count) {
         /* translators: 1: ERROR tag 2: path and file */
         error_log(sprintf(_x('%1$s: File "%2$s", startxref not found.', 'error_log', 'media-library-assistant'), __('ERROR', 'media-library-assistant'), $file_name), 0);
         return array('xmp' => $xmp, 'pdf' => $metadata);
     }
     // Use the last startxref in the tail; with PREG_OFFSET_CAPTURE element [0] is the matched text
     $startxref = (int) $matches[1][$match_count - 1][0];
     $trailer_dictionaries = self::_extract_pdf_trailer($file_name, $startxref);
     if (is_array($trailer_dictionaries)) {
         /*
          * Use the first trailer dictionary that has an Info reference
          */
         $info_reference = NULL;
         foreach ($trailer_dictionaries as $trailer_dictionary) {
             if (isset($trailer_dictionary['Info'])) {
                 $info_reference = $trailer_dictionary['Info'];
                 break;
             }
         }
         if (isset($info_reference)) {
             $info_object = self::_find_pdf_indirect_dictionary($file_name, $info_reference['object'], $info_reference['generation']);
             /*
              * Handle single or multiple Info instances
              */
             $info_objects = array();
             if ($info_object) {
                 if (1 == $info_object['count']) {
                     $info_objects[] = $info_object;
                 } else {
                     for ($index = 0; $index < $info_object['count']; $index++) {
                         $instance = self::_find_pdf_indirect_dictionary($file_name, $info_reference['object'], $info_reference['generation'], $index);
                         // Skip instances that cannot be located instead of dereferencing a non-array below
                         if ($instance) {
                             $info_objects[] = $instance;
                         }
                     }
                 }
             }
             /*
              * Later instances overwrite values with the same name from earlier instances
              */
             foreach ($info_objects as $info_object) {
                 $info_dictionary = self::_parse_pdf_dictionary($info_object['content'], 0);
                 unset($info_dictionary['/length']);
                 foreach ($info_dictionary as $name => $value) {
                     if ('string' === $value['type']) {
                         $prefix = substr($value['value'], 0, 2);
                         if ('D:' === $prefix) {
                             // PDF date string
                             $metadata[$name] = MLAData::mla_parse_pdf_date($value['value']);
                         } elseif (chr(0xfe) . chr(0xff) === $prefix) {
                             // UTF-16BE text, marked by its byte order mark
                             $metadata[$name] = self::_parse_pdf_UTF16BE($value['value']);
                         } else {
                             $metadata[$name] = $value['value'];
                         }
                     } else {
                         $metadata[$name] = $value['value'];
                     }
                 }
                 // each info entry
             }
             // foreach Info object
             /*
              * Remove spurious "Filter" dictionaries
              */
             unset($metadata['Filter']);
             unset($metadata['Length']);
             unset($metadata['Length1']);
         }
         // found Info reference
         /*
          * Look for XMP Metadata; use the first trailer dictionary that has a Root reference
          */
         $root_reference = NULL;
         foreach ($trailer_dictionaries as $trailer_dictionary) {
             if (isset($trailer_dictionary['Root'])) {
                 $root_reference = $trailer_dictionary['Root'];
                 break;
             }
         }
         if (isset($root_reference)) {
             $root_object = self::_find_pdf_indirect_dictionary($file_name, $root_reference['object'], $root_reference['generation']);
             if ($root_object) {
                 $root_dictionary = self::_parse_pdf_dictionary($root_object['content'], 0);
                 unset($root_dictionary['/length']);
                 if (isset($root_dictionary['Metadata'])) {
                     $xmp_object = self::_find_pdf_indirect_dictionary($file_name, $root_dictionary['Metadata']['object'], $root_dictionary['Metadata']['generation']);
                     /*
                      * Start XMP parsing just past the Metadata dictionary. When the object
                      * cannot be located, start at offset zero, which is what the unchecked
                      * arithmetic on a failed lookup evaluated to.
                      */
                     $xmp_offset = 0;
                     if (is_array($xmp_object) && isset($xmp_object['start'], $xmp_object['length'])) {
                         $xmp_offset = $xmp_object['start'] + $xmp_object['length'];
                     }
                     $xmp = MLAData::mla_parse_xmp_metadata($file_name, $xmp_offset);
                     if (is_array($xmp)) {
                         $metadata = array_merge($metadata, $xmp);
                     } else {
                         // Recovery attempt: scan for XMP from the start of the file
                         $xmp = MLAData::mla_parse_xmp_metadata($file_name, 0);
                         if (!is_array($xmp)) {
                             // Honor the documented return shape when no XMP can be found
                             $xmp = array();
                         }
                     }
                 }
                 // found Metadata reference
             }
             // found Root object
         }
         // found Root reference
     }
     // found trailer_dictionaries
     return array('xmp' => $xmp, 'pdf' => $metadata);
 }