/**
 * Extract Metadata from a PDF file
 *
 * Reads the last 16 KB of the file to locate the final "startxref" marker,
 * follows it to the trailer dictionaries, and from there collects:
 *  - the Document Information ("Info") dictionary entries, and
 *  - the XMP packet referenced by the Root dictionary's "Metadata" entry.
 *
 * XMP values, when found at the referenced location, are also merged into
 * the 'pdf' array (XMP keys overwrite Info keys of the same name).
 *
 * @since 2.10
 *
 * @param string full path to the desired file
 *
 * @return array ( 'xmp' => array( key => value ), 'pdf' => array( key => value ) ) for each metadata field, in string format
 */
public static function mla_extract_pdf_metadata( $file_name ) {
	$xmp = array();
	$metadata = array();
	self::$pdf_indirect_objects = NULL;
	$chunksize = 16384;

	if ( ! file_exists( $file_name ) ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	// Only the tail of the file is needed to find the startxref/%%EOF marker
	$filesize = filesize( $file_name );
	$file_offset = ( $chunksize < $filesize ) ? ( $filesize - $chunksize ) : 0;
	$tail = file_get_contents( $file_name, false, NULL, $file_offset );

	// For small files the tail already contains the header; avoid a second read
	if ( 0 == $file_offset ) {
		$header = substr( $tail, 0, 128 );
	} else {
		$header = file_get_contents( $file_name, false, NULL, 0, 128 );
	}

	// A header such as "%PDF-1.4" yields 'PDF-1.4' and '1.4'
	if ( '%PDF-' == substr( $header, 0, 5 ) ) {
		$metadata['PDF_Version'] = substr( $header, 1, 7 );
		$metadata['PDF_VersionNumber'] = substr( $header, 5, 3 );
	}

	/*
	 * Find the xref and (optional) trailer
	 */
	$match_count = preg_match_all( '/startxref[\\x00-\\x20]+(\\d+)[\\x00-\\x20]+\\%\\%EOF/', $tail, $matches, PREG_OFFSET_CAPTURE );
	if ( 0 == $match_count ) {
		/* translators: 1: ERROR tag 2: path and file */
		error_log( sprintf( _x( '%1$s: File "%2$s", startxref not found.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $file_name ), 0 );
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	// With PREG_OFFSET_CAPTURE each capture is array( text, offset ); use the last match in the tail
	$startxref = (int) $matches[1][ $match_count - 1 ][0];
	$trailer_dictionaries = self::_extract_pdf_trailer( $file_name, $startxref );

	if ( ! is_array( $trailer_dictionaries ) ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	/*
	 * Look for the Document Information dictionary; the first trailer with an Info entry wins
	 */
	$info_reference = NULL;
	foreach ( $trailer_dictionaries as $trailer_dictionary ) {
		if ( isset( $trailer_dictionary['Info'] ) ) {
			$info_reference = $trailer_dictionary['Info'];
			break;
		}
	}

	if ( isset( $info_reference ) ) {
		$info_object = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_reference['generation'] );

		/*
		 * Handle single or multiple Info instances
		 */
		$info_objects = array();
		if ( $info_object ) {
			if ( 1 == $info_object['count'] ) {
				$info_objects[] = $info_object;
			} else {
				for ( $index = 0; $index < $info_object['count']; $index++ ) {
					$info_objects[] = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_reference['generation'], $index );
				}
			}
		}

		// Later instances overwrite values from earlier ones that share a name
		foreach ( $info_objects as $info_object ) {
			$info_dictionary = self::_parse_pdf_dictionary( $info_object['content'], 0 );
			// '/length' is not a metadata field; presumably parser bookkeeping - confirm in _parse_pdf_dictionary
			unset( $info_dictionary['/length'] );

			foreach ( $info_dictionary as $name => $value ) {
				if ( 'string' == $value['type'] ) {
					$prefix = substr( $value['value'], 0, 2 );
					if ( 'D:' == $prefix ) {
						// "D:" marks a PDF date string
						$metadata[ $name ] = MLAData::mla_parse_pdf_date( $value['value'] );
					} elseif ( chr( 0xfe ) . chr( 0xff ) == $prefix ) {
						// 0xFE 0xFF is the UTF-16BE byte order mark
						$metadata[ $name ] = self::_parse_pdf_UTF16BE( $value['value'] );
					} else {
						$metadata[ $name ] = $value['value'];
					}
				} else {
					$metadata[ $name ] = $value['value'];
				}
			} // each info entry
		} // foreach Info object

		/*
		 * Remove spurious "Filter" dictionaries
		 */
		unset( $metadata['Filter'] );
		unset( $metadata['Length'] );
		unset( $metadata['Length1'] );
	} // found Info reference

	/*
	 * Look for XMP Metadata; the first trailer with a Root entry wins
	 */
	$root_reference = NULL;
	foreach ( $trailer_dictionaries as $trailer_dictionary ) {
		if ( isset( $trailer_dictionary['Root'] ) ) {
			$root_reference = $trailer_dictionary['Root'];
			break;
		}
	}

	if ( isset( $root_reference ) ) {
		$root_object = self::_find_pdf_indirect_dictionary( $file_name, $root_reference['object'], $root_reference['generation'] );

		if ( $root_object ) {
			$root_dictionary = self::_parse_pdf_dictionary( $root_object['content'], 0 );
			unset( $root_dictionary['/length'] );

			if ( isset( $root_dictionary['Metadata'] ) ) {
				$xmp_object = self::_find_pdf_indirect_dictionary( $file_name, $root_dictionary['Metadata']['object'], $root_dictionary['Metadata']['generation'] );

				/*
				 * The XMP packet is searched for starting just past the Metadata dictionary.
				 * If the object lookup failed, start from the top of the file instead of
				 * indexing into a non-array result.
				 */
				$xmp_offset = $xmp_object ? ( $xmp_object['start'] + $xmp_object['length'] ) : 0;
				$xmp = MLAData::mla_parse_xmp_metadata( $file_name, $xmp_offset );

				if ( is_array( $xmp ) ) {
					$metadata = array_merge( $metadata, $xmp );
				} else {
					// Recovery attempt: scan the whole file; recovered values are not merged into 'pdf'
					$xmp = MLAData::mla_parse_xmp_metadata( $file_name, 0 );
					if ( ! is_array( $xmp ) ) {
						// Honor the documented return shape when nothing could be recovered
						$xmp = array();
					}
				}
			} // found Metadata reference
		} // found Root object
	} // found Root reference

	return array( 'xmp' => $xmp, 'pdf' => $metadata );
}