/**
 * Extract Metadata from a PDF file
 *
 * Reads the tail of the file to locate the last "startxref ... %%EOF" marker,
 * asks _extract_pdf_trailer() for the trailer dictionaries at that offset and
 * then collects:
 *  - 'PDF_Version' / 'PDF_VersionNumber' from the "%PDF-x.y" file header,
 *  - every entry of the document Info dictionary (dates and UTF-16BE strings
 *    are converted by _parse_pdf_date() / _parse_pdf_UTF16BE()),
 *  - whatever _parse_xmp_metadata() returns for the catalog's Metadata stream.
 *
 * Side effect: resets self::$pdf_indirect_objects to NULL on every call.
 *
 * @since 1.50
 *
 * @param string $file_name full path to the desired file
 *
 * @return array ( key => value ) for each metadata field, in string format;
 *               empty if the file does not exist, partial (header fields only)
 *               if no startxref marker is found in the last 16384 bytes
 */
private static function _extract_pdf_metadata($file_name) {
	$metadata = array();
	self::$pdf_indirect_objects = NULL;
	$chunksize = 16384;

	if (!file_exists($file_name)) {
		return $metadata;
	}

	/*
	 * Read at most the last $chunksize bytes; for small files this is the whole
	 * file, so the header can be taken from the same buffer.
	 */
	$filesize = filesize($file_name);
	$file_offset = $chunksize < $filesize ? $filesize - $chunksize : 0;
	$tail = file_get_contents($file_name, false, NULL, $file_offset);

	if (0 == $file_offset) {
		$header = substr($tail, 0, 128);
	} else {
		$header = file_get_contents($file_name, false, NULL, 0, 128);
	}

	if ('%PDF-' == substr($header, 0, 5)) {
		// For a "%PDF-1.4" header: 'PDF-1.4' and '1.4' respectively
		$metadata['PDF_Version'] = substr($header, 1, 7);
		$metadata['PDF_VersionNumber'] = substr($header, 5, 3);
	}

	/*
	 * Find the xref and (optional) trailer
	 */
	$match_count = preg_match_all('/startxref[\\x00-\\x20]+(\\d+)[\\x00-\\x20]+\\%\\%EOF/', $tail, $matches, PREG_OFFSET_CAPTURE);
	if (0 == $match_count) {
		// Log the file being processed (the original logged an undefined $path variable)
		error_log('ERROR: startxref not found ' . var_export($file_name, true), 0);
		return $metadata;
	}

	// Several startxref markers may be present in the tail; use the last one matched
	$startxref = (int) $matches[1][$match_count - 1][0];
	$trailer_dictionaries = self::_extract_pdf_trailer($file_name, $startxref);

	if (is_array($trailer_dictionaries)) {
		/*
		 * Look for the Info dictionary; the first trailer that has one wins
		 */
		$info_reference = NULL;
		foreach ($trailer_dictionaries as $trailer_dictionary) {
			if (isset($trailer_dictionary['Info'])) {
				$info_reference = $trailer_dictionary['Info'];
				break;
			}
		}

		if (isset($info_reference)) {
			$info_object = self::_find_pdf_indirect_dictionary($file_name, $info_reference['object'], $info_reference['generation']);

			if ($info_object) {
				$info_dictionary = self::_parse_pdf_dictionary($info_object['content'], 0);
				// NOTE(review): '/length' looks like a parser bookkeeping entry, not document metadata - confirm
				unset($info_dictionary['/length']);

				foreach ($info_dictionary as $name => $value) {
					if ('string' == $value['type']) {
						$prefix = substr($value['value'], 0, 2);
						if ('D:' == $prefix) {
							// "D:" prefix marks a PDF date string
							$metadata[$name] = self::_parse_pdf_date($value['value']);
						} elseif (chr(0xfe) . chr(0xff) == $prefix) {
							// 0xFE 0xFF byte order mark: UTF-16BE encoded text
							$metadata[$name] = self::_parse_pdf_UTF16BE($value['value']);
						} else {
							$metadata[$name] = $value['value'];
						}
					} else {
						$metadata[$name] = $value['value'];
					}
				} // each info entry
			} // found Info object
		} // found Info reference

		/*
		 * Look for XMP Metadata
		 */
		$root_reference = NULL;
		foreach ($trailer_dictionaries as $trailer_dictionary) {
			if (isset($trailer_dictionary['Root'])) {
				$root_reference = $trailer_dictionary['Root'];
				break;
			}
		}

		if (isset($root_reference)) {
			$root_object = self::_find_pdf_indirect_dictionary($file_name, $root_reference['object'], $root_reference['generation']);

			if ($root_object) {
				$root_dictionary = self::_parse_pdf_dictionary($root_object['content'], 0);
				unset($root_dictionary['/length']);

				if (isset($root_dictionary['Metadata'])) {
					$xmp_object = self::_find_pdf_indirect_dictionary($file_name, $root_dictionary['Metadata']['object'], $root_dictionary['Metadata']['generation']);

					/*
					 * The lookup can fail just like the Info/Root lookups above. The
					 * original code indexed the failed result anyway, which raised
					 * warnings and evaluated to offset 0; keep that effective offset
					 * but compute it without touching a non-array.
					 */
					if (is_array($xmp_object) && isset($xmp_object['start'], $xmp_object['length'])) {
						$xmp_offset = $xmp_object['start'] + $xmp_object['length'];
					} else {
						$xmp_offset = 0;
					}

					$xmp = self::_parse_xmp_metadata($file_name, $xmp_offset);

					if (is_array($xmp)) {
						// XMP values replace Info values that share the same string key
						$metadata = array_merge($metadata, $xmp);
					}
				} // found Metadata reference
			} // found Root object
		} // found Root reference
	} // found trailer_dictionaries

	return $metadata;
}