/**
 * Feed the reader an ExtendedXMP packet whose offset is too high.
 *
 * The high offset simulates a missing packet, which should cause the
 * reader to ignore the ExtendedXMP packet.
 *
 * @covers XMPReader::parseExtended
 */
public function testExtendedXMPMissingPacket() {
    $dataDir = __DIR__ . '/../../data/xmp/';
    $mainPacket = file_get_contents($dataDir . 'xmpExt.xmp');
    $extraPacket = file_get_contents($dataDir . 'xmpExt2.xmp');

    // Packet layout used here: md5 of xmpExt2.xmp, then the payload length
    // and the (deliberately too high) offset, each packed with 'N', then
    // the payload itself.
    $packet = implode('', array(
        '28C74E0AC2D796886759006FBE2E57B7',
        pack('N', strlen($extraPacket)),
        pack('N', 2048),
        $extraPacket,
    ));

    $reader = new XMPReader();
    $reader->parse($mainPacket);
    $reader->parseExtended($packet);

    $this->assertEquals(
        array('xmp-exif' => array('DigitalZoomRatio' => '0/10', 'Flash' => 9)),
        $reader->getResults()
    );
}
/**
 * Build the metadata array for a GIF image.
 *
 * GIFs don't really have native metadata, so this just merges the
 * image comment with whatever the XMP packet yields.
 *
 * @param string $filename full path to file
 * @return Array metadata array
 */
public static function GIF($filename) {
    $handler = new self();
    $extracted = GIFMetadataExtractor::getMetadata($filename);

    $comments = $extracted['comment'];
    if (count($comments) > 0) {
        $handler->addMetadata(array('GIFFileComment' => $comments), 'native');
    }

    $rawXmp = $extracted['xmp'];
    // Only attempt XMP parsing when xml_parser_create_ns() is available.
    if ($rawXmp !== '' && function_exists('xml_parser_create_ns')) {
        $reader = new XMPReader();
        $reader->parse($rawXmp);
        foreach ($reader->getResults() as $group => $properties) {
            $handler->addMetadata($properties, $group);
        }
    }

    // The raw comment and xmp entries are replaced by the merged form.
    unset($extracted['comment'], $extracted['xmp']);

    $extracted['metadata'] = $handler->getMetadataArray();
    $extracted['metadata']['_MW_GIF_VERSION'] = GIFMetadataExtractor::VERSION;

    return $extracted;
}
/**
 * Test for multi-section, hostile XML
 *
 * Feeds two fixture files to XMPReader::parse() in 10-byte chunks:
 * doctype-included.xmp must be rejected and yield no results, while
 * doctype-not-included.xmp must parse and yield the expected values.
 *
 * @covers checkParseSafety
 */
public function testCheckParseSafety() {
    $xmpPath = __DIR__ . '/../../data/xmp/';

    // Test for detection
    $file = fopen($xmpPath . 'doctype-included.xmp', 'rb');
    // Fail clearly on a missing fixture instead of looping on an invalid handle.
    $this->assertTrue(is_resource($file), 'Could not open doctype-included.xmp');
    $reader = new XMPReader();
    do {
        $chunk = fread($file, 10);
        // The second argument tells the reader whether this is the final chunk.
        $valid = $reader->parse($chunk, feof($file));
    } while (!feof($file));
    fclose($file);

    $this->assertFalse(
        $valid,
        'Check that doctype is detected in fragmented XML'
    );
    $this->assertEquals(
        array(),
        $reader->getResults(),
        'Check that doctype is detected in fragmented XML'
    );
    unset($reader);

    // Test for false positives
    $file = fopen($xmpPath . 'doctype-not-included.xmp', 'rb');
    $this->assertTrue(is_resource($file), 'Could not open doctype-not-included.xmp');
    $reader = new XMPReader();
    do {
        $chunk = fread($file, 10);
        $valid = $reader->parse($chunk, feof($file));
    } while (!feof($file));
    // The original never closed this second handle.
    fclose($file);

    $this->assertTrue(
        $valid,
        'Check for false-positive detecting doctype in fragmented XML'
    );
    // This message used to repeat the "doctype is detected" text from the
    // first section, which mislabelled failures of the false-positive case.
    $this->assertEquals(
        array('xmp-exif' => array('DigitalZoomRatio' => '0/10', 'Flash' => '9')),
        $reader->getResults(),
        'Check for false-positive detecting doctype in fragmented XML'
    );
}
/**
 * Postprocess the metadata (convert xmp into useful form, etc)
 *
 * This is used to generate the metadata table at the bottom
 * of the image description page.
 *
 * Recognised keys of $data are mapped onto metadata item names and
 * handed to a BitmapMetadataHandler as 'native' metadata; an 'xmp'
 * entry, if present and parseable, is merged in per result section.
 *
 * @param array $data Metadata
 * @return array Post-processed metadata: the input without its 'xmp'
 *   entry and with a 'mergedMetadata' entry added
 */
protected function postProcessDump(array $data) {
    $meta = new BitmapMetadataHandler();
    $items = array();
    foreach ($data as $key => $val) {
        switch ($key) {
            case 'Title':
                $items['ObjectName'] = $val;
                break;
            case 'Subject':
                $items['ImageDescription'] = $val;
                break;
            case 'Keywords':
                // Sometimes we have empty keywords. This seems
                // to be a product of how pdfinfo deals with keywords
                // with spaces in them. Filter such empty keywords.
                // Compare against '' explicitly: array_filter() without a
                // callback would also throw away a real keyword of "0".
                $keyList = array_filter(
                    explode(' ', $val),
                    function ($keyword) {
                        return $keyword !== '';
                    }
                );
                if (count($keyList) > 0) {
                    $items['Keywords'] = $keyList;
                }
                break;
            case 'Author':
                $items['Artist'] = $val;
                break;
            case 'Creator':
                // Program used to create file.
                // Different from program used to convert to pdf.
                $items['Software'] = $val;
                break;
            case 'Producer':
                // Conversion program
                $items['pdf-Producer'] = $val;
                break;
            case 'ModTime':
                $timestamp = wfTimestamp(TS_EXIF, $val);
                if ($timestamp) {
                    // 'if' is just paranoia
                    $items['DateTime'] = $timestamp;
                }
                break;
            case 'CreationTime':
                $timestamp = wfTimestamp(TS_EXIF, $val);
                if ($timestamp) {
                    $items['DateTimeDigitized'] = $timestamp;
                }
                break;
            // These last two (version and encryption) I was unsure
            // if we should include in the table, since they aren't
            // all that useful to editors. I leaned on the side
            // of including. However not including if file
            // is optimized/linearized since that is really useless
            // to an editor.
            case 'PDF version':
                $items['pdf-Version'] = $val;
                break;
            case 'Encrypted':
                // @todo: The value isn't i18n-ised. The appropriate
                // place to do that is in FormatMetadata.php;
                // should add a hook there.
                // For reference, if encrypted this field's value looks like:
                // "yes (print:yes copy:no change:no addNotes:no)"
                $items['pdf-Encrypted'] = $val;
                break;
            // Note 'pages' and 'Pages' are different keys (!)
            case 'pages':
                // A pdf document can have multiple sized pages in it.
                // (However 95% of the time, all pages are the same size)
                // get a list of all the unique page sizes in document.
                // This doesn't do anything with rotation as of yet,
                // mostly because I am unsure of what a good way to
                // present that information to the user would be.
                $pageSizes = array();
                foreach ($val as $page) {
                    if (isset($page['Page size'])) {
                        // Use the size as a key so duplicates collapse.
                        $pageSizes[$page['Page size']] = true;
                    }
                }

                $pageSizeArray = array_keys($pageSizes);
                if (count($pageSizeArray) > 0) {
                    $items['pdf-PageSize'] = $pageSizeArray;
                }
                break;
        }
    }
    $meta->addMetadata($items, 'native');

    if (isset($data['xmp']) && function_exists('xml_parser_create_ns')) {
        // func exists verifies that the xml extension required for XMPReader
        // is present (Almost always is present)
        // @todo: This only handles generic xmp properties. Would be improved
        // by handling pdf xmp properties (pdf and pdfx) via XMPInfo hook.
        $xmp = new XMPReader(LoggerFactory::getInstance('XMP'));
        $xmp->parse($data['xmp']);
        $xmpRes = $xmp->getResults();
        foreach ($xmpRes as $type => $xmpSection) {
            $meta->addMetadata($xmpSection, $type);
        }
    }
    // The raw packet is dropped; its parsed form lives in mergedMetadata.
    unset($data['xmp']);
    $data['mergedMetadata'] = $meta->getMetadataArray();

    return $data;
}
/**
 * Function to extract metadata segments of interest from jpeg files
 * based on GIFMetadataExtractor.
 *
 * we can almost use getimagesize to do this
 * but gis doesn't support having multiple app1 segments
 * and those can't extract xmp on files containing both exif and xmp data
 *
 * Marker bytes and segment signatures are written as \x escape
 * sequences. They are compared against raw bytes read from the file, so
 * they must be single bytes regardless of this source file's encoding,
 * and the NUL bytes that belong to the signatures must be present for
 * the literal lengths to agree with the substr() lengths.
 *
 * @param string $filename Name of jpeg file
 * @return array Array of interesting segments. Always has the list keys
 *   'XMP_ext', 'COM' and 'PSIR'; may also have 'XMP' (string) and
 *   'byteOrder' ('BE' or 'LE').
 * @throws MWException If given invalid file.
 */
static function segmentSplitter($filename) {
    $showXMP = XMPReader::isSupported();

    $segmentCount = 0;

    $segments = array(
        'XMP_ext' => array(),
        'COM' => array(),
        'PSIR' => array(),
    );

    if (!$filename) {
        throw new MWException("No filename specified for " . __METHOD__);
    }
    if (!file_exists($filename) || is_dir($filename)) {
        throw new MWException("Invalid file {$filename} passed to " . __METHOD__);
    }

    $fh = fopen($filename, "rb");

    if (!$fh) {
        throw new MWException("Could not open file {$filename}");
    }

    // A jpeg must start with the SOI marker, bytes FF D8.
    $buffer = fread($fh, 2);
    if ($buffer !== "\xFF\xD8") {
        throw new MWException("Not a jpeg, no SOI");
    }
    while (!feof($fh)) {
        $buffer = fread($fh, 1);
        $segmentCount++;
        if ($segmentCount > self::MAX_JPEG_SEGMENTS) {
            // this is just a sanity check
            throw new MWException('Too many jpeg segments. Aborting');
        }
        if ($buffer !== "\xFF") {
            throw new MWException("Error reading jpeg file marker. " .
                "Expected 0xFF but got " . bin2hex($buffer));
        }

        $buffer = fread($fh, 1);
        while ($buffer === "\xFF" && !feof($fh)) {
            // Skip through any 0xFF padding bytes.
            $buffer = fread($fh, 1);
        }
        if ($buffer === "\xFE") {
            // COM section -- file comment
            // First see if valid utf-8,
            // if not try to convert it to windows-1252.
            $com = $oldCom = trim(self::jpegExtractMarker($fh));
            UtfNormal::quickIsNFCVerify($com);
            // turns $com to valid utf-8.
            // thus if no change, its utf-8, otherwise its something else.
            if ($com !== $oldCom) {
                wfSuppressWarnings();
                $com = $oldCom = iconv('windows-1252', 'UTF-8//IGNORE', $oldCom);
                wfRestoreWarnings();
            }
            // Try it again, if its still not a valid string, then probably
            // binary junk or some really weird encoding, so don't extract.
            UtfNormal::quickIsNFCVerify($com);
            if ($com === $oldCom) {
                $segments["COM"][] = $oldCom;
            } else {
                wfDebug(__METHOD__ . " Ignoring JPEG comment as is garbage.\n");
            }
        } elseif ($buffer === "\xE1") {
            // APP1 section (Exif, XMP, and XMP extended)
            // only extract if XMP is enabled.
            $temp = self::jpegExtractMarker($fh);
            // check what type of app segment this is.
            // Each signature below includes its NUL byte(s); the substr()
            // length is the signature length including them.
            if (substr($temp, 0, 29) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                $segments["XMP"] = substr($temp, 29);
            } elseif (substr($temp, 0, 35) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP) {
                $segments["XMP_ext"][] = substr($temp, 35);
            } elseif (substr($temp, 0, 29) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                // Some images (especially flickr images) seem to have this.
                // I really have no idea what the deal is with them, but
                // whatever...
                // NOTE(review): the 29-byte length requires two extra bytes
                // in this literal; they are restored as NULs after "XMP" and
                // at the end -- confirm against a sample file.
                $segments["XMP"] = substr($temp, 29);
                wfDebug(__METHOD__ . ' Found XMP section with wrong app identifier '
                    . "Using anyways.\n");
            } elseif (substr($temp, 0, 6) === "Exif\x00\x00") {
                // Just need to find out what the byte order is.
                // because php's exif plugin sucks...
                // This is a II for little Endian, MM for big. Not a unicode BOM.
                $byteOrderMarker = substr($temp, 6, 2);
                if ($byteOrderMarker === 'MM') {
                    $segments['byteOrder'] = 'BE';
                } elseif ($byteOrderMarker === 'II') {
                    $segments['byteOrder'] = 'LE';
                } else {
                    wfDebug(__METHOD__ . " Invalid byte ordering?!\n");
                }
            }
        } elseif ($buffer === "\xED") {
            // APP13 - PSIR. IPTC and some photoshop stuff
            $temp = self::jpegExtractMarker($fh);
            if (substr($temp, 0, 14) === "Photoshop 3.0\x00") {
                $segments["PSIR"][] = $temp;
            }
        } elseif ($buffer === "\xD9" || $buffer === "\xDA") {
            // EOI - end of image or SOS - start of scan. either way we're
            // past any interesting segments
            return $segments;
        } else {
            // segment we don't care about, so skip
            // The 2-byte big-endian size read here is treated as including
            // the size field itself, hence the "- 2" when seeking past the
            // rest of the segment.
            $size = wfUnpack("nint", fread($fh, 2), 2);
            if ($size['int'] <= 2) {
                throw new MWException("invalid marker size in jpeg");
            }
            fseek($fh, $size['int'] - 2, SEEK_CUR);
        }
    }
    // shouldn't get here.
    throw new MWException("Reached end of jpeg file unexpectedly");
}
/**
 * Assemble the metadata array for a GIF image.
 *
 * GIFs don't really have native metadata, so the file comment and the
 * XMP results are merged together and returned under 'metadata'.
 *
 * @param string $filename Full path to file
 * @return array Metadata array
 */
public static function GIF($filename) {
    $collector = new self();
    $info = GIFMetadataExtractor::getMetadata($filename);

    $fileComments = $info['comment'];
    if (count($fileComments) > 0) {
        $collector->addMetadata(['GIFFileComment' => $fileComments], 'native');
    }

    $packet = $info['xmp'];
    // Parse the XMP packet only when there is one and the reader is usable.
    if ($packet !== '' && XMPReader::isSupported()) {
        $reader = new XMPReader(LoggerFactory::getInstance('XMP'));
        $reader->parse($packet);
        foreach ($reader->getResults() as $section => $values) {
            $collector->addMetadata($values, $section);
        }
    }

    // Drop the raw inputs; callers get the merged form instead.
    unset($info['comment'], $info['xmp']);

    $info['metadata'] = $collector->getMetadataArray();
    $info['metadata']['_MW_GIF_VERSION'] = GIFMetadataExtractor::VERSION;

    return $info;
}