/**
 * Convert a single IPTC value to UTF-8.
 *
 * With an explicit charset the value is run through iconv() and trimmed;
 * a failed conversion is logged to the 'iptc' debug channel and yields "".
 * Without a charset the value is returned unchanged (and untrimmed) when the
 * UTF-8 validator leaves it untouched; otherwise it is re-processed as
 * Windows-1252.
 *
 * @param string $data The IPTC string
 * @param string $charset The source charset, or a falsy value to auto-detect
 *
 * @return string
 */
private static function convIPTCHelper($data, $charset) {
    if (!$charset) {
        // No 1:90 tag: most of the time the data is ascii, latin1 or utf-8.
        // Let the validator clean up a copy; if the copy is unchanged the
        // input was already valid utf-8.
        $validated = $data;
        UtfNormal\Validator::quickIsNFCVerify($validated);
        if ($validated === $data) {
            return $data;
        }

        // Not valid utf-8, so pretend it is windows-1252.
        return self::convIPTCHelper($data, 'Windows-1252');
    }

    MediaWiki\suppressWarnings();
    $converted = iconv($charset, "UTF-8//IGNORE", $data);
    MediaWiki\restoreWarnings();

    if ($converted === false) {
        wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset {$charset} to utf-8");
        $converted = "";
    }

    return trim($converted);
}
/**
 * Do userComment tags and similar. See pg. 34 of exif standard.
 *
 * The first 8 bytes of the value identify the character set and the rest is
 * the text. The identifier is NUL-padded to 8 bytes, so it is written here
 * with explicit \x00 escapes; a bare "JIS" or "UNICODE" can never equal the
 * 8-byte prefix taken by substr().
 *
 * On success the property is replaced by the trimmed UTF-8 text. The property
 * is removed when the value is too short, cannot be converted, or is empty
 * after trimming.
 *
 * This has not been tested on any shift-JIS strings.
 *
 * @param string $prop Prop name
 */
private function charCodeString($prop) {
    if (!isset($this->mFilteredExifData[$prop])) {
        return;
    }

    $raw = $this->mFilteredExifData[$prop];

    if (strlen($raw) <= 8) {
        // invalid. Must be at least 9 bytes long.
        $this->debug($raw, __FUNCTION__, false);
        unset($this->mFilteredExifData[$prop]);

        return;
    }

    $charCode = substr($raw, 0, 8);
    $val = substr($raw, 8);

    switch ($charCode) {
        case "JIS\x00\x00\x00\x00\x00":
            // JIS
            $charset = "Shift-JIS";
            break;
        case "UNICODE\x00":
            // byteOrder is appended to pick the UTF-16 variant
            // (presumably 'BE' or 'LE' - confirm where it is assigned).
            $charset = "UTF-16" . $this->byteOrder;
            break;
        default:
            // ascii or undefined.
            $charset = "";
            break;
    }

    if ($charset) {
        MediaWiki\suppressWarnings();
        $val = iconv($charset, 'UTF-8//IGNORE', $val);
        MediaWiki\restoreWarnings();
    } else {
        // if valid utf-8, assume that, otherwise assume windows-1252
        $valCopy = $val;
        UtfNormal\Validator::quickIsNFCVerify($valCopy); // validates $valCopy.
        if ($valCopy !== $val) {
            MediaWiki\suppressWarnings();
            $val = iconv('Windows-1252', 'UTF-8//IGNORE', $val);
            MediaWiki\restoreWarnings();
        }
    }

    if ($val === false) {
        // iconv() could not convert the value; report that explicitly rather
        // than letting trim(false) disguise it as a whitespace-only value.
        $this->debug($raw, __FUNCTION__, "{$prop}: Charset conversion failed");
        unset($this->mFilteredExifData[$prop]);

        return;
    }

    // trim and check to make sure not only whitespace.
    $val = trim($val);
    if (strlen($val) === 0) {
        // only whitespace.
        $this->debug($raw, __FUNCTION__, "{$prop}: Is only whitespace");
        unset($this->mFilteredExifData[$prop]);

        return;
    }

    // all's good.
    $this->mFilteredExifData[$prop] = $val;
}
/**
 * Function to extract metadata segments of interest from jpeg files
 * based on GIFMetadataExtractor.
 *
 * We can almost use getimagesize to do this, but it doesn't support having
 * multiple APP1 segments, so it can't extract XMP on files containing both
 * Exif and XMP data.
 *
 * The result always contains the list-valued keys 'XMP_ext', 'COM' and
 * 'PSIR'. 'XMP' (string) and 'byteOrder' ('BE' or 'LE') are only set when
 * the corresponding data is found. Scanning stops at the first EOI or SOS
 * marker. The file handle is closed on every exit path.
 *
 * All marker bytes and signatures are written as \x escapes so that they do
 * not depend on the encoding this source file is saved in.
 *
 * @param string $filename Name of jpeg file
 * @return array Array of interesting segments.
 * @throws MWException If given invalid file.
 */
static function segmentSplitter($filename) {
    $showXMP = XMPReader::isSupported();

    $segmentCount = 0;

    $segments = ['XMP_ext' => [], 'COM' => [], 'PSIR' => []];

    if (!$filename) {
        throw new MWException("No filename specified for " . __METHOD__);
    }
    if (!file_exists($filename) || is_dir($filename)) {
        throw new MWException("Invalid file {$filename} passed to " . __METHOD__);
    }

    $fh = fopen($filename, "rb");

    if (!$fh) {
        throw new MWException("Could not open file {$filename}");
    }

    try {
        // SOI marker: 0xFF 0xD8
        $buffer = fread($fh, 2);
        if ($buffer !== "\xFF\xD8") {
            throw new MWException("Not a jpeg, no SOI");
        }

        while (!feof($fh)) {
            $buffer = fread($fh, 1);
            $segmentCount++;
            if ($segmentCount > self::MAX_JPEG_SEGMENTS) {
                // this is just a sanity check
                throw new MWException('Too many jpeg segments. Aborting');
            }

            while ($buffer !== "\xFF" && $buffer !== false && !feof($fh)) {
                // In theory JPEG files are not allowed to contain anything between the sections,
                // but in practice they sometimes do. It's customary to ignore the garbage data.
                // The EOF check stops a truncated file from spinning here forever.
                $buffer = fread($fh, 1);
            }

            $buffer = fread($fh, 1);
            while ($buffer === "\xFF" && !feof($fh)) {
                // Skip through any 0xFF padding bytes.
                $buffer = fread($fh, 1);
            }

            if ($buffer === false || $buffer === '') {
                // Ran out of data before a marker code was read.
                break;
            }

            if ($buffer === "\xFE") {
                // COM section -- file comment
                // First see if valid utf-8,
                // if not try to convert it to windows-1252.
                $com = $oldCom = trim(self::jpegExtractMarker($fh));
                UtfNormal\Validator::quickIsNFCVerify($com);
                // turns $com to valid utf-8.
                // thus if no change, its utf-8, otherwise its something else.
                if ($com !== $oldCom) {
                    MediaWiki\suppressWarnings();
                    $com = $oldCom = iconv('windows-1252', 'UTF-8//IGNORE', $oldCom);
                    MediaWiki\restoreWarnings();
                }

                // Try it again, if its still not a valid string, then probably
                // binary junk or some really weird encoding, so don't extract.
                // A failed iconv() (false) is treated as garbage as well.
                $isUsable = false;
                if ($oldCom !== false) {
                    UtfNormal\Validator::quickIsNFCVerify($com);
                    $isUsable = ($com === $oldCom);
                }

                if ($isUsable) {
                    $segments["COM"][] = $oldCom;
                } else {
                    wfDebug(__METHOD__ . " Ignoring JPEG comment as is garbage.\n");
                }
            } elseif ($buffer === "\xE1") {
                // APP1 section (Exif, XMP, and XMP extended)
                // only extract if XMP is enabled.
                $temp = self::jpegExtractMarker($fh);
                // check what type of app segment this is.
                // The signatures are NUL-terminated, hence the odd lengths.
                if (substr($temp, 0, 29) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                    $segments["XMP"] = substr($temp, 29);
                } elseif (substr($temp, 0, 35) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP) {
                    $segments["XMP_ext"][] = substr($temp, 35);
                } elseif (substr($temp, 0, 29) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                    // Some images (especially flickr images) seem to have this.
                    // I really have no idea what the deal is with them, but
                    // whatever...
                    $segments["XMP"] = substr($temp, 29);
                    wfDebug(__METHOD__ . ' Found XMP section with wrong app identifier '
                        . "Using anyways.\n");
                } elseif (substr($temp, 0, 6) === "Exif\x00\x00") {
                    // Just need to find out what the byte order is.
                    // because php's exif plugin sucks...
                    // This is a II for little Endian, MM for big. Not a unicode BOM.
                    $byteOrderMarker = substr($temp, 6, 2);
                    if ($byteOrderMarker === 'MM') {
                        $segments['byteOrder'] = 'BE';
                    } elseif ($byteOrderMarker === 'II') {
                        $segments['byteOrder'] = 'LE';
                    } else {
                        wfDebug(__METHOD__ . " Invalid byte ordering?!\n");
                    }
                }
            } elseif ($buffer === "\xED") {
                // APP13 - PSIR. IPTC and some photoshop stuff
                $temp = self::jpegExtractMarker($fh);
                if (substr($temp, 0, 14) === "Photoshop 3.0\x00") {
                    $segments["PSIR"][] = $temp;
                }
            } elseif ($buffer === "\xD9" || $buffer === "\xDA") {
                // EOI - end of image or SOS - start of scan.
                // either way we're past any interesting segments
                return $segments;
            } else {
                // segment we don't care about, so skip
                $size = wfUnpack("nint", fread($fh, 2), 2);
                if ($size['int'] < 2) {
                    throw new MWException("invalid marker size in jpeg");
                }
                // The length field counts its own two bytes, which were just read.
                fseek($fh, $size['int'] - 2, SEEK_CUR);
            }
        }
    } finally {
        // Release the handle on return and on every exception path.
        fclose($fh);
    }

    // shouldn't get here.
    throw new MWException("Reached end of jpeg file unexpectedly");
}