/**
 * The ultimate convenience function! Clean up invalid UTF-8 sequences,
 * and convert to normal form C, canonical composition.
 *
 * When NORMALIZE_ICU is set the work is delegated to utf8_normalize();
 * otherwise UtfNormal::quickIsNFCVerify() is tried first and the full
 * UtfNormal::NFC() conversion is only run when that check fails.
 *
 * @param string $string a UTF-8 string
 * @return string a clean, shiny, normalized UTF-8 string
 * @static
 */
static function cleanUp($string) {
    if (NORMALIZE_ICU) {
        # We exclude a few chars that ICU would not.
        $string = preg_replace('/[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f]/', UTF8_REPLACEMENT, $string);
        $string = str_replace(array(UTF8_FFFE, UTF8_FFFF), UTF8_REPLACEMENT, $string);

        # UnicodeString constructor fails if the string ends with a
        # head byte. Add a junk char at the end, we'll strip it off.
        # The junk char must be a real byte: with an empty string both the
        # append and the rtrim() are no-ops. \x01 is safe as a sentinel
        # because the regex above has already replaced every \x01 in the
        # input, so rtrim() can only remove what was appended here.
        return rtrim(utf8_normalize($string . "\x01", UNORM_NFC), "\x01");
    }

    if (UtfNormal::quickIsNFCVerify($string)) {
        # Side effect -- $string has had UTF-8 errors cleaned up.
        return $string;
    }

    return UtfNormal::NFC($string);
}
/**
 * Convert one IPTC value to UTF-8 (helper for the IPTC charset conversion).
 *
 * With an explicit charset the value is run through iconv() and trimmed.
 * Without one, the value is kept as-is when it already validates as UTF-8
 * and is otherwise re-processed as Windows-1252.
 *
 * @param string|array $data The IPTC string (handed straight to iconv())
 * @param string $charset Source charset name; a falsy value means "unknown"
 *
 * @return string The converted value. Note that the "already valid UTF-8"
 *   path returns the input untouched, i.e. without trimming.
 */
private static function convIPTCHelper($data, $charset) {
    if (!$charset) {
        // No charset declared. Most of the time that means the data is
        // ascii, latin1 or utf-8, so: accept it if it is valid utf-8,
        // otherwise pretend it is windows-1252.
        $validated = $data;
        UtfNormal::quickIsNFCVerify($validated); // rewrites $validated into valid utf-8

        if ($validated === $data) {
            // Validation changed nothing, so the input was fine already.
            return $validated;
        }

        return self::convIPTCHelper($data, 'Windows-1252');
    }

    wfSuppressWarnings();
    $converted = iconv($charset, "UTF-8//IGNORE", $data);
    wfRestoreWarnings();

    if ($converted === false) {
        // Conversion failed outright: log it and fall back to an empty value.
        wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset {$charset} to utf-8");
        $converted = "";
    }

    return trim($converted);
}
/**
 * Do userComment tags and similar. See pg. 34 of exif standard.
 * Basically the first 8 bytes are a character code identifying the
 * charset, and the rest is the value.
 * This has not been tested on any shift-JIS strings.
 *
 * The entry in $this->mFilteredExifData is replaced by the decoded, trimmed
 * UTF-8 value, or removed when it is too short or contains only whitespace.
 *
 * @param string $prop prop name.
 */
private function charCodeString($prop) {
    if (!isset($this->mFilteredExifData[$prop])) {
        return;
    }

    $raw = $this->mFilteredExifData[$prop];

    if (strlen($raw) <= 8) {
        // Invalid. Must be at least 9 bytes long (8 byte code + some value).
        $this->debug($raw, __FUNCTION__, false);
        unset($this->mFilteredExifData[$prop]);
        return;
    }

    $charCode = substr($raw, 0, 8);
    $val = substr($raw, 8);

    // $charCode is always exactly 8 bytes, so each label has to carry its
    // NUL padding or the comparison can never succeed.
    switch ($charCode) {
        case "JIS\x00\x00\x00\x00\x00":
            $charset = "Shift-JIS";
            break;
        case "UNICODE\x00":
            $charset = "UTF-16" . $this->byteOrder;
            break;
        default:
            // ascii or undefined.
            $charset = "";
            break;
    }

    // This could possibly check to see if iconv is really installed
    // or if we're using the compatibility wrapper in globalFunctions.php
    if ($charset) {
        wfSuppressWarnings();
        $val = iconv($charset, 'UTF-8//IGNORE', $val);
        wfRestoreWarnings();
    } else {
        // If valid utf-8, assume that, otherwise assume windows-1252.
        $valCopy = $val;
        UtfNormal::quickIsNFCVerify($valCopy); // validates $valCopy.
        if ($valCopy !== $val) {
            wfSuppressWarnings();
            $val = iconv('Windows-1252', 'UTF-8//IGNORE', $val);
            wfRestoreWarnings();
        }
    }

    // Trim and check to make sure not only whitespace. A failed iconv()
    // (false) also ends up as an empty string here and is dropped.
    $val = trim($val);
    if (strlen($val) === 0) {
        // Only whitespace.
        $this->debug($raw, __FUNCTION__, "{$prop}: Is only whitespace");
        unset($this->mFilteredExifData[$prop]);
        return;
    }

    // All's good.
    $this->mFilteredExifData[$prop] = $val;
}
/**
 * The ultimate convenience function! Clean up invalid UTF-8 sequences,
 * and convert to normal form C, canonical composition.
 *
 * Three back ends are tried in order: the ICU extension (NORMALIZE_ICU),
 * the intl extension (NORMALIZE_INTL), and finally the pure-PHP path via
 * UtfNormal::quickIsNFCVerify() / UtfNormal::NFC().
 *
 * @param $string String: a UTF-8 string
 * @return string a clean, shiny, normalized UTF-8 string
 */
static function cleanUp($string) {
    if (NORMALIZE_ICU) {
        $string = self::replaceForNativeNormalize($string);

        # UnicodeString constructor fails if the string ends with a
        # head byte. Add a junk char at the end, we'll strip it off.
        # The junk char must be a real byte: with an empty string both the
        # append and the rtrim() are no-ops.
        # NOTE(review): presumably replaceForNativeNormalize() replaces C0
        # control characters such as \x01, so the sentinel cannot collide
        # with genuine trailing content -- confirm against that helper.
        return rtrim(utf8_normalize($string . "\x01", UNORM_NFC), "\x01");
    }

    if (NORMALIZE_INTL) {
        $string = self::replaceForNativeNormalize($string);
        $norm = normalizer_normalize($string, Normalizer::FORM_C);

        if ($norm !== null && $norm !== false) {
            return $norm;
        }

        # normalizer_normalize will either return false or null
        # (depending on which doc you read) if invalid utf8 string.
        # quickIsNFCVerify cleans up invalid sequences.
        if (UtfNormal::quickIsNFCVerify($string)) {
            # if that's true, the string is actually already normal.
            return $string;
        }

        # Now we are valid but non-normal
        return normalizer_normalize($string, Normalizer::FORM_C);
    }

    if (UtfNormal::quickIsNFCVerify($string)) {
        # Side effect -- $string has had UTF-8 errors cleaned up.
        return $string;
    }

    return UtfNormal::NFC($string);
}
/**
 * Function to extract metadata segments of interest from jpeg files
 * based on GIFMetadataExtractor.
 *
 * We can almost use getimagesize to do this, but gis doesn't support
 * having multiple app1 segments, and so can't extract xmp on files
 * containing both exif and xmp data.
 *
 * All marker bytes and segment identifiers are written with explicit \x
 * escapes: they are raw bytes (several include a trailing NUL), and must
 * not depend on the encoding this source file happens to be saved in.
 *
 * @param string $filename Name of jpeg file
 * @return array Array of interesting segments. Always has the list keys
 *   'XMP_ext', 'COM' and 'PSIR'; may also have 'XMP' (string) and
 *   'byteOrder' ('BE' or 'LE').
 * @throws MWException If given invalid file.
 */
static function segmentSplitter($filename) {
    $showXMP = XMPReader::isSupported();

    $segmentCount = 0;

    $segments = array(
        'XMP_ext' => array(),
        'COM' => array(),
        'PSIR' => array(),
    );

    if (!$filename) {
        throw new MWException("No filename specified for " . __METHOD__);
    }
    if (!file_exists($filename) || is_dir($filename)) {
        throw new MWException("Invalid file {$filename} passed to " . __METHOD__);
    }

    $fh = fopen($filename, "rb");
    if (!$fh) {
        throw new MWException("Could not open file {$filename}");
    }

    // Every jpeg starts with the SOI marker, FF D8.
    $buffer = fread($fh, 2);
    if ($buffer !== "\xFF\xD8") {
        throw new MWException("Not a jpeg, no SOI");
    }

    while (!feof($fh)) {
        $buffer = fread($fh, 1);
        $segmentCount++;
        if ($segmentCount > self::MAX_JPEG_SEGMENTS) {
            // this is just a sanity check
            throw new MWException('Too many jpeg segments. Aborting');
        }
        if ($buffer !== "\xFF") {
            throw new MWException("Error reading jpeg file marker. " .
                "Expected 0xFF but got " . bin2hex($buffer));
        }

        $buffer = fread($fh, 1);
        while ($buffer === "\xFF" && !feof($fh)) {
            // Skip through any 0xFF padding bytes.
            $buffer = fread($fh, 1);
        }

        if ($buffer === "\xFE") {
            // COM section -- file comment
            // First see if valid utf-8,
            // if not try to convert it to windows-1252.
            $com = $oldCom = trim(self::jpegExtractMarker($fh));
            UtfNormal::quickIsNFCVerify($com);
            // turns $com to valid utf-8.
            // thus if no change, its utf-8, otherwise its something else.
            if ($com !== $oldCom) {
                wfSuppressWarnings();
                $com = $oldCom = iconv('windows-1252', 'UTF-8//IGNORE', $oldCom);
                wfRestoreWarnings();
            }
            // Try it again, if its still not a valid string, then probably
            // binary junk or some really weird encoding, so don't extract.
            UtfNormal::quickIsNFCVerify($com);
            if ($com === $oldCom) {
                $segments["COM"][] = $oldCom;
            } else {
                wfDebug(__METHOD__ . " Ignoring JPEG comment as is garbage.\n");
            }
        } elseif ($buffer === "\xE1") {
            // APP1 section (Exif, XMP, and XMP extended)
            // only extract if XMP is enabled.
            $temp = self::jpegExtractMarker($fh);
            // check what type of app segment this is.
            // Each identifier is NUL-terminated; the substr() lengths
            // below include that terminator.
            if (substr($temp, 0, 29) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                $segments["XMP"] = substr($temp, 29);
            } elseif (substr($temp, 0, 35) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP) {
                $segments["XMP_ext"][] = substr($temp, 35);
            } elseif (substr($temp, 0, 29) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                // Some images (especially flickr images) seem to have this.
                // I really have no idea what the deal is with them, but
                // whatever...
                $segments["XMP"] = substr($temp, 29);
                wfDebug(__METHOD__ . ' Found XMP section with wrong app identifier '
                    . "Using anyways.\n");
            } elseif (substr($temp, 0, 6) === "Exif\x00\x00") {
                // Just need to find out what the byte order is,
                // because php's exif plugin does not expose it.
                // This is a II for little Endian, MM for big. Not a unicode BOM.
                $byteOrderMarker = substr($temp, 6, 2);
                if ($byteOrderMarker === 'MM') {
                    $segments['byteOrder'] = 'BE';
                } elseif ($byteOrderMarker === 'II') {
                    $segments['byteOrder'] = 'LE';
                } else {
                    wfDebug(__METHOD__ . " Invalid byte ordering?!\n");
                }
            }
        } elseif ($buffer === "\xED") {
            // APP13 - PSIR. IPTC and some photoshop stuff
            $temp = self::jpegExtractMarker($fh);
            if (substr($temp, 0, 14) === "Photoshop 3.0\x00") {
                $segments["PSIR"][] = $temp;
            }
        } elseif ($buffer === "\xD9" || $buffer === "\xDA") {
            // EOI - end of image or SOS - start of scan.
            // Either way we're past any interesting segments.
            return $segments;
        } else {
            // Segment we don't care about, so skip it. The 2-byte
            // big-endian size counts itself, hence the "- 2".
            $size = wfUnpack("nint", fread($fh, 2), 2);
            if ($size['int'] <= 2) {
                throw new MWException("invalid marker size in jpeg");
            }
            fseek($fh, $size['int'] - 2, SEEK_CUR);
        }
    }

    // shouldn't get here.
    throw new MWException("Reached end of jpeg file unexpectedly");
}
/**
 * Walk the block structure of a GIF file and collect basic metadata.
 *
 * Binary constants are written with explicit \x escapes because they are
 * raw bytes, several of them control characters.
 *
 * @throws Exception If the file is missing, unreadable, not a GIF,
 *   truncated, or structurally malformed.
 * @param $filename string
 * @return array With keys 'frameCount' (int), 'looped' (bool),
 *   'duration' (float, seconds), 'xmp' (string, empty if none found)
 *   and 'comment' (list of comment strings).
 */
static function getMetadata($filename) {
    self::$gif_frame_sep = pack("C", ord(","));
    self::$gif_extension_sep = pack("C", ord("!"));
    self::$gif_term = pack("C", ord(";"));

    $frameCount = 0;
    $duration = 0.0;
    $isLooped = false;
    $xmp = "";
    $comment = array();

    if (!$filename) {
        throw new Exception("No file name specified");
    } elseif (!file_exists($filename) || is_dir($filename)) {
        throw new Exception("File {$filename} does not exist");
    }

    $fh = fopen($filename, 'rb');
    if (!$fh) {
        throw new Exception("Unable to open file {$filename}");
    }

    // Check for the GIF header
    $buf = fread($fh, 6);
    if (!($buf === 'GIF87a' || $buf === 'GIF89a')) {
        throw new Exception("Not a valid GIF file; header: {$buf}");
    }

    // Skip over width and height.
    fread($fh, 4);

    // Read BPP
    $buf = fread($fh, 1);
    $bpp = self::decodeBPP($buf);

    // Skip over background and aspect ratio
    fread($fh, 2);

    // Skip over the GCT
    self::readGCT($fh, $bpp);

    while (!feof($fh)) {
        $buf = fread($fh, 1);

        if ($buf === self::$gif_frame_sep) {
            // Found a frame
            $frameCount++;

            // Skip bounding box
            fread($fh, 8);

            // Read BPP
            $buf = fread($fh, 1);
            $bpp = self::decodeBPP($buf);

            // Read GCT
            self::readGCT($fh, $bpp);
            fread($fh, 1);
            self::skipBlock($fh);
        } elseif ($buf === self::$gif_extension_sep) {
            $buf = fread($fh, 1);
            if (strlen($buf) < 1) {
                throw new Exception("Ran out of input");
            }
            $extension_code = unpack('C', $buf);
            $extension_code = $extension_code[1];

            if ($extension_code === 0xf9) {
                // Graphics Control Extension.
                fread($fh, 1); // Block size
                fread($fh, 1); // Transparency, disposal method, user input

                $buf = fread($fh, 2); // Delay, in hundredths of seconds.
                if (strlen($buf) < 2) {
                    throw new Exception("Ran out of input");
                }
                $delay = unpack('v', $buf);
                $delay = $delay[1];
                $duration += $delay * 0.01;

                fread($fh, 1); // Transparent colour index

                $term = fread($fh, 1); // Should be a terminator
                if (strlen($term) < 1) {
                    throw new Exception("Ran out of input");
                }
                $term = unpack('C', $term);
                $term = $term[1];
                if ($term !== 0) {
                    throw new Exception("Malformed Graphics Control Extension block");
                }
            } elseif ($extension_code === 0xfe) {
                // Comment block(s).
                $data = self::readBlock($fh);
                if ($data === "") {
                    throw new Exception('Read error, zero-length comment block');
                }

                // The standard says this should be ASCII, however its unclear if
                // thats true in practise. Check to see if its valid utf-8, if so
                // assume its that, otherwise assume its windows-1252 (iso-8859-1)
                $dataCopy = $data;
                // quickIsNFCVerify has the side effect of replacing any invalid characters
                UtfNormal::quickIsNFCVerify($dataCopy);

                if ($dataCopy !== $data) {
                    wfSuppressWarnings();
                    $data = iconv('windows-1252', 'UTF-8', $data);
                    wfRestoreWarnings();
                }

                $commentCount = count($comment);
                if ($commentCount === 0 || $comment[$commentCount - 1] !== $data) {
                    // Some applications repeat the same comment on each
                    // frame of an animated GIF image, so if this comment
                    // is identical to the last, only extract once.
                    $comment[] = $data;
                }
            } elseif ($extension_code === 0xff) {
                // Application extension (Netscape info about the animated gif)
                // or XMP (or theoretically any other type of extension block)
                $blockLength = fread($fh, 1);
                if (strlen($blockLength) < 1) {
                    throw new Exception("Ran out of input");
                }
                $blockLength = unpack('C', $blockLength);
                $blockLength = $blockLength[1];

                if ($blockLength !== 11) {
                    // Validate the length before reading the identifier:
                    // fread() rejects a zero length, and a short read would
                    // make a length-based rewind land in the wrong place.
                    // Step back onto the length byte just consumed, which is
                    // where the original rewind ended up, and skip from there.
                    wfDebug(__METHOD__ . ' GIF application block with wrong length');
                    fseek($fh, -1, SEEK_CUR);
                    self::skipBlock($fh);
                    continue;
                }

                $data = fread($fh, $blockLength);

                // NETSCAPE2.0 (application name for animated gif)
                if ($data === 'NETSCAPE2.0') {
                    $data = fread($fh, 2); // Block length and introduction, should be 03 01
                    if ($data !== "\x03\x01") {
                        throw new Exception("Expected \x03\x01, got {$data}");
                    }

                    // Unsigned little-endian integer, loop count or zero for "forever"
                    $loopData = fread($fh, 2);
                    if (strlen($loopData) < 2) {
                        throw new Exception("Ran out of input");
                    }
                    $loopData = unpack('v', $loopData);
                    $loopCount = $loopData[1];

                    if ($loopCount !== 1) {
                        $isLooped = true;
                    }

                    // Read out terminator byte
                    fread($fh, 1);
                } elseif ($data === 'XMP DataXMP') {
                    // application name for XMP data.
                    // see pg 18 of XMP spec part 3.
                    $xmp = self::readBlock($fh, true);

                    // The packet is followed by a 257-byte "magic trailer"
                    // that starts 01 FF FE and ends 03 02 01 00.
                    if (substr($xmp, -257, 3) !== "\x01\xFF\xFE"
                        || substr($xmp, -4) !== "\x03\x02\x01\x00"
                    ) {
                        // this is just a sanity check.
                        throw new Exception("XMP does not have magic trailer!");
                    }

                    // strip out trailer.
                    $xmp = substr($xmp, 0, -257);
                } else {
                    // unrecognized extension block: rewind to its length
                    // byte and skip the whole thing.
                    fseek($fh, -($blockLength + 1), SEEK_CUR);
                    self::skipBlock($fh);
                    continue;
                }
            } else {
                self::skipBlock($fh);
            }
        } elseif ($buf === self::$gif_term) {
            break;
        } else {
            if (strlen($buf) < 1) {
                throw new Exception("Ran out of input");
            }
            $byte = unpack('C', $buf);
            $byte = $byte[1];
            throw new Exception("At position: " . ftell($fh) . ", Unknown byte " . $byte);
        }
    }

    return array(
        'frameCount' => $frameCount,
        'looped' => $isLooped,
        'duration' => $duration,
        'xmp' => $xmp,
        'comment' => $comment,
    );
}