UtfNormal::quickIsNFCVerify PHP Code Examples

Example #1

0

Show file

File: UtfNormal.php Project: BackupTheBerlios/shoutwiki-svn

 /**
  * The ultimate convenience function! Clean up invalid UTF-8 sequences,
  * and convert to normal form C, canonical composition.
  *
  * Fast return for pure ASCII strings; some lesser optimizations for
  * strings containing only known-good characters. Not as fast as toNFC().
  *
  * @param string $string a UTF-8 string
  * @return string a clean, shiny, normalized UTF-8 string
  * @static
  */
 static function cleanUp($string)
 {
     if (NORMALIZE_ICU) {
         # We exclude a few chars that ICU would not: C0 control characters
         # (except tab, LF and CR) and the U+FFFE / U+FFFF non-characters
         # are all turned into the replacement character first.
         $string = preg_replace('/[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f]/', UTF8_REPLACEMENT, $string);
         $string = str_replace(UTF8_FFFE, UTF8_REPLACEMENT, $string);
         $string = str_replace(UTF8_FFFF, UTF8_REPLACEMENT, $string);
         # UnicodeString constructor fails if the string ends with a
         # head byte. Add a junk char at the end, we'll strip it off.
         # The junk char must be a real byte: appending/stripping an empty
         # string is a no-op and leaves the trailing head byte exposed.
         # 0x01 is safe to strip because the preg_replace above has already
         # removed every genuine 0x01 from the input.
         return rtrim(utf8_normalize($string . "\x01", UNORM_NFC), "\x01");
     } elseif (UtfNormal::quickIsNFCVerify($string)) {
         # Side effect -- $string has had UTF-8 errors cleaned up.
         return $string;
     } else {
         # Valid UTF-8 now, but not yet in normal form C.
         return UtfNormal::NFC($string);
     }
 }

Example #2

0

Show file

File: IPTC.php Project: Tarendai/spring-website

 /**
  * Helper function of a helper function to convert charset for iptc values.
  * @param string|array $data The IPTC string
  * @param string $charset The charset
  *
  * @return string
  */
 private static function convIPTCHelper($data, $charset)
 {
     if (!$charset) {
         // No charset was declared. Most of the time, when there is no 1:90
         // tag, the value is ascii, latin1 or utf-8: keep it untouched if it
         // is already valid utf-8, otherwise pretend it is windows-1252.
         $validated = $data;
         // quickIsNFCVerify() rewrites its argument into valid utf-8, so any
         // difference from the input means the input was not valid utf-8.
         UtfNormal::quickIsNFCVerify($validated);
         if ($validated !== $data) {
             return self::convIPTCHelper($data, 'Windows-1252');
         }
         // Validation left the value unchanged; return it as-is (untrimmed).
         return $data;
     }

     // An explicit charset was given: convert, dropping unconvertible bytes.
     wfSuppressWarnings();
     $converted = iconv($charset, "UTF-8//IGNORE", $data);
     wfRestoreWarnings();
     if ($converted === false) {
         // Conversion failed outright; log it and fall back to an empty value.
         wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset {$charset} to utf-8");
         $converted = "";
     }
     return trim($converted);
 }

Example #3

0

Show file

File: Exif.php Project: mangowi/mediawiki

 /**
  * Do userComment tags and similar. See pg. 34 of exif standard.
  * basically first 8 bytes is charset, rest is value.
  * This has not been tested on any shift-JIS strings.
  * @param string $prop prop name.
  */
 private function charCodeString($prop)
 {
     if (isset($this->mFilteredExifData[$prop])) {
         if (strlen($this->mFilteredExifData[$prop]) <= 8) {
             //invalid. Must be at least 9 bytes long.
             $this->debug($this->mFilteredExifData[$prop], __FUNCTION__, false);
             unset($this->mFilteredExifData[$prop]);
             return;
         }
         // The first 8 bytes identify the character code, the rest is the value.
         $charCode = substr($this->mFilteredExifData[$prop], 0, 8);
         $val = substr($this->mFilteredExifData[$prop], 8);
         // $charCode is always exactly 8 bytes, so the identifiers must be
         // compared including their NUL padding; a bare "JIS" or "UNICODE"
         // could never match and every value would be treated as undefined.
         switch ($charCode) {
             case "JIS\x00\x00\x00\x00\x00":
                 //JIS
                 $charset = "Shift-JIS";
                 break;
             case "UNICODE\x00":
                 // presumably $this->byteOrder is 'BE' or 'LE' -- confirm against the class.
                 $charset = "UTF-16" . $this->byteOrder;
                 break;
             default:
                 //ascii or undefined.
                 $charset = "";
                 break;
         }
         // This could possibly check to see if iconv is really installed
         // or if we're using the compatibility wrapper in globalFunctions.php
         if ($charset) {
             wfSuppressWarnings();
             $val = iconv($charset, 'UTF-8//IGNORE', $val);
             wfRestoreWarnings();
         } else {
             // if valid utf-8, assume that, otherwise assume windows-1252
             $valCopy = $val;
             UtfNormal::quickIsNFCVerify($valCopy);
             //validates $valCopy; it only differs from $val if $val was not valid utf-8.
             if ($valCopy !== $val) {
                 wfSuppressWarnings();
                 $val = iconv('Windows-1252', 'UTF-8//IGNORE', $val);
                 wfRestoreWarnings();
             }
         }
         //trim and check to make sure not only whitespace.
         $val = trim($val);
         if (strlen($val) === 0) {
             //only whitespace.
             $this->debug($this->mFilteredExifData[$prop], __FUNCTION__, "{$prop}: Is only whitespace");
             unset($this->mFilteredExifData[$prop]);
             return;
         }
         //all's good.
         $this->mFilteredExifData[$prop] = $val;
     }
 }

Example #4

0

Show file

File: UtfNormal.php Project: GodelDesign/Godel

 /**
  * The ultimate convenience function! Clean up invalid UTF-8 sequences,
  * and convert to normal form C, canonical composition.
  *
  * Fast return for pure ASCII strings; some lesser optimizations for
  * strings containing only known-good characters. Not as fast as toNFC().
  *
  * @param $string String: a UTF-8 string
  * @return string a clean, shiny, normalized UTF-8 string
  */
 static function cleanUp($string)
 {
     if (NORMALIZE_ICU) {
         $string = self::replaceForNativeNormalize($string);
         # UnicodeString constructor fails if the string ends with a
         # head byte. Add a junk char at the end, we'll strip it off.
         # The junk char must be a real byte: appending/stripping an empty
         # string is a no-op and leaves the trailing head byte exposed.
         # NOTE(review): stripping 0x01 assumes replaceForNativeNormalize()
         # has already replaced any genuine 0x01 in the input -- confirm.
         return rtrim(utf8_normalize($string . "\x01", UNORM_NFC), "\x01");
     } elseif (NORMALIZE_INTL) {
         $string = self::replaceForNativeNormalize($string);
         $norm = normalizer_normalize($string, Normalizer::FORM_C);
         if ($norm === null || $norm === false) {
             # normalizer_normalize will either return false or null
             # (depending on which doc you read) if invalid utf8 string.
             # quickIsNFCVerify cleans up invalid sequences.
             if (UtfNormal::quickIsNFCVerify($string)) {
                 # if that's true, the string is actually already normal.
                 return $string;
             } else {
                 # Now we are valid but non-normal
                 return normalizer_normalize($string, Normalizer::FORM_C);
             }
         } else {
             return $norm;
         }
     } elseif (UtfNormal::quickIsNFCVerify($string)) {
         # Side effect -- $string has had UTF-8 errors cleaned up.
         return $string;
     } else {
         # Valid UTF-8 now, but not yet in normal form C.
         return UtfNormal::NFC($string);
     }
 }

Example #5

0

Show file

File: JpegMetadataExtractor.php Project: rploaiza/dbpedia-latinoamerica

 /** Function to extract metadata segments of interest from jpeg files
  * based on GIFMetadataExtractor.
  *
  * we can almost use getimagesize to do this
  * but gis doesn't support having multiple app1 segments
  * and those can't extract xmp on files containing both exif and xmp data
  *
  * @param string $filename Name of jpeg file
  * @return array Array of interesting segments.
  * @throws MWException If given invalid file.
  */
 static function segmentSplitter($filename)
 {
     // All marker bytes and segment signatures below are written with \x
     // escapes. They are compared against raw bytes read from the file, so
     // they must never be stored as (multi-byte) text glyphs, and the NUL
     // terminators in the signatures are part of the compared prefix.
     $showXMP = XMPReader::isSupported();
     $segmentCount = 0;
     $segments = array('XMP_ext' => array(), 'COM' => array(), 'PSIR' => array());
     if (!$filename) {
         throw new MWException("No filename specified for " . __METHOD__);
     }
     if (!file_exists($filename) || is_dir($filename)) {
         throw new MWException("Invalid file {$filename} passed to " . __METHOD__);
     }
     $fh = fopen($filename, "rb");
     if (!$fh) {
         throw new MWException("Could not open file {$filename}");
     }
     // Every JPEG starts with the two-byte SOI (start of image) marker FF D8.
     $buffer = fread($fh, 2);
     if ($buffer !== "\xFF\xD8") {
         throw new MWException("Not a jpeg, no SOI");
     }
     while (!feof($fh)) {
         $buffer = fread($fh, 1);
         $segmentCount++;
         if ($segmentCount > self::MAX_JPEG_SEGMENTS) {
             // this is just a sanity check
             throw new MWException('Too many jpeg segments. Aborting');
         }
         // Each segment is introduced by 0xFF followed by a marker type byte.
         if ($buffer !== "\xFF") {
             throw new MWException("Error reading jpeg file marker. " . "Expected 0xFF but got " . bin2hex($buffer));
         }
         $buffer = fread($fh, 1);
         while ($buffer === "\xFF" && !feof($fh)) {
             // Skip through any 0xFF padding bytes.
             $buffer = fread($fh, 1);
         }
         if ($buffer === "\xFE") {
             // COM section -- file comment
             // First see if valid utf-8,
             // if not try to convert it to windows-1252.
             $com = $oldCom = trim(self::jpegExtractMarker($fh));
             UtfNormal::quickIsNFCVerify($com);
             // turns $com to valid utf-8.
             // thus if no change, its utf-8, otherwise its something else.
             if ($com !== $oldCom) {
                 wfSuppressWarnings();
                 $com = $oldCom = iconv('windows-1252', 'UTF-8//IGNORE', $oldCom);
                 wfRestoreWarnings();
             }
             // Try it again, if its still not a valid string, then probably
             // binary junk or some really weird encoding, so don't extract.
             UtfNormal::quickIsNFCVerify($com);
             if ($com === $oldCom) {
                 $segments["COM"][] = $oldCom;
             } else {
                 wfDebug(__METHOD__ . " Ignoring JPEG comment as is garbage.\n");
             }
         } elseif ($buffer === "\xE1") {
             // APP1 section (Exif, XMP, and XMP extended)
             // only extract if XMP is enabled.
             $temp = self::jpegExtractMarker($fh);
             // check what type of app segment this is.
             // The identifiers are NUL-terminated; the prefix lengths used
             // here (29, 35, 6) include those NUL bytes.
             if (substr($temp, 0, 29) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                 $segments["XMP"] = substr($temp, 29);
             } elseif (substr($temp, 0, 35) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP) {
                 $segments["XMP_ext"][] = substr($temp, 35);
             } elseif (substr($temp, 0, 29) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
                 // Some images (especially flickr images) seem to have this.
                 // I really have no idea what the deal is with them, but
                 // whatever...
                 $segments["XMP"] = substr($temp, 29);
                 wfDebug(__METHOD__ . ' Found XMP section with wrong app identifier ' . "Using anyways.\n");
             } elseif (substr($temp, 0, 6) === "Exif\x00\x00") {
                 // Just need to find out what the byte order is.
                 // because php's exif plugin sucks...
                 // This is a II for little Endian, MM for big. Not a unicode BOM.
                 $byteOrderMarker = substr($temp, 6, 2);
                 if ($byteOrderMarker === 'MM') {
                     $segments['byteOrder'] = 'BE';
                 } elseif ($byteOrderMarker === 'II') {
                     $segments['byteOrder'] = 'LE';
                 } else {
                     wfDebug(__METHOD__ . " Invalid byte ordering?!\n");
                 }
             }
         } elseif ($buffer === "\xED") {
             // APP13 - PSIR. IPTC and some photoshop stuff
             $temp = self::jpegExtractMarker($fh);
             // 13 characters plus the terminating NUL = 14 bytes.
             if (substr($temp, 0, 14) === "Photoshop 3.0\x00") {
                 $segments["PSIR"][] = $temp;
             }
         } elseif ($buffer === "\xD9" || $buffer === "\xDA") {
             // EOI - end of image or SOS - start of scan. either way we're past any interesting segments
             return $segments;
         } else {
             // segment we don't care about, so skip
             // The 2-byte big-endian size field counts itself, hence the "- 2".
             $size = wfUnpack("nint", fread($fh, 2), 2);
             if ($size['int'] <= 2) {
                 throw new MWException("invalid marker size in jpeg");
             }
             fseek($fh, $size['int'] - 2, SEEK_CUR);
         }
     }
     // shouldn't get here.
     throw new MWException("Reached end of jpeg file unexpectedly");
 }

Example #6

0

Show file

File: GIFMetadataExtractor.php Project: nischayn22/mediawiki-core

 /**
  * @throws Exception
  * @param $filename string
  * @return array
  */
 static function getMetadata($filename)
 {
     // Walks the block structure of a GIF file and returns an array with the
     // keys 'frameCount', 'looped', 'duration' (seconds), 'xmp' (raw packet
     // or "") and 'comment' (list of comment strings).
     self::$gif_frame_sep = pack("C", ord(","));
     self::$gif_extension_sep = pack("C", ord("!"));
     self::$gif_term = pack("C", ord(";"));
     $frameCount = 0;
     $duration = 0.0;
     $isLooped = false;
     $xmp = "";
     $comment = array();
     if (!$filename) {
         throw new Exception("No file name specified");
     } elseif (!file_exists($filename) || is_dir($filename)) {
         throw new Exception("File {$filename} does not exist");
     }
     $fh = fopen($filename, 'rb');
     if (!$fh) {
         throw new Exception("Unable to open file {$filename}");
     }
     // Check for the GIF header
     $buf = fread($fh, 6);
     if (!($buf == 'GIF87a' || $buf == 'GIF89a')) {
         throw new Exception("Not a valid GIF file; header: {$buf}");
     }
     // Skip over width and height.
     fread($fh, 4);
     // Read BPP
     $buf = fread($fh, 1);
     $bpp = self::decodeBPP($buf);
     // Skip over background and aspect ratio
     fread($fh, 2);
     // Skip over the GCT
     self::readGCT($fh, $bpp);
     while (!feof($fh)) {
         $buf = fread($fh, 1);
         if ($buf == self::$gif_frame_sep) {
             // Found a frame
             $frameCount++;
             ## Skip bounding box
             fread($fh, 8);
             ## Read BPP
             $buf = fread($fh, 1);
             $bpp = self::decodeBPP($buf);
             ## Read GCT
             self::readGCT($fh, $bpp);
             fread($fh, 1);
             self::skipBlock($fh);
         } elseif ($buf == self::$gif_extension_sep) {
             $buf = fread($fh, 1);
             if (strlen($buf) < 1) {
                 throw new Exception("Ran out of input");
             }
             $extension_code = unpack('C', $buf);
             $extension_code = $extension_code[1];
             if ($extension_code == 0xf9) {
                 // Graphics Control Extension.
                 fread($fh, 1);
                 // Block size
                 fread($fh, 1);
                 // Transparency, disposal method, user input
                 $buf = fread($fh, 2);
                 // Delay, in hundredths of seconds.
                 if (strlen($buf) < 2) {
                     throw new Exception("Ran out of input");
                 }
                 $delay = unpack('v', $buf);
                 $delay = $delay[1];
                 $duration += $delay * 0.01;
                 fread($fh, 1);
                 // Transparent colour index
                 $term = fread($fh, 1);
                 // Should be a terminator
                 if (strlen($term) < 1) {
                     throw new Exception("Ran out of input");
                 }
                 $term = unpack('C', $term);
                 $term = $term[1];
                 if ($term != 0) {
                     throw new Exception("Malformed Graphics Control Extension block");
                 }
             } elseif ($extension_code == 0xfe) {
                 // Comment block(s).
                 $data = self::readBlock($fh);
                 if ($data === "") {
                     throw new Exception('Read error, zero-length comment block');
                 }
                 // The standard says this should be ASCII, however its unclear if
                 // thats true in practise. Check to see if its valid utf-8, if so
                 // assume its that, otherwise assume its windows-1252 (iso-8859-1)
                 $dataCopy = $data;
                 // quickIsNFCVerify has the side effect of replacing any invalid characters
                 UtfNormal::quickIsNFCVerify($dataCopy);
                 if ($dataCopy !== $data) {
                     wfSuppressWarnings();
                     $data = iconv('windows-1252', 'UTF-8', $data);
                     wfRestoreWarnings();
                 }
                 $commentCount = count($comment);
                 if ($commentCount === 0 || $comment[$commentCount - 1] !== $data) {
                     // Some applications repeat the same comment on each
                     // frame of an animated GIF image, so if this comment
                     // is identical to the last, only extract once.
                     $comment[] = $data;
                 }
             } elseif ($extension_code == 0xff) {
                 // Application extension (Netscape info about the animated gif)
                 // or XMP (or theoretically any other type of extension block)
                 $blockLength = fread($fh, 1);
                 if (strlen($blockLength) < 1) {
                     throw new Exception("Ran out of input");
                 }
                 $blockLength = unpack('C', $blockLength);
                 $blockLength = $blockLength[1];
                 $data = fread($fh, $blockLength);
                 if ($blockLength != 11) {
                     wfDebug(__METHOD__ . ' GIF application block with wrong length');
                     // Rewind to the length byte so skipBlock() sees the whole block.
                     fseek($fh, -($blockLength + 1), SEEK_CUR);
                     self::skipBlock($fh);
                     continue;
                 }
                 // NETSCAPE2.0 (application name for animated gif)
                 if ($data == 'NETSCAPE2.0') {
                     $data = fread($fh, 2);
                     // Block length and introduction, should be 03 01.
                     // The expected bytes are spelled with escapes: comparing
                     // against an empty string would reject every valid block.
                     if ($data != "\x03\x01") {
                         throw new Exception("Expected \x03\x01, got {$data}");
                     }
                     // Unsigned little-endian integer, loop count or zero for "forever"
                     $loopData = fread($fh, 2);
                     if (strlen($loopData) < 2) {
                         throw new Exception("Ran out of input");
                     }
                     $loopData = unpack('v', $loopData);
                     $loopCount = $loopData[1];
                     if ($loopCount != 1) {
                         $isLooped = true;
                     }
                     // Read out terminator byte
                     fread($fh, 1);
                 } elseif ($data == 'XMP DataXMP') {
                     // application name for XMP data.
                     // see pg 18 of XMP spec part 3.
                     $xmp = self::readBlock($fh, true);
                     // The packet is followed by a fixed "magic trailer"; as
                     // read here it occupies the last 257 bytes, starting with
                     // 01 FF FE and ending with 03 02 01 00.
                     if (substr($xmp, -257, 3) !== "\x01\xFF\xFE" || substr($xmp, -4) !== "\x03\x02\x01\x00") {
                         // this is just a sanity check.
                         throw new Exception("XMP does not have magic trailer!");
                     }
                     // strip out trailer.
                     $xmp = substr($xmp, 0, -257);
                 } else {
                     // unrecognized extension block
                     fseek($fh, -($blockLength + 1), SEEK_CUR);
                     self::skipBlock($fh);
                     continue;
                 }
             } else {
                 self::skipBlock($fh);
             }
         } elseif ($buf == self::$gif_term) {
             break;
         } else {
             if (strlen($buf) < 1) {
                 throw new Exception("Ran out of input");
             }
             $byte = unpack('C', $buf);
             $byte = $byte[1];
             throw new Exception("At position: " . ftell($fh) . ", Unknown byte " . $byte);
         }
     }
     return array('frameCount' => $frameCount, 'looped' => $isLooped, 'duration' => $duration, 'xmp' => $xmp, 'comment' => $comment);
 }

PHP UtfNormal::quickIsNFCVerify Examples