unichr() public static method

+----------+----------+----------+----------+
public static unichr ( $code )
Esempio n. 1
0
 /**
  * Callback function for substituteNonSpecialEntities() that does the work.
  *
  * Turns one matched entity into its replacement text. Numeric references
  * are decoded through HTMLPurifier_Encoder::unichr(); named entities are
  * looked up in the (lazily loaded) entity lookup table. Entities listed in
  * the special tables, and named entities missing from the lookup table,
  * are returned unchanged.
  *
  * @warning Though this is public in order to let the callback happen,
  *          calling it directly is not recommended.
  * @param array $matches PCRE matches array, with 0 the entire match, and
  *                       either index 1, 2 or 3 set with a hex value, dec
  *                       value, or string (respectively).
  * @return string Replacement string, or the original entity text when the
  *                entity must be left alone.
  */
 function nonSpecialEntityCallback($matches)
 {
     // replaces all but big five
     $entity = $matches[0];

     // A '#' right after the ampersand marks a numeric character reference.
     // isset() guards the offsets explicitly instead of silencing warnings.
     $is_num = isset($entity[1]) && $entity[1] === '#';
     if ($is_num) {
         $is_hex = isset($entity[2]) && $entity[2] === 'x';
         $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
         // abort for special characters
         if (isset($this->_special_dec2str[$code])) {
             return $entity;
         }
         return HTMLPurifier_Encoder::unichr($code);
     }

     // Named entity: index 3 holds the entity name.
     $name = $matches[3];
     if (isset($this->_special_ent2dec[$name])) {
         return $entity;
     }
     // Load the lookup table on first use only.
     if (!$this->_entity_lookup) {
         $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
     }
     if (isset($this->_entity_lookup->table[$name])) {
         return $this->_entity_lookup->table[$name];
     }
     // Unknown entity: leave the text as it was written.
     return $entity;
 }
Esempio n. 2
0
 /**
  * Cleans a UTF-8 string for well-formedness and SGML validity
  *
  * It will parse according to UTF-8 and return a valid UTF8 string, with
  * non-SGML codepoints excluded.
  *
  * When iconv() is available (and $force_php is false) the string is
  * filtered through iconv() and the forbidden characters are then removed
  * with strtr(). If iconv() is unavailable, reports failure, or $force_php
  * is true, a pure-PHP decoder is used instead. The pure-PHP decoder
  * additionally drops U+FEFF.
  *
  * @note Just for reference, the non-SGML code points are 0 to 31 and
  *       127 to 159, inclusive.  However, we allow code points 9, 10
  *       and 13, which are the tab, line feed and carriage return
  *       respectively. 128 and above the code points map to multibyte
  *       UTF-8 representations.
  *
  * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
  *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
  *       LGPL license.  Notes on what changed are inside, but in general,
  *       the original code transformed UTF-8 text into an array of integer
  *       Unicode codepoints. Understandably, transforming that back to
  *       a string would be somewhat expensive, so the function was modded to
  *       directly operate on the string.  However, this discourages code
  *       reuse, and the logic enumerated here would be useful for any
  *       function that needs to be able to understand UTF-8 characters.
  *       As of right now, only smart lossless character encoding converters
  *       would need that, and I'm probably not going to implement them.
  *
  * @param string $str       String to clean.
  * @param bool   $force_php When true, skip iconv() and always use the
  *                          pure-PHP decoder.
  * @return string The cleaned string.
  */
 public static function cleanUTF8($str, $force_php = false)
 {
     // strtr() map of forbidden characters => '', built once per request.
     static $non_sgml_chars = array();
     if (empty($non_sgml_chars)) {
         for ($i = 0; $i <= 31; $i++) {
             // non-SGML ASCII chars
             // save \r, \t and \n
             if ($i == 9 || $i == 13 || $i == 10) {
                 continue;
             }
             $non_sgml_chars[chr($i)] = '';
         }
         for ($i = 127; $i <= 159; $i++) {
             $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
         }
     }
     static $iconv = null;
     if ($iconv === null) {
         $iconv = function_exists('iconv');
     }
     if ($iconv && !$force_php) {
         // do the shortcut way
         $converted = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
         if ($converted !== false) {
             return strtr($converted, $non_sgml_chars);
         }
         // iconv() reported failure. Passing false on to strtr() would
         // silently turn the whole input into an empty string, so fall
         // through to the PHP implementation below instead.
     }

     // cached expected number of octets after the current octet
     // until the beginning of the next UTF8 character sequence
     $mState = 0;
     // cached Unicode character
     $mUcs4 = 0;
     // cached expected number of octets in the current sequence
     $mBytes = 1;

     // original code involved an $out that was an array of Unicode
     // codepoints.  Instead of having to convert back into UTF-8, we've
     // decided to directly append valid UTF-8 characters onto a string
     // $out once they're done.  $char accumulates raw bytes, while $mUcs4
     // turns into the Unicode code point, so there's some redundancy.
     $out = '';
     $char = '';
     $len = strlen($str);
     for ($i = 0; $i < $len; $i++) {
         $in = ord($str[$i]);
         // append byte to char
         $char .= $str[$i];
         if (0 == $mState) {
             // When mState is zero we expect either a US-ASCII character
             // or a multi-octet sequence.
             if (0 == (0x80 & $in)) {
                 // US-ASCII, pass straight through unless it is a control
                 // character other than tab, line feed or carriage return.
                 $is_control = ($in <= 31 || $in == 127)
                     && !($in == 9 || $in == 13 || $in == 10);
                 if (!$is_control) {
                     $out .= $char;
                 }
                 // reset
                 $char = '';
                 $mBytes = 1;
             } elseif (0xc0 == (0xe0 & $in)) {
                 // First octet of 2 octet sequence
                 $mUcs4 = ($in & 0x1f) << 6;
                 $mState = 1;
                 $mBytes = 2;
             } elseif (0xe0 == (0xf0 & $in)) {
                 // First octet of 3 octet sequence
                 $mUcs4 = ($in & 0xf) << 12;
                 $mState = 2;
                 $mBytes = 3;
             } elseif (0xf0 == (0xf8 & $in)) {
                 // First octet of 4 octet sequence
                 $mUcs4 = ($in & 0x7) << 18;
                 $mState = 3;
                 $mBytes = 4;
             } elseif (0xf8 == (0xfc & $in)) {
                 // First octet of 5 octet sequence.
                 //
                 // This is illegal because the encoded codepoint must be
                 // either:
                 // (a) not the shortest form or
                 // (b) outside the Unicode range of 0-0x10FFFF.
                 // Rather than trying to resynchronize, we will carry on
                 // until the end of the sequence and let the later error
                 // handling code catch it.
                 $mUcs4 = ($in & 0x3) << 24;
                 $mState = 4;
                 $mBytes = 5;
             } elseif (0xfc == (0xfe & $in)) {
                 // First octet of 6 octet sequence, see comments for 5
                 // octet sequence.
                 $mUcs4 = ($in & 1) << 30;
                 $mState = 5;
                 $mBytes = 6;
             } else {
                 // Current octet is neither in the US-ASCII range nor a
                 // legal first octet of a multi-octet sequence.
                 $mState = 0;
                 $mUcs4 = 0;
                 $mBytes = 1;
                 $char = '';
             }
         } else {
             // When mState is non-zero, we expect a continuation of the
             // multi-octet sequence
             if (0x80 == (0xc0 & $in)) {
                 // Legal continuation: merge its six payload bits in.
                 $shift = ($mState - 1) * 6;
                 $mUcs4 |= ($in & 0x3f) << $shift;
                 if (0 == --$mState) {
                     // End of the multi-octet sequence. mUcs4 now contains
                     // the final Unicode codepoint to be output
                     // Check for illegal sequences and codepoints.
                     // From Unicode 3.1, non-shortest form is illegal
                     $illegal = (2 == $mBytes && $mUcs4 < 0x80)
                         || (3 == $mBytes && $mUcs4 < 0x800)
                         || (4 == $mBytes && $mUcs4 < 0x10000)
                         || (4 < $mBytes)
                         || (($mUcs4 & 0xfffff800) == 0xd800)
                         || ($mUcs4 > 0x10ffff);
                     // Also drop U+FEFF and the code points 128 to 159.
                     $excluded = (0xfeff == $mUcs4)
                         || ($mUcs4 >= 128 && $mUcs4 <= 159);
                     if (!$illegal && !$excluded) {
                         $out .= $char;
                     }
                     // initialize UTF8 cache (reset)
                     $mState = 0;
                     $mUcs4 = 0;
                     $mBytes = 1;
                     $char = '';
                 }
             } else {
                 // ((0xC0 & (*in) != 0x80) && (mState != 0))
                 // Incomplete multi-octet sequence.
                 // used to result in complete fail, but we'll reset
                 $mState = 0;
                 $mUcs4 = 0;
                 $mBytes = 1;
                 $char = '';
                 // Only the truncated sequence is invalid. Step back so the
                 // current octet is examined again as the start of a new
                 // character instead of being thrown away with it. State 0
                 // always consumes the octet, so this cannot loop forever.
                 $i--;
             }
         }
     }
     return $out;
 }
 /**
  * Validates a comma-separated list of font names and rebuilds it in a
  * normalized form.
  *
  * Generic names and purely alphanumeric names are emitted as-is; every
  * other name is emitted inside single quotes with backslashes and single
  * quotes escaped. Quoted input names have their CSS escapes expanded
  * first. Empty entries, malformed quoted names and names that unescape to
  * nothing are dropped.
  *
  * @param string $string  Raw font list.
  * @param mixed  $config  Not used by this method.
  * @param mixed  $context Not used by this method.
  * @return string|false The normalized list, or false when no usable font
  *                      name remains.
  */
 public function validate($string, $config, $context)
 {
     static $generic_names = array('serif' => true, 'sans-serif' => true, 'monospace' => true, 'fantasy' => true, 'cursive' => true);
     // assume that no font names contain commas in them
     $fonts = explode(',', $string);
     $final = '';
     foreach ($fonts as $font) {
         $font = trim($font);
         if ($font === '') {
             continue;
         }
         // match a generic name
         if (isset($generic_names[$font])) {
             $final .= $font . ', ';
             continue;
         }
         // match a quoted name
         if ($font[0] === '"' || $font[0] === "'") {
             $length = strlen($font);
             if ($length <= 2) {
                 continue;
             }
             $quote = $font[0];
             if ($font[$length - 1] !== $quote) {
                 continue;
             }
             $font = substr($font, 1, $length - 2);
             $new_font = '';
             for ($i = 0, $c = strlen($font); $i < $c; $i++) {
                 if ($font[$i] === '\\') {
                     $i++;
                     if ($i >= $c) {
                         // lone trailing backslash, keep it literally
                         $new_font .= '\\';
                         break;
                     }
                     if (ctype_xdigit($font[$i])) {
                         // collect up to six hex digits
                         $code = $font[$i];
                         for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                             if (!ctype_xdigit($font[$i])) {
                                 break;
                             }
                             $code .= $font[$i];
                         }
                         // $i now points at the first character after the
                         // hex digits. A whitespace character there belongs
                         // to the escape and is consumed by the loop's
                         // $i++; anything else must be processed normally,
                         // so step back onto the last hex digit. This has
                         // to happen whether or not the decoded character
                         // is kept, otherwise the following character
                         // would be swallowed.
                         if ($i < $c && trim($font[$i]) !== '') {
                             $i--;
                         }
                         // We have to be extremely careful when adding
                         // new characters, to make sure we're not breaking
                         // the encoding.
                         $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                         if (HTMLPurifier_Encoder::cleanUTF8($char) !== '') {
                             $new_font .= $char;
                         }
                         continue;
                     }
                     if ($font[$i] === "\n") {
                         // backslash-newline contributes nothing
                         continue;
                     }
                 }
                 $new_font .= $font[$i];
             }
             $font = $new_font;
         }
         // $font is a pure representation of the font name
         if ($font === '') {
             // everything in the quoted name was stripped; do not emit ''
             continue;
         }
         if (ctype_alnum($font)) {
             // very simple font, allow it in unharmed
             $final .= $font . ', ';
             continue;
         }
         // complicated font, requires quoting
         // armor backslashes and single quotes
         // NOTE(review): newline characters inside the name are emitted
         // unescaped here; confirm whether they need armoring as well.
         $font = str_replace("\\", "\\\\", $font);
         $font = str_replace("'", "\\'", $font);
         $final .= "'{$font}', ";
     }
     $final = rtrim($final, ', ');
     if ($final === '') {
         return false;
     }
     return $final;
 }
Esempio n. 4
0
 /**
  * Parses a possibly escaped CSS string and returns the "pure"
  * version of it.
  *
  * Handles three forms after a backslash: up to six hex digits (decoded
  * via HTMLPurifier_Encoder::unichr() and kept only if
  * HTMLPurifier_Encoder::cleanUTF8() does not strip the result), a newline
  * (removed together with the backslash), and any other character (kept
  * literally without the backslash). A backslash at the very end of the
  * string is kept as-is.
  *
  * @param string $string String that may contain CSS escapes.
  * @return string The string with its escapes expanded.
  */
 protected function expandCSSEscape($string)
 {
     // flexibly parse it
     $ret = '';
     for ($i = 0, $c = strlen($string); $i < $c; $i++) {
         if ($string[$i] === '\\') {
             $i++;
             if ($i >= $c) {
                 // lone trailing backslash, keep it literally
                 $ret .= '\\';
                 break;
             }
             if (ctype_xdigit($string[$i])) {
                 // collect up to six hex digits
                 $code = $string[$i];
                 for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                     if (!ctype_xdigit($string[$i])) {
                         break;
                     }
                     $code .= $string[$i];
                 }
                 // $i now points at the first character after the hex
                 // digits. A whitespace character there belongs to the
                 // escape and is consumed by the loop's $i++; anything
                 // else must be processed normally, so step back onto the
                 // last hex digit. This has to happen whether or not the
                 // decoded character is kept, otherwise the following
                 // character would be swallowed.
                 if ($i < $c && trim($string[$i]) !== '') {
                     $i--;
                 }
                 // We have to be extremely careful when adding
                 // new characters, to make sure we're not breaking
                 // the encoding.
                 $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                 if (HTMLPurifier_Encoder::cleanUTF8($char) !== '') {
                     $ret .= $char;
                 }
                 continue;
             }
             if ($string[$i] === "\n") {
                 // backslash-newline contributes nothing
                 continue;
             }
         }
         $ret .= $string[$i];
     }
     return $ret;
 }