/**
 * Decodes a UTF-8 string into an array of integer Unicode code points.
 *
 * Astral planes are supported, i.e. the integers in the output can be > 0xFFFF.
 * Occurrences of the BOM (U+FEFF) are silently dropped from the output.
 * Ill-formed input never aborts the conversion; each of the following is reported
 * as one U+FFFD REPLACEMENT CHARACTER:
 *  - an octet that cannot start a sequence (e.g. a stray continuation octet);
 *  - a sequence interrupted by an octet that is not a continuation octet
 *    (the interrupting octet is then decoded on its own, it is not discarded);
 *  - a sequence cut short by the end of the string;
 *  - non-shortest forms, 5 and 6 octet sequences, surrogates, values above U+10FFFF.
 *
 * @param string $string				The UTF-8 encoded string (received by reference, it is not modified).
 * @return array						Returns an array of unicode code points.
 * @author Henri Sivonen, mailto:hsivonen@iki.fi
 * @link http://hsivonen.iki.fi/php-utf8/
 * @author Ivan Tcholakov, August 2009, adaptation for the Dokeos LMS.
 */
function _api_utf8_to_unicode(&$string)
{
    $str = (string) $string;
    // Number of continuation octets still expected before the current sequence is complete.
    $state = 0;
    // The code point being assembled from the current sequence.
    $codepoint = 0;
    // Total number of octets in the current sequence (needed for the shortest-form check).
    $bytes = 1;
    $result = array();
    $len = api_byte_count($str);
    for ($i = 0; $i < $len; $i++) {
        $byte = ord($str[$i]);

        if ($state == 0) {
            // Between sequences: expect either a US-ASCII octet or the first octet of a sequence.
            if (0 == (0x80 & $byte)) {
                // US-ASCII, pass straight through.
                $result[] = $byte;
                $bytes = 1;
            } elseif (0xc0 == (0xe0 & $byte)) {
                // First octet of a 2 octet sequence.
                $codepoint = ($byte & 0x1f) << 6;
                $state = 1;
                $bytes = 2;
            } elseif (0xe0 == (0xf0 & $byte)) {
                // First octet of a 3 octet sequence.
                $codepoint = ($byte & 0xf) << 12;
                $state = 2;
                $bytes = 3;
            } elseif (0xf0 == (0xf8 & $byte)) {
                // First octet of a 4 octet sequence.
                $codepoint = ($byte & 0x7) << 18;
                $state = 3;
                $bytes = 4;
            } elseif (0xf8 == (0xfc & $byte)) {
                // First octet of a 5 octet sequence. Such a sequence is always illegal
                // (either non-shortest form or outside 0-0x10FFFF). Instead of trying to
                // resynchronize here, the whole sequence is consumed and rejected below.
                $codepoint = ($byte & 0x3) << 24;
                $state = 4;
                $bytes = 5;
            } elseif (0xfc == (0xfe & $byte)) {
                // First octet of a 6 octet sequence, handled like the 5 octet case.
                $codepoint = ($byte & 1) << 30;
                $state = 5;
                $bytes = 6;
            } else {
                // Neither US-ASCII nor a legal first octet of a multi-octet sequence.
                $codepoint = 0;
                $bytes = 1;
                $result[] = 0xfffd;
            }
            continue;
        }

        // Inside a sequence: a continuation octet (10xxxxxx) is expected.
        if (0x80 != (0xc0 & $byte)) {
            // Incomplete multi-octet sequence: report it, then step back so that the
            // current octet is decoded as the start of something new instead of being lost.
            // No endless loop is possible, because with $state == 0 every octet is consumed.
            $state = 0;
            $codepoint = 0;
            $bytes = 1;
            $result[] = 0xfffd;
            $i--;
            continue;
        }

        // Legal continuation: merge its 6 payload bits at the proper position.
        $codepoint |= ($byte & 0x3f) << (($state - 1) * 6);
        if (0 != --$state) {
            continue;
        }

        // End of the multi-octet sequence, $codepoint holds the decoded value.
        // From Unicode 3.1 on, non-shortest forms are illegal; so are sequences longer
        // than 4 octets, surrogates and values above U+10FFFF.
        $illegal = (2 == $bytes && $codepoint < 0x80)
            || (3 == $bytes && $codepoint < 0x800)
            || (4 == $bytes && $codepoint < 0x10000)
            || 4 < $bytes
            || ($codepoint >= 0xd800 && $codepoint <= 0xdfff)
            || $codepoint > 0x10ffff;
        if ($illegal) {
            $result[] = 0xfffd;
        } elseif (0xfeff != $codepoint) {
            // The BOM is legal, but it is not wanted in the output.
            $result[] = $codepoint;
        }
        // Re-initialize the cache for the next sequence ($state is already 0 here).
        $codepoint = 0;
        $bytes = 1;
    }
    if ($state != 0) {
        // The string ended in the middle of a multi-octet sequence.
        $result[] = 0xfffd;
    }
    return $result;
}
Пример #2
0
 /**
  * Checks a string for UTF-8 validity.
  *
  * Every character must be a US-ASCII octet or a 2, 3 or 4 octet sequence whose
  * continuation octets have the form 10xxxxxx. In addition the decoded value must
  * be in shortest form, must not be a surrogate (U+D800..U+DFFF) and must not
  * exceed U+10FFFF. Sequences of 5 and 6 octets are rejected.
  *
  * @param string $string	The string to be tested (received by reference, it is not modified).
  * @return bool				Returns TRUE when the tested string is valid UTF-8, FALSE otherwise.
  *							An empty string is valid.
  * @link http://en.wikipedia.org/wiki/UTF-8
  * @author see internationalization.lib.php
  */
 static function is_valid(&$string)
 {
     // Ivan Tcholakov, 05-OCT-2008: mb_detect_encoding() is deliberately not used here.
     // A string with a single cyrillic letter (single byte) has been found that is
     // wrongly detected as UTF-8, so the check is implemented by hand.
     $str = (string) $string;
     $len = api_byte_count($str);
     $i = 0;
     while ($i < $len) {
         // Here the current character begins. Its size is determined
         // by the senior bits in the first byte.
         $lead = ord($str[$i++]);
         if (($lead & 0x80) == 0x0) {
             // 0xxxxxxx - a valid single-byte character.
             continue;
         }
         if (($lead & 0xe0) == 0xc0) {
             // 110xxxxx 10xxxxxx
             $trail = 1;
             $min = 0x80;
             $codepoint = $lead & 0x1f;
         } elseif (($lead & 0xf0) == 0xe0) {
             // 1110xxxx 10xxxxxx 10xxxxxx
             $trail = 2;
             $min = 0x800;
             $codepoint = $lead & 0x0f;
         } elseif (($lead & 0xf8) == 0xf0) {
             // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             $trail = 3;
             $min = 0x10000;
             $codepoint = $lead & 0x07;
         } else {
             // A stray continuation byte, a lead byte of a 5 or 6 byte
             // sequence, or 0xFE/0xFF: the string is invalid.
             return false;
         }
         if ($i + $trail > $len) {
             // Here the string ends unexpectedly.
             return false;
         }
         for ($k = 0; $k < $trail; $k++) {
             $next = ord($str[$i++]);
             if (($next & 0xc0) != 0x80) {
                 // Invalid continuation byte, invalid string.
                 return false;
             }
             $codepoint = ($codepoint << 6) | ($next & 0x3f);
         }
         if ($codepoint < $min) {
             // Non-shortest (overlong) form, e.g. 0xC0 0x80 for U+0000.
             return false;
         }
         if ($codepoint > 0x10ffff || ($codepoint >= 0xd800 && $codepoint <= 0xdfff)) {
             // Outside the Unicode range, or a surrogate.
             return false;
         }
         // Here the current character is valid, the next one is to be examined.
     }
     return true;
 }
Пример #3
0
 /**
  * Tells whether a string occupies more bytes than it has characters.
  *
  * The byte length comes from api_byte_count() and the character length from
  * api_strlen() evaluated with this object's CharSet property.
  * Modified by Ivan Tcholakov, 24-JAN-2010: the api_* functions are used
  * in place of direct strlen()/mb_strlen() calls.
  * @access public
  * @param string $str The text to be examined.
  * @return bool TRUE when the byte count is greater than the character count.
  */
 public function HasMultiBytes($str) {
   $byte_length = api_byte_count($str);
   $char_length = api_strlen($str, $this->CharSet);
   return ($byte_length > $char_length);
 }
Пример #4
0
 /**
  * Parses CSV data (one line) into an array. This function is not affected by the OS-locale settings.
  * @param string $string                  The input string (received by reference, it is not modified).
  * @param string $delimiter (optional)    The field delimiter, one character only. The default delimiter character is comma (,).
  * @param string $enclosure (optional)    The field enclosure, one character only. The default enclosure character is quote (").
  * @param string $escape (optional)       The escape character, one character only. The default escape character is backslash (\).
  * @return array                          Returns an array containing the fields read.
  * Notes:
  * In order this function to work correctly with UTF-8, limitation for the parameters $delimiter, $enclosure and $escape
  * should be kept. These parameters should be single ASCII characters only. Thus the implementation of this function is faster.
  * When one of these parameters is longer than one byte, only its first byte is used.
  * A single escape character is removed from the output and the character after it is processed as usual;
  * two escape characters in a row produce one escape character.
  * Inside an enclosed field two enclosure characters in a row produce one enclosure character.
  * A last field that is empty is not added to the result, so an empty input gives an empty array.
  * @link http://php.net/manual/en/function.str-getcsv.php   (exists as of PHP 5 >= 5.3.0)
  */
 static function &api_str_getcsv(&$string, $delimiter = ',', $enclosure = '"', $escape = '\\')
 {
     // Each control string is cut down to its first byte (index 0).
     $delimiter = (string) $delimiter;
     if (api_byte_count($delimiter) > 1) {
         $delimiter = $delimiter[0];
     }
     $enclosure = (string) $enclosure;
     if (api_byte_count($enclosure) > 1) {
         $enclosure = $enclosure[0];
     }
     $escape = (string) $escape;
     if (api_byte_count($escape) > 1) {
         $escape = $escape[0];
     }
     $str = (string) $string;
     $len = api_byte_count($str);
     $enclosed = false;
     $escaped = false;
     $value = '';
     $result = array();
     for ($i = 0; $i < $len; $i++) {
         $char = $str[$i];
         if ($char === $escape && !$escaped) {
             // First escape character of a possible pair: skip it and remember it.
             $escaped = true;
             continue;
         }
         $escaped = false;
         if ($char === $enclosure) {
             // The look-ahead is bounds-checked, the enclosure may be the very last byte.
             if ($enclosed && $i + 1 < $len && $str[$i + 1] === $enclosure) {
                 // A doubled enclosure inside an enclosed field is a literal enclosure.
                 $value .= $char;
                 $i++;
             } else {
                 $enclosed = !$enclosed;
             }
         } elseif ($char === $delimiter) {
             if (!$enclosed) {
                 // End of the current field.
                 $result[] = $value;
                 $value = '';
             } else {
                 // A delimiter inside an enclosed field is ordinary text.
                 $value .= $char;
             }
         } else {
             $value .= $char;
         }
     }
     // A strict comparison is needed here: empty() would also discard a last field "0".
     if ($value !== '') {
         $result[] = $value;
     }
     return $result;
 }
/**
 * This function returns a string or an array with all occurrences of search in subject (ignoring case) replaced with the given replace value.
 * @param mixed $search					String or array of strings to be found.
 * @param mixed $replace				String or array of strings used for replacement.
 * @param mixed $subject				String or array of strings being searched.
 * @param int $count (optional)			The number of matched and replaced needles will be returned in count, which is passed by reference.
 * 										Any value it holds on entry is ignored and overwritten.
 * @param string $encoding (optional)	The used internally by this function character encoding. If it is omitted, the platform character set will be used by default.
 * @return mixed						String or array as a result.
 * Notes:
 * If $subject is an array, then the search and replace is performed with every entry of subject, the return value is an array.
 * If $search and $replace are arrays, then the function takes a value from each array (paired by key) and uses it to do search and replace on subject.
 * If $replace has fewer values than search, then an empty string is used for the rest of replacement values.
 * If $search is an array and $replace is a string, then this replacement string is used for every value of search.
 * If the encoding is not supported, the call is delegated to the native function str_ireplace().
 * This function is aimed at replacing the function str_ireplace() for human-language strings.
 * @link http://php.net/manual/en/function.str-ireplace
 * @author Henri Sivonen, mailto:hsivonen@iki.fi
 * @link http://hsivonen.iki.fi/php-utf8/
 * Adaptation for Chamilo 1.8.7, 2010
 * Initial implementation Dokeos LMS, August 2009
 * @author Ivan Tcholakov
 */
function api_str_ireplace($search, $replace, $subject, &$count = null, $encoding = null)
{
    if (empty($encoding)) {
        $encoding = _api_mb_internal_encoding();
    }
    if (!api_is_encoding_supported($encoding)) {
        // No multibyte handling is available for this encoding, the native function is used.
        return str_ireplace($search, $replace, $subject, $count);
    }

    if (is_array($subject)) {
        // Every entry is processed on its own, the individual counts are summed up.
        $total = 0;
        foreach ($subject as $key => $item) {
            $n = 0;
            $subject[$key] = api_str_ireplace($search, $replace, $item, $n, $encoding);
            $total += $n;
        }
        $count = $total;
        return $subject;
    }

    if (is_array($search) || is_array($replace)) {
        // One pass over the subject for every needle. The encoding is handed
        // over explicitly, so that the nested calls do not fall back to the default one.
        $total = 0;
        foreach (array_keys($search) as $k) {
            if (is_array($replace)) {
                $replacement = array_key_exists($k, $replace) ? $replace[$k] : '';
            } else {
                $replacement = $replace;
            }
            $n = 0;
            $subject = api_str_ireplace($search[$k], $replacement, $subject, $n, $encoding);
            $total += $n;
        }
        $count = $total;
        return $subject;
    }

    // Single needle, single replacement, single subject. The work is done on UTF-8 copies.
    $is_utf8 = api_is_utf8($encoding);
    if (!$is_utf8) {
        $search = api_utf8_encode($search, $encoding);
    }
    $slen = api_byte_count($search);
    if ($slen == 0) {
        // Nothing to search for.
        $count = 0;
        return $subject;
    }
    if (!$is_utf8) {
        $replace = api_utf8_encode($replace, $encoding);
        $subject = api_utf8_encode($subject, $encoding);
    }
    // Change of the byte length of the subject caused by a single replacement.
    $lendif = api_byte_count($replace) - $slen;
    // Matching is done on lowercased copies, the byte offsets found there are applied
    // to the original subject.
    // NOTE(review): this assumes that lowercasing does not change the byte length of
    // a string - confirm for api_strtolower().
    $search = api_strtolower($search, 'UTF-8');
    // The pattern delimiter has to be quoted too, otherwise a needle containing "/" breaks the pattern.
    $search = preg_quote($search, '/');
    $lstr = api_strtolower($subject, 'UTF-8');
    $replaced = 0;
    // Byte offset in $subject that corresponds to the beginning of the rest kept in $lstr.
    $matched = 0;
    // U makes (.*) ungreedy, so the nearest occurrence is found; s lets the dot match newlines.
    while (preg_match('/(.*)' . $search . '/Us', $lstr, $matches)) {
        $mlen = api_byte_count($matches[0]);
        $lstr = substr($lstr, $mlen);
        $subject = substr_replace($subject, $replace, $matched + api_byte_count($matches[1]), $slen);
        $matched += $mlen + $lendif;
        $replaced++;
    }
    if (!$is_utf8) {
        $subject = api_utf8_decode($subject, $encoding);
    }
    $count = $replaced;
    return $subject;
}