/**
 * Takes an UTF-8 string and returns an array of integer values representing the Unicode characters.
 * Astral planes are supported, i.e. the integers in the output can be > 0xFFFF.
 * Occurrences of the BOM (U+FEFF) are ignored. Surrogates are not allowed.
 *
 * Malformed input never aborts the conversion; every ill-formed sequence is replaced
 * by U+FFFD REPLACEMENT CHARACTER:
 *  - an octet that cannot start a sequence,
 *  - a non-shortest form, a surrogate, a 5/6-octet form or a value above U+10FFFF,
 *  - a sequence interrupted by a non-continuation octet (the interrupting octet is
 *    then decoded on its own, so it is not lost),
 *  - a sequence cut off by the end of the string.
 *
 * @param string $string The UTF-8 encoded string.
 * @return array Returns an array of Unicode code points.
 * @author Henri Sivonen, mailto:hsivonen@iki.fi
 * @link http://hsivonen.iki.fi/php-utf8/
 * @author Ivan Tcholakov, August 2009, adaptation for the Dokeos LMS.
 */
function _api_utf8_to_unicode(&$string)
{
    $str = (string) $string;
    $state = 0;     // Number of continuation octets still expected for the current sequence.
    $codepoint = 0; // Code point being assembled.
    $bytes = 1;     // Total number of octets announced by the current lead octet.
    $result = array();
    $len = api_byte_count($str);

    for ($i = 0; $i < $len; $i++) {
        $byte = ord($str[$i]);

        if ($state == 0) {
            // Expecting either a US-ASCII character or the lead octet of a multi-octet sequence.
            if (0 == (0x80 & $byte)) {
                // US-ASCII, pass straight through.
                $result[] = $byte;
                $bytes = 1;
            } elseif (0xc0 == (0xe0 & $byte)) {
                // Lead octet of a 2-octet sequence.
                $codepoint = ($byte & 0x1f) << 6;
                $state = 1;
                $bytes = 2;
            } elseif (0xe0 == (0xf0 & $byte)) {
                // Lead octet of a 3-octet sequence.
                $codepoint = ($byte & 0xf) << 12;
                $state = 2;
                $bytes = 3;
            } elseif (0xf0 == (0xf8 & $byte)) {
                // Lead octet of a 4-octet sequence.
                $codepoint = ($byte & 0x7) << 18;
                $state = 3;
                $bytes = 4;
            } elseif (0xf8 == (0xfc & $byte)) {
                // Lead octet of a 5-octet sequence. Such a sequence is always illegal (it is
                // either not the shortest form or outside 0-0x10FFFF). Rather than trying to
                // resynchronize here, the whole sequence is read and rejected further below.
                $codepoint = ($byte & 0x3) << 24;
                $state = 4;
                $bytes = 5;
            } elseif (0xfc == (0xfe & $byte)) {
                // Lead octet of a 6-octet sequence, handled like the 5-octet case.
                $codepoint = ($byte & 1) << 30;
                $state = 5;
                $bytes = 6;
            } else {
                // Neither US-ASCII nor a legal lead octet.
                $codepoint = 0;
                $bytes = 1;
                $result[] = 0xfffd;
            }
            continue;
        }

        // A sequence is in progress, a continuation octet (10xxxxxx) is expected.
        if (0x80 != (0xc0 & $byte)) {
            // Incomplete multi-octet sequence: report it, then decode the current octet
            // again from the initial state so that a valid character is not swallowed.
            $state = 0;
            $codepoint = 0;
            $bytes = 1;
            $result[] = 0xfffd;
            $i--;
            continue;
        }

        // Legal continuation: merge its 6 payload bits at the right position.
        $codepoint |= ($byte & 0x3f) << (($state - 1) * 6);
        if (0 != --$state) {
            continue;
        }

        // End of the sequence, $codepoint holds the decoded value. Validate it.
        // From Unicode 3.1 on, non-shortest forms are illegal.
        $illegal = (2 == $bytes && $codepoint < 0x80)
            || (3 == $bytes && $codepoint < 0x800)
            || (4 == $bytes && $codepoint < 0x10000)
            || 4 < $bytes
            || ($codepoint >= 0xd800 && $codepoint <= 0xdfff) // UTF-16 surrogates.
            || $codepoint > 0x10ffff;

        if ($illegal) {
            $result[] = 0xfffd;
        } elseif (0xfeff != $codepoint) {
            // The BOM is legal, but it is not wanted in the output.
            $result[] = $codepoint;
        }

        // Reset the decoder for the next character.
        $codepoint = 0;
        $bytes = 1;
    }

    if ($state != 0) {
        // The input ended in the middle of a multi-octet sequence.
        $result[] = 0xfffd;
    }

    return $result;
}
/**
 * Checks a string for UTF-8 validity.
 *
 * The check is structural only: every character must start with a lead byte
 * matching one of the patterns 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx, 111110xx
 * or 1111110x, followed by exactly the announced number (0 to 5) of
 * continuation bytes of the form 10xxxxxx. The decoded values themselves are
 * not inspected.
 *
 * mb_detect_encoding() is deliberately not used here: according to the original
 * author (Ivan Tcholakov, 05-OCT-2008) it wrongly reported a string holding a
 * single one-byte Cyrillic letter as UTF-8.
 *
 * @param string $string The string to be tested.
 * @return bool Returns TRUE when the tested string is valid UTF-8, FALSE otherwise.
 * @link http://en.wikipedia.org/wiki/UTF-8
 * @author see internationalization.lib.php
 */
static function is_valid(&$string)
{
    $text = (string) $string;
    $size = api_byte_count($text);
    $pos = 0;

    while ($pos < $size) {
        // The high bits of the first byte tell how many bytes the character occupies.
        $lead = ord($text[$pos++]);

        if (($lead & 0x80) == 0x00) {
            $trailing = 0; // 0xxxxxxx - single byte character.
        } elseif (($lead & 0xe0) == 0xc0) {
            $trailing = 1; // 110xxxxx 10xxxxxx
        } elseif (($lead & 0xf0) == 0xe0) {
            $trailing = 2; // 1110xxxx 10xxxxxx 10xxxxxx
        } elseif (($lead & 0xf8) == 0xf0) {
            $trailing = 3; // 11110xxx followed by three continuation bytes.
        } elseif (($lead & 0xfc) == 0xf8) {
            $trailing = 4; // 111110xx followed by four continuation bytes.
        } elseif (($lead & 0xfe) == 0xfc) {
            $trailing = 5; // 1111110x followed by five continuation bytes.
        } else {
            return false;  // Not a possible first byte of a character.
        }

        for (; $trailing > 0; $trailing--) {
            if ($pos == $size) {
                return false; // The string ends unexpectedly.
            }
            if ((ord($text[$pos++]) & 0xc0) != 0x80) {
                return false; // Not a continuation byte (10xxxxxx).
            }
        }
        // The current character is well-formed, go on with the next one.
    }

    return true; // Empty strings are valid too.
}
/**
 * Checks if a string contains multibyte characters.
 *
 * The string is considered to contain multibyte characters when its size in
 * bytes exceeds its length in characters, measured in the character set
 * stored in $this->CharSet.
 *
 * Modified by Ivan Tcholakov, 24-JAN-2010: the api_*() string functions are
 * used instead of calling the mbstring extension directly.
 *
 * @access public
 * @param string $str multi-byte text to wrap encode
 * @return bool
 */
public function HasMultiBytes($str)
{
    $byte_length = api_byte_count($str);
    $char_length = api_strlen($str, $this->CharSet);

    return $byte_length > $char_length;
}
/**
 * Parses CSV data (one line) into an array. This function is not affected by the OS-locale settings.
 *
 * Parsing rules, as implemented below:
 *  - The escape character is dropped and the character following it is taken literally,
 *    even when it is the delimiter or the enclosure.
 *  - Inside an enclosed field a doubled enclosure stands for one literal enclosure character.
 *  - An enclosure character otherwise toggles the "enclosed" state and is not copied to the output.
 *  - A delimiter outside an enclosure ends the current field.
 *  - The last field is added only when it is not the empty string, so a trailing delimiter
 *    does not produce an extra empty field and an empty input yields an empty array.
 *
 * @param string $string The input string.
 * @param string $delimiter (optional) The field delimiter, one character only. The default delimiter character is comma (,).
 * @param string $enclosure (optional) The field enclosure, one character only. The default enclosure character is quote (").
 * @param string $escape (optional) The escape character, one character only. The default escape character is backslash (\).
 * @return array Returns an array containing the fields read.
 * Note: For this function to work correctly with UTF-8, the parameters $delimiter, $enclosure and $escape
 * should be single ASCII characters only. When a longer string is given, only its first byte is used.
 * Thanks to this limitation the implementation is faster.
 * @link http://php.net/manual/en/function.str-getcsv.php (exists as of PHP 5 >= 5.3.0)
 */
static function &api_str_getcsv(&$string, $delimiter = ',', $enclosure = '"', $escape = '\\')
{
    // Reduce every control parameter to a single byte (the first one).
    $delimiter = (string) $delimiter;
    if (api_byte_count($delimiter) > 1) {
        $delimiter = $delimiter[0];
    }
    $enclosure = (string) $enclosure;
    if (api_byte_count($enclosure) > 1) {
        $enclosure = $enclosure[0];
    }
    $escape = (string) $escape;
    if (api_byte_count($escape) > 1) {
        $escape = $escape[0];
    }

    $str = (string) $string;
    $len = api_byte_count($str);
    $enclosed = false; // TRUE while scanning inside an enclosed field.
    $escaped = false;  // TRUE when the previous byte was an unescaped escape character.
    $value = '';       // The field being collected.
    $result = array();

    for ($i = 0; $i < $len; $i++) {
        $char = $str[$i];

        if ($escaped) {
            // An escaped byte is always data, whatever it is.
            $value .= $char;
            $escaped = false;
            continue;
        }
        if ($char === $escape) {
            $escaped = true;
            continue;
        }

        if ($char === $enclosure) {
            // The bounds check prevents reading past the end of the string
            // when the closing enclosure is the very last byte.
            if ($enclosed && $i + 1 < $len && $str[$i + 1] === $enclosure) {
                $value .= $char;
                $i++;
            } else {
                $enclosed = !$enclosed;
            }
        } elseif ($char === $delimiter && !$enclosed) {
            $result[] = $value;
            $value = '';
        } else {
            $value .= $char;
        }
    }

    // A strict comparison is needed here, empty() would discard a last field equal to "0".
    if ($value !== '') {
        $result[] = $value;
    }

    // The function returns by reference, so a variable has to be returned.
    return $result;
}
/**
 * This function returns a string or an array with all occurrences of search in subject (ignoring case) replaced with the given replace value.
 * @param mixed $search String or array of strings to be found.
 * @param mixed $replace String or array of strings used for replacement.
 * @param mixed $subject String or array of strings being searched.
 * @param int $count (optional) The number of matched and replaced needles will be returned in count, which is passed by reference.
 * @param string $encoding (optional) The character encoding used internally by this function. If it is omitted, the platform character set will be used by default.
 * @return mixed String or array as a result.
 * Notes:
 * If $subject is an array, then the search and replace is performed with every entry of subject, the return value is an array.
 * If $search and $replace are arrays, then the function takes a value from each array (matched by key) and uses it to do search and replace on subject.
 * If $replace has fewer values than search, then an empty string is used for the rest of replacement values.
 * If $search is an array and $replace is a string, then this replacement string is used for every value of search.
 * If the encoding is not supported, the call is delegated to the native str_ireplace().
 * Matching is done on lower-cased copies of the strings and the byte offsets found there are applied
 * to the original subject; this presumes that lower-casing does not change the byte length of the text.
 * This function is aimed at replacing the function str_ireplace() for human-language strings.
 * @link http://php.net/manual/en/function.str-ireplace
 * @author Henri Sivonen, mailto:hsivonen@iki.fi
 * @link http://hsivonen.iki.fi/php-utf8/
 * Adaptation for Chamilo 1.8.7, 2010
 * Initial implementation Dokeos LMS, August 2009
 * @author Ivan Tcholakov
 */
function api_str_ireplace($search, $replace, $subject, &$count = null, $encoding = null)
{
    if (empty($encoding)) {
        $encoding = _api_mb_internal_encoding();
    }

    if (!api_is_encoding_supported($encoding)) {
        // Unknown encoding, the native byte-oriented function is the only option.
        if (is_null($count)) {
            return str_ireplace($search, $replace, $subject);
        }
        return str_ireplace($search, $replace, $subject, $count);
    }

    $total = 0;

    if (is_array($subject)) {
        // Process every entry on its own and sum up the replacements.
        foreach ($subject as $key => $entry) {
            $done = 0;
            $subject[$key] = api_str_ireplace($search, $replace, $entry, $done, $encoding);
            $total += (int) $done;
        }
        $count = $total;
        return $subject;
    }

    if (is_array($search) || is_array($replace)) {
        // Apply the needles one after another, each on the result of the previous one.
        foreach (array_keys($search) as $k) {
            if (is_array($replace)) {
                $with = array_key_exists($k, $replace) ? $replace[$k] : '';
            } else {
                $with = $replace;
            }
            $done = 0;
            $subject = api_str_ireplace($search[$k], $with, $subject, $done, $encoding);
            $total += (int) $done;
        }
        $count = $total;
        return $subject;
    }

    // Plain string search and replacement. The work is done on UTF-8 data
    // (api_utf8_encode()/api_utf8_decode() are presumed to convert from/to $encoding).
    $is_utf8 = api_is_utf8($encoding);
    if (!$is_utf8) {
        $search = api_utf8_encode($search, $encoding);
    }
    $slen = api_byte_count($search);
    if ($slen == 0) {
        // Nothing to look for.
        $count = 0;
        return $subject;
    }
    if (!$is_utf8) {
        $replace = api_utf8_encode($replace, $encoding);
        $subject = api_utf8_encode($subject, $encoding);
    }

    // Every replacement shifts the rest of the subject by this number of bytes.
    $lendif = api_byte_count($replace) - $slen;

    // The needle is quoted together with the pattern delimiter; otherwise
    // a "/" inside the needle would terminate the regular expression.
    $needle = api_strtolower($search, 'UTF-8');
    $pattern = '/(.*)' . preg_quote($needle, '/') . '/Us';

    // $lstr is the lower-cased, not yet scanned remainder of the subject.
    $lstr = api_strtolower($subject, 'UTF-8');
    $matched = 0; // Byte offset in $subject that corresponds to the start of $lstr.

    while (preg_match($pattern, $lstr, $matches)) {
        $mlen = api_byte_count($matches[0]);
        $lstr = (string) substr($lstr, $mlen);
        $subject = substr_replace($subject, $replace, $matched + api_byte_count($matches[1]), $slen);
        $matched += $mlen + $lendif;
        $total++;
    }

    if (!$is_utf8) {
        $subject = api_utf8_decode($subject, $encoding);
    }

    $count = $total;
    return $subject;
}