Example #1
0
/**
 * Performs lots of magic to make sure data encodings are converted correctly. Input, and output too (as PHP often stores internally in UTF or performs automatic dynamic conversions from internal to external charsets).
 * Anyone who uses UTF will get some (albeit minor) imperfections from PHP's manipulations of the strings.
 *
 * Exactly one strategy is tried, in this order: the unicode.* ini settings, the project's own UTF-8 environment conversion, iconv, mbstring, and finally the project's simple UTF-8 conversion.
 * Once a strategy has run, $CONVERTED_ENCODING is set and later calls return immediately.
 * $VALID_ENCODING records whether the iconv/mbstring strategy accepted the site charset; it is left untouched by later calls so that callers keep seeing the verdict of the run that did the work.
 *
 * @param  boolean				Whether we know we are working in UTF-8. This is the case for AJAX calls.
 */
function convert_data_encodings($known_utf8 = false)
{
    global $VALID_ENCODING, $CONVERTED_ENCODING;

    if ($CONVERTED_ENCODING) {
        // Already done it. Do NOT reset $VALID_ENCODING to true here: that would discard a
        // "charset rejected" verdict and make callers retry with the rejected charset.
        // Only supply the optimistic default if the flag was never determined at all.
        if (is_null($VALID_ENCODING)) {
            $VALID_ENCODING = true;
        }
        return;
    }
    $VALID_ENCODING = true;

    // A globally-set KNOWN_UTF8 flag overrides the parameter
    if (array_key_exists('KNOWN_UTF8', $GLOBALS) && $GLOBALS['KNOWN_UTF8']) {
        $known_utf8 = true;
    }
    $charset = get_charset();
    $done_something = false;
    // Conversion of parameters that might be in the wrong character encoding (e.g. Javascript uses UTF to make requests regardless of document encoding, so the stuff needs converting)
    //  If we don't have any PHP extensions (mbstring etc) that can perform the detection/conversion, our code will take this into account and use utf8_decode at points where it knows that it's being communicated with by Javascript.
    if (@strlen(ini_get('unicode.runtime_encoding')) > 0) {
        // The runtime exposes unicode.* ini settings: let PHP itself do the work
        @ini_set('default_charset', $charset);
        @ini_set('unicode.runtime_encoding', $charset);
        @ini_set('unicode.output_encoding', $charset);
        @ini_set('unicode.semantics', '1');
        $done_something = true;
    } elseif ($known_utf8 && (version_compare(phpversion(), '4.3.0') >= 0 || strtolower($charset) == 'iso-8859-1') && will_be_unicode_neutered(serialize($_GET) . serialize($_POST)) && in_array(strtolower($charset), array('iso-8859-1', 'iso-8859-15', 'koi8-r', 'big5', 'gb2312', 'big5-hkscs', 'shift_jis', 'euc-jp'))) {
        // UTF-8 input headed for one of the listed legacy charsets: use the project's own converter.
        // NOTE(review): will_be_unicode_neutered() is defined elsewhere; presumably it decides whether this route is safe for the given data - confirm.
        require_code('character_sets');
        do_environment_utf8_conversion($charset);
        $done_something = true;
    } elseif (function_exists('iconv_set_encoding') && get_value('disable_iconv') !== '1') {
        // iconv route. A failure to set the input encoding is taken to mean the charset is unusable.
        // NOTE(review): iconv_set_encoding() is deprecated in current PHP releases; the two unsuppressed calls below may emit deprecation notices.
        $encoding = $known_utf8 ? 'UTF-8' : $charset;
        if (@iconv_set_encoding('input_encoding', $encoding)) {
            iconv_set_encoding('output_encoding', $charset);
            iconv_set_encoding('internal_encoding', $charset);
        } else {
            $VALID_ENCODING = false;
        }
        $done_something = true;
    } elseif (function_exists('mb_convert_encoding') && get_value('disable_mbstring') !== '1') {
        // mbstring route: only usable if mbstring knows the site charset (assumed so when it cannot be listed)
        if (function_exists('mb_list_encodings')) {
            $VALID_ENCODING = in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()));
        } else {
            $VALID_ENCODING = true;
        }
        if ($VALID_ENCODING) {
            // Work out the encoding of the incoming request: known UTF-8, else what mbstring saw for POST, else GET, else a guess from the URL
            $encoding = $known_utf8 ? 'UTF-8' : '';
            if (function_exists('mb_http_input') && $encoding == '') {
                if (count($_POST) != 0) {
                    $encoding = mb_http_input('P');
                    if (!is_string($encoding) || $encoding == 'pass') {
                        $encoding = '';
                    }
                }
            }
            if (function_exists('mb_http_input') && $encoding == '') {
                $encoding = mb_http_input('G');
                if (!is_string($encoding) || $encoding == 'pass') {
                    $encoding = '';
                }
                if (function_exists('mb_detect_encoding') && $encoding == '' && ocp_srv('REQUEST_URI') != '') {
                    $encoding = mb_detect_encoding(urldecode(ocp_srv('REQUEST_URI')), $charset . ',UTF-8,ISO-8859-1');
                    if (!is_string($encoding) || $encoding == 'pass') {
                        $encoding = '';
                    }
                }
            }
            if ($encoding != '') {
                // Convert the request values in place (keys are left as they are). Nested arrays are handled at any depth.
                foreach ($_GET as $key => $val) {
                    $_GET[$key] = _convert_request_value_encoding($val, $charset, $encoding);
                }
                foreach ($_POST as $key => $val) {
                    $_POST[$key] = _convert_request_value_encoding($val, $charset, $encoding);
                }
            }
            if (function_exists('mb_http_output')) {
                mb_http_output($charset);
            }
        }
        $done_something = true;
    } elseif ($known_utf8 && strtolower($charset) != 'utf-8' && strtolower($charset) != 'utf8') {
        // No extension available: fall back to the project's simple UTF-8 conversion
        require_code('character_sets');
        do_simple_environment_utf8_conversion();
        $done_something = true;
    }
    if ($done_something) {
        $CONVERTED_ENCODING = true;
    }
}

/**
 * Recursively convert a request value (a string, or an arbitrarily nested array of strings) between charsets using mbstring.
 * Helper for convert_data_encodings; array keys are preserved and not converted.
 *
 * @param  mixed					Value to convert (anything that is neither a string nor an array is returned untouched).
 * @param  string					Charset to convert to.
 * @param  string					Charset to convert from.
 * @return mixed					The converted value.
 */
function _convert_request_value_encoding($value, $to_charset, $from_charset)
{
    if (is_string($value)) {
        return mb_convert_encoding($value, $to_charset, $from_charset);
    }
    if (is_array($value)) {
        foreach ($value as $i => $v) {
            $value[$i] = _convert_request_value_encoding($v, $to_charset, $from_charset);
        }
    }
    return $value;
}
Example #2
0
/**
 * Convert some data from one encoding to the internal encoding.
 *
 * The first applicable strategy is used, and only that one: the project's entity-based UTF-8 decoder, unicode_decode/unicode_encode (where the runtime provides them), iconv, mbstring, and finally utf8_decode/utf8_encode.
 * If the chosen strategy fails, the data is returned unconverted. Charset names are compared case-insensitively.
 *
 * @param  string					Data to convert.
 * @param  ?string				Charset to convert from (NULL: that read by the last http_download_file call).
 * @param  ?string				Charset to convert to (NULL: current encoding).
 * @return string					Converted data.
 */
function convert_to_internal_encoding($data, $input_charset = NULL, $internal_charset = NULL)
{
    global $VALID_ENCODING;
    convert_data_encodings(); // In case it hasn't run yet. We need $VALID_ENCODING to be set.

    if (is_null($input_charset)) {
        // The global may legitimately never have been set; treat that as "charset unknown" rather than raising a warning
        $input_charset = isset($GLOBALS['HTTP_CHARSET']) ? $GLOBALS['HTTP_CHARSET'] : NULL;
    }
    if ($input_charset === '' || is_null($input_charset)) {
        return $data; // Nothing to go on, so leave the data as it is
    }
    if (is_null($internal_charset)) {
        $internal_charset = get_charset();
    }

    // Lowercased copies are used for every comparison; the original spellings are what get passed to the converters
    $input_lc = strtolower($input_charset);
    $internal_lc = strtolower($internal_charset);

    if ((version_compare(phpversion(), '4.3.0') >= 0 || $internal_lc === 'iso-8859-1') && $input_lc === 'utf-8' && will_be_unicode_neutered($data) && in_array($internal_lc, array('iso-8859-1', 'iso-8859-15', 'koi8-r', 'big5', 'gb2312', 'big5-hkscs', 'shift_jis', 'euc-jp'), true)) {
        // UTF-8 headed for one of the listed legacy charsets: use the project's entity-based decoder.
        // NOTE(review): will_be_unicode_neutered() and entity_utf8_decode() are defined elsewhere - confirm their exact contract.
        $test = entity_utf8_decode($data, $internal_charset);
        if ($test !== false) {
            $data = $test;
        }
    } elseif (function_exists('unicode_decode') && $internal_lc !== 'utf-8' && $input_lc === 'utf-8' && $VALID_ENCODING) {
        $test = @unicode_decode($data, $input_charset);
        if ($test !== false) {
            $data = $test;
        }
    } elseif (function_exists('unicode_encode') && $internal_lc === 'utf-8' && $input_lc !== 'utf-8' && $VALID_ENCODING) {
        $test = @unicode_encode($data, $input_charset);
        if ($test !== false) {
            $data = $test;
        }
    } elseif (function_exists('iconv') && $VALID_ENCODING && get_value('disable_iconv') !== '1') {
        // //TRANSLIT asks iconv to approximate characters the target charset cannot represent
        $test = @iconv($input_charset, $internal_charset . '//TRANSLIT', $data);
        if ($test !== false) {
            $data = $test;
        }
    } elseif (function_exists('mb_convert_encoding') && $VALID_ENCODING && get_value('disable_mbstring') !== '1') {
        // Only attempt the conversion if mbstring recognises the source charset (assumed so when it cannot be listed)
        if (function_exists('mb_list_encodings')) {
            $good_encoding = in_array($input_lc, array_map('strtolower', mb_list_encodings()), true);
        } else {
            $good_encoding = true;
        }
        if ($good_encoding) {
            $test = @mb_convert_encoding($data, $internal_charset, $input_charset);
            if ($test !== false) {
                $data = $test;
            }
        }
    } elseif ($input_lc === 'utf-8' && substr($internal_lc, 0, 3) !== 'utf') {
        // Imperfect as it assumes ISO-8859-1, but it's our last resort.
        $test = utf8_decode($data);
        if ($test !== false) {
            $data = $test;
        }
    } elseif ($internal_lc === 'utf-8' && substr($input_lc, 0, 3) !== 'utf') {
        // Imperfect as it assumes ISO-8859-1, but it's our last resort.
        $test = utf8_encode($data);
        if ($test !== false) {
            $data = $test;
        }
    }
    return $data;
}