Example #1
0
/**
* Recode a string to UTF-8
*
* Conversion is attempted with iconv(), then mbstring (for a whitelist of
* encodings), then the recode extension, and finally with the transcoder
* functions loaded from includes/utf/data/recode_basic or recode_cjk.
*
* The string is returned untouched only when the encoding is already UTF-8,
* or when $string is not a string or is empty(). An encoding that no backend
* handles raises an E_USER_ERROR ("Unknown encoding: ...").
*
* @param	string	$string		Original string
* @param	string	$encoding	Original encoding (lowered internally)
* @return	string				The string, encoded in UTF-8
*/
function utf8_recode($string, $encoding)
{
	$encoding = strtolower($encoding);

	// Note: empty() also short-circuits the string '0', which is returned unchanged
	if ($encoding == 'utf-8' || !is_string($string) || empty($string))
	{
		return $string;
	}

	// we force iso-8859-1 to be cp1252
	if ($encoding == 'iso-8859-1')
	{
		$encoding = 'cp1252';
	}
	// convert iso-8859-8-i to iso-8859-8
	else if ($encoding == 'iso-8859-8-i')
	{
		// NOTE(review): hebrev() turns logical Hebrew into visual order - confirm this is the intended direction
		$encoding = 'iso-8859-8';
		$string = hebrev($string);
	}

	// A backend result counts as a success when it is a non-empty string.
	// Do not use empty() here: a valid conversion can legitimately yield '0'.

	// First, try iconv()
	if (function_exists('iconv'))
	{
		$ret = @iconv($encoding, 'utf-8', $string);

		if (is_string($ret) && $ret !== '')
		{
			return $ret;
		}
	}

	// Try the mb_string extension
	if (function_exists('mb_convert_encoding'))
	{
		// mbstring is nasty on PHP4, we must make *sure* that we send a good encoding
		// (iso-8859-1 is not listed: it has already been mapped to cp1252 above)
		switch ($encoding)
		{
			case 'iso-8859-2':
			case 'iso-8859-4':
			case 'iso-8859-7':
			case 'iso-8859-9':
			case 'iso-8859-15':
			case 'windows-1251':
			case 'windows-1252':
			case 'cp1252':
			case 'shift_jis':
			case 'euc-kr':
			case 'big5':
			case 'gb2312':
				$ret = @mb_convert_encoding($string, 'utf-8', $encoding);

				if (is_string($ret) && $ret !== '')
				{
					return $ret;
				}
			break;
		}
	}

	// Try the recode extension
	if (function_exists('recode_string'))
	{
		$ret = @recode_string($encoding . '..utf-8', $string);

		if (is_string($ret) && $ret !== '')
		{
			return $ret;
		}
	}

	// If nothing works, check if we have a custom transcoder available
	if (!preg_match('#^[a-z0-9_ \\-]+$#', $encoding))
	{
		// Make sure the encoding name is alphanumeric, we don't want it to be abused into loading arbitrary files
		trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
	}

	// Name of the transcoder function to call and of the data file defining it.
	// Both stay empty until one of the checks below recognises the encoding.
	$transcoder = '';
	$data_file = '';

	// iso-8859-* character encoding
	if (preg_match('/iso[_ -]?8859[_ -]?(\\d+)/', $encoding, $array))
	{
		switch ($array[1])
		{
			case '1':
			case '2':
			case '4':
			case '7':
			case '8':
			case '9':
			case '15':
				$transcoder = 'iso_8859_' . $array[1];
				$data_file = 'recode_basic';
			break;

			default:
				trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
			break;
		}
	}

	// CP/WIN character encoding
	if ($transcoder === '' && preg_match('/(?:cp|windows)[_\- ]?(\\d+)/', $encoding, $array))
	{
		switch ($array[1])
		{
			case '932':
				// Code page 932 is picked up by the SJIS pattern below
			break;

			case '1250':
			case '1251':
			case '1252':
			case '1254':
			case '1255':
			case '1256':
			case '1257':
			case '874':
				$transcoder = 'cp' . $array[1];
				$data_file = 'recode_basic';
			break;

			default:
				trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
			break;
		}
	}

	// Remaining encodings, checked in this order: TIS-620, SJIS, EUC_KR, BIG-5, GB2312
	if ($transcoder === '')
	{
		$named_encodings = array(
			'/tis[_ -]?620/'	=> array('tis_620', 'recode_basic'),
			'/sjis(?:[_ -]?win)?|(?:cp|ibm)[_ -]?932|shift[_ -]?jis/'	=> array('sjis', 'recode_cjk'),
			'/euc[_ -]?kr/'		=> array('euc_kr', 'recode_cjk'),
			'/big[_ -]?5/'		=> array('big5', 'recode_cjk'),
			'/gb[_ -]?2312/'	=> array('gb2312', 'recode_cjk'),
		);

		foreach ($named_encodings as $pattern => $target)
		{
			if (preg_match($pattern, $encoding))
			{
				$transcoder = $target[0];
				$data_file = $target[1];
				break;
			}
		}
	}

	if ($transcoder === '')
	{
		// No backend and no custom transcoder knows this encoding
		trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
		return;
	}

	// Load the file defining the transcoder the first time it is needed
	if (!function_exists($transcoder))
	{
		$data_path = IP_ROOT_PATH . 'includes/utf/data/' . $data_file . '.' . PHP_EXT;

		if (!file_exists($data_path))
		{
			trigger_error((($data_file === 'recode_cjk') ? 'CJK' : 'Basic') . ' reencoder file is missing', E_USER_ERROR);
		}
		include($data_path);
	}

	return $transcoder($string);
}
Example #2
0
/**
* Recode a string to UTF-8
*
* The backend is selected by $gym_sitemaps->rss_config['rss_charset_conv']:
* 'utf8_encode' forces utf8_encode(), 'iconv' and 'recode_string' enable the
* matching function, and 'auto' tries iconv(), mbstring and recode in turn.
* iso-8859-1 input always goes through utf8_encode(). When no backend
* produces a result, the transcoder functions from the module's
* includes/utf/data/recode_basic or recode_cjk file are used.
*
* If no transcoder pattern matches the encoding, the string is returned as-is.
* An invalid encoding name, an unsupported iso-8859-* / cp* number or a
* missing transcoder file is reported through
* $gym_sitemaps->mx_sitemaps_message_die().
*
* $encoding used to declare a default value, but it could never be omitted
* because the required $gym_sitemaps follows it. The default was dropped to
* avoid the PHP 8 "optional parameter before required parameter" deprecation.
*
* @param	string	$string			Original string
* @param	string	$encoding		Original encoding (lowered internally)
* @param	object	$gym_sitemaps	Object providing rss_config, path_config and mx_sitemaps_message_die()
* @return	string					The string, encoded in UTF-8
*/
function utf8_recode($string, $encoding, $gym_sitemaps)
{
    $encoding = strtolower($encoding);
    if ($encoding == 'utf-8' || !is_string($string) || !isset($string[0])) {
        return $string;
    }
    $method = $gym_sitemaps->rss_config['rss_charset_conv'];
    $auto = $method === 'auto';
    // start with something simple
    // NOTE(review): utf8_encode() is deprecated as of PHP 8.2
    if ($method === 'utf8_encode' || $encoding == 'iso-8859-1') {
        return utf8_encode($string);
    }
    // First, try iconv()
    if (function_exists('iconv') && ($auto || $method === 'iconv')) {
        $ret = @iconv($encoding, 'utf-8', $string);
        if (isset($ret[0])) {
            return $ret;
        }
    }
    // Try the mb_string extension
    // NOTE(review): this branch was only enabled by 'auto' and 'iconv', which looks like a
    // copy/paste slip; 'mb_convert_encoding' is accepted as well, by analogy with the other
    // branches - confirm the setting name against the configuration code.
    if (function_exists('mb_convert_encoding') && ($auto || $method === 'iconv' || $method === 'mb_convert_encoding')) {
        try {
            $ret = @mb_convert_encoding($string, 'utf-8', $encoding);
        } catch (ValueError $e) {
            // PHP 8 throws on an encoding name mbstring does not know; fall through to the next backend
            $ret = false;
        }
        if (isset($ret[0])) {
            return $ret;
        }
    }
    // Try the recode extension
    if (function_exists('recode_string') && ($auto || $method === 'recode_string')) {
        $ret = @recode_string($encoding . '..utf-8', $string);
        if (isset($ret[0])) {
            return $ret;
        }
    }
    // If nothing works, check if we have a custom transcoder available
    // Underscores and spaces are allowed because the patterns below accept them
    // (shift_jis, euc_kr, iso_8859_2, ...); the name is never used to build a file path.
    if (!preg_match('#^[a-z0-9_ \\-]+$#', $encoding)) {
        // Make sure the encoding name is alphanumeric, we don't want it to be abused into loading arbitrary files
        $gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, 'Unknown encoding: ' . $encoding);
    }
    global $phpEx;
    // Name of the transcoder function to call and of the data file defining it.
    // Both stay empty until one of the checks below recognises the encoding.
    $transcoder = '';
    $data_file = '';
    // iso-8859-* character encoding
    if (preg_match('/iso[_ -]?8859[_ -]?(\\d+)/', $encoding, $array)) {
        switch ($array[1]) {
            case '1':
            case '2':
            case '4':
            case '7':
            case '9':
            case '15':
                $transcoder = 'iso_8859_' . $array[1];
                $data_file = 'recode_basic';
                break;
            default:
                $gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, 'Unknown encoding: ' . $encoding);
                break;
        }
    }
    // CP/WIN character encoding
    if ($transcoder === '' && preg_match('/(?:cp|windows)[_\\- ]?(\\d+)/', $encoding, $array)) {
        switch ($array[1]) {
            case '932':
                // Code page 932 is picked up by the SJIS pattern below
                break;
            case '1250':
            case '1251':
            case '1254':
            case '1255':
            case '1256':
            case '1257':
            case '874':
                $transcoder = 'cp' . $array[1];
                $data_file = 'recode_basic';
                break;
            default:
                $gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, 'Unknown encoding: ' . $encoding);
                break;
        }
    }
    // Remaining encodings, checked in this order: TIS-620, SJIS, EUC_KR, BIG-5, GB2312
    if ($transcoder === '') {
        $named_encodings = array(
            '/tis[_ -]?620/' => array('tis_620', 'recode_basic'),
            '/sjis(?:[_ -]?win)?|(?:cp|ibm)[_ -]?932|shift[_ -]?jis/' => array('sjis', 'recode_cjk'),
            '/euc[_ -]?kr/' => array('euc_kr', 'recode_cjk'),
            '/big[_ -]?5/' => array('big5', 'recode_cjk'),
            '/gb[_ -]?2312/' => array('gb2312', 'recode_cjk'),
        );
        foreach ($named_encodings as $pattern => $target) {
            if (preg_match($pattern, $encoding)) {
                $transcoder = $target[0];
                $data_file = $target[1];
                break;
            }
        }
    }
    // Nothing matched: give the data back untouched
    if ($transcoder === '') {
        return $string;
    }
    // Load the file defining the transcoder the first time it is needed
    if (!function_exists($transcoder)) {
        $data_path = $gym_sitemaps->path_config['module_path'] . 'includes/utf/data/' . $data_file . '.' . $phpEx;
        if (!file_exists($data_path)) {
            $gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, ($data_file === 'recode_cjk' ? 'CJK' : 'Basic') . ' reencoder file is missing');
        }
        include $data_path;
    }
    return $transcoder($string);
}