/**
* Recode a string to UTF-8
*
* The conversion is attempted, in this order, with:
*  - iconv()
*  - mb_convert_encoding(), for a fixed list of encodings only
*  - recode_string()
*  - the transcoder functions shipped in includes/utf/data/recode_basic
*    and includes/utf/data/recode_cjk
*
* UTF-8 input, non-string values and strings that empty() considers empty
* ('' and '0') are returned untouched. If no converter is able to handle the
* encoding an E_USER_ERROR is triggered; the string is NOT silently returned
* as-is in that case.
*
* @param string $string Original string
* @param string $encoding Original encoding (lowered internally)
* @return string The string, encoded in UTF-8
*/
function utf8_recode($string, $encoding)
{
	$encoding = strtolower($encoding);

	if ($encoding == 'utf-8' || !is_string($string) || empty($string))
	{
		return $string;
	}

	// we force iso-8859-1 to be cp1252
	if ($encoding == 'iso-8859-1')
	{
		$encoding = 'cp1252';
	}
	// convert iso-8859-8-i to iso-8859-8
	else if ($encoding == 'iso-8859-8-i')
	{
		$encoding = 'iso-8859-8';
		$string = hebrev($string);
	}

	// First, try iconv()
	if (function_exists('iconv'))
	{
		$ret = @iconv($encoding, 'utf-8', $string);

		// Explicit comparison instead of empty(): a successful conversion whose
		// result is the string '0' must not be mistaken for a failure
		if (is_string($ret) && $ret !== '')
		{
			return $ret;
		}
	}

	// Try the mb_string extension
	if (function_exists('mb_convert_encoding'))
	{
		// mbstring is nasty on PHP4, we must make *sure* that we send a good encoding
		$mb_safe_encodings = array(
			'iso-8859-1',
			'iso-8859-2',
			'iso-8859-4',
			'iso-8859-7',
			'iso-8859-9',
			'iso-8859-15',
			'windows-1251',
			'windows-1252',
			'cp1252',
			'shift_jis',
			'euc-kr',
			'big5',
			'gb2312',
		);

		if (in_array($encoding, $mb_safe_encodings, true))
		{
			$ret = @mb_convert_encoding($string, 'utf-8', $encoding);

			if (is_string($ret) && $ret !== '')
			{
				return $ret;
			}
		}
	}

	// Try the recode extension
	if (function_exists('recode_string'))
	{
		$ret = @recode_string($encoding . '..utf-8', $string);

		if (is_string($ret) && $ret !== '')
		{
			return $ret;
		}
	}

	// If nothing works, check if we have a custom transcoder available
	if (!preg_match('#^[a-z0-9_ \\-]+$#', $encoding))
	{
		// Make sure the encoding name is alphanumeric, we don't want it to be abused into loading arbitrary files
		trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
	}

	// Name of the transcoder function to call, the data file (without extension)
	// that defines it and the label used in the "file is missing" message
	$transcoder = false;
	$data_file = '';
	$file_label = '';

	// iso-8859-* character encoding
	if (preg_match('/iso[_ -]?8859[_ -]?(\\d+)/', $encoding, $array))
	{
		switch ($array[1])
		{
			case '1':
			case '2':
			case '4':
			case '7':
			case '8':
			case '9':
			case '15':
				$transcoder = 'iso_8859_' . $array[1];
				$data_file = 'recode_basic';
				$file_label = 'Basic';
			break;

			default:
				trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
			break;
		}
	}

	// CP/WIN character encoding
	if ($transcoder === false && preg_match('/(?:cp|windows)[_\- ]?(\\d+)/', $encoding, $array))
	{
		switch ($array[1])
		{
			case '932':
				// cp932 is picked up by the SJIS pattern further down
			break;

			case '1250':
			case '1251':
			case '1252':
			case '1254':
			case '1255':
			case '1256':
			case '1257':
			case '874':
				$transcoder = 'cp' . $array[1];
				$data_file = 'recode_basic';
				$file_label = 'Basic';
			break;

			default:
				trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
			break;
		}
	}

	// TIS-620, SJIS, EUC_KR, BIG-5 and GB2312, tested in that order
	if ($transcoder === false)
	{
		$named_encodings = array(
			'/tis[_ -]?620/'											=> array('tis_620', 'recode_basic', 'Basic'),
			'/sjis(?:[_ -]?win)?|(?:cp|ibm)[_ -]?932|shift[_ -]?jis/'	=> array('sjis', 'recode_cjk', 'CJK'),
			'/euc[_ -]?kr/'												=> array('euc_kr', 'recode_cjk', 'CJK'),
			'/big[_ -]?5/'												=> array('big5', 'recode_cjk', 'CJK'),
			'/gb[_ -]?2312/'											=> array('gb2312', 'recode_cjk', 'CJK'),
		);

		foreach ($named_encodings as $pattern => $target)
		{
			if (preg_match($pattern, $encoding))
			{
				list($transcoder, $data_file, $file_label) = $target;
				break;
			}
		}
	}

	if ($transcoder !== false)
	{
		// The data file is only loaded when its functions are not defined yet
		if (!function_exists($transcoder))
		{
			$data_path = IP_ROOT_PATH . 'includes/utf/data/' . $data_file . '.' . PHP_EXT;

			if (!file_exists($data_path))
			{
				trigger_error($file_label . ' reencoder file is missing', E_USER_ERROR);
			}

			include($data_path);
		}

		return call_user_func($transcoder, $string);
	}

	// No converter knows this encoding: raise an error rather than hand back bad data
	trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
}
/**
* Recode a string to UTF-8
*
* The converter is chosen through $gym_sitemaps->rss_config['rss_charset_conv']:
* 'utf8_encode' forces utf8_encode(), 'auto' tries iconv(), mb_convert_encoding()
* and recode_string() in that order, and a converter name restricts the attempt
* to that converter. iso-8859-1 input always goes through utf8_encode().
* When none of them produces a result, the transcoder functions from
* includes/utf/data/recode_basic and includes/utf/data/recode_cjk (looked up
* below $gym_sitemaps->path_config['module_path']) are used.
*
* UTF-8 input, non-string values and empty strings are returned untouched.
* If no transcoder matches the encoding, the string is returned as-is.
*
* Note: $encoding used to declare a default of 'iso-8859-1'. Because it comes
* before the required $gym_sitemaps parameter that default could never be used,
* and PHP 8 deprecates such declarations, so it has been dropped. Callers passing
* all three arguments are unaffected.
*
* @param string $string Original string
* @param string $encoding Original encoding (lowered internally)
* @param object $gym_sitemaps Object providing rss_config, path_config and mx_sitemaps_message_die()
* @return string The string, encoded in UTF-8
*/
function utf8_recode($string, $encoding, $gym_sitemaps)
{
	$encoding = strtolower($encoding);

	if ($encoding == 'utf-8' || !is_string($string) || !isset($string[0]))
	{
		return $string;
	}

	$converter = $gym_sitemaps->rss_config['rss_charset_conv'];
	$auto = ($converter === 'auto');

	// start with something simple
	// NOTE(review): utf8_encode() is deprecated as of PHP 8.2
	if ($converter === 'utf8_encode' || $encoding == 'iso-8859-1')
	{
		return utf8_encode($string);
	}

	// First, try iconv()
	if (function_exists('iconv') && ($auto || $converter === 'iconv'))
	{
		$ret = @iconv($encoding, 'utf-8', $string);

		if (isset($ret[0]))
		{
			return $ret;
		}
	}

	// Try the mb_string extension
	// This branch used to be gated on the 'iconv' setting only, so explicitly
	// selecting mbstring never ran it. 'iconv' is still accepted so that the
	// existing "mbstring as fallback for iconv" behaviour does not change.
	// NOTE(review): 'mb_convert_encoding' is presumed to be the setting value, by
	// analogy with the other branches - confirm against the configuration code.
	// NOTE(review): on PHP 8 mb_convert_encoding() throws a ValueError for an
	// encoding it does not know, which @ does not silence.
	if (function_exists('mb_convert_encoding') && ($auto || $converter === 'mb_convert_encoding' || $converter === 'iconv'))
	{
		$ret = @mb_convert_encoding($string, 'utf-8', $encoding);

		if (isset($ret[0]))
		{
			return $ret;
		}
	}

	// Try the recode extension
	if (function_exists('recode_string') && ($auto || $converter === 'recode_string'))
	{
		$ret = @recode_string($encoding . '..utf-8', $string);

		if (isset($ret[0]))
		{
			return $ret;
		}
	}

	// If nothing works, check if we have a custom transcoder available
	if (!preg_match('#^[a-z0-9\\-]+$#', $encoding))
	{
		// Make sure the encoding name is alphanumeric, we don't want it to be abused into loading arbitrary files
		$gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, 'Unknown encoding: ' . $encoding);
	}

	global $phpEx;

	// Name of the transcoder function to call, the data file (without extension)
	// that defines it and the label used in the "file is missing" message
	$transcoder = false;
	$data_file = '';
	$file_label = '';

	// iso-8859-* character encoding
	if (preg_match('/iso[_ -]?8859[_ -]?(\\d+)/', $encoding, $array))
	{
		switch ($array[1])
		{
			case '1':
			case '2':
			case '4':
			case '7':
			case '9':
			case '15':
				$transcoder = 'iso_8859_' . $array[1];
				$data_file = 'recode_basic';
				$file_label = 'Basic';
			break;

			default:
				$gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, 'Unknown encoding: ' . $encoding);
			break;
		}
	}

	// CP/WIN character encoding
	if ($transcoder === false && preg_match('/(?:cp|windows)[_\\- ]?(\\d+)/', $encoding, $array))
	{
		switch ($array[1])
		{
			case '932':
				// cp932 is picked up by the SJIS pattern further down
			break;

			case '1250':
			case '1251':
			case '1254':
			case '1255':
			case '1256':
			case '1257':
			case '874':
				$transcoder = 'cp' . $array[1];
				$data_file = 'recode_basic';
				$file_label = 'Basic';
			break;

			default:
				$gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, 'Unknown encoding: ' . $encoding);
			break;
		}
	}

	// TIS-620, SJIS, EUC_KR, BIG-5 and GB2312, tested in that order
	if ($transcoder === false)
	{
		$named_encodings = array(
			'/tis[_ -]?620/'											=> array('tis_620', 'recode_basic', 'Basic'),
			'/sjis(?:[_ -]?win)?|(?:cp|ibm)[_ -]?932|shift[_ -]?jis/'	=> array('sjis', 'recode_cjk', 'CJK'),
			'/euc[_ -]?kr/'												=> array('euc_kr', 'recode_cjk', 'CJK'),
			'/big[_ -]?5/'												=> array('big5', 'recode_cjk', 'CJK'),
			'/gb[_ -]?2312/'											=> array('gb2312', 'recode_cjk', 'CJK'),
		);

		foreach ($named_encodings as $pattern => $target)
		{
			if (preg_match($pattern, $encoding))
			{
				list($transcoder, $data_file, $file_label) = $target;
				break;
			}
		}
	}

	if ($transcoder !== false)
	{
		// The data file is only loaded when its functions are not defined yet
		if (!function_exists($transcoder))
		{
			$data_path = $gym_sitemaps->path_config['module_path'] . 'includes/utf/data/' . $data_file . '.' . $phpEx;

			if (!file_exists($data_path))
			{
				$gym_sitemaps->mx_sitemaps_message_die(GENERAL_MESSAGE, $file_label . ' reencoder file is missing');
			}

			include $data_path;
		}

		return call_user_func($transcoder, $string);
	}

	// Unknown encoding: no error is raised here, the input is handed back unchanged
	return $string;
}