Lossless (character-wise) conversion of HTML to ASCII
public static convertToASCIIDumbLossless ( string $str ) : string | ||
$str | string | UTF-8 string to be converted to ASCII |
return | string | ASCII-encoded string with non-ASCII characters entity-ized |
/**
 * Re-encodes a UTF-8 string into the output encoding selected by the
 * 'Core' / 'Encoding' configuration directive.
 *
 * @note Currently, this is a lossy conversion, with unexpressable
 *       characters being omitted.
 *
 * @param string $str     UTF-8 input.
 * @param object $config  Configuration object; queried through get() for
 *                        Core/Encoding, Core/EscapeNonASCIICharacters and
 *                        Test/ForceNoIconv.
 * @param mixed  $context Accepted for interface compatibility; not read here.
 * @return mixed The input unchanged when the target is 'utf-8'; otherwise
 *               whatever iconv() or utf8_decode() returns. Nothing is
 *               returned explicitly when the encoding is unsupported.
 */
public static function convertFromUTF8($str, $config, $context)
{
    // Probe for the iconv extension once and remember the answer across calls.
    static $hasIconv = null;
    if (null === $hasIconv) {
        $hasIconv = function_exists('iconv');
    }

    $target = $config->get('Core', 'Encoding');
    if ('utf-8' === $target) {
        // Already in the requested encoding.
        return $str;
    }

    // Optionally entity-ize non-ASCII characters before converting.
    if ($config->get('Core', 'EscapeNonASCIICharacters')) {
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }

    // The ForceNoIconv directive is only consulted when iconv exists.
    $useIconv = $hasIconv && !$config->get('Test', 'ForceNoIconv');
    if ($useIconv) {
        return @iconv('utf-8', $target . '//IGNORE', $str);
    }

    // Without iconv, only ISO-8859-1 can be produced.
    if ('iso-8859-1' === $target) {
        return @utf8_decode($str);
    }

    trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
 * Converts a string from UTF-8 to the encoding named by the Core.Encoding
 * configuration directive.
 *
 * @note Currently, this is a lossy conversion, with unexpressable
 *       characters being omitted.
 *
 * @param string $str     UTF-8 string to convert.
 * @param object $config  Configuration object; queried through get() for
 *                        Core.Encoding, Core.EscapeNonASCIICharacters and
 *                        Test.ForceNoIconv.
 * @param mixed  $context Accepted for interface compatibility; not read here.
 * @return mixed The input unchanged when the target is 'utf-8'; otherwise
 *               the result of iconv() or utf8_decode(). Nothing is returned
 *               explicitly when the encoding is unsupported (an
 *               E_USER_ERROR is raised instead).
 */
public static function convertFromUTF8($str, $config, $context)
{
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') {
        return $str;
    }

    // Probe for the iconv extension once and cache the answer across calls.
    static $iconv = null;
    if ($iconv === null) {
        $iconv = function_exists('iconv');
    }

    // $escape is reused below to decide how to treat the ASCII fix-up table.
    if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }

    // Route errors raised during conversion to the encoder's own handler.
    // Every exit path below must pop it again with restore_error_handler().
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));

    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // Undo our previous fix in convertToUTF8, otherwise iconv will barf
        $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
        if (!$escape && !empty($ascii_fix)) {
            // Not escaping: drop the UTF-8 characters listed as keys of the
            // fix-up table before the table is applied in reverse.
            $clear_fix = array();
            foreach ($ascii_fix as $utf8 => $native) {
                $clear_fix[$utf8] = '';
            }
            $str = strtr($str, $clear_fix);
        }
        $str = strtr($str, array_flip($ascii_fix));
        // Normal stuff
        $str = iconv('utf-8', $encoding . '//IGNORE', $str);
        restore_error_handler();
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_decode($str);
        restore_error_handler();
        return $str;
    }

    // Pop the handler before reporting the failure. Previously it was left
    // installed on this path, so it stayed active after the call and also
    // intercepted the error raised on the next line.
    restore_error_handler();
    trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
 * Checks that convertToASCIIDumbLossless() leaves ASCII untouched and
 * replaces each non-ASCII character with a decimal numeric character
 * reference.
 *
 * Inputs are spelled as byte escapes and expected values as plain ASCII so
 * the test does not depend on the encoding this file is saved in.
 */
public function test_convertToASCIIDumbLossless()
{
    // Uppercase thorn letter: U+00DE, UTF-8 bytes C3 9E, code point 222.
    $this->assertIdentical(
        HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
        "&#222;orn"
    );

    // Pure ASCII passes through unchanged.
    $this->assertIdentical(
        HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
        "an"
    );

    // test up to four bytes: U+E0020, UTF-8 bytes F3 A0 80 A0,
    // code point 917536.
    $this->assertIdentical(
        HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
        "&#917536;"
    );
}