/**
 * Callback for substituteNonSpecialEntities(): turns one matched entity
 * into its replacement text.
 *
 * Numeric entities are converted with HTMLPurifier_Encoder::unichr(); named
 * entities are resolved through the HTMLPurifier_EntityLookup table, which
 * is fetched on first use. Entities registered in $this->_special_dec2str or
 * $this->_special_ent2dec, and names missing from the table, are handed back
 * exactly as they were matched.
 *
 * @warning Though this is public in order to let the callback happen,
 *          calling it directly is not recommended.
 * @param $matches PCRE matches array, with 0 the entire match, and
 *                 either index 1, 2 or 3 set with a hex value, dec value,
 *                 or string (respectively).
 * @returns Replacement string.
 */
function nonSpecialEntityCallback($matches)
{
    // replaces all but big five
    $original = $matches[0];

    if (@$matches[0][1] === '#') {
        // numeric reference: an "x" in third position marks hexadecimal
        if (@$original[2] === 'x') {
            $codepoint = hexdec($matches[1]);
        } else {
            $codepoint = (int) $matches[2];
        }

        // abort for special characters
        if (isset($this->_special_dec2str[$codepoint])) {
            return $original;
        }

        return HTMLPurifier_Encoder::unichr($codepoint);
    }

    // named reference
    $name = $matches[3];
    if (isset($this->_special_ent2dec[$name])) {
        return $original;
    }

    if (!$this->_entity_lookup) {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }

    if (!isset($this->_entity_lookup->table[$name])) {
        return $original;
    }

    return $this->_entity_lookup->table[$name];
}
/**
 * Calculates the result of this requirement.
 *
 * Yields RequirementResult::Failed when the iconv() function does not exist,
 * RequirementResult::Warning when HTMLPurifier_Encoder::testIconvTruncateBug()
 * reports anything other than ICONV_OK, and RequirementResult::Success
 * otherwise.
 *
 * @return string
 */
protected function calculateResult()
{
    if (!function_exists('iconv')) {
        return RequirementResult::Failed;
    }

    // See if it's the buggy version
    $iconvBehaves = \HTMLPurifier_Encoder::testIconvTruncateBug() == \HTMLPurifier_Encoder::ICONV_OK;

    return $iconvBehaves ? RequirementResult::Success : RequirementResult::Warning;
}
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * The target encoding is read from %Core.Encoding. When it is 'utf-8' the
 * string is returned untouched. Otherwise the string is optionally passed
 * through convertToASCIIDumbLossless() (%Core.EscapeNonASCIICharacters) and
 * then converted with iconv when available (and not disabled through
 * %Test.ForceNoIconv), or with utf8_decode() for 'iso-8859-1'.
 *
 * @note Currently, this is a lossy conversion, with unexpressable
 *       characters being omitted.
 * @param string $str     UTF-8 string to convert
 * @param object $config  Configuration object exposing get()
 * @param object $context Context object (not used by this method)
 * @return string|null Converted string; nothing is returned after the
 *                     'Encoding not supported' error has been raised.
 */
public static function convertFromUTF8($str, $config, $context)
{
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') {
        return $str;
    }

    static $iconv = null;
    if ($iconv === null) {
        $iconv = function_exists('iconv');
    }

    if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }

    // Conversion runs under a dedicated error handler; every exit path
    // below must restore the previous handler.
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));

    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // Undo our previous fix in convertToUTF8, otherwise iconv will barf
        $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
        if (!$escape && !empty($ascii_fix)) {
            // Without escaping, the characters listed in the fix table are
            // stripped from the string before conversion.
            $clear_fix = array();
            foreach ($ascii_fix as $utf8 => $native) {
                $clear_fix[$utf8] = '';
            }
            $str = strtr($str, $clear_fix);
        }
        $str = strtr($str, array_flip($ascii_fix));

        // Normal stuff
        $str = iconv('utf-8', $encoding . '//IGNORE', $str);
        restore_error_handler();
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_decode($str);
        restore_error_handler();
        return $str;
    }

    // Put the previous handler back BEFORE reporting the failure: otherwise
    // the handler installed above stays active for the rest of the request
    // and is also the one that receives this error.
    restore_error_handler();
    trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * A 'utf-8' target (Core/Encoding) returns the input as is. Otherwise the
 * string is optionally run through convertToASCIIDumbLossless()
 * (Core/EscapeNonASCIICharacters) and converted either with iconv, unless
 * iconv is missing or Test/ForceNoIconv is set, or with utf8_decode() when
 * the target is 'iso-8859-1'. Any other situation raises an E_USER_ERROR.
 *
 * @note Currently, this is a lossy conversion, with unexpressable
 *       characters being omitted.
 */
public static function convertFromUTF8($str, $config, $context)
{
    // availability of iconv is probed once and remembered
    static $iconv = null;
    if ($iconv === null) {
        $iconv = function_exists('iconv');
    }

    $target = $config->get('Core', 'Encoding');
    if ($target === 'utf-8') {
        return $str;
    }

    if ($config->get('Core', 'EscapeNonASCIICharacters')) {
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }

    $useIconv = $iconv && !$config->get('Test', 'ForceNoIconv');
    if ($useIconv) {
        return @iconv('utf-8', $target . '//IGNORE', $str);
    }

    if ($target === 'iso-8859-1') {
        return @utf8_decode($str);
    }

    trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
 * Validates a comma-separated list of font families.
 *
 * Each entry is trimmed; empty entries are dropped. Generic family names
 * pass through unchanged. Quoted entries must be longer than two characters
 * and end with the quote they start with, otherwise they are dropped; their
 * backslash escapes are expanded. Purely alphanumeric names are emitted bare,
 * everything else is emitted inside single quotes with backslashes and
 * single quotes escaped.
 *
 * @param string $string  Raw font list
 * @param mixed  $config  Not used by this method
 * @param mixed  $context Not used by this method
 * @return string|false Normalised list joined with ', ', or false when no
 *                      entry survives.
 */
public function validate($string, $config, $context)
{
    static $generic_names = array(
        'serif' => true,
        'sans-serif' => true,
        'monospace' => true,
        'fantasy' => true,
        'cursive' => true
    );

    $accepted = array();

    // assume that no font names contain commas in them
    foreach (explode(',', $string) as $candidate) {
        $candidate = trim($candidate);
        if ($candidate === '') {
            continue;
        }

        // match a generic name
        if (isset($generic_names[$candidate])) {
            $accepted[] = $candidate;
            continue;
        }

        // match a quoted name
        $opener = $candidate[0];
        if ($opener === '"' || $opener === "'") {
            $length = strlen($candidate);
            if ($length <= 2) {
                continue;
            }
            if ($candidate[$length - 1] !== $opener) {
                continue;
            }

            $inner = substr($candidate, 1, $length - 2);
            $innerLength = strlen($inner);
            $decoded = '';

            for ($pos = 0; $pos < $innerLength; $pos++) {
                if ($inner[$pos] !== '\\') {
                    $decoded .= $inner[$pos];
                    continue;
                }

                // step onto the character following the backslash
                $pos++;
                if ($pos >= $innerLength) {
                    // lone trailing backslash is kept literally
                    $decoded .= '\\';
                    break;
                }

                if (!ctype_xdigit($inner[$pos])) {
                    // escaped newline vanishes, any other character is
                    // taken literally
                    if ($inner[$pos] !== "\n") {
                        $decoded .= $inner[$pos];
                    }
                    continue;
                }

                // hexadecimal escape: collect up to six digits
                $hex = $inner[$pos];
                $pos++;
                for ($digits = 1; $pos < $innerLength && $digits < 6; $pos++, $digits++) {
                    if (!ctype_xdigit($inner[$pos])) {
                        break;
                    }
                    $hex .= $inner[$pos];
                }

                // We have to be extremely careful when adding
                // new characters, to make sure we're not breaking
                // the encoding.
                $char = HTMLPurifier_Encoder::unichr(hexdec($hex));
                if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
                    continue;
                }
                $decoded .= $char;

                // a single whitespace character after the escape is
                // swallowed; anything else must be re-read by the loop
                if ($pos < $innerLength && trim($inner[$pos]) !== '') {
                    $pos--;
                }
            }

            $candidate = $decoded;
        }

        // $candidate is a pure representation of the font name
        if (ctype_alnum($candidate) && $candidate !== '') {
            // very simple font, allow it in unharmed
            $accepted[] = $candidate;
            continue;
        }

        // complicated font, requires quoting
        // armor single quotes and new lines
        $armored = str_replace("'", "\\'", str_replace("\\", "\\\\", $candidate));
        $accepted[] = "'{$armored}'";
    }

    $final = implode(', ', $accepted);

    return $final === '' ? false : $final;
}
/**
 * Filters an HTML snippet/document to be XSS-free and standards-compliant.
 *
 * Steps, in order: resolve the configuration, create lexer and context,
 * register a fresh generator (plus locale and error collector when
 * %Core.CollectErrors is on) and an ID accumulator, convert the input to
 * UTF-8, run the pre-filters, tokenize / apply the strategy / regenerate,
 * run the post-filters in reverse order, and convert back from UTF-8.
 * The context used is kept in $this->context.
 *
 * @param $html String of HTML to purify
 * @param $config HTMLPurifier_Config object for this operation, if omitted,
 *                defaults to the config object specified during this
 *                object's construction. The parameter can also be any type
 *                that HTMLPurifier_Config::create() supports.
 * @return Purified HTML
 */
public function purify($html, $config = null)
{
    // :TODO: make the config merge in, instead of replace
    if ($config) {
        $config = HTMLPurifier_Config::create($config);
    } else {
        $config = $this->config;
    }

    // implementation is partially environment dependant, partially
    // configuration dependant
    $lexer = HTMLPurifier_Lexer::create($config);

    $context = new HTMLPurifier_Context();

    // setup HTML generator
    $this->generator = new HTMLPurifier_Generator($config, $context);
    $context->register('Generator', $this->generator);

    // set up global context variables
    if ($config->get('Core.CollectErrors')) {
        // may get moved out if other facilities use it
        $language_factory = HTMLPurifier_LanguageFactory::instance();
        $language = $language_factory->create($config, $context);
        $context->register('Locale', $language);

        $error_collector = new HTMLPurifier_ErrorCollector($context);
        $context->register('ErrorCollector', $error_collector);
    }

    // setup id_accumulator context, necessary due to the fact that
    // AttrValidator can be called from many places
    $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
    $context->register('IDAccumulator', $id_accumulator);

    $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

    // setup filters
    $flags = $config->getBatch('Filter');
    $custom = $flags['Custom'];
    unset($flags['Custom']);

    $filters = array();
    foreach ($flags as $name => $enabled) {
        // skip disabled flags and namespaced (dotted) directives
        if (!$enabled || strpos($name, '.') !== false) {
            continue;
        }
        $class = "HTMLPurifier_Filter_{$name}";
        $filters[] = new $class();
    }
    foreach ($custom as $custom_filter) {
        // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
        $filters[] = $custom_filter;
    }
    $filters = array_merge($filters, $this->filters);
    // maybe prepare(), but later

    $filter_size = count($filters);
    for ($n = 0; $n < $filter_size; $n++) {
        $html = $filters[$n]->preFilter($html, $config, $context);
    }

    // purified HTML
    $tokens = $lexer->tokenizeHTML($html, $config, $context);
    $tokens = $this->strategy->execute($tokens, $config, $context);
    $html = $this->generator->generateFromTokens($tokens);

    // post-filters run in the opposite order of the pre-filters
    for ($n = $filter_size - 1; $n >= 0; $n--) {
        $html = $filters[$n]->postFilter($html, $config, $context);
    }

    $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
    $this->context =& $context;

    return $html;
}
/**
 * Escapes a string for HTML output.
 *
 * The string is first passed through HTMLPurifier_Encoder::cleanUTF8() and
 * then through htmlspecialchars() with ENT_COMPAT and the UTF-8 charset.
 *
 * @param string $string Text to escape
 * @return string Escaped text
 */
function escapeHTML($string)
{
    $clean = HTMLPurifier_Encoder::cleanUTF8($string);

    return htmlspecialchars($clean, ENT_COMPAT, 'UTF-8');
}
/**
 * Builds a word-frequency index from a piece of text.
 *
 * Processing steps: optional re-encoding to UTF-8 through iconv, optional
 * clean-up through HTMLPurifier_Encoder::cleanUTF8() when that class can be
 * loaded, decoding of numeric HTML entities (decimal and hexadecimal) when
 * mbstring is available, lower-casing, replacement of punctuation,
 * separators and symbols by spaces, and finally counting of every token
 * made only of letters and digits.
 *
 * @param string $data Text to index
 * @return array Map of lower-cased word => number of occurrences; an empty
 *               array when no word qualifies. Returned by reference.
 */
function &search_index($data)
{
    // Initialise the result up front so that a defined array is returned
    // even when the text contains no indexable word.
    $words = array();

    // Be sure we will parse UTF-8 data
    if (function_exists('mb_check_encoding') && function_exists('iconv') && function_exists('mb_detect_encoding') && mb_check_encoding($data, 'UTF-8')) {
        $data = iconv(mb_detect_encoding($data), 'UTF-8//TRANSLIT', $data);
    }

    // Clean the UTF-8 string using HTML Purifier.
    // The library is optional (see the class_exists() guard), so a missing
    // file must not abort the script: include_once instead of require_once.
    @(include_once 'lib/HTMLPurifier.auto.php');
    @(include_once 'HTMLPurifier/Encoder.php');
    if (class_exists('HTMLPurifier_Encoder')) {
        // called statically; the encoder is not meant to be instantiated
        $data = HTMLPurifier_Encoder::cleanUTF8($data);
    }

    // Remove remaining HTML numeric entities
    if (function_exists('mb_decode_numericentity')) {
        if (!function_exists('utf8_entity_decode')) {
            function utf8_entity_decode($entity)
            {
                $convmap = array(0x0, 0x10000, 0, 0xfffff);
                return mb_decode_numericentity($entity, $convmap, 'UTF-8');
            }
        }

        // preg_replace_callback() replaces the former /e modifier, which no
        // longer exists in PHP 7+ (preg_replace() would return NULL there).
        $data = preg_replace_callback(
            '/&#\\d{2,5};/u',
            function ($match) {
                return utf8_entity_decode($match[0]);
            },
            $data
        );
        // Hexadecimal entities are rewritten as decimal ones before decoding;
        // the digit class covers the full 0-9 range.
        $data = preg_replace_callback(
            '/&#x([a-fA-F0-9]{2,8});/u',
            function ($match) {
                return utf8_entity_decode('&#' . hexdec($match[1]) . ';');
            },
            $data
        );
    }

    // Lowerize
    $data = function_exists('mb_convert_case') ? mb_convert_case($data, MB_CASE_LOWER, 'UTF-8') : strtolower($data);

    // Convert punctuations to spaces
    $data = preg_replace('/[\\pP\\pZ\\pS]/u', ' ', $data);

    if ($data != '') {
        // Split into words (do NOT use the split function that doesn't correctly handle some characters !)
        $sstrings = preg_split('/\\s+/u', $data, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($sstrings as $value) {
            // Keep only alpha-num words
            if (preg_match('/^[\\pL\\pN]+$/u', $value)) {
                if (isset($words[$value])) {
                    $words[$value]++; // count words
                } else {
                    $words[$value] = 1;
                }
            }
        }
    }

    return $words;
}
/**
 * Parses a possibly escaped CSS string and returns the "pure"
 * version of it.
 *
 * Handling of a backslash depends on what follows it:
 *  - nothing (end of string): the backslash is kept literally;
 *  - a hexadecimal digit: up to six digits are read and converted with
 *    HTMLPurifier_Encoder::unichr(); the result is dropped when cleanUTF8()
 *    reduces it to an empty string; one whitespace character directly after
 *    the digits is consumed;
 *  - a newline: both characters are dropped;
 *  - anything else: the following character is taken literally.
 *
 * @param string $string CSS string, possibly containing escapes
 * @return string Expanded string
 */
protected function expandCSSEscape($string)
{
    // flexibly parse it
    $length = strlen($string);
    $expanded = '';

    for ($pos = 0; $pos < $length; $pos++) {
        if ($string[$pos] !== '\\') {
            $expanded .= $string[$pos];
            continue;
        }

        // step onto the character following the backslash
        $pos++;
        if ($pos >= $length) {
            $expanded .= '\\';
            break;
        }

        if (!ctype_xdigit($string[$pos])) {
            if ($string[$pos] !== "\n") {
                $expanded .= $string[$pos];
            }
            continue;
        }

        $hex = $string[$pos];
        $pos++;
        for ($digits = 1; $pos < $length && $digits < 6; $pos++, $digits++) {
            if (!ctype_xdigit($string[$pos])) {
                break;
            }
            $hex .= $string[$pos];
        }

        // We have to be extremely careful when adding
        // new characters, to make sure we're not breaking
        // the encoding.
        $char = HTMLPurifier_Encoder::unichr(hexdec($hex));
        if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
            continue;
        }
        $expanded .= $char;

        // anything but whitespace after the digits has to be re-read by
        // the next loop iteration
        if ($pos < $length && trim($string[$pos]) !== '') {
            $pos--;
        }
    }

    return $expanded;
}
/**
 * Filters an HTML snippet/document to be XSS-free and standards-compliant.
 *
 * The input is converted to UTF-8, passed through the pre-filters of
 * $this->filters, tokenized, run through $this->strategy, regenerated,
 * passed through the post-filters in reverse order and converted back from
 * UTF-8. The context used is kept in $this->context.
 *
 * @param $html String of HTML to purify
 * @param $config HTMLPurifier_Config object for this operation, if omitted,
 *                defaults to the config object specified during this
 *                object's construction. The parameter can also be any type
 *                that HTMLPurifier_Config::create() supports.
 * @return Purified HTML
 */
function purify($html, $config = null)
{
    if ($config) {
        $config = HTMLPurifier_Config::create($config);
    } else {
        $config = $this->config;
    }

    // implementation is partially environment dependant, partially
    // configuration dependant
    $lexer = HTMLPurifier_Lexer::create($config);

    $context = new HTMLPurifier_Context();

    // our friendly neighborhood generator, all primed with configuration too!
    $this->generator->generateFromTokens(array(), $config, $context);
    $context->register('Generator', $this->generator);

    // set up global context variables
    if ($config->get('Core', 'CollectErrors')) {
        // may get moved out if other facilities use it
        $language_factory = HTMLPurifier_LanguageFactory::instance();
        $language = $language_factory->create($config, $context);
        $context->register('Locale', $language);

        $error_collector = new HTMLPurifier_ErrorCollector($context);
        $context->register('ErrorCollector', $error_collector);
    }

    $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

    $filter_count = count($this->filters);
    for ($n = 0; $n < $filter_count; $n++) {
        $html = $this->filters[$n]->preFilter($html, $config, $context);
    }

    // purified HTML
    $tokens = $lexer->tokenizeHTML($html, $config, $context);
    $tokens = $this->strategy->execute($tokens, $config, $context);
    $html = $this->generator->generateFromTokens($tokens, $config, $context);

    // post-filters run in the opposite order of the pre-filters
    for ($n = $filter_count - 1; $n >= 0; $n--) {
        $html = $this->filters[$n]->postFilter($html, $config, $context);
    }

    $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
    $this->context =& $context;

    return $html;
}
/**
 * Returns whether iconv is installed and not buggy.
 *
 * The answer is computed once and cached in static::$_iconv.
 *
 * @return bool
 */
public static function checkForIconv()
{
    if (isset(static::$_iconv)) {
        return static::$_iconv;
    }

    // Check if iconv is installed. Note we can't just use HTMLPurifier_Encoder::iconvAvailable() because they
    // don't consider iconv "installed" if it's there but "unusable".
    static::$_iconv = function_exists('iconv')
        && \HTMLPurifier_Encoder::testIconvTruncateBug() === \HTMLPurifier_Encoder::ICONV_OK;

    return static::$_iconv;
}
/**
 * Checks HTMLPurifier_Encoder::iconv() with a chunk size of 4 bytes on
 * inputs where a character that ISO-8859-1 cannot express sits on or after
 * a chunk boundary.
 *
 * Only runs when iconv is available and testIconvTruncateBug() reports
 * ICONV_TRUNCATES.
 *
 * The multi-byte characters are written as explicit byte escapes so the
 * fixtures do not depend on the encoding of this source file:
 *  - "\xF3\xA0\x80\xA0" is a 4-byte UTF-8 sequence,
 *  - "\xE4\xB8\xAD"     is a 3-byte UTF-8 sequence,
 *  - "\xCE\xB1"         is a 2-byte UTF-8 sequence.
 * None of them exists in ISO-8859-1, so //IGNORE must drop them entirely.
 */
public function testIconvChunking()
{
    if (!HTMLPurifier_Encoder::iconvAvailable()) {
        return;
    }
    if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
        return;
    }

    // character straddles the first chunk boundary
    $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
    $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
    $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');

    // character starts exactly on the first chunk boundary
    $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
    $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
    $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
}
/**
 * Returns whether iconv is installed and not buggy.
 *
 * The answer is computed once and cached in static::$_iconv. When iconv is
 * missing, or HTMLPurifier_Encoder::testIconvTruncateBug() reports anything
 * other than ICONV_OK, a warning is logged through Craft::log() and false is
 * cached.
 *
 * @return bool
 */
public static function checkForIconv()
{
    if (isset(static::$_iconv)) {
        return static::$_iconv;
    }

    static::$_iconv = false;

    // Check if iconv is installed. Note we can't just use HTMLPurifier_Encoder::iconvAvailable() because they
    // don't consider iconv "installed" if it's there but "unusable".
    if (!function_exists('iconv')) {
        Craft::log('iconv is not installed. Will fallback to mbstring.', LogLevel::Warning);
    } elseif (\HTMLPurifier_Encoder::testIconvTruncateBug() == \HTMLPurifier_Encoder::ICONV_OK) {
        static::$_iconv = true;
    } else {
        Craft::log('Buggy iconv installed. Will fallback to mbstring.', LogLevel::Warning);
    }

    return static::$_iconv;
}
/**
 * Converts a string to UTF-8 based on configuration.
 *
 * The source encoding is read from %Core.Encoding; 'utf-8' returns the input
 * unchanged. With iconv available (and %Test.ForceNoIconv off) the string is
 * converted through unsafeIconv() and then mapped through the table returned
 * by testEncodingSupportsASCII(). Without iconv only 'iso-8859-1' is handled,
 * through utf8_encode(). Every other case raises an E_USER_ERROR.
 *
 * @return string|null Converted string, '' when unsafeIconv() rejects the
 *                     encoding; nothing is returned after the final errors.
 */
public static function convertToUTF8($str, $config, $context)
{
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') {
        return $str;
    }

    static $iconv = null;
    if ($iconv === null) {
        $iconv = self::iconvAvailable();
    }

    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // unaffected by bugs, since UTF-8 support all characters
        $converted = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
        if ($converted === false) {
            // $encoding is not a valid encoding
            trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
            return '';
        }

        // If the string is bjorked by Shift_JIS or a similar encoding
        // that doesn't support all of ASCII, convert the naughty
        // characters to their true byte-wise ASCII/UTF-8 equivalents.
        return strtr($converted, self::testEncodingSupportsASCII($encoding));
    }

    if ($encoding === 'iso-8859-1') {
        return utf8_encode($str);
    }

    // nothing can handle this encoding: explain why
    if (HTMLPurifier_Encoder::testIconvTruncateBug() == self::ICONV_OK) {
        $message = 'Encoding not supported, please install iconv';
    } else {
        $message = 'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 and http://sourceware.org/bugzilla/show_bug.cgi?id=13541';
    }
    trigger_error($message, E_USER_ERROR);
}
/**
 * Normalizes a piece of HTML before it is tokenized.
 *
 * Steps, in order: newline normalization (%Core.NormalizeNewlines),
 * escaping of commented CDATA (%HTML.Trusted only), escaping of CDATA,
 * removal of IE conditionals, body extraction
 * (%Core.ConvertDocumentToFragment), substitution of non-special entities,
 * UTF-8 clean-up and removal of processing instructions
 * (%Core.RemoveProcessingInstructions).
 *
 * @param string $html    HTML to normalize
 * @param object $config  Configuration object exposing get()
 * @param object $context Context object; read for 'ErrorCollector' when
 *                        %Core.CollectErrors is on
 * @return string Normalized HTML
 */
public function normalize($html, $config, $context)
{
    if ($config->get('Core.NormalizeNewlines')) {
        // "\r\n" is handled before the lone "\r"
        $html = str_replace(array("\r\n", "\r"), "\n", $html);
    }

    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    $html = $this->removeIEConditional($this->escapeCDATA($html));

    if ($config->get('Core.ConvertDocumentToFragment')) {
        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }
        $new_html = $this->extractBody($html);
        // report only when an error collector exists and something changed
        if ($e && $new_html != $html) {
            $e->send(E_WARNING, 'Lexer: Extracted body');
        }
        $html = $new_html;
    }

    $html = HTMLPurifier_Encoder::cleanUTF8(
        $this->_entity_parser->substituteNonSpecialEntities($html)
    );

    if ($config->get('Core.RemoveProcessingInstructions')) {
        $html = preg_replace('#<\\?.+?\\?>#s', '', $html);
    }

    return $html;
}
/**
 * Takes a piece of HTML and normalizes it by converting entities, fixing
 * encoding, extracting bits, and other good stuff.
 *
 * Steps, in order: body extraction (Core/AcceptFullDocuments), newline
 * normalization, escaping of commented CDATA (HTML/Trusted only), escaping
 * of CDATA, substitution of non-special entities and UTF-8 clean-up.
 */
function normalize($html, $config, &$context)
{
    // extract body from document if applicable
    if ($config->get('Core', 'AcceptFullDocuments')) {
        $html = $this->extractBody($html);
    }

    // normalize newlines to \n ("\r\n" first, then the lone "\r")
    $html = str_replace(array("\r\n", "\r"), "\n", $html);

    // escape convoluted CDATA
    if ($config->get('HTML', 'Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // escape CDATA
    $html = $this->escapeCDATA($html);

    // expand entities that aren't the big five
    $expanded = $this->_entity_parser->substituteNonSpecialEntities($html);

    // clean into wellformed UTF-8 string for an SGML context: this has
    // to be done after entity expansion because the entities sometimes
    // represent non-SGML characters (horror, horror!)
    return HTMLPurifier_Encoder::cleanUTF8($expanded);
}
/**
 * Takes a piece of HTML and normalizes it by converting entities, fixing
 * encoding, extracting bits, and other good stuff.
 *
 * Which steps run is driven by %Core.NormalizeNewlines, %HTML.Trusted,
 * %Core.ConvertDocumentToFragment, %Core.CollectErrors and
 * %Core.RemoveProcessingInstructions.
 *
 * @param string $html HTML.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return string
 * @todo Consider making protected
 */
public function normalize($html, $config, $context)
{
    // normalize newlines to \n
    $wantsNewlineFix = $config->get('Core.NormalizeNewlines');
    if ($wantsNewlineFix) {
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);
    }

    // escape convoluted CDATA
    $isTrusted = $config->get('HTML.Trusted');
    if ($isTrusted) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // escape CDATA
    $html = $this->escapeCDATA($html);

    $html = $this->removeIEConditional($html);

    // extract body from document if applicable
    if ($config->get('Core.ConvertDocumentToFragment')) {
        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        $new_html = $this->extractBody($html);
        $bodyWasExtracted = $e && $new_html != $html;
        if ($bodyWasExtracted) {
            $e->send(E_WARNING, 'Lexer: Extracted body');
        }
        $html = $new_html;
    }

    // expand entities that aren't the big five
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // clean into wellformed UTF-8 string for an SGML context: this has
    // to be done after entity expansion because the entities sometimes
    // represent non-SGML characters (horror, horror!)
    $html = HTMLPurifier_Encoder::cleanUTF8($html);

    // if processing instructions are to be removed, remove them now
    if (!$config->get('Core.RemoveProcessingInstructions')) {
        return $html;
    }

    return preg_replace('#<\\?.+?\\?>#s', '', $html);
}
/**
 * Checks that a backslash followed by a tilde survives conversion from and
 * to UTF-8 unchanged when Core/Encoding is set to Shift_JIS.
 *
 * Skipped when the iconv() function does not exist.
 */
function testShiftJIS()
{
    if (!function_exists('iconv')) {
        return;
    }

    $this->config->set('Core', 'Encoding', 'Shift_JIS');

    // This actually looks like a Yen, but we're going to treat it differently
    $sample = '\\~';

    $fromUtf8 = HTMLPurifier_Encoder::convertFromUTF8($sample, $this->config, $this->context);
    $this->assertIdentical($fromUtf8, '\\~');

    $toUtf8 = HTMLPurifier_Encoder::convertToUTF8($sample, $this->config, $this->context);
    $this->assertIdentical($toUtf8, '\\~');
}