/**
 * Asserts that HTMLPurifier_Encoder::cleanUTF8() produces the expected
 * output twice: once with its default arguments (reported as "iconv") and
 * once with the second argument set to true (reported as "PHP").
 *
 * @param string      $string Input handed to cleanUTF8().
 * @param string|null $expect Expected output; when null, the input is
 *                            expected to come back unchanged.
 */
public function assertCleanUTF8($string, $expect = null)
{
    $expected = ($expect === null) ? $string : $expect;

    $defaultResult = HTMLPurifier_Encoder::cleanUTF8($string);
    $this->assertIdentical($defaultResult, $expected, 'iconv: %s');

    $forcedResult = HTMLPurifier_Encoder::cleanUTF8($string, true);
    $this->assertIdentical($forcedResult, $expected, 'PHP: %s');
}
/**
 * Prepares a piece of HTML for lexing: normalizes newlines, escapes CDATA,
 * strips IE conditionals, optionally extracts the document body, expands
 * entities, cleans the encoding and optionally drops processing
 * instructions.
 *
 * @param string $html HTML.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return string
 * @todo Consider making protected
 */
public function normalize($html, $config, $context)
{
    // Turn \r\n and lone \r into \n (the array is applied in order, so
    // \r\n pairs are collapsed before stray \r characters are handled).
    if ($config->get('Core.NormalizeNewlines')) {
        $html = str_replace(array("\r\n", "\r"), "\n", $html);
    }

    // Trusted HTML first gets its commented ("convoluted") CDATA escaped.
    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // Plain CDATA escaping, then removal of IE conditional markup.
    $html = $this->removeIEConditional($this->escapeCDATA($html));

    // Reduce a full document to its body when configured to do so.
    if ($config->get('Core.ConvertDocumentToFragment')) {
        $collector = false;
        if ($config->get('Core.CollectErrors')) {
            $collector =& $context->get('ErrorCollector');
        }

        $body = $this->extractBody($html);
        if ($collector && $body != $html) {
            // Only report when the extraction actually changed the input.
            $collector->send(E_WARNING, 'Lexer: Extracted body');
        }
        $html = $body;
    }

    // Expand every entity except the special ("big five") ones.
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // The UTF-8 cleanup must come after entity expansion, because an
    // entity may expand to a character that is not allowed in SGML.
    $html = HTMLPurifier_Encoder::cleanUTF8($html);

    // Drop <? ... ?> processing instructions when requested.
    if ($config->get('Core.RemoveProcessingInstructions')) {
        $html = preg_replace('#<\\?.+?\\?>#s', '', $html);
    }

    return $html;
}
/**
 * Prepares a piece of HTML for lexing: optionally extracts the document
 * body, normalizes newlines, escapes CDATA, expands entities and cleans
 * the encoding.
 *
 * @param string $html    HTML to normalize.
 * @param object $config  Configuration object queried via get(namespace, key).
 * @param object $context Context object (taken by reference; not used here).
 * @return string Normalized HTML.
 */
function normalize($html, $config, &$context)
{
    // Reduce a full document to its body when the configuration allows it.
    if ($config->get('Core', 'AcceptFullDocuments')) {
        $html = $this->extractBody($html);
    }

    // Turn \r\n and lone \r into \n (the array is applied in order, so
    // \r\n pairs are collapsed before stray \r characters are handled).
    $html = str_replace(array("\r\n", "\r"), "\n", $html);

    // Trusted HTML first gets its commented ("convoluted") CDATA escaped.
    if ($config->get('HTML', 'Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    $html = $this->escapeCDATA($html);

    // Expand every entity except the special ("big five") ones.
    $expanded = $this->_entity_parser->substituteNonSpecialEntities($html);

    // The UTF-8 cleanup must come after entity expansion, because an
    // entity may expand to a character that is not allowed in SGML.
    return HTMLPurifier_Encoder::cleanUTF8($expanded);
}
/**
 * Validates a comma-separated CSS font-family list.
 *
 * Generic family names and purely alphanumeric names are passed through
 * as-is. Quoted names are unquoted and have their CSS escapes decoded;
 * anything that is not purely alphanumeric afterwards is re-emitted inside
 * single quotes with backslashes and single quotes escaped.
 *
 * @param string $string  Raw font-family value.
 * @param mixed  $config  Unused here.
 * @param mixed  $context Unused here.
 * @return string|false The rebuilt list joined with ", ", or false when no
 *                      font name survives.
 */
public function validate($string, $config, $context)
{
    static $generic_names = array(
        'serif' => true,
        'sans-serif' => true,
        'monospace' => true,
        'fantasy' => true,
        'cursive' => true
    );

    $accepted = array();

    // Font names are assumed never to contain commas themselves.
    foreach (explode(',', $string) as $candidate) {
        $font = trim($candidate);
        if ($font === '') {
            continue;
        }

        // Generic family keyword: keep verbatim.
        if (isset($generic_names[$font])) {
            $accepted[] = $font;
            continue;
        }

        // Quoted name: strip the quotes and decode CSS escapes.
        $quote = $font[0];
        if ($quote === '"' || $quote === "'") {
            $length = strlen($font);
            // Reject empty quotes and names whose closing quote is missing
            // or does not match the opening one.
            if ($length <= 2 || $font[$length - 1] !== $quote) {
                continue;
            }

            $inner = substr($font, 1, $length - 2);
            $innerLen = strlen($inner);
            $decoded = '';
            $pos = 0;

            while ($pos < $innerLen) {
                $ch = $inner[$pos];

                if ($ch !== '\\') {
                    $decoded .= $ch;
                    $pos++;
                    continue;
                }

                // Step onto the character following the backslash.
                $pos++;
                if ($pos >= $innerLen) {
                    // A trailing backslash is kept literally.
                    $decoded .= '\\';
                    break;
                }

                $next = $inner[$pos];

                if (ctype_xdigit($next)) {
                    // Hexadecimal escape: collect at most six hex digits.
                    $code = $next;
                    $pos++;
                    $digits = 1;
                    while ($pos < $innerLen && $digits < 6 && ctype_xdigit($inner[$pos])) {
                        $code .= $inner[$pos];
                        $pos++;
                        $digits++;
                    }

                    // Only keep the decoded character when it survives the
                    // UTF-8 cleanup, so the encoding cannot be broken.
                    $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                    if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
                        // The character after a rejected escape is skipped
                        // as well, whatever it is.
                        $pos++;
                        continue;
                    }
                    $decoded .= $char;

                    // A single whitespace character terminating the escape
                    // is consumed; any other character is processed next.
                    if (!($pos < $innerLen && trim($inner[$pos]) !== '')) {
                        $pos++;
                    }
                    continue;
                }

                if ($next === "\n") {
                    // Backslash-newline produces nothing.
                    $pos++;
                    continue;
                }

                // Any other escaped character stands for itself.
                $decoded .= $next;
                $pos++;
            }

            $font = $decoded;
        }

        // $font now holds the plain font name.
        if ($font !== '' && ctype_alnum($font)) {
            // Very simple name: emit it unquoted.
            $accepted[] = $font;
            continue;
        }

        // Anything else is quoted, with backslashes and single quotes escaped.
        $escaped = str_replace("'", "\\'", str_replace("\\", "\\\\", $font));
        $accepted[] = "'{$escaped}'";
    }

    if (!$accepted) {
        return false;
    }

    return implode(', ', $accepted);
}
/**
 * Escapes a string for HTML output.
 *
 * The string is first passed through HTMLPurifier_Encoder::cleanUTF8() and
 * then through htmlspecialchars() with ENT_COMPAT, so double quotes are
 * converted while single quotes are left alone.
 *
 * @param string $string Text to escape.
 * @return string Escaped text.
 */
function escapeHTML($string)
{
    return htmlspecialchars(
        HTMLPurifier_Encoder::cleanUTF8($string),
        ENT_COMPAT,
        'UTF-8'
    );
}
/**
 * Decodes CSS backslash escapes in a string and returns the plain text.
 *
 * Handled forms: hexadecimal escapes of one to six digits (optionally
 * terminated by one whitespace character), backslash-newline (produces
 * nothing), and a backslash before any other character (produces that
 * character). A backslash at the very end of the input is kept literally.
 *
 * @param string $string Possibly escaped CSS string.
 * @return string The string with its escapes expanded.
 */
protected function expandCSSEscape($string)
{
    $out = '';
    $len = strlen($string);
    $pos = 0;

    while ($pos < $len) {
        $ch = $string[$pos];

        if ($ch !== '\\') {
            $out .= $ch;
            $pos++;
            continue;
        }

        // Step onto the character following the backslash.
        $pos++;
        if ($pos >= $len) {
            // A trailing backslash is kept literally.
            $out .= '\\';
            break;
        }

        $next = $string[$pos];

        if (ctype_xdigit($next)) {
            // Hexadecimal escape: collect at most six hex digits.
            $code = $next;
            $pos++;
            $digits = 1;
            while ($pos < $len && $digits < 6 && ctype_xdigit($string[$pos])) {
                $code .= $string[$pos];
                $pos++;
                $digits++;
            }

            // Only keep the decoded character when it survives the UTF-8
            // cleanup, so the encoding cannot be broken.
            $char = HTMLPurifier_Encoder::unichr(hexdec($code));
            if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
                // The character after a rejected escape is skipped as
                // well, whatever it is.
                $pos++;
                continue;
            }
            $out .= $char;

            // A single whitespace character terminating the escape is
            // consumed; any other character is processed next.
            if (!($pos < $len && trim($string[$pos]) !== '')) {
                $pos++;
            }
            continue;
        }

        if ($next === "\n") {
            // Backslash-newline produces nothing.
            $pos++;
            continue;
        }

        // Any other escaped character stands for itself.
        $out .= $next;
        $pos++;
    }

    return $out;
}
/**
 * Attempts to convert a string to UTF-8 and strip any invalid UTF-8
 * sequences from it.
 *
 * @param $string
 *
 * @return bool|string
 */
public static function convertToUTF8($string)
{
    // Deliberately not wrapped in class_exists(): the server may ship its
    // own copy of HTMLPurifier while open_basedir restrictions are active.
    $vendorPath = Craft::getPathOfAlias('system.vendors.htmlpurifier');
    require_once $vendorPath . '/HTMLPurifier.standalone.php';

    // Already UTF-8: cleaning is all that is needed.
    if (static::isUTF8($string)) {
        return \HTMLPurifier_Encoder::cleanUTF8($string);
    }

    // Tell HTMLPurifier which encoding the string is actually in.
    $purifierConfig = \HTMLPurifier_Config::createDefault();
    $purifierConfig->set('Core.Encoding', static::getEncoding($string));

    $cleaned = \HTMLPurifier_Encoder::cleanUTF8($string);

    // Without iconv, fall back to mbstring, using the encoding detected
    // on the cleaned string.
    if (!static::checkForIconv()) {
        return mb_convert_encoding($cleaned, 'utf-8', static::getEncoding($cleaned));
    }

    return \HTMLPurifier_Encoder::convertToUTF8($cleaned, $purifierConfig, null);
}
/**
 * Normalizes a piece of HTML before it is lexed.
 *
 * Steps, in order: newline normalization, CDATA escaping, removal of IE
 * conditionals, optional body extraction, entity expansion, UTF-8 cleanup
 * and optional removal of processing instructions.
 *
 * @param string $html    HTML to normalize.
 * @param object $config  Configuration object queried via get().
 * @param object $context Context object; supplies 'ErrorCollector' when
 *                        error collection is enabled.
 * @return string Normalized HTML.
 */
public function normalize($html, $config, $context)
{
    if ($config->get('Core.NormalizeNewlines')) {
        // \r\n pairs first, then any remaining lone \r.
        $html = str_replace("\r", "\n", str_replace("\r\n", "\n", $html));
    }

    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    $html = $this->escapeCDATA($html);
    $html = $this->removeIEConditional($html);

    if ($config->get('Core.ConvertDocumentToFragment')) {
        $collector = false;
        if ($config->get('Core.CollectErrors')) {
            $collector =& $context->get('ErrorCollector');
        }

        $beforeExtraction = $html;
        $html = $this->extractBody($beforeExtraction);

        // Report the extraction only when it changed the markup.
        if ($collector && $html != $beforeExtraction) {
            $collector->send(E_WARNING, 'Lexer: Extracted body');
        }
    }

    // Entities are expanded before the encoding cleanup so that characters
    // introduced by the expansion are cleaned as well.
    $expanded = $this->_entity_parser->substituteNonSpecialEntities($html);
    $html = HTMLPurifier_Encoder::cleanUTF8($expanded);

    if ($config->get('Core.RemoveProcessingInstructions')) {
        // Strip <? ... ?> blocks, including ones that span several lines.
        $html = preg_replace('#<\\?.+?\\?>#s', '', $html);
    }

    return $html;
}
/**
 * Builds a word-frequency index for a piece of text.
 *
 * The text is cleaned as UTF-8 (through HTML Purifier when it can be
 * loaded), numeric HTML entities are decoded, everything is lowercased,
 * punctuation, separators and symbols are turned into spaces, and every
 * remaining word made only of letters and digits is counted.
 *
 * @param string $data Text to index.
 * @return array Map of word => number of occurrences (returned by
 *               reference); an empty array when no word was found.
 */
function &search_index($data)
{
    // Always defined, so that empty input yields an empty array instead of
    // a reference to an undefined variable.
    $words = array();

    // Be sure we will parse UTF-8 data
    // NOTE(review): the conversion only runs when the data ALREADY checks
    // out as UTF-8, which looks inverted - confirm the intent before
    // changing it. The original condition is kept as-is.
    if (function_exists('mb_check_encoding') && function_exists('iconv')
        && function_exists('mb_detect_encoding') && mb_check_encoding($data, 'UTF-8')
    ) {
        $encoding = mb_detect_encoding($data);
        if ($encoding !== false) {
            $converted = iconv($encoding, 'UTF-8//TRANSLIT', $data);
            // Keep the original text when the conversion fails.
            if ($converted !== false) {
                $data = $converted;
            }
        }
    }

    // Clean the UTF-8 string using HTML Purifier. Loading is best-effort:
    // include_once (unlike require_once) lets execution reach the
    // class_exists() guard below when the library is not installed.
    @(include_once 'lib/HTMLPurifier.auto.php');
    @(include_once 'HTMLPurifier/Encoder.php');
    if (class_exists('HTMLPurifier_Encoder')) {
        // cleanUTF8() is called statically; no encoder instance is created.
        $data = HTMLPurifier_Encoder::cleanUTF8($data);
    }

    // Remove remaining HTML numeric entities
    if (function_exists('mb_decode_numericentity')) {
        if (!function_exists('utf8_entity_decode')) {
            function utf8_entity_decode($entity)
            {
                $convmap = array(0x0, 0x10000, 0, 0xfffff);
                return mb_decode_numericentity($entity, $convmap, 'UTF-8');
            }
        }

        // Callbacks replace the "e" (eval) regex modifier, which no longer
        // exists in PHP 7 and later.
        $data = preg_replace_callback(
            '/&#\\d{2,5};/u',
            function ($match) {
                return utf8_entity_decode($match[0]);
            },
            $data
        );
        // Hex entities are rewritten as decimal ones before decoding. The
        // character class covers all hex digits, 8 and 9 included.
        $data = preg_replace_callback(
            '/&#x([a-fA-F0-9]{2,8});/u',
            function ($match) {
                return utf8_entity_decode('&#' . hexdec($match[1]) . ';');
            },
            $data
        );
    }

    // Lowerize
    $data = function_exists('mb_convert_case')
        ? mb_convert_case($data, MB_CASE_LOWER, 'UTF-8')
        : strtolower($data);

    // Convert punctuations to spaces
    $data = preg_replace('/[\\pP\\pZ\\pS]/u', ' ', $data);

    // The cast also covers a null result from a failed preg_replace().
    if ((string) $data !== '') {
        // Split into words (do NOT use the split function that doesn't
        // correctly handle some characters !)
        $sstrings = preg_split('/\\s+/u', $data, -1, PREG_SPLIT_NO_EMPTY);
        if ($sstrings !== false) {
            foreach ($sstrings as $value) {
                // Keep only alpha-num words
                if (preg_match('/^[\\pL\\pN]+$/u', $value)) {
                    if (isset($words[$value])) {
                        $words[$value]++; // count words
                    } else {
                        $words[$value] = 1;
                    }
                }
            }
        }
    }

    return $words;
}