Exemplo n.º 1
0
 /**
  * Callback function for substituteNonSpecialEntities() that does the work.
  * 
  * @warning Though this is public in order to let the callback happen,
  *          calling it directly is not recommended.
  * @param $matches  PCRE matches array, with 0 the entire match, and
  *                  either index 1, 2 or 3 set with a hex value, dec value,
  *                  or string (respectively).
  * @returns Replacement string.
  */
 function nonSpecialEntityCallback($matches)
 {
     // Index 0 always holds the complete matched entity text; it is what we
     // hand back whenever the entity must be left untouched.
     $original = $matches[0];

     // A '#' in second position marks a numeric reference.
     if (isset($original[1]) && $original[1] === '#') {
         // An 'x' in third position selects the hex capture (index 1),
         // otherwise the decimal capture (index 2) is used.
         if (isset($original[2]) && $original[2] === 'x') {
             $codepoint = hexdec($matches[1]);
         } else {
             $codepoint = (int) $matches[2];
         }
         // Code points listed in the special table are never expanded.
         return isset($this->_special_dec2str[$codepoint])
             ? $original
             : HTMLPurifier_Encoder::unichr($codepoint);
     }

     // Named entity: names listed in the special table are never expanded.
     if (isset($this->_special_ent2dec[$matches[3]])) {
         return $original;
     }
     // The lookup table is fetched lazily, on the first named entity seen.
     if (!$this->_entity_lookup) {
         $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
     }
     // Unknown names are passed through unchanged.
     return isset($this->_entity_lookup->table[$matches[3]])
         ? $this->_entity_lookup->table[$matches[3]]
         : $original;
 }
Exemplo n.º 2
0
 /**
  * Calculates the result of this requirement.
  *
  * @return string
  */
 protected function calculateResult()
 {
     // No iconv function at all: the requirement fails outright.
     if (!function_exists('iconv')) {
         return RequirementResult::Failed;
     }

     // iconv exists; any probe result other than ICONV_OK is reported as a
     // warning rather than a failure.
     $probe = \HTMLPurifier_Encoder::testIconvTruncateBug();

     return $probe != \HTMLPurifier_Encoder::ICONV_OK
         ? RequirementResult::Warning
         : RequirementResult::Success;
 }
Exemplo n.º 3
0
 /**
  * Converts a string from UTF-8 based on configuration.
  * @note Currently, this is a lossy conversion, with unexpressable
  *       characters being omitted.
  */
 public static function convertFromUTF8($str, $config, $context)
 {
     // Nothing to do when the configured output encoding is already UTF-8.
     $encoding = $config->get('Core.Encoding');
     if ($encoding === 'utf-8') {
         return $str;
     }
     // Probe for iconv only once; the result is cached in a function static.
     static $iconv = null;
     if ($iconv === null) {
         $iconv = function_exists('iconv');
     }
     // $escape is reused below to decide whether the ASCII fix-up characters
     // still need to be stripped from the string.
     if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
         $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
     }
     // Mute PHP errors for the duration of the conversion. Every exit path
     // below must call restore_error_handler() exactly once.
     set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
     if ($iconv && !$config->get('Test.ForceNoIconv')) {
         // Undo our previous fix in convertToUTF8, otherwise iconv will barf
         $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
         if (!$escape && !empty($ascii_fix)) {
             // Remove every key of the fix-up table from the string before
             // the table is applied in reverse.
             $clear_fix = array();
             foreach ($ascii_fix as $utf8 => $native) {
                 $clear_fix[$utf8] = '';
             }
             $str = strtr($str, $clear_fix);
         }
         $str = strtr($str, array_flip($ascii_fix));
         // Normal stuff
         $str = iconv('utf-8', $encoding . '//IGNORE', $str);
         restore_error_handler();
         return $str;
     } elseif ($encoding === 'iso-8859-1') {
         $str = utf8_decode($str);
         restore_error_handler();
         return $str;
     }
     // Unsupported encoding: put the caller's error handler back BEFORE
     // raising the error. Otherwise the mute handler would both swallow this
     // error and stay installed after the method returns.
     restore_error_handler();
     trigger_error('Encoding not supported', E_USER_ERROR);
 }
Exemplo n.º 4
0
 /**
  * Converts a string from UTF-8 based on configuration.
  * @note Currently, this is a lossy conversion, with unexpressable
  *       characters being omitted.
  */
 public static function convertFromUTF8($str, $config, $context)
 {
     // Availability of iconv is probed once and remembered across calls.
     static $iconv = null;
     if (null === $iconv) {
         $iconv = function_exists('iconv');
     }

     // UTF-8 output needs no conversion at all.
     $target = $config->get('Core', 'Encoding');
     if ('utf-8' === $target) {
         return $str;
     }

     // Optional pre-pass requested through configuration.
     if ($config->get('Core', 'EscapeNonASCIICharacters')) {
         $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
     }

     // Prefer iconv unless the test switch forces it off; the switch is only
     // consulted when iconv is actually present.
     $use_iconv = $iconv && !$config->get('Test', 'ForceNoIconv');
     if ($use_iconv) {
         return @iconv('utf-8', $target . '//IGNORE', $str);
     }

     // Without iconv, only ISO-8859-1 can be produced.
     if ('iso-8859-1' === $target) {
         return @utf8_decode($str);
     }

     trigger_error('Encoding not supported', E_USER_ERROR);
 }
 /**
  * Validates a CSS font-family value and returns a normalized font list.
  *
  * The value is split on commas. Generic family names are passed through
  * as-is; quoted names are unquoted and their backslash escapes expanded;
  * any resulting name that is not purely alphanumeric is re-emitted inside
  * single quotes with backslashes and single quotes escaped.
  *
  * @param string $string  Raw font-family value.
  * @param mixed  $config  Not read by this method.
  * @param mixed  $context Not read by this method.
  * @return string|false Comma-separated font list, or false when no usable
  *                      font name remains.
  */
 public function validate($string, $config, $context)
 {
     static $generic_names = array('serif' => true, 'sans-serif' => true, 'monospace' => true, 'fantasy' => true, 'cursive' => true);
     // assume that no font names contain commas in them
     $fonts = explode(',', $string);
     $final = '';
     foreach ($fonts as $font) {
         $font = trim($font);
         if ($font === '') {
             continue;
         }
         // match a generic name
         if (isset($generic_names[$font])) {
             $final .= $font . ', ';
             continue;
         }
         // match a quoted name
         if ($font[0] === '"' || $font[0] === "'") {
             $length = strlen($font);
             // a lone quote or an empty pair of quotes carries no name
             if ($length <= 2) {
                 continue;
             }
             // the closing quote must be the same character as the opening one
             $quote = $font[0];
             if ($font[$length - 1] !== $quote) {
                 continue;
             }
             $font = substr($font, 1, $length - 2);
             $new_font = '';
             for ($i = 0, $c = strlen($font); $i < $c; $i++) {
                 if ($font[$i] === '\\') {
                     $i++;
                     // trailing backslash with nothing after it: keep it literally
                     if ($i >= $c) {
                         $new_font .= '\\';
                         break;
                     }
                     if (ctype_xdigit($font[$i])) {
                         // collect up to six hex digits of the escape
                         $code = $font[$i];
                         for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                             if (!ctype_xdigit($font[$i])) {
                                 break;
                             }
                             $code .= $font[$i];
                         }
                         // $i now sits on the character right after the hex
                         // digits. A single whitespace character there belongs
                         // to the escape and is consumed by the loop's $i++;
                         // anything else must be re-read, so step back onto it.
                         // This has to happen whether or not the decoded
                         // character is kept, otherwise a rejected escape
                         // would swallow the character that follows it.
                         if ($i < $c && trim($font[$i]) !== '') {
                             $i--;
                         }
                         // We have to be extremely careful when adding
                         // new characters, to make sure we're not breaking
                         // the encoding.
                         $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                         if (HTMLPurifier_Encoder::cleanUTF8($char) !== '') {
                             $new_font .= $char;
                         }
                         continue;
                     }
                     // backslash-newline produces nothing
                     if ($font[$i] === "\n") {
                         continue;
                     }
                 }
                 // ordinary character, or the character following a
                 // non-hex backslash escape
                 $new_font .= $font[$i];
             }
             $font = $new_font;
         }
         // $font is a pure representation of the font name
         if (ctype_alnum($font) && $font !== '') {
             // very simple font, allow it in unharmed
             $final .= $font . ', ';
             continue;
         }
         // complicated font, requires quoting
         // armor backslashes and single quotes
         $font = str_replace("\\", "\\\\", $font);
         $font = str_replace("'", "\\'", $font);
         $final .= "'{$font}', ";
     }
     // drop the trailing separator left by the last appended name
     $final = rtrim($final, ', ');
     if ($final === '') {
         return false;
     }
     return $final;
 }
Exemplo n.º 6
0
 /**
  * Filters an HTML snippet/document to be XSS-free and standards-compliant.
  *
  * @param $html String of HTML to purify
  * @param $config HTMLPurifier_Config object for this operation, if omitted,
  *                defaults to the config object specified during this
  *                object's construction. The parameter can also be any type
  *                that HTMLPurifier_Config::create() supports.
  * @return Purified HTML
  */
 public function purify($html, $config = null)
 {
     // :TODO: make the config merge in, instead of replace
     if ($config) {
         $config = HTMLPurifier_Config::create($config);
     } else {
         $config = $this->config;
     }

     // The lexer is chosen from the configuration; every run gets a fresh context.
     $lexer = HTMLPurifier_Lexer::create($config);
     $context = new HTMLPurifier_Context();

     // A new generator is built per run and published through the context.
     $this->generator = new HTMLPurifier_Generator($config, $context);
     $context->register('Generator', $this->generator);

     // Locale and error collector are only registered when error collection is on.
     if ($config->get('Core.CollectErrors')) {
         $locale = HTMLPurifier_LanguageFactory::instance()->create($config, $context);
         $context->register('Locale', $locale);
         $collector = new HTMLPurifier_ErrorCollector($context);
         $context->register('ErrorCollector', $collector);
     }

     // The ID accumulator is registered unconditionally.
     $accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
     $context->register('IDAccumulator', $accumulator);

     $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

     // Assemble the filter chain: flag-enabled built-ins first, then the
     // 'Custom' entries, then the filters stored on this object.
     $flags = $config->getBatch('Filter');
     $custom = $flags['Custom'];
     unset($flags['Custom']);
     $chain = array();
     foreach ($flags as $name => $enabled) {
         // Skip disabled flags and any key containing a dot.
         if (!$enabled || strpos($name, '.') !== false) {
             continue;
         }
         $class = 'HTMLPurifier_Filter_' . $name;
         $chain[] = new $class();
     }
     foreach ($custom as $custom_filter) {
         // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
         $chain[] = $custom_filter;
     }
     $chain = array_merge($chain, $this->filters);

     // Pre-filters run in chain order.
     $chain_length = count($chain);
     for ($n = 0; $n < $chain_length; $n++) {
         $html = $chain[$n]->preFilter($html, $config, $context);
     }

     // Core pipeline: tokenize, run the strategy, regenerate HTML.
     $tokens = $lexer->tokenizeHTML($html, $config, $context);
     $tokens = $this->strategy->execute($tokens, $config, $context);
     $html = $this->generator->generateFromTokens($tokens);

     // Post-filters run in reverse chain order.
     for ($n = $chain_length - 1; $n >= 0; $n--) {
         $html = $chain[$n]->postFilter($html, $config, $context);
     }

     $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);

     // Keep the context of this run reachable on the object.
     $this->context =& $context;
     return $html;
 }
Exemplo n.º 7
0
function escapeHTML($string)
{
    // Pass the text through HTMLPurifier_Encoder::cleanUTF8() first, then
    // escape it for HTML; ENT_COMPAT converts double quotes and leaves
    // single quotes alone.
    return htmlspecialchars(HTMLPurifier_Encoder::cleanUTF8($string), ENT_COMPAT, 'UTF-8');
}
/**
 * Builds a word-frequency table from a chunk of text.
 *
 * The text is optionally cleaned with HTML Purifier (when it can be loaded),
 * numeric HTML entities are decoded (when mbstring is available), the text
 * is lower-cased, punctuation/separators/symbols are turned into spaces, and
 * every remaining token made only of letters and digits is counted.
 *
 * @param string $data Text to index.
 * @return array Map of word => number of occurrences; empty when no word
 *               was found.
 */
function &search_index($data)
{
    // Always hand back an array, even when nothing is indexed.
    $words = array();

    // Be sure we will parse UTF-8 data
    // NOTE(review): the conversion only runs when the data already validates
    // as UTF-8, which makes it close to a no-op. Confirm whether the
    // condition was meant to be negated.
    if (function_exists('mb_check_encoding') && function_exists('iconv') && function_exists('mb_detect_encoding') && mb_check_encoding($data, 'UTF-8')) {
        $detected = mb_detect_encoding($data);
        if ($detected !== false) {
            $converted = iconv($detected, 'UTF-8//TRANSLIT', $data);
            // iconv() returns false on failure; keep the original text then.
            if ($converted !== false) {
                $data = $converted;
            }
        }
    }
    // Clean the UTF-8 string using HTML Purifier.
    // include_once (not require_once) so that a missing library skips the
    // cleaning step, as the class_exists() guard below intends, instead of
    // aborting the whole request.
    @include_once 'lib/HTMLPurifier.auto.php';
    @include_once 'HTMLPurifier/Encoder.php';
    if (class_exists('HTMLPurifier_Encoder')) {
        // cleanUTF8() is called statically; no encoder instance is needed.
        $data = HTMLPurifier_Encoder::cleanUTF8($data);
    }
    // Remove remaining HTML numeric entities
    if (function_exists('mb_decode_numericentity')) {
        if (!function_exists('utf8_entity_decode')) {
            function utf8_entity_decode($entity)
            {
                $convmap = array(0x0, 0x10000, 0, 0xfffff);
                return mb_decode_numericentity($entity, $convmap, 'UTF-8');
            }
        }
        // The /e modifier no longer exists in PHP 7+, so callbacks are used.
        // Decimal references are decoded as matched.
        $data = preg_replace_callback('/&#\\d{2,5};/u', function ($m) {
            return utf8_entity_decode($m[0]);
        }, $data);
        // Hex references are rewritten to decimal first. The digit class is
        // 0-9; the previous 0-7 silently skipped any reference containing
        // an 8 or a 9.
        $data = preg_replace_callback('/&#x([a-fA-F0-9]{2,8});/u', function ($m) {
            return utf8_entity_decode('&#' . hexdec($m[1]) . ';');
        }, $data);
    }
    // Lowerize
    $data = function_exists('mb_convert_case') ? mb_convert_case($data, MB_CASE_LOWER, 'UTF-8') : strtolower($data);
    // Convert punctuations to spaces
    $data = preg_replace('/[\\pP\\pZ\\pS]/u', ' ', $data);
    if ($data != '') {
        // Split into words (do NOT use the split function that doesn't correctly handle some characters !)
        $sstrings = preg_split('/\\s+/u', $data, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($sstrings as $value) {
            // Keep only alpha-num words
            if (preg_match('/^[\\pL\\pN]+$/u', $value)) {
                if (isset($words[$value])) {
                    // count words
                    $words[$value]++;
                } else {
                    $words[$value] = 1;
                }
            }
        }
    }
    return $words;
}
Exemplo n.º 9
0
 /**
  * Parses a possibly escaped CSS string and returns the "pure" 
  * version of it.
  */
 protected function expandCSSEscape($string)
 {
     // flexibly parse it
     $ret = '';
     for ($i = 0, $c = strlen($string); $i < $c; $i++) {
         if ($string[$i] === '\\') {
             $i++;
             // trailing backslash with nothing after it: keep it literally
             if ($i >= $c) {
                 $ret .= '\\';
                 break;
             }
             if (ctype_xdigit($string[$i])) {
                 // collect up to six hex digits of the escape
                 $code = $string[$i];
                 for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                     if (!ctype_xdigit($string[$i])) {
                         break;
                     }
                     $code .= $string[$i];
                 }
                 // $i now sits on the character right after the hex digits.
                 // A single whitespace character there belongs to the escape
                 // and is consumed by the loop's $i++; anything else must be
                 // re-read, so step back onto it. This has to happen whether
                 // or not the decoded character is kept, otherwise a rejected
                 // escape would swallow the character that follows it.
                 if ($i < $c && trim($string[$i]) !== '') {
                     $i--;
                 }
                 // We have to be extremely careful when adding
                 // new characters, to make sure we're not breaking
                 // the encoding.
                 $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                 if (HTMLPurifier_Encoder::cleanUTF8($char) !== '') {
                     $ret .= $char;
                 }
                 continue;
             }
             // backslash-newline produces nothing
             if ($string[$i] === "\n") {
                 continue;
             }
         }
         // ordinary character, or the character following a non-hex
         // backslash escape
         $ret .= $string[$i];
     }
     return $ret;
 }
Exemplo n.º 10
0
 /**
  * Filters an HTML snippet/document to be XSS-free and standards-compliant.
  * 
  * @param $html String of HTML to purify
  * @param $config HTMLPurifier_Config object for this operation, if omitted,
  *                defaults to the config object specified during this
  *                object's construction. The parameter can also be any type
  *                that HTMLPurifier_Config::create() supports.
  * @return Purified HTML
  */
 function purify($html, $config = null)
 {
     // A per-call config replaces the one given at construction time.
     if ($config) {
         $config = HTMLPurifier_Config::create($config);
     } else {
         $config = $this->config;
     }

     // implementation is partially environment dependant, partially
     // configuration dependant
     $lexer = HTMLPurifier_Lexer::create($config);
     $context = new HTMLPurifier_Context();

     // Run the generator once on an empty token list with this run's config
     // and context, then publish it through the context.
     $this->generator->generateFromTokens(array(), $config, $context);
     $context->register('Generator', $this->generator);

     // Locale and error collector are only registered when error collection is on.
     if ($config->get('Core', 'CollectErrors')) {
         $locale = HTMLPurifier_LanguageFactory::instance()->create($config, $context);
         $context->register('Locale', $locale);
         $collector = new HTMLPurifier_ErrorCollector($context);
         $context->register('ErrorCollector', $collector);
     }

     $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

     // Pre-filters run in registration order.
     $filter_count = count($this->filters);
     for ($n = 0; $n < $filter_count; $n++) {
         $html = $this->filters[$n]->preFilter($html, $config, $context);
     }

     // Core pipeline: tokenize, run the strategy, regenerate HTML.
     $tokens = $lexer->tokenizeHTML($html, $config, $context);
     $tokens = $this->strategy->execute($tokens, $config, $context);
     $html = $this->generator->generateFromTokens($tokens, $config, $context);

     // Post-filters run in reverse registration order.
     for ($n = $filter_count - 1; $n >= 0; $n--) {
         $html = $this->filters[$n]->postFilter($html, $config, $context);
     }

     $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);

     // Keep the context of this run reachable on the object.
     $this->context =& $context;
     return $html;
 }
Exemplo n.º 11
0
 /**
  * Returns whether iconv is installed and not buggy.
  *
  * @return bool
  */
 public static function checkForIconv()
 {
     // Answer from the cached flag once it has been computed.
     if (isset(static::$_iconv)) {
         return static::$_iconv;
     }

     // Check if iconv is installed. Note we can't just use HTMLPurifier_Encoder::iconvAvailable() because they
     // don't consider iconv "installed" if it's there but "unusable".
     // The truncate-bug probe is only run when the iconv function exists.
     static::$_iconv = function_exists('iconv')
         && \HTMLPurifier_Encoder::testIconvTruncateBug() === \HTMLPurifier_Encoder::ICONV_OK;

     return static::$_iconv;
 }
Exemplo n.º 12
0
 public function testIconvChunking()
 {
     // The chunking path is only worth exercising when iconv is usable...
     if (!HTMLPurifier_Encoder::iconvAvailable()) {
         return;
     }
     // ...and when the installed iconv is the truncating kind.
     if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
         return;
     }
     // Each input places one multibyte UTF-8 character, none of which exists
     // in ISO-8859-1, at a different offset relative to the chunk size of 4;
     // //IGNORE must drop exactly that character.
     // The characters are written as explicit byte escapes so the fixtures
     // do not depend on the encoding this file is saved in:
     //   \xF3\xA0\x80\xA0 = U+E0020 (4 bytes)
     //   \xE4\xB8\xAD     = U+4E2D  (3 bytes)
     //   \xCE\xB1         = U+03B1  (2 bytes)
     // NOTE(review): the 4-byte fixture was previously stored as the
     // characters "ó", "€" and separators, which looks like those same four
     // bytes mis-decoded as Windows-1252 - confirm against upstream.
     $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
     $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
     $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
     $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
     $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
     $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
 }
Exemplo n.º 13
0
 /**
  * Returns whether iconv is installed and not buggy.
  *
  * @return bool
  */
 public static function checkForIconv()
 {
     // Answer from the cached flag once it has been computed.
     if (isset(static::$_iconv)) {
         return static::$_iconv;
     }

     // Pessimistic default; only a present, non-buggy iconv flips it to true.
     static::$_iconv = false;

     // Check if iconv is installed. Note we can't just use HTMLPurifier_Encoder::iconvAvailable() because they
     // don't consider iconv "installed" if it's there but "unusable".
     if (!function_exists('iconv')) {
         Craft::log('iconv is not installed.  Will fallback to mbstring.', LogLevel::Warning);
     } elseif (\HTMLPurifier_Encoder::testIconvTruncateBug() != \HTMLPurifier_Encoder::ICONV_OK) {
         Craft::log('Buggy iconv installed.  Will fallback to mbstring.', LogLevel::Warning);
     } else {
         static::$_iconv = true;
     }

     return static::$_iconv;
 }
Exemplo n.º 14
0
 /**
  * Converts a string to UTF-8 based on configuration.
  */
 public static function convertToUTF8($str, $config, $context)
 {
     // Input that is already declared as UTF-8 is returned untouched.
     $source = $config->get('Core.Encoding');
     if ('utf-8' === $source) {
         return $str;
     }

     // iconv availability is probed once and remembered across calls.
     static $iconv = null;
     if (null === $iconv) {
         $iconv = self::iconvAvailable();
     }

     if ($iconv && !$config->get('Test.ForceNoIconv')) {
         // unaffected by bugs, since UTF-8 support all characters
         $converted = self::unsafeIconv($source, 'utf-8//IGNORE', $str);
         if (false === $converted) {
             // $encoding is not a valid encoding
             trigger_error('Invalid encoding ' . $source, E_USER_ERROR);
             return '';
         }
         // If the string is bjorked by Shift_JIS or a similar encoding
         // that doesn't support all of ASCII, convert the naughty
         // characters to their true byte-wise ASCII/UTF-8 equivalents.
         return strtr($converted, self::testEncodingSupportsASCII($source));
     }

     // Without iconv, only ISO-8859-1 input can be converted.
     if ('iso-8859-1' === $source) {
         return utf8_encode($str);
     }

     // No conversion path left: pick the error message from the iconv probe.
     if (HTMLPurifier_Encoder::testIconvTruncateBug() == self::ICONV_OK) {
         $message = 'Encoding not supported, please install iconv';
     } else {
         $message = 'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 and http://sourceware.org/bugzilla/show_bug.cgi?id=13541';
     }
     trigger_error($message, E_USER_ERROR);
 }
 public function normalize($html, $config, $context)
 {
     // Optional newline normalization: CRLF first, then any remaining CR.
     if ($config->get('Core.NormalizeNewlines')) {
         $html = str_replace(array("\r\n", "\r"), "\n", $html);
     }

     // Commented CDATA is only escaped for trusted HTML; plain CDATA always.
     if ($config->get('HTML.Trusted')) {
         $html = $this->escapeCommentedCDATA($html);
     }
     $html = $this->removeIEConditional($this->escapeCDATA($html));

     // Optional reduction of a full document to its body.
     if ($config->get('Core.ConvertDocumentToFragment')) {
         // The collector stays false unless error collection is enabled.
         $collector = false;
         if ($config->get('Core.CollectErrors')) {
             $collector =& $context->get('ErrorCollector');
         }
         $body = $this->extractBody($html);
         // Only report when the extraction actually changed something.
         if ($collector && $body != $html) {
             $collector->send(E_WARNING, 'Lexer: Extracted body');
         }
         $html = $body;
     }

     // Entities are expanded before the UTF-8 cleanup runs.
     $html = HTMLPurifier_Encoder::cleanUTF8(
         $this->_entity_parser->substituteNonSpecialEntities($html)
     );

     // Optional removal of <? ... ?> processing instructions.
     if ($config->get('Core.RemoveProcessingInstructions')) {
         $html = preg_replace('#<\\?.+?\\?>#s', '', $html);
     }

     return $html;
 }
Exemplo n.º 16
0
 /**
  * Takes a piece of HTML and normalizes it by converting entities, fixing
  * encoding, extracting bits, and other good stuff.
  */
 function normalize($html, $config, &$context)
 {
     // When full documents are accepted, reduce the input to its body first.
     if ($config->get('Core', 'AcceptFullDocuments')) {
         $html = $this->extractBody($html);
     }

     // Newlines become \n: CRLF pairs first, then any remaining lone CR.
     $html = str_replace(array("\r\n", "\r"), "\n", $html);

     // Commented CDATA is only escaped for trusted HTML; plain CDATA always.
     if ($config->get('HTML', 'Trusted')) {
         $html = $this->escapeCommentedCDATA($html);
     }
     $html = $this->escapeCDATA($html);

     // expand entities that aren't the big five
     $expanded = $this->_entity_parser->substituteNonSpecialEntities($html);

     // clean into wellformed UTF-8 string for an SGML context: this has
     // to be done after entity expansion because the entities sometimes
     // represent non-SGML characters (horror, horror!)
     return HTMLPurifier_Encoder::cleanUTF8($expanded);
 }
 /**
  * Takes a piece of HTML and normalizes it by converting entities, fixing
  * encoding, extracting bits, and other good stuff.
  * @param string $html HTML.
  * @param HTMLPurifier_Config $config
  * @param HTMLPurifier_Context $context
  * @return string
  * @todo Consider making protected
  */
 public function normalize($html, $config, $context)
 {
     // Step 1: newline normalization (optional). CRLF pairs are handled
     // before lone CRs so a pair collapses to a single \n.
     $normalize_newlines = $config->get('Core.NormalizeNewlines');
     if ($normalize_newlines) {
         $html = str_replace(array("\r\n", "\r"), "\n", $html);
     }

     // Step 2: CDATA handling. The commented form is only escaped when the
     // HTML is trusted; the plain form is always escaped.
     $trusted = $config->get('HTML.Trusted');
     if ($trusted) {
         $html = $this->escapeCommentedCDATA($html);
     }
     $html = $this->escapeCDATA($html);

     // Step 3: remove IE conditional markup.
     $html = $this->removeIEConditional($html);

     // Step 4: body extraction (optional).
     if ($config->get('Core.ConvertDocumentToFragment')) {
         // The error collector is only fetched when error collection is on.
         $errors = false;
         if ($config->get('Core.CollectErrors')) {
             $errors =& $context->get('ErrorCollector');
         }
         $extracted = $this->extractBody($html);
         // Warn only when extraction changed the markup.
         if ($errors && $extracted != $html) {
             $errors->send(E_WARNING, 'Lexer: Extracted body');
         }
         $html = $extracted;
     }

     // Step 5: expand entities that aren't the big five.
     $html = $this->_entity_parser->substituteNonSpecialEntities($html);

     // Step 6: clean into wellformed UTF-8 for an SGML context. This must
     // follow entity expansion because the entities sometimes represent
     // non-SGML characters.
     $html = HTMLPurifier_Encoder::cleanUTF8($html);

     // Step 7: strip processing instructions (optional).
     if (!$config->get('Core.RemoveProcessingInstructions')) {
         return $html;
     }
     return preg_replace('#<\\?.+?\\?>#s', '', $html);
 }
Exemplo n.º 18
0
 function testShiftJIS()
 {
     // Nothing to verify without iconv.
     if (!function_exists('iconv')) {
         return;
     }

     $this->config->set('Core', 'Encoding', 'Shift_JIS');

     // This actually looks like a Yen, but we're going to treat it differently
     $sample = '\\~';

     // Both conversion directions must hand the two bytes back unchanged.
     $from_utf8 = HTMLPurifier_Encoder::convertFromUTF8($sample, $this->config, $this->context);
     $this->assertIdentical($from_utf8, $sample);

     $to_utf8 = HTMLPurifier_Encoder::convertToUTF8($sample, $this->config, $this->context);
     $this->assertIdentical($to_utf8, $sample);
 }