cleanUTF8() public static method

Parses the input as UTF-8 and returns a valid UTF-8 string, with non-SGML code points excluded.
public static cleanUTF8 ( string $str, boolean $force_php = false ) : string
$str string The string to clean
$force_php boolean
return string
示例#1
0
 /**
  * Asserts that HTMLPurifier_Encoder::cleanUTF8() turns $string into $expect,
  * once with the default arguments and once with the second argument set to true.
  *
  * @param string      $string Input handed to cleanUTF8().
  * @param string|null $expect Expected output; when null, the input itself is expected back.
  */
 public function assertCleanUTF8($string, $expect = null)
 {
     // A missing expectation means "the input must come back unchanged".
     $expected = ($expect === null) ? $string : $expect;

     $viaDefault = HTMLPurifier_Encoder::cleanUTF8($string);
     $this->assertIdentical($viaDefault, $expected, 'iconv: %s');

     $viaForcedPhp = HTMLPurifier_Encoder::cleanUTF8($string, true);
     $this->assertIdentical($viaForcedPhp, $expected, 'PHP: %s');
 }
 /**
  * Prepares raw HTML for lexing by running it through a fixed sequence of
  * clean-up steps: newline normalization, CDATA escaping, IE conditional
  * removal, optional body extraction, entity expansion, UTF-8 cleaning and
  * optional processing-instruction removal.
  *
  * @param string $html HTML.
  * @param HTMLPurifier_Config $config
  * @param HTMLPurifier_Context $context
  * @return string
  * @todo Consider making protected
  */
 public function normalize($html, $config, $context)
 {
     // Line endings: CRLF is replaced first, then any remaining lone CR.
     if ($config->get('Core.NormalizeNewlines')) {
         $html = str_replace(array("\r\n", "\r"), "\n", $html);
     }

     // The commented ("convoluted") CDATA form is only escaped for trusted HTML.
     if ($config->get('HTML.Trusted')) {
         $html = $this->escapeCommentedCDATA($html);
     }

     // Plain CDATA sections and IE conditionals are always handled.
     $html = $this->escapeCDATA($html);
     $html = $this->removeIEConditional($html);

     // Reduce a full document to its body when asked to.
     if ($config->get('Core.ConvertDocumentToFragment')) {
         $collector = false;
         if ($config->get('Core.CollectErrors')) {
             $collector =& $context->get('ErrorCollector');
         }
         $fragment = $this->extractBody($html);
         if ($collector) {
             // Only report when extraction actually changed the markup.
             if ($fragment != $html) {
                 $collector->send(E_WARNING, 'Lexer: Extracted body');
             }
         }
         $html = $fragment;
     }

     // Expand every entity except the big five.
     $html = $this->_entity_parser->substituteNonSpecialEntities($html);

     // UTF-8 cleaning has to follow entity expansion, because an entity
     // may expand to a non-SGML character.
     $html = HTMLPurifier_Encoder::cleanUTF8($html);

     // Strip processing instructions last, if configured.
     if ($config->get('Core.RemoveProcessingInstructions')) {
         $html = preg_replace('#<\\?.+?\\?>#s', '', $html);
     }

     return $html;
 }
示例#3
0
 /**
  * Runs raw HTML through the pre-lexing clean-up steps: optional body
  * extraction, newline normalization, CDATA escaping, entity expansion
  * and UTF-8 cleaning, in that order.
  *
  * @param string $html    HTML to normalize.
  * @param object $config  Configuration object queried via get(namespace, key).
  * @param object $context Context object (not used by this method).
  * @return string Normalized HTML.
  */
 function normalize($html, $config, &$context)
 {
     // Reduce a full document to its body first, when enabled.
     if ($config->get('Core', 'AcceptFullDocuments')) {
         $html = $this->extractBody($html);
     }

     // Line endings: CRLF is replaced first, then any remaining lone CR.
     $html = str_replace(array("\r\n", "\r"), "\n", $html);

     // The commented ("convoluted") CDATA form is only escaped for trusted HTML.
     if ($config->get('HTML', 'Trusted')) {
         $html = $this->escapeCommentedCDATA($html);
     }

     // Plain CDATA sections are always escaped.
     $html = $this->escapeCDATA($html);

     // Expand every entity except the big five.
     $expanded = $this->_entity_parser->substituteNonSpecialEntities($html);

     // UTF-8 cleaning has to follow entity expansion, because an entity
     // may expand to a non-SGML character.
     return HTMLPurifier_Encoder::cleanUTF8($expanded);
 }
 /**
  * Validates a comma-separated list of font names and rebuilds it in a
  * normalized form.
  *
  * Each entry is trimmed; empty entries are skipped. Generic family names
  * are passed through as-is. Quoted names are unquoted and their CSS
  * escapes expanded. Purely alphanumeric names are emitted bare; anything
  * else is emitted inside single quotes with backslashes and single quotes
  * backslash-escaped.
  *
  * @param string $string  Raw comma-separated font list.
  * @param mixed  $config  Not used by this method.
  * @param mixed  $context Not used by this method.
  * @return string|false The rebuilt list joined by ", ", or false when no
  *                      entry survived.
  */
 public function validate($string, $config, $context)
 {
     // Lookup table of generic family names that are emitted unquoted.
     static $generic_names = array('serif' => true, 'sans-serif' => true, 'monospace' => true, 'fantasy' => true, 'cursive' => true);
     // assume that no font names contain commas in them
     $fonts = explode(',', $string);
     $final = '';
     foreach ($fonts as $font) {
         $font = trim($font);
         if ($font === '') {
             continue;
         }
         // match a generic name
         if (isset($generic_names[$font])) {
             $final .= $font . ', ';
             continue;
         }
         // match a quoted name
         if ($font[0] === '"' || $font[0] === "'") {
             $length = strlen($font);
             // Two characters or fewer cannot hold a non-empty quoted name.
             if ($length <= 2) {
                 continue;
             }
             $quote = $font[0];
             // The closing quote must match the opening one.
             if ($font[$length - 1] !== $quote) {
                 continue;
             }
             // Strip the surrounding quotes, then expand CSS escapes byte by byte.
             $font = substr($font, 1, $length - 2);
             $new_font = '';
             for ($i = 0, $c = strlen($font); $i < $c; $i++) {
                 if ($font[$i] === '\\') {
                     $i++;
                     // A backslash at the very end is kept literally.
                     if ($i >= $c) {
                         $new_font .= '\\';
                         break;
                     }
                     if (ctype_xdigit($font[$i])) {
                         // Hex escape: collect up to six hex digits in total.
                         $code = $font[$i];
                         for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                             if (!ctype_xdigit($font[$i])) {
                                 break;
                             }
                             $code .= $font[$i];
                         }
                         // We have to be extremely careful when adding
                         // new characters, to make sure we're not breaking
                         // the encoding.
                         $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                         // Drop the character if cleanUTF8() rejects it.
                         // NOTE(review): this path skips the step-back below, so
                         // the character right after a rejected escape is consumed
                         // as well, even when it is not whitespace - confirm that
                         // this is intended.
                         if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
                             continue;
                         }
                         $new_font .= $char;
                         // $i now sits on the character after the hex digits. If
                         // trim() leaves it non-empty it is ordinary content, so
                         // step back and let the loop increment revisit it;
                         // otherwise it is swallowed as the escape terminator.
                         if ($i < $c && trim($font[$i]) !== '') {
                             $i--;
                         }
                         continue;
                     }
                     // A backslash followed by a newline produces no output.
                     if ($font[$i] === "\n") {
                         continue;
                     }
                 }
                 // Ordinary character, or the character following a non-hex
                 // backslash escape (the backslash itself is dropped).
                 $new_font .= $font[$i];
             }
             $font = $new_font;
         }
         // $font is a pure representation of the font name
         if (ctype_alnum($font) && $font !== '') {
             // very simple font, allow it in unharmed
             $final .= $font . ', ';
             continue;
         }
         // complicated font, requires quoting
         // armor backslashes and single quotes (backslashes first, so the
         // escapes added for quotes are not doubled)
         $font = str_replace("\\", "\\\\", $font);
         $font = str_replace("'", "\\'", $font);
         $final .= "'{$font}', ";
     }
     // Remove the trailing separator left by the last entry.
     $final = rtrim($final, ', ');
     if ($final === '') {
         return false;
     }
     return $final;
 }
示例#5
0
/**
 * Cleans a string with HTMLPurifier_Encoder::cleanUTF8() and then escapes it
 * with htmlspecialchars() using ENT_COMPAT and the UTF-8 charset.
 *
 * @param string $string Text to escape.
 * @return string The cleaned and escaped text.
 */
function escapeHTML($string)
{
    return htmlspecialchars(
        HTMLPurifier_Encoder::cleanUTF8($string),
        ENT_COMPAT,
        'UTF-8'
    );
}
示例#6
0
 /**
  * Parses a possibly escaped CSS string and returns the "pure"
  * version of it.
  *
  * Handled forms:
  *  - backslash + up to six hex digits: replaced by the corresponding
  *    character, or by nothing when cleanUTF8() rejects that character;
  *    one following character is swallowed as terminator if trim()
  *    reduces it to the empty string;
  *  - backslash + newline: produces no output;
  *  - backslash + any other character: the character itself;
  *  - a backslash at the very end of the string: kept literally.
  *
  * @param string $string Possibly escaped CSS string.
  * @return string The string with escapes expanded.
  */
 protected function expandCSSEscape($string)
 {
     // flexibly parse it
     $ret = '';
     for ($i = 0, $c = strlen($string); $i < $c; $i++) {
         if ($string[$i] === '\\') {
             $i++;
             if ($i >= $c) {
                 // Trailing backslash with nothing to escape.
                 $ret .= '\\';
                 break;
             }
             if (ctype_xdigit($string[$i])) {
                 // Hex escape: collect up to six hex digits in total.
                 $code = $string[$i];
                 for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                     if (!ctype_xdigit($string[$i])) {
                         break;
                     }
                     $code .= $string[$i];
                 }
                 // We have to be extremely careful when adding
                 // new characters, to make sure we're not breaking
                 // the encoding.
                 $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                 if (HTMLPurifier_Encoder::cleanUTF8($char) !== '') {
                     $ret .= $char;
                 }
                 // $i now sits on the character after the hex digits. If it
                 // is not a terminator, step back so the loop increment
                 // revisits it as ordinary content. This has to happen even
                 // when the escaped character was rejected above; otherwise
                 // the character following a rejected escape would be lost.
                 if ($i < $c && trim($string[$i]) !== '') {
                     $i--;
                 }
                 continue;
             }
             if ($string[$i] === "\n") {
                 // Escaped newline produces no output.
                 continue;
             }
         }
         // Ordinary character, or the character following a non-hex
         // backslash escape (the backslash itself is dropped).
         $ret .= $string[$i];
     }
     return $ret;
 }
示例#7
0
 /**
  * Attempts to convert a string to UTF-8 and clean any non-valid UTF-8 characters.
  *
  * A string that already passes isUTF8() is only cleaned. Any other string is
  * first converted from its detected encoding and cleaned afterwards. Cleaning
  * must come second: cleanUTF8() parses its input as UTF-8, so running it on
  * the unconverted string would discard the bytes that still need converting.
  *
  * @param      $string
  *
  * @return bool|string
  */
 public static function convertToUTF8($string)
 {
     // Don't wrap in a class_exists in case the server already has its own version of HTMLPurifier and they have
     // open_basedir restrictions
     require_once Craft::getPathOfAlias('system.vendors.htmlpurifier') . '/HTMLPurifier.standalone.php';
     // If it's already a UTF8 string, just clean and return it
     if (static::isUTF8($string)) {
         return \HTMLPurifier_Encoder::cleanUTF8($string);
     }
     // Detect the source encoding once, from the untouched input.
     $encoding = static::getEncoding($string);
     // Convert it to UTF8 if possible
     if (static::checkForIconv()) {
         // Tell HTMLPurifier which encoding the string is actually in.
         $config = \HTMLPurifier_Config::createDefault();
         $config->set('Core.Encoding', $encoding);
         $string = \HTMLPurifier_Encoder::convertToUTF8($string, $config, null);
     } else {
         $string = mb_convert_encoding($string, 'utf-8', $encoding);
     }
     // Clean only after conversion, when the data really is UTF-8.
     return \HTMLPurifier_Encoder::cleanUTF8($string);
 }
 /**
  * Normalizes raw HTML ahead of lexing. The steps run in a fixed order:
  * newline normalization, CDATA escaping, IE conditional removal, optional
  * body extraction, entity expansion, UTF-8 cleaning and optional removal
  * of processing instructions.
  *
  * @param string $html    HTML to normalize.
  * @param object $config  Configuration object queried via get().
  * @param object $context Context object that supplies the error collector.
  * @return string Normalized HTML.
  */
 public function normalize($html, $config, $context)
 {
     // Step 1: CRLF, then any remaining lone CR, become LF.
     if ($config->get('Core.NormalizeNewlines')) {
         $html = str_replace(array("\r\n", "\r"), "\n", $html);
     }

     // Step 2: CDATA handling; the commented form only for trusted HTML.
     if ($config->get('HTML.Trusted')) {
         $html = $this->escapeCommentedCDATA($html);
     }
     $html = $this->escapeCDATA($html);

     // Step 3: IE conditionals.
     $html = $this->removeIEConditional($html);

     // Step 4: optional document-to-fragment conversion.
     if ($config->get('Core.ConvertDocumentToFragment')) {
         $errors = false;
         if ($config->get('Core.CollectErrors')) {
             $errors =& $context->get('ErrorCollector');
         }
         $body = $this->extractBody($html);
         $changed = ($body != $html);
         // Report the extraction only when a collector is active and the
         // markup actually changed.
         if ($errors && $changed) {
             $errors->send(E_WARNING, 'Lexer: Extracted body');
         }
         $html = $body;
     }

     // Step 5: entity expansion, followed by UTF-8 cleaning of the result.
     $html = $this->_entity_parser->substituteNonSpecialEntities($html);
     $html = HTMLPurifier_Encoder::cleanUTF8($html);

     // Step 6: optional removal of processing instructions.
     if (!$config->get('Core.RemoveProcessingInstructions')) {
         return $html;
     }
     return preg_replace('#<\\?.+?\\?>#s', '', $html);
 }
/**
 * Builds a word-frequency index from a piece of text.
 *
 * The text is cleaned with HTML Purifier (when available), numeric HTML
 * entities are decoded (when mbstring is available), everything is
 * lower-cased, punctuation/separators/symbols are turned into spaces, and
 * the remaining tokens consisting only of letters and digits are counted.
 *
 * @param string $data Text to index.
 * @return array Map of word => number of occurrences; an empty array when
 *               no word was found.
 */
function &search_index($data)
{
    // Always hand back an array, even when nothing gets indexed.
    $words = array();

    // Be sure we will parse UTF-8 data
    // NOTE(review): this conversion only runs when $data already validates as
    // UTF-8; the check may have been meant to be negated - confirm intent.
    if (function_exists('mb_check_encoding') && function_exists('iconv') && function_exists('mb_detect_encoding') && mb_check_encoding($data, 'UTF-8')) {
        $data = iconv(mb_detect_encoding($data), 'UTF-8//TRANSLIT', $data);
    }
    // Clean the UTF-8 string using HTML Purifier
    @(require_once 'lib/HTMLPurifier.auto.php');
    @(require_once 'HTMLPurifier/Encoder.php');
    if (class_exists('HTMLPurifier_Encoder')) {
        // cleanUTF8() is a static method; no encoder instance is needed.
        $data = HTMLPurifier_Encoder::cleanUTF8($data);
    }
    // Remove remaining HTML numeric entities
    if (function_exists('mb_decode_numericentity')) {
        if (!function_exists('utf8_entity_decode')) {
            function utf8_entity_decode($entity)
            {
                $convmap = array(0x0, 0x10000, 0, 0xfffff);
                return mb_decode_numericentity($entity, $convmap, 'UTF-8');
            }
        }
        // Callbacks replace the "e" pattern modifier, which PHP 7 removed.
        // Decimal entities: &#NN; up to &#NNNNN;
        $data = preg_replace_callback(
            '/&#\\d{2,5};/u',
            function ($match) {
                return utf8_entity_decode($match[0]);
            },
            $data
        );
        // Hexadecimal entities: rewritten to decimal form before decoding.
        // The digit class covers 0-9 so that 8 and 9 are matched too.
        $data = preg_replace_callback(
            '/&#x([a-fA-F0-9]{2,8});/u',
            function ($match) {
                return utf8_entity_decode('&#' . hexdec($match[1]) . ';');
            },
            $data
        );
    }
    // Lowerize
    $data = function_exists('mb_convert_case') ? mb_convert_case($data, MB_CASE_LOWER, 'UTF-8') : strtolower($data);
    // Convert punctuations to spaces
    $data = preg_replace('/[\\pP\\pZ\\pS]/u', ' ', $data);
    if ($data != '') {
        // Split into words (do NOT use the split function that doesn't correctly handle some characters !)
        $sstrings = preg_split('/\\s+/u', $data, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($sstrings as $value) {
            // Keep only alpha-num words
            if (preg_match('/^[\\pL\\pN]+$/u', $value)) {
                if (isset($words[$value])) {
                    // count words
                    $words[$value]++;
                } else {
                    $words[$value] = 1;
                }
            }
        }
    }
    return $words;
}