/**
 * Decode the RFC 2047 encoded-words ( =?charset?Q|B?text?= ) found in a header value.
 *
 * Every encoded-word is replaced by its decoded text, converted from the charset declared
 * in the encoded-word to $this->charset via SmartUnicode::convert_charset().
 * Text outside of the encoded-words is returned unchanged.
 *
 * @param mixed $input The raw header value (it is cast to string)
 * @return string The header value with all the encoded-words decoded
 */
private function _decodeHeader($input)
 {
     //--
     $input = (string) $input;
     //--
     //-- Remove white space between adjacent encoded-words (case insensitive)
     // the next encoded-word is only asserted by a lookahead (not consumed) ; this way it can also be the left side of the following pair, so runs of 3 or more encoded-words are fully joined
     $joined = preg_replace('/(=\\?[^?]+\\?(q|b)\\?[^?]*\\?=)(\\s)+(?==\\?)/i', '\\1', $input);
     if (is_string($joined)) {
         $input = $joined;
     }
     //end if (on a regex failure keep the input as is)
     //--
     //-- For each encoded-word (case insensitive) ...
     // a single pass replace is used: the decoded output is never scanned again, so a decoded text that looks like an encoded-word is not decoded twice
     $target_charset = $this->charset;
     $decoded = preg_replace_callback('/(=\\?([^?]+)\\?(q|b)\\?([^?]*)\\?=)/i', function ($matches) use ($target_charset) {
         //--
         $charset = (string) $matches[2];
         $encoding = (string) strtoupper((string) $matches[3]);
         $text = (string) $matches[4];
         //--
         if ((string) $charset == '' or (string) strtolower($charset) == 'us-ascii') {
             $charset = 'iso-8859-1';
             // correction :: {{{SYNC-CHARSET-FIX}}} ; charset names are case insensitive
         }
         //end if
         //--
         if ((string) $encoding == 'B') {
             $text = (string) base64_decode($text);
         } else {
             // 'Q' is the only other encoding the regex allows
             // RFC 2047 (4.2): in the Q encoding an underscore stands for a space ; a literal underscore is sent as =5F, so this must be done before decoding
             $text = (string) quoted_printable_decode((string) str_replace('_', ' ', $text));
         }
         //end if else
         //--
         return (string) SmartUnicode::convert_charset($text, $charset, $target_charset);
         // {{{SYNC-CHARSET-CONVERT}}}
         //--
     }, $input);
     //--
     if (is_string($decoded)) {
         $input = $decoded;
     }
     //end if (on a regex failure return the header not decoded instead of loosing it)
     //--
     return (string) $input;
     //--
 }
예제 #2
0
 /**
  * Clean up the disallowed tags and fix the invalid tags of $this->html (the result is stored back in $this->html).
  *
  * Steps, in this order:
  *  - does nothing if $this->is_clean is already set (avoids a re-parse), otherwise sets it
  *  - calls $this->standardize_html()
  *  - replaces by HTML comment placeholders: the comment-like constructs, the unsafe tags together with
  *    their content (script, style, iframe, ...) and the start / end tags of another list (html, body, meta, ...)
  *  - calls $this->parse_elements() and rebuilds every tag of $this->elements: lowercase tag name, escaped
  *    and double quoted attribute values ; the attributes with invalid names, names starting with 'on' or
  *    'formaction', and javascript-like values are dropped ; the tags that cannot be validated are emptied
  *  - converts the result from UTF-8 to HTML-ENTITIES via SmartUnicode::convert_charset()
  *  - if $y_allowed_tags is not empty, filters the result with strip_tags() allowing only those tags
  *  - if $this->dom_processing is not false and DOMDocument exists, round-trips the code through DOMDocument
  *  - if $this->signature is set, wraps the result in the cleaner signature comments
  *
  * @param mixed $y_comments          If exactly FALSE, the comments (including the placeholders) are removed from the result
  * @param array $y_extra_tags_remove Extra tag names to be removed together with their content (ex: img, p, div)
  * @param array $y_extra_tags_clean  Extra tag names for which only the start / end tags are replaced (the content is kept)
  * @param array $y_allowed_tags      If not empty, only these tag names are allowed by the final strip_tags() filter
  * @return void
  */
 private function clean_html($y_comments, $y_extra_tags_remove = array(), $y_extra_tags_clean = array(), $y_allowed_tags = array())
 {
     //--
     if ($this->is_clean != false) {
         return;
         // avoid to re-parse
     }
     //end if
     //--
     $this->is_clean = true;
     //--
     //--
     $this->standardize_html();
     // first, standardize the HTML Code
     //--
     //--
     $arr_tags_0x_list_comments = array('#\\<\\s*?\\!\\-?\\-?(.*?)\\-?\\-?\\>#si');
     //--
     //-- tags that are replaced together with their content
     $arr_tags_2x_list_bad = array('head', 'style', 'script', 'noscript', 'frameset', 'frame', 'iframe', 'canvas', 'audio', 'video', 'applet', 'param', 'object', 'form', 'xml', 'xmp', 'o:p');
     if (Smart::array_size($y_extra_tags_remove) > 0) {
         // add extra entries such as: img, p, div, ... ; iterate by value, the keys of the given array do not matter
         foreach ((array) $y_extra_tags_remove as $extra_tag) {
             $extra_tag = (string) $extra_tag;
             if (preg_match((string) $this->regex_tag_name, $extra_tag)) {
                 if (!in_array($extra_tag, $arr_tags_2x_list_bad, true)) {
                     $arr_tags_2x_list_bad[] = $extra_tag;
                 }
                 //end if
             }
             //end if
         }
         //end foreach
     }
     //end if
     $arr_tags_2x_repl_bad = (array) $arr_tags_0x_list_comments;
     $arr_tags_2x_repl_good = array();
     // explicit init, must stay index-aligned with $arr_tags_2x_repl_bad
     foreach ($arr_tags_0x_list_comments as $unused_regex) {
         $arr_tags_2x_repl_good[] = '<!-- # -->';
         // comment
     }
     //end foreach
     foreach ($arr_tags_2x_list_bad as $bad_tag) {
         $tmp_regex_tag = (array) $this->regex_tag((string) $bad_tag);
         // currently, for nested tags, some content between those tags may remain not removed ... but that is ok as long as the tag is replaced ; possible fix: match with siU instead of si, but that would match all the content until the very last end tag ... which may remove too much content
         $arr_tags_2x_repl_bad[] = $tmp_regex_tag['delimiter'] . '(' . $tmp_regex_tag['tag-start'] . ')' . '.*?' . '(' . $tmp_regex_tag['tag-end'] . ')' . $tmp_regex_tag['delimiter'] . 'si';
         // fix: the parentheses are required for a correct match in this case (balanced regex)
         $arr_tags_2x_repl_good[] = '<!-- ' . Smart::escape_html((string) $bad_tag) . '/ -->';
     }
     //end foreach
     //--
     //-- tags for which just the start / end tags are replaced (includes all the above, to catch the unpaired leftovers)
     $arr_tags_1x_list_bad = (array) array_merge((array) $arr_tags_2x_list_bad, array('!doctype', 'html', 'body', 'base', 'meta', 'link', 'track', 'source', 'plaintext', 'marquee'));
     if (Smart::array_size($y_extra_tags_clean) > 0) {
         // add extra entries such as: img, p, div, ...
         foreach ((array) $y_extra_tags_clean as $extra_tag) {
             $extra_tag = (string) $extra_tag;
             if (preg_match((string) $this->regex_tag_name, $extra_tag)) {
                 if (!in_array($extra_tag, $arr_tags_1x_list_bad, true)) {
                     $arr_tags_1x_list_bad[] = $extra_tag;
                 }
                 //end if
             }
             //end if
         }
         //end foreach
     }
     //end if
     $arr_tags_1x_repl_bad = array();
     $arr_tags_1x_repl_good = array();
     foreach ($arr_tags_1x_list_bad as $bad_tag) {
         $tmp_regex_tag = (array) $this->regex_tag((string) $bad_tag);
         $arr_tags_1x_repl_bad[] = $tmp_regex_tag['delimiter'] . $tmp_regex_tag['tag-start'] . $tmp_regex_tag['delimiter'] . 'si';
         $arr_tags_1x_repl_bad[] = $tmp_regex_tag['delimiter'] . $tmp_regex_tag['tag-end'] . $tmp_regex_tag['delimiter'] . 'si';
         $arr_tags_1x_repl_good[] = '<!-- ' . Smart::escape_html((string) $bad_tag) . ' -->';
         $arr_tags_1x_repl_good[] = '<!-- /' . Smart::escape_html((string) $bad_tag) . ' -->';
     }
     //end foreach
     //--
     //--
     $arr_all_repl_bad = (array) array_merge((array) $arr_tags_2x_repl_bad, (array) $arr_tags_1x_repl_bad);
     $arr_all_repl_good = (array) array_merge((array) $arr_tags_2x_repl_good, (array) $arr_tags_1x_repl_good);
     //--
     //--
     $this->html = (string) preg_replace((array) $arr_all_repl_bad, (array) $arr_all_repl_good, (string) $this->html);
     //--
     //--
     $this->parse_elements();
     //--
     //--
     for ($i = 0; $i < Smart::array_size($this->elements); $i++) {
         //--
         $code = (string) $this->elements[$i];
         if (substr($code, 0, 4) != '<!--' and (strpos($code, '<') !== false or strpos($code, '>') !== false)) {
             // if it looks like a tag and is not a comment
             //--
             $tag_have_endline = false;
             if (substr($code, -1, 1) === "\n") {
                 $tag_have_endline = true;
             }
             //end if
             //--
             $code = trim(str_replace(array("\t", "\n", "\r"), array(' ', ' ', ' '), (string) $code));
             // make tabs and new lines as simple space
             $tmp_parse_attr = (array) $this->get_attributes($code);
             //--
             if (strpos($code, ' ') !== false and Smart::array_size($tmp_parse_attr) > 0) {
                 // tag have attributes
                 //--
                 $tmp_arr = explode(' ', $code);
                 // get tag parts
                 $this->elements[$i] = strtolower((string) $tmp_arr[0]);
                 // recompose the tags
                 foreach ($tmp_parse_attr as $key => $val) {
                     //-- the attribute name is written lowercase below, so it must be checked lowercase too (ex: ONCLICK)
                     $tmp_attr_name = (string) strtolower((string) trim((string) $key));
                     $tmp_attr_value = (string) trim((string) $val);
                     //--
                     $tmp_is_valid_attr = true;
                     if (!preg_match((string) $this->regex_tag_name, (string) $key)) {
                         $tmp_is_valid_attr = false;
                         // remove invalid attributes
                     } elseif (substr($tmp_attr_name, 0, 2) === 'on') {
                         $tmp_is_valid_attr = false;
                         // remove attributes starting with 'on' (all JS Events)
                     } elseif (substr($tmp_attr_name, 0, 10) === 'formaction') {
                         $tmp_is_valid_attr = false;
                         // remove attributes starting with 'formaction'
                     } elseif (substr($tmp_attr_value, 0, 2) === '&{') {
                         $tmp_is_valid_attr = false;
                         // remove attributes of which value are old Netscape JS ; Ex: border="&{getBorderWidth( )};"
                     } elseif (substr($tmp_attr_value, 0, 11) === 'javascript:') {
                         $tmp_is_valid_attr = false;
                         // remove attributes that contain javascript:
                     } elseif (stripos($tmp_attr_value, 'java') !== false and stripos($tmp_attr_value, 'script') !== false and strpos($tmp_attr_value, ':') !== false) {
                         $tmp_is_valid_attr = false;
                         // remove attributes that contain java + script + : (case insensitive, any position)
                     }
                     //end if else
                     if ($tmp_is_valid_attr) {
                         $this->elements[$i] .= ' ' . strtolower($key) . '=' . '"' . str_replace(array('"', '<', '>'), array('&quot;', '&lt;', '&gt;'), (string) $val) . '"';
                     }
                     //end if
                 }
                 //end foreach
                 $this->elements[$i] .= '>';
                 if ($tag_have_endline) {
                     $this->elements[$i] .= "\n";
                 }
                 //end if
                 $tmp_arr = array();
                 //--
             } elseif (preg_match('/^[<' . $this->expr_tag_name . '\\/ >]+$/si', (string) $code)) {
                 // simple tags (including tags like <br />) ; needs extra / and space
                 //--
                 $this->elements[$i] = strtolower((string) $code);
                 if ($tag_have_endline) {
                     $this->elements[$i] .= "\n";
                 }
                 //end if
                 //--
             } else {
                 //--
                 $this->elements[$i] = '';
                 // invalid tags, clear
                 //--
             }
             //end if else
         }
         //end if
         //--
     }
     //end for
     //--
     //--
     $this->html = (string) SmartUnicode::convert_charset((string) implode('', (array) $this->elements), 'UTF-8', 'HTML-ENTITIES');
     //--
     if ($y_comments === false) {
         $this->html = (string) preg_replace((array) $arr_tags_0x_list_comments, '', (string) $this->html);
     }
     //end if
     //--
     //--
     if (Smart::array_size($y_allowed_tags) > 0) {
         $arr_striptags_allow = array();
         foreach ((array) $y_allowed_tags as $allowed_tag) {
             $allowed_tag = (string) $allowed_tag;
             if (preg_match((string) $this->regex_tag_name, $allowed_tag)) {
                 // even if a tag is specified as not allowed above, if allowed here it will take precedence
                 $tmp_allowed_entry = '<' . $allowed_tag . '>';
                 if (!in_array($tmp_allowed_entry, $arr_striptags_allow, true)) {
                     // compare the stored form (with angle brackets), otherwise duplicates are never detected
                     $arr_striptags_allow[] = $tmp_allowed_entry;
                 }
                 //end if
             }
             //end if
         }
         //end foreach
         if (Smart::array_size($arr_striptags_allow) > 0) {
             $str_striptags_allow = (string) implode(',', (array) $arr_striptags_allow);
             $this->html = (string) strip_tags((string) $this->html, (string) $str_striptags_allow);
         }
         //end if
     }
     //end if
     //--
     //--
     $this->html = (string) trim((string) $this->html);
     //--
     //--
     $use_dom = false;
     //--
     if ($this->dom_processing !== false and class_exists('DOMDocument')) {
         //--
         $use_dom = true;
         //--
         if ((string) $this->html != '') {
             //--
             $libxml_prev_state = (bool) @libxml_use_internal_errors(true);
             // collect the libxml errors internally ; remember the previous state to restore it at the end
             @libxml_clear_errors();
             //--
             $dom = new DOMDocument(5, (string) SMART_FRAMEWORK_CHARSET);
             // NOTE(review): the first constructor argument is the document version ; 5 is kept as it was, confirm the intent
             //--
             $dom->encoding = (string) SMART_FRAMEWORK_CHARSET;
             $dom->strictErrorChecking = false;
             // do not throw errors
             $dom->preserveWhiteSpace = true;
             // do not remove redundant white space
             $dom->formatOutput = true;
             // try to format pretty-print the code
             $dom->resolveExternals = false;
             // disable load external entities from a doctype declaration
             $dom->validateOnParse = false;
             // this must be explicit disabled as if set to true it may try to download the DTD and after to validate (insecure ...)
             //--
             @$dom->loadHTML((string) $this->html, LIBXML_ERR_WARNING | LIBXML_NONET | LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED);
             $this->html = (string) @$dom->saveHTML();
             // get back from DOM
             unset($dom);
             // clear DOM
             $this->html = (string) trim((string) preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\\s*~i', '', (string) $this->html));
             // cleanup ; fixes: normally with the above options will add no doctype or html / body tags, but use it just in case ; alternative to this: explode by body to get content
             //--
             if ((string) SMART_FRAMEWORK_DEBUG_MODE == 'yes' or $this->dom_log_errors === true) {
                 // log errors if set
                 $errors = (array) @libxml_get_errors();
                 if (Smart::array_size($errors) > 0) {
                     $notice_log = '';
                     foreach ($errors as $error) {
                         if (is_object($error)) {
                             // the bracketed value previously used an undefined variable ; the libxml error code is used for it
                             $notice_log .= 'PARSE-ERROR: [' . $error->code . '] / Level: ' . $error->level . ' / Line: ' . $error->line . ' / Column: ' . $error->column . ' / Code: ' . $error->code . ' / Message: ' . $error->message . "\n";
                         }
                         //end if
                     }
                     //end foreach
                     if ((string) $notice_log != '') {
                         Smart::log_notice('SmartHtmlParser NOTICE [DOMDocument]:' . "\n" . $notice_log . "\n" . '#END' . "\n");
                     }
                     //end if
                     if ((string) SMART_FRAMEWORK_DEBUG_MODE == 'yes') {
                         Smart::log_notice('SmartHtmlParser / Debug HTML-String:' . "\n" . $this->html . "\n" . '#END');
                     }
                     //end if
                 }
                 //end if
             }
             //end if
             //--
             @libxml_clear_errors();
             @libxml_use_internal_errors($libxml_prev_state);
             // restore the previous state instead of forcing it to false (other code may rely on it)
             //--
         }
         //end if
         //--
     }
     //end if
     //--
     //--
     if ($this->signature) {
         if ($use_dom) {
             $start_signature = '<!-- Smart/HTML.Cleaner [@] -->';
             $end_signature = '<!-- [/@] Smart/HTML.Cleaner -->';
         } else {
             $start_signature = '<!-- Smart/HTML.Cleaner [#] -->';
             $end_signature = '<!-- [/#] Smart/HTML.Cleaner -->';
         }
         //end if else
     } else {
         $start_signature = '';
         $end_signature = '';
     }
     //end if else
     //--
     //--
     $this->html = (string) $start_signature . $this->html . $end_signature;
     //--
 }