/**
 * Decode the RFC 2047 encoded-words ( =?charset?Q|B?text?= ) found in a header value.
 *
 * Every encoded-word is decoded (Base64 or Q-encoding) and converted from its
 * declared charset to $this->charset via SmartUnicode::convert_charset().
 * Text outside encoded-words is returned unchanged.
 *
 * @param string $input The raw header value
 * @return string The header value with all encoded-words decoded
 */
private function _decodeHeader($input) {
	//--
	$input = (string) $input;
	//-- Remove white space between adjacent encoded-words
	// The look-ahead does not consume the next '=?', so a run of three or more
	// encoded-words gets every gap removed (not only every second one).
	$compacted = preg_replace('/(=\\?[^?]+\\?(?:q|b)\\?[^?]*\\?=)\\s+(?==\\?)/i', '\\1', $input); // insensitive
	if ($compacted !== null) { // on a PCRE failure keep the header as it was
		$input = (string) $compacted;
	} //end if
	//--
	$target_charset = $this->charset;
	//-- Decode each encoded-word exactly once
	// Using a callback (instead of re-scanning the string in a loop) guarantees that
	// decoded output which happens to look like an encoded-word is not decoded again.
	$decoded = preg_replace_callback(
		'/(=\\?([^?]+)\\?(q|b)\\?([^?]*)\\?=)/i', // insensitive
		function ($matches) use ($target_charset) {
			//--
			$charset = (string) $matches[2];
			$encoding = (string) strtoupper((string) $matches[3]);
			$text = (string) $matches[4];
			//-- charset names are case-insensitive
			$charset_lower = (string) strtolower($charset);
			if ($charset_lower == '' or $charset_lower == 'us-ascii') {
				$charset = 'iso-8859-1'; // correction :: {{{SYNC-CHARSET-FIX}}}
			} //end if
			//-- the pattern only admits Q or B
			if ($encoding === 'B') {
				$text = (string) base64_decode($text);
			} else {
				// in the header "Q" encoding an underscore stands for a space ;
				// a literal underscore arrives as =5F and is restored by the decode below
				$text = (string) quoted_printable_decode((string) str_replace('_', ' ', $text));
			} //end if else
			//--
			return (string) SmartUnicode::convert_charset($text, $charset, $target_charset); // {{{SYNC-CHARSET-CONVERT}}}
			//--
		},
		$input
	);
	//--
	if ($decoded === null) { // PCRE failure: return what we have instead of an empty string
		return (string) $input;
	} //end if
	//--
	return (string) $decoded;
	//--
}
/**
 * Clean up disallowed tags and unsafe attributes in $this->html (in place).
 *
 * Steps, in order:
 *  - returns immediately if $this->is_clean is already set (avoids re-parsing)
 *  - standardizes the HTML via $this->standardize_html()
 *  - replaces comments, the "remove" tags (together with their content) and the
 *    "clean" tags (start/end tag only) with placeholder HTML comments
 *  - re-parses the elements and rebuilds every tag, dropping invalid attributes,
 *    event handlers (on*), formaction and javascript-like values
 *  - converts the result from UTF-8 to HTML-ENTITIES via SmartUnicode::convert_charset()
 *  - optionally strips comments and filters by the allowed tags (strip_tags)
 *  - if enabled and available, post-processes the code through DOMDocument
 *  - wraps the result with the cleaner signature comments when $this->signature is set
 *
 * @param bool  $y_comments          If exactly FALSE, the comments are removed from the result
 * @param array $y_extra_tags_remove Extra tag names to be removed together with their content
 * @param array $y_extra_tags_clean  Extra tag names whose start/end tags are removed (content is kept)
 * @param array $y_allowed_tags      If not empty, only these tags are kept (applied last, via strip_tags)
 * @return void
 */
private function clean_html($y_comments, $y_extra_tags_remove = array(), $y_extra_tags_clean = array(), $y_allowed_tags = array()) {

	//--
	if ($this->is_clean != false) {
		return; // avoid to re-parse
	} //end if
	//--
	$this->is_clean = true;
	//--

	//--
	$this->standardize_html(); // first, standardize the HTML Code
	//--

	//-- comments
	$arr_tags_0x_list_comments = array('#\\<\\s*?\\!\\-?\\-?(.*?)\\-?\\-?\\>#si');
	//--

	//-- tags removed together with their content
	$arr_tags_2x_list_bad = array('head', 'style', 'script', 'noscript', 'frameset', 'frame', 'iframe', 'canvas', 'audio', 'video', 'applet', 'param', 'object', 'form', 'xml', 'xmp', 'o:p');
	if (Smart::array_size($y_extra_tags_remove) > 0) { // add extra entries such as: img, p, div, ...
		foreach ($y_extra_tags_remove as $extra_tag) { // foreach: do not depend on sequential numeric keys
			$extra_tag = (string) $extra_tag;
			if (preg_match((string) $this->regex_tag_name, $extra_tag)) {
				if (!in_array($extra_tag, $arr_tags_2x_list_bad, true)) {
					$arr_tags_2x_list_bad[] = $extra_tag;
				} //end if
			} //end if
		} //end foreach
	} //end if
	//--
	$arr_tags_2x_repl_bad = (array) $arr_tags_0x_list_comments;
	$arr_tags_2x_repl_good = array(); // explicit init (it was only created implicitly by the first append)
	foreach ($arr_tags_0x_list_comments as $comment_regex) {
		$arr_tags_2x_repl_good[] = '<!-- # -->'; // comment
	} //end foreach
	foreach ($arr_tags_2x_list_bad as $bad_tag) {
		$tmp_regex_tag = (array) $this->regex_tag((string) $bad_tag);
		// With nested tags some content between those tags may remain not removed, but that is ok as long as the tag is replaced.
		// Matching greedy instead would run until the very last end tag and may remove too much content.
		// The parentheses are required for a correct match in this case (balanced regex).
		$arr_tags_2x_repl_bad[] = $tmp_regex_tag['delimiter'] . '(' . $tmp_regex_tag['tag-start'] . ')' . '.*?' . '(' . $tmp_regex_tag['tag-end'] . ')' . $tmp_regex_tag['delimiter'] . 'si';
		$arr_tags_2x_repl_good[] = '<!-- ' . Smart::escape_html((string) $bad_tag) . '/ -->';
	} //end foreach
	//--

	//-- tags of which only the start / end tag is removed (the content is kept)
	$arr_tags_1x_list_bad = (array) array_merge((array) $arr_tags_2x_list_bad, array('!doctype', 'html', 'body', 'base', 'meta', 'link', 'track', 'source', 'plaintext', 'marquee'));
	if (Smart::array_size($y_extra_tags_clean) > 0) { // add extra entries such as: img, p, div, ...
		foreach ($y_extra_tags_clean as $extra_tag) {
			$extra_tag = (string) $extra_tag;
			if (preg_match((string) $this->regex_tag_name, $extra_tag)) {
				if (!in_array($extra_tag, $arr_tags_1x_list_bad, true)) {
					$arr_tags_1x_list_bad[] = $extra_tag;
				} //end if
			} //end if
		} //end foreach
	} //end if
	//--
	$arr_tags_1x_repl_bad = array();
	$arr_tags_1x_repl_good = array();
	foreach ($arr_tags_1x_list_bad as $bad_tag) {
		$tmp_regex_tag = (array) $this->regex_tag((string) $bad_tag);
		$arr_tags_1x_repl_bad[] = $tmp_regex_tag['delimiter'] . $tmp_regex_tag['tag-start'] . $tmp_regex_tag['delimiter'] . 'si';
		$arr_tags_1x_repl_bad[] = $tmp_regex_tag['delimiter'] . $tmp_regex_tag['tag-end'] . $tmp_regex_tag['delimiter'] . 'si';
		$arr_tags_1x_repl_good[] = '<!-- ' . Smart::escape_html((string) $bad_tag) . ' -->';
		$arr_tags_1x_repl_good[] = '<!-- /' . Smart::escape_html((string) $bad_tag) . ' -->';
	} //end foreach
	//--

	//--
	$arr_all_repl_bad = (array) array_merge((array) $arr_tags_2x_repl_bad, (array) $arr_tags_1x_repl_bad);
	$arr_all_repl_good = (array) array_merge((array) $arr_tags_2x_repl_good, (array) $arr_tags_1x_repl_good);
	//--

	//-- if preg_replace fails it returns NULL, which the cast turns into an empty string
	$this->html = (string) preg_replace((array) $arr_all_repl_bad, (array) $arr_all_repl_good, (string) $this->html);
	//--

	//--
	$this->parse_elements();
	//--

	//-- rebuild every tag ; the loop only rewrites existing entries, so the size is computed once
	$total_elements = (int) Smart::array_size($this->elements);
	for ($i = 0; $i < $total_elements; $i++) {
		//--
		$code = (string) $this->elements[$i];
		//--
		if (substr($code, 0, 4) != '<!--' and (strpos($code, '<') !== false or strpos($code, '>') !== false)) { // if valid tag and not a comment
			//--
			$tag_have_endline = (bool) (substr($code, -1, 1) === "\n");
			//--
			$code = trim(str_replace(array("\t", "\n", "\r"), ' ', (string) $code)); // make tabs and new lines as simple space
			$tmp_parse_attr = (array) $this->get_attributes($code);
			//--
			if (strpos($code, ' ') !== false and Smart::array_size($tmp_parse_attr) > 0) { // tag have attributes
				//--
				$tmp_arr = explode(' ', $code); // get tag parts
				$this->elements[$i] = strtolower((string) $tmp_arr[0]); // recompose the tags
				//--
				foreach ($tmp_parse_attr as $key => $val) {
					//-- the name is lowercased on output, so the checks must be done on the lowercased name too (otherwise ONCLICK would pass and be written as onclick)
					$attr_name = (string) strtolower((string) trim((string) $key));
					$attr_value = (string) trim((string) $val);
					//--
					$tmp_is_valid_attr = true;
					if (!preg_match((string) $this->regex_tag_name, (string) $key)) {
						$tmp_is_valid_attr = false; // remove invalid attributes
					} elseif (substr($attr_name, 0, 2) == 'on') {
						$tmp_is_valid_attr = false; // remove attributes starting with 'on' (all JS Events)
					} elseif (substr($attr_name, 0, 10) == 'formaction') {
						$tmp_is_valid_attr = false; // remove attributes starting with 'formaction'
					} elseif (substr($attr_value, 0, 2) == '&{') {
						$tmp_is_valid_attr = false; // remove attributes of which value are old Netscape JS ; Ex: border="&{getBorderWidth( )};"
					} elseif (substr($attr_value, 0, 11) == 'javascript:') {
						$tmp_is_valid_attr = false; // remove attributes that contain javascript:
					} elseif (stripos($attr_value, 'java') !== false and stripos($attr_value, 'script') !== false and strpos($attr_value, ':') !== false) {
						$tmp_is_valid_attr = false; // remove attributes that contain java + script + :
					} //end if else
					//--
					if ($tmp_is_valid_attr) {
						// the value is written between double quotes: the quote and the angle brackets must be entity-escaped, otherwise a value could break out of the attribute
						$this->elements[$i] .= ' ' . strtolower((string) $key) . '=' . '"' . str_replace(array('"', '<', '>'), array('&quot;', '&lt;', '&gt;'), (string) $val) . '"';
					} //end if
					//--
				} //end foreach
				//--
				$this->elements[$i] .= '>';
				if ($tag_have_endline) {
					$this->elements[$i] .= "\n";
				} //end if
				$tmp_arr = array();
				//--
			} elseif (preg_match('/^[<' . $this->expr_tag_name . '\\/ >]+$/si', (string) $code)) { // simple tags (including tags like <br />) ; needs extra / and space
				//--
				$this->elements[$i] = strtolower((string) $code);
				if ($tag_have_endline) {
					$this->elements[$i] .= "\n";
				} //end if
				//--
			} else {
				//--
				$this->elements[$i] = ''; // invalid tags, clear
				//--
			} //end if else
			//--
		} //end if
		//--
	} //end for
	//--

	//--
	$this->html = (string) SmartUnicode::convert_charset((string) implode('', (array) $this->elements), 'UTF-8', 'HTML-ENTITIES');
	//--
	if ($y_comments === false) {
		$this->html = (string) preg_replace((array) $arr_tags_0x_list_comments, '', (string) $this->html);
	} //end if
	//--

	//-- if allowed tags are specified they take precedence, even over the tags listed as disallowed
	if (Smart::array_size($y_allowed_tags) > 0) {
		$arr_striptags_allow = array();
		foreach ($y_allowed_tags as $allowed_tag) {
			$allowed_tag = (string) $allowed_tag;
			if (preg_match((string) $this->regex_tag_name, $allowed_tag)) {
				$allowed_entry = '<' . $allowed_tag . '>';
				if (!in_array($allowed_entry, $arr_striptags_allow, true)) { // compare against the stored form ( <tag> ), otherwise duplicates are never detected
					$arr_striptags_allow[] = $allowed_entry;
				} //end if
			} //end if
		} //end foreach
		if (Smart::array_size($arr_striptags_allow) > 0) {
			$str_striptags_allow = (string) implode(',', (array) $arr_striptags_allow);
			$this->html = (string) strip_tags((string) $this->html, (string) $str_striptags_allow);
		} //end if
	} //end if
	//--

	//--
	$this->html = (string) trim((string) $this->html);
	//--

	//-- optional post-processing via DOMDocument
	$use_dom = false;
	//--
	if ($this->dom_processing !== false and class_exists('DOMDocument')) {
		//--
		$use_dom = true;
		//--
		if ((string) $this->html != '') {
			//--
			@libxml_use_internal_errors(true);
			@libxml_clear_errors();
			//--
			$dom = new DOMDocument(5, (string) SMART_FRAMEWORK_CHARSET);
			//--
			$dom->encoding = (string) SMART_FRAMEWORK_CHARSET;
			$dom->strictErrorChecking = false; // do not throw errors
			$dom->preserveWhiteSpace = true; // do not remove redundant white space
			$dom->formatOutput = true; // try to format pretty-print the code
			$dom->resolveExternals = false; // disable load external entities from a doctype declaration
			$dom->validateOnParse = false; // this must be explicit disabled as if set to true it may try to download the DTD and after to validate (insecure ...)
			//--
			@$dom->loadHTML((string) $this->html, LIBXML_ERR_WARNING | LIBXML_NONET | LIBXML_PARSEHUGE | LIBXML_BIGLINES | LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED);
			$this->html = (string) @$dom->saveHTML(); // get back from DOM
			unset($dom); // clear DOM
			// cleanup: with the above options no doctype or html / body tags are expected to be added, but remove them just in case
			$this->html = (string) trim((string) preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\\s*~i', '', (string) $this->html));
			//--
			if ((string) SMART_FRAMEWORK_DEBUG_MODE == 'yes' or $this->dom_log_errors === true) { // log errors if set
				$errors = (array) @libxml_get_errors();
				if (Smart::array_size($errors) > 0) {
					$notice_log = '';
					foreach ($errors as $z => $error) {
						if (is_object($error)) {
							// the bracketed value is the index of the error in the libxml error list (an undefined variable was used here before)
							$notice_log .= 'PARSE-ERROR: [' . $z . '] / Level: ' . $error->level . ' / Line: ' . $error->line . ' / Column: ' . $error->column . ' / Code: ' . $error->code . ' / Message: ' . $error->message . "\n";
						} //end if
					} //end foreach
					if ((string) $notice_log != '') {
						Smart::log_notice('SmartHtmlParser NOTICE [DOMDocument]:' . "\n" . $notice_log . "\n" . '#END' . "\n");
					} //end if
					if ((string) SMART_FRAMEWORK_DEBUG_MODE == 'yes') {
						Smart::log_notice('SmartHtmlParser / Debug HTML-String:' . "\n" . $this->html . "\n" . '#END');
					} //end if
				} //end if
			} //end if
			//--
			@libxml_clear_errors();
			@libxml_use_internal_errors(false);
			//--
		} //end if
		//--
	} //end if
	//--

	//-- signature: [@] marks the DOM post-processing path, [#] the path without it
	if ($this->signature) {
		if ($use_dom) {
			$start_signature = '<!-- Smart/HTML.Cleaner [@] -->';
			$end_signature = '<!-- [/@] Smart/HTML.Cleaner -->';
		} else {
			$start_signature = '<!-- Smart/HTML.Cleaner [#] -->';
			$end_signature = '<!-- [/#] Smart/HTML.Cleaner -->';
		} //end if else
	} else {
		$start_signature = '';
		$end_signature = '';
	} //end if else
	//--

	//--
	$this->html = (string) $start_signature . $this->html . $end_signature;
	//--

}