/**
 * Parse a complete HTML document and add the proxy's own markup to it.
 *
 * Runs the document-wide passes first (charset detection, comment / title
 * stripping, <base>, meta refresh and form handling), then walks the document
 * handing markup to $this->HTML() and the contents of <script> / <style>
 * blocks to $this->JS() / $this->CSS(), and finally adds the supplied extras.
 *
 * Side effect: sets the global $charset when it is not already set and a
 * charset declaration is found in a <meta> tag.
 *
 * @param string       $input  Raw HTML document.
 * @param string|false $insert Markup placed right after the opening <body> tag
 *                             (prepended to the document when there is neither
 *                             a <body> nor a frameset). Pass false to skip.
 * @param bool         $inject When truthy, the output of injectionJS() is placed
 *                             right after the opening <head> tag (or prepended).
 * @param string       $footer Markup placed right before </body> (or appended).
 * @return string The processed document.
 */
function HTMLDocument($input, $insert = '', $inject = false, $footer = '') {

    # Raise PCRE's backtrack limit for large documents on PHP 5.3.7 and older
    if (strlen($input) > 65536) {
        if (version_compare(PHP_VERSION, '5.3.7') <= 0) {
            ini_set('pcre.backtrack_limit', 1000000);
        }
    }

    #
    # Apply parsing that only needs to be done once..
    #

    # Record the charset
    global $charset;

    if (!isset($charset)) {
        $meta_equiv = preg_match('#(<meta[^>]*http\\-equiv\\s*=[^>]*>)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[0][0] : null;
        if (isset($meta_equiv)) {
            # Quotes after "charset=" are optional: the usual form is
            # content="text/html; charset=utf-8", where no quote follows the "="
            $charset = preg_match('#charset\\s*=\\s*["\']*([^"\'\\s>]*)#is', $meta_equiv, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;
        }
    }

    if (!isset($charset)) {
        # Fall back to any <meta> carrying a charset (quoted or unquoted)
        $meta_charset = preg_match('#<meta[^>]*charset\\s*=\\s*["\']*([^"\'\\s>]*)#is', $input, $tmp, PREG_OFFSET_CAPTURE) ? $tmp[1][0] : null;
        if (isset($meta_charset)) {
            $charset = $meta_charset;
        }
    }

    # Remove empty script comments
    $input = preg_replace('#/\\*\\s*\\*/#s', '', $input);

    # Remove conditional comments (keep the content of "if !IE" ones)
    $input = preg_replace('#<\\!\\-\\-\\[if \\!IE\\]>\\s*\\-\\->(.*?)<\\!\\[endif\\]\\-\\->#s', '$1', $input);
    $input = preg_replace('#<\\!\\-\\-\\[if.*?<\\!\\[endif\\]\\-\\->#s', '', $input);

    # Prevent websites from calling disableOverride()
    $input = preg_replace('#disableOverride#s', 'disabled___disableOverride', $input);

    # Prevent websites from making STUN requests
    $input = preg_replace('#RTCPeerConnection#s', 'disabled___RTCPeerConnection', $input);

    # Remove titles if option is enabled
    if ($this->htmlOptions['stripTitle'] || $this->htmlOptions['encodePage']) {
        $input = preg_replace('#<title.*?</title>#is', '', $input, 1);
        $input = preg_replace('#<meta[^>]*name=["\'](title|description|keywords)["\'][^>]*>#is', '', $input, 3);
        $input = preg_replace('#<link[^>]*rel=["\'](icon|shortcut icon)["\'][^>]*>#is', '', $input, 2);
    }

    # Remove and record a <base> href
    $input = preg_replace_callback('#<base href\\s*=\\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\\s"\\\'>]{1,2048}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);

    # Proxy url= values in meta redirects
    $input = preg_replace_callback('#content\\s*=\\s*(["\\\'])?[0-9]+\\s*;\\s*url=([\\\'"]|&\\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);

    # Process forms
    $input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);

    # Remove scripts blocks (avoids individual processing below)
    if ($this->htmlOptions['stripJS']) {
        $input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
    }

    #
    # Split up the document into its different types and parse them
    #

    # Build up new document into this var
    $new    = '';
    $offset = 0;

    # Find instances of script or style blocks
    while (preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset)) {

        # What type of block is this?
        $block = strtolower($match[1][0]);

        # Start position of the opening tag and of the block content
        $outerStart = $match[0][1];
        $innerStart = $outerStart + strlen($match[0][0]);

        # Find the matching end tag, settling for the next closing tag of
        # any kind when the proper one is missing
        $innerEnd = stripos($input, "</{$block}>", $innerStart);
        if ($innerEnd === false) {
            $innerEnd = stripos($input, '</', $innerStart);
        }

        if ($innerEnd === false) {
            # Unterminated block running to the end of the document. Drop an
            # unterminated script from ITS opening tag onwards (not from the
            # first <script> in the document, which would invalidate $offset
            # and the chunks already copied into $new); an unterminated style
            # is left to be handled as plain markup by the final HTML() call.
            if ($block === 'script') {
                $input = substr($input, 0, $outerStart);
            }
            break;
        }

        # Parse everything up till here and add to the new document
        $new .= $this->HTML(substr($input, $offset, $innerStart - $offset));

        # Find parsing function
        $parseFunction = $block === 'style' ? 'CSS' : 'JS';

        # Add the parsed block
        $new .= $this->{$parseFunction}(substr($input, $innerStart, $innerEnd - $innerStart));

        # Move offset to the end tag; the tag itself goes through HTML()
        # as part of the next chunk
        $offset = $innerEnd;
    }

    # And add the final chunk (between last script/style block and end of doc)
    $new .= $this->HTML(substr($input, $offset));

    # Replace input with the updated document
    $input = $new;

    global $foundPlugin;
    if ($foundPlugin && function_exists('postParse')) {
        $input = postParse($input, 'html');
        $foundPlugin = false;
    }

    # Make URLs relative
    $input = preg_replace('#=\\s*(["\'])?\\s*https?://[^"\'>/]*/#i', '=$1/', $input);

    # Encode the page
    if ($this->htmlOptions['encodePage']) {
        $input = encodePage($input);
    }

    #
    # Now add our own code bits
    #

    # Insert our mini form after the <body>
    if ($insert !== false) {

        # Check for a frameset. Compare against false explicitly: a document
        # that starts with <frameset yields offset 0, which is falsy.
        $useFrames = stripos($input, '<frameset') !== false;

        if ($useFrames) {
            # Flag the frames so only first displays mini-form
            $input = preg_replace_callback('#<frame[^>]+src\\s*=\\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,2048}|[^\\\']{1,2048})|[^\\s"\\\'>]{1,2048}))(?(1)\\1|)#i', 'html_flagFrames', $input);
        }

        # Attempt to add after body. The inserted markup is escaped so that
        # "$n" / "\n" sequences in it are not taken as back-references.
        $input = preg_replace('#(<body[^>]*>)#i', '$1' . addcslashes($insert, '\\$'), $input, 1, $tmp);

        # Check it inserted and prepend (if not a frameset)
        if (!$tmp && !$useFrames) {
            $input = $insert . $input;
        }
    }

    # Insert our javascript library
    if ($inject) {

        # Generate javascript to insert
        $inject = injectionJS();

        # Add our proxy javascript after <head> (escaped, see above)
        $input = preg_replace('#(<head[^>]*>)#i', '$1' . addcslashes($inject, '\\$'), $input, 1, $tmp);

        # If no <head>, just prepend
        if (!$tmp) {
            $input = $inject . $input;
        }
    }

    # Add anything to the footer?
    if ($footer) {

        $input = preg_replace('#(</body[^>]*>)#i', addcslashes($footer, '\\$') . '$1', $input, 1, $tmp);

        # If no </body>, just append the footer
        if (!$tmp) {
            $input .= $footer;
        }
    }

    # Return new document
    return $input;
}
/**
 * Parse a complete HTML document and add the proxy's own markup to it.
 *
 * Runs the document-wide passes first (title stripping, <base>, meta refresh
 * and form handling), then walks the document handing markup to $this->HTML()
 * and the contents of <script> / <style> blocks to $this->JS() /
 * $this->CSS(), and finally adds the supplied extras.
 *
 * @param string       $input  Raw HTML document.
 * @param string|false $insert Markup placed right after the opening <body> tag
 *                             (prepended to the document when there is neither
 *                             a <body> nor a frameset). Pass false to skip.
 * @param bool         $inject When truthy, the output of injectionJS() is placed
 *                             right after the opening <head> tag (or prepended).
 * @param string       $footer Markup placed right before </body> (or appended).
 * @return string The processed document.
 */
function HTMLDocument($input, $insert = '', $inject = false, $footer = '') {

    //
    // Apply parsing that only needs to be done once..
    //

    // Remove titles if option is enabled
    if ($this->htmlOptions['stripTitle']) {
        $input = preg_replace('#<title.*?</title>#is', '', $input, 1);
    }

    // Remove and record a <base> href
    $input = preg_replace_callback('#<base href\\s*=\\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1);

    // Proxify url= values in meta redirects
    $input = preg_replace_callback('#content\\s*=\\s*(["\\\'])?[0-9]+\\s*;\\s*url=([\\\'"]|&\\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1);

    // Process forms
    $input = preg_replace_callback('#<form([^>]*)>(.*?)</form>#is', 'html_form', $input);

    // Remove scripts blocks (avoids individual processing below)
    if ($this->htmlOptions['stripJS']) {
        $input = preg_replace('#<script[^>]*>.*?</script>#is', '', $input);
    }

    //
    // Split up the document into its different types and parse them
    //

    // Build up new document into this var
    $new    = '';
    $offset = 0;

    // Find instances of script or style blocks
    while (preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset)) {

        // What type of block is this?
        $block = strtolower($match[1][0]);

        // Start position of the opening tag and of the block content
        $outerStart = $match[0][1];
        $innerStart = $outerStart + strlen($match[0][0]);

        // Find the matching end tag, settling for the next closing tag of
        // any kind when the proper one is missing
        $innerEnd = stripos($input, "</{$block}>", $innerStart);
        if ($innerEnd === false) {
            $innerEnd = stripos($input, '</', $innerStart);
        }

        if ($innerEnd === false) {
            // Unterminated block running to the end of the document. Without
            // this guard $offset would become false (i.e. 0) and the same
            // opening tag would be matched again on every iteration.
            // An unterminated script is dropped from its opening tag onwards;
            // an unterminated style is left to be handled as plain markup by
            // the final HTML() call.
            if ($block === 'script') {
                $input = substr($input, 0, $outerStart);
            }
            break;
        }

        // Parse everything up till here and add to the new document
        $new .= $this->HTML(substr($input, $offset, $innerStart - $offset));

        // Find parsing function
        $parseFunction = $block === 'style' ? 'CSS' : 'JS';

        // Add the parsed block
        $new .= $this->{$parseFunction}(substr($input, $innerStart, $innerEnd - $innerStart));

        // Move offset to the end tag; the tag itself goes through HTML()
        // as part of the next chunk
        $offset = $innerEnd;
    }

    // And add the final chunk (between last script/style block and end of doc)
    $new .= $this->HTML(substr($input, $offset));

    // Replace input with the updated document
    $input = $new;

    // Encode the page
    if ($this->htmlOptions['encodePage']) {
        $input = encodePage($input);
    }

    //
    // Now add our own code bits
    //

    // Insert our mini form after the <body>
    if ($insert !== false) {

        // Check for a frameset. Compare against false explicitly: a document
        // that starts with <frameset yields offset 0, which is falsy.
        $useFrames = stripos($input, '<frameset') !== false;

        if ($useFrames) {
            // Flag the frames so only first displays mini-form
            $input = preg_replace_callback('#<frame[^>]+src\\s*=\\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_flagFrames', $input);
        }

        // Attempt to add after body. The inserted markup is escaped so that
        // "$n" / "\n" sequences in it are not taken as back-references.
        $input = preg_replace('#(<body[^>]*>)#i', '$1' . addcslashes($insert, '\\$'), $input, 1, $tmp);

        // Check it inserted and prepend (if not a frameset)
        if (!$tmp && !$useFrames) {
            $input = $insert . $input;
        }
    }

    // Insert our javascript library
    if ($inject) {

        // Generate javascript to insert
        $inject = injectionJS();

        // Add our proxy javascript after <head> (escaped, see above)
        $input = preg_replace('#(<head[^>]*>)#i', '$1' . addcslashes($inject, '\\$'), $input, 1, $tmp);

        // If no <head>, just prepend
        if (!$tmp) {
            $input = $inject . $input;
        }
    }

    // Add anything to the footer?
    if ($footer) {

        $input = preg_replace('#(</body[^>]*>)#i', addcslashes($footer, '\\$') . '$1', $input, 1, $tmp);

        // If no </body>, just append the footer
        if (!$tmp) {
            $input .= $footer;
        }
    }

    // Return new document
    return $input;
}