/**
 * Gathers the advertisement links from the remote page.
 *
 * Requests $post['url'] through the HTTP client, repairs the returned markup
 * with Tidy when the extension is available, and collects the href of every
 * anchor that points at the advertising domain given in $post['ad'] (or at a
 * subdomain of it).
 *
 * @param array $post The post data submitted by the form; the 'url' and 'ad'
 *                    keys are read.
 * @return array The matching hrefs, in document order.
 */
public function fetchAds($post)
{
    $this->_client->setUri($post['url']);
    $response = $this->_client->request('GET')->getBody();

    // If the tidy class exists, attempt to clean up the markup returned by
    // the remote site before handing it to the DOM parser.
    if (class_exists('tidy')) {
        $tidyConfig = array(
            'indent' => true,
            'tab-size' => 4,
            'output-encoding' => 'utf8',
            'newline' => 'LF',
            'output-xhtml' => true,
        );
        // Configure through parseString() rather than by constructing the
        // object against '/dev/null', which only exists on Unix-like hosts.
        $tidy = new Tidy();
        $tidy->parseString($response, $tidyConfig, 'utf8');
        $tidy->cleanRepair();
        $response = $tidy->value;
    }

    $xml = new DOMDocument('1.0', 'utf-8');
    $xml->loadHTML($response);

    // The domain comes from form input: quote it so that '.' only matches a
    // literal dot and no regex syntax can be injected into the pattern.
    $domain = preg_quote($post['ad'], '/');
    $pattern = '/^http:\/\/([a-z\-]+\.)?' . $domain . '.*$/i';

    $result = array();
    foreach ($xml->getElementsByTagName('a') as $a) {
        $href = $a->getAttribute('href');
        // Keep only links whose target is the advertising site.
        if (preg_match($pattern, $href)) {
            $result[] = $href;
        }
    }

    return $result;
}
/**
 * Builds the parser from a piece of (possibly malformed) HTML.
 *
 * The content is converted to XHTML with the tidy PHP extension when it is
 * loaded, otherwise with the tidy command line tool when `which tidy` finds
 * one, otherwise it is used as-is. The result is then loaded into SimpleXML.
 *
 * @param string $content The markup to parse.
 * @throws Exception When the (tidied) content cannot be parsed as XML.
 */
public function __construct($content)
{
    // Removed from tidy's output: the default XHTML namespace declaration and
    // non-breaking spaces. With 'numeric-entities' enabled tidy writes the
    // latter as '&#160;'; the raw UTF-8 character is stripped as well in case
    // the CLI emits it unescaped.
    $strip = array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;', "\xC2\xA0");

    if (extension_loaded('tidy')) {
        // using the tidy php extension
        $tidy = new Tidy();
        $tidy->parseString(
            $content,
            array('output-xhtml' => true, 'numeric-entities' => true, 'wrap' => 99999),
            'utf8'
        );
        $tidy->cleanRepair();
        $tidy = str_replace($strip, '', (string) $tidy);
    } elseif (@shell_exec('which tidy')) {
        // using tidy through cli
        $CLI_content = escapeshellarg($content);
        $tidy = `echo {$CLI_content} | tidy -n -q -utf8 -asxhtml 2> /dev/null`;
        // The shell call yields null when there is no output.
        $tidy = str_replace($strip, '', (string) $tidy);
    } else {
        // no tidy library found, hence no sanitizing
        $tidy = $content;
    }

    $this->simpleXML = @simplexml_load_string($tidy, 'SimpleXMLElement', LIBXML_NOWARNING);
    if (!$this->simpleXML) {
        throw new Exception('CSSContentParser::__construct(): Could not parse content.' . ' Please check the PHP extension tidy is installed.');
    }

    parent::__construct();
}
/**
 * Runs the stylesheet over an XML document and returns XHTML-compatible
 * markup.
 *
 * @param $xml XML DOM tree, or string filename
 * @return string HTML output
 * @todo Rename to transformToXHTML, as transformToHTML is misleading
 */
public function transformToHTML($xml)
{
    $dom = $xml;
    if (is_string($xml)) {
        // a string is treated as the path of the XML file to load
        $dom = new DOMDocument();
        $dom->load($xml);
    }

    $out = $this->xsltProcessor->transformToXML($dom);

    // Fudges for HTML backwards compatibility (assumes the document is
    // XHTML): write "<br />" instead of "<br/>", then drop empty xmlns
    // attributes. The pairs are applied in order.
    $out = str_replace(array('/>', ' xmlns=""'), array(' />', ''), $out);

    if (!class_exists('Tidy')) {
        return $out;
    }

    // cleanup output
    $formatter = new Tidy();
    $formatter->parseString(
        $out,
        array('indent' => true, 'output-xhtml' => true, 'wrap' => 80),
        'utf8'
    );
    $formatter->cleanRepair();

    return (string) $formatter;
}
/**
 * Serialises a list of tokens into an HTML string.
 *
 * @param $tokens Array of HTMLPurifier_Token
 * @return Generated HTML
 */
public function generateFromTokens($tokens)
{
    if (!$tokens) {
        return '';
    }

    $html = '';
    $total = count($tokens);
    $pos = 0;
    while ($pos < $total) {
        // Script special case: start tag, contents and end tag. The contents
        // of the script block must be ONE token for this to work.
        $isScriptBlock = $this->_scriptFix
            && $tokens[$pos]->name === 'script'
            && $pos + 2 < $total
            && $tokens[$pos + 2] instanceof HTMLPurifier_Token_End;

        if ($isScriptBlock) {
            $html .= $this->generateFromToken($tokens[$pos]);
            $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
            $pos += 2;
        }

        $html .= $this->generateFromToken($tokens[$pos]);
        $pos++;
    }

    // Optional pretty-printing through Tidy
    if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
        $tidyOptions = array(
            'indent' => true,
            'output-xhtml' => $this->_xhtml,
            'show-body-only' => true,
            'indent-spaces' => 2,
            'wrap' => 68,
        );
        $formatter = new Tidy();
        $formatter->parseString($html, $tidyOptions, 'utf8');
        $formatter->cleanRepair();
        $html = (string) $formatter; // explicit cast necessary
    }

    // Convert "\n" to the configured newline, defaulting to the system one
    $newline = $this->config->get('Output.Newline');
    if ($newline === null) {
        $newline = PHP_EOL;
    }

    return $newline === "\n" ? $html : str_replace("\n", $newline, $html);
}
/**
 * Builds the parser from a piece of (possibly malformed) HTML.
 *
 * The content is converted to XHTML with the tidy PHP extension when it is
 * loaded, otherwise with the tidy command line tool when `which tidy` finds
 * one, otherwise it is used as-is. The result is wrapped in a
 * SimpleXMLElement, whose constructor throws when the string is not
 * well-formed XML.
 *
 * @param string $content The markup to parse.
 */
function __construct($content)
{
    // Removed from tidy's output: the default XHTML namespace declaration and
    // non-breaking spaces. With 'numeric-entities' enabled tidy writes the
    // latter as '&#160;'; the raw UTF-8 character is stripped as well in case
    // the CLI emits it unescaped.
    $strip = array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;', "\xC2\xA0");

    if (extension_loaded('tidy')) {
        // using the tidy php extension
        $tidy = new Tidy();
        $tidy->parseString(
            $content,
            array(
                'output-xhtml' => true,
                'numeric-entities' => true,
            ),
            'utf8'
        );
        $tidy->cleanRepair();
        $tidy = str_replace($strip, '', (string) $tidy);
    } elseif (`which tidy`) {
        // using tidy through cli
        $CLI_content = escapeshellarg($content);
        $tidy = `echo $CLI_content | tidy -n -q -utf8 -asxhtml 2> /dev/null`;
        // The shell call yields null when there is no output.
        $tidy = str_replace($strip, '', (string) $tidy);
    } else {
        // no tidy library found, hence no sanitizing
        $tidy = $content;
    }

    $this->simpleXML = new SimpleXMLElement($tidy);
}
/**
 * Passes the content through Tidy in XML-in / XML-out mode, with line
 * wrapping disabled, and returns the repaired markup.
 *
 * @param string $content
 * @return string
 */
protected function _tidyFix($content)
{
    $repairer = new Tidy();
    $repairer->parseString(
        $content,
        ['input-xml' => true, 'output-xml' => true, 'wrap' => false],
        'utf8'
    );
    $repairer->cleanRepair();

    return (string) $repairer;
}
/**
 * Re-indents HTML responses with Tidy before they are sent.
 *
 * Responses whose request '_format' is not 'html' are returned untouched.
 * Otherwise the value of Tidy's <html> node is returned.
 *
 * @param array|ArrayAccess $request  Request data; only '_format' is read.
 * @param string            $response The response body.
 * @return string
 */
public static function beforeResponse($request, $response)
{
    if ($request['_format'] != 'html') {
        return $response;
    }

    $options = array('wrap' => 200, 'indent' => true);

    $formatter = new \Tidy();
    $formatter->parseString($response, $options, 'utf8');
    $formatter->cleanRepair();

    return $formatter->html()->value;
}
/**
 * Normalises an HTML document: converts it to UTF-8, repairs it with Tidy
 * (dropping comments), then post-processes the meta content-type and the
 * doctype through the class's own helpers.
 *
 * @param string      $html
 * @param string|null $charset      Passed through to toUTF8().
 * @param string|null $charset_hint Passed through to toUTF8().
 * @return string
 */
public function formatHtml($html, $charset = null, $charset_hint = null)
{
    $utf8Html = $this->toUTF8($html, $charset, $charset_hint);

    $repairer = new Tidy();
    $repairer->parseString($utf8Html, array("hide-comments" => true), 'UTF8');
    $repairer->cleanRepair();

    $withMetaFirst = $this->moveMetaContentTypeToTop((string) $repairer);

    return $this->formatDocType($withMetaFirst);
}
/**
 * Tidies the given HTML and loads the result into a DOM tree.
 *
 * Stores the raw input in $this->html_code, the tidied markup in
 * $this->tidy_code and the parsed document in $this->dom. When the tidied
 * markup cannot be parsed as XML, $this->dom is set to null and an
 * E_USER_ERROR is raised.
 *
 * @param string $html_code The HTML to parse.
 */
function parse_html($html_code)
{
    $this->html_code = $html_code;

    // Tidy HTML code
    $tidy = new Tidy();
    $tidy->parseString($html_code, $this->tidy_config, 'utf8');
    $tidy->cleanRepair();
    $this->tidy_code = $tidy->value;

    // loadXML() is an instance method; calling it statically is an Error on
    // PHP 8. The empty-string guard avoids handing loadXML() an input it
    // rejects outright.
    $xml = (string) $this->tidy_code;
    $dom = new DOMDocument();
    if ($xml === '' || !$dom->loadXML($xml)) {
        // Report the failure before touching the document, so a parse error
        // does not end in a method call on a non-object.
        $this->dom = null;
        trigger_error("Unable to parse XML Document!", E_USER_ERROR);
        return;
    }

    $dom->normalizeDocument();
    $this->dom = $dom;
}
/**
 * Rewrites every "rvps14"/"rvps8" div that wraps a table: the table is
 * rebuilt as a bare <table>, its styled header spans become
 * <b class="table-header"> elements, and the result is cleaned with Tidy.
 *
 * @param string $text
 * @return string
 */
public function formatTables($text)
{
    $wrapperPattern = '%<div class="rvps(?:14|8)">\\n*<table.*?>([\\s\\S]*?)</table>\\n*</div>%u';

    $tidyOptions = array(
        'clean' => true,
        'output-html' => true,
        'show-body-only' => true,
        'wrap' => 0,
        'indent' => true,
    );

    $rebuildTable = function ($matches) use ($tidyOptions) {
        $markup = '<table>' . $matches[1] . '</table>';

        // Header cells: styled span, optionally wrapped in a styled paragraph.
        // Class pairings seen in the input:
        //   rvps14 - rvps14
        //   rvps14 - rvps11
        //   rvps4  - rvps15
        $markup = preg_replace(
            '%(?:<p class="rvps(?:1|4|14)">)?<span class="rvts(?:9|15|23)">\\s*(.*?)\\s*</span>(?:</p>)?%u',
            '<b class="table-header">$1</b>',
            $markup
        );
        // Drop headers that only contained a line break
        $markup = preg_replace('%<b class="table-header"><br></b>%u', '', $markup);

        $cleaner = new \Tidy();
        $cleaner->parseString($markup, $tidyOptions, 'utf8');
        $cleaner->cleanRepair();

        return $cleaner . "\n";
    };

    return preg_replace_callback($wrapperPattern, $rebuildTable, $text);
}
/**
 * Reads from the wrapped input and returns it after filtering through Tidy.
 *
 * @param null $len Passed straight to the underlying reader.
 *
 * @throws BuildException When the tidy extension is not available.
 * @return the resulting stream, or -1 if the end of the resulting stream has been reached
 *
 */
public function read($len = null)
{
    if (!class_exists('Tidy')) {
        throw new BuildException("You must enable the 'tidy' extension in your PHP configuration in order to use the Tidy filter.");
    }

    // Lazy one-time initialisation of the filter
    if (!$this->getInitialized()) {
        $this->_initialize();
        $this->setInitialized(true);
    }

    $chunk = $this->in->read($len);
    if ($chunk === -1) {
        // end of the underlying stream
        return -1;
    }

    $options = $this->getDistilledConfig();

    $filter = new Tidy();
    $filter->parseString($chunk, $options, $this->encoding);
    $filter->cleanRepair();

    return tidy_get_output($filter);
}
/**
 * Applies the stylesheet to an XML document and returns the HTML result.
 *
 * @param $xml XML DOM tree
 */
public function transformToHTML($xml)
{
    $out = $this->xsltProcessor->transformToXML($xml);

    // Fudges for HTML backwards compatibility, applied in this order:
    //   "<br/>" style tags become "<br />",
    //   empty xmlns attributes are removed,
    //   the XHTML namespace declaration is removed.
    $search = array('/>', ' xmlns=""', ' xmlns="http://www.w3.org/1999/xhtml"');
    $replace = array(' />', '', '');
    $out = str_replace($search, $replace, $out);

    if (!class_exists('Tidy')) {
        return $out;
    }

    // cleanup output
    $formatter = new Tidy();
    $formatter->parseString(
        $out,
        array('indent' => true, 'output-xhtml' => true, 'wrap' => 80),
        'utf8'
    );
    $formatter->cleanRepair();

    return (string) $formatter;
}
/**
 * Downloads a page with cURL, converts it to XHTML with Tidy and exposes it
 * as a DOM document plus an XPath object.
 *
 * URIs that do not start with "http:" or "https:" are prefixed with the
 * configured base href. Sets $this->request_info, $this->location,
 * $this->document and $this->xpath.
 *
 * @param string $uri Absolute or base-relative address of the page.
 */
protected function loadHtml($uri)
{
    if (preg_match('/^https?:/i', $uri) === 0) {
        $uri = $this->config->getBaseHref() . $uri;
    }

    // Fetch the page body as a string and keep the transfer details
    $handle = curl_init($uri);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    $body = curl_exec($handle);
    $this->request_info = curl_getinfo($handle);
    curl_close($handle);

    $this->location = $uri;

    $tidyOptions = array(
        'output-xhtml' => true,
        'char-encoding' => 'utf8',
        'numeric-entities' => true,
    );
    $xhtml = new Tidy();
    $xhtml->parseString($body, $tidyOptions, 'utf8');
    $xhtml->cleanRepair();

    $document = new DOMDocument();
    $document->resolveExternals = true;
    $document->loadXml($xhtml);
    $this->document = $document;

    $xpath = new DOMXPath($document);
    $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
    $xpath->registerNamespace('html', 'http://www.w3.org/1999/xhtml');
    $this->xpath = $xpath;
}
$response['message'] = ""; //use php's filter to check for a valid url if (!filter_var($_POST['url'], FILTER_VALIDATE_URL) === false) { $url = $_POST['url']; $curl = new MyCurl($url); $curl->createCurl(); $response['code'] = $curl->getHttpStatus(); $response['message'] = HttpCodes::getType($response['code']); $html = $curl->__toString(); if (!is_string($html)) { $response['message'] = "Page Could not be loaded, check the domain. Nothing was returned."; } else { $tidy = new Tidy(); //load page into tidy object, set options, and clean html $tidy->parseString($html, array('indent' => 2, 'output-xhtml' => true)); $tidy->cleanRepair(); //html is now nicely indented $html = (string) $tidy; //count the tags and get the result in a $tag => $count array $tagCount = countTags($html); $response['tagCount'] = $tagCount; $response['html'] = htmlentities($html); } } else { $response['message'] = $_POST['url'] . " is not a valid URL"; } header('Content-Type: application/json'); echo json_encode($response); } else { //load the base view, located at ../views/base.php View::load("base");
/**
 * Converts HTML tag soup into XHTML that an XML parser can read.
 *
 * @param string $htmlTagSoup The markup to tidy.
 * @return string The tidied XHTML, with every "&nbsp;" entity rewritten to
 *                its numeric form "&#160;".
 */
function tidyToXml($htmlTagSoup)
{
    // Create the Tidy object
    $tidy = new Tidy();

    // Parse the HTML into memory, turning on the option to convert to
    // XHTML as part of the tidying process
    $tidy->parseString($htmlTagSoup, array('output-xhtml' => true));

    // Do the tidying
    $tidy->cleanRepair();

    // And get the tidied version as a string
    $tidied_xml = tidy_get_output($tidy);

    // Opinions seem to differ as to whether the non-breaking space entity
    // '&nbsp;' is predeclared as part of XHTML. Tidy thinks it is, and so
    // leaves it alone, while an XML parser run on this string thinks
    // otherwise. So replace any occurrences of it with its numeric equivalent
    // (which doesn't need to be declared). Both strings are spelled out as
    // entities so the replacement cannot collapse into a no-op.
    return str_replace('&nbsp;', '&#160;', $tidied_xml);
}
/**
 * Cleans an HTML fragment with Tidy.
 *
 * The fragment is re-indented and returned body-only without a doctype;
 * empty paragraphs, font tags, proprietary attributes and comments are
 * dropped, and <i>/<b> are replaced by their logical equivalents.
 *
 * @param string $content The HTML fragment.
 * @return string The cleaned fragment, or '' when $content is empty.
 */
private function clean($content)
{
    if (!$content) {
        return '';
    }

    $tidy = new \Tidy();
    $tidy->parseString($content, [
        'indent' => true,
        'doctype' => 'omit',
        'output-html' => true,
        'show-body-only' => true,
        'drop-empty-paras' => true,
        'drop-font-tags' => true,
        'drop-proprietary-attributes' => true,
        'hide-comments' => true,
        'logical-emphasis' => true,
    ]);
    $tidy->cleanRepair();

    // Return the repaired markup; casting $content would hand back the
    // untouched input and discard all of the work above.
    return (string) $tidy;
}
/**
 * The filter must produce exactly what Tidy produces when driven directly
 * with the filter's own config and encoding.
 *
 * @dataProvider filterProvider
 * @covers Robo47_Filter_Tidy::filter
 */
public function testFilter($code)
{
    $filter = new Robo47_Filter_Tidy();
    $actual = $filter->filter($code);

    // Reference output, produced without going through the filter
    $reference = new Tidy();
    $reference->parseString($code, $filter->getConfig(), $filter->getEncoding());
    $reference->cleanRepair();
    $expected = (string) $reference;

    $this->assertEquals($expected, $actual, 'Filter output missmatches direct tidy-output');
}
/**
	Clean and repair HTML using the TIDY options and ENCODING stored in
	the framework variables; the input is returned unchanged when the
	tidy extension is not loaded
		@return string
		@param $html string
		@public
**/
static function tidy($html)
{
    if (extension_loaded('tidy')) {
        // Tidy expects encoding names without dashes
        $encoding = str_replace('-', '', self::$vars['ENCODING']);

        $doc = new Tidy();
        $doc->parseString($html, self::$vars['TIDY'], $encoding);
        $doc->cleanRepair();

        return (string) $doc;
    }

    return $html;
}
/**
 * Runs the value through the wrapped Tidy instance, using this filter's
 * config and encoding, and returns the repaired markup.
 *
 * @see Zend_Filter_Interface::filter
 * @param string $value
 * @return string
 */
public function filter($value)
{
    $tidy = $this->_tidy;

    $tidy->parseString($value, $this->getConfig(), $this->getEncoding());
    $tidy->cleanRepair();

    $filtered = (string) $tidy;

    return $filtered;
}
/**
 * Custom function that formats a string of HTML using Tidy.
 *
 * Formatting only happens when the Tidy class exists and the 'unl_tidy'
 * variable is set; otherwise, or when Tidy reports failure, the string is
 * returned as it stands at that point.
 *
 * @param string $string The HTML to format.
 * @return string The formatted HTML. This is always a string, never the Tidy
 *                object.
 */
function unl_tidy($string)
{
    if (class_exists('Tidy') && variable_get('unl_tidy')) {
        $tidy = new Tidy();

        // Tidy options: http://tidy.sourceforge.net/docs/quickref.html
        $options = array(
            'doctype' => 'omit',
            'new-blocklevel-tags' => 'article,aside,header,footer,section,nav,hgroup,address,figure,figcaption,output',
            'new-inline-tags' => 'video,audio,canvas,ruby,rt,rp,time,code,kbd,samp,var,mark,bdi,bdo,wbr,details,datalist,source,summary',
            'output-xhtml' => true,
            'show-body-only' => true,
            'indent' => true,
            'indent-spaces' => 2,
            'vertical-space' => false,
            'wrap' => 140,
            'wrap-attributes' => false,
            'force-output' => true,
            'quiet' => true,
            'tidy-mark' => false,
        );

        // Add to prevent Tidy from removing script or comment if it is the
        // first thing in the field.
        $trimmed = trim($string);
        if (strtolower(substr($trimmed, 0, 7)) == '<script' || substr($trimmed, 0, 4) == '<!--') {
            $statement = '';
            if (substr($trimmed, 0, 9) !== '<!-- Tidy') {
                $statement = "<!-- Tidy: Start field with something other than script or comment to remove this -->\n";
            }
            // A non-breaking space counts as real body content; it is written
            // as an explicit entity so the guard stays visible in the source.
            $string = '&nbsp;' . $statement . $string;
        }

        $tidy->parseString($string, $options, 'utf8');
        if ($tidy->cleanRepair()) {
            // Cast so that both exits of the function yield a string.
            return (string) $tidy;
        }
    }

    return $string;
}
/**
 * Serialises a list of tokens into an HTML string.
 *
 * Also refreshes the generator's cached settings (_scriptFix, _def, _xhtml)
 * from the given configuration before generating.
 *
 * @param $tokens Array of HTMLPurifier_Token
 * @param $config HTMLPurifier_Config object; a default one is created when
 *                a falsy value is passed
 * @param $context Context reference (not used by this method)
 * @return Generated HTML
 */
function generateFromTokens($tokens, $config, &$context)
{
    $html = '';

    if (!$config) {
        $config = HTMLPurifier_Config::createDefault();
    }
    $this->_scriptFix = $config->get('Output', 'CommentScriptContents');
    $this->_def = $config->getHTMLDefinition();
    $this->_xhtml = $this->_def->doctype->xml;

    if (!$tokens) {
        return '';
    }

    $total = count($tokens);
    $pos = 0;
    while ($pos < $total) {
        // Script special case: start tag, contents and end tag. The contents
        // of the script block must be ONE token for this to work; a script
        // body spread over several tokens is deliberately not handled, as it
        // would not be valid anyway.
        $isScriptBlock = $this->_scriptFix
            && $tokens[$pos]->name === 'script'
            && $pos + 2 < $total
            && $tokens[$pos + 2]->type == 'end';

        if ($isScriptBlock) {
            $html .= $this->generateFromToken($tokens[$pos]);
            $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
            $pos += 2;
        }

        $html .= $this->generateFromToken($tokens[$pos]);
        $pos++;
    }

    if ($config->get('Output', 'TidyFormat') && extension_loaded('tidy')) {
        $tidy_options = array(
            'indent' => true,
            'output-xhtml' => $this->_xhtml,
            'show-body-only' => true,
            'indent-spaces' => 2,
            'wrap' => 68,
        );

        if (version_compare(PHP_VERSION, '5', '>=')) {
            // object interface
            $formatter = new Tidy();
            $formatter->parseString($html, $tidy_options, 'utf8');
            $formatter->cleanRepair();
            $html = (string) $formatter;
        } else {
            // procedural interface used before PHP 5
            tidy_set_encoding('utf8');
            foreach ($tidy_options as $option => $setting) {
                tidy_setopt($option, $setting);
            }
            tidy_parse_string($html);
            tidy_clean_repair();
            $html = tidy_get_output();
        }
    }

    // normalize newlines to the configured value, or the system default
    $newline = $config->get('Output', 'Newline');
    if ($newline === null) {
        $newline = PHP_EOL;
    }

    return str_replace("\n", $newline, $html);
}
/**
 * Serialises a list of tokens into an HTML string.
 *
 * The output is optionally pretty-printed with Tidy (Output.TidyFormat) and,
 * when Core.NormalizeNewlines is enabled, has its newlines converted to
 * Output.Newline (or PHP_EOL when that is null).
 *
 * @param $tokens Array of HTMLPurifier_Token
 * @return Generated HTML
 */
public function generateFromTokens($tokens)
{
    if (!$tokens) {
        return '';
    }

    $html = '';
    $total = count($tokens);
    $pos = 0;
    while ($pos < $total) {
        // Script special case: start tag, single contents token, end tag.
        $isScriptBlock = $this->_scriptFix
            && $tokens[$pos]->name === 'script'
            && $pos + 2 < $total
            && $tokens[$pos + 2] instanceof HTMLPurifier_Token_End;

        if ($isScriptBlock) {
            $html .= $this->generateFromToken($tokens[$pos]);
            $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
            $pos += 2;
        }

        $html .= $this->generateFromToken($tokens[$pos]);
        $pos++;
    }

    if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
        $tidyOptions = array(
            'indent' => true,
            'output-xhtml' => $this->_xhtml,
            'show-body-only' => true,
            'indent-spaces' => 2,
            'wrap' => 68,
        );
        $formatter = new Tidy();
        $formatter->parseString($html, $tidyOptions, 'utf8');
        $formatter->cleanRepair();
        $html = (string) $formatter;
    }

    if (!$this->config->get('Core.NormalizeNewlines')) {
        return $html;
    }

    $newline = $this->config->get('Output.Newline');
    if ($newline === null) {
        $newline = PHP_EOL;
    }

    return $newline === "\n" ? $html : str_replace("\n", $newline, $html);
}
/**
 * Renders the template to a string.
 *
 * Registers the jQuery and Bootstrap include snippets and the elapsed render
 * time as template values, parses the template, and optionally runs the
 * result through Tidy (wrapping at 160 columns).
 *
 * Note that $this->time is overwritten with the elapsed time in seconds.
 *
 * @return string
 */
function __toString()
{
    $this->set('jQuery', '<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>');
    $this->set('bootstrap', '<link href="http://getbootstrap.com/dist/css/bootstrap.min.css" rel="stylesheet"> <script src="http://getbootstrap.com/dist/js/bootstrap.min.js"></script>');

    // microtime() without an argument returns a "usec sec" string, which
    // cannot be subtracted meaningfully; work with floats instead. A start
    // value captured in the string form is converted first.
    $started = $this->time;
    if (is_string($started) && strpos($started, ' ') !== false) {
        list($usec, $sec) = explode(' ', $started, 2);
        $started = (float) $sec + (float) $usec;
    }
    $this->time = round(microtime(true) - (float) $started, 4);
    $this->set("time", $this->time);

    $text = $this->parseVal($this->str);

    if (Config\Main::$tidyEnabled) {
        $tidy = new \Tidy();
        $tidy->parseString($text, ["wrap" => 160]);
        $tidy->cleanRepair();
        $text = $tidy;
    }

    return (string) $text;
}