/**
 * Runs the XSLT processor over an XML document and returns XHTML-compatible
 * markup.
 *
 * @param $xml XML DOM tree, or string filename of an XML document to load
 * @return string Transformed markup; re-indented by Tidy when the Tidy
 *                class is available
 * @todo Rename to transformToXHTML, as transformToHTML is misleading
 */
public function transformToHTML($xml)
{
    // A string argument is treated as a filename; anything else is used as-is.
    $dom = $xml;
    if (is_string($xml)) {
        $dom = new DOMDocument();
        $dom->load($xml);
    }

    // Fudges for HTML backwards compatibility (assumes the document is XHTML).
    // The replacements are applied in order: "<br/>" becomes "<br />", then
    // empty xmlns declarations are removed.
    $out = str_replace(
        array('/>', ' xmlns=""'),
        array(' />', ''),
        $this->xsltProcessor->transformToXML($dom)
    );

    if (!class_exists('Tidy')) {
        return $out;
    }

    // Cleanup output.
    $tidy = new Tidy();
    $tidy->parseString(
        $out,
        array('indent' => true, 'output-xhtml' => true, 'wrap' => 80),
        'utf8'
    );
    $tidy->cleanRepair();

    return (string) $tidy;
}
/**
 * Sanitises the given markup with Tidy (PHP extension first, CLI binary as
 * a fallback) and loads the result into $this->simpleXML.
 *
 * When neither Tidy flavour is available the content is parsed unmodified.
 *
 * @param string $content Markup to parse
 * @throws Exception When the (sanitised) content cannot be parsed as XML
 */
public function __construct($content)
{
    // Fragments stripped from Tidy's output before XML parsing: the default
    // XHTML namespace and non-breaking spaces. Both Tidy code paths request
    // numeric entities, so a non-breaking space arrives as "&#160;"; the raw
    // UTF-8 byte sequence is stripped as well for safety.
    $stripped = array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;', "\xC2\xA0");

    if (extension_loaded('tidy')) {
        // using the tidy php extension
        $tidy = new Tidy();
        $tidy->parseString(
            $content,
            array('output-xhtml' => true, 'numeric-entities' => true, 'wrap' => 99999),
            'utf8'
        );
        $tidy->cleanRepair();
        $markup = str_replace($stripped, '', (string) $tidy);
    } elseif (@shell_exec('which tidy')) {
        // using tidy through cli
        $CLI_content = escapeshellarg($content);
        // The backtick operator yields null when the command prints nothing.
        $output = `echo {$CLI_content} | tidy -n -q -utf8 -asxhtml 2> /dev/null`;
        $markup = str_replace($stripped, '', (string) $output);
    } else {
        // no tidy library found, hence no sanitizing
        $markup = $content;
    }

    $this->simpleXML = @simplexml_load_string($markup, 'SimpleXMLElement', LIBXML_NOWARNING);
    if (!$this->simpleXML) {
        throw new Exception('CSSContentParser::__construct(): Could not parse content.' . ' Please check the PHP extension tidy is installed.');
    }

    parent::__construct();
}
/**
 * Downloads the feed named by the "RSSFilterFeed" attribute, parses it and
 * caches the result in RSSFilter::$feeds, keyed by the feed location.
 *
 * Nothing happens if the cache already holds an entry for this feed. If the
 * raw content is not well-formed XML, one repair attempt is made with Tidy.
 *
 * @throws Exception When the feed cannot be loaded, when it is malformed and
 *                   Tidy is unavailable, or when Tidy cannot repair it
 */
public function loadFeed()
{
    $feed = $this->A("RSSFilterFeed");
    if (isset(RSSFilter::$feeds[$feed])) {
        return;
    }

    // Placeholder entry: it stays in the cache if loading fails below, so a
    // broken feed is not fetched again by later calls.
    RSSFilter::$feeds[$feed] = "<phynx></phynx>";

    $content = @file_get_contents($feed);
    if ($content === false) {
        throw new Exception($feed . " could not be loaded!");
    }

    // NOTE(review): as written, the search string is empty, which makes this
    // call a no-op; it presumably stripped a control character that got lost
    // along the way - confirm against the upstream source.
    $content = str_replace("", "", $content);

    try {
        // Collect libxml errors internally instead of emitting warnings.
        libxml_use_internal_errors(true);
        RSSFilter::$feeds[$feed] = new SimpleXMLElement($content);
        return;
    } catch (Exception $e) {
        // Not well-formed: fall through to the Tidy repair attempt.
    }

    // Probe for Tidy explicitly. Without an autoloader that throws, a plain
    // "new Tidy()" on a missing class is a fatal error that no catch block
    // below would ever see.
    try {
        $tidyAvailable = class_exists('Tidy');
    } catch (ClassNotFoundException $e) {
        // Thrown by autoloaders that signal unknown classes with an exception.
        $tidyAvailable = false;
    }
    if (!$tidyAvailable) {
        throw new Exception($feed . " contains errors, but Tidy not found!");
    }

    try {
        $config = array('indent' => true, 'clean' => true, 'input-xml' => true, 'output-xml' => true, 'wrap' => false);
        $tidy = new Tidy();
        $xml = $tidy->repairString($content, $config);
        RSSFilter::$feeds[$feed] = new SimpleXMLElement($xml);
    } catch (Exception $e) {
        throw new Exception($feed . " contained errors even Tidy could not fix!");
    }
}
/**
 * Serialises a list of tokens into an HTML string.
 *
 * @param $tokens Array of HTMLPurifier_Token
 * @return string Generated HTML; an empty string when no tokens are given
 */
public function generateFromTokens($tokens)
{
    if (!$tokens) {
        return '';
    }

    $html = '';
    $size = count($tokens);
    $i = 0;
    while ($i < $size) {
        // Script special case: an opening script token followed by exactly
        // one content token and then an end token. The contents of the
        // script block must be ONE token for this to work.
        $isScriptBlock = $this->_scriptFix
            && $tokens[$i]->name === 'script'
            && $i + 2 < $size
            && $tokens[$i + 2] instanceof HTMLPurifier_Token_End;

        if ($isScriptBlock) {
            $html .= $this->generateFromToken($tokens[$i]);
            $html .= $this->generateScriptFromToken($tokens[$i + 1]);
            $i += 2;
        }

        // Regular token, or the end token that closes the script block.
        $html .= $this->generateFromToken($tokens[$i]);
        $i++;
    }

    // Optional Tidy cleanup
    if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
        $tidyOptions = array(
            'indent' => true,
            'output-xhtml' => $this->_xhtml,
            'show-body-only' => true,
            'indent-spaces' => 2,
            'wrap' => 68,
        );
        $tidy = new Tidy();
        $tidy->parseString($html, $tidyOptions, 'utf8');
        $tidy->cleanRepair();
        $html = (string) $tidy; // explicit cast necessary
    }

    // Normalize newlines to the configured value, or the system default
    $nl = $this->config->get('Output.Newline');
    if ($nl === null) {
        $nl = PHP_EOL;
    }

    return $nl === "\n" ? $html : str_replace("\n", $nl, $html);
}
/**
 * Sanitises the given markup with Tidy (PHP extension first, CLI binary as
 * a fallback) and wraps the result in a SimpleXMLElement stored in
 * $this->simpleXML.
 *
 * When neither Tidy flavour is available the content is parsed unmodified.
 *
 * @param string $content Markup to parse
 */
function __construct($content)
{
    // Fragments stripped from Tidy's output before XML parsing: the default
    // XHTML namespace and non-breaking spaces. Both Tidy code paths request
    // numeric entities, so a non-breaking space arrives as "&#160;"; the raw
    // UTF-8 byte sequence is stripped as well for safety.
    $stripped = array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;', "\xC2\xA0");

    if (extension_loaded('tidy')) {
        // using the tidy php extension
        $tidy = new Tidy();
        $tidy->parseString(
            $content,
            array(
                'output-xhtml' => true,
                'numeric-entities' => true,
            ),
            'utf8'
        );
        $tidy->cleanRepair();
        $markup = str_replace($stripped, '', (string) $tidy);
    } elseif (`which tidy`) {
        // using tidy through cli
        $CLI_content = escapeshellarg($content);
        // The backtick operator yields null when the command prints nothing.
        $output = `echo $CLI_content | tidy -n -q -utf8 -asxhtml 2> /dev/null`;
        $markup = str_replace($stripped, '', (string) $output);
    } else {
        // no tidy library found, hence no sanitizing
        $markup = $content;
    }

    $this->simpleXML = new SimpleXMLElement($markup);
}
/**
 * Gathers the advertisement links from the remote page.
 *
 * The page at $post['url'] is fetched, optionally cleaned up with Tidy, and
 * every anchor whose href is an http:// URL on the $post['ad'] host (with an
 * optional sub-domain) is collected.
 *
 * @param array $post The post data submitted by the form; reads the keys
 *                    'url' (page to fetch) and 'ad' (advertising host)
 * @return array The matching href values, in document order
 */
public function fetchAds($post)
{
    $this->_client->setUri($post['url']);
    $response = $this->_client->request('GET')->getBody();

    // If the tidy class exists, attempt to clean up the markup returned by
    // the remote site. The options are handed to parseString() directly
    // rather than through a constructor call on a dummy file.
    if (class_exists('tidy')) {
        $tidyOptions = array(
            'indent' => true,
            'tab-size' => 4,
            'output-encoding' => 'utf8',
            'newline' => 'LF',
            'output-xhtml' => true,
        );
        $tidy = new Tidy();
        $tidy->parseString($response, $tidyOptions, 'utf8');
        $tidy->cleanRepair();
        $response = $tidy->value;
    }

    // DOMDocument::loadHTML() rejects empty input, and an empty page cannot
    // contain links anyway.
    if ((string) $response === '') {
        return array();
    }

    $xml = new DOMDocument('1.0', 'utf-8');
    $xml->loadHTML($response);

    // The ad host is form input: quote it so characters such as "." are
    // matched literally and cannot alter the expression.
    $pattern = '/^http:\/\/([a-z\-]+\.)?' . preg_quote($post['ad'], '/') . '.*$/i';

    $result = array();
    foreach ($xml->getElementsByTagName('a') as $a) {
        $href = $a->getAttribute('href');
        if (preg_match($pattern, $href)) {
            // The link points at the advertising site.
            $result[] = $href;
        }
    }

    return $result;
}
/**
 * Parses the response content with Tidy and fails if Tidy reports anything.
 *
 * @see ExtensionInterface
 * @throws \Exception Carrying Tidy's error buffer when it is non-empty
 */
public function apply(Response $response)
{
    $content = $response->getContent();

    $validator = new \Tidy();
    $validator->parseString($content);

    $diagnostics = $validator->errorBuffer;
    if ($diagnostics) {
        throw new \Exception($diagnostics);
    }
}
/**
 * Runs the given markup through Tidy in XML mode (no line wrapping) and
 * returns the repaired document.
 *
 * @param string $content
 * @return string
 */
protected function _tidyFix($content)
{
    $tidy = new Tidy();
    $tidy->parseString(
        $content,
        ['input-xml' => true, 'output-xml' => true, 'wrap' => false],
        'utf8'
    );
    $tidy->cleanRepair();

    return (string) $tidy;
}
/**
 * Repairs and re-indents HTML using Tidy.
 *
 * @param string $html   Markup to beautify
 * @param array  $config Tidy configuration options
 * @return string The repaired markup as returned by Tidy
 * @throws Ajde_Exception When the Tidy class is not available
 */
public static function beautifyHtml($html, $config = array("output-xhtml" => true, "char-encoding" => "utf8", "indent" => true, "indent-spaces" => 4, "wrap" => 0))
{
    if (Ajde_Core_Autoloader::exists('Tidy')) {
        // The encoding is passed explicitly as third argument, see
        // http://bugs.php.net/bug.php?id=35647
        $formatter = new Tidy();

        return $formatter->repairString($html, $config, 'utf8');
    }

    throw new Ajde_Exception('Class Tidy not found', 90023);
}
/**
 * Re-indents HTML responses with Tidy; any other format passes through
 * untouched.
 *
 * @param array  $request  Request data; the '_format' entry selects the format
 * @param string $response Response body
 * @return string The (possibly tidied) response body
 */
public static function beforeResponse($request, $response)
{
    if ($request['_format'] != 'html') {
        return $response;
    }

    $tidy = new \Tidy();
    $tidy->parseString($response, array('wrap' => 200, 'indent' => true), 'utf8');
    $tidy->cleanRepair();

    // Serialised markup of the document's <html> node.
    return $tidy->html()->value;
}
/**
 * Repairs and re-indents HTML using Tidy.
 *
 * @param string $html   Markup to beautify
 * @param array  $config Tidy configuration options
 * @return string The repaired markup as returned by Tidy
 * @throws Ajde_Exception When the Tidy class is not available
 */
public static function beautifyHtml($html, $config = ['output-xhtml' => true, 'char-encoding' => 'utf8', 'indent' => true, 'indent-spaces' => 4, 'wrap' => 0])
{
    $tidyAvailable = class_exists('Tidy');
    if (!$tidyAvailable) {
        throw new Ajde_Exception('Class Tidy not found', 90023);
    }

    // tidy does not produce valid utf8 when the encoding is specified in the
    // config, so the encoding is supplied as third parameter instead
    // @see http://bugs.php.net/bug.php?id=35647
    $encoding = 'utf8';
    $tidy = new Tidy();
    $beautified = $tidy->repairString($html, $config, $encoding);

    return $beautified;
}
/**
 * Repairs and re-indents HTML using Tidy.
 *
 * @param string $html   Markup to beautify
 * @param array  $config Tidy configuration options
 * @return string The repaired markup as returned by Tidy
 * @throws Ajde_Exception When the Tidy class is not available
 */
public static function beautifyHtml($html, $config = array("output-xhtml" => true, "char-encoding" => "utf8", "indent" => true, "indent-spaces" => 4, "wrap" => 0))
{
    $hasTidy = Ajde_Core_Autoloader::exists('Tidy');
    if ($hasTidy === false || !$hasTidy) {
        throw new Ajde_Exception('Class Tidy not found', 90023);
    }

    // Specifying the encoding through $config alone does not yield valid
    // utf8, hence the explicit third argument.
    // @see http://bugs.php.net/bug.php?id=35647
    $repairer = new Tidy();
    $result = $repairer->repairString($html, $config, 'utf8');

    return $result;
}
/**
 * Converts the markup to UTF-8, repairs it with Tidy (dropping comments) and
 * post-processes the content-type meta element and the doctype.
 *
 * @param string      $html         Markup to format
 * @param string|null $charset      Passed through to toUTF8()
 * @param string|null $charset_hint Passed through to toUTF8()
 * @return string The formatted markup
 */
public function formatHtml($html, $charset = null, $charset_hint = null)
{
    $utf8Html = $this->toUTF8($html, $charset, $charset_hint);

    $tidy = new Tidy();
    $tidy->parseString($utf8Html, array("hide-comments" => true), 'UTF8');
    $tidy->cleanRepair();
    $repaired = (string) $tidy;

    return $this->formatDocType(
        $this->moveMetaContentTypeToTop($repaired)
    );
}
/**
 * Tidies the given HTML and loads the result into a DOM document.
 *
 * Populates $this->html_code (raw input), $this->tidy_code (Tidy output) and
 * $this->dom (normalised DOMDocument, or null when parsing failed).
 *
 * @param string $html_code Raw HTML to parse
 * @return void
 */
function parse_html($html_code)
{
    $this->html_code = $html_code;

    // Tidy HTML code
    $tidy = new Tidy();
    $tidy->parseString($html_code, $this->tidy_config, 'utf8');
    $tidy->cleanRepair();
    $this->tidy_code = $tidy->value;

    // loadXML() is an instance method, so it needs a DOMDocument instance.
    // Empty input is rejected up front because loadXML() refuses it.
    $dom = new DOMDocument();
    $tidied = (string) $this->tidy_code;
    if ($tidied === '' || !$dom->loadXML($tidied)) {
        // Report the failure before anything touches the unusable document.
        $this->dom = null;
        trigger_error("Unable to parse XML Document!", E_USER_ERROR);
        return;
    }

    $dom->normalizeDocument();
    $this->dom = $dom;
}
/**
 * Rewrites every table wrapped in a <div class="rvps14"> or
 * <div class="rvps8"> container: the wrapper is dropped, header cells are
 * converted to <b class="table-header"> and the table is re-indented by Tidy.
 *
 * @param string $text Markup to process
 * @return string Markup with the matching tables replaced
 */
public function formatTables($text)
{
    $tidyOptions = array(
        'clean' => true,
        'output-html' => true,
        'show-body-only' => true,
        'wrap' => 0,
        'indent' => true,
    );

    $rebuildTable = function ($matches) use ($tidyOptions) {
        // Both replacements run in order: mark header cells first, then drop
        // headers that contain nothing but a line break.
        // rvps14 - rvps14
        // rvps14 - rvps11
        // rvps4 - rvps15
        $table = preg_replace(
            array(
                '%(?:<p class="rvps(?:1|4|14)">)?<span class="rvts(?:9|15|23)">\\s*(.*?)\\s*</span>(?:</p>)?%u',
                '%<b class="table-header"><br></b>%u',
            ),
            array(
                '<b class="table-header">$1</b>',
                '',
            ),
            '<table>' . $matches[1] . '</table>'
        );

        $tidy = new \Tidy();
        $tidy->parseString($table, $tidyOptions, 'utf8');
        $tidy->cleanRepair();

        return $tidy . "\n";
    };

    return preg_replace_callback(
        '%<div class="rvps(?:14|8)">\\n*<table.*?>([\\s\\S]*?)</table>\\n*</div>%u',
        $rebuildTable,
        $text
    );
}
/**
 * Reads from the wrapped input and returns the Tidy-filtered result.
 *
 * @param null $len Passed through to the underlying reader
 *
 * @throws BuildException When the Tidy class is not available
 * @return string|int The tidied output, or -1 if the end of the underlying
 *                    stream has been reached
 */
public function read($len = null)
{
    if (!class_exists('Tidy')) {
        throw new BuildException("You must enable the 'tidy' extension in your PHP configuration in order to use the Tidy filter.");
    }

    // One-time initialisation on first read.
    if (!$this->getInitialized()) {
        $this->_initialize();
        $this->setInitialized(true);
    }

    $chunk = $this->in->read($len);
    if ($chunk === -1) {
        return -1;
    }

    $options = $this->getDistilledConfig();
    $filter = new Tidy();
    $filter->parseString($chunk, $options, $this->encoding);
    $filter->cleanRepair();

    return tidy_get_output($filter);
}
/**
 * Generates the code-style document with the default template and checks
 * that Tidy reports status 0 for the resulting file.
 *
 * @param \CliTester $I
 * @param \Codeception\Scenario $scenario
 */
public function testDocumentValidHtml(\CliTester $I, \Codeception\Scenario $scenario)
{
    $I->wantTo('verify that the default template produces valid HTML');

    if (!class_exists('Tidy')) {
        $scenario->skip('Tidy is not available. See http://php.net/manual/en/tidy.installation.php');
    }

    $projectRoot = dirname(dirname(__DIR__));
    $template = $projectRoot . '/src/Task/CodeSniffer/codestyle.html';
    $outfile = dirname(__DIR__) . '/_output/codestyle.html';

    // Start from a clean slate so a stale file cannot satisfy the checks.
    if (file_exists($outfile)) {
        unlink($outfile);
    }
    $I->dontSeeFileFound($outfile);

    $command = 'vendor/bin/robo document:codestyle --outfile ' . $outfile . ' --template ' . $template;
    $I->runShellCommand($command);
    $I->seeFileFound($outfile);

    $validator = new \Tidy();
    $validator->parseFile($outfile);
    $I->assertEquals(0, $validator->getStatus());

    unlink($outfile);
}
/**
 * Runs the XSLT processor over an XML document and returns HTML-compatible
 * markup.
 *
 * @param $xml XML DOM tree
 * @return string Transformed markup; re-indented by Tidy when the Tidy
 *                class is available
 */
public function transformToHTML($xml)
{
    // Fudges for HTML backwards compatibility, applied in order:
    // "<br/>" becomes "<br />", then the unnecessary xmlns declarations
    // (empty and XHTML) are removed.
    $out = str_replace(
        array('/>', ' xmlns=""', ' xmlns="http://www.w3.org/1999/xhtml"'),
        array(' />', '', ''),
        $this->xsltProcessor->transformToXML($xml)
    );

    if (!class_exists('Tidy')) {
        return $out;
    }

    // Cleanup output.
    $tidy = new Tidy();
    $tidy->parseString(
        $out,
        array('indent' => true, 'output-xhtml' => true, 'wrap' => 80),
        'utf8'
    );
    $tidy->cleanRepair();

    return (string) $tidy;
}
/**
 * Fetches a page, converts it to XHTML with Tidy and loads it into
 * $this->document, with an XPath helper in $this->xpath.
 *
 * Also records the curl transfer info in $this->request_info and the
 * resolved address in $this->location.
 *
 * @param string $uri Absolute http(s) URI, or a path appended to the
 *                    configured base href
 * @return void
 */
protected function loadHtml($uri)
{
    // Anything that does not start with an http(s) scheme is resolved
    // against the configured base href.
    $isAbsolute = preg_match('/^https?:/i', $uri) !== 0;
    if (!$isAbsolute) {
        $uri = $this->config->getBaseHref() . $uri;
    }

    $handle = curl_init($uri);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    $body = curl_exec($handle);
    $this->request_info = curl_getinfo($handle);
    curl_close($handle);
    $this->location = $uri;

    $tidyOptions = array(
        'output-xhtml' => true,
        'char-encoding' => 'utf8',
        'numeric-entities' => true,
    );
    $tidy = new Tidy();
    $tidy->parseString($body, $tidyOptions, 'utf8');
    $tidy->cleanRepair();

    $document = new DOMDocument();
    $document->resolveExternals = true;
    $document->loadXml($tidy);
    $this->document = $document;

    $xpath = new DOMXPath($document);
    $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
    $xpath->registerNamespace('html', 'http://www.w3.org/1999/xhtml');
    $this->xpath = $xpath;
}
/**
 * Compiles the template and echoes the result, optionally routed through
 * Tidy, followed by timing/version comments for non-AJAX requests.
 *
 * @return void
 */
public function generateResponse()
{
    TemplateEngine::compile();

    $useTidy = Gravel::$config['gravel']['tidy_html'] && class_exists('Tidy');
    if (!$useTidy) {
        $output = TemplateEngine::$data['compiled'];
    } else {
        // NOTE(review): 'output-xhtml' is given the string 'false' rather
        // than a boolean - confirm Tidy interprets it as intended.
        $tidyOptions = [
            'indent' => 1,
            'indent-spaces' => 4,
            'output-xhtml' => 'false',
            'wrap' => 0,
            'hide-comments' => 0,
        ];
        // The Tidy object itself is echoed below.
        $output = new \Tidy();
        $output->parseString(TemplateEngine::$data['compiled'], $tidyOptions);
    }

    if (Gravel::$config['gravel']['debug_mode']) {
        header("Content-Type: text/plain");
    }

    echo $output;

    // if we don't have an ajax request we can output some debug info
    $isAjax = isset($_SERVER['HTTP_X_REQUESTED_WITH'])
        && $_SERVER['HTTP_X_REQUESTED_WITH'] === 'XMLHttpRequest';
    if ($isAjax) {
        return;
    }

    $version = Gravel::$version;
    $elapsed = number_format(microtime(true) - Gravel::$startTime, 5);
    echo PHP_EOL . "<!-- Generated in " . $elapsed . " seconds -->";
    echo PHP_EOL . "<!-- Gravel PHP framework {$version} -->";
}
/**
 * Wraps a markup fragment in a minimal XHTML document and passes it to the
 * internal or external Tidy runner, depending on $wgTidyInternal.
 *
 * @param string $text Markup fragment to correct
 * @return string The runner's result, or the original text followed by an
 *                HTML comment when the runner returned null
 */
public static function RunOn($text)
{
    global $wgTidyInternal;

    $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'
        . ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'
        . '<head><title>test</title></head><body>'
        . $text
        . '</body></html>';

    $correctedtext = $wgTidyInternal
        ? Tidy::internal($wrappedtext)
        : Tidy::external($wrappedtext);

    if (!is_null($correctedtext)) {
        return $correctedtext;
    }

    wfDebug("Tidy error detected!\n");

    return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
}
/**
 * Checks HTML for unclosed tags and repairs them.
 *
 * @param mixed  $data   HTML to process. May be a (multi-dimensional) array,
 *                       which is processed recursively
 * @param string $encode Encoding passed to Tidy; defaults to utf8
 * @return mixed Data in the same structure as $data; an empty string for any
 *               value that empty() treats as empty (including "0" and an
 *               empty array)
 */
public static function htmlFixSafe($data, $encode = 'utf8')
{
    if (empty($data)) {
        return '';
    }

    if (is_array($data)) {
        foreach ($data as $key => $value) {
            // Hand the encoding down so nested values are repaired with the
            // same encoding as the top-level call.
            $data[$key] = self::htmlFixSafe($value, $encode);
        }

        return $data;
    }

    $tidyConfig = array('indent' => false, 'output-xhtml' => true, 'show-body-only' => true);
    $tidyObj = new Tidy();

    return $tidyObj->repairString($data, $tidyConfig, $encode);
}
/**
 * Main entry point for extracting the useful part of an email body.
 *
 * HTML content is purified and passed to removeReplies(); any other content
 * is cut at the first delimiter line and rendered through the markdown
 * transformer.
 *
 * @param string|null $content Email body; when empty, getContent() is used
 * @return string
 */
public function recognizeUsefulContent($content = null)
{
    if (!$content) {
        $content = $this->getContent();
    }

    if (!$this->isHtml($content)) {
        // Keep only the text before the delimiter line, if there is one.
        $cutAt = strpos($content, self::DELIMITER_LINE);
        if (false !== $cutAt) {
            $content = substr($content, 0, $cutAt);
        }

        return $this->markdown->defaultTransform($content);
    }

    $purified = $this->HTMLPurifier->purify($content);

    return $this->removeReplies($purified);
}
// Fetches the page named by the POSTed "url" field, tidies its markup,
// counts its tags and emits the outcome as JSON.
$response['tagCount'] = array();
$response['html'] = "";
$response['code'] = "N/A";
$response['message'] = "";

// A missing or non-string field is treated as an empty (invalid) URL instead
// of raising an undefined-index notice.
$url = (isset($_POST['url']) && is_string($_POST['url'])) ? $_POST['url'] : '';

// Use php's filter to check for a valid url. FILTER_VALIDATE_URL also accepts
// schemes such as file: or ftp:, so only web URLs are allowed through to
// curl; otherwise a request could read local files from the server.
$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
$isFetchable = filter_var($url, FILTER_VALIDATE_URL) !== false
    && in_array($scheme, array('http', 'https'), true);

if ($isFetchable) {
    $curl = new MyCurl($url);
    $curl->createCurl();
    $response['code'] = $curl->getHttpStatus();
    $response['message'] = HttpCodes::getType($response['code']);
    $html = $curl->__toString();

    if (!is_string($html)) {
        $response['message'] = "Page Could not be loaded, check the domain. Nothing was returned.";
    } else {
        // Load page into tidy object, set options, and clean html
        $tidy = new Tidy();
        $tidy->parseString($html, array('indent' => 2, 'output-xhtml' => true));
        $tidy->cleanRepair();

        // html is now nicely indented
        $html = (string) $tidy;

        // Count the tags and get the result in a $tag => $count array
        $tagCount = countTags($html);
        $response['tagCount'] = $tagCount;
        $response['html'] = htmlentities($html);
    }
} else {
    $response['message'] = $url . " is not a valid URL";
}

header('Content-Type: application/json');
echo json_encode($response);
/**
 * Converts HTML tag soup into XHTML that an XML parser can read.
 *
 * @param string $htmlTagSoup Arbitrary HTML
 * @return string The tidied XHTML, with "&nbsp;" entities rewritten to their
 *                numeric form
 */
function tidyToXml($htmlTagSoup)
{
    // Create the Tidy object
    $tidy = new Tidy();

    // Parse the HTML into memory, turning on the option to convert to
    // XHTML as part of the tidying process
    $tidy->parseString($htmlTagSoup, array('output-xhtml' => true));

    // Do the tidying
    $tidy->cleanRepair();

    // And get the tidied version as a string
    $tidied_xml = tidy_get_output($tidy);

    // Opinions seem to differ as to whether the non-breaking space entity
    // '&nbsp;' is predeclared as part of XHTML. Tidy thinks it is, and so
    // leaves it alone, while the XML parser this string is meant for thinks
    // otherwise. So replace any occurrences of it with its numeric
    // equivalent (which doesn't need to be declared).
    return str_replace('&nbsp;', '&#160;', $tidied_xml);
}
/**
 * Cleans an HTML fragment with Tidy: repairs the markup and drops empty
 * paragraphs, font tags, proprietary attributes and comments.
 *
 * @param string $content HTML fragment to clean
 * @return string The cleaned body markup; an empty string for empty input
 */
private function clean($content)
{
    if (!$content) {
        return '';
    }

    $tidy = new \Tidy();
    $tidy->parseString($content, [
        'indent' => true,
        'doctype' => 'omit',
        'output-html' => true,
        'show-body-only' => true,
        'drop-empty-paras' => true,
        'drop-font-tags' => true,
        'drop-proprietary-attributes' => true,
        'hide-comments' => true,
        'logical-emphasis' => true,
    ]);
    $tidy->cleanRepair();

    // Return Tidy's output; returning $content here would hand back the
    // untouched input and throw away the cleanup above.
    return (string) $tidy;
}
/**
 * Closes tags left open in an HTML fragment.
 *
 * When the Tidy class is available the fragment is repaired by Tidy in XML
 * mode. Otherwise opening and closing tags are collected with regular
 * expressions and a closing tag is appended for every opening tag (void
 * elements excluded) that has no closing counterpart.
 *
 * @param string $html HTML fragment
 * @return string The fragment with missing closing tags added
 */
function closetags($html)
{
    if (class_exists('Tidy')) {
        $tidy = new Tidy();

        return $tidy->repairString($html, array('output-xml' => true, 'input-xml' => true));
    }

    preg_match_all('#<(?!meta|img|br|hr|input\\b)\\b([a-z]+)(?: .*)?(?<![/|/ ])>#iU', $html, $openMatches);
    preg_match_all('#</([a-z]+)>#iU', $html, $closeMatches);
    $opened = $openMatches[1];
    $closed = $closeMatches[1];

    // Same number of opening and closing tags: nothing to add.
    if (count($closed) == count($opened)) {
        return $html;
    }

    // Walk the opening tags innermost-first; each one either consumes a
    // matching closing tag or gets one appended.
    foreach (array_reverse($opened) as $tag) {
        $closedAt = array_search($tag, $closed);
        if ($closedAt === false) {
            $html .= '</' . $tag . '>';
        } else {
            unset($closed[$closedAt]);
        }
    }

    return $html;
}
/**
 * Applies a chain of text functions to a string.
 *
 * $matches[2] holds the chain: functions separated by "||", each one a
 * comma-separated list of a name followed by its arguments. The functions
 * are applied from last to first. Supported names: cleanhtml, removehtml,
 * splitbychars, splitbywords and findimage; unknown names are ignored.
 *
 * @param array  $matches Match data; only index 2 is read
 * @param string $s       Text to transform
 * @return string The transformed text after closing open tags
 */
function onFunction($matches, $s)
{
    $fns = explode('||', $matches[2]);

    for ($i = count($fns) - 1; $i >= 0; $i--) {
        $fn = explode(',', $fns[$i]);

        switch ($fn[0]) {
            case 'cleanhtml':
                // Keep only a small set of inline/paragraph tags.
                $s = strip_tags($s, '<p><a><b><br><br/><i>');
                break;

            case 'removehtml':
                $s = strip_tags($s);
                break;

            case 'splitbychars':
                $s = substr($s, $fn[1], $fn[2]);
                break;

            case 'splitbywords':
                // Cut at the first space at or after offset $fn[2]; keep the
                // whole string when there is no such space.
                $len = strlen($s);
                if ($fn[2] > $len) {
                    $pos = $len;
                } else {
                    $pos = strpos($s, ' ', $fn[2]);
                    if ($pos === false) {
                        $pos = $len;
                    }
                }
                $s = substr($s, 0, $pos);
                break;

            case 'findimage':
                // $fn[1] is the 1-based number of the image to pick.
                $index = isset($fn[1]) ? intval($fn[1]) - 1 : 0;
                preg_match_all('/(<img.*?src=[\'"](.*?)[\'"][^>]+>)|(background(-image)??\\s*?:.*?url\\((["|\']?)?(.+?)(["|\']?)?\\))/i', $s, $r);
                if (isset($r[2]) && !empty($r[2][$index])) {
                    // src attribute of an <img> tag
                    $s = $r[2][$index];
                } elseif (isset($r[6]) && !empty($r[6][$index])) {
                    // url(...) of a CSS background
                    $s = trim($r[6][$index], "'\" \t\n\r\v");
                } else {
                    $s = '';
                }
                break;
        }
    }

    // NOTE(review): the loop above always finishes with $i === -1 (the
    // "break" statements only leave the switch), so this Tidy branch is never
    // taken as written - confirm the intended condition.
    if ($i !== -1 && $this->_tidy) {
        $tidy = new Tidy();

        return $tidy->repairString($s, array('show-body-only' => true, 'input-encoding' => $this->_tidyInputEncoding, 'output-encoding' => $this->_tidyOutputEncoding));
    }

    return $this->closetags($s);
}
<?php
// Downloads a page, repairs it with Tidy in XML mode and collects the title
// and link of every <h2> heading. $start and $finish bracket the whole run.
$start = microtime(true);

$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL => 'http://thinkphp.com.ua/',
    // we want to pretend the Googlebot
    CURLOPT_USERAGENT => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
    CURLOPT_RETURNTRANSFER => true,
]);
$content = curl_exec($ch);

// fix the document, as it's xml
$tidyOptions = ['input-xml' => true, 'output-xml' => true, 'wrap' => false];
$tidy = new Tidy();
$tidy->parseString($content, $tidyOptions, 'utf8');
$tidy->cleanRepair();
$content = (string) $tidy;

// load the string as simplexml object
$xml = simplexml_load_string($content);

// registering the namespace, so we can search
$xml->registerXPathNamespace('xmlns', 'http://www.w3.org/1999/xhtml');

$eventsHeaders = [];
foreach ($xml->xpath('//xmlns:h2') as $node) {
    // drop the span child, if present
    unset($node->span);

    // if the href is there, let's use it
    $link = isset($node->a['href']) ? (string) $node->a['href'] : null;

    $eventsHeaders[] = [
        'title' => trim(strip_tags($node->asXml())),
        'link' => $link,
    ];
}

$finish = microtime(true);
/**
	Clean and repair HTML; the input is returned untouched when the tidy
	extension is not loaded
		@return string
		@param $html string
		@public
**/
static function tidy($html)
{
    if (extension_loaded('tidy')) {
        // Tidy expects the encoding name without dashes.
        $encoding = str_replace('-', '', self::$vars['ENCODING']);

        $doc = new Tidy();
        $doc->parseString($html, self::$vars['TIDY'], $encoding);
        $doc->cleanRepair();

        return (string) $doc;
    }

    return $html;
}