/**
 * Flushes the accumulated page markup when the object is destroyed.
 *
 * When the html/head flags were never set, a PageException is raised and its
 * message is echoed instead of the page. Otherwise the closing body/html tags
 * are appended and the buffer is either echoed as-is or, in debug mode,
 * replaced by Tidy's diagnostics followed by an escaped, line-numbered dump of
 * the markup. The static buffer is emptied afterwards in both modes.
 */
public function __destruct()
{
    try {
        if (!self::$html_set || !self::$head_set) {
            throw new PageException("<b>HTML class exception.</b><br />Either <html> or <head> or <body> is not set.</b><br />\n\t\t\t\t\tAll these tags need to be used in order to generate valid html forms.");
        }

        self::$output .= "</body>\n</html>";

        if (self::$debug) {
            echo '<b>Tidy messages</b><br />';
            $tidy = tidy_parse_string(self::$output);
            echo nl2br(htmlentities(tidy_get_error_buffer($tidy)));
            echo '<hr />';

            // Every <br /> becomes a line break, the markup is escaped, and
            // nl2br() then puts a "<br />" in front of each newline again.
            $linedump = explode("\n", nl2br(htmlentities(str_replace("<br />", "\n", self::$output))));

            foreach ($linedump as $i => $line) {
                // nl2br() emits "<br />", so that form (not only "<br>") has
                // to be stripped before a line can be recognised as blank.
                if (trim(str_replace(array('<br />', '<br>'), '', $line)) == '') {
                    continue;
                }

                // Pad the displayed (1-based) number; str_pad never receives a
                // negative length, unlike a hand-computed str_repeat count.
                echo str_pad((string) ($i + 1), 4, ' ', STR_PAD_LEFT) . ' : ' . $line;
            }

            echo '<hr />';
            self::$output = '';
        } else {
            echo self::$output;
            self::$output = '';
        }
    } catch (PageException $e) {
        echo $e->getMessage();
    }
}
/**
 * Repairs a piece of markup with Tidy and returns the body content as XHTML.
 *
 * @param string $html Markup to clean up.
 * @return string Tidy's serialised output after clean-and-repair.
 */
function tidy_html($html)
{
    $document = tidy_parse_string(
        $html,
        array('output-xhtml' => true, 'show-body-only' => true),
        'UTF8'
    );
    tidy_clean_repair($document);

    return (string) $document;
}
/**
 * Passes the rendered view content through Tidy and writes the cleaned
 * markup back onto the view.
 *
 * @param mixed $event Not read by this handler.
 * @param mixed $view  Object providing getContent() and setContent().
 */
public function afterRender($event, $view)
{
    $options = array(
        'clean' => true,
        'output-xhtml' => true,
        'show-body-only' => true,
        'wrap' => 0,
    );

    $document = tidy_parse_string($view->getContent(), $options, 'UTF8');
    tidy_clean_repair($document);

    $view->setContent(tidy_get_output($document));
}
/**
 * Cleans the given markup with Tidy and returns it as XHTML.
 *
 * @param string $content Markup to repair.
 * @return string Tidy's output after clean-and-repair.
 */
protected function tidy($content)
{
    $document = tidy_parse_string($content, array('output-xhtml' => true), 'utf8');
    tidy_clean_repair($document);

    return tidy_get_output($document);
}
/**
 * Collects the request description stored in a DOM element into an array.
 *
 * Reads the get/post/xpath/data child elements and the src/referer/method
 * attributes. When the tidy extension is loaded the data is parsed and
 * diagnosed, and the error count and error buffer are reported if $use_tidy
 * is set; without the extension a translated notice is stored instead.
 *
 * @param mixed $url      DOM element exposing getElementsByTagName() and getAttribute().
 * @param bool  $use_tidy Whether Tidy's error count/messages are added to the result.
 * @return array
 */
function get_url($url, $use_tidy = TRUE)
{
    global $cookies;
    $smarty = TikiLib::lib('smarty');

    $getParams = get_from_dom($url->getElementsByTagName('get')->item(0));
    $postParams = get_from_dom($url->getElementsByTagName('post')->item(0));
    $xpathText = $url->getElementsByTagName('xpath')->item(0)->textContent;
    $rawData = $url->getElementsByTagName('data')->item(0)->textContent;
    $source = $url->getAttribute("src");
    $refererUrl = $url->getAttribute("referer");

    $result = array('data' => $rawData);

    if (!extension_loaded("tidy")) {
        $result['ref_error_msg'] = tra("Tidy Extension not present");
    } else {
        $document = tidy_parse_string($rawData, array(), 'utf8');
        tidy_diagnose($document);
        if ($use_tidy) {
            $result['ref_error_count'] = tidy_error_count($document);
            $result['ref_error_msg'] = tidy_get_error_buffer($document);
        }
    }

    // Appended in this order so the resulting key order is unchanged.
    $result['url'] = $source;
    $result['xpath'] = $xpathText;
    $result['method'] = $url->getAttribute("method");
    $result['post'] = $postParams;
    $result['get'] = $getParams;
    $result['referer'] = $refererUrl;

    return $result;
}
/**
 * Filters the text of a content item (or a plain value).
 *
 * For a Zoo_Content_Interface item the named field is read and the item's
 * filters are looked up; any other value is used as the text directly.
 * A positive $length truncates the text with substr() (byte-wise) before
 * filtering. When filters exist they are applied in turn and, if the tidy
 * extension is loaded, the result is repaired by Tidy; when there are no
 * filters the text is returned HTML-escaped.
 *
 * @return string
 */
function filter($item, $field = "content", $length = 0)
{
    $itemFilters = array();
    if (is_a($item, 'Zoo_Content_Interface')) {
        $text = $item->{$field};
        $itemFilters = Zoo::getService('content')->getFilters($item);
    } else {
        $text = $item;
    }

    if ($length > 0) {
        $text = substr($text, 0, $length);
    }

    // No filters: escape the text and stop here.
    if (!count($itemFilters)) {
        return htmlspecialchars($text);
    }

    $filterIds = array();
    foreach ($itemFilters as $itemFilter) {
        $filterIds[] = $itemFilter->filter_id;
    }

    foreach (Zoo::getService('filter')->getFilters($filterIds) as $activeFilter) {
        $text = $activeFilter->filter($text);
    }

    if (extension_loaded('tidy')) {
        $tidyOptions = array('indent' => TRUE, 'show-body-only' => TRUE, 'output-xhtml' => TRUE, 'wrap' => 0);
        $document = tidy_parse_string($text, $tidyOptions, 'UTF8');
        tidy_clean_repair($document);
        $text = (string) $document;
    }

    return $text;
}
/**
 * Cleans an HTML fragment and rebuilds it from the filtered body nodes.
 *
 * The input is trimmed, converted to UTF-8 when another encoding is given,
 * stripped of slash-star comments and CDATA wrappers, Unicode-escaped and
 * parsed by Tidy. Every child of the body node is passed to _filterNode(),
 * which accumulates the output; the result is un-escaped, converted back to
 * the original encoding and handed to _insertVideo().
 *
 * @param string $html     Markup to clean.
 * @param string $encoding Encoding of $html; anything other than 'utf-8' triggers conversion.
 * @return string|false False when $html is empty, otherwise the processed markup.
 */
public function tidy($html, $encoding = 'utf-8')
{
    if ($html == '') {
        return false;
    }

    $needsConversion = ($encoding !== 'utf-8');
    $collected = '';
    $markup = trim($html);

    // Handle input that is not UTF-8.
    if ($needsConversion) {
        $markup = BaseModelCommon::convertEncoding($markup, 'utf-8', $encoding);
    }

    // Strip every comment block.
    $markup = preg_replace("|\\/\\*(.*)\\*\\/|sU", "", $markup);
    // Remove CDATA wrappers, keeping their content.
    $markup = preg_replace("/<!\\[CDATA\\[(.*?)\\]\\]>/is", "\\1", $markup);
    // Escape Unicode characters.
    $markup = $this->_escapeUnicode($markup);

    // NOTE(review): search and replacement are the same string, so this call
    // leaves the markup unchanged — confirm what was intended.
    $markup = str_replace("&", "&", $markup);

    $document = tidy_parse_string(
        $markup,
        array('output-xhtml' => true, 'show-body-only' => true, 'join-classes' => true),
        'utf8'
    );
    $bodyNode = $document->body();
    if ($bodyNode->child) {
        foreach ($bodyNode->child as $node) {
            $this->_filterNode($node, $collected);
        }
    }

    // Reverse the Unicode escaping.
    $result = $this->_unEscapeUnicode($collected);
    if ($needsConversion) {
        $result = BaseModelCommon::convertEncoding($result, $encoding, 'utf-8');
    }

    return $this->_insertVideo($result);
}
/**
 * Repairs markup with Tidy and loads the resulting html node into a fresh
 * phpQuery document, after unloading any documents phpQuery already holds.
 *
 * @param string $html Markup to load.
 * @return mixed Whatever phpQuery::newDocumentHTML() returns.
 */
function load_html($html)
{
    $document = tidy_parse_string($html);
    $document->cleanRepair();
    $rootNode = $document->html();

    phpQuery::unloadDocuments();

    return phpQuery::newDocumentHTML($rootNode);
}
/**
 * Repairs markup with Tidy and returns the body's children as one string
 * with all PHP_EOL sequences removed.
 *
 * @param string $html     Markup to clean.
 * @param string $encoding Tidy encoding name used for parsing.
 * @return string Concatenated body children; empty when the body has none.
 */
public function cleanHtml($html, $encoding = 'utf8')
{
    // Pass the caller's encoding through instead of re-assigning it in the
    // argument list, which forced 'utf8' regardless of what was requested.
    $tidy = tidy_parse_string($html, $this->options(), $encoding);
    $tidy->cleanRepair();

    // body() can yield no node; treat that like a body without children.
    $body = $tidy->body();
    $children = ($body !== null && $body->child) ? $body->child : [];

    return str_replace(PHP_EOL, '', join('', $children));
}
/**
 * Parses markup with Tidy, converts the body node with toArray() and joins
 * the resulting entries with newlines.
 *
 * @param string $html Markup to prepare.
 * @return string
 */
public function prepareHtmlInput($html)
{
    $document = tidy_parse_string(
        $html,
        array('wrap' => false, 'show-body-only' => true),
        'utf8'
    );
    $lines = $this->toArray($document->body());

    return implode("\n", $lines);
}
/**
 * Produces indented HTML for the given markup.
 *
 * The repaired Tidy document is stored in $this->htmlOutput and also
 * returned; it is the tidy object itself, not a plain string.
 *
 * @param $html
 * @return mixed
 */
public function getPrettyHtml($html)
{
    // TODO without tidy support
    $document = tidy_parse_string(
        $html,
        [
            'show-body-only' => true,
            'indent' => true,
            'output-html' => true,
            'wrap' => 200,
        ],
        'UTF8'
    );
    tidy_clean_repair($document);

    $this->htmlOutput = $document;

    return $document;
}
/**
 * Repairs markup with Tidy when the extension's functions exist; otherwise
 * the input is returned untouched.
 *
 * @param string $html Markup to repair.
 * @return mixed The repaired document's value, or $html when Tidy is unavailable.
 */
private function tidy($html)
{
    if (!function_exists('tidy_parse_string')) {
        return $html;
    }

    $document = tidy_parse_string($html, array(), 'UTF8');
    tidy_clean_repair($document);

    return $document->value;
}
/**
 * Builds a page object from a fetched response using HTML Tidy.
 *
 * The guarded response content is parsed as latin1, the resulting tree is
 * walked, labels are attached to the collected widgets and the forms are set
 * on the page. Parser state is released with free() before returning.
 *
 * @param $response SimpleHttpResponse    Fetched response.
 * @return SimplePage                     Newly parsed page.
 */
function parse($response)
{
    $this->page = new SimplePage($response);

    $guardedContent = $this->insertGuards($response->getContent());
    $tidyOptions = array('output-xml' => false, 'wrap' => '0', 'indent' => 'no');
    $document = tidy_parse_string($guardedContent, $tidyOptions, 'latin1');

    $this->walkTree($document->html());
    $this->attachLabels($this->widgets_by_id, $this->labels);
    $this->page->setForms($this->forms);

    // Keep a handle on the page: free() is called before it is returned.
    $parsedPage = $this->page;
    $this->free();

    return $parsedPage;
}
/**
 * Loads an HTML file into $this->dom.
 *
 * The file content is passed through utf8_encode(), repaired by Tidy, stripped
 * of script and style tags via removeTags() and finally parsed by DOMDocument
 * with whitespace preservation disabled.
 *
 * @param string $fileName Path of the HTML file to load.
 * @throws \RuntimeException When the file cannot be read.
 */
public function __construct($fileName)
{
    // file_get_contents() returns false on failure; without this check the
    // failure would be parsed as if it were an empty document.
    $raw = file_get_contents($fileName);
    if ($raw === false) {
        throw new \RuntimeException('Unable to read HTML file: ' . $fileName);
    }

    $tidy = tidy_parse_string(utf8_encode($raw));
    $tidy->cleanRepair();

    $html = $tidy->html();
    $html = $html->value;
    $html = $this->removeTags($html, ["script", "style"]);

    $this->dom = new DOMDocument();
    $this->dom->preserveWhiteSpace = false;
    $this->dom->loadHTML($html);
}
/**
 * Minimal placeholder-substitution template engine; a full template engine
 * could replace it if the scope grows.
 *
 * Substitutes the destination title, body HTML and navigation into a copy of
 * the template, then lets Tidy indent the result as XHTML.
 *
 * @return string
 */
public function render()
{
    $placeholders = ['{DESTINATION NAME}', '{CONTENT}', '{NAVIGATION}'];
    $values = [
        $this->destination->title,
        $this->destination->getBodyHtml(),
        $this->renderNavigation(),
    ];

    // str_replace() applies the pairs one after another, in array order.
    $page = str_replace($placeholders, $values, $this->template);

    // clean up the html to make reviewing easier
    $document = tidy_parse_string(
        $page,
        ['indent' => true, 'output-xhtml' => true, 'wrap' => 0],
        'utf8'
    );
    tidy_clean_repair($document);

    return tidy_get_output($document);
}
/**
 * Takes the generated output, cleans it up with Tidy using the plugin's
 * configuration settings and stores the cleaned document on the grav
 * instance again. Does nothing when the current path is to be skipped.
 */
public function onOutputGenerated()
{
    $currentPath = $this->grav['uri']->path();
    if ($this->skipCurrentSite($currentPath)) {
        return;
    }

    $rawOutput = $this->grav->output;

    $tidyOptions = array(
        'indent' => $this->_getConfigSetting('indent'),
        'indent-spaces' => $this->_getConfigSetting('indent_spaces'),
        'wrap' => $this->_getConfigSetting('wrap'),
        'hide-comments' => $this->_getConfigSetting('hide_comments'),
        // The tag lists are joined into space-separated strings.
        'new-blocklevel-tags' => implode(' ', $this->_getConfigSetting('blocklevel_tags')),
        'new-empty-tags' => implode(' ', $this->_getConfigSetting('empty_tags')),
        'new-inline-tags' => implode(' ', $this->_getConfigSetting('inline_tags')),
        'newline' => 'LF',
    );

    /** @var tidy $document */
    $document = tidy_parse_string($rawOutput, $tidyOptions, 'UTF8');
    tidy_clean_repair($document);

    // The tidy object itself (not a string cast of it) is stored.
    $this->grav->output = $document;
}
/**
 * Re-formats the rendered view content with Tidy when the extension is
 * loaded, then post-processes the markup: the two known XHTML html opening
 * tags are replaced by an HTML5 doctype and the line break Tidy leaves before
 * a closing script tag is removed.
 *
 * @param mixed $event Not read by this handler.
 * @param mixed $view  Object providing getContent() and setContent().
 */
public function afterRender($event, $view)
{
    if (!extension_loaded('tidy')) {
        return;
    }

    $tidyOptions = array(
        'hide-comments' => true,
        'tidy-mark' => false,
        'indent' => true,
        'indent-spaces' => 4,
        'new-blocklevel-tags' => 'article,header,footer,section,nav',
        'new-inline-tags' => 'video,audio,canvas,ruby,rt,rp',
        'doctype' => '<!DOCTYPE HTML>',
        'sort-attributes' => 'alpha',
        'vertical-space' => false,
        'output-xhtml' => true,
        'wrap' => 150,
        'wrap-attributes' => false,
        'break-before-br' => false,
    );

    $document = tidy_parse_string($view->getContent(), $tidyOptions, 'utf8');
    $document->cleanRepair();

    $xhtmlOpeners = array(
        '<html lang="en" xmlns="http://www.w3.org/1999/xhtml">',
        '<html xmlns="http://www.w3.org/1999/xhtml">',
    );
    // str_replace() converts the tidy document to a string here.
    $markup = str_replace($xhtmlOpeners, '<!DOCTYPE html>', $document);
    $markup = str_replace(">\n</script>", "></script>", $markup);

    $view->setContent((string) $markup);
}
/**
 * Cleans content with Tidy and returns the repaired markup.
 *
 * When no content is passed, the rendered child nodes are used instead.
 *
 * @param string $content
 * @param string $encoding
 * @throws \RuntimeException When the hasTidy flag is not strictly true.
 * @return string
 */
public function render($content = null, $encoding = 'utf8')
{
    if (null === $content) {
        $content = $this->renderChildren();
    }

    if (true !== $this->hasTidy) {
        throw new \RuntimeException('TidyViewHelper requires the PHP extension "tidy" which is not installed or not loaded.', 1352059753);
    }

    $document = tidy_parse_string($content, [], $encoding);
    tidy_clean_repair($document);

    return tidy_get_output($document);
}
/**
 * Assembles the complete page, prints it and terminates the script.
 *
 * An error page replaces the body when theme('output_error') yields one;
 * otherwise the side and lower panels are rendered. Pending entries of the
 * global $error list are shown above the body. The SEO include is executed
 * inside this function's scope, and the markup is indented by Tidy when the
 * extension is loaded.
 *
 * @param string $title Page title; " - Page N" is appended for pages after the first.
 * @param string $body  Page body markup.
 * @param string $head  Extra head markup; when left empty the global $head is used.
 */
function output($title = '', $body = '', $head = '')
{
    // $head is intentionally not imported with `global`: that would rebind
    // the name to the global variable and discard the argument.
    global $settings, $authid, $checkleft, $checkright, $error, $error_die;

    if ($head === '') {
        $head = isset($GLOBALS['head']) ? $GLOBALS['head'] : null;
    }

    if (theme('output_error') != false) {
        $body = theme('output_error');
        $title = 'Error';
        $panels = false;
        $lowerpanel = false;
        $panel = '';
        unset($error_die);
    } else {
        $panels = true;
    }

    //display panels
    if ($panels != false) {
        $panel = theme('displaypanels');
        $lowerpanel = theme('displaylowerpanel');
    }

    if (isset($error) && !empty($error)) {
        $errors = '<br />' . theme('title', 'Error') . theme('start_content') . '<div class="errors"><ul>';
        foreach ($error as $error1) {
            $errors .= '<li>' . $error1 . '</li>';
        }
        $errors .= '</ul></div>' . theme('end_content');
        unset($error);
    } else {
        $errors = '';
    }

    // The page number comes straight from the query string; casting it to an
    // integer keeps arbitrary markup out of the page title.
    $pageNumber = isset($_GET['page']) ? (int) $_GET['page'] : 0;
    if ($pageNumber > 1) {
        $title = $title . ' - Page ' . $pageNumber;
    }

    $output = theme('head', stripslashes($title), $head) . '<body>';
    if ($settings['maintenance_mode'] == 'on') {
        $output .= '<div class="titlebg">WARNING: Maintenance Mode is on</div>';
    }
    $output .= '<div id="container"> ' . theme('top') . theme('links');
    $output .= $panel;

    //display the data
    $output .= $errors . '<br />' . stripslashes($body);
    $output .= $lowerpanel . theme('footer');

    //SEO Friendly Links (runs in this scope, so local names such as $output are kept as-is)
    include IN_PATH . '/functions/seofriendlyurls.php';

    //Check if the tidy library is installed
    if (extension_loaded('tidy')) {
        //clean up all the HTML so it looks nice in the browser's View Source
        $options = array("indent" => true, 'wrap' => 0);
        $output = tidy_parse_string($output, $options);
        tidy_clean_repair($output);
    }

    die($output);
}
/**
 * Cleans content with Tidy in XML mode (UTF-8 in and out, indented) and
 * returns the repaired markup.
 *
 * When no content is passed, the rendered child nodes are used instead.
 *
 * @param string $content
 * @throws \RuntimeException When the hasTidy flag is not strictly TRUE.
 * @return string
 */
public function render($content = NULL)
{
    if (NULL === $content) {
        $content = $this->renderChildren();
    }

    if (TRUE !== $this->hasTidy) {
        throw new \RuntimeException('TidyViewHelper requires the PHP extension "tidy" which is not installed or not loaded.', 1352059753);
    }

    $tidyOptions = array(
        'output-xml' => TRUE,
        'input-xml' => TRUE,
        'indent' => TRUE,
        'quote-nbsp' => FALSE,
        'input-encoding' => 'utf8',
        'output-encoding' => 'utf8',
        'char-encoding' => 'utf8',
    );
    $document = tidy_parse_string($content, $tidyOptions);
    tidy_clean_repair($document);

    return tidy_get_output($document);
}
/**
 * Fetches the front page to obtain the request token from the head element's
 * data-requesttoken attribute, then sends a GET request to $url carrying that
 * token header plus the folder's token and password as parameters.
 *
 * @param \Step\Api\TokenUser $I             Actor used to send the requests.
 * @param string              $url           Endpoint to call.
 * @param array               $params        Extra query parameters; token and password are merged on top.
 * @param string              $acceptHeaders Accept header used for the second request.
 */
private function connect(\Step\Api\TokenUser $I, $url, $params = [], $acceptHeaders = 'application/json, text/javascript, */*;q=0.01')
{
    // First request: load the page like a browser would.
    $I->haveHttpHeader('Accept', $this->browserHeader);
    $I->sendGET('/');

    $document = tidy_parse_string($I->grabResponse());
    $headNode = $document->head();
    $requestToken = $headNode->attribute['data-requesttoken'];

    // Second request: the actual call, authenticated with the token.
    $I->haveHttpHeader('Accept', $acceptHeaders);
    $I->haveHttpHeader('requesttoken', $requestToken);

    $credentials = [
        'token' => $this->folderMetaData['token'],
        'password' => $this->folderMetaData['password'],
    ];
    $I->sendGET($url, array_merge($params, $credentials));
}
/**
 * Runs an HTML fragment through Tidy as XHTML and returns the cleaned
 * fragment.
 *
 * The fragment is wrapped in a complete page whose body is fenced by
 * wrapdelimiter comments; after parsing, the text between the two delimiters
 * is taken and the body tags are removed from it.
 *
 * @param string $code HTML fragment (not a complete page).
 * @return string
 */
public static function purifyHtml($code)
{
    //$code is not a complete page so we need to wrap it!
    $prefix = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> <title>Just A Wrapper</title><meta http-equiv="content-type" content="text/html;charset=UTF-8"/> </head> <!--wrapdelimiter--><body>';
    $suffix = ' </body><!--wrapdelimiter--></html>';

    $tidyOptions = array('indent' => false, 'output-xhtml' => TRUE, 'wrap' => 0);
    $document = tidy_parse_string($prefix . $code . $suffix, $tidyOptions, 'UTF8');

    // explode() converts the tidy document to its string output.
    $segments = explode("<!--wrapdelimiter-->", $document);

    return str_replace(array("<body>", "</body>"), "", $segments[1]);
}
/**
 * tidyClean
 * Cleans HTML source code.
 *
 * Script and style elements are removed first; the remainder is repaired by
 * Tidy. When Tidy is flagged as unavailable the input is returned unchanged.
 *
 * @param mixed $html
 * @param array $options Tidy options; when empty, indenting and show-body-only are used.
 * @access private
 * @return mixed The repaired document's value, or $html when Tidy is unavailable.
 */
private function tidyClean($html, $options = array())
{
    if (!$this->tidyAvailable) {
        return $html;
    }

    $stripPatterns = array(
        "'<script[^>]*?>.*?</script>'si",
        "'<style[^>]*?>.*?</style>'si",
    );
    $markup = preg_replace($stripPatterns, "", $html);

    $tidyOptions = empty($options)
        ? array('indent' => true, 'show-body-only' => true)
        : $options;

    $document = tidy_parse_string($markup, $tidyOptions, "utf8");
    tidy_clean_repair($document);

    return $document->value;
}
/**
 * Converts a string, or every element of an array, into valid,
 * standards-compliant (x)HTML.
 *
 * Options come from tidy.conf, which should at least set show-body-only to
 * yes. Depending on what is available, the newer tidy extension API, the
 * older function-only API or the command-line executable is used; when none
 * is available a notice is triggered and the wrapped text is returned as-is.
 *
 * @param mixed $text The data to be tidied up
 * @return mixed $result Tidied data
 */
function tidy($text)
{
    static $extensionFuncs;
    static $configFile;

    if (!isset($configFile)) {
        $configFile = SETTINGS_INC . 'tidy.conf';
    }

    // Arrays are processed element by element, keeping their keys.
    if (is_array($text)) {
        $tidied = array();
        foreach ($text as $key => $value) {
            $tidied[$key] = tidy($value);
        }
        return $tidied;
    }

    // determine what tidy libraries are available
    if (empty($extensionFuncs)) {
        $extensionFuncs = get_extension_funcs('tidy');
    }
    $extensionPresent = !empty($extensionFuncs);
    // tidy_setopt only exists in the older API, so its presence tells the two apart.
    $legacyApi = $extensionPresent && in_array('tidy_setopt', $extensionFuncs);
    $objectApi = $extensionPresent && !$legacyApi;
    $cliPresent = TIDY_EXE ? file_exists(TIDY_EXE) : false;

    $text = protect_string_from_tidy($text);
    $text = '<html><body>' . $text . '</body></html>';

    if ($objectApi) {
        $document = new tidy();
        $document->parseString($text, $configFile, 'utf8');
        $document->cleanRepair();
        $result = $document;
    } elseif ($legacyApi) {
        tidy_load_config($configFile);
        tidy_set_encoding('utf8');
        tidy_parse_string($text);
        tidy_clean_repair();
        $result = tidy_get_output();
    } elseif ($cliPresent) {
        // escape the bad stuff in the text
        $escapedText = escapeshellarg($text);
        // pipe the input to tidy and discard its error output
        $command = 'echo ' . $escapedText . ' | ' . TIDY_EXE . ' -q -config ' . $configFile . ' 2> /dev/null';
        $result = shell_exec($command);
    } else {
        trigger_error('tidy does not appear to be available within php or at the command line - no tidying is taking place.');
        $result = $text;
    }

    return trim($result);
}
/**
 * Loads an HTML string
 *
 * The markup is normalised by Tidy when the extension is available; otherwise
 * (or when Tidy yields no html node) DOMDocument is used as a fallback.
 * Whitespace between tags is then removed and a close element is inserted
 * before the closing body tag, before delegating to the parent loader.
 *
 * @param string $str HTML text to load
 */
function load_html($str)
{
    $normalized = null;

    // A missing extension surfaces as an undefined function/class error, which
    // a catch for Exception does not intercept, so availability is checked
    // explicitly instead.
    if (function_exists('tidy_parse_string')) {
        try {
            $tidy = tidy_parse_string($str);
            //$tidy->cleanRepair();
            $root = $tidy ? $tidy->html() : null;
            if ($root) {
                $normalized = $root->value;
            }
        } catch (Exception $e) {
            $normalized = null;
        }
    }

    if ($normalized === null) {
        // TODO: notify the user that tidy is not available
        $doc = new DOMDocument();
        $doc->loadHTML($str);
        $normalized = @$doc->saveHTML();
    }

    $str = preg_replace('/>\\s*?</', '><', $normalized);
    $str = str_replace('</body>', '<close></body>', $str);

    parent::load_html($str);
}
/**
 * Scrapes every listing not yet marked as scraped.
 *
 * For each listing the page is fetched, the street address and the readable
 * description are extracted, the address (or, without one, the listing's
 * neighborhood) is geocoded, and the row is updated and marked as scraped.
 * A failure on one listing is logged and does not stop the loop.
 */
function perform()
{
    $q = DB::query('SELECT link, neighborhood FROM listings WHERE scraped != TRUE', PDO::FETCH_ASSOC);
    $ps = DB::prepare('UPDATE listings SET scraped=TRUE, street=:street, description=:description, lat=:lat, lng=:lng WHERE link=:link');

    /*
    Guzzle::sendAll(array_map(function ($listing) {
        return Guzzle::createRequest('GET', 'http://newyork.craigslist.org' . $listing['link']);
    }, iterator_to_array($q)), ['complete' => function ($event) use($ps) {
        try {
            $body = $event->getResponse()->getBody();
            $crawler = new Crawler($body);
            $readability = new Readability($body);
            $street = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
            $ps->execute([
                ':link' => parse_url($event->getRequest()->getUrl())['path'],
                ':lat' => null,
                ':lng' => null,
                ':street' => $street->count() ? $street->text() : null,
                ':description' => $readability->init() ? trim(strip_tags(tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8'))) : null
            ]);
        } catch (Exception $e) {
            Logger::error($e->getMessage(), $ps->errorinfo());
        }
    }]);
    */

    foreach ($q as $listing) {
        try {
            $body = Guzzle::get('http://newyork.craigslist.org' . $listing['link'])->getBody();
            $crawler = new Crawler($body);
            $readability = new Readability($body);

            $street = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
            $streetText = $street->count() ? $street->text() : null;

            // Addresses contain spaces and may contain '&' or '#', so the value
            // must be URL-encoded before it goes into the query string.
            $address = $streetText !== null ? $streetText : $listing['neighborhood'];
            $url = 'http://maps.googleapis.com/maps/api/geocode/json?address=' . urlencode($address);

            $json = json_decode(Guzzle::get($url)->getBody(), true);
            $loc = isset($json['results'][0]) ? $json['results'][0]['geometry']['location'] : null;

            // Tidy normalises the extracted markup before the tags are stripped.
            $description = $readability->init()
                ? trim(strip_tags(tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8')))
                : null;

            $ps->execute([
                ':link' => $listing['link'],
                ':lat' => isset($loc['lat']) ? $loc['lat'] : null,
                ':lng' => isset($loc['lng']) ? $loc['lng'] : null,
                ':street' => $streetText,
                ':description' => $description,
            ]);
        } catch (Exception $e) {
            Logger::error($e->getMessage(), $ps->errorinfo());
        }
    }
}
/**
 * Tidies text in-process through the HTML tidy PECL extension, which avoids
 * the cost of spawning a separate process. Written against the PHP 4.3.x
 * version of the extension; it may not work on PHP 5.
 *
 * 'pear install tidy' should be able to compile the extension module.
 *
 * Returns null when tidy reports a fatal error, otherwise the cleaned source.
 */
private static function internal($text)
{
    global $wgTidyConf;

    $profileSection = 'Parser::internalTidy';
    wfProfileIn($profileSection);

    tidy_load_config($wgTidyConf);
    tidy_set_encoding('utf8');
    tidy_parse_string($text);
    tidy_clean_repair();

    // 2 is magic number for fatal error
    // http://www.php.net/manual/en/function.tidy-get-status.php
    $cleaned = (tidy_get_status() == 2) ? null : tidy_get_output();

    wfProfileOut($profileSection);

    return $cleaned;
}
/**
 * Highlights the nodes matched by an XPath expression in a stored page.
 *
 * The page markup is read from the data child of $url, normalised by Tidy
 * (or, without the extension, purified and wrapped in a minimal document),
 * given a base element pointing at $base_url, and every node matched by
 * $xpath receives a red background.
 *
 * @param mixed  $url   DOM element whose data child holds the page markup.
 * @param string $xpath XPath expression selecting the nodes to highlight.
 * @return string The modified document as HTML, or a translated notice when the page is empty.
 */
function enlight_xpath($url, $xpath)
{
    global $base_url;
    static $purifier;
    static $loaded = false;

    $data = $url->getElementsByTagName('data')->item(0)->textContent;
    if (trim($data) == '') {
        return tra('The page is empty');
    }

    if (extension_loaded('tidy')) {
        $tidy = tidy_parse_string($data, array(), 'utf8');
        tidy_diagnose($tidy);
        $data = (string) $tidy;
    } else {
        // The purifier is built once and reused on later calls.
        if (!$loaded) {
            require_once('lib/htmlpurifier_tiki/HTMLPurifier.tiki.php');
            $config = getHTMLPurifierTikiConfig();
            $config->set('Attr.EnableID', true);
            $purifier = new HTMLPurifier($config);
            $loaded = true;
        }
        if ($purifier) {
            $data = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>'.$purifier->purify($data).'</body></html>';
        }
    }

    // loadHTML() is an instance method; calling it statically is an error on
    // current PHP versions.
    $dom_ref = new DOMDocument();
    $dom_ref->loadHTML($data);
    $xp_ref = new DomXPath($dom_ref);

    // Insert the base element only when the document actually has a head.
    $head = $xp_ref->query('//head')->item(0);
    if ($head !== null) {
        $base = $dom_ref->createElement('base');
        $base->setAttribute('href', $base_url);
        $head->insertBefore($base, $head->firstChild);
    }

    foreach ($xp_ref->query($xpath) as $ref) {
        $ref->setAttribute('style', 'background-color: red;');
    }

    return $dom_ref->saveHTML();
}
/**
 * Normalises an HTML fragment to XHTML using Tidy.
 *
 * The markup is repaired, re-parsed, and the values of the body's child nodes
 * are concatenated. The input is returned unchanged when Tidy is unavailable,
 * when the repair yields nothing, or when the body has no child list.
 *
 * @param string $html Markup to standardise.
 * @return string
 */
function html_standardization($html)
{
    if (!function_exists('tidy_repair_string')) {
        return $html;
    }

    $xhtmlOptions = array('output-xhtml' => true);

    $repaired = tidy_repair_string($html, $xhtmlOptions, 'utf8');
    if (!$repaired) {
        return $html;
    }

    $document = tidy_parse_string($repaired, $xhtmlOptions, 'utf8');
    $bodyNode = @tidy_get_body($document);
    $children = $bodyNode ? $bodyNode->child : null;
    if (!is_array($children)) {
        return $html;
    }

    $standardHtml = '';
    foreach ($children as $child) {
        $standardHtml .= $child->value;
    }

    return $standardHtml;
}
/**
 * Cleans content into XHTML body markup, preferring the tidy extension,
 * then the command-line tidy binary, and finally SS_HTML4Value.
 *
 * Output from either Tidy variant is passed through rewriteShortcodes().
 *
 * @param string $content   Markup to clean.
 * @param bool   $stripWord Enables Tidy's 'bare' and 'word-2000' options (extension path only).
 * @return string
 */
public function tidy($content, $stripWord = false)
{
    // Try to use the extension first
    if (extension_loaded('tidy')) {
        $tidyOptions = array(
            'clean' => true,
            'output-xhtml' => true,
            'show-body-only' => true,
            'quote-nbsp' => true,
            'wrap' => 0,
            'input-encoding' => 'utf8',
            'output-encoding' => 'utf8',
            'new-blocklevel-tags' => 'article aside audio details figcaption figure footer header hgroup nav section source summary temp track video',
            'new-empty-tags' => 'command embed keygen source track wbr',
            'new-inline-tags' => 'audio canvas command datalist embed keygen mark meter output progress time video wbr',
            'bare' => $stripWord,
            'word-2000' => $stripWord,
        );
        $document = tidy_parse_string($content, $tidyOptions);
        tidy_clean_repair($document);

        return $this->rewriteShortcodes((string) $document);
    }

    // No PHP extension available, attempt to use CLI tidy.
    $exitCode = null;
    $versionOutput = null;
    @exec('tidy --version', $versionOutput, $exitCode);
    if ($exitCode === 0) {
        $escapedInput = escapeshellarg($content);
        // Doesn't work on Windows, sorry, stick to the extension.
        $cliOutput = @shell_exec("echo $escapedInput | tidy -q --show-body-only yes --input-encoding utf8 --output-encoding utf8 --wrap 0 --clean yes --output-xhtml yes");

        return $this->rewriteShortcodes($cliOutput);
    }

    // Fall back to default
    $fallback = new SS_HTML4Value($content);

    return $fallback->getContent();
}