Exemplo n.º 1
0
 /**
  * Gathers the advertisement links found on a remote page.
  *
  * Requests $post['url'] through $this->_client, optionally repairs the
  * returned markup with Tidy, then collects the href of every anchor that is
  * an http:// URL on the host named by $post['ad'] (optionally preceded by a
  * single sub-domain label).
  *
  * @param array $post The post data submitted by the form; the 'url' and 'ad' keys are read.
  * @return array The href values of the matching anchors.
  */
 public function fetchAds($post)
 {
     $this->_client->setUri($post['url']);
     $response = $this->_client->request('GET')->getBody();
     /**
      * If the tidy class exists, attempt to clean up the markup returned by
      * the remote site before handing it to the DOM parser.
      */
     if (class_exists('tidy')) {
         // NOTE(review): the constructor is pointed at /dev/null, presumably only to
         // apply the configuration and encoding before parseString() — confirm.
         $tidy = new Tidy('/dev/null', array('indent' => true, 'tab-size' => 4, 'output-encoding' => 'utf8', 'newline' => 'LF', 'output-xhtml' => true), 'utf8');
         $tidy->parseString($response);
         $tidy->cleanRepair();
         $response = $tidy->value;
     }
     /**
      * Once we've attempted to clean up the retrieved HTML, attempt to parse
      * the result in a DOMDocument.
      */
     $xml = new DOMDocument('1.0', 'utf-8');
     $xml->loadHTML($response);
     // The advertiser host comes straight from form input, so it is quoted before
     // being embedded in the pattern: otherwise "." would match any character and
     // a "/" in the input would terminate the pattern early.
     $adHost = preg_quote($post['ad'], '/');
     $pattern = '/^http:\/\/([a-z\-]+\.)?' . $adHost . '.*$/i';
     $result = array();
     # For each of the anchor links in the page,
     foreach ($xml->getElementsByTagName('a') as $a) {
         # get its target href
         $href = $a->getAttribute('href');
         if (preg_match($pattern, $href)) {
             # and keep it when it points at the advertising site.
             $result[] = $href;
         }
     }
     return $result;
 }
Exemplo n.º 2
0
 /**
  * Builds the parser from a string of (possibly malformed) HTML.
  *
  * The content is converted to XHTML with the tidy PHP extension when it is
  * loaded, otherwise with the tidy command line tool when `which tidy` finds
  * one, otherwise it is used as-is. The result is loaded into $this->simpleXML.
  *
  * @param string $content Markup to parse.
  * @throws Exception When the resulting markup cannot be loaded by SimpleXML.
  */
 public function __construct($content)
 {
     // Fragments removed from tidy's output before parsing: the XHTML default
     // namespace declaration, and non-breaking spaces both as the numeric entity
     // tidy emits with 'numeric-entities' and as the raw UTF-8 character (U+00A0).
     $noise = array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;', "\xC2\xA0");
     if (extension_loaded('tidy')) {
         // using the tidy php extension
         $tidy = new Tidy();
         $tidy->parseString($content, array('output-xhtml' => true, 'numeric-entities' => true, 'wrap' => 99999), 'utf8');
         $tidy->cleanRepair();
         $tidy = str_replace($noise, '', (string) $tidy);
     } elseif (@shell_exec('which tidy')) {
         // using tidy through cli
         $CLI_content = escapeshellarg($content);
         $tidy = `echo {$CLI_content} | tidy -n -q -utf8 -asxhtml 2> /dev/null`;
         $tidy = str_replace($noise, '', (string) $tidy);
     } else {
         // no tidy library found, hence no sanitizing
         $tidy = $content;
     }
     // Parser warnings are silenced; failure is reported through the exception below.
     $this->simpleXML = @simplexml_load_string($tidy, 'SimpleXMLElement', LIBXML_NOWARNING);
     if (!$this->simpleXML) {
         throw new Exception('CSSContentParser::__construct(): Could not parse content.' . ' Please check the PHP extension tidy is installed.');
     }
     parent::__construct();
 }
Exemplo n.º 3
0
 /**
  * Applies the stylesheet held by $this->xsltProcessor to an XML document and
  * returns XHTML-compatible markup.
  * @param $xml XML DOM tree, or string filename to load one from
  * @return string HTML output, re-formatted by Tidy when the class is available
  * @todo Rename to transformToXHTML, as transformToHTML is misleading
  */
 public function transformToHTML($xml)
 {
     $document = $xml;
     if (is_string($xml)) {
         // A string is treated as a filename to load.
         $document = new DOMDocument();
         $document->load($xml);
     }
     $markup = $this->xsltProcessor->transformToXML($document);
     // Fudges for HTML backwards compatibility (the document is assumed to be
     // XHTML): first "<br/>" becomes "<br />", then empty xmlns attributes go.
     $markup = str_replace(array('/>', ' xmlns=""'), array(' />', ''), $markup);
     if (!class_exists('Tidy')) {
         return $markup;
     }
     // Clean up the output.
     $cleaner = new Tidy();
     $cleaner->parseString($markup, array('indent' => true, 'output-xhtml' => true, 'wrap' => 80), 'utf8');
     $cleaner->cleanRepair();
     return (string) $cleaner;
 }
Exemplo n.º 4
0
 /**
  * Serialises an array of tokens into HTML.
  *
  * Optionally re-formats the result with Tidy (Output.TidyFormat) and converts
  * "\n" to the newline configured in Output.Newline (PHP_EOL when unset).
  *
  * @param $tokens Array of HTMLPurifier_Token
  * @return string Generated HTML
  */
 public function generateFromTokens($tokens)
 {
     if (!$tokens) {
         return '';
     }
     $html = '';
     $total = count($tokens);
     $pos = 0;
     while ($pos < $total) {
         // Script special case: the contents of the script block must be ONE
         // token for this to work, i.e. start tag, contents, end tag.
         $isScriptBlock = $this->_scriptFix
             && $tokens[$pos]->name === 'script'
             && $pos + 2 < $total
             && $tokens[$pos + 2] instanceof HTMLPurifier_Token_End;
         if ($isScriptBlock) {
             $html .= $this->generateFromToken($tokens[$pos]);
             $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
             $pos += 2;
         }
         $html .= $this->generateFromToken($tokens[$pos]);
         $pos++;
     }
     // Tidy cleanup
     if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
         $tidyOptions = array('indent' => true, 'output-xhtml' => $this->_xhtml, 'show-body-only' => true, 'indent-spaces' => 2, 'wrap' => 68);
         $formatter = new Tidy();
         $formatter->parseString($html, $tidyOptions, 'utf8');
         $formatter->cleanRepair();
         // explicit cast necessary
         $html = (string) $formatter;
     }
     // Normalize newlines to the configured value, falling back to the system one
     $newline = $this->config->get('Output.Newline');
     $newline = $newline === null ? PHP_EOL : $newline;
     return $newline !== "\n" ? str_replace("\n", $newline, $html) : $html;
 }
Exemplo n.º 5
0
	/**
	 * Builds the parser from a string of HTML.
	 *
	 * The content is tidied into XHTML with the tidy PHP extension when loaded,
	 * otherwise with the tidy command line tool when `which tidy` finds one,
	 * otherwise it is used unchanged; the result is parsed into $this->simpleXML.
	 *
	 * @param string $content Markup to parse.
	 */
	function __construct($content) {
		// Removed from tidy's output, in this order: the XHTML namespace
		// declaration, then numeric non-breaking-space entities.
		$noise = array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;');

		if(extension_loaded('tidy')) {
			// using the tidy php extension
			$cleaner = new Tidy();
			$cleaner->parseString(
				$content,
				array('output-xhtml' => true, 'numeric-entities' => true),
				'utf8'
			);
			$cleaner->cleanRepair();
			$markup = str_replace($noise, '', $cleaner);
		} elseif(`which tidy`) {
			// using tidy through cli
			$CLI_content = escapeshellarg($content);
			$cliOutput = `echo $CLI_content | tidy -n -q -utf8 -asxhtml 2> /dev/null`;
			$markup = str_replace($noise, '', $cliOutput);
		} else {
			// no tidy library found, hence no sanitizing
			$markup = $content;
		}

		$this->simpleXML = new SimpleXMLElement($markup);
	}
Exemplo n.º 6
0
 /**
  * Runs the given markup through Tidy in XML-in / XML-out mode, with line
  * wrapping disabled, and returns the repaired document.
  *
  * @param string $content
  * @return string
  */
 protected function _tidyFix($content)
 {
     $repairer = new Tidy();
     $repairer->parseString(
         $content,
         ['input-xml' => true, 'output-xml' => true, 'wrap' => false],
         'utf8'
     );
     $repairer->cleanRepair();

     return (string) $repairer;
 }
Exemplo n.º 7
0
 /**
  * Re-indents HTML responses with Tidy before they are sent.
  *
  * Responses whose request '_format' is not 'html' are returned untouched.
  * For HTML, the value of the tidied document's <html> node is returned.
  *
  * @param array|\ArrayAccess $request  Request data; only '_format' is read.
  * @param string             $response Response body.
  * @return string
  */
 public static function beforeResponse($request, $response)
 {
     if ($request['_format'] != 'html') {
         return $response;
     }
     $formatter = new \Tidy();
     $formatter->parseString($response, array('wrap' => 200, 'indent' => true), 'utf8');
     $formatter->cleanRepair();
     $htmlNode = $formatter->html();

     return $htmlNode->value;
 }
Exemplo n.º 8
0
 /**
  * Cleans up an HTML document.
  *
  * The input is passed through $this->toUTF8(), repaired by Tidy with comments
  * hidden, and then post-processed by moveMetaContentTypeToTop() and
  * formatDocType(), in that order.
  *
  * @param string      $html
  * @param string|null $charset      Forwarded to toUTF8().
  * @param string|null $charset_hint Forwarded to toUTF8().
  * @return string
  */
 public function formatHtml($html, $charset = null, $charset_hint = null)
 {
     $utf8Html = $this->toUTF8($html, $charset, $charset_hint);

     $cleaner = new Tidy();
     $cleaner->parseString($utf8Html, array("hide-comments" => true), 'UTF8');
     $cleaner->cleanRepair();

     $withMetaFirst = $this->moveMetaContentTypeToTop((string) $cleaner);
     return $this->formatDocType($withMetaFirst);
 }
Exemplo n.º 9
0
 /**
  * Tidies the given HTML and loads the result into a DOM document.
  *
  * Stores the raw input in $this->html_code, the tidied markup in
  * $this->tidy_code and the parsed document in $this->dom. When the tidied
  * markup cannot be parsed as XML, $this->dom is set to null and an
  * E_USER_ERROR is raised.
  *
  * @param string $html_code Markup to parse.
  */
 function parse_html($html_code)
 {
     $this->html_code = $html_code;
     // Tidy HTML code
     $tidy = new Tidy();
     $tidy->parseString($html_code, $this->tidy_config, 'utf8');
     $tidy->cleanRepair();
     $this->tidy_code = $tidy->value;
     // loadXML() is an instance method: calling it statically is an Error on
     // PHP 8, so the document is created first and then populated.
     $dom = new DOMDocument();
     // An empty string is rejected up front because loadXML() refuses it.
     $loaded = $this->tidy_code !== null
         && $this->tidy_code !== ''
         && $dom->loadXML($this->tidy_code);
     if (!$loaded) {
         // Report the failure before touching the document; normalising first
         // would fail on the unusable result and hide this error.
         $this->dom = null;
         trigger_error("Unable to parse XML Document!", E_USER_ERROR);
         return;
     }
     $this->dom = $dom;
     $this->dom->normalizeDocument();
 }
Exemplo n.º 10
0
 /**
  * Rewrites every table wrapped in a <div class="rvps14"> or <div class="rvps8">.
  *
  * The wrapper div and the table's own attributes are dropped, the styled
  * header spans (optionally wrapped in a paragraph) become
  * <b class="table-header">, headers containing only <br> are removed, and the
  * table is re-indented by Tidy.
  *
  * @param string $text
  * @return string
  */
 public function formatTables($text)
 {
     $wrappedTable = '%<div class="rvps(?:14|8)">\\n*<table.*?>([\\s\\S]*?)</table>\\n*</div>%u';
     $tidyOptions = array('clean' => true, 'output-html' => true, 'show-body-only' => true, 'wrap' => 0, 'indent' => true);

     $rebuildTable = function ($matches) use ($tidyOptions) {
         $headerSpan = '%(?:<p class="rvps(?:1|4|14)">)?<span class="rvts(?:9|15|23)">\\s*(.*?)\\s*</span>(?:</p>)?%u';
         $emptyHeader = '%<b class="table-header"><br></b>%u';
         // rvps14 - rvps14
         // rvps14 - rvps11
         // rvps4 - rvps15
         $table = preg_replace($headerSpan, '<b class="table-header">$1</b>', '<table>' . $matches[1] . '</table>');
         $table = preg_replace($emptyHeader, '', $table);

         $formatter = new \Tidy();
         $formatter->parseString($table, $tidyOptions, 'utf8');
         $formatter->cleanRepair();
         return $formatter . "\n";
     };

     return preg_replace_callback($wrappedTable, $rebuildTable, $text);
 }
Exemplo n.º 11
0
 /**
  * Reads from the wrapped input and returns it after a Tidy clean-and-repair pass.
  *
  * The filter is initialised on first use. Each chunk read from $this->in is
  * tidied on its own with the distilled configuration and $this->encoding.
  *
  * @param null $len Forwarded to the underlying reader.
  *
  * @throws BuildException When the Tidy class is not available.
  * @return string|int the tidied chunk, or -1 if the end of the underlying stream has been reached
  */
 public function read($len = null)
 {
     if (!class_exists('Tidy')) {
         throw new BuildException("You must enable the 'tidy' extension in your PHP configuration in order to use the Tidy filter.");
     }

     if (!$this->getInitialized()) {
         $this->_initialize();
         $this->setInitialized(true);
     }

     $chunk = $this->in->read($len);
     if ($chunk === -1) {
         // End of stream: pass the sentinel through unchanged.
         return -1;
     }

     $tidyOptions = $this->getDistilledConfig();
     $cleaner = new Tidy();
     $cleaner->parseString($chunk, $tidyOptions, $this->encoding);
     $cleaner->cleanRepair();

     return tidy_get_output($cleaner);
 }
Exemplo n.º 12
0
 /**
  * Applies the stylesheet held by $this->xsltProcessor to an XML document and
  * returns HTML-compatible markup, re-formatted by Tidy when it is available.
  * @param $xml XML DOM tree
  * @return string
  */
 public function transformToHTML($xml)
 {
     // Fudges for HTML backwards compatibility, applied in this order:
     //   "<br/>" becomes "<br />", then the empty and the XHTML xmlns
     //   attributes are removed as unnecessary.
     $search = array('/>', ' xmlns=""', ' xmlns="http://www.w3.org/1999/xhtml"');
     $replace = array(' />', '', '');
     $markup = str_replace($search, $replace, $this->xsltProcessor->transformToXML($xml));

     if (!class_exists('Tidy')) {
         return $markup;
     }

     // Clean up the output.
     $cleaner = new Tidy();
     $cleaner->parseString($markup, array('indent' => true, 'output-xhtml' => true, 'wrap' => 80), 'utf8');
     $cleaner->cleanRepair();
     return (string) $cleaner;
 }
Exemplo n.º 13
0
 /**
  * Downloads a page, tidies it into XHTML and prepares it for XPath queries.
  *
  * URIs that do not start with "http:" or "https:" are prefixed with the
  * configured base href. Populates $this->request_info, $this->location,
  * $this->document and $this->xpath (with the 'atom' and 'html' prefixes
  * registered).
  *
  * @param string $uri Absolute URL, or a path appended to the base href.
  */
 protected function loadHtml($uri)
 {
     $hasScheme = preg_match('/^https?:/i', $uri) !== 0;
     if (!$hasScheme) {
         $uri = $this->config->getBaseHref() . $uri;
     }

     // Fetch the page body as a string instead of printing it.
     $handle = curl_init($uri);
     curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
     $body = curl_exec($handle);
     $this->request_info = curl_getinfo($handle);
     curl_close($handle);
     $this->location = $uri;

     $tidyOptions = array('output-xhtml' => true, 'char-encoding' => 'utf8', 'numeric-entities' => true);
     $tidy = new Tidy();
     $tidy->parseString($body, $tidyOptions, 'utf8');
     $tidy->cleanRepair();

     $document = new DOMDocument();
     $document->resolveExternals = true;
     $document->loadXml($tidy);
     $this->document = $document;

     $xpath = new DOMXPath($document);
     $this->xpath = $xpath;
     $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
     $xpath->registerNamespace('html', 'http://www.w3.org/1999/xhtml');
 }
Exemplo n.º 14
0
    // Builds a JSON response describing the page at the posted URL: HTTP status,
    // a status message, per-tag counts and the tidied, entity-encoded markup.
    // Start with an empty message; both branches below overwrite it.
    $response['message'] = "";
    // Use PHP's filter extension to check for a valid URL. Note the precedence:
    // `!` applies to filter_var()'s result first, so this branch is taken when
    // filter_var() returned a truthy value (the validated URL).
    if (!filter_var($_POST['url'], FILTER_VALIDATE_URL) === false) {
        $url = $_POST['url'];
        $curl = new MyCurl($url);
        $curl->createCurl();
        $response['code'] = $curl->getHttpStatus();
        $response['message'] = HttpCodes::getType($response['code']);
        // NOTE(review): presumably MyCurl::__toString() yields the fetched page body — confirm.
        $html = $curl->__toString();
        if (!is_string($html)) {
            $response['message'] = "Page Could not be loaded, check the domain. Nothing was returned.";
        } else {
            $tidy = new Tidy();
            // Load the page into the tidy object, set options, and clean the HTML
            $tidy->parseString($html, array('indent' => 2, 'output-xhtml' => true));
            $tidy->cleanRepair();
            // The HTML is now nicely indented
            $html = (string) $tidy;
            // Count the tags and get the result in a $tag => $count array
            $tagCount = countTags($html);
            $response['tagCount'] = $tagCount;
            // The markup is entity-encoded before being placed in the response
            $response['html'] = htmlentities($html);
        }
    } else {
        $response['message'] = $_POST['url'] . " is not a valid URL";
    }
    // Emit the response as JSON
    header('Content-Type: application/json');
    echo json_encode($response);
} else {
    //load the base view, located at ../views/base.php
    View::load("base");
Exemplo n.º 15
0
/**
 * Converts HTML tag soup into XHTML that an XML parser can consume.
 *
 * @param string $htmlTagSoup Markup to repair.
 * @return string The tidied XHTML with '&nbsp;' rewritten as '&#160;'.
 */
function tidyToXml($htmlTagSoup)
{
    // Parse the HTML into memory, asking Tidy to convert it to XHTML as part
    // of the tidying process, then run the clean-up itself.
    $repairer = new Tidy();
    $repairer->parseString($htmlTagSoup, array('output-xhtml' => true));
    $repairer->cleanRepair();

    // Opinions differ as to whether the non-breaking space entity '&nbsp;' is
    // predeclared in XHTML: Tidy leaves it alone, while the XML parser that
    // consumes this string does not know it. The numeric equivalent needs no
    // declaration, so every occurrence is swapped for it.
    return str_replace('&nbsp;', '&#160;', tidy_get_output($repairer));
}
Exemplo n.º 16
0
 /**
  * Cleans an HTML fragment with Tidy and returns the repaired body markup.
  *
  * Empty or falsy input yields an empty string.
  *
  * @param string $content HTML fragment to clean.
  * @return string The tidied markup.
  */
 private function clean($content)
 {
     if (!$content) {
         return '';
     }
     $tidy = new \Tidy();
     $tidy->parseString($content, ['indent' => true, 'doctype' => 'omit', 'output-html' => true, 'show-body-only' => true, 'drop-empty-paras' => true, 'drop-font-tags' => true, 'drop-proprietary-attributes' => true, 'hide-comments' => true, 'logical-emphasis' => true]);
     $tidy->cleanRepair();
     // Return Tidy's output; returning $content here would discard the clean-up above.
     return (string) $tidy;
 }
Exemplo n.º 17
0
 /**
  * The filter must produce exactly what Tidy produces when driven directly
  * with the filter's own configuration and encoding.
  *
  * @dataProvider filterProvider
  * @covers Robo47_Filter_Tidy::filter
  */
 public function testFilter($code)
 {
     $filter = new Robo47_Filter_Tidy();
     $actual = $filter->filter($code);

     // Reference output: the same input run through a plain Tidy instance.
     $reference = new Tidy();
     $reference->parseString($code, $filter->getConfig(), $filter->getEncoding());
     $reference->cleanRepair();
     $expected = (string) $reference;

     $this->assertEquals($expected, $actual, 'Filter output missmatches direct tidy-output');
 }
Exemplo n.º 18
0
 /**
 		Clean and repair an HTML string with Tidy, using the options in
 		self::$vars['TIDY'] and the encoding in self::$vars['ENCODING'] with
 		its dashes removed; the input is returned untouched when the tidy
 		extension is not loaded
 			@return string
 			@param $html string
 			@public
 	**/
 static function tidy($html)
 {
     if (extension_loaded('tidy')) {
         $encoding = str_replace('-', '', self::$vars['ENCODING']);
         $cleaner = new Tidy();
         $cleaner->parseString($html, self::$vars['TIDY'], $encoding);
         $cleaner->cleanRepair();
         return (string) $cleaner;
     }
     return $html;
 }
Exemplo n.º 19
0
 /**
  * Runs the value through the Tidy instance held by this filter, using the
  * filter's configuration and encoding, and returns the repaired markup.
  *
  * @see Zend_Filter_Interface::filter
  * @param string $value
  * @return string
  */
 public function filter($value)
 {
     $tidy = $this->_tidy;
     $tidy->parseString($value, $this->getConfig(), $this->getEncoding());
     $tidy->cleanRepair();

     return (string) $tidy;
 }
Exemplo n.º 20
0
/**
 * Custom function that formats a string of HTML using Tidy.
 *
 * Formatting only happens when the Tidy class exists and the 'unl_tidy'
 * variable is truthy; otherwise, or when Tidy's clean-up fails, the input is
 * returned unchanged.
 *
 * @param string $string HTML to format.
 * @return string The formatted HTML, or the original input.
 */
function unl_tidy($string)
{
    if (!class_exists('Tidy') || !variable_get('unl_tidy')) {
        return $string;
    }
    // Tidy options: http://tidy.sourceforge.net/docs/quickref.html
    $options = array('doctype' => 'omit', 'new-blocklevel-tags' => 'article,aside,header,footer,section,nav,hgroup,address,figure,figcaption,output', 'new-inline-tags' => 'video,audio,canvas,ruby,rt,rp,time,code,kbd,samp,var,mark,bdi,bdo,wbr,details,datalist,source,summary', 'output-xhtml' => true, 'show-body-only' => true, 'indent' => true, 'indent-spaces' => 2, 'vertical-space' => false, 'wrap' => 140, 'wrap-attributes' => false, 'force-output' => true, 'quiet' => true, 'tidy-mark' => false);
    // The workaround below is applied to a copy, so a failed clean-up returns
    // the caller's original string rather than the prefixed one.
    $input = $string;
    $trimmed = trim($string);
    // Add &nbsp; to prevent Tidy from removing script or comment if it is the first thing
    if (strtolower(substr($trimmed, 0, 7)) == '<script' || substr($trimmed, 0, 4) == '<!--') {
        $statement = '';
        if (substr($trimmed, 0, 9) !== '<!-- Tidy') {
            $statement = "<!-- Tidy: Start field with something other than script or comment to remove this -->\n";
        }
        $input = "&nbsp;" . $statement . $string;
    }
    $tidy = new Tidy();
    $tidy->parseString($input, $options, 'utf8');
    if ($tidy->cleanRepair()) {
        // Cast so every code path returns a string rather than a Tidy object.
        return (string) $tidy;
    }
    return $string;
}
Exemplo n.º 21
0
 /**
  * Serialises an array of tokens into HTML.
  *
  * Also refreshes $this->_scriptFix, $this->_def and $this->_xhtml from the
  * configuration before generating.
  *
  * @param $tokens Array of HTMLPurifier_Token
  * @param $config HTMLPurifier_Config object; a default one is created when falsy
  * @param $context Context reference; not read by this method
  * @return Generated HTML
  */
 function generateFromTokens($tokens, $config, &$context)
 {
     $html = '';
     if (!$config) {
         $config = HTMLPurifier_Config::createDefault();
     }
     $this->_scriptFix = $config->get('Output', 'CommentScriptContents');
     $this->_def = $config->getHTMLDefinition();
     $this->_xhtml = $this->_def->doctype->xml;
     if (!$tokens) {
         return '';
     }
     $total = count($tokens);
     $pos = 0;
     while ($pos < $total) {
         // Script special case: the contents of the script block must be ONE
         // token for this to work. Looping until the closing tag is
         // deliberately not attempted: it wouldn't be valid anyway.
         $isScriptBlock = $this->_scriptFix
             && $tokens[$pos]->name === 'script'
             && $pos + 2 < $total
             && $tokens[$pos + 2]->type == 'end';
         if ($isScriptBlock) {
             $html .= $this->generateFromToken($tokens[$pos]);
             $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
             $pos += 2;
         }
         $html .= $this->generateFromToken($tokens[$pos]);
         $pos++;
     }
     if ($config->get('Output', 'TidyFormat') && extension_loaded('tidy')) {
         $tidy_options = array('indent' => true, 'output-xhtml' => $this->_xhtml, 'show-body-only' => true, 'indent-spaces' => 2, 'wrap' => 68);
         if (version_compare(PHP_VERSION, '5', '>=')) {
             // Object-oriented Tidy API
             $formatter = new Tidy();
             $formatter->parseString($html, $tidy_options, 'utf8');
             $formatter->cleanRepair();
             $html = (string) $formatter;
         } else {
             // Procedural Tidy API used on PHP versions before 5
             tidy_set_encoding('utf8');
             foreach ($tidy_options as $option => $setting) {
                 tidy_setopt($option, $setting);
             }
             tidy_parse_string($html);
             tidy_clean_repair();
             $html = tidy_get_output();
         }
     }
     // normalize newlines to the configured value, falling back to the system one
     $newline = $config->get('Output', 'Newline');
     if ($newline === null) {
         $newline = PHP_EOL;
     }
     return str_replace("\n", $newline, $html);
 }
 /**
  * Serialises an array of tokens into HTML.
  *
  * Optionally re-formats the result with Tidy (Output.TidyFormat) and, when
  * Core.NormalizeNewlines is set, converts "\n" to the newline configured in
  * Output.Newline (PHP_EOL when unset).
  *
  * @param $tokens Array of HTMLPurifier_Token
  * @return string Generated HTML
  */
 public function generateFromTokens($tokens)
 {
     if (!$tokens) {
         return '';
     }
     $html = '';
     $total = count($tokens);
     $pos = 0;
     while ($pos < $total) {
         // A script start tag followed two tokens later by an end tag gets
         // its contents emitted through the script-specific generator.
         $isScriptBlock = $this->_scriptFix
             && $tokens[$pos]->name === 'script'
             && $pos + 2 < $total
             && $tokens[$pos + 2] instanceof HTMLPurifier_Token_End;
         if ($isScriptBlock) {
             $html .= $this->generateFromToken($tokens[$pos]);
             $html .= $this->generateScriptFromToken($tokens[$pos + 1]);
             $pos += 2;
         }
         $html .= $this->generateFromToken($tokens[$pos]);
         $pos++;
     }
     if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
         $tidyOptions = array('indent' => true, 'output-xhtml' => $this->_xhtml, 'show-body-only' => true, 'indent-spaces' => 2, 'wrap' => 68);
         $formatter = new Tidy();
         $formatter->parseString($html, $tidyOptions, 'utf8');
         $formatter->cleanRepair();
         $html = (string) $formatter;
     }
     if (!$this->config->get('Core.NormalizeNewlines')) {
         return $html;
     }
     $newline = $this->config->get('Output.Newline');
     if ($newline === null) {
         $newline = PHP_EOL;
     }
     return $newline !== "\n" ? str_replace("\n", $newline, $html) : $html;
 }
Exemplo n.º 23
0
    /**
     * Renders the template to a string.
     *
     * Registers the 'jQuery', 'bootstrap' and 'time' values, expands
     * $this->str through parseVal(), and, when Config\Main::$tidyEnabled is
     * set, re-formats the result with Tidy.
     *
     * Side effect: $this->time is replaced by the elapsed time in seconds,
     * rounded to 4 decimals.
     *
     * @return string
     */
    function __toString()
    {
        $this->set('jQuery', '<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>');
        $this->set('bootstrap', '<link href="http://getbootstrap.com/dist/css/bootstrap.min.css" rel="stylesheet">
<script src="http://getbootstrap.com/dist/js/bootstrap.min.js"></script>');
        // microtime() without arguments returns a "msec sec" string, which cannot
        // be subtracted meaningfully; request the float form instead.
        // NOTE(review): assumes $this->time holds a float start timestamp
        // (microtime(true)) — confirm where it is initialised.
        $this->time = round(microtime(true) - $this->time, 4);
        $this->set("time", $this->time);
        $text = $this->parseVal($this->str);
        if (Config\Main::$tidyEnabled) {
            $tidy = new \Tidy();
            $tidy->parseString($text, ["wrap" => 160]);
            $tidy->cleanRepair();
            $text = $tidy;
        }
        return (string) $text;
    }
    }