Ejemplo n.º 1
0
 /**
  * Runs the XSLT stylesheet over an XML document and returns XHTML-compatible markup.
  *
  * @param DOMDocument|string $xml DOM tree to transform, or the filename of an XML file to load
  * @return string Transformed markup (additionally passed through Tidy when the Tidy class exists)
  * @todo Rename to transformToXHTML, as transformToHTML is misleading
  */
 public function transformToHTML($xml)
 {
     $document = $xml;
     if (is_string($xml)) {
         // A string argument is treated as a path and loaded from disk.
         $document = new DOMDocument();
         $document->load($xml);
     }
     // Fudges for HTML backwards compatibility; assumes the result is XHTML.
     // Replacements are applied in order:
     //   '/>'        becomes ' />'  (<br /> rather than <br/>)
     //   ' xmlns=""' is removed     (unnecessary empty namespace declaration)
     $markup = str_replace(
         array('/>', ' xmlns=""'),
         array(' />', ''),
         $this->xsltProcessor->transformToXML($document)
     );
     if (!class_exists('Tidy')) {
         return $markup;
     }
     // Tidy is available: clean up and re-indent the output.
     $cleaner = new Tidy();
     $cleaner->parseString($markup, array('indent' => true, 'output-xhtml' => true, 'wrap' => 80), 'utf8');
     $cleaner->cleanRepair();
     return (string) $cleaner;
 }
Ejemplo n.º 2
0
 /**
  * Parses the given markup into a SimpleXML tree, sanitizing it with tidy first
  * when either the PHP extension or the command line tool is available.
  *
  * @param string $content Markup to parse
  * @throws Exception When the (possibly sanitized) markup cannot be parsed as XML
  */
 public function __construct($content)
 {
     $sanitized = true;
     if (extension_loaded('tidy')) {
         // Preferred path: the tidy PHP extension.
         $parser = new Tidy();
         $parser->parseString(
             $content,
             array('output-xhtml' => true, 'numeric-entities' => true, 'wrap' => 99999),
             'utf8'
         );
         $parser->cleanRepair();
         $markup = $parser;
     } elseif (@shell_exec('which tidy')) {
         // Fallback: the tidy command line binary.
         $CLI_content = escapeshellarg($content);
         $markup = shell_exec("echo {$CLI_content} | tidy -n -q -utf8 -asxhtml 2> /dev/null");
     } else {
         // No tidy available at all, hence no sanitizing.
         $markup = $content;
         $sanitized = false;
     }
     if ($sanitized) {
         // Strip the XHTML namespace declaration and numeric non-breaking spaces.
         $markup = str_replace(array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;'), '', $markup);
     }
     $this->simpleXML = @simplexml_load_string($markup, 'SimpleXMLElement', LIBXML_NOWARNING);
     if (!$this->simpleXML) {
         throw new Exception('CSSContentParser::__construct(): Could not parse content.' . ' Please check the PHP extension tidy is installed.');
     }
     parent::__construct();
 }
Ejemplo n.º 3
0
 /**
  * Loads the feed named by the "RSSFilterFeed" attribute into RSSFilter::$feeds.
  *
  * Does nothing when an entry for that feed already exists. The content is first
  * parsed as-is; if that throws, it is run through Tidy and parsed again.
  *
  * @throws Exception When the feed cannot be read, or cannot be parsed even after repair
  */
 public function loadFeed()
 {
     if (isset(RSSFilter::$feeds[$this->A("RSSFilterFeed")])) {
         return;
     }
     // Placeholder entry that stays in place if loading fails below.
     RSSFilter::$feeds[$this->A("RSSFilterFeed")] = "<phynx></phynx>";
     $raw = @file_get_contents($this->A("RSSFilterFeed"));
     if ($raw === false) {
         throw new Exception($this->A("RSSFilterFeed") . " could not be loaded!");
     }
     // NOTE(review): the search string is empty as written, so this call is a no-op;
     // it presumably once stripped a specific character - confirm against the upstream source.
     $raw = str_replace("", "", $raw);
     try {
         libxml_use_internal_errors(true);
         RSSFilter::$feeds[$this->A("RSSFilterFeed")] = new SimpleXMLElement($raw);
         return;
     } catch (Exception $parseFailure) {
         // Fall through to the Tidy-based repair attempt.
     }
     try {
         $tidyOptions = array('indent' => true, 'clean' => true, 'input-xml' => true, 'output-xml' => true, 'wrap' => false);
         $repairer = new Tidy();
         $repaired = $repairer->repairString($raw, $tidyOptions);
         RSSFilter::$feeds[$this->A("RSSFilterFeed")] = new SimpleXMLElement($repaired);
     } catch (ClassNotFoundException $e) {
         throw new Exception($this->A("RSSFilterFeed") . " contains errors, but Tidy not found!");
     } catch (Exception $e) {
         throw new Exception($this->A("RSSFilterFeed") . " contained errors even Tidy could not fix!");
     }
 }
Ejemplo n.º 4
0
 /**
  * Serializes an array of tokens into an HTML string.
  *
  * When the Tidy extension is loaded and the 'Output.TidyFormat' directive is set,
  * the result is re-formatted by Tidy. Newlines are finally converted to the value
  * of 'Output.Newline' (PHP_EOL when that directive is null).
  *
  * @param HTMLPurifier_Token[] $tokens Tokens to serialize
  * @return string Generated HTML ('' when $tokens is empty)
  */
 public function generateFromTokens($tokens)
 {
     if (!$tokens) {
         return '';
     }
     $html = '';
     $size = count($tokens);
     $i = 0;
     while ($i < $size) {
         // Script special case: start tag, ONE content token, end tag.
         $isScriptBlock = $this->_scriptFix
             && $tokens[$i]->name === 'script'
             && $i + 2 < $size
             && $tokens[$i + 2] instanceof HTMLPurifier_Token_End;
         if ($isScriptBlock) {
             $html .= $this->generateFromToken($tokens[$i]);
             $html .= $this->generateScriptFromToken($tokens[$i + 1]);
             $i += 2;
         }
         // Regular token, or the end tag that closes the script block.
         $html .= $this->generateFromToken($tokens[$i]);
         $i++;
     }
     // Optional Tidy cleanup.
     if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
         $formatter = new Tidy();
         $formatter->parseString(
             $html,
             array('indent' => true, 'output-xhtml' => $this->_xhtml, 'show-body-only' => true, 'indent-spaces' => 2, 'wrap' => 68),
             'utf8'
         );
         $formatter->cleanRepair();
         // The explicit cast is required to get the markup out of the Tidy object.
         $html = (string) $formatter;
     }
     // Normalize newlines to the configured (or system default) value.
     $nl = $this->config->get('Output.Newline');
     if ($nl === null) {
         $nl = PHP_EOL;
     }
     return $nl === "\n" ? $html : str_replace("\n", $nl, $html);
 }
Ejemplo n.º 5
0
	/**
	 * Parses the given markup into a SimpleXMLElement, sanitizing it with tidy first
	 * when either the PHP extension or the command line tool is available.
	 *
	 * @param string $content Markup to parse
	 */
	function __construct($content) {
		$sanitized = true;
		if(extension_loaded('tidy')) {
			// Preferred path: the tidy PHP extension.
			$parser = new Tidy();
			$parser->parseString(
				$content,
				array(
					'output-xhtml' => true,
					'numeric-entities' => true,
				),
				'utf8'
			);
			$parser->cleanRepair();
			$markup = $parser;
		} elseif(shell_exec("which tidy")) {
			// Fallback: the tidy command line binary.
			$CLI_content = escapeshellarg($content);
			$markup = shell_exec("echo $CLI_content | tidy -n -q -utf8 -asxhtml 2> /dev/null");
		} else {
			// No tidy available at all, hence no sanitizing.
			$markup = $content;
			$sanitized = false;
		}

		if($sanitized) {
			// Strip the XHTML namespace declaration and numeric non-breaking spaces.
			$markup = str_replace(array('xmlns="http://www.w3.org/1999/xhtml"', '&#160;'), '', $markup);
		}

		$this->simpleXML = new SimpleXMLElement($markup);
	}
Ejemplo n.º 6
0
 /**
  * Gathers the advertisement links from a remote page.
  *
  * Fetches $post['url'] through the HTTP client, optionally cleans the markup with
  * Tidy, and collects the href of every anchor whose target is an http:// URL on
  * the site named by $post['ad'] (optionally preceded by one sub-domain label).
  *
  * @param array $post The post data submitted by the form; reads the 'url' and 'ad' keys.
  * @return array The matching href values, in document order (empty when the body is empty).
  */
 public function fetchAds($post)
 {
     $this->_client->setUri($post['url']);
     $response = $this->_client->request('GET')->getBody();
     // If the tidy class exists, attempt to clean up the markup returned by the remote site.
     if (class_exists('tidy')) {
         $tidy = new Tidy('/dev/null', array('indent' => true, 'tab-size' => 4, 'output-encoding' => 'utf8', 'newline' => 'LF', 'output-xhtml' => true), 'utf8');
         $tidy->parseString($response);
         $tidy->cleanRepair();
         $response = $tidy->value;
     }
     $result = array();
     // DOMDocument::loadHTML() rejects an empty document, and there is nothing to scan anyway.
     if ((string) $response === '') {
         return $result;
     }
     $xml = new DOMDocument('1.0', 'utf-8');
     $xml->loadHTML($response);
     // Escape the site name so that characters such as '.' are matched literally
     // instead of being interpreted as regular expression syntax.
     $site = preg_quote($post['ad'], '/');
     $pattern = '/^http:\/\/([a-z\-]+\.)?' . $site . '.*$/i';
     foreach ($xml->getElementsByTagName('a') as $a) {
         $href = $a->getAttribute('href');
         if (preg_match($pattern, $href)) {
             // The link points at the advertising site: keep it.
             $result[] = $href;
         }
     }
     return $result;
 }
Ejemplo n.º 7
0
 /**
  * Parses the response content with Tidy and fails when Tidy reports anything.
  *
  * @see ExtensionInterface
  * @throws \Exception Carrying Tidy's error buffer as its message
  */
 public function apply(Response $response)
 {
     $validator = new \Tidy();
     $validator->parseString($response->getContent());
     $diagnostics = $validator->errorBuffer;
     if ($diagnostics) {
         throw new \Exception($diagnostics);
     }
 }
Ejemplo n.º 8
0
 /**
  * Repairs the given markup with Tidy in XML-in / XML-out mode, without line wrapping.
  *
  * @param string $content Markup to repair
  * @return string Repaired markup
  */
 protected function _tidyFix($content)
 {
     $repairer = new Tidy();
     $repairer->parseString(
         $content,
         ['input-xml' => true, 'output-xml' => true, 'wrap' => false],
         'utf8'
     );
     $repairer->cleanRepair();
     return (string) $repairer;
 }
Ejemplo n.º 9
0
 /**
  * Repairs and re-indents HTML using Tidy.
  *
  * @param string $html   Markup to beautify
  * @param array  $config Tidy configuration options
  * @return string Result of Tidy::repairString()
  * @throws Ajde_Exception When the Tidy class cannot be found (code 90023)
  */
 public static function beautifyHtml($html, $config = array("output-xhtml" => true, "char-encoding" => "utf8", "indent" => true, "indent-spaces" => 4, "wrap" => 0))
 {
     $tidyAvailable = Ajde_Core_Autoloader::exists('Tidy');
     if (!$tidyAvailable) {
         throw new Ajde_Exception('Class Tidy not found', 90023);
     }
     // The encoding is also passed as the third argument;
     // see http://bugs.php.net/bug.php?id=35647
     $repairer = new Tidy();
     $beautified = $repairer->repairString($html, $config, 'utf8');
     return $beautified;
 }
Ejemplo n.º 10
0
 /**
  * Re-indents HTML responses with Tidy; responses in other formats pass through untouched.
  *
  * @param array  $request  Request data; the '_format' entry selects the behaviour
  * @param string $response Response body
  * @return string The value of Tidy's <html> node for HTML requests, otherwise $response unchanged
  */
 public static function beforeResponse($request, $response)
 {
     if ($request['_format'] != 'html') {
         return $response;
     }
     $formatter = new \Tidy();
     $formatter->parseString($response, array('wrap' => 200, 'indent' => true), 'utf8');
     $formatter->cleanRepair();
     return $formatter->html()->value;
 }
Ejemplo n.º 11
0
 /**
  * Repairs and re-indents HTML using Tidy.
  *
  * @param string $html   Markup to beautify
  * @param array  $config Tidy configuration options
  * @return string Result of Tidy::repairString()
  * @throws Ajde_Exception When the Tidy class does not exist (code 90023)
  */
 public static function beautifyHtml($html, $config = ['output-xhtml' => true, 'char-encoding' => 'utf8', 'indent' => true, 'indent-spaces' => 4, 'wrap' => 0])
 {
     $tidyAvailable = class_exists('Tidy');
     if (!$tidyAvailable) {
         throw new Ajde_Exception('Class Tidy not found', 90023);
     }
     // Tidy does not produce valid UTF-8 when the encoding is only given in the
     // config, so it is passed again as the third argument.
     // @see http://bugs.php.net/bug.php?id=35647
     $repairer = new Tidy();
     $beautified = $repairer->repairString($html, $config, 'utf8');
     return $beautified;
 }
Ejemplo n.º 12
0
 /**
  * Repairs and re-indents HTML using Tidy.
  *
  * @param string $html   Markup to beautify
  * @param array  $config Tidy configuration options
  * @return string Result of Tidy::repairString()
  * @throws Ajde_Exception When the autoloader cannot find the Tidy class (code 90023)
  */
 public static function beautifyHtml($html, $config = array("output-xhtml" => true, "char-encoding" => "utf8", "indent" => true, "indent-spaces" => 4, "wrap" => 0))
 {
     $tidyAvailable = Ajde_Core_Autoloader::exists('Tidy');
     if (!$tidyAvailable) {
         throw new Ajde_Exception('Class Tidy not found', 90023);
     }
     // Tidy does not produce valid UTF-8 when the encoding is only given in the
     // config, so it is passed again as the third argument.
     // @see http://bugs.php.net/bug.php?id=35647
     $repairer = new Tidy();
     $beautified = $repairer->repairString($html, $config, 'utf8');
     return $beautified;
 }
Ejemplo n.º 13
0
 /**
  * Converts HTML to UTF-8, repairs it with Tidy (dropping comments) and then
  * post-processes the meta content-type position and the doctype.
  *
  * @param string      $html         Markup to format
  * @param string|null $charset      Forwarded to toUTF8()
  * @param string|null $charset_hint Forwarded to toUTF8()
  * @return string Formatted markup
  */
 public function formatHtml($html, $charset = null, $charset_hint = null)
 {
     $utf8Markup = $this->toUTF8($html, $charset, $charset_hint);
     $repairer = new Tidy();
     $repairer->parseString($utf8Markup, array("hide-comments" => true), 'UTF8');
     $repairer->cleanRepair();
     $repaired = (string) $repairer;
     return $this->formatDocType($this->moveMetaContentTypeToTop($repaired));
 }
Ejemplo n.º 14
0
 /**
  * Tidies the given HTML and loads the result into a DOM document.
  *
  * Stores the raw input in $this->html_code, the tidied markup in $this->tidy_code
  * and the normalized DOMDocument in $this->dom. When the tidied markup cannot be
  * parsed as XML, $this->dom is set to null and an E_USER_ERROR is triggered.
  *
  * @param string $html_code HTML to parse
  */
 function parse_html($html_code)
 {
     $this->html_code = $html_code;
     // Tidy HTML code
     $tidy = new Tidy();
     $tidy->parseString($html_code, $this->tidy_config, 'utf8');
     $tidy->cleanRepair();
     $this->tidy_code = $tidy->value;
     // loadXML() is an instance method: calling it statically is an Error on PHP 8,
     // so the document is created explicitly.
     $dom = new DOMDocument();
     $xml = (string) $this->tidy_code;
     // Check for failure BEFORE touching the document; an empty string is rejected
     // up front because loadXML() does not accept it.
     if ($xml === '' || !$dom->loadXML($xml)) {
         $this->dom = null;
         trigger_error("Unable to parse XML Document!", E_USER_ERROR);
         return;
     }
     $dom->normalizeDocument();
     $this->dom = $dom;
 }
Ejemplo n.º 15
0
 /**
  * Rewrites every table wrapped in a div with class rvps14 or rvps8: the wrapper and
  * the table attributes are dropped, header spans become <b class="table-header">,
  * and the result is cleaned and indented by Tidy.
  *
  * @param string $text Markup containing the tables
  * @return string Markup with the tables rewritten
  */
 public function formatTables($text)
 {
     $rebuildTable = function ($matches) {
         $markup = '<table>' . $matches[1] . '</table>';
         // Header cells: rvts9/15/23 spans, optionally wrapped in an rvps1/4/14 paragraph.
         $markup = preg_replace('%(?:<p class="rvps(?:1|4|14)">)?<span class="rvts(?:9|15|23)">\\s*(.*?)\\s*</span>(?:</p>)?%u', '<b class="table-header">$1</b>', $markup);
         // Drop headers that contained nothing but a line break.
         $markup = preg_replace('%<b class="table-header"><br></b>%u', '', $markup);
         $cleaner = new \Tidy();
         $cleaner->parseString(
             $markup,
             array('clean' => true, 'output-html' => true, 'show-body-only' => true, 'wrap' => 0, 'indent' => true),
             'utf8'
         );
         $cleaner->cleanRepair();
         return $cleaner . "\n";
     };
     return preg_replace_callback('%<div class="rvps(?:14|8)">\\n*<table.*?>([\\s\\S]*?)</table>\\n*</div>%u', $rebuildTable, $text);
 }
Ejemplo n.º 16
0
 /**
  * Reads from the wrapped input and returns the Tidy-filtered output.
  *
  * @param int|null $len Forwarded to the wrapped reader's read()
  *
  * @throws BuildException When the Tidy class is not available
  * @return string|int The tidied markup, or -1 once the wrapped reader returns -1
  */
 public function read($len = null)
 {
     if (!class_exists('Tidy')) {
         throw new BuildException("You must enable the 'tidy' extension in your PHP configuration in order to use the Tidy filter.");
     }
     // One-time initialization on first use.
     if (!$this->getInitialized()) {
         $this->_initialize();
         $this->setInitialized(true);
     }
     $chunk = $this->in->read($len);
     if ($chunk === -1) {
         return -1;
     }
     $tidyOptions = $this->getDistilledConfig();
     $filter = new Tidy();
     $filter->parseString($chunk, $tidyOptions, $this->encoding);
     $filter->cleanRepair();
     return tidy_get_output($filter);
 }
Ejemplo n.º 17
0
 /**
  * Generates the code style document from the default template and checks that
  * Tidy reports status 0 for the generated file.
  *
  * @param \CliTester            $I
  * @param \Codeception\Scenario $scenario
  */
 public function testDocumentValidHtml(\CliTester $I, \Codeception\Scenario $scenario)
 {
     $I->wantTo('verify that the default template produces valid HTML');
     if (!class_exists('Tidy')) {
         $scenario->skip('Tidy is not available. See http://php.net/manual/en/tidy.installation.php');
     }
     $projectRoot = dirname(dirname(__DIR__));
     $template = $projectRoot . '/src/Task/CodeSniffer/codestyle.html';
     $outfile = dirname(__DIR__) . '/_output/codestyle.html';
     // Start from a clean slate so the file can only come from the command below.
     if (file_exists($outfile)) {
         unlink($outfile);
     }
     $I->dontSeeFileFound($outfile);
     $command = 'vendor/bin/robo document:codestyle --outfile ' . $outfile . ' --template ' . $template;
     $I->runShellCommand($command);
     $I->seeFileFound($outfile);
     $validator = new \Tidy();
     $validator->parseFile($outfile);
     $I->assertEquals(0, $validator->getStatus());
     unlink($outfile);
 }
Ejemplo n.º 18
0
 /**
  * Runs the XSLT stylesheet over an XML DOM tree and returns HTML-compatible markup.
  *
  * @param DOMDocument $xml XML DOM tree
  * @return string Transformed markup (additionally passed through Tidy when the Tidy class exists)
  */
 public function transformToHTML($xml)
 {
     // Fudges for HTML backwards compatibility, applied in order:
     //   '/>' becomes ' />'                    (<br /> rather than <br/>)
     //   ' xmlns=""' is removed                (unnecessary xmlns)
     //   the XHTML namespace declaration goes  (unnecessary xmlns)
     $markup = str_replace(
         array('/>', ' xmlns=""', ' xmlns="http://www.w3.org/1999/xhtml"'),
         array(' />', '', ''),
         $this->xsltProcessor->transformToXML($xml)
     );
     if (!class_exists('Tidy')) {
         return $markup;
     }
     // Tidy is available: clean up and re-indent the output.
     $cleaner = new Tidy();
     $cleaner->parseString($markup, array('indent' => true, 'output-xhtml' => true, 'wrap' => 80), 'utf8');
     $cleaner->cleanRepair();
     return (string) $cleaner;
 }
Ejemplo n.º 19
0
 /**
  * Downloads a page, converts it to XHTML with Tidy and exposes it as a DOM
  * document plus an XPath object with the 'atom' and 'html' prefixes registered.
  *
  * Sets $this->request_info, $this->location, $this->document and $this->xpath.
  *
  * @param string $uri Absolute http(s) URL, or a path appended to the configured base href
  */
 protected function loadHtml($uri)
 {
     $isAbsolute = preg_match('/^https?:/i', $uri) !== 0;
     if (!$isAbsolute) {
         $uri = $this->config->getBaseHref() . $uri;
     }
     $handle = curl_init($uri);
     curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
     $body = curl_exec($handle);
     $this->request_info = curl_getinfo($handle);
     curl_close($handle);
     $this->location = $uri;
     $cleaner = new Tidy();
     $cleaner->parseString(
         $body,
         array('output-xhtml' => true, 'char-encoding' => 'utf8', 'numeric-entities' => true),
         'utf8'
     );
     $cleaner->cleanRepair();
     $document = new DOMDocument();
     $this->document = $document;
     $document->resolveExternals = true;
     $document->loadXml($cleaner);
     $xpath = new DOMXPath($document);
     $this->xpath = $xpath;
     $xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
     $xpath->registerNamespace('html', 'http://www.w3.org/1999/xhtml');
 }
Ejemplo n.º 20
0
 /**
  * Compiles the template and echoes it, parsed by Tidy when 'tidy_html' is
  * configured and the Tidy class exists. Non-AJAX requests additionally get
  * timing and version comments appended.
  */
 public function generateResponse()
 {
     TemplateEngine::compile();
     $output = TemplateEngine::$data['compiled'];
     if (Gravel::$config['gravel']['tidy_html'] && class_exists('Tidy')) {
         $tidyOptions = ['indent' => 1, 'indent-spaces' => 4, 'output-xhtml' => 'false', 'wrap' => 0, 'hide-comments' => 0];
         $formatter = new \Tidy();
         $formatter->parseString($output, $tidyOptions);
         // The Tidy object itself is echoed below and converted to a string there.
         $output = $formatter;
     }
     if (Gravel::$config['gravel']['debug_mode']) {
         header("Content-Type: text/plain");
     }
     echo $output;
     // Only non-AJAX requests get the debug trailer.
     $isAjax = isset($_SERVER['HTTP_X_REQUESTED_WITH']) && $_SERVER['HTTP_X_REQUESTED_WITH'] === 'XMLHttpRequest';
     if ($isAjax) {
         return;
     }
     $version = Gravel::$version;
     echo PHP_EOL . "<!-- Generated in " . number_format(microtime(true) - Gravel::$startTime, 5) . " seconds -->";
     echo PHP_EOL . "<!-- Gravel PHP framework {$version} -->";
 }
Ejemplo n.º 21
0
 /**
  * Wraps a text fragment in a minimal XHTML document and runs it through tidy,
  * internally or externally depending on the global $wgTidyInternal.
  *
  * @param string $text Fragment to clean
  * @return string The tidy result, or the original text followed by an HTML
  *                comment when tidy returned null
  */
 public static function RunOn($text)
 {
     global $wgTidyInternal;
     $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' . ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' . '<head><title>test</title></head><body>' . $text . '</body></html>';
     $correctedtext = $wgTidyInternal
         ? Tidy::internal($wrappedtext)
         : Tidy::external($wrappedtext);
     if ($correctedtext !== null) {
         return $correctedtext;
     }
     wfDebug("Tidy error detected!\n");
     return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
 }
Ejemplo n.º 22
0
 /**
  * Detects and repairs unclosed HTML tags.
  *
  * Arrays are processed recursively, so multi-dimensional arrays are supported;
  * every nested value is repaired with the same encoding as the top-level call.
  * Note that any value for which empty() is true (including "0" and an empty
  * array) yields the empty string.
  *
  * @param  mixed  $data   HTML to process, or a (nested) array of HTML strings
  * @param  string $encode Encoding passed to Tidy; defaults to utf8
  * @return mixed  Repaired data with the same structure as $data, or '' for empty input
  */
 public static function htmlFixSafe($data, $encode = 'utf8')
 {
     if (empty($data)) {
         return '';
     }
     if (is_array($data)) {
         foreach ($data as $key => $value) {
             // Forward the caller's encoding instead of silently falling back to the default.
             $data[$key] = self::htmlFixSafe($value, $encode);
         }
         return $data;
     }
     // A Tidy instance is only needed for scalar input.
     $tidyConfig = array('indent' => false, 'output-xhtml' => true, 'show-body-only' => true);
     $tidyObj = new Tidy();
     return $tidyObj->repairString($data, $tidyConfig, $encode);
 }
 /**
  * Extracts the useful part of an email body.
  *
  * HTML content is purified and has its replies removed. Any other content is cut
  * at the first delimiter line (when present) and passed through the markdown
  * transformer.
  *
  * @param string|null $content Email content; falls back to getContent() when empty
  * @return string
  */
 public function recognizeUsefulContent($content = null)
 {
     if (!$content) {
         $content = $this->getContent();
     }
     if ($this->isHtml($content)) {
         $purified = $this->HTMLPurifier->purify($content);
         return $this->removeReplies($purified);
     }
     // Plain text: keep only what precedes the delimiter line.
     $cutAt = strpos($content, self::DELIMITER_LINE);
     if ($cutAt !== false) {
         $content = substr($content, 0, $cutAt);
     }
     return $this->markdown->defaultTransform($content);
 }
Ejemplo n.º 24
0
 // Default response payload; the fields are overwritten below as results come in.
 $response['tagCount'] = array();
 $response['html'] = "";
 $response['code'] = "N/A";
 $response['message'] = "";
 // Use PHP's filter extension to check for a valid URL.
 $isValidUrl = (bool) filter_var($_POST['url'], FILTER_VALIDATE_URL);
 if (!$isValidUrl) {
     $response['message'] = $_POST['url'] . " is not a valid URL";
 } else {
     $url = $_POST['url'];
     $curl = new MyCurl($url);
     $curl->createCurl();
     $response['code'] = $curl->getHttpStatus();
     $response['message'] = HttpCodes::getType($response['code']);
     $html = $curl->__toString();
     if (is_string($html)) {
         // Load the page into a Tidy object, set options, and clean the HTML.
         $tidy = new Tidy();
         $tidy->parseString($html, array('indent' => 2, 'output-xhtml' => true));
         $tidy->cleanRepair();
         // The HTML is now nicely indented.
         $html = (string) $tidy;
         // Count the tags; the result is a $tag => $count array.
         $tagCount = countTags($html);
         $response['tagCount'] = $tagCount;
         $response['html'] = htmlentities($html);
     } else {
         $response['message'] = "Page Could not be loaded, check the domain. Nothing was returned.";
     }
 }
 header('Content-Type: application/json');
 echo json_encode($response);
Ejemplo n.º 25
0
/**
 * Converts HTML tag soup into XHTML that an XML parser can consume.
 *
 * @param string $htmlTagSoup Markup to convert
 * @return string Tidied XHTML with every '&nbsp;' replaced by '&#160;'
 */
function tidyToXml($htmlTagSoup)
{
    // Parse into memory with XHTML conversion switched on, then run the repair.
    $cleaner = new Tidy();
    $cleaner->parseString($htmlTagSoup, array('output-xhtml' => true));
    $cleaner->cleanRepair();
    // Tidy leaves the '&nbsp;' entity alone, but the XML parser used on this
    // string afterwards does not treat it as predeclared. The numeric
    // equivalent needs no declaration, so substitute it.
    return str_replace('&nbsp;', '&#160;', tidy_get_output($cleaner));
}
Ejemplo n.º 26
0
 /**
  * Cleans an HTML fragment with Tidy: drops empty paragraphs, font tags,
  * proprietary attributes and comments, and returns only the body content.
  *
  * @param string $content HTML fragment to clean
  * @return string The repaired fragment, or '' when $content is empty
  */
 private function clean($content)
 {
     if (!$content) {
         return '';
     }
     $tidy = new \Tidy();
     $tidy->parseString($content, ['indent' => true, 'doctype' => 'omit', 'output-html' => true, 'show-body-only' => true, 'drop-empty-paras' => true, 'drop-font-tags' => true, 'drop-proprietary-attributes' => true, 'hide-comments' => true, 'logical-emphasis' => true]);
     $tidy->cleanRepair();
     // Return the repaired markup; returning $content here would discard all of Tidy's work.
     return (string) $tidy;
 }
Ejemplo n.º 27
0
 /**
  * Closes tags that were left open in an HTML fragment.
  *
  * Uses Tidy (XML in / XML out) when the class exists. Otherwise a regex-based
  * fallback appends a closing tag for every opened tag that has no matching
  * closing tag, walking the opened tags from last to first.
  *
  * @param string $html Fragment to fix
  * @return string Fragment with the missing closing tags added
  */
 function closetags($html)
 {
     if (class_exists('Tidy')) {
         $repairer = new Tidy();
         return $repairer->repairString($html, array('output-xml' => true, 'input-xml' => true));
     }
     // Opening tags; tags matched by the negative lookahead and self-closed tags are skipped.
     preg_match_all('#<(?!meta|img|br|hr|input\\b)\\b([a-z]+)(?: .*)?(?<![/|/ ])>#iU', $html, $openMatches);
     $opened = $openMatches[1];
     preg_match_all('#</([a-z]+)>#iU', $html, $closeMatches);
     $closed = $closeMatches[1];
     // Same number of opening and closing tags: treat the fragment as balanced.
     if (count($closed) == count($opened)) {
         return $html;
     }
     foreach (array_reverse($opened) as $tagName) {
         $closedKey = array_search($tagName, $closed);
         if ($closedKey === false) {
             // No closing tag left for this one: append it.
             $html .= '</' . $tagName . '>';
             continue;
         }
         // Consume the matching closing tag so it is not counted twice.
         unset($closed[$closedKey]);
     }
     return $html;
 }
Ejemplo n.º 28
0
 /**
  * Applies a chain of text functions to a string.
  *
  * $matches[2] is split on '||' into function specs; each spec is split on ','
  * into a function name followed by its arguments. The specs are applied from
  * the last one to the first one.
  *
  * Supported function names:
  *  - cleanhtml:    strip all tags except p, a, b, br and i
  *  - removehtml:   strip all tags
  *  - splitbychars: substr() with offset $fn[1] and length $fn[2]
  *  - splitbywords: cut at the first space found at or after offset $fn[2]
  *                  (the whole string when there is none)
  *  - findimage:    replace the string by the URL of the N-th image found in it
  *                  (N = $fn[1], 1-based, default first), or '' when none is found
  *
  * @param array  $matches Match array; only index 2 is read
  * @param string $s       Text to transform
  * @return string The transformed text, passed through closetags()
  */
 function onFunction($matches, $s)
 {
     $fns = explode('||', $matches[2]);
     for ($i = count($fns) - 1; $i >= 0; $i--) {
         $fn = explode(',', $fns[$i]);
         switch ($fn[0]) {
             case 'cleanhtml':
                 $s = strip_tags($s, '<p><a><b><br><br/><i>');
                 break;
             case 'removehtml':
                 $s = strip_tags($s);
                 break;
             case 'splitbychars':
                 $s = substr($s, $fn[1], $fn[2]);
                 break;
             case 'splitbywords':
                 $len = strlen($s);
                 // Search for a space starting at the requested offset, unless the
                 // offset already lies beyond the end of the string.
                 $pos = $fn[2] > $len ? $len : strpos($s, ' ', $fn[2]);
                 if ($pos === false) {
                     $pos = $len;
                 }
                 $s = substr($s, 0, $pos);
                 break;
             case 'findimage':
                 // The spec carries a 1-based image number; convert it to a 0-based index.
                 $index = isset($fn[1]) ? intval($fn[1]) - 1 : 0;
                 // Group 2 captures an <img> src attribute, group 6 a CSS background url().
                 preg_match_all('/(<img.*?src=[\'"](.*?)[\'"][^>]+>)|(background(-image)??\\s*?:.*?url\\((["|\']?)?(.+?)(["|\']?)?\\))/i', $s, $r);
                 if (isset($r[2]) && !empty($r[2][$index])) {
                     $s = $r[2][$index];
                 } else {
                     if (isset($r[6]) && !empty($r[6][$index])) {
                         $s = trim($r[6][$index], "'\" \t\n\r\v");
                     } else {
                         $s = '';
                     }
                 }
                 break;
         }
     }
     // NOTE(review): the loop above only ends once $i has reached -1 (the break
     // statements leave the switch, not the loop), so this condition looks like it
     // can never be true and the Tidy branch appears unreachable - confirm the intent.
     if ($i !== -1) {
         if ($this->_tidy) {
             $tidy = new Tidy();
             return $tidy->repairString($s, array('show-body-only' => true, 'input-encoding' => $this->_tidyInputEncoding, 'output-encoding' => $this->_tidyOutputEncoding));
         }
     }
     return $this->closetags($s);
 }
Ejemplo n.º 29
0
<?php

// Timing starts before the download and ends after the headers are collected.
$start = microtime(true);

// Download the page, presenting ourselves as Googlebot.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://thinkphp.com.ua/');
curl_setopt($ch, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$content = curl_exec($ch);

// Repair the document as XML so that SimpleXML can load it.
$tidy = new Tidy();
$tidy->parseString($content, ['input-xml' => true, 'output-xml' => true, 'wrap' => false], 'utf8');
$tidy->cleanRepair();
$content = (string) $tidy;

// Load the repaired string and register the XHTML namespace for XPath queries.
$xml = simplexml_load_string($content);
$xml->registerXPathNamespace('xmlns', 'http://www.w3.org/1999/xhtml');

// Collect the title and (optional) link of every <h2>.
$eventsHeaders = [];
foreach ($xml->xpath('//xmlns:h2') as $node) {
    // Drop the nested <span>, if present, so it does not end up in the title.
    unset($node->span);
    // Use the anchor's href when there is one.
    $link = isset($node->a['href']) ? (string) $node->a['href'] : null;
    $title = trim(strip_tags($node->asXml()));
    $eventsHeaders[] = ['title' => $title, 'link' => $link];
}

$finish = microtime(true);
Ejemplo n.º 30
0
 /**
  * Cleans and repairs HTML with the Tidy extension.
  *
  * Uses the options in self::$vars['TIDY'] and the encoding in
  * self::$vars['ENCODING'] with its dashes removed. When the extension is not
  * loaded the input is returned unchanged.
  *
  * @param string $html Markup to repair
  * @return string
  */
 static function tidy($html)
 {
     if (extension_loaded('tidy')) {
         $encoding = str_replace('-', '', self::$vars['ENCODING']);
         $repairer = new Tidy();
         $repairer->parseString($html, self::$vars['TIDY'], $encoding);
         $repairer->cleanRepair();
         return (string) $repairer;
     }
     return $html;
 }