Exemplo n.º 1
0
 /**
  * Store the markup and options, then tokenize the markup straight away.
  *
  * Leading whitespace is stripped from the markup before it is stored and
  * tokenized. The stored options are the result of $this->options(); the
  * tokenizer itself receives the options exactly as the caller passed them.
  *
  * @param string $html Markup to tokenize
  * @param array $options Caller-supplied options
  */
 public function __construct($html, $options = array())
 {
     $markup = ltrim($html);
     $this->html = $markup;
     $this->options = $this->options($options);
     $segments = new SegmentedString($markup);
     $tokenizer = new HTMLTokenizer($segments, $options);
     $this->tokens = $tokenizer->tokenizer();
 }
Exemplo n.º 2
0
	/**
	 * Round-trip every fixture in $this->html_strs through HTMLTokenizer and
	 * assert that the stringified token set is identical to the input.
	 * The failure message shows both strings, HTML-escaped.
	 */
	function test_tokenizer()
	{
		foreach ( $this->html_strs as $original ) {
			$tokenizer = new HTMLTokenizer( $original );
			$rebuilt = (string)$tokenizer->parse();
			$message = "<br>" . Utils::htmlspecialchars( $original ) . "<br>" . Utils::htmlspecialchars( $rebuilt );

			$this->assert_identical( $original, $rebuilt, $message );
		}
	}
 /**
  * Run GeSHi over every div, pre or code element that has class="highlight".
  *
  * Each matching slice has its container stripped, an optional CDATA wrapper
  * removed, and its content replaced by the GeSHi output. The first class
  * left after removing "highlight" selects the language; 'php' is used when
  * none is left. Output is cached under a key built from the slice's MD5 and
  * this file's modification time.
  *
  * @param string $in Markup to process
  * @return string The markup with the highlighted slices substituted
  */
 public static function do_highlight($in)
 {
     // token-based processing, no regular expressions
     $tokenizer = new HTMLTokenizer($in, false);
     $tokens = $tokenizer->parse();
     $containers = $tokens->slice(array('div', 'pre', 'code'), array('class' => 'highlight'));
     foreach ($containers as $slice) {
         // remember the class list before the container is stripped
         $classAttr = $slice[0]['attrs']['class'];
         // the key covers the untrimmed slice and changes whenever this file does
         $cacheKey = 'plugin.highlight.' . md5((string) $slice) . filemtime(__FILE__);
         $slice->trim_container();
         $code = trim((string) $slice);
         if (!Cache::has($cacheKey)) {
             // drop a CDATA wrapper when the content is fully enclosed in one
             $wrapped = substr($code, 0, 9) == '<![CDATA[' && substr($code, -3) == ']]>';
             if ($wrapped) {
                 $code = substr($code, 9, -3);
             }
             $remaining = trim(str_replace('highlight', '', $classAttr));
             $classes = array_filter(explode(' ', $remaining));
             if (isset($classes[0])) {
                 $language = $classes[0];
             } else {
                 $language = 'php';
             }
             $geshi = new Geshi(trim($code), $language, HighlightPlugin::$geshi_path . '/geshi/');
             $geshi->set_header_type(GESHI_HEADER_PRE);
             $geshi->set_overall_class('geshicode');
             // @ is slow, but geshi is full of E_NOTICE
             $output = @$geshi->parse_code();
             Cache::set($cacheKey, $output);
         } else {
             $output = Cache::get($cacheKey);
         }
         $slice->tokenize_replace($output);
         $tokens->replace_slice($slice);
     }
     return (string) $tokens;
 }
Exemplo n.º 4
0
 /**
  * function save_thumbnail
  * Generates a thumbnail for the first image tag in the post content and
  * stores it, together with the MD5 of the image file, in the postinfo for
  * this post. Does nothing when the content has no image tag with a src
  * attribute.
  *
  * The thumbnail is regenerated on every call: the comparison against the
  * stored MD5 sum ( md5_file( ... ) != $post->info->photology_md5 ) was
  * disabled in the original code, and it stays disabled here.
  *
  * @param Post the post for which the thumb should be generated
  **/
 public function save_thumbnail($post)
 {
     // attributes of the image tag; stays empty when no tag is found
     $elements = array();
     // set up a temporary variable to capture the image tag(s)
     $matches = array();
     // case-insensitive and tolerant of any whitespace after the tag name,
     // so <IMG ...> and tags broken across lines are found too
     if (preg_match('/<img\s[^>]+>/i', $post->content, $matches)) {
         // we got one! Now tease out the src element
         $html = new HTMLTokenizer($matches[0]);
         $tokens = $html->parse();
         foreach ($tokens as $node) {
             if ('img' == strtolower($node['name'])) {
                 // attribute names may keep their source case (e.g. SRC),
                 // so normalise them before looking up 'src'
                 $elements = is_array($node['attrs']) ? array_change_key_case($node['attrs'], CASE_LOWER) : $node['attrs'];
             }
         }
     }
     if (!isset($elements['src'])) {
         // no src= found, so don't try to do anything else
         return;
     }
     // whether or not a thumbnail already exists, (re)build it and record
     // the MD5 of the source image alongside it
     $post->info->photology_thumb = $this->make_thumbnail($elements['src']);
     $post->info->photology_md5 = md5_file($this->get_image_file($elements['src']));
     $post->info->commit();
 }
 /**
  * Check the links of each post and log those that answer 404 or 301.
  *
  * For every post the rendered content is tokenized, the href of each
  * opening <a> element is collected (anchors without an href are skipped),
  * and one HEAD request is made per distinct URL. A 404 or 301 status is
  * reported through SiteMaintenanceLog together with the serialized response
  * headers; any other status, or a failed request, is ignored.
  *
  * @param Posts $posts The posts whose links are checked
  */
 protected function check_links(Posts $posts)
 {
     foreach ($posts as $post) {
         $tokenizer = new HTMLTokenizer($post->content_out, false);
         $nodes = $tokenizer->parse();
         $urls = array();
         foreach ($nodes as $node) {
             $is_anchor = $node['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN && strtolower($node['name']) == 'a';
             // named anchors and the like have no href; there is nothing to check
             if ($is_anchor && isset($node['attrs']['href'])) {
                 $urls[] = $node['attrs']['href'];
             }
         }
         foreach (array_unique($urls) as $url) {
             $headers = RemoteRequestSucks::head($url);
             if (!$headers) {
                 continue;
             }
             $status = $headers['status'];
             // is it 404 not found?
             if ($status == 404) {
                 $message = _t("404 at %s in post %s, got: %s", array($url, $post->slug, $status), 'sitemaintenance');
                 SiteMaintenanceLog::report_log($message, 'err', '404', serialize($headers));
             } elseif ($status == 301) {
                 // moved permanently: report the new location when the server gave one
                 if (isset($headers['location'])) {
                     $location = $headers['location'];
                 } else {
                     $location = _t('unknown', self::TEXT_DOMAIN);
                 }
                 $message = _t("301 at %s in post %s, moved to: %s", array($url, $post->slug, $location), 'sitemaintenance');
                 SiteMaintenanceLog::report_log($message, 'err', '301', serialize($headers));
             }
         }
     }
 }
Exemplo n.º 6
0
 /**
  * Filter an HTML string against the element and attribute whitelists.
  *
  * Text nodes are entity-decoded. Opening and empty elements that are not in
  * self::$whitelist_elements are replaced by a text node holding the tag's
  * string form; whitelisted elements keep only the attributes that pass the
  * global ('*') or the per-element attribute whitelist. Closing tags of
  * non-whitelisted elements are turned into text nodes as well. Every other
  * node type is dropped. Finally, any opening tag immediately followed by
  * its own closing tag is removed from the rebuilt string.
  *
  * @todo TODO must build DOM to really properly remove offending elements
  * @todo TODO properly filter URLs
  * @param string $str The HTML to filter
  * @return string The filtered HTML
  */
 public static function filter_html_elements($str)
 {
     $tokenizer = new HTMLTokenizer($str);
     $tokens = $tokenizer->parse();
     $filtered = new HTMLTokenSet();
     // names of rejected, non-empty elements that have been opened
     $rejected_open = array();
     foreach ($tokens as $node) {
         switch ($node['type']) {
             case HTMLTokenizer::NODE_TYPE_TEXT:
                 $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, MultiByte::hab_encoding());
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
             case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
                 $tag = strtolower($node['name']);
                 if (in_array($tag, self::$whitelist_elements)) {
                     // allowed element: strip every attribute that neither whitelist accepts
                     foreach ($node['attrs'] as $attr => $value) {
                         $attr_lc = strtolower($attr);
                         $global_ok = array_key_exists($attr_lc, self::$whitelist_attributes['*'])
                             && self::check_attr_value($attr_lc, $value, self::$whitelist_attributes['*'][$attr_lc]);
                         $element_ok = array_key_exists($tag, self::$whitelist_attributes)
                             && array_key_exists($attr_lc, self::$whitelist_attributes[$tag])
                             && self::check_attr_value($attr_lc, $value, self::$whitelist_attributes[$tag][$attr_lc]);
                         if (!$global_ok && !$element_ok) {
                             unset($node['attrs'][$attr]);
                         }
                     }
                 } else {
                     // rejected element: remember it unless it is an empty element
                     if (!in_array($tag, self::$elements_empty)) {
                         $rejected_open[] = $node['name'];
                     }
                     // keep the tag, but only as text
                     $as_text = HTMLTokenSet::token_to_string($node);
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => $as_text, 'attrs' => array());
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                 if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                     $popped = array_pop($rejected_open);
                     if (strtolower($popped) !== strtolower($node['name'])) {
                         // something weird happened (Luke, use the DOM!): put it back
                         $rejected_open[] = $popped;
                     }
                     // keep the closing tag, but only as text
                     $as_text = HTMLTokenSet::token_to_string($node);
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => $as_text, 'attrs' => array());
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_PI:
             case HTMLTokenizer::NODE_TYPE_COMMENT:
             case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
             case HTMLTokenizer::NODE_TYPE_STATEMENT:
             default:
                 // anything that is not text or an element is dropped
                 $node = null;
                 break;
         }
         if ($node != null) {
             $filtered[] = $node;
         }
     }
     // rebuild the output string, removing elements that ended up empty
     $html = (string) $filtered;
     return preg_replace('#<([^>\\s]+)(?:\\s+[^>]+)?></\\1>#u', '', $html);
 }
Exemplo n.º 7
0
	/**
	 * Scan all links in the content and send them a Pingback.
	 *
	 * Only <a> elements with an href that parses as an absolute, non-pseudo URL
	 * are considered. When a post object with recorded successful pingbacks is
	 * given, those targets are removed from the list, or merged into it when
	 * $force is true. Each remaining distinct target is pinged once and every
	 * successful ping is written to the event log.
	 *
	 * @param string $content The post content to search
	 * @param string $source_uri The source of the content
	 * @param Post $post The post object of the source of the ping
	 * @param boolean $force If true, force the system to ping all links even if that had been pinged before
	 */
	public function pingback_all_links( $content, $source_uri, $post = NULL, $force = false )
	{
		$tokenizer = new HTMLTokenizer( $content, false );
		$tokens = $tokenizer->parse();

		// slice out only A tags
		$slices = $tokens->slice( array( 'a' ), array( ) );

		$urls = array();
		foreach ( $slices as $slice ) {

			// if there is no href attribute, just skip it, though there is something wrong
			if ( !isset( $slice[0]['attrs']['href'] ) ) {
				continue;
			}
			$url = $slice[0]['attrs']['href'];

			// make sure it's a valid URL before we waste our time
			$parsed = InputFilter::parse_url( $url );
			if ( $parsed['is_error'] || $parsed['is_pseudo'] || $parsed['is_relative'] ) {
				continue;
			}

			$urls[] = $url;
		}

		if ( is_object( $post ) && isset( $post->info->pingbacks_successful ) ) {
			if ( $force === true ) {
				// NOTE(review): merging also re-pings earlier targets that may no longer be in the content - confirm this is intended
				$links = array_merge( $urls, $post->info->pingbacks_successful );
			}
			else {
				$links = array_diff( $urls, $post->info->pingbacks_successful );
			}
		}
		else {
			$links = $urls;
		}

		$links = array_unique( $links );

		// $post is optional, so fall back to the source URI when there is no title to log
		$label = is_object( $post ) ? $post->title : $source_uri;

		foreach ( $links as $target_uri ) {
			if ( $this->send_pingback( $source_uri, $target_uri, $post ) ) {
				EventLog::log( _t( 'Sent pingbacks for "%1$s", target: %2$s', array( $label, $target_uri ) ), 'info', 'Pingback' );
			}
		}
	}
Exemplo n.º 8
0
 /**
  * Filter an HTML string against the element and attribute whitelists.
  *
  * Text nodes are entity-decoded as UTF-8. Opening tags of elements that are
  * not in self::$whitelist_elements are replaced by a text node holding the
  * tag's string form; whitelisted opening tags have their attributes checked
  * and the failing ones removed. Closing tags of non-whitelisted elements are
  * turned into text nodes too. Processing instructions, comments, CDATA
  * sections and statements are dropped. Finally, any opening tag immediately
  * followed by its own closing tag is removed from the rebuilt string.
  *
  * @todo TODO must build DOM to really properly remove offending elements
  * @todo TODO properly filter URLs
  * @param string $str The HTML to filter
  * @return string The filtered HTML
  */
 public static function filter_html_elements($str)
 {
     $tokenizer = new HTMLTokenizer($str);
     // tokenize, baby
     $tokens = $tokenizer->parse();
     // filter token stream
     $filtered = new HTMLTokenSet();
     // names of rejected, non-empty elements that have been opened; only pushed and popped here
     $stack = array();
     foreach ($tokens as $node) {
         switch ($node['type']) {
             case HTMLTokenizer::NODE_TYPE_TEXT:
                 // XXX use blog charset setting
                 $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, 'utf-8');
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
                 // is this element allowed at all?
                 if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                     if (!in_array(strtolower($node['name']), self::$elements_empty)) {
                         array_push($stack, $node['name']);
                     }
                     //$node = NULL; //remove the node completely
                     // convert the node to text
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                 } else {
                     // check attributes
                     foreach ($node['attrs'] as $k => $v) {
                         // an attribute is kept when it is in the global ('*') list or in this
                         // element's own list, AND check_attr_value() accepts its value
                         // NOTE(review): the third argument to check_attr_value() is always read from
                         // the per-element list, even when the attribute matched only the '*' list;
                         // that index may not exist in that case - confirm against the whitelist layout
                         $attr_ok = (in_array(strtolower($k), self::$whitelist_attributes['*']) || array_key_exists(strtolower($node['name']), self::$whitelist_attributes) && array_key_exists(strtolower($k), self::$whitelist_attributes[strtolower($node['name'])])) && self::check_attr_value(strtolower($k), $v, self::$whitelist_attributes[strtolower($node['name'])][strtolower($k)]);
                         if (!$attr_ok) {
                             unset($node['attrs'][$k]);
                         }
                     }
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                 if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                     // pop the matching rejected element; if the names differ, put the popped value back
                     if (strtolower($temp = array_pop($stack)) !== strtolower($node['name'])) {
                         // something weird happened (Luke, use the DOM!)
                         array_push($stack, $temp);
                     }
                     //$node = NULL;
                     //convert the node to text
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_PI:
             case HTMLTokenizer::NODE_TYPE_COMMENT:
             case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
             case HTMLTokenizer::NODE_TYPE_STATEMENT:
                 // these node types are dropped from the output
                 $node = NULL;
                 break;
             default:
                 // NOTE(review): any node type without a case above reaches this empty default and is
                 // appended unfiltered below; if the tokenizer emits a separate type for empty
                 // elements, such nodes bypass both whitelists - confirm
         }
         if ($node != NULL) {
             $filtered[] = $node;
         }
     }
     // rebuild our output string
     // the pattern removes an opening tag that is directly followed by its own closing tag
     return preg_replace('@<([^>\\s]+)(?:\\s+[^>]+)?></\\1>@', '', (string) $filtered);
 }
Exemplo n.º 9
0
	/**
	 * Returns a truncated version of post content when the post isn't being displayed on its own.
	 * Posts are split either at the comment <!--more--> or at the specified maximums.
	 * Use only after applying autop or other paragraph styling methods.
	 * Apply to posts using:
	 * <code>Format::apply_with_hook_params( 'more', 'post_content_out' );</code>
	 *
	 * $properties is either the "read more" link text (in which case the 4th and 5th
	 * call arguments, if given, are the word and paragraph maximums), or a parameter
	 * set understood by Utils::get_params() with these keys: more_text (default
	 * 'Read More'), max_words, max_paragraphs, class, and title:before / title /
	 * title:after, which build the link's title attribute around the post title.
	 *
	 * @param string $content The post content
	 * @param Post $post The Post object of the post
	 * @param string|array $properties The link text, or the parameter set described above
	 * @return string The post content, suitable for display
	 */
	public static function more( $content, $post, $properties = array() )
	{
		// The post that was requested by slug is always shown in full
		if ( $post->slug == Controller::get_var( 'slug' ) ) {
			return $content;
		}

		$paramstring = "";
		if ( is_string( $properties ) ) {
			// positional form: link text, then optional maximums
			$args = func_get_args();
			$more_text = $properties;
			$max_words = null;
			if ( isset( $args[3] ) ) {
				$max_words = $args[3];
			}
			$max_paragraphs = null;
			if ( isset( $args[4] ) ) {
				$max_paragraphs = $args[4];
			}
		}
		else {
			$paramarray = Utils::get_params( $properties );

			$more_text = isset( $paramarray['more_text'] ) ? $paramarray['more_text'] : 'Read More';
			$max_words = isset( $paramarray['max_words'] ) ? $paramarray['max_words'] : null;
			$max_paragraphs = isset( $paramarray['max_paragraphs'] ) ? $paramarray['max_paragraphs'] : null;

			$has_before = isset( $paramarray['title:before'] );
			$has_title = isset( $paramarray['title'] );
			$has_after = isset( $paramarray['title:after'] );
			if ( $has_before || $has_title || $has_after ) {
				// assemble the title attribute from whichever parts were requested
				$title = '';
				if ( $has_before ) {
					$title .= $paramarray['title:before'];
				}
				if ( $has_title ) {
					$title .= $post->title;
				}
				if ( $has_after ) {
					$title .= $paramarray['title:after'];
				}
				$paramstring .= 'title="' . $title . '" ';
			}
			if ( isset( $paramarray['class'] ) ) {
				$paramstring .= 'class="' . $paramarray['class'] . '" ';
			}
		}

		// An explicit <!--more--> marker decides the split point
		$parts = preg_split( '/<!--\s*more\s*-->/isu', $content, 2, PREG_SPLIT_NO_EMPTY );
		if ( count( $parts ) > 1 ) {
			if ( $more_text != '' ) {
				return reset( $parts ) . ' <a ' . $paramstring . 'href="' . $post->permalink . '">' . $more_text . '</a>';
			}
			return reset( $parts );
		}

		// No marker: only summarize when at least one maximum was given
		if ( !isset( $max_words ) && !isset( $max_paragraphs ) ) {
			return $content;
		}

		$max_words = empty( $max_words ) ? 9999999 : intval( $max_words );
		$max_paragraphs = empty( $max_paragraphs ) ? 9999999 : intval( $max_paragraphs );
		$summary = Format::summarize( $content, $max_words, $max_paragraphs );

		// Nothing was cut off, so there is nothing to link to
		if ( MultiByte::strlen( $summary ) >= MultiByte::strlen( $content ) ) {
			return $content;
		}
		if ( !strlen( $more_text  ) ) {
			return $summary;
		}

		// Tokenize the summary and the link
		$summary_tokenizer = new HTMLTokenizer( $summary );
		$summary_set = $summary_tokenizer->parse();
		$link_tokenizer = new HTMLTokenizer( '<a ' . $paramstring . ' href="' . $post->permalink . '">' . $more_text . '</a>' );
		$link_set = $link_tokenizer->parse();

		// Move to the last token of the summary and inject the link at its index
		$summary_set->end();
		$key = $summary_set->key();
		$summary_set->insert( $link_set, $key );

		return (string)$summary_set;
	}
Exemplo n.º 10
0
 /**
  * Re-tokenize a string and make the result this set's tokens.
  *
  * The source is parsed with a new HTMLTokenizer using this set's escape
  * setting; the parsed set's tokens replace the current ones in place and
  * are also returned.
  *
  * @param string $source The text to create the new set of tokens from
  * @return mixed The new tokens, as now stored in $this->tokens
  */
 public function tokenize_replace($source)
 {
     $tokenizer = new HTMLTokenizer($source, $this->escape);
     $parsed = $tokenizer->parse();
     $this->tokens = $parsed->tokens;
     return $this->tokens;
 }
Exemplo n.º 11
0
 /**
  * Returns a truncated version of post content when the post isn't being displayed on its own.
  * Posts are split either at the comment <!--more--> or at the specified maximums.
  * Use only after applying autop or other paragraph styling methods.
  * Apply to posts using:
  * <code>Format::apply_with_hook_params( 'more', 'post_content_out' );</code>
  *
  * $properties is either the "read more" link text (in which case the 4th, 5th and
  * 6th call arguments, if given, are max_words, max_paragraphs and inside_last), or
  * a parameter set understood by Utils::get_params() with these keys: more_text
  * (default 'Read More'), max_words, max_paragraphs, inside_last (default true),
  * class, and title:before / title / title:after, which build the link's title
  * attribute around the post title.
  *
  * @param string $content The post content
  * @param Post $post The Post object of the post
  * @param string|array $properties The link text, or the parameter set described above
  * @return string The post content, suitable for display
  */
 public static function more($content, $post, $properties = array())
 {
     // The post that was requested by slug is always shown in full
     if ($post->slug == Controller::get_var('slug')) {
         return $content;
     }

     $paramstring = "";
     if (is_string($properties)) {
         // positional form: link text, then the optional extras
         $args = func_get_args();
         $more_text = $properties;
         $max_words = isset($args[3]) ? $args[3] : null;
         $max_paragraphs = isset($args[4]) ? $args[4] : null;
         $inside_last = isset($args[5]) ? $args[5] : true;
     } else {
         $paramarray = Utils::get_params($properties);
         $more_text = isset($paramarray['more_text']) ? $paramarray['more_text'] : 'Read More';
         $max_words = isset($paramarray['max_words']) ? $paramarray['max_words'] : null;
         $max_paragraphs = isset($paramarray['max_paragraphs']) ? $paramarray['max_paragraphs'] : null;
         $inside_last = isset($paramarray['inside_last']) ? $paramarray['inside_last'] : true;

         $has_before = isset($paramarray['title:before']);
         $has_title = isset($paramarray['title']);
         $has_after = isset($paramarray['title:after']);
         if ($has_before || $has_title || $has_after) {
             // assemble the title attribute from whichever parts were requested
             $title = '';
             if ($has_before) {
                 $title .= $paramarray['title:before'];
             }
             if ($has_title) {
                 $title .= $post->title;
             }
             if ($has_after) {
                 $title .= $paramarray['title:after'];
             }
             $paramstring .= 'title="' . $title . '" ';
         }
         if (isset($paramarray['class'])) {
             $paramstring .= 'class="' . $paramarray['class'] . '" ';
         }
     }

     $link_text = '<a ' . $paramstring . ' href="' . $post->permalink . '">' . $more_text . '</a>';
     // a link placed inside the last element gets a separating space in front
     if ($inside_last) {
         $link_text = ' ' . $link_text;
     }

     // an explicit <!--more--> marker decides the split point; otherwise summarize
     $parts = preg_split('/<!--\\s*more\\s*-->/isu', $content, 2, PREG_SPLIT_NO_EMPTY);
     if (count($parts) > 1) {
         $summary = reset($parts);
     } else {
         $max_words = empty($max_words) ? 9999999 : intval($max_words);
         $max_paragraphs = empty($max_paragraphs) ? 9999999 : intval($max_paragraphs);
         $summary = Format::summarize($content, $max_words, $max_paragraphs);
     }

     // nothing was cut off, so no link is needed
     if (MultiByte::strlen($summary) >= MultiByte::strlen($content)) {
         return $content;
     }
     // no link text to append: the bare summary is the result
     if (!strlen($more_text)) {
         return $summary;
     }

     // tokenize the summary and the link
     $summary_tokenizer = new HTMLTokenizer($summary);
     $summary_set = $summary_tokenizer->parse();
     $link_tokenizer = new HTMLTokenizer($link_text);
     $link_set = $link_tokenizer->parse();

     // move to the last token of the summary and note its index
     $end = $summary_set->end();
     $key = $summary_set->key();
     // outside the last element means one position further on
     if (!$inside_last) {
         $key++;
     }
     // a trailing text node means there were no tags (probably not autop'ed yet),
     // so the link goes one position past the chosen index
     $position = ($end['type'] == HTMLTokenizer::NODE_TYPE_TEXT) ? $key + 1 : $key;
     $summary_set->insert($link_set, $position);

     return (string) $summary_set;
 }
Exemplo n.º 12
0
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
[ <!ATTLIST html habari CDATA #IMPLIED> ]
>
<html>
<head habari="rocks">
<title>Foo Bar</title>
</head>
<body>
<h1>Hello World</h1>
<p>This is a good <a href="http://google.com/search?q=html">HTML</a> document.</p>
<![CDATA[This is &amp; <a href="foo">CDATA</a>.]]><strong>Lo bob</strong>.
</body>
</html>
_EOF_;
// Fixture: tag soup - elements are left unclosed and the title holds an &apos; entity.
$html_strs[] = <<<_EOF_
<html><title>Oh &apos;eck!<body>This is a badly tag-soupy HTML document.</html>
_EOF_;
// Fixture: a document with inline event-handler attributes and a script element.
$html_strs[] = <<<_EOF_
<html>
<head><title>Hey</title></head>
<body onLoad="window.alert('zomg.');">
<p onClick="window.alert('stole yer cookies!');">Do not click here.</p>
<script>alert("See this?")</script>
</body>
</html>
_EOF_;
// Tokenize every fixture and dump the input string next to its parsed tokens.
foreach ($html_strs as $html_str) {
    $t = new HTMLTokenizer($html_str);
    $tokens = $t->parse();
    Utils::debug($html_str, $tokens);
}