/**
 * Build the instance from raw HTML: keep the left-trimmed markup, resolve
 * the options through $this->options(), and tokenize the markup.
 *
 * @param string $html    markup to tokenize; leading whitespace is stripped
 * @param array  $options options, stored via $this->options() and also handed
 *                        unchanged to the HTMLTokenizer
 */
public function __construct($html, $options = array())
{
    $trimmed = ltrim($html);
    $this->html = $trimmed;
    $this->options = $this->options($options);

    // The tokenizer works on a SegmentedString wrapper of the trimmed markup.
    $segments = new SegmentedString($trimmed);
    $tokenizer = new HTMLTokenizer($segments, $options);
    $this->tokens = $tokenizer->tokenizer();
}
/**
 * Round-trip check: every sample in $this->html_strs must stringify back to
 * exactly the input after being parsed by HTMLTokenizer.
 */
function test_tokenizer()
{
    foreach ( $this->html_strs as $original ) {
        $tokenizer = new HTMLTokenizer( $original );
        $rebuilt = (string) $tokenizer->parse();

        // On failure, show the escaped input followed by the escaped output.
        $failure_message = "<br>" . Utils::htmlspecialchars( $original ) . "<br>" . Utils::htmlspecialchars( $rebuilt );
        $this->assert_identical( $original, $rebuilt, $failure_message );
    }
}
/**
 * Syntax-highlight every div/pre/code element carrying class="highlight".
 *
 * Each matching slice has its container stripped, is run through GeSHi
 * (language taken from the first remaining class name, 'php' when none),
 * and is replaced in the token stream by the GeSHi output. Results are
 * cached under a key built from the slice markup and this file's mtime.
 *
 * @param string $in markup to process
 * @return string the markup with highlighted slices substituted
 */
public static function do_highlight($in)
{
    // Token-based, so no regular expressions are needed to find the blocks.
    $parser = new HTMLTokenizer($in, false);
    $tokens = $parser->parse();

    $wanted_tags = array('div', 'pre', 'code');
    $wanted_attrs = array('class' => 'highlight');

    foreach ($tokens->slice($wanted_tags, $wanted_attrs) as $slice) {
        // Remember the class list before the container is stripped away.
        $classAttr = $slice[0]['attrs']['class'];

        // Cache key is derived from the untrimmed slice plus this file's mtime.
        $cacheKey = 'plugin.highlight.' . md5((string) $slice) . filemtime(__FILE__);

        $slice->trim_container();
        $code = trim((string) $slice);

        if (Cache::has($cacheKey)) {
            $output = Cache::get($cacheKey);
        } else {
            // Unwrap a CDATA section when the whole value is enclosed in one.
            $opens_cdata = substr($code, 0, 9) == '<![CDATA[';
            if ($opens_cdata && substr($code, -3) == ']]>') {
                $code = substr($code, 9, -3);
            }

            // Whatever is left of the class attribute names the language.
            $remaining = trim(str_replace('highlight', '', $classAttr));
            $classes = array_filter(explode(' ', $remaining));
            if (isset($classes[0])) {
                $language = $classes[0];
            } else {
                $language = 'php';
            }

            $geshi = new Geshi(trim($code), $language, HighlightPlugin::$geshi_path . '/geshi/');
            $geshi->set_header_type(GESHI_HEADER_PRE);
            $geshi->set_overall_class('geshicode');

            // Error suppression is slow, but GeSHi emits many E_NOTICEs.
            $output = @$geshi->parse_code();
            Cache::set($cacheKey, $output);
        }

        $slice->tokenize_replace($output);
        $tokens->replace_slice($slice);
    }

    return (string) $tokens;
}
/**
 * function save_thumbnail
 * Builds a thumbnail from the first "<img ...>" tag found in the post content
 * and records it, together with an MD5 of the image file, in the post's info.
 * Posts without an img tag, or whose img tag has no src, are left untouched.
 * @param Post the post for which the thumb should be generated
 **/
public function save_thumbnail($post)
{
    // Grab the first image tag, then let the tokenizer pull out its attributes.
    $img_tags = array();
    if (preg_match('/<img [^>]+>/', $post->content, $img_tags)) {
        $tokenizer = new HTMLTokenizer($img_tags[0]);
        foreach ($tokenizer->parse() as $token) {
            if ('img' == $token['name']) {
                $attributes = $token['attrs'];
            }
        }
    }

    // No src= found, so there is nothing to build a thumbnail from.
    if (!isset($attributes['src'])) {
        return;
    }
    $src = $attributes['src'];

    // The thumbnail is rebuilt on every call: the MD5 staleness comparison for
    // posts that already have one is disabled, so an existing thumbnail is
    // handled exactly like a missing one.
    $post->info->photology_thumb = $this->make_thumbnail($src);
    $post->info->photology_md5 = md5_file($this->get_image_file($src));
    $post->info->commit();
}
/**
 * Check the links in each post's rendered content and log those that
 * answer a HEAD request with 404 (not found) or 301 (moved permanently).
 *
 * Only opening <a> tags that actually carry an href attribute are checked;
 * each distinct URL is requested once per post. Responses with any other
 * status, or requests that yield no headers, are ignored.
 *
 * @param Posts $posts the posts whose content_out should be scanned
 */
protected function check_links(Posts $posts)
{
    foreach ($posts as $post) {
        $tokenizer = new HTMLTokenizer($post->content_out, false);
        $nodes = $tokenizer->parse();

        $urls = array();
        foreach ($nodes as $node) {
            $is_anchor = $node['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN
                && strtolower($node['name']) == 'a';
            // Anchors without an href (e.g. named anchors) have nothing to check.
            if ($is_anchor && isset($node['attrs']['href'])) {
                $urls[] = $node['attrs']['href'];
            }
        }

        foreach (array_unique($urls) as $url) {
            $headers = RemoteRequestSucks::head($url);
            if (!$headers) {
                continue;
            }

            $status = $headers['status'];
            if ($status == 404) {
                $message = _t("404 at %s in post %s, got: %s", array($url, $post->slug, $status), 'sitemaintenance');
                SiteMaintenanceLog::report_log($message, 'err', '404', serialize($headers));
            } elseif ($status == 301) {
                // Report where the link moved to when the server says so.
                if (isset($headers['location'])) {
                    $location = $headers['location'];
                } else {
                    $location = _t('unknown', self::TEXT_DOMAIN);
                }
                $message = _t("301 at %s in post %s, moved to: %s", array($url, $post->slug, $location), 'sitemaintenance');
                SiteMaintenanceLog::report_log($message, 'err', '301', serialize($headers));
            }
        }
    }
}
/**
 * Filter an HTML string against the class-level element and attribute
 * whitelists and return the rebuilt markup.
 *
 * Per token type:
 *  - text: entities in the value are decoded using MultiByte::hab_encoding()
 *  - open/empty element: kept if its lowercased name is in
 *    self::$whitelist_elements, with every attribute that fails the
 *    whitelist/validation checks removed; otherwise the whole tag is turned
 *    into a text token holding its string form
 *  - close element: kept if whitelisted, otherwise turned into a text token
 *  - processing instructions, comments, CDATA sections, statements and any
 *    other type: dropped
 *
 * Finally, element pairs with nothing between the opening and closing tag
 * are stripped from the rebuilt string.
 *
 * @param string $str the markup to filter
 * @return string the filtered markup
 * @todo TODO must build DOM to really properly remove offending elements
 * @todo TODO properly filter URLs
 */
public static function filter_html_elements($str) {
    $tokenizer = new HTMLTokenizer($str);
    // tokenize, baby
    $tokens = $tokenizer->parse();
    // filter token stream
    $filtered = new HTMLTokenSet();
    // names of rejected, non-empty elements that have been opened; only
    // pushed/popped here, never consulted when building the output
    $stack = array();
    foreach ($tokens as $node) {
        switch ($node['type']) {
            case HTMLTokenizer::NODE_TYPE_TEXT:
                // NOTE(review): presumably the value is re-escaped when the set is stringified - confirm
                $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, MultiByte::hab_encoding());
                break;
            case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
            case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
                // is this element allowed at all?
                if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                    // elements listed in self::$elements_empty have no closing tag to match later
                    if (!in_array(strtolower($node['name']), self::$elements_empty)) {
                        array_push($stack, $node['name']);
                    }
                    // the node is not removed outright; it is converted to a
                    // text node carrying the tag's string form
                    $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                } else {
                    // check attributes
                    foreach ($node['attrs'] as $k => $v) {
                        $attr_ok = false;
                        // if the attribute is in the global whitelist and validates
                        if (array_key_exists(strtolower($k), self::$whitelist_attributes['*']) && self::check_attr_value(strtolower($k), $v, self::$whitelist_attributes['*'][strtolower($k)])) {
                            $attr_ok = true;
                        }
                        // if there is a whitelist for this node and this attribute is in that list and it validates
                        if (array_key_exists(strtolower($node['name']), self::$whitelist_attributes) && array_key_exists(strtolower($k), self::$whitelist_attributes[strtolower($node['name'])]) && self::check_attr_value(strtolower($k), $v, self::$whitelist_attributes[strtolower($node['name'])][strtolower($k)])) {
                            $attr_ok = true;
                        }
                        // if it wasn't in one of the whitelists or failed its check, remove it
                        if ($attr_ok != true) {
                            unset($node['attrs'][$k]);
                        }
                    }
                }
                break;
            case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                    // pop the most recently opened rejected element; if it does
                    // not match this closing tag, put it back untouched
                    if (strtolower($temp = array_pop($stack)) !== strtolower($node['name'])) {
                        // something weird happened (Luke, use the DOM!)
                        array_push($stack, $temp);
                    }
                    // as with rejected opening tags: convert the node to text
                    // instead of removing it
                    $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                }
                break;
            case HTMLTokenizer::NODE_TYPE_PI:
            case HTMLTokenizer::NODE_TYPE_COMMENT:
            case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
            case HTMLTokenizer::NODE_TYPE_STATEMENT:
            default:
                // everything that is not text or an element is dropped
                $node = null;
                break;
        }
        // nodes nulled above are left out of the filtered set
        if ($node != null) {
            $filtered[] = $node;
        }
    }
    // rebuild our output string, removing elements whose opening tag is
    // immediately followed by its own closing tag
    return preg_replace('#<([^>\\s]+)(?:\\s+[^>]+)?></\\1>#u', '', (string) $filtered);
}
/**
 * Scan all links in the content and send them a Pingback.
 *
 * Only <a> elements with an href that InputFilter::parse_url() reports as
 * neither erroneous, pseudo-protocol, nor relative are considered. When the
 * post has a record of successful pingbacks, those targets are excluded
 * unless $force is true, in which case the recorded targets are merged with
 * the links found. Each distinct target is pinged once, and every successful
 * ping is written to the event log.
 *
 * @param string $content The post content to search
 * @param string $source_uri The source of the content
 * @param Post $post The post object of the source of the ping (optional)
 * @param boolean $force If true, force the system to ping all links even if that had been pinged before
 */
public function pingback_all_links( $content, $source_uri, $post = NULL, $force = false )
{
    $tokenizer = new HTMLTokenizer( $content, false );
    $tokens = $tokenizer->parse();

    // slice out only A tags
    $slices = $tokens->slice( array( 'a' ), array( ) );

    $urls = array();
    foreach ( $slices as $slice ) {
        // an anchor without an href has nothing to ping
        if ( !isset( $slice[0]['attrs']['href'] ) ) {
            continue;
        }
        $url = $slice[0]['attrs']['href'];

        // make sure it's a valid URL before we waste our time
        $parsed = InputFilter::parse_url( $url );
        if ( $parsed['is_error'] || $parsed['is_pseudo'] || $parsed['is_relative'] ) {
            continue;
        }
        $urls[] = $url;
    }

    if ( is_object( $post ) && isset( $post->info->pingbacks_successful ) ) {
        $fn = ( $force === true ) ? 'array_merge' : 'array_diff';
        $links = $fn( $urls, $post->info->pingbacks_successful );
    }
    else {
        $links = $urls;
    }
    $links = array_unique( $links );

    foreach ( $links as $target_uri ) {
        if ( $this->send_pingback( $source_uri, $target_uri, $post ) ) {
            // $post is optional, so fall back to the source URI as the label
            // instead of dereferencing a non-object
            $label = is_object( $post ) ? $post->title : $source_uri;
            EventLog::log( _t( 'Sent pingbacks for "%1$s", target: %2$s', array( $label, $target_uri ) ), 'info', 'Pingback' );
        }
    }
}
/**
 * Filter an HTML string against the class-level element and attribute
 * whitelists and return the rebuilt markup.
 *
 * Per token type:
 *  - text: entities in the value are decoded as UTF-8
 *  - open element: kept if its lowercased name is in
 *    self::$whitelist_elements, with non-whitelisted or invalid attributes
 *    removed; otherwise the tag is turned into a text token holding its
 *    string form
 *  - close element: kept if whitelisted, otherwise turned into a text token
 *  - processing instructions, comments, CDATA sections and statements: dropped
 *  - any other type: passed through unchanged (the default case is empty)
 *
 * Finally, element pairs with nothing between the opening and closing tag
 * are stripped from the rebuilt string.
 *
 * @param string $str the markup to filter
 * @return string the filtered markup
 * @todo TODO must build DOM to really properly remove offending elements
 * @todo TODO properly filter URLs
 */
public static function filter_html_elements($str) {
    $tokenizer = new HTMLTokenizer($str);
    // tokenize, baby
    $tokens = $tokenizer->parse();
    // filter token stream
    $filtered = new HTMLTokenSet();
    // names of rejected, non-empty elements that have been opened; only
    // pushed/popped here, never consulted when building the output
    $stack = array();
    foreach ($tokens as $node) {
        switch ($node['type']) {
            case HTMLTokenizer::NODE_TYPE_TEXT:
                // XXX use blog charset setting
                $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, 'utf-8');
                break;
            case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
                // is this element allowed at all?
                if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                    // elements listed in self::$elements_empty have no closing tag to match later
                    if (!in_array(strtolower($node['name']), self::$elements_empty)) {
                        array_push($stack, $node['name']);
                    }
                    // the node is not removed outright; it is converted to a
                    // text node carrying the tag's string form
                    $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                } else {
                    // check attributes: the name must be in the global '*' list
                    // or in this element's own list, AND the value must validate
                    // NOTE(review): the type passed to check_attr_value() is always
                    // looked up in the element's own whitelist, even when the
                    // attribute only matched the '*' list - that index may be
                    // undefined in that case; confirm against the whitelist layout
                    foreach ($node['attrs'] as $k => $v) {
                        $attr_ok = (in_array(strtolower($k), self::$whitelist_attributes['*']) || array_key_exists(strtolower($node['name']), self::$whitelist_attributes) && array_key_exists(strtolower($k), self::$whitelist_attributes[strtolower($node['name'])])) && self::check_attr_value(strtolower($k), $v, self::$whitelist_attributes[strtolower($node['name'])][strtolower($k)]);
                        if (!$attr_ok) {
                            unset($node['attrs'][$k]);
                        }
                    }
                }
                break;
            case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                if (!in_array(strtolower($node['name']), self::$whitelist_elements)) {
                    // pop the most recently opened rejected element; if it does
                    // not match this closing tag, put it back untouched
                    if (strtolower($temp = array_pop($stack)) !== strtolower($node['name'])) {
                        // something weird happened (Luke, use the DOM!)
                        array_push($stack, $temp);
                    }
                    // as with rejected opening tags: convert the node to text
                    // instead of removing it
                    $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                }
                break;
            case HTMLTokenizer::NODE_TYPE_PI:
            case HTMLTokenizer::NODE_TYPE_COMMENT:
            case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
            case HTMLTokenizer::NODE_TYPE_STATEMENT:
                // these node types never reach the output
                $node = NULL;
                break;
            // NOTE(review): node types not listed above fall through here and
            // are emitted unfiltered - confirm that is intended for a sanitizer
            default:
        }
        // nodes nulled above are left out of the filtered set
        if ($node != NULL) {
            $filtered[] = $node;
        }
    }
    // rebuild our output string, removing elements whose opening tag is
    // immediately followed by its own closing tag
    return preg_replace('@<([^>\\s]+)(?:\\s+[^>]+)?></\\1>@', '', (string) $filtered);
}
/**
 * Returns a truncated version of post content when the post isn't being displayed on its own.
 * Posts are split either at the comment <!--more--> or at the specified maximums.
 * Use only after applying autop or other paragraph styling methods.
 * Apply to posts using:
 * <code>Format::apply_with_hook_params( 'more', 'post_content_out' );</code>
 *
 * $properties may be given in two forms:
 *  - a string: it is the "read more" link text, and the optional 4th and 5th
 *    positional arguments are the word and paragraph maximums;
 *  - anything else: it is passed to Utils::get_params() and the keys
 *    'more_text' (default 'Read More'), 'max_words', 'max_paragraphs',
 *    'title:before', 'title', 'title:after' and 'class' are read from the result.
 *
 * @param string $content The post content
 * @param Post $post The Post object of the post
 * @param mixed $properties Link text, or parameters as described above
 * @return string The post content, suitable for display
 */
public static function more( $content, $post, $properties = array() )
{
    // If the post requested is the post under consideration, always return the full post
    if ( $post->slug == Controller::get_var( 'slug' ) ) {
        return $content;
    }

    $paramstring = "";
    if ( is_string( $properties ) ) {
        // positional form: more( $content, $post, $more_text, $max_words, $max_paragraphs )
        $args = func_get_args();
        $more_text = $properties;
        $max_words = isset( $args[3] ) ? $args[3] : null;
        $max_paragraphs = isset( $args[4] ) ? $args[4] : null;
    }
    else {
        $paramarray = Utils::get_params( $properties );
        $more_text = isset( $paramarray['more_text'] ) ? $paramarray['more_text'] : 'Read More';
        $max_words = isset( $paramarray['max_words'] ) ? $paramarray['max_words'] : null;
        $max_paragraphs = isset( $paramarray['max_paragraphs'] ) ? $paramarray['max_paragraphs'] : null;

        // any of the three title keys switches the title attribute on
        $wants_title = isset( $paramarray['title:before'] ) || isset( $paramarray['title'] ) || isset( $paramarray['title:after'] );
        if ( $wants_title ) {
            $title_attr = 'title="';
            if ( isset( $paramarray['title:before'] ) ) {
                $title_attr .= $paramarray['title:before'];
            }
            if ( isset( $paramarray['title'] ) ) {
                $title_attr .= $post->title;
            }
            if ( isset( $paramarray['title:after'] ) ) {
                $title_attr .= $paramarray['title:after'];
            }
            $paramstring .= $title_attr . '" ';
        }
        if ( isset( $paramarray['class'] ) ) {
            $paramstring .= 'class="' . $paramarray['class'] . '" ';
        }
    }

    // An explicit <!--more--> marker decides the split point.
    $pieces = preg_split( '/<!--\s*more\s*-->/isu', $content, 2, PREG_SPLIT_NO_EMPTY );
    if ( count( $pieces ) > 1 ) {
        $teaser = reset( $pieces );
        if ( $more_text != '' ) {
            return $teaser . ' <a ' . $paramstring . 'href="' . $post->permalink . '">' . $more_text . '</a>';
        }
        return $teaser;
    }

    // Without a marker and without any maximum there is nothing to cut.
    if ( !isset( $max_words ) && !isset( $max_paragraphs ) ) {
        return $content;
    }

    $word_limit = empty( $max_words ) ? 9999999 : intval( $max_words );
    $paragraph_limit = empty( $max_paragraphs ) ? 9999999 : intval( $max_paragraphs );
    $summary = Format::summarize( $content, $word_limit, $paragraph_limit );

    // The summary did not shorten anything, so no link is needed.
    if ( MultiByte::strlen( $summary ) >= MultiByte::strlen( $content ) ) {
        return $content;
    }
    if ( !strlen( $more_text ) ) {
        return $summary;
    }

    // Tokenize the summary and the link
    $tokenizer = new HTMLTokenizer( $summary );
    $summary_set = $tokenizer->parse();
    $tokenizer = new HTMLTokenizer( '<a ' . $paramstring . ' href="' . $post->permalink . '">' . $more_text . '</a>' );
    $link_set = $tokenizer->parse();

    // Move to the last token and inject the link at that index
    $summary_set->end();
    $last_index = $summary_set->key();
    $summary_set->insert( $link_set, $last_index );

    return (string) $summary_set;
}
/**
 * Replace this set's tokens with the tokens parsed from new source text.
 * The replacement happens in place, using this set's own escape setting for
 * the tokenizer, and the new tokens are also returned.
 *
 * @param string $source The text to create the new set of tokens from
 * @return mixed The new value of $this->tokens, i.e. the `tokens` property of the parsed set
 */
public function tokenize_replace($source)
{
    $tokenizer = new HTMLTokenizer($source, $this->escape);
    $parsed = $tokenizer->parse();

    $this->tokens = $parsed->tokens;

    return $this->tokens;
}
/**
 * Returns a truncated version of post content when the post isn't being displayed on its own.
 * Posts are split either at the comment <!--more--> or at the specified maximums.
 * Use only after applying autop or other paragraph styling methods.
 * Apply to posts using:
 * <code>Format::apply_with_hook_params( 'more', 'post_content_out' );</code>
 *
 * $properties may be given in two forms:
 *  - a string: it is the "read more" link text, and the optional 4th, 5th and
 *    6th positional arguments are the word maximum, the paragraph maximum and
 *    the inside_last flag;
 *  - anything else: it is passed to Utils::get_params() and the keys
 *    'more_text' (default 'Read More'), 'max_words', 'max_paragraphs',
 *    'inside_last' (default true), 'title:before', 'title', 'title:after'
 *    and 'class' are read from the result.
 *
 * @param string $content The post content
 * @param Post $post The Post object of the post
 * @param mixed $properties Link text, or parameters as described above
 * @return string The post content, suitable for display
 */
public static function more($content, $post, $properties = array())
{
    // If the post requested is the post under consideration, always return the full post
    if ($post->slug == Controller::get_var('slug')) {
        return $content;
    }

    $paramstring = "";
    if (is_string($properties)) {
        // positional form: more($content, $post, $more_text, $max_words, $max_paragraphs, $inside_last)
        $args = func_get_args();
        $more_text = $properties;
        $max_words = isset($args[3]) ? $args[3] : null;
        $max_paragraphs = isset($args[4]) ? $args[4] : null;
        $inside_last = isset($args[5]) ? $args[5] : true;
    } else {
        $paramarray = Utils::get_params($properties);
        $more_text = isset($paramarray['more_text']) ? $paramarray['more_text'] : 'Read More';
        $max_words = isset($paramarray['max_words']) ? $paramarray['max_words'] : null;
        $max_paragraphs = isset($paramarray['max_paragraphs']) ? $paramarray['max_paragraphs'] : null;
        $inside_last = isset($paramarray['inside_last']) ? $paramarray['inside_last'] : true;

        // any of the three title keys switches the title attribute on
        $wants_title = isset($paramarray['title:before']) || isset($paramarray['title']) || isset($paramarray['title:after']);
        if ($wants_title) {
            $title_attr = 'title="';
            if (isset($paramarray['title:before'])) {
                $title_attr .= $paramarray['title:before'];
            }
            if (isset($paramarray['title'])) {
                $title_attr .= $post->title;
            }
            if (isset($paramarray['title:after'])) {
                $title_attr .= $paramarray['title:after'];
            }
            $paramstring .= $title_attr . '" ';
        }
        if (isset($paramarray['class'])) {
            $paramstring .= 'class="' . $paramarray['class'] . '" ';
        }
    }

    $link_text = '<a ' . $paramstring . ' href="' . $post->permalink . '">' . $more_text . '</a>';
    // a link placed inside the last element gets a separating space in front
    if ($inside_last) {
        $link_text = ' ' . $link_text;
    }

    // An explicit <!--more--> marker sets exactly where to split; otherwise summarize.
    $pieces = preg_split('/<!--\\s*more\\s*-->/isu', $content, 2, PREG_SPLIT_NO_EMPTY);
    if (count($pieces) > 1) {
        $summary = reset($pieces);
    } else {
        $word_limit = empty($max_words) ? 9999999 : intval($max_words);
        $paragraph_limit = empty($max_paragraphs) ? 9999999 : intval($max_paragraphs);
        $summary = Format::summarize($content, $word_limit, $paragraph_limit);
    }

    // Nothing was cut off, so there is no need for a link: return the content as is.
    if (MultiByte::strlen($summary) >= MultiByte::strlen($content)) {
        return $content;
    }
    // No link text to append: the bare summary is the result.
    if (!strlen($more_text)) {
        return $summary;
    }

    // Tokenize both the summary and the link that is to be added.
    $tokenizer = new HTMLTokenizer($summary);
    $summary_set = $tokenizer->parse();
    $tokenizer = new HTMLTokenizer($link_text);
    $link_set = $tokenizer->parse();

    // Move the iterator to the last token and note its index.
    $last_token = $summary_set->end();
    $key = $summary_set->key();

    // Outside the last element means one position further along.
    if (!$inside_last) {
        $key++;
    }

    // A trailing text node means there were no tags (probably not autop'ed
    // yet), so the link goes after it instead of at its index.
    $position = ($last_token['type'] == HTMLTokenizer::NODE_TYPE_TEXT) ? $key + 1 : $key;
    $summary_set->insert($link_set, $position);

    return (string) $summary_set;
}
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" [ <!ATTLIST html habari CDATA #IMPLIED> ] > <html> <head habari="rocks"> <title>Foo Bar</title> </head> <body> <h1>Hello World</h1> <p>This is a good <a href="http://google.com/search?q=html">HTML</a> document.</p> <![CDATA[This is & <a href="foo">CDATA</a>.]]><strong>Lo bob</strong>. </body> </html> _EOF_; $html_strs[] = <<<_EOF_ <html><title>Oh 'eck!<body>This is a badly tag-soupy HTML document.</html> _EOF_; $html_strs[] = <<<_EOF_ <html> <head><title>Hey</title></head> <body onLoad="window.alert('zomg.');"> <p onClick="window.alert('stole yer cookies!');">Do not click here.</p> <script>alert("See this?")</script> </body> </html> _EOF_; foreach ($html_strs as $html_str) { $t = new HTMLTokenizer($html_str); $tokens = $t->parse(); Utils::debug($html_str, $tokens); }