Beispiel #1
0
 /**
  * Splits post content into a summary and a remainder and assembles the
  * markup for the current view.
  *
  * The split point is an explicit <!--more--> marker when one separates two
  * non-empty parts; otherwise the content is tokenized and cut after the first
  * $max_paragraphs top-level elements.
  *
  * @param string $content Post content (HTML).
  * @param object $post Post being rendered; its slug and permalink properties are read.
  * @return string The summary followed by either the anchored remainder (when
  *                the requested slug matches the post), a "read more" link, or
  *                the plain remainder when there is nothing to continue.
  */
 public function more($content, $post)
 {
     $more_text = 'Read the rest →';
     $max_paragraphs = 1;
     $showmore = false;
     $matches = preg_split('/<!--\\s*more\\s*-->/is', $content, 2, PREG_SPLIT_NO_EMPTY);
     if (count($matches) > 1) {
         $summary = $matches[0];
         $remainder = $matches[1];
         if (trim($remainder) != '') {
             $showmore = true;
         }
     } else {
         $ht = new HtmlTokenizer($content, false);
         $set = $ht->parse();
         // Names of the currently open elements, innermost last. A plain list
         // (instead of an array keyed by tag name) keeps one entry per open
         // element, so nested elements sharing a tag name are tracked correctly.
         $stack = array();
         $para = 0;
         $summary = new HTMLTokenSet(false);
         $remainder = new HTMLTokenSet(false);
         $set->rewind();
         for ($token = $set->current(); $set->valid(); $token = $set->next()) {
             if ($token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN) {
                 $stack[] = $token['name'];
             }
             if ($para < $max_paragraphs) {
                 $summary[] = $token;
             }
             if ($para >= $max_paragraphs) {
                 $remainder[] = $token;
                 $showmore = true;
             }
             if ($token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE) {
                 if (in_array($token['name'], $stack, true)) {
                     // Discard unclosed inner elements, then the matching one.
                     while (end($stack) !== $token['name']) {
                         array_pop($stack);
                     }
                     array_pop($stack);
                 }
                 // Back at the top level: one more paragraph is complete.
                 if (count($stack) == 0) {
                     $para++;
                 }
             }
         }
     }
     if ($showmore && $post->slug == Controller::get_var('slug')) {
         // Requested slug matches this post: show everything and mark where
         // the "read more" link lands. Skipped when nothing follows the
         // summary, so the marker never points at empty space.
         $content = $summary . '<div id="more" class="moreanchor">' . 'Continues here &#8594;' . '</div>' . $remainder;
     } elseif ($showmore) {
         $content = $summary . '<p class="more"><a href="' . $post->permalink . '#more">' . $more_text . '</a></p>';
     } else {
         $content = $summary . $remainder;
     }
     return $content;
 }
Beispiel #2
0
 /**
  * Extract an entity like "<!DOCTYPE html>".
  *
  * Consumes tokens from $this->tokenizer, starting at the current "<!" token,
  * up to and including the closing ">".
  *
  * @return array|string On success an array with the keys 0 (the "<!" opener
  *                      plus T_DTD_ENTITY text), 1 (list of T_DTD_ATTRIBUTES
  *                      values), 2 (the closing ">") and 'html' (all consumed
  *                      text). When an unexpected token is met, or the tokens
  *                      run out before ">", a warning is recorded and the text
  *                      consumed so far is returned as a string.
  * @throws \Exception If the current token is not "<!".
  */
 private function extractEntity()
 {
     $entity = array(0 => '', 1 => [], 2 => '', 'html' => '');
     $token = $this->tokenizer->current();
     if ($token !== '<!') {
         // Sanity check
         throw new \Exception('Sanity check failed. Expected a "<!" token');
     }
     while ($this->tokenizer->valid()) {
         $token = $this->tokenizer->current();
         $this->tokenizer->next();
         // Tokens are either plain strings or arrays of (type, text); only
         // array tokens are indexed so that string tokens are never read by
         // offset.
         $isArray = is_array($token);
         $text = $isArray ? $token[1] : $token;
         $entity['html'] .= $text;
         if (($isArray && $token[0] == 'T_DTD_ENTITY') || $token === '<!') {
             $entity[0] .= $text;
         } elseif ($isArray && $token[0] == 'T_DTD_ATTRIBUTES') {
             $entity[1][] = $token[1];
         } elseif ($token === '>') {
             $entity[2] = $token;
             return $entity;
         } else {
             $this->warnings[] = 'TagIterator: Unexpected token in entity: "' . ($isArray ? '[' . $token[0] . '] ' . $token[1] : $token) . '"';
             return $entity['html'];
         }
     }
     // The tokens ran out before the closing ">": report it instead of
     // silently falling off the end of the function and returning null.
     $this->warnings[] = 'TagIterator: Unexpected end of input in entity: "' . $entity['html'] . '"';
     return $entity['html'];
 }
Beispiel #3
0
 /**
  * Finishes an attribute value and, unless the parent handler returns false,
  * stores it on the attribute currently being built.
  *
  * The current quote is set to the attribute quote only for the duration of
  * the parent call and is reset to null afterwards.
  *
  * @param mixed $ch Passed through unchanged to the parent implementation.
  * @return mixed Whatever the parent implementation returned.
  */
 public function attValueEnd($ch)
 {
     $this->c_quote = $this->att_quote;
     // Snapshot the buffered text before the parent handler runs.
     $bufferedValue = $this->stack;
     $outcome = parent::attValueEnd($ch);
     $rejected = ($outcome === false);

     if ($this->debug) {
         fx::debug($rejected, $bufferedValue);
     }

     if (!$rejected) {
         $this->c_att['value'] = $bufferedValue;
         $this->addAtt();
         $this->stack = '';
     }

     $this->c_quote = null;

     return $outcome;
 }
Beispiel #4
0
 /**
  * Splits a text into smaller parts of at most $maxLength bytes.
  *
  * Plain text is cut after sentence-ending periods (a "." followed by
  * whitespace, which is normalized to ". "). Consecutive sentences are packed
  * into one part while the part stays shorter than $maxLength. A sentence that
  * does not fit in a part on its own is cut after the last space inside each
  * $maxLength window, or hard-cut at $maxLength when the window has no space.
  *
  * @TODO Improve me to work with HTML
  * @param string $text Text to split in smaller parts
  * @param int $maxLength Maximum length of a text part
  * @param boolean $html If true the splitting is delegated to HtmlTokenizer
  * @return array List of text parts; a text that already fits is returned as
  *               the only element.
  */
 protected function _splitText($text, $maxLength, $html = false)
 {
     if (strlen($text) <= $maxLength) {
         return array($text);
     }
     if ($html) {
         App::import('Lib', 'I18n.HtmlTokenizer');
         $Tokenizer = new HtmlTokenizer($text);
         return $Tokenizer->tokens($maxLength);
     }
     if ($maxLength < 1) {
         // No part can satisfy a limit below one character; cutting would
         // never make progress, so the text is returned unsplit.
         return array($text);
     }

     $sentences = preg_split("/[\\.][\\s]+/", $text);
     $lastIndex = count($sentences) - 1;
     $texts = array();
     $current = '';
     foreach ($sentences as $index => $sentence) {
         // Strict comparison: a sentence consisting of "0" is real content.
         if ($sentence === '') {
             continue;
         }
         // Only sentences that were followed by a separator get it back. The
         // last piece keeps whatever ending the original text had, so a final
         // period is never doubled and none is invented.
         if ($index < $lastIndex) {
             $sentence .= '. ';
         }
         if (strlen($sentence) >= $maxLength) {
             // The sentence needs parts of its own: close the pending part.
             if ($current !== '') {
                 $texts[] = $current;
                 $current = '';
             }
             while (strlen($sentence) >= $maxLength) {
                 $sentencePart = substr($sentence, 0, $maxLength);
                 // Cut after the last space of the window; a window without
                 // any space (one very long word) is used as it is.
                 $lastSpace = strrpos($sentencePart, ' ');
                 if ($lastSpace !== false) {
                     $sentencePart = substr($sentencePart, 0, $lastSpace + 1);
                 }
                 $texts[] = $sentencePart;
                 $sentence = (string) substr($sentence, strlen($sentencePart));
             }
             if ($sentence !== '') {
                 $texts[] = $sentence;
             }
         } elseif (strlen($current) + strlen($sentence) < $maxLength) {
             $current .= $sentence;
         } else {
             $texts[] = $current;
             $current = $sentence;
         }
     }
     if ($current !== '') {
         $texts[] = $current;
     }
     // A text made only of separators yields no sentence at all; keep
     // returning a single empty part in that case.
     return $texts ? $texts : array('');
 }
Beispiel #5
0
	/**
	 * Returns a shortened version of whatever is passed in.
	 *
	 * The text is tokenized as HTML. Text tokens are copied until the word
	 * budget is used up or $max_paragraphs top-level elements have been closed;
	 * every element still open at that point is closed so the result stays
	 * well-formed.
	 *
	 * @param string $text A string to shorten
	 * @param integer $count Maximum words to display [100]
	 * @param integer $max_paragraphs Maximum paragraphs to display [1]
	 * @return string The string, shortened
	 */
	public static function summarize( $text, $count = 100, $max_paragraphs = 1 )
	{
		$ellipsis = '&hellip;';

		$ht = new HtmlTokenizer($text, false);
		$set = $ht->parse();

		// Open-element tokens that have not been closed yet, innermost last.
		$stack = array();
		$para = 0;
		$summary = new HTMLTokenSet();
		$set->rewind();
		$remaining_words = $count;
		// $bail lets the loop end naturally and close all open elements without adding new ones.
		$bail = false;
		for ( $token = $set->current(); $set->valid(); $token = $set->next() ) {
			if ( !$bail && $token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN ) {
				$stack[] = $token;
			}
			if ( !$bail ) {
				switch ( $token['type'] ) {
					case HTMLTokenizer::NODE_TYPE_TEXT:
						$words = preg_split( '/(\\s+)/u', $token['value'], -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
						// word count is doubled because spaces between words are captured as their own array elements via PREG_SPLIT_DELIM_CAPTURE
						$words = array_slice( $words, 0, $remaining_words * 2 );
						$remaining_words -= count( $words ) / 2;
						$token['value'] = implode( '', $words );
						if ( $remaining_words <= 0 ) {
							// Word budget exhausted: mark the cut and stop adding content.
							$token['value'] .= $ellipsis;
							$bail = true;
						}
						$summary[] = $token;
						break;
					case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
						// don't handle this case here
						break;
					default:
						$summary[] = $token;
						break;
				}
			}
			if ( $token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE ) {
				// Emit close tokens for the open elements: down to the matching
				// one normally, or for all of them once bailing. Nothing is
				// emitted when no element is open, so a stray close tag (or a
				// close tag seen after bailing outside any element) cannot add a
				// nameless close token to the summary.
				while ( count( $stack ) > 0 ) {
					$end = array_pop( $stack );
					$end['type'] = HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE;
					$end['attrs'] = null;
					$end['value'] = null;
					$summary[] = $end;
					if ( !$bail && $end['name'] == $token['name'] ) {
						break;
					}
				}
				if ( count( $stack ) == 0 ) {
					$para++;
				}
				if ( $bail || $para >= $max_paragraphs ) {
					break;
				}
			}
		}

		return (string) $summary;
	}
 /**
  * Scans the head section of an HTML document for OpenID discovery hints.
  *
  * Tokens before the opening head tag are skipped and scanning stops at the
  * closing head tag. Inside the head:
  *  - a link tag with rel "openid.server" / "openid.delegate" and an href sets
  *    $this->server / $this->realId to the parsed href;
  *  - a meta tag whose http-equiv matches self::HEADER_XRDS_LOCATION hands its
  *    content attribute to loadXRDS() and ends the scan.
  *
  * @param string $content HTML source to scan.
  * @return $this
  */
 protected function parseHTML($content)
 {
     $stream = StringInputStream::create($content);
     $tokenizer = HtmlTokenizer::create($stream)->lowercaseTags(true)->lowercaseAttributes(true);
     $headOpened = false;

     while ($token = $tokenizer->nextToken()) {
         $isOpenTag = $token instanceof SgmlOpenTag;

         if (!$headOpened) {
             // Nothing is inspected until the opening head tag has been seen.
             $headOpened = $isOpenTag && $token->getId() == 'head';
             continue;
         }

         if ($token instanceof SgmlEndTag && $token->getId() == 'head') {
             break;
         }

         if (!$isOpenTag) {
             continue;
         }

         $tagId = $token->getId();

         if ($tagId == 'link' && $token->hasAttribute('rel') && $token->hasAttribute('href')) {
             $relation = $token->getAttribute('rel');
             if ($relation == 'openid.server') {
                 $this->server = HttpUrl::create()->parse($token->getAttribute('href'));
             }
             if ($relation == 'openid.delegate') {
                 $this->realId = HttpUrl::create()->parse($token->getAttribute('href'));
             }
         }

         $isXrdsMeta = $tagId == 'meta'
             && $token->hasAttribute('content')
             && $token->hasAttribute('http-equiv')
             && mb_strtolower($token->getAttribute('http-equiv')) == self::HEADER_XRDS_LOCATION;
         if ($isXrdsMeta) {
             $this->loadXRDS($token->getAttribute('content'));
             // The XRDS location ends the scan.
             break;
         }
     }

     return $this;
 }
Beispiel #7
0
 /**
  * Auxiliary function to extract tokens from a child node.
  *
  * Appends the node's opening tag, one or more tokens per child and the
  * closing tag to the list of tokens collected so far. Children that have
  * children of their own are tokenized with a new HtmlTokenizer; their tokens
  * are joined into one when together they fit within $length.
  *
  * @param DomNode $child the child node to extract tokens from
  * @param array $tokens array of tokens previously extracted
  * @param int $length maximum size for each token extracted from the HTML document
  * @return array list of tokens
  */
 protected function _tokenizeChild($child, $tokens, $length)
 {
     $attributes = $this->_getNodeAttributes($child);
     if ($attributes) {
         $attributes = ' ' . $attributes;
     }
     $tokens[] = "<{$child->tagName}{$attributes}>";
     foreach ($child->childNodes as $c) {
         $cContent = $this->_getTokenContents($c);
         if (!$c->hasChildNodes()) {
             // Skip whitespace-only leaves. The comparison is explicit because
             // a leaf whose content is "0" is falsy in PHP but is real content.
             if (trim((string) $cContent) === '') {
                 continue;
             }
             if (isset($child->tagName) && $child->tagName == 'code') {
                 $cContent = str_replace('??>', '?>', $cContent);
             }
             $tokens[] = $cContent;
             continue;
         }
         $tokenizer = new HtmlTokenizer($cContent);
         $newTokens = $tokenizer->tokens($length, $c);
         if (array_sum(array_map('strlen', $newTokens)) <= $length) {
             // The whole subtree fits into a single token.
             $tokens[] = join('', $newTokens);
         } else {
             $tokens = array_merge($tokens, $newTokens);
         }
     }
     $tokens[] = "</{$child->tagName}>";
     return $tokens;
 }