/**
 * Splits post content into a summary and a remainder and builds the HTML to output.
 *
 * The split point is an explicit <!--more--> comment when the content has one;
 * otherwise the content is tokenized and the first top-level element becomes the
 * summary. On the post's own page (the post slug equals the request's "slug"
 * variable) the full content is returned with a "#more" anchor between the two
 * parts; elsewhere only the summary plus a "read more" link is returned.
 *
 * @param string $content The post content (HTML)
 * @param object $post The post being rendered; its slug and permalink are read
 * @return string The content to output
 */
public function more($content, $post)
{
    $more_text = 'Read the rest →';
    $max_paragraphs = 1;
    $showmore = false;

    $matches = preg_split('/<!--\\s*more\\s*-->/is', $content, 2, PREG_SPLIT_NO_EMPTY);
    if (count($matches) > 1) {
        // Explicit marker: everything before it is the summary.
        $summary = $matches[0];
        $remainder = $matches[1];
        if (trim($remainder) != '') {
            $showmore = true;
        }
    } else {
        // No usable marker: cut after $max_paragraphs top-level elements.
        $ht = new HtmlTokenizer($content, false);
        $set = $ht->parse();

        // Names of the currently open elements, innermost last. This has to be a
        // list rather than a map keyed by name: nested elements sharing a tag name
        // would otherwise collapse into one entry and the inner closing tag would
        // wrongly end the top-level element.
        $stack = array();
        $para = 0;
        $summary = new HTMLTokenSet(false);
        $remainder = new HTMLTokenSet(false);

        $set->rewind();
        for ($token = $set->current(); $set->valid(); $token = $set->next()) {
            if ($token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN) {
                $stack[] = $token['name'];
            }

            if ($para < $max_paragraphs) {
                $summary[] = $token;
            } else {
                $remainder[] = $token;
                $showmore = true;
            }

            if ($token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE) {
                // Unwind to the matching open element, dropping anything left
                // unclosed inside it. A stray closing tag leaves the stack alone.
                if (in_array($token['name'], $stack, true)) {
                    do {
                        $closed = array_pop($stack);
                    } while ($closed !== $token['name']);
                }
                // Back at top level: one more paragraph is complete.
                if (count($stack) == 0) {
                    $para++;
                }
            }
        }
    }

    $is_single = ($post->slug == Controller::get_var('slug'));

    if ($is_single && $showmore) {
        // Full view: mark the spot the "read more" link points at. The anchor is
        // only emitted when there actually is a remainder to continue with.
        $content = $summary . '<div id="more" class="moreanchor">' . 'Continues here →' . '</div>' . $remainder;
    } elseif ($showmore && !$is_single) {
        $content = $summary . '<p class="more"><a href="' . $post->permalink . '#more">' . $more_text . '</a></p>';
    } else {
        $content = $summary . $remainder;
    }

    return $content;
}
/**
 * Extract an entity like "<!DOCTYPE html>".
 *
 * Consumes tokens from the tokenizer, starting at the current "<!" token, up to
 * and including the closing ">".
 *
 * @return array|string On success an array with the entity name in [0], the
 *                      list of attribute strings in [1], the closing ">" in [2]
 *                      and the raw markup in ['html']. When an unexpected token
 *                      is met, or the input ends before ">", a warning is
 *                      recorded and the raw markup consumed so far is returned
 *                      as a string.
 * @throws \Exception When the current token is not "<!"
 */
private function extractEntity()
{
    $entity = array(0 => '', 1 => [], 2 => '', 'html' => '');

    if ($this->tokenizer->current() !== '<!') { // Sanity check
        throw new \Exception('Sanity check failed. Expected a "<!" token');
    }

    while ($this->tokenizer->valid()) {
        $token = $this->tokenizer->current();
        $this->tokenizer->next();

        // Tokens are either plain strings or [type, text] pairs. Only pairs have
        // a type; never index into a plain string to look for one.
        $isPair = is_array($token);
        $type = $isPair ? $token[0] : null;
        $text = $isPair ? $token[1] : $token;

        $entity['html'] .= $text;

        if ($type === 'T_DTD_ENTITY' || $token === '<!') {
            $entity[0] .= $text;
        } elseif ($type === 'T_DTD_ATTRIBUTES') {
            $entity[1][] = $text;
        } elseif ($token === '>') {
            $entity[2] = $token;

            return $entity;
        } else {
            $this->warnings[] = 'TagIterator: Unexpected token in entity: "' . ($isPair ? '[' . $type . '] ' . $text : $token) . '"';

            return $entity['html'];
        }
    }

    // Input ended before the closing ">": report it instead of silently
    // returning null, which the documented return type does not allow.
    $this->warnings[] = 'TagIterator: Unexpected end of input in entity: "' . $entity['html'] . '"';

    return $entity['html'];
}
/**
 * Finishes an attribute value: delegates to the parent handler and, unless the
 * parent returns false, stores the text collected so far as the value of the
 * current attribute.
 *
 * @param mixed $ch Passed through unchanged to the parent implementation
 * @return mixed Whatever the parent implementation returned
 */
public function attValueEnd($ch)
{
    // The parent call runs with the attribute's own quote as the active quote.
    $this->c_quote = $this->att_quote;

    // Snapshot the buffer before the parent gets a chance to touch it.
    $collected = $this->stack;
    $result = parent::attValueEnd($ch);
    $rejected = ($result === false);

    if ($this->debug) {
        fx::debug($rejected, $collected);
    }

    if (!$rejected) {
        $this->c_att['value'] = $collected;
        $this->addAtt();
        $this->stack = '';
    }

    $this->c_quote = null;

    return $result;
}
/**
 * Splits a text in smaller parts having a length (in bytes) of at most $maxLength.
 * Plain texts are cut after a period; a single sentence that is too long is cut
 * before its latest word, and a single word that is too long is cut at $maxLength.
 *
 * Parts built from whole sentences keep a trailing ". " separator; the separator
 * is removed from the last part when the original text did not end with a period.
 *
 * @TODO Improve me to work with HTML
 * @param string $text Text to split in smaller parts
 * @param int $maxLength Maximum length of a text part
 * @param boolean $html If true the splitting is delegated to HtmlTokenizer
 * @return array
 */
protected function _splitText($text, $maxLength, $html = false) {
    if (strlen($text) <= $maxLength) {
        return array($text);
    }

    if ($html) {
        App::import('Lib', 'I18n.HtmlTokenizer');
        $Tokenizer = new HtmlTokenizer($text);
        return $Tokenizer->tokens($maxLength);
    }

    // A limit below 1 can never be satisfied and would keep the cutting loop
    // below from making progress, so hand the text back untouched.
    if ($maxLength < 1) {
        return array($text);
    }

    // Also split on a period at the very end of the text, otherwise the last
    // sentence keeps its own period and gets a second one appended below.
    $sentences = preg_split('/\.(?:\s+|$)/', $text);
    $texts = array('');
    $i = 0;
    foreach ($sentences as $sentence) {
        // Compare against '' explicitly: empty() would also drop a "0" sentence.
        if ($sentence === '') {
            continue;
        }
        $sentence .= '. ';

        if ($texts[$i] === '' && strlen($sentence) >= $maxLength) {
            // Cut the string before the latest word
            while (strlen($sentence) >= $maxLength) {
                $sentencePart = substr($sentence, 0, $maxLength);
                $lastSpace = strrpos($sentencePart, ' ');
                // Without any space in the window there is no word boundary to
                // cut at; keep the full window instead of a 1-character piece.
                if ($lastSpace !== false) {
                    $sentencePart = substr($sentencePart, 0, $lastSpace + 1);
                }
                $texts[$i++] = $sentencePart;
                $sentence = (string)substr($sentence, strlen($sentencePart));
            }
            // The loop may have consumed the sentence entirely; do not store an
            // empty part in that case.
            if ($sentence !== '') {
                $texts[$i++] = $sentence;
            }
            $texts[$i] = '';
        } elseif (strlen($texts[$i]) + strlen($sentence) < $maxLength) {
            $texts[$i] .= $sentence;
        } else {
            $i++;
            $texts[$i] = $sentence;
        }
    }

    // Drop the empty slot opened after a long sentence when nothing followed it.
    $last = count($texts) - 1;
    if ($last > 0 && $texts[$last] === '') {
        array_pop($texts);
        $last--;
    }

    // Removes the ". " of the latest text if it was not finished with a period
    if (substr(trim($text), -1) !== '.') {
        $texts[$last] = (string)substr($texts[$last], 0, -2);
    }

    return $texts;
}
/**
 * Returns a shortened version of whatever is passed in.
 *
 * The text is tokenized as HTML. Tokens are copied into the summary until either
 * the word limit or the paragraph limit is reached; elements still open at that
 * point are closed so the result stays balanced.
 *
 * @param string $text A string to shorten
 * @param integer $count Maximum words to display [100]
 * @param integer $max_paragraphs Maximum paragraphs to display [1]
 * @return string The string, shortened
 */
public static function summarize( $text, $count = 100, $max_paragraphs = 1 )
{
    $ellipsis = '…';

    $ht = new HtmlTokenizer( $text, false );
    $set = $ht->parse();

    // Open-element tokens that have not been closed yet, innermost last.
    $stack = array();
    $para = 0;
    $summary = new HTMLTokenSet();

    $set->rewind();
    $remaining_words = $count;
    // $bail lets the loop end naturally and close all open elements without adding new ones.
    $bail = false;
    for ( $token = $set->current(); $set->valid(); $token = $set->next() ) {
        if ( !$bail && $token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN ) {
            $stack[] = $token;
        }

        if ( !$bail ) {
            switch ( $token['type'] ) {
                case HTMLTokenizer::NODE_TYPE_TEXT:
                    $words = preg_split( '/(\\s+)/u', $token['value'], -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
                    // word count is doubled because spaces between words are captured as their own array elements via PREG_SPLIT_DELIM_CAPTURE
                    $words = array_slice( $words, 0, $remaining_words * 2 );
                    $remaining_words -= count( $words ) / 2;
                    $token['value'] = implode( '', $words );
                    if ( $remaining_words <= 0 ) {
                        $token['value'] .= $ellipsis;
                        $summary[] = $token;
                        $bail = true;
                    }
                    else {
                        $summary[] = $token;
                    }
                    break;
                case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                    // don't handle this case here
                    break;
                default:
                    $summary[] = $token;
                    break;
            }
        }

        if ( $token['type'] == HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE ) {
            // Only unwind when something is actually open. A closing tag that
            // arrives with an empty stack (a stray tag, or one whose opening
            // tag was skipped after $bail was set) must not produce a nameless
            // closing token.
            if ( count( $stack ) > 0 ) {
                do {
                    $end = array_pop( $stack );
                    $end['type'] = HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE;
                    $end['attrs'] = null;
                    $end['value'] = null;
                    $summary[] = $end;
                } while ( ( $bail || $end['name'] != $token['name'] ) && count( $stack ) > 0 );
            }
            // Back at top level: one more paragraph is complete.
            if ( count( $stack ) == 0 ) {
                $para++;
            }
            if ( $bail || $para >= $max_paragraphs ) {
                break;
            }
        }
    }

    return (string) $summary;
}
/**
 * Scans the <head> of an HTML document for OpenID discovery information.
 *
 * <link rel="openid.server"> and <link rel="openid.delegate"> set the server and
 * real-id URLs. A <meta http-equiv> whose lower-cased value equals
 * self::HEADER_XRDS_LOCATION hands its content to loadXRDS() and stops the scan.
 * Scanning also stops at </head>; nothing before <head> is inspected.
 *
 * @param string $content HTML source to scan
 * @return $this
 */
protected function parseHTML($content)
{
    $stream = StringInputStream::create($content);
    $tokenizer = HtmlTokenizer::create($stream)
        ->lowercaseTags(true)
        ->lowercaseAttributes(true);

    $headOpened = false;

    while ($token = $tokenizer->nextToken()) {
        // Until <head> shows up every token is skipped; the <head> tag itself
        // only flips the flag.
        if (!$headOpened) {
            $headOpened = $token instanceof SgmlOpenTag && $token->getId() == 'head';
            continue;
        }

        if ($token instanceof SgmlEndTag && $token->getId() == 'head') {
            break;
        }

        // Inside <head> only opening tags carry discovery data.
        if (!($token instanceof SgmlOpenTag)) {
            continue;
        }

        $tagId = $token->getId();

        if ($tagId == 'link' && $token->hasAttribute('rel') && $token->hasAttribute('href')) {
            $rel = $token->getAttribute('rel');
            if ($rel == 'openid.server') {
                $this->server = HttpUrl::create()->parse($token->getAttribute('href'));
            }
            if ($rel == 'openid.delegate') {
                $this->realId = HttpUrl::create()->parse($token->getAttribute('href'));
            }
        }

        $isXrdsMeta =
            $tagId == 'meta'
            && $token->hasAttribute('content')
            && $token->hasAttribute('http-equiv')
            && mb_strtolower($token->getAttribute('http-equiv')) == self::HEADER_XRDS_LOCATION;

        if ($isXrdsMeta) {
            $this->loadXRDS($token->getAttribute('content'));
            return $this;
        }
    }

    return $this;
}
/**
 * Auxiliary function to extract tokens from a child node
 *
 * Appends the node's opening tag, the tokens of its children and its closing
 * tag to the list of tokens received.
 *
 * @param DomNode $child the child node to extract tokens from
 * @param array $tokens array of tokens previously extracted
 * @param int $length maximum size for each token extracted from the HTML document
 * @return array list of tokens
 */
protected function _tokenizeChild($child, $tokens, $length) {
    $attributes = $this->_getNodeAttributes($child);
    if ($attributes) {
        $attributes = ' ' . $attributes;
    }
    $tokens[] = "<{$child->tagName}{$attributes}>";

    foreach ($child->childNodes as $c) {
        $cContent = $this->_getTokenContents($c);

        if (!$c->hasChildNodes()) {
            // Skip whitespace-only leaves. The comparison is explicit because a
            // plain truthiness check would also discard a leaf containing "0".
            if (trim((string)$cContent) === '') {
                continue;
            }
            if (isset($child->tagName) && $child->tagName == 'code') {
                $cContent = str_replace('??>', '?>', $cContent);
            }
            $tokens[] = $cContent;
            continue;
        }

        // Children with their own children are tokenized recursively.
        $tokenizer = new HtmlTokenizer($cContent);
        $newTokens = $tokenizer->tokens($length, $c);

        // Keep the child as a single token when all of it fits in $length.
        if (array_sum(array_map('strlen', $newTokens)) <= $length) {
            $tokens[] = join('', $newTokens);
        } else {
            $tokens = array_merge($tokens, $newTokens);
        }
    }

    $tokens[] = "</{$child->tagName}>";
    return $tokens;
}