/**
 * Splits a text into smaller parts of at most $maxLength bytes.
 *
 * Plain text is cut after a period followed by whitespace whenever possible;
 * consecutive sentences are packed into the same part while they fit, and the
 * whitespace following such a period is normalized to a single space.
 * A sentence that is too long on its own is cut after the last space that fits,
 * or hard-cut at $maxLength when it contains no space at all.
 * Lengths are measured with strlen(), i.e. in bytes.
 *
 * @TODO Improve me to work with HTML
 * @param string $text Text to split in smaller parts
 * @param int $maxLength Maximum length of a text part
 * @param boolean $html If true the splitting is delegated to HtmlTokenizer
 * @return array List of text parts, in their original order
 */
protected function _splitText($text, $maxLength, $html = false) {
    // Short enough already: nothing to split.
    if (strlen($text) <= $maxLength) {
        return array($text);
    }

    if ($html) {
        App::import('Lib', 'I18n.HtmlTokenizer');
        $Tokenizer = new HtmlTokenizer($text);
        return $Tokenizer->tokens($maxLength);
    }

    // A non-positive limit can never be satisfied; bail out instead of looping forever.
    if ($maxLength < 1) {
        return array($text);
    }

    $sentences = preg_split("/[\\.][\\s]+/", $text);
    if ($sentences === false) {
        return array($text);
    }

    $lastIndex = count($sentences) - 1;
    $texts = array();
    $current = '';

    foreach ($sentences as $index => $sentence) {
        // Strict comparison so that a sentence consisting of "0" is not discarded.
        if ($sentence === '') {
            continue;
        }

        // Only pieces that were followed by a separator get it back. The last
        // piece keeps whatever ending it had in the original text, which avoids
        // both a doubled period and a fake period on unfinished texts.
        if ($index < $lastIndex) {
            $sentence .= '. ';
        }

        if ($current !== '') {
            if (strlen($current) + strlen($sentence) < $maxLength) {
                $current .= $sentence;
                continue;
            }
            // Does not fit: close the current part before handling the sentence.
            $texts[] = $current;
            $current = '';
        }

        if (strlen($sentence) < $maxLength) {
            $current = $sentence;
            continue;
        }

        // The sentence alone is too long: cut it before the latest word that fits.
        while (strlen($sentence) >= $maxLength) {
            $sentencePart = substr($sentence, 0, $maxLength);
            $lastSpace = strrpos($sentencePart, ' ');
            if ($lastSpace !== false) {
                $sentencePart = substr($sentencePart, 0, $lastSpace + 1);
            }
            // No space in the window: keep the hard cut at $maxLength bytes.
            $texts[] = $sentencePart;
            $sentence = (string)substr($sentence, strlen($sentencePart));
        }

        // The remainder of a cut sentence is emitted as its own part.
        if ($sentence !== '') {
            $texts[] = $sentence;
        }
    }

    if ($current !== '') {
        $texts[] = $current;
    }

    // Keep returning at least one element, as callers may rely on index 0.
    if (empty($texts)) {
        $texts[] = '';
    }

    return $texts;
}
/**
 * Auxiliary function to extract tokens from a child node.
 *
 * Appends the opening tag of $child, then one or more tokens for each of its
 * child nodes, then the closing tag. Leaf nodes contribute their content as a
 * single token (whitespace-only content is skipped). Nodes that have children
 * are tokenized with a new HtmlTokenizer; the resulting tokens are collapsed
 * into one token when their combined length fits in $length, otherwise they
 * are appended individually.
 *
 * @param DomNode $child the child node to extract tokens from
 * @param array $tokens array of tokens previously extracted
 * @param int $length maximum size for each token extracted from the HTML document
 * @return array list of tokens
 */
protected function _tokenizeChild($child, $tokens, $length) {
    $attributes = $this->_getNodeAttributes($child);
    if ($attributes) {
        $attributes = ' ' . $attributes;
    }
    $tokens[] = "<{$child->tagName}{$attributes}>";

    foreach ($child->childNodes as $c) {
        $cContent = $this->_getTokenContents($c);

        if (!$c->hasChildNodes()) {
            // Compare against '' explicitly: a plain truthiness check would also
            // discard a leaf whose whole content is the string "0".
            if (trim((string)$cContent) === '') {
                continue;
            }
            // Inside <code>, '??>' is turned back into '?>'.
            // NOTE(review): presumably undoes an escape applied elsewhere - confirm.
            if (isset($child->tagName) && $child->tagName === 'code') {
                $cContent = str_replace('??>', '?>', $cContent);
            }
            $tokens[] = $cContent;
            continue;
        }

        $tokenizer = new HtmlTokenizer($cContent);
        $newTokens = $tokenizer->tokens($length, $c);

        if (array_sum(array_map('strlen', $newTokens)) <= $length) {
            // Everything fits in a single token: keep the nested markup together.
            $tokens[] = implode('', $newTokens);
        } else {
            $tokens = array_merge($tokens, $newTokens);
        }
    }

    $tokens[] = "</{$child->tagName}>";

    return $tokens;
}