コード例 #1
0
ファイル: google_translate.php プロジェクト: real34/i18n
 /**
  * Splits a text in smaller parts that can be processed one at a time.
  *
  * Lengths are measured in bytes (strlen), not in characters. A text of
  * $maxLength bytes or fewer is returned unchanged as a single part.
  *
  * Plain text is split into sentences on a period followed by whitespace;
  * that delimiter is re-emitted as ". " (so a whitespace run collapses to one
  * space). Consecutive sentences are packed together while the part stays
  * shorter than $maxLength. A sentence that cannot fit in one part is cut
  * after the last space that fits, or hard-cut at $maxLength bytes when the
  * window contains no space, so plain-text parts never exceed $maxLength.
  *
  * When $html is true the splitting is delegated to I18n.HtmlTokenizer.
  *
  * @TODO Improve me to work with HTML
  * @param string $text Text to split in smaller parts
  * @param int $maxLength Maximum length (in bytes) of a text part
  * @param boolean $html If true the text is tokenized with HtmlTokenizer
  * @return array List of text parts, in their original order
  * @throws InvalidArgumentException If plain text has to be split and $maxLength is lower than 1
  */
 protected function _splitText($text, $maxLength, $html = false)
 {
     if (strlen($text) <= $maxLength) {
         return array($text);
     }
     if ($html) {
         App::import('Lib', 'I18n.HtmlTokenizer');
         $Tokenizer = new HtmlTokenizer($text);
         return $Tokenizer->tokens($maxLength);
     }
     if ($maxLength < 1) {
         // A non-positive limit can never be satisfied and used to loop forever.
         throw new InvalidArgumentException('$maxLength must be greater than 0 to split a text');
     }

     $sentences = preg_split("/[\\.][\\s]+/", $text);
     if ($sentences === false) {
         // Regex engine failure: fall back to treating the text as one sentence.
         $sentences = array($text);
     }
     $lastIndex = count($sentences) - 1;
     $texts = array();
     $current = '';
     foreach ($sentences as $index => $sentence) {
         if ($sentence === '') {
             continue;
         }
         // Every element but the last was followed by a ". " style delimiter.
         // The last one was not, so nothing is appended to it: this avoids
         // both a spurious ". " and a doubled period ("end.. ").
         if ($index < $lastIndex) {
             $sentence .= '. ';
         }
         if (strlen($current) + strlen($sentence) < $maxLength) {
             $current .= $sentence;
             continue;
         }
         // The sentence does not fit: close the part being built first, so
         // an oversized sentence is never stored whole after a non-empty part.
         if ($current !== '') {
             $texts[] = $current;
             $current = '';
         }
         if (strlen($sentence) < $maxLength) {
             $current = $sentence;
             continue;
         }
         while (strlen($sentence) >= $maxLength) {
             $window = substr($sentence, 0, $maxLength);
             $space = strrpos($window, ' ');
             if ($space !== false) {
                 // Cut right after the latest word that fits
                 $cut = $space + 1;
             } else {
                 // No word boundary available: hard cut. Step back up to 3
                 // bytes while the next byte looks like a UTF-8 continuation
                 // byte, so a multibyte character is not split in two
                 // (harmless if the text is not UTF-8: the part is only shorter).
                 $cut = $maxLength;
                 for ($back = 0; $back < 3 && $cut > 1 && isset($sentence[$cut]) && (ord($sentence[$cut]) & 0xC0) === 0x80; $back++) {
                     $cut--;
                 }
             }
             $texts[] = substr($sentence, 0, $cut);
             $sentence = (string) substr($sentence, $cut);
         }
         // The remainder of a cut sentence is kept as a part of its own
         if ($sentence !== '') {
             $texts[] = $sentence;
         }
     }
     if ($current !== '') {
         $texts[] = $current;
     }
     // Always hand back at least one part, even if every sentence was empty
     return $texts ? $texts : array('');
 }
コード例 #2
0
ファイル: html_tokenizer.php プロジェクト: real34/i18n
 /**
  * Auxiliary function to extract tokens from a child node.
  *
  * Appends to $tokens, in order: the opening tag of $child (with its
  * attributes), the tokens of each of its child nodes, and its closing tag.
  * A child node without children contributes its content as a single token
  * unless that content is empty or only whitespace. A child node with
  * children is tokenized with a new HtmlTokenizer; its tokens are joined
  * into one token when their total length fits in $length, and appended
  * individually otherwise.
  *
  * @param DomNode $child the child node to extract tokens from
  * @param array $tokens array of tokens previously extracted
  * @param int $length maximum size for each token extracted from the HTML document
  * @return array list of tokens
  */
 protected function _tokenizeChild($child, $tokens, $length)
 {
     $attributes = $this->_getNodeAttributes($child);
     if ($attributes) {
         $attributes = ' ' . $attributes;
     }
     $tokens[] = "<{$child->tagName}{$attributes}>";
     // Loop-invariant: whether the leaf contents below belong to a <code> element
     $insideCode = isset($child->tagName) && $child->tagName === 'code';
     foreach ($child->childNodes as $c) {
         $cContent = $this->_getTokenContents($c);
         if (!$c->hasChildNodes()) {
             // Compare against '' explicitly: a plain truthiness test would
             // also discard a node whose whole content is "0".
             if (trim((string) $cContent) === '') {
                 continue;
             }
             if ($insideCode) {
                 // NOTE(review): presumably undoes an escaping of "?>" applied
                 // before the document was parsed - confirm against the caller
                 $cContent = str_replace('??>', '?>', $cContent);
             }
             $tokens[] = $cContent;
             continue;
         }
         $tokenizer = new HtmlTokenizer($cContent);
         $newTokens = $tokenizer->tokens($length, $c);
         if (array_sum(array_map('strlen', $newTokens)) <= $length) {
             // Small enough: keep the whole sub-tree together as one token
             $tokens[] = join('', $newTokens);
         } else {
             $tokens = array_merge($tokens, $newTokens);
         }
     }
     $tokens[] = "</{$child->tagName}>";
     return $tokens;
 }