/**
 * Splits a UTF-8 string into words at the positions reported by
 * find_line_break().
 *
 * The content is trimmed first; if nothing remains, an empty array is
 * returned. Otherwise the string is decoded into individual UTF-8 characters,
 * each character is mapped to a line breaking class via
 * get_line_break_class(), and find_line_break() fills in one break value per
 * character. A word is closed after every character whose break value is
 * LB_INDIRECT, LB_INDIRECT_CM, LB_DIRECT or LB_EXPLICIT.
 *
 * @param string $content Raw UTF-8 text.
 * @return array List of words; each word is passed through trim() before
 *               being stored.
 */
function break_into_words($content) {
    $content = trim($content);
    if ($content == '') {
        return array();
    }

    // Decode the byte string into parallel lists: the UTF-8 characters
    // themselves and their numeric codes.
    // Assumes get_next_utf8_char() advances $ptr (by reference) past the
    // character it returns; the loop would not terminate otherwise.
    $ptr = 0;
    $utf8_chars = array();
    $ucs2_chars = array();
    $byte_count = strlen($content);
    while ($ptr < $byte_count) {
        $utf8_char = ManagerEncoding::get_next_utf8_char($content, $ptr);
        $utf8_chars[] = $utf8_char;
        $ucs2_chars[] = utf8_to_code($utf8_char);
    }

    // Get the line breaking class of every character.
    $classes = array_map(array($this, 'get_line_break_class'), $ucs2_chars);

    // $breaks is an output argument of find_line_break(); it is initialised
    // here so it is always a defined array when counted below.
    $breaks = array();
    $this->find_line_break($classes, $breaks, count($classes));

    // Accumulate characters and close the current word whenever the break
    // value for the current position is one of the four break kinds.
    // NOTE(review): characters following the last such position are never
    // emitted; presumably find_line_break() always marks the final position
    // as a break - confirm.
    $words = array();
    $word = '';
    $break_count = count($breaks);
    for ($i = 0; $i < $break_count; $i++) {
        $word .= $utf8_chars[$i];

        $break = $breaks[$i];
        if ($break == LB_INDIRECT ||
            $break == LB_INDIRECT_CM ||
            $break == LB_DIRECT ||
            $break == LB_EXPLICIT) {
            $words[] = trim($word);
            $word = '';
        }
    }

    return $words;
}
/**
 * Converts one raw UTF-8 word into a TextBox made of subwords and appends
 * that box to this object via add_child().
 *
 * Characters are accumulated into the current subword for as long as the
 * encoding manager's mapping for the character contains the current encoding
 * (initially 'iso-8859-1'). When a character's mapping lacks the current
 * encoding, the pending subword is flushed and a new one is started in the
 * first encoding listed in that mapping. Characters with no mapping at all
 * are registered through addCustomChar() and emitted as their own subword in
 * the manager's custom encoding.
 *
 * Soft hyphens (SYMBOL_SHY) are not added to the subword; the byte length of
 * the subword at that point is recorded in $hyphens instead.
 *
 * @TODO: handle characters without known glyph names
 *
 * @param string $raw_content Raw UTF-8 word.
 * @param mixed  $pipeline    Passed through to TextBox::create_empty().
 * @return bool false when $raw_content is the empty string (nothing is
 *              added), true otherwise.
 */
function process_word($raw_content, &$pipeline) {
    if ($raw_content === '') {
        return false;
    }

    $ptr = 0;
    $word = '';
    $hyphens = array();
    $encoding = 'iso-8859-1';

    $manager_encoding =& ManagerEncoding::get();
    $text_box =& TextBox::create_empty($pipeline);

    $len = strlen($raw_content);
    while ($ptr < $len) {
        // Assumes getNextUTF8Char() advances $ptr (by reference) past the
        // character it returns; the loop would not terminate otherwise.
        $char = $manager_encoding->getNextUTF8Char($raw_content, $ptr);

        // A soft hyphen is removed from the word (it should not be drawn
        // normally); only its location inside the current subword is stored.
        if ($char == SYMBOL_SHY) {
            $hyphens[] = strlen($word);
            continue;
        }

        $mapping = $manager_encoding->getMapping($char);

        if (is_null($mapping)) {
            // No mapping to the predefined encoding vectors was found for
            // this character: flush the pending subword, then add the
            // character as a single-character subword in the custom encoding.
            if ($word !== '') {
                $text_box->add_subword($word, $encoding, $hyphens);
            }

            // NOTE(review): $hyphens is passed to the custom-character
            // subword as well and is not reset afterwards, unlike the
            // encoding-switch branch below - confirm this is intended.
            $custom_char = $manager_encoding->addCustomChar(utf8_to_code($char));
            $text_box->add_subword($custom_char, $manager_encoding->getCustomEncodingName(), $hyphens);

            $word = '';
            continue;
        }

        if (isset($mapping[$encoding])) {
            // The character is available in the current encoding.
            $word .= $mapping[$encoding];
            continue;
        }

        // The character needs a different encoding. Flushing only a
        // non-empty subword prevents empty text boxes from appearing (for
        // example when a word starts with a national character), which in
        // rare cases caused random line wraps in narrow containers.
        if ($word !== '') {
            $text_box->add_subword($word, $encoding, $hyphens);
        }

        // Switch to the first encoding offered by the mapping. reset()/key()
        // replaces each(), which was removed in PHP 8.0.
        reset($mapping);
        $encoding = key($mapping);
        $word = $mapping[$encoding];
        $hyphens = array();
    }

    // Flush whatever is left after the last character.
    if ($word !== '') {
        $text_box->add_subword($word, $encoding, $hyphens);
    }

    $this->add_child($text_box);
    return true;
}