/**
 * Check that phutil_utf8v() splits each sample string into the expected
 * vector of characters. The samples cover the empty string, ASCII text, a
 * multibyte character, and a combining mark (expected as its own element).
 */
public function testUTF8v() {
  $cases = array(
    '' => array(),
    'x' => array('x'),
    'quack' => array('q', 'u', 'a', 'c', 'k'),
    "x東y" => array('x', "東", 'y'),
    "x͠y" => array("x", "͠", 'y'),
  );

  foreach ($cases as $input => $expected_vector) {
    $actual_vector = phutil_utf8v($input);
    $this->assertEqual(
      $expected_vector,
      $actual_vector,
      'Vector of ' . $input);
  }
}
/**
 * Render the daemon log events as a table.
 *
 * Unless the full message was requested, each message is cut down to at most
 * 12 lines and then to at most 8192 characters, and a link to the full event
 * is appended after the truncated text.
 *
 * @return wild The value returned by AphrontTableView::render().
 */
public function render() {
  $viewer = $this->getViewer();

  $rows = array();
  foreach ($this->events as $event) {
    // Limit display log size. If a daemon gets stuck in an output loop this
    // page can be like >100MB if we don't truncate stuff. Try to do cheap
    // line-based truncation first, and fall back to expensive UTF-8 character
    // truncation if that doesn't get things short enough.
    $message = $event->getMessage();

    $more = null;
    if (!$this->showFullMessage) {
      $more_lines = null;
      $more_chars = null;

      $line_limit = 12;
      if (substr_count($message, "\n") > $line_limit) {
        $message = explode("\n", $message);
        $more_lines = count($message) - $line_limit;
        $message = array_slice($message, 0, $line_limit);
        $message = implode("\n", $message);
      }

      $char_limit = 8192;
      if (strlen($message) > $char_limit) {
        // The byte length is only an upper bound on the character count: a
        // message made of multibyte characters can exceed the limit in bytes
        // while being under it in characters. Only truncate (and only report
        // hidden characters) when the character count is really over the
        // limit, otherwise we would report a negative number of characters.
        $message_v = phutil_utf8v($message);
        $char_count = count($message_v);
        if ($char_count > $char_limit) {
          $more_chars = $char_count - $char_limit;
          $message_v = array_slice($message_v, 0, $char_limit);
          $message = implode('', $message_v);
        }
      }

      // Character truncation takes precedence over line truncation when
      // describing what was hidden.
      if ($more_chars) {
        $more = new PhutilNumber($more_chars);
        $more = pht('Show %d more character(s)...', $more);
      } else if ($more_lines) {
        $more = new PhutilNumber($more_lines);
        $more = pht('Show %d more line(s)...', $more);
      }

      if ($more) {
        $id = $event->getID();
        $more = array(
          "\n...\n",
          phutil_tag('a', array('href' => "/daemon/event/{$id}/"), $more),
        );
      }
    }

    $row = array(
      $event->getLogType(),
      phabricator_date($event->getEpoch(), $viewer),
      phabricator_time($event->getEpoch(), $viewer),
      array($message, $more),
    );

    // In a combined log, lead each row with a link to the owning daemon log.
    if ($this->combinedLog) {
      array_unshift(
        $row,
        phutil_tag(
          'a',
          array('href' => '/daemon/log/' . $event->getLogID() . '/'),
          pht('Daemon %s', $event->getLogID())));
    }

    $rows[] = $row;
  }

  $classes = array('', '', 'right', 'wide prewrap');
  $headers = array('Type', 'Date', 'Time', 'Message');
  if ($this->combinedLog) {
    array_unshift($classes, 'pri');
    array_unshift($headers, 'Daemon');
  }

  $log_table = new AphrontTableView($rows);
  $log_table->setHeaders($headers);
  $log_table->setColumnClasses($classes);

  return $log_table->render();
}
/**
 * Render the daemon log events as a table of escaped HTML cells.
 *
 * Each message is cut down to at most 12 lines and then to at most 8192
 * characters; a note describing how much was hidden is appended to the
 * truncated text.
 *
 * @return wild The value returned by AphrontTableView::render().
 * @throws Exception If no user has been set on this view.
 */
public function render() {
  $rows = array();

  if (!$this->user) {
    throw new Exception("Call setUser() before rendering!");
  }

  foreach ($this->events as $event) {
    // Limit display log size. If a daemon gets stuck in an output loop this
    // page can be like >100MB if we don't truncate stuff. Try to do cheap
    // line-based truncation first, and fall back to expensive UTF-8 character
    // truncation if that doesn't get things short enough.
    $message = $event->getMessage();

    $more_lines = null;
    $more_chars = null;

    $line_limit = 12;
    if (substr_count($message, "\n") > $line_limit) {
      $message = explode("\n", $message);
      $more_lines = count($message) - $line_limit;
      $message = array_slice($message, 0, $line_limit);
      $message = implode("\n", $message);
    }

    $char_limit = 8192;
    if (strlen($message) > $char_limit) {
      // The byte length is only an upper bound on the character count: a
      // message made of multibyte characters can exceed the limit in bytes
      // while being under it in characters. Only truncate (and only report
      // hidden characters) when the character count is really over the
      // limit, otherwise we would report a negative number of characters.
      $message_v = phutil_utf8v($message);
      $char_count = count($message_v);
      if ($char_count > $char_limit) {
        $more_chars = $char_count - $char_limit;
        $message_v = array_slice($message_v, 0, $char_limit);
        $message = implode('', $message_v);
      }
    }

    // Character truncation takes precedence over line truncation when
    // describing what was hidden.
    $more = null;
    if ($more_chars) {
      $more = number_format($more_chars);
      $more = "\n<... {$more} more characters ...>";
    } else if ($more_lines) {
      $more = number_format($more_lines);
      $more = "\n<... {$more} more lines ...>";
    }

    $row = array(
      phutil_escape_html($event->getLogType()),
      phabricator_date($event->getEpoch(), $this->user),
      phabricator_time($event->getEpoch(), $this->user),
      str_replace("\n", '<br />', phutil_escape_html($message . $more)),
    );

    // In a combined log, lead each row with a link to the owning daemon log.
    if ($this->combinedLog) {
      array_unshift(
        $row,
        phutil_render_tag(
          'a',
          array('href' => '/daemon/log/' . $event->getLogID() . '/'),
          phutil_escape_html('Daemon ' . $event->getLogID())));
    }

    $rows[] = $row;
  }

  $classes = array('', '', 'right', 'wide wrap');
  $headers = array('Type', 'Date', 'Time', 'Message');
  if ($this->combinedLog) {
    array_unshift($classes, 'pri');
    array_unshift($headers, 'Daemon');
  }

  $log_table = new AphrontTableView($rows);
  $log_table->setHeaders($headers);
  $log_table->setColumnClasses($classes);

  return $log_table->render();
}
/**
 * Colorize source text one character at a time.
 *
 * Every character other than a space or a newline is escaped and wrapped in
 * a <span> whose class cycles through the seven "rbw_*" classes. Spaces and
 * newlines are emitted untouched and do not advance the cycle.
 *
 * @param string Source text to highlight.
 * @return ImmediateFuture Future wrapping the generated markup string.
 */
public function getHighlightFuture($source) {
  $palette = array(
    'rbw_r',
    'rbw_o',
    'rbw_y',
    'rbw_g',
    'rbw_b',
    'rbw_i',
    'rbw_v',
  );
  $palette_size = count($palette);
  $next = 0;

  $pieces = array();
  foreach (phutil_utf8v($source) as $glyph) {
    $is_whitespace = ($glyph == ' ' || $glyph == "\n");

    if ($is_whitespace) {
      $pieces[] = $glyph;
    } else {
      $class = $palette[$next];
      $pieces[] =
        '<span class="' . $class . '">' .
        phutil_escape_html($glyph) .
        '</span>';
      $next = ($next + 1) % $palette_size;
    }
  }

  return new ImmediateFuture(implode('', $pieces));
}
/**
 * Colorize source text one character at a time.
 *
 * Every character other than a space or a newline is wrapped in a "span"
 * tag (built with phutil_tag()) whose class cycles through the seven
 * "rbw_*" classes. Spaces and newlines are emitted untouched and do not
 * advance the cycle.
 *
 * @param string Source text to highlight.
 * @return ImmediateFuture Future wrapping the result of
 *   phutil_implode_html() over the generated pieces.
 */
public function getHighlightFuture($source) {
  $palette = array(
    'rbw_r',
    'rbw_o',
    'rbw_y',
    'rbw_g',
    'rbw_b',
    'rbw_i',
    'rbw_v',
  );
  $palette_size = count($palette);
  $next = 0;

  $pieces = array();
  foreach (phutil_utf8v($source) as $glyph) {
    $is_whitespace = ($glyph == ' ' || $glyph == "\n");

    if ($is_whitespace) {
      $pieces[] = $glyph;
    } else {
      $pieces[] = phutil_tag(
        'span',
        array('class' => $palette[$next]),
        $glyph);
      $next = ($next + 1) % $palette_size;
    }
  }

  $markup = phutil_implode_html('', $pieces);
  return new ImmediateFuture($markup);
}
/**
 * Break the input text into a list of displayable parts, one per character.
 *
 * Each part is a dictionary with a "special" flag and a "value":
 *
 *   - characters listed in self::$invisibleChars become "<name>" and are
 *     flagged special;
 *   - other characters whose first byte is below 32 become "<0x..>" (the
 *     hex encoding of the character) and are flagged special;
 *   - everything else is passed through unchanged and is not special.
 *
 * @return list<dict> One part per character of the input text.
 */
public function getStringParts() {
  $glyphs = phutil_utf8v($this->inputText);
  $total = count($glyphs);

  $parts = array();
  for ($pos = 0; $pos < $total; $pos++) {
    $glyph = $glyphs[$pos];

    if (array_key_exists($glyph, self::$invisibleChars)) {
      $is_special = true;
      $display = '<' . self::$invisibleChars[$glyph] . '>';
    } else if (ord($glyph) < 32) {
      $is_special = true;
      $display = '<0x' . bin2hex($glyph) . '>';
    } else {
      $is_special = false;
      $display = $glyph;
    }

    $parts[] = array(
      'special' => $is_special,
      'value' => $display,
    );
  }

  return $parts;
}
/**
 * Suggest corrections for a possibly misspelled input.
 *
 * The input and every option are normalized, the edit distance between the
 * input and each option is computed with the configured matrix, and the
 * options at the smallest distance (capped by the maximum distance) are
 * returned.
 *
 * @param string Input to correct.
 * @param list<string> Valid options to correct toward.
 * @return list Normalized options which are the best matches, possibly
 *   empty.
 * @throws PhutilInvalidStateException If the edit distance matrix or the
 *   maximum distance has not been configured.
 */
public function correctSpelling($input, array $options) {
  $matrix = $this->getEditDistanceMatrix();
  if (!$matrix) {
    throw new PhutilInvalidStateException('setEditDistanceMatrix');
  }

  $max_distance = $this->getMaximumDistance();
  if (!$max_distance) {
    throw new PhutilInvalidStateException('setMaximumDistance');
  }

  $input = $this->normalizeString($input);
  foreach ($options as $idx => $raw_option) {
    $options[$idx] = $this->normalizeString($raw_option);
  }

  // Measure how far every option is from the input.
  $input_vector = phutil_utf8v($input);
  $distances = array();
  foreach ($options as $candidate) {
    $matrix->setSequences(phutil_utf8v($candidate), $input_vector);
    $distances[$candidate] = $matrix->getEditDistance();
  }

  asort($distances);

  // Keep only the options at the best distance, never looking further than
  // the configured maximum.
  $best = min($max_distance, head($distances));
  $closest = array();
  foreach ($distances as $candidate => $distance) {
    if ($distance > $best) {
      continue;
    }
    $closest[$candidate] = $distance;
  }

  // Before filtering, check if we have multiple equidistant matches and
  // return them if we do. This prevents us from, e.g., matching "alnd" with
  // both "land" and "amend", then dropping "land" for being too short, and
  // incorrectly completing to "amend".
  if (count($closest) > 1) {
    return array_keys($closest);
  }

  // Drop a match which is shorter than the number of edits needed to reach
  // it.
  $plausible = array();
  foreach ($closest as $candidate => $distance) {
    if (phutil_utf8_strlen($candidate) < $distance) {
      continue;
    }
    $plausible[$candidate] = $distance;
  }

  return array_keys($plausible);
}
/**
 * Fold a single ICS content line so that no physical line is longer than
 * 75 bytes (not counting the trailing CRLF).
 *
 * Continuation lines begin with a single space, which counts toward the
 * limit. Lines are only split between characters returned by
 * phutil_utf8v(), so a sequence of combining characters may be split in the
 * middle. This is okay and generally anticipated by RFC5545, which even
 * allows implementations to split multibyte characters; the parser on the
 * other end stitches the sequence back together.
 *
 * @param string Unfolded content line.
 * @return string Folded line(s), each terminated with CRLF.
 */
private function wrapICSLine($line) {
  $folded = '';
  $current = '';

  foreach (phutil_utf8v($line) as $glyph) {
    // Start a new physical line if this character would not fit.
    $projected = strlen($current) + strlen($glyph);
    if ($projected > 75) {
      $folded .= $current . "\r\n";
      $current = ' ';
    }

    $current .= $glyph;
  }

  return $folded . $current . "\r\n";
}
/**
 * Split a UTF-8 string into an array of characters. Combining characters
 * are not split: each one is appended to the element that precedes it.
 *
 * If the string begins with a combining character, the first element of the
 * result is a space followed by that combining character (and any further
 * combining characters which directly follow it).
 *
 * This runs in a single pass over the characters. Removing and reindexing
 * elements of the vector while walking it would cost time quadratic in the
 * number of combining characters.
 *
 * @param string A valid utf-8 string.
 * @return list A list of characters in the string.
 */
function phutil_utf8v_combined($string) {
  $components = phutil_utf8v($string);

  $combined = array();
  // Index of the element which absorbs the next combining character, or -1
  // while the result is still empty.
  $last = -1;

  foreach ($components as $component) {
    if (phutil_utf8_is_combining_character($component)) {
      if ($last < 0) {
        // The string starts with a combining character; give it a space to
        // attach to.
        $combined[] = ' ';
        $last = 0;
      }
      $combined[$last] .= $component;
    } else {
      $combined[] = $component;
      $last++;
    }
  }

  return $combined;
}
/**
 * Highlight the parts of a result string which match words in the query, so
 * users can see why a result matched.
 *
 * @param string Result text to decorate.
 * @return string|list The original string if nothing was attempted (empty
 *   query, empty text, or too much data); otherwise a list of segments where
 *   matching runs are wrapped in "strong" tags.
 */
private function emboldenQuery($str) {
  $query = $this->query->getParameter('query');

  if (!strlen($query) || !strlen($str)) {
    return $str;
  }

  // This algorithm is safe but not especially fast, so give up when there is
  // a lot of data. This mostly keeps silly or malicious queries from doing
  // anything bad.
  if (strlen($query) + strlen($str) > 2048) {
    return $str;
  }

  // One flag per byte of the string. Marking is byte oriented, but output is
  // assembled character by character below, so a tag never lands in the
  // middle of a character.
  $marked = array_fill(0, strlen($str), false);

  foreach (preg_split('/ +/', $query) as $word) {
    $word = trim(trim($word), '"+');
    if (!strlen($word)) {
      continue;
    }

    $pattern = '/(?:^|\\b)(' . preg_quote($word, '/') . ')/i';

    $found = null;
    $any = preg_match_all($pattern, $str, $found, PREG_OFFSET_CAPTURE);
    if (!$any) {
      continue;
    }

    // Mark every byte covered by each occurrence of the word.
    foreach ($found[1] as $hit) {
      list($hit_text, $hit_start) = $hit;
      $hit_end = $hit_start + strlen($hit_text);
      for ($at = $hit_start; $at < $hit_end; $at++) {
        $marked[$at] = true;
      }
    }
  }

  // Walk the characters, cutting a new segment whenever the marked state
  // flips.
  $segments = array();
  $run = '';
  $run_is_bold = false;
  $byte = 0;
  foreach (phutil_utf8v($str) as $glyph) {
    if ($marked[$byte] != $run_is_bold) {
      if (strlen($run)) {
        $segments[] = $run_is_bold
          ? phutil_tag('strong', array(), $run)
          : $run;
        $run = '';
      }
      $run_is_bold = !$run_is_bold;
    }

    $run .= $glyph;
    $byte += strlen($glyph);
  }

  // Flush whatever is left in the final run.
  if (strlen($run)) {
    $segments[] = $run_is_bold
      ? phutil_tag('strong', array(), $run)
      : $run;
  }

  return $segments;
}
/**
 * Split an ICS TEXT value on unescaped commas and resolve backslash
 * escapes.
 *
 * A backslash escapes exactly the one character which follows it: "\n" and
 * "\N" become a newline, and any other escaped character is taken
 * literally (so "\," is a literal comma rather than a separator).
 *
 * @param string Raw TEXT value.
 * @return list<string> Unescaped values, one per comma-separated item.
 */
private function unescapeTextValue($data) {
  $result = array();

  $buf = '';
  $esc = false;
  foreach (phutil_utf8v($data) as $c) {
    if (!$esc) {
      if ($c === '\\') {
        $esc = true;
      } else if ($c === ',') {
        $result[] = $buf;
        $buf = '';
      } else {
        $buf .= $c;
      }
    } else {
      switch ($c) {
        case 'n':
        case 'N':
          $buf .= "\n";
          break;
        default:
          $buf .= $c;
          break;
      }

      // The escape applies to a single character only. Without this reset,
      // everything after the first backslash would be treated as escaped
      // (commas would stop separating values) and the check below would
      // fail for every value containing a backslash.
      $esc = false;
    }
  }

  // Still in escape mode at the end of input: the value ends with a lone
  // backslash.
  if ($esc) {
    $this->raiseParseFailure(
      self::PARSE_UNESCAPED_BACKSLASH,
      pht(
        'ICS document contains TEXT value ending with unescaped ' .
        'backslash.'));
  }

  $result[] = $buf;

  return $result;
}
/**
 * Record an ngrams constraint for this query.
 *
 * A value with zero length is ignored. Otherwise the index, the value, and
 * the number of characters in the value are appended to the list of
 * constraints.
 *
 * @param PhabricatorSearchNgrams Index to constrain against.
 * @param string Value to search for.
 * @return this
 */
protected function withNgramsConstraint(
  PhabricatorSearchNgrams $index,
  $value) {

  if (!strlen($value)) {
    return $this;
  }

  $character_count = count(phutil_utf8v($value));

  $this->ngrams[] = array(
    'index' => $index,
    'value' => $value,
    'length' => $character_count,
  );

  return $this;
}
/**
 * Format the log string, replacing "%x" variables with values.
 *
 * Each "%x" sequence in the format is replaced with the escaped value for
 * "x" from the data map, or with "-" when there is no such value. "%%"
 * produces a literal "%". Trailing whitespace is trimmed and a single
 * newline is appended.
 *
 * @return string Finalized, log string for writing to disk.
 * @task internals
 */
private function format() {
  // Always convert '%%' to literal '%'.
  $map = array('%' => '%') + $this->data;

  $result = '';
  $saw_percent = false;
  foreach (phutil_utf8v($this->format) as $c) {
    if ($saw_percent) {
      $saw_percent = false;
      if (array_key_exists($c, $map)) {
        // Escape control bytes (0x00-0x1F), backslashes, and bytes 0x7F-0xFF
        // so a value can never inject a newline or other raw byte into the
        // log line. The ranges are written with octal escapes so they do
        // not depend on the encoding of this source file; a character list
        // whose range endpoints are missing makes addcslashes() escape the
        // wrong set of bytes.
        $result .= addcslashes($map[$c], "\0..\37\\\177..\377");
      } else {
        $result .= '-';
      }
    } else if ($c == '%') {
      $saw_percent = true;
    } else {
      $result .= $c;
    }
  }

  return rtrim($result) . "\n";
}
/**
 * Hard-wrap a piece of UTF-8 text with embedded HTML tags and entities.
 *
 * An HTML entity ("&...;") counts as one character and an HTML tag
 * ("<...>") counts as none. After every `lineWidth` counted characters an
 * "over the line" marker and a line break are inserted.
 *
 * An unterminated entity or tag is consumed through the end of the line
 * rather than scanned for past the end of the input.
 *
 * @param string An HTML string with tags and entities.
 * @return string Hard-wrapped string.
 */
protected function lineWrap($line) {
  $c = 0;
  $break_here = array();

  // Convert the UTF-8 string into a list of UTF-8 characters.
  $vector = phutil_utf8v($line);
  $len = count($vector);

  for ($ii = 0; $ii < $len; ++$ii) {
    if ($vector[$ii] == '&') {
      // An ampersand indicates an HTML entity; consume the whole thing
      // (until ";") but treat it all as one character. The bound keeps a
      // missing ";" from looping forever beyond the end of the vector.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != ';');
      ++$c;
    } else if ($vector[$ii] == '<') {
      // An "<" indicates an HTML tag, consume the whole thing but don't
      // treat it as a character. Bounded for the same reason as above.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != '>');
    } else {
      ++$c;
    }

    // Keep track of where we need to break the string later.
    if ($c == $this->lineWidth) {
      $break_here[$ii] = true;
      $c = 0;
    }
  }

  $result = array();
  foreach ($vector as $ii => $char) {
    $result[] = $char;
    if (isset($break_here[$ii])) {
      $result[] = "<span class=\"over-the-line\">⬅</span><br />";
    }
  }

  return implode('', $result);
}
/**
 * Shorten a string to provide a summary, respecting UTF-8 characters. This
 * function attempts to truncate strings at word boundaries.
 *
 * NOTE: This function makes a best effort to apply some reasonable rules but
 * will not work well for the full range of unicode languages. For instance,
 * no effort is made to deal with combining characters.
 *
 * @param string UTF-8 string to shorten.
 * @param int Maximum length of the result.
 * @param string If the string is shortened, add this at the end. Defaults to
 *   horizontal ellipsis.
 * @return string A string with no more than the specified character length.
 * @throws Exception If the terminal is not shorter than the maximum length.
 */
function phutil_utf8_shorten($string, $length, $terminal = "…") {
  // The result (terminal included) must fit in $length, which is impossible
  // if the terminal alone already fills it.
  $terminal_size = count(phutil_utf8v($terminal));
  if ($terminal_size >= $length) {
    throw new Exception("String terminal length must be less than string length!");
  }

  // Short enough already: hand the string back untouched.
  $chars = phutil_utf8v($string);
  if (count($chars) <= $length) {
    return $string;
  }

  // NOTE: This is not complete, and there are many other word boundary
  // characters and reasonable places to break words in the UTF-8 character
  // space. For now, this gives us reasonable behavior for latin languages.
  // We don't necessarily have access to PCRE+Unicode so there isn't a great
  // way for us to look up character attributes.

  // Characters we prefer to break on rather than cutting a word in half.
  static $soft_breaks = array(
    ' ' => true,
    "\n" => true,
    ';' => true,
    ':' => true,
    '[' => true,
    '(' => true,
    ',' => true,
    '-' => true,
  );

  // Characters we cut directly after, without appending the terminal.
  static $hard_stops = array(
    '.' => true,
    '!' => true,
    '?' => true,
  );

  $soft_at = null;
  $hard_at = null;

  // A soft break is followed by the terminal, so it must leave room for it.
  $soft_limit = $length - $terminal_size;

  // Search backward from the maximum length for a reasonable place to cut.
  for ($pos = $length; $pos >= 0; $pos--) {
    $char = $chars[$pos];

    if (isset($soft_breaks[$char]) && $pos <= $soft_limit) {
      $soft_at = $pos;
      continue;
    }

    if (isset($hard_stops[$char]) && $pos < $length) {
      $hard_at = $pos + 1;
      break;
    }

    if ($soft_at !== null) {
      break;
    }
  }

  // We found a character like ".". Cut right after it, with no terminal.
  if ($hard_at !== null) {
    return implode('', array_slice($chars, 0, $hard_at));
  }

  // No boundary at all, or nothing but boundary characters: cut at the
  // maximum length which still leaves room for the terminal.
  if ($soft_at === null || $soft_at === 0) {
    $soft_at = $soft_limit;
  }

  return implode('', array_slice($chars, 0, $soft_at)) . $terminal;
}
/**
 * Truncate a string so it satisfies the configured byte, codepoint and
 * glyph limits, appending the terminator when the string is cut at a word
 * boundary or at the hard cutoff.
 *
 * A string which already satisfies every configured limit is returned
 * unmodified. If there is no room for any glyph, only the terminator is
 * returned. If a stop character (".", "!", "?") is found near the cutoff,
 * the string is cut right after it and no terminator is appended.
 *
 * @param string String to truncate.
 * @return string Truncated string.
 */
public function truncateString($string) {
  // Codepoints and glyphs always take up at least one byte, so a string
  // with fewer bytes than the most restrictive limit is under every limit.
  $byte_len = strlen($string);
  if ($byte_len <= $this->minimumLimit) {
    return $string;
  }

  // The vector of codepoints is only needed for a codepoint limit.
  $string_pv = null;
  if ($this->maximumCodepoints) {
    $string_pv = phutil_utf8v($string);
    $point_len = count($string_pv);
  }

  // The combined vector is always needed, even for byte or codepoint
  // truncation, because we never want to cut a combining sequence in half.
  $string_gv = phutil_utf8v_combined($string);
  $glyph_len = count($string_gv);

  // Check which limits are actually exceeded. For example, a string may be
  // over the raw byte check above but under the glyph limit if it contains
  // several multibyte characters.
  $over_bytes =
    $this->maximumBytes &&
    $byte_len > $this->maximumBytes;
  $over_points =
    $this->maximumCodepoints &&
    $point_len > $this->maximumCodepoints;
  $over_glyphs =
    $this->maximumGlyphs &&
    $glyph_len > $this->maximumGlyphs;

  if (!$over_bytes && !$over_points && !$over_glyphs) {
    return $string;
  }

  // The string is legitimately too long. Find the cutoff: the number of
  // leading glyphs we can keep while satisfying every exceeded limit and
  // leaving space for the terminator.
  $cutoff = $glyph_len;

  if ($over_bytes) {
    $used_bytes = $this->terminatorBytes;
    for ($gi = 0; $gi < $glyph_len; $gi++) {
      $used_bytes += strlen($string_gv[$gi]);
      if ($used_bytes > $this->maximumBytes) {
        $cutoff = $gi;
        break;
      }
    }
  }

  if ($over_points) {
    $used_points = 0;
    for ($gi = 0; $gi < $glyph_len; $gi++) {
      // Consume codepoints until they account for every byte of this glyph.
      $remaining = strlen($string_gv[$gi]);
      while ($used_points < $point_len) {
        $remaining -= strlen($string_pv[$used_points]);
        $used_points++;
        if ($remaining <= 0) {
          break;
        }
      }

      $with_terminator = $used_points + $this->terminatorCodepoints;
      if ($with_terminator > $this->maximumCodepoints) {
        $cutoff = min($cutoff, $gi);
        break;
      }
    }
  }

  if ($over_glyphs) {
    $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs);
  }

  // Not enough room for anything: just return the terminator.
  if ($cutoff <= 0) {
    return $this->terminator;
  }

  // Otherwise, try to cut the string off somewhere reasonable rather than
  // somewhere arbitrary.

  // NOTE: This is not complete, and there are many other word boundary
  // characters and reasonable places to break words in the UTF-8 character
  // space. For now, this gives us reasonable behavior for latin languages.
  // We don't necessarily have access to PCRE+Unicode so there isn't a great
  // way for us to look up character attributes.

  // Characters we prefer to break on rather than cutting a word in half.
  static $soft_breaks = array(
    ' ' => true,
    "\n" => true,
    ';' => true,
    ':' => true,
    '[' => true,
    '(' => true,
    ',' => true,
    '-' => true,
  );

  // Characters we cut directly after, without appending the terminator.
  static $hard_stops = array(
    '.' => true,
    '!' => true,
    '?' => true,
  );

  $soft_at = null;
  $hard_at = null;

  // Only search backward for a while. At some point we don't get a better
  // result by looking through the whole string, and if this is "MMM..." or
  // a non-latin language without word break characters we're just wasting
  // time.
  $search_floor = max(0, $cutoff - 256);
  $search_from = min($cutoff, $glyph_len - 1);
  for ($gi = $search_from; $gi >= $search_floor; $gi--) {
    $glyph = $string_gv[$gi];

    if (isset($soft_breaks[$glyph])) {
      $soft_at = $gi;
      continue;
    }

    if (isset($hard_stops[$glyph])) {
      $hard_at = $gi + 1;
      break;
    }

    if ($soft_at !== null) {
      break;
    }
  }

  // We found a character like ".". Cut right after it, with no terminator.
  if ($hard_at !== null) {
    return implode('', array_slice($string_gv, 0, $hard_at));
  }

  // No boundary at all, or nothing but boundary characters: cut at the
  // hard cutoff.
  if ($soft_at === null || $soft_at === 0) {
    $soft_at = $cutoff;
  }

  $kept = implode('', array_slice($string_gv, 0, $soft_at));
  return $kept . $this->terminator;
}
/**
 * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
 *
 * An HTML entity ("&...;") counts as one character and an HTML tag
 * ("<...>") counts as none. A new line is started after every `$width`
 * counted characters.
 *
 * An unterminated entity or tag is consumed through the end of the string
 * rather than scanned for past the end of the input.
 *
 * @param string An HTML string with tags and entities.
 * @param int Number of counted characters per line.
 * @return list List of hard-wrapped lines.
 * @group utf8
 */
function phutil_utf8_hard_wrap_html($string, $width) {
  $break_here = array();

  // Convert the UTF-8 string into a list of UTF-8 characters.
  $vector = phutil_utf8v($string);
  $len = count($vector);

  $char_pos = 0;
  for ($ii = 0; $ii < $len; ++$ii) {
    if ($vector[$ii] == '&') {
      // An ampersand indicates an HTML entity; consume the whole thing
      // (until ";") but treat it all as one character. The bound keeps a
      // missing ";" from looping forever beyond the end of the vector.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != ';');
      ++$char_pos;
    } else if ($vector[$ii] == '<') {
      // An "<" indicates an HTML tag, consume the whole thing but don't
      // treat it as a character. Bounded for the same reason as above.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != '>');
    } else {
      ++$char_pos;
    }

    // Keep track of where we need to break the string later.
    if ($char_pos == $width) {
      $break_here[$ii] = true;
      $char_pos = 0;
    }
  }

  $result = array();
  $string = '';
  foreach ($vector as $ii => $char) {
    $string .= $char;
    if (isset($break_here[$ii])) {
      $result[] = $string;
      $string = '';
    }
  }

  // Whatever follows the last break forms the final line.
  if (strlen($string)) {
    $result[] = $string;
  }

  return $result;
}
/**
 * Soft wrap text for display on a console, respecting UTF8 character
 * boundaries and ANSI color escape sequences.
 *
 * Lines are wrapped at the last space once they exceed 78 columns minus the
 * indent. Escape sequences (from ESC through the next "m") are copied
 * through without counting toward the width.
 *
 * @param string Text to wrap.
 * @param int Optional indent level.
 * @return string Wrapped text.
 *
 * @group console
 */
function phutil_console_wrap($text, $indent = 0) {
  $max_width = 78 - $indent;
  $escape = chr(27);

  $finished = array();   // Completed lines, each a list of characters.
  $current = array();    // Characters of the line being built.
  $current_len = 0;      // Visible characters in $current.
  $split_at = null;      // Offset in $current just after the last space.
  $tail_len = 0;         // Visible characters since the last space.

  $chars = phutil_utf8v($text);
  $total = count($chars);
  for ($pos = 0; $pos < $total; $pos++) {
    $char = $chars[$pos];

    // If this is an ANSI escape sequence for a color code, just consume it
    // without counting it toward the character limit. This prevents lines
    // with bold/color on them from wrapping too early.
    if ($char == $escape) {
      while ($pos < $total) {
        $current[] = $chars[$pos];
        if ($chars[$pos] == 'm') {
          break;
        }
        $pos++;
      }
      continue;
    }

    $current[] = $char;
    ++$current_len;
    ++$tail_len;

    // Too wide, and there is a space to break at: emit everything before
    // the break (minus trailing spaces) and carry the rest over.
    if ($current_len > $max_width && $split_at !== null) {
      $head = array_slice($current, 0, $split_at);
      while (count($head) && end($head) == ' ') {
        array_pop($head);
      }
      $head[] = "\n";
      $finished[] = $head;

      $current = array_slice($current, $split_at);
      $current_len = $tail_len;
      $tail_len = 0;
      $split_at = null;
    }

    if ($char == " ") {
      $split_at = count($current);
      $tail_len = 0;
    }

    // An explicit newline always ends the current line.
    if ($char == "\n") {
      $finished[] = $current;
      $current = array();
      $tail_len = 0;
      $current_len = 0;
      $split_at = null;
    }
  }

  if ($current) {
    $finished[] = $current;
  }

  $prefix = null;
  if ($indent) {
    $prefix = str_repeat(' ', $indent);
  }

  $output = '';
  foreach ($finished as $parts) {
    $output .= $prefix . implode('', $parts);
  }

  return $output;
}