/**
 * Set the terminator string and cache its size.
 *
 * The terminator is stored together with its length measured three ways:
 * raw bytes (strlen), codepoints (phutil_utf8v) and glyphs
 * (phutil_utf8v_combined).
 *
 * @param string $terminator Terminator string.
 * @return $this
 */
public function setTerminator($terminator) {
  $this->terminator = $terminator;

  $byte_count = strlen($terminator);
  $this->terminatorBytes = $byte_count;

  $codepoint_list = phutil_utf8v($terminator);
  $this->terminatorCodepoints = count($codepoint_list);

  $glyph_list = phutil_utf8v_combined($terminator);
  $this->terminatorGlyphs = count($glyph_list);

  return $this;
}
/**
 * Check ArcanistDiffUtils::generateEditString() against known edit scripts.
 *
 * Every case is array(old, new, expected). The expected script holds one
 * operation character per step; the cases below use 's' where both sides
 * agree, 'x' where a character is replaced, 'i' for a character only in the
 * new string and 'd' for a character only in the old string.
 *
 * The byte-split pass runs the ASCII cases. The glyph-split pass runs the
 * same ASCII cases plus the multibyte case, so the UTF-8 case is actually
 * exercised.
 */
public function testLevenshtein() {
  // Shared body of the "long line" case. Both sides are identical here, so
  // only the prefix differs.
  $filler = str_repeat('x', 168);

  $tests = array(
    array('a', 'b', 'x'),
    array('kalrmr(array($b))', 'array($b)', 'dddddddssssssssds'),
    array('array($b)', 'kalrmr(array($b))', 'iiiiiiissssssssis'),
    array('zkalrmr(array($b))z', 'xarray($b)x', 'dddddddxsssssssssdx'),
    array('xarray($b)x', 'zkalrmr(array($b))z', 'iiiiiiixsssssssssix'),
    array('abcdefghi', 'abcdefghi', 'sssssssss'),
    array('abcdefghi', 'abcdefghijkl', 'sssssssssiii'),
    array('abcdefghijkl', 'abcdefghi', 'sssssssssddd'),
    array('xyzabcdefghi', 'abcdefghi', 'dddsssssssss'),
    array('abcdefghi', 'xyzabcdefghi', 'iiisssssssss'),
    array('abcdefg', 'abxdxfg', 'ssxxxss'),
    array(
      'private function a($a, $b) {',
      'public function and($b, $c) {',
      'siixxdddxsssssssssssiixxxxxxxsss',
    ),
    // Long line where only the space before "(" is removed. The expected
    // script has ten 's' operations before the 'd', so the shared prefix
    // must be ten characters long: eight spaces of indentation plus "if".
    array(
      '        if ('.$filler.') {',
      '        if('.$filler.') {',
      'ssssssssssds'.str_repeat('s', strlen($filler) + strlen(') {')),
    ),
  );

  foreach ($tests as $test) {
    list($old, $new, $expect) = $test;
    $this->assertEqual(
      $expect,
      ArcanistDiffUtils::generateEditString(str_split($old), str_split($new)),
      pht("'%s' vs '%s'", $old, $new));
  }

  $utf8_tests = array(
    // "\xE2\x98\x83" is U+2603 SNOWMAN, a single three-byte character.
    array('GrumpyCat', "Grumpy\xE2\x98\x83at", 'ssssssxss'),
  );

  // Previously this loop iterated $tests again, so $utf8_tests was never
  // checked. Keep the ASCII cases on the glyph path and add the UTF-8 ones.
  foreach (array_merge($tests, $utf8_tests) as $test) {
    list($old, $new, $expect) = $test;
    $this->assertEqual(
      $expect,
      ArcanistDiffUtils::generateEditString(
        phutil_utf8v_combined($old),
        phutil_utf8v_combined($new)),
      pht("'%s' vs '%s' (utf8)", $old, $new));
  }
}
/**
 * Split a corpus into pieces at the requested granularity.
 *
 * Levels 1 and 2 split with a capturing regular expression, so the
 * delimiters are kept in the piece list, and hand the pieces to
 * stitchPieces(). Level 3 returns the glyph list produced by
 * phutil_utf8v_combined() directly.
 *
 * @param string $corpus Text to split.
 * @param int $level 1 for sentences, 2 for words, 3 for characters.
 * @return mixed Result of stitchPieces() for levels 1 and 2; list of glyphs
 *   for level 3.
 * @throws InvalidArgumentException If the level is not 1, 2 or 3.
 */
private function splitCorpus($corpus, $level) {
  switch ($level) {
    case 1:
      // Level 1: Split into sentences.
      $expr = '/([\\n,!;?\\.]+)/';
      break;
    case 2:
      // Level 2: Split into words.
      $expr = '/(\\s+)/';
      break;
    case 3:
      // Level 3: Split into characters.
      return phutil_utf8v_combined($corpus);
    default:
      // Without this branch an unknown level would reach preg_split() with
      // an undefined pattern.
      throw new InvalidArgumentException(
        sprintf(
          'Unknown corpus split level "%s"; expected 1, 2 or 3.',
          is_scalar($level) ? (string)$level : gettype($level)));
  }

  $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);

  return $this->stitchPieces($pieces);
}
/**
 * Truncate a string so it satisfies the configured byte, codepoint and
 * glyph limits.
 *
 * The string is returned unmodified when it is within every configured
 * limit. Otherwise it is cut on a glyph boundary, so a combining sequence
 * is never split, preferring a nearby word break or sentence end.
 *
 * @param string $string String to truncate.
 * @return string The original string; or the terminator alone when nothing
 *   fits; or a prefix ending on ".", "!" or "?" with no terminator; or a
 *   prefix followed by the terminator.
 */
public function truncateString($string) {
  // Codepoints and glyphs each occupy at least one byte, so a string with
  // no more bytes than the most restrictive limit is within every limit.
  $total_bytes = strlen($string);
  if ($total_bytes <= $this->minimumLimit) {
    return $string;
  }

  // The codepoint vector is only needed when a codepoint limit is set.
  $codepoints = null;
  $total_points = 0;
  if ($this->maximumCodepoints) {
    $codepoints = phutil_utf8v($string);
    $total_points = count($codepoints);
  }

  // The glyph vector is always needed: even byte or codepoint truncation
  // must not cut through the middle of a combining sequence.
  $glyphs = phutil_utf8v_combined($string);
  $total_glyphs = count($glyphs);

  // A string can exceed the raw byte threshold above and still be within
  // the real limits (for example, when it contains multibyte characters).
  $over_bytes =
    $this->maximumBytes && $total_bytes > $this->maximumBytes;
  $over_points =
    $this->maximumCodepoints && $total_points > $this->maximumCodepoints;
  $over_glyphs =
    $this->maximumGlyphs && $total_glyphs > $this->maximumGlyphs;

  if (!$over_bytes && !$over_points && !$over_glyphs) {
    return $string;
  }

  // At least one limit is exceeded. Find the smallest glyph index at which
  // some limit, counting room for the terminator, would be crossed.
  $cutoff = $total_glyphs;

  if ($over_bytes) {
    $used_bytes = $this->terminatorBytes;
    for ($pos = 0; $pos < $total_glyphs; $pos++) {
      $used_bytes += strlen($glyphs[$pos]);
      if ($used_bytes > $this->maximumBytes) {
        $cutoff = $pos;
        break;
      }
    }
  }

  if ($over_points) {
    $consumed = 0;
    for ($pos = 0; $pos < $total_glyphs; $pos++) {
      // Consume codepoints until their bytes account for this glyph.
      $remaining = strlen($glyphs[$pos]);
      while ($consumed < $total_points) {
        $remaining -= strlen($codepoints[$consumed]);
        $consumed++;
        if ($remaining <= 0) {
          break;
        }
      }

      if ($consumed + $this->terminatorCodepoints > $this->maximumCodepoints) {
        $cutoff = min($cutoff, $pos);
        break;
      }
    }
  }

  if ($over_glyphs) {
    $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs);
  }

  // Not even one glyph fits next to the terminator.
  if ($cutoff <= 0) {
    return $this->terminator;
  }

  // Try to cut somewhere reasonable rather than somewhere arbitrary.

  // NOTE: These tables are incomplete. UTF-8 has many other word boundary
  // characters, but PCRE+Unicode may not be available to look up character
  // attributes, and this gives reasonable behavior for latin languages.

  // Prefer breaking on these over cutting in the middle of a word.
  static $soft_breaks = array(
    ' ' => true,
    "\n" => true,
    ';' => true,
    ':' => true,
    '[' => true,
    '(' => true,
    ',' => true,
    '-' => true,
  );

  // Cut exactly after these, without appending the terminator.
  static $hard_stops = array(
    '.' => true,
    '!' => true,
    '?' => true,
  );

  $word_boundary = null;
  $stop_boundary = null;

  // Only look back a bounded distance. Scanning the whole string does not
  // find a better result, and is wasted work for input like "MMM..." or
  // text without word break characters.
  $floor = max(0, $cutoff - 256);
  $start = min($cutoff, $total_glyphs - 1);
  for ($pos = $start; $pos >= $floor; $pos--) {
    $glyph = $glyphs[$pos];

    if (isset($soft_breaks[$glyph])) {
      // Keep walking so a run of break characters is skipped as a whole.
      $word_boundary = $pos;
      continue;
    }

    if (isset($hard_stops[$glyph])) {
      $stop_boundary = $pos + 1;
      break;
    }

    if ($word_boundary !== null) {
      // First ordinary glyph before a break run: the boundary is settled.
      break;
    }
  }

  if ($stop_boundary !== null) {
    // Found a character like ".". End the result right after it.
    return implode('', array_slice($glyphs, 0, $stop_boundary));
  }

  // With no boundary at all, or only boundary characters back to the start
  // of the string, cut at the hard limit instead.
  if ($word_boundary === null || $word_boundary === 0) {
    $word_boundary = $cutoff;
  }

  $kept = implode('', array_slice($glyphs, 0, $word_boundary));

  return $kept.$this->terminator;
}
/**
 * Check that phutil_utf8v_combined() keeps combining characters attached to
 * the character they follow.
 */
public function testUTF8vCombined() {
  // U+0360 COMBINING DOUBLE TILDE, written as its UTF-8 byte sequence.
  $mark = "\xCD\xA0";

  // Empty string.
  $this->assertEqual(array(), phutil_utf8v_combined(''));

  // Single character.
  $this->assertEqual(array('x'), phutil_utf8v_combined('x'));

  // No combining characters.
  $this->assertEqual(array('c', 'a', 't'), phutil_utf8v_combined('cat'));

  // Combining character in the middle of the string.
  $this->assertEqual(
    array('c', 'a'.$mark, 't'),
    phutil_utf8v_combined('ca'.$mark.'t'));

  // First character is a combined character.
  $this->assertEqual(
    array('c'.$mark, 'a', 't'),
    phutil_utf8v_combined('c'.$mark.'at'));

  // Trailing combining character.
  $this->assertEqual(
    array('c', 'a', 't'.$mark),
    phutil_utf8v_combined('cat'.$mark));

  // Multiple combined characters.
  $this->assertEqual(
    array('c'.$mark, 'a'.$mark, 't'.$mark),
    phutil_utf8v_combined('c'.$mark.'a'.$mark.'t'.$mark));

  // Multiple combining characters on one base character.
  $this->assertEqual(
    array('c', 'a'.$mark.$mark, 't'),
    phutil_utf8v_combined('ca'.$mark.$mark.'t'));

  // String beginning with combining characters: the expected first glyph is
  // a space carrying the marks.
  $this->assertEqual(
    array(' '.$mark.$mark, 'c'),
    phutil_utf8v_combined($mark.$mark.'c'));
}
/**
 * Compute byte-based intraline edit runs between an old and a new line.
 *
 * The lines are split into bytes, or into glyphs when either line contains
 * a byte in the range 0x80-0xFF, and passed to generateEditString(). The
 * resulting per-character script is then converted into per-byte runs.
 *
 * @param string $o Old line.
 * @param string $n New line.
 * @return array Pair array($old_runs, $new_runs). Each run list is built
 *   from array($flag, $byte_length) entries, where $flag is 0 for unchanged
 *   ('s') text and 1 otherwise, and is then passed through
 *   collapseIntralineRuns().
 */
public static function computeIntralineEdits($o, $n) {
  $multibyte = (bool)preg_match('/[\\x80-\\xFF]/', $o.$n);
  if ($multibyte) {
    $ov = phutil_utf8v_combined($o);
    $nv = phutil_utf8v_combined($n);
  } else {
    $ov = str_split($o);
    $nv = str_split($n);
  }

  $result = self::generateEditString($ov, $nv);

  // The script describes characters. Walk through it and widen each
  // operation to the number of bytes in the underlying character.
  $o_pos = 0;
  $n_pos = 0;
  $result_len = strlen($result);
  $o_run = array();
  $n_run = array();
  $old_char_len = 1;
  $new_char_len = 1;

  for ($ii = 0; $ii < $result_len; $ii++) {
    $c = $result[$ii];

    if ($multibyte) {
      // One side may already be exhausted (for example, trailing inserts
      // after the last old character). Its length is not used by the
      // operation in that case, so avoid reading past the end of the vector.
      $old_char_len = isset($ov[$o_pos]) ? strlen($ov[$o_pos]) : 0;
      $new_char_len = isset($nv[$n_pos]) ? strlen($nv[$n_pos]) : 0;
    }

    switch ($c) {
      case 's':
      case 'x':
        $byte_o = $old_char_len;
        $byte_n = $new_char_len;
        $o_pos++;
        $n_pos++;
        break;
      case 'i':
        $byte_o = 0;
        $byte_n = $new_char_len;
        $n_pos++;
        break;
      case 'd':
        $byte_o = $old_char_len;
        $byte_n = 0;
        $o_pos++;
        break;
    }

    // Only 's' marks unchanged text. Every other operation is a change.
    $changed = ($c == 's') ? 0 : 1;

    if ($byte_o) {
      $o_run[] = array($changed, $byte_o);
    }

    if ($byte_n) {
      $n_run[] = array($changed, $byte_n);
    }
  }

  $o_run = self::collapseIntralineRuns($o_run);
  $n_run = self::collapseIntralineRuns($n_run);

  return array($o_run, $n_run);
}
/**
 * Compute smoothed, byte-based intraline edit runs between an old and a new
 * line.
 *
 * The lines are split into bytes, or into glyphs when either line contains
 * a byte in the range 0x80-0xFF, and passed to generateEditString(). Short
 * unchanged runs sandwiched between changes are folded into the change, and
 * the per-character script is then converted into per-byte runs.
 *
 * @param string $o Old line.
 * @param string $n New line.
 * @return array Pair array($old_runs, $new_runs). Each run list is built
 *   from array($flag, $byte_length) entries, where $flag is 0 for unchanged
 *   ('s') text and 1 otherwise, and is then passed through
 *   collapseIntralineRuns().
 */
public static function computeIntralineEdits($o, $n) {
  $multibyte = (bool)preg_match('/[\\x80-\\xFF]/', $o.$n);
  if ($multibyte) {
    $ov = phutil_utf8v_combined($o);
    $nv = phutil_utf8v_combined($n);
  } else {
    $ov = str_split($o);
    $nv = str_split($n);
  }

  $result = self::generateEditString($ov, $nv);

  // Smooth the script by turning runs of one to three 's' operations that
  // sit between two change operations into 'x'. This reads better, since
  // there are fewer choppy runs of short added and removed substrings.
  // Matches can overlap, so repeat until the script stops changing.
  do {
    $original = $result;
    $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result);
    $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result);
    $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result);
  } while ($result != $original);

  // The script describes characters. Walk through it and widen each
  // operation to the number of bytes in the underlying character.
  $o_pos = 0;
  $n_pos = 0;
  $result_len = strlen($result);
  $o_run = array();
  $n_run = array();
  $old_char_len = 1;
  $new_char_len = 1;

  for ($ii = 0; $ii < $result_len; $ii++) {
    $c = $result[$ii];

    if ($multibyte) {
      // One side may already be exhausted (for example, trailing inserts
      // after the last old character). Its length is not used by the
      // operation in that case, so avoid reading past the end of the vector.
      $old_char_len = isset($ov[$o_pos]) ? strlen($ov[$o_pos]) : 0;
      $new_char_len = isset($nv[$n_pos]) ? strlen($nv[$n_pos]) : 0;
    }

    switch ($c) {
      case 's':
      case 'x':
        $byte_o = $old_char_len;
        $byte_n = $new_char_len;
        $o_pos++;
        $n_pos++;
        break;
      case 'i':
        $byte_o = 0;
        $byte_n = $new_char_len;
        $n_pos++;
        break;
      case 'd':
        $byte_o = $old_char_len;
        $byte_n = 0;
        $o_pos++;
        break;
    }

    // Only 's' marks unchanged text. Every other operation is a change.
    $changed = ($c == 's') ? 0 : 1;

    if ($byte_o) {
      $o_run[] = array($changed, $byte_o);
    }

    if ($byte_n) {
      $n_run[] = array($changed, $byte_n);
    }
  }

  $o_run = self::collapseIntralineRuns($o_run);
  $n_run = self::collapseIntralineRuns($n_run);

  return array($o_run, $n_run);
}
/**
 * Find the console display length of a UTF-8 string. This may differ from the
 * character length of the string if it contains double-width characters, like
 * many Chinese characters.
 *
 * This method is based on a C implementation here, which is based on the IEEE
 * standards. The source has more discussion and addresses more considerations
 * than this implementation does.
 *
 *   http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 *
 * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
 *
 * NOTE: This function is VERY slow.
 *
 * @param   string $string A valid UTF-8 string.
 * @return  int     The console display length of the string.
 */
function phutil_utf8_console_strlen($string) {
  // Formatting and colors don't contribute any width in the console. Only
  // strip real escape sequences (ESC "[" digits "m"): literal text such as
  // "[5m" is printed and must be counted.
  $string = preg_replace("/\x1B\\[\\d*m/", '', $string);

  // In the common case of an ASCII string, just return the string length.
  if (preg_match('/^[\\x01-\\x7F]*\\z/', $string)) {
    return strlen($string);
  }

  $len = 0;

  // NOTE: To deal with combining characters, we're splitting the string into
  // glyphs first (characters with combiners) and then counting just the width
  // of the first character in each glyph.
  $display_glyphs = phutil_utf8v_combined($string);
  foreach ($display_glyphs as $display_glyph) {
    $glyph_codepoints = phutil_utf8v_codepoints($display_glyph);
    foreach ($glyph_codepoints as $c) {
      if ($c == 0) {
        break;
      }

      // Ranges treated as double-width, following the wcwidth.c
      // implementation referenced above.
      $is_wide =
        $c >= 0x1100 &&
        ($c <= 0x115f ||
          $c == 0x2329 ||
          $c == 0x232a ||
          ($c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f) ||
          ($c >= 0xac00 && $c <= 0xd7a3) ||
          ($c >= 0xf900 && $c <= 0xfaff) ||
          ($c >= 0xfe10 && $c <= 0xfe19) ||
          ($c >= 0xfe30 && $c <= 0xfe6f) ||
          ($c >= 0xff00 && $c <= 0xff60) ||
          ($c >= 0xffe0 && $c <= 0xffe6) ||
          ($c >= 0x20000 && $c <= 0x2fffd) ||
          ($c >= 0x30000 && $c <= 0x3fffd));

      $len += $is_wide ? 2 : 1;

      // Only the first codepoint of each glyph contributes width.
      break;
    }
  }

  return $len;
}
/**
 * Shorten a string to provide a summary, respecting UTF-8 characters. This
 * function attempts to truncate strings at word boundaries.
 *
 * NOTE: This function makes a best effort to apply some reasonable rules but
 * will not work well for the full range of unicode languages.
 *
 * @param   string $string    UTF-8 string to shorten.
 * @param   int    $length    Maximum length of the result.
 * @param   string $terminal  If the string is shortened, add this at the
 *                            end. Defaults to horizontal ellipsis.
 * @return  string A string with no more than the specified character length.
 *
 * @group utf8
 */
function phutil_utf8_shorten($string, $length, $terminal = "…") {
  // A string with no more bytes than the limit cannot have more glyphs than
  // the limit, so skip the expensive split entirely.
  if (strlen($string) <= $length) {
    return $string;
  }

  $glyphs = phutil_utf8v_combined($string);
  if (count($glyphs) <= $length) {
    // Over the limit in bytes, but within it in glyphs.
    return $string;
  }

  // NOTE: These tables are incomplete. UTF-8 has many other word boundary
  // characters, but PCRE+Unicode may not be available to look up character
  // attributes, and this gives reasonable behavior for latin languages.

  // Prefer breaking on these over cutting in the middle of a word.
  static $soft_breaks = array(
    ' ' => true,
    "\n" => true,
    ';' => true,
    ':' => true,
    '[' => true,
    '(' => true,
    ',' => true,
    '-' => true,
  );

  // Cut exactly after these, without appending the terminal.
  static $hard_stops = array(
    '.' => true,
    '!' => true,
    '?' => true,
  );

  $word_boundary = null;
  $stop_boundary = null;

  // A word break must leave room for the terminal, so break characters only
  // count at or before this index. When the terminal is at least as long as
  // the limit this is 0, and the terminal ends up returned on its own.
  $terminal_len = phutil_utf8_strlen($terminal);
  $terminal_area = $length - min($length, $terminal_len);

  // Search backward from the limit for a reasonable place to cut.
  for ($pos = $length; $pos >= 0; $pos--) {
    $glyph = $glyphs[$pos];

    if (isset($soft_breaks[$glyph]) && $pos <= $terminal_area) {
      // Keep walking so a run of break characters is skipped as a whole.
      $word_boundary = $pos;
      continue;
    }

    if (isset($hard_stops[$glyph]) && $pos < $length) {
      $stop_boundary = $pos + 1;
      break;
    }

    if ($word_boundary !== null) {
      // First ordinary glyph before a break run: the boundary is settled.
      break;
    }
  }

  if ($stop_boundary !== null) {
    // Found a character like ".". End the result right after it, with no
    // terminal.
    return implode('', array_slice($glyphs, 0, $stop_boundary));
  }

  // With no boundary at all, or only boundary characters back to the start
  // of the string, cut at the maximum length that still fits the terminal.
  if ($word_boundary === null || $word_boundary === 0) {
    $word_boundary = $terminal_area;
  }

  $kept = implode('', array_slice($glyphs, 0, $word_boundary));

  return $kept.$terminal;
}