/**
 * Set the terminator string and cache its length in each unit the
 * truncator measures: bytes, codepoints and glyphs.
 *
 * @param   string  $terminator  Text to use as the terminator.
 * @return  $this
 */
public function setTerminator($terminator)
 {
     $this->terminator = $terminator;

     // Raw byte length.
     $this->terminatorBytes = strlen($terminator);

     // Number of entries in the codepoint vector.
     $codepoint_vector = phutil_utf8v($terminator);
     $this->terminatorCodepoints = count($codepoint_vector);

     // Number of entries in the combined (glyph) vector.
     $glyph_vector = phutil_utf8v_combined($terminator);
     $this->terminatorGlyphs = count($glyph_vector);

     return $this;
 }
 /**
  * Check ArcanistDiffUtils::generateEditString() against a table of
  * (old, new, expected edit string) cases.
  *
  * Every case is run twice: once split into bytes with str_split(), and
  * once split with phutil_utf8v_combined(). The UTF-8-only cases are run
  * only through the second splitter.
  */
 public function testLevenshtein()
 {
     $tests = array(array('a', 'b', 'x'), array('kalrmr(array($b))', 'array($b)', 'dddddddssssssssds'), array('array($b)', 'kalrmr(array($b))', 'iiiiiiissssssssis'), array('zkalrmr(array($b))z', 'xarray($b)x', 'dddddddxsssssssssdx'), array('xarray($b)x', 'zkalrmr(array($b))z', 'iiiiiiixsssssssssix'), array('abcdefghi', 'abcdefghi', 'sssssssss'), array('abcdefghi', 'abcdefghijkl', 'sssssssssiii'), array('abcdefghijkl', 'abcdefghi', 'sssssssssddd'), array('xyzabcdefghi', 'abcdefghi', 'dddsssssssss'), array('abcdefghi', 'xyzabcdefghi', 'iiisssssssss'), array('abcdefg', 'abxdxfg', 'ssxxxss'), array('private function a($a, $b) {', 'public function and($b, $c) {', 'siixxdddxsssssssssssiixxxxxxxsss'), array('        if (' . 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' . 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' . 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx) {', '        if(' . 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' . 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' . 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx) {', 'ssssssssssds' . 'ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss' . 'ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss' . 'sssssssssssssssssssssssssssssssssssssss'));
     foreach ($tests as $test) {
         $this->assertEqual($test[2], ArcanistDiffUtils::generateEditString(str_split($test[0]), str_split($test[1])), pht("'%s' vs '%s'", $test[0], $test[1]));
     }
     $utf8_tests = array(array('GrumpyCat', "Grumpy☃at", 'ssssssxss'));
     // Run the ASCII cases again through the glyph splitter, followed by the
     // UTF-8-only cases. This loop used to iterate over $tests alone, which
     // meant $utf8_tests was built but never actually asserted.
     foreach (array_merge($tests, $utf8_tests) as $test) {
         $this->assertEqual($test[2], ArcanistDiffUtils::generateEditString(phutil_utf8v_combined($test[0]), phutil_utf8v_combined($test[1])), pht("'%s' vs '%s' (utf8)", $test[0], $test[1]));
     }
 }
 /**
  * Split a corpus into pieces at the requested granularity.
  *
  * Levels 1 and 2 split on a delimiter pattern (keeping the delimiters as
  * separate pieces) and pass the result through stitchPieces(). Level 3
  * returns the glyph vector from phutil_utf8v_combined() directly.
  *
  * @param   string  $corpus  Text to split.
  * @param   int     $level   1 (sentences), 2 (words) or 3 (characters).
  * @return  mixed   Result of stitchPieces() for levels 1 and 2; the glyph
  *                  list for level 3.
  * @throws  InvalidArgumentException  If $level is not a supported level.
  */
 private function splitCorpus($corpus, $level)
 {
     switch ($level) {
         case 1:
             // Level 1: Split into sentences.
             $expr = '/([\\n,!;?\\.]+)/';
             break;
         case 2:
             // Level 2: Split into words.
             $expr = '/(\\s+)/';
             break;
         case 3:
             // Level 3: Split into characters.
             return phutil_utf8v_combined($corpus);
         default:
             // Without this branch an unknown level would fall through to
             // preg_split() with an undefined pattern.
             throw new InvalidArgumentException(sprintf('Unknown corpus split level "%s"; expected 1, 2 or 3.', $level));
     }
     $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
     return $this->stitchPieces($pieces);
 }
 /**
  * Truncate a string so it satisfies the configured byte, codepoint and
  * glyph limits, preferring to cut at a word or sentence boundary.
  *
  * The string is returned unmodified when it is within every configured
  * limit. Otherwise the result is one of:
  *
  *   - the terminator alone, when there is no room for any glyph;
  *   - a prefix ending just after a stop character ('.', '!', '?'), with no
  *     terminator appended;
  *   - a prefix cut at a break character (or at the hard cutoff when no
  *     usable break is found), followed by the terminator.
  *
  * @param   string  $string  String to truncate.
  * @return  string  The original string, or a truncated version of it.
  */
 public function truncateString($string)
 {
     // First, check if the string has fewer bytes than the most restrictive
     // limit. Codepoints and glyphs always take up at least one byte, so we can
     // just return the string unmodified if we're under all of the limits.
     $byte_len = strlen($string);
     if ($byte_len <= $this->minimumLimit) {
         return $string;
     }
     // If we need the vector of codepoints, build it.
     // NOTE: $point_len is only defined when a codepoint limit is set; every
     // later use of it is guarded by the same condition.
     $string_pv = null;
     if ($this->maximumCodepoints) {
         $string_pv = phutil_utf8v($string);
         $point_len = count($string_pv);
     }
     // We always need the combined vector, even if we're only doing byte or
     // codepoint truncation, because we don't want to truncate to half of a
     // combining character.
     $string_gv = phutil_utf8v_combined($string);
     $glyph_len = count($string_gv);
     // Now, check if we're still over the limits. For example, a string may
     // be over the raw byte limit but under the glyph limit if it contains
     // several multibyte characters.
     $too_long = false;
     if ($this->maximumBytes && $byte_len > $this->maximumBytes) {
         $too_long = true;
     }
     if ($this->maximumCodepoints && $point_len > $this->maximumCodepoints) {
         $too_long = true;
     }
     if ($this->maximumGlyphs && $glyph_len > $this->maximumGlyphs) {
         $too_long = true;
     }
     if (!$too_long) {
         return $string;
     }
     // This string is legitimately longer than at least one of the limits, so
     // we need to truncate it. Find the minimum cutoff point: this is the last
     // glyph we can possibly return while satisfying the limits and having space
     // for the terminator.
     $cutoff = $glyph_len;
     if ($this->maximumBytes) {
         if ($byte_len <= $this->maximumBytes) {
             $cutoff = $glyph_len;
         } else {
             // Start the running total at the terminator's size, then add glyphs
             // until the total no longer fits. The index of the first glyph that
             // does not fit becomes the cutoff.
             $bytes = $this->terminatorBytes;
             for ($ii = 0; $ii < $glyph_len; $ii++) {
                 $bytes += strlen($string_gv[$ii]);
                 if ($bytes > $this->maximumBytes) {
                     $cutoff = $ii;
                     break;
                 }
             }
         }
     }
     if ($this->maximumCodepoints) {
         if ($point_len <= $this->maximumCodepoints) {
             $cutoff = min($cutoff, $glyph_len);
         } else {
             // Walk the codepoint vector in step with the glyph vector: for each
             // glyph, consume codepoints until their byte lengths account for the
             // whole glyph. After the inner loop, $points is the number of
             // codepoints contained in glyphs 0 through $ii.
             $points = 0;
             for ($ii = 0; $ii < $glyph_len; $ii++) {
                 $glyph_bytes = strlen($string_gv[$ii]);
                 while ($points < $point_len) {
                     $glyph_bytes -= strlen($string_pv[$points]);
                     $points++;
                     if ($glyph_bytes <= 0) {
                         break;
                     }
                 }
                 $points_total = $points + $this->terminatorCodepoints;
                 if ($points_total > $this->maximumCodepoints) {
                     $cutoff = min($cutoff, $ii);
                     break;
                 }
             }
         }
     }
     if ($this->maximumGlyphs) {
         if ($glyph_len <= $this->maximumGlyphs) {
             $cutoff = min($cutoff, $glyph_len);
         } else {
             // This may go to zero or negative when the terminator has at least
             // as many glyphs as the limit; that case is handled just below.
             $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs);
         }
     }
     // If we don't have enough characters for anything, just return the
     // terminator.
     if ($cutoff <= 0) {
         return $this->terminator;
     }
     // Otherwise, we're going to try to cut the string off somewhere reasonable
     // rather than somewhere arbitrary.
     // NOTE: This is not complete, and there are many other word boundary
     // characters and reasonable places to break words in the UTF-8 character
     // space. For now, this gives us reasonable behavior for latin languages. We
     // don't necessarily have access to PCRE+Unicode so there isn't a great way
     // for us to look up character attributes.
     // If we encounter these, prefer to break on them instead of cutting the
     // string off in the middle of a word.
     static $break_characters = array(' ' => true, "\n" => true, ';' => true, ':' => true, '[' => true, '(' => true, ',' => true, '-' => true);
     // If we encounter these, shorten to this character exactly without
     // appending the terminal.
     static $stop_characters = array('.' => true, '!' => true, '?' => true);
     // Search backward in the string, looking for reasonable places to break it.
     $word_boundary = null;
     $stop_boundary = null;
     // If we do a word break with a terminal, we have to look beyond at least
     // the number of characters in the terminal. If the terminal is longer than
     // the required length, we'll skip this whole block and return it on its
     // own.
     // Only search backward for a while. At some point we don't get a better
     // result by looking through the whole string, and if this is "MMM..." or
     // a non-latin language without word break characters we're just wasting
     // time.
     $search = max(0, $cutoff - 256);
     for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) {
         $c = $string_gv[$ii];
         if (isset($break_characters[$c])) {
             // Keep scanning left across a run of consecutive break characters so
             // the boundary ends up at the start of the run.
             $word_boundary = $ii;
         } else {
             if (isset($stop_characters[$c])) {
                 // Include the stop character itself in the result.
                 $stop_boundary = $ii + 1;
                 break;
             } else {
                 // First ordinary character to the left of a break run: the
                 // boundary recorded so far is the one to use.
                 if ($word_boundary !== null) {
                     break;
                 }
             }
         }
     }
     if ($stop_boundary !== null) {
         // We found a character like ".". Cut the string there, without appending
         // the terminal.
         $string_part = array_slice($string_gv, 0, $stop_boundary);
         return implode('', $string_part);
     }
     // If we didn't find any boundary characters or we found ONLY boundary
     // characters, just break at the maximum character length.
     if ($word_boundary === null || $word_boundary === 0) {
         $word_boundary = $cutoff;
     }
     $string_part = array_slice($string_gv, 0, $word_boundary);
     $string_part = implode('', $string_part);
     return $string_part . $this->terminator;
 }
 /**
  * Check that phutil_utf8v_combined() splits a string into glyphs, keeping
  * each combining character attached to the character it follows.
  */
 public function testUTF8vCombined()
 {
     // Empty string.
     $string = '';
     $this->assertEqual(array(), phutil_utf8v_combined($string));
     // Single character.
     $string = 'x';
     $this->assertEqual(array('x'), phutil_utf8v_combined($string));
     // No combining characters.
     $string = 'cat';
     $this->assertEqual(array('c', 'a', 't'), phutil_utf8v_combined($string));
     // String with a combining character in the middle.
     $string = "ca͠t";
     $this->assertEqual(array('c', "a͠", 't'), phutil_utf8v_combined($string));
     // String starting with a combined character.
     $string = "c͠at";
     $this->assertEqual(array("c͠", 'a', 't'), phutil_utf8v_combined($string));
     // String with trailing combining character.
     $string = "cat͠";
     $this->assertEqual(array('c', 'a', "t͠"), phutil_utf8v_combined($string));
     // String with multiple combined characters.
     $string = "c͠a͠t͠";
     $this->assertEqual(array("c͠", "a͠", "t͠"), phutil_utf8v_combined($string));
     // String with multiple combining characters.
     $string = "ca͠͠t";
     $this->assertEqual(array('c', "a͠͠", 't'), phutil_utf8v_combined($string));
     // String beginning with a combining character. The expected first glyph
     // starts with a space, which carries the leading combiners.
     $string = "͠͠c";
     $this->assertEqual(array(" ͠͠", 'c'), phutil_utf8v_combined($string));
 }
示例#6
0
 /**
  * Compute byte-based intraline edit runs between two versions of a line.
  *
  * The inputs are split into characters (glyphs when either input contains a
  * byte in the 0x80-0xFF range, single bytes otherwise), an edit string is
  * generated with generateEditString(), and each operation is converted into
  * a run of array($changed, $byte_length) for the old and/or new side.
  * $changed is 0 for 's' operations and 1 for everything else.
  *
  * @param   string  $o  Old text.
  * @param   string  $n  New text.
  * @return  array   Pair array($o_run, $n_run), each passed through
  *                  collapseIntralineRuns().
  */
 public static function computeIntralineEdits($o, $n)
 {
     if (preg_match('/[\\x80-\\xFF]/', $o . $n)) {
         $ov = phutil_utf8v_combined($o);
         $nv = phutil_utf8v_combined($n);
         $multibyte = true;
     } else {
         $ov = str_split($o);
         $nv = str_split($n);
         $multibyte = false;
     }
     $result = self::generateEditString($ov, $nv);
     // Now we have a character-based description of the edit. We need to
     // convert into a byte-based description. Walk through the edit string and
     // adjust each operation to reflect the number of bytes in the underlying
     // character.
     $o_pos = 0;
     $n_pos = 0;
     $result_len = strlen($result);
     $o_run = array();
     $n_run = array();
     for ($ii = 0; $ii < $result_len; $ii++) {
         $c = $result[$ii];
         // Reset per operation so an unexpected operation character contributes
         // nothing instead of reusing values from the previous iteration.
         $byte_o = 0;
         $byte_n = 0;
         // Only look up the character on a side the operation actually
         // consumes. Reading both sides unconditionally indexes past the end of
         // one vector when the edit ends with inserts or deletes.
         switch ($c) {
             case 's':
             case 'x':
                 $byte_o = $multibyte ? strlen($ov[$o_pos]) : 1;
                 $byte_n = $multibyte ? strlen($nv[$n_pos]) : 1;
                 $o_pos++;
                 $n_pos++;
                 break;
             case 'i':
                 $byte_n = $multibyte ? strlen($nv[$n_pos]) : 1;
                 $n_pos++;
                 break;
             case 'd':
                 $byte_o = $multibyte ? strlen($ov[$o_pos]) : 1;
                 $o_pos++;
                 break;
         }
         // 's' marks an unchanged character; every other operation is a change.
         $changed = $c === 's' ? 0 : 1;
         if ($byte_o) {
             $o_run[] = array($changed, $byte_o);
         }
         if ($byte_n) {
             $n_run[] = array($changed, $byte_n);
         }
     }
     $o_run = self::collapseIntralineRuns($o_run);
     $n_run = self::collapseIntralineRuns($n_run);
     return array($o_run, $n_run);
 }
示例#7
0
 /**
  * Compute byte-based intraline edit runs between two versions of a line.
  *
  * The inputs are split into characters (glyphs when either input contains a
  * byte in the 0x80-0xFF range, single bytes otherwise), an edit string is
  * generated with generateEditString(), short unchanged runs between edits
  * are smoothed into 'x' operations, and each operation is then converted
  * into a run of array($changed, $byte_length) for the old and/or new side.
  * $changed is 0 for 's' operations and 1 for everything else.
  *
  * @param   string  $o  Old text.
  * @param   string  $n  New text.
  * @return  array   Pair array($o_run, $n_run), each passed through
  *                  collapseIntralineRuns().
  */
 public static function computeIntralineEdits($o, $n)
 {
     if (preg_match('/[\\x80-\\xFF]/', $o . $n)) {
         $ov = phutil_utf8v_combined($o);
         $nv = phutil_utf8v_combined($n);
         $multibyte = true;
     } else {
         $ov = str_split($o);
         $nv = str_split($n);
         $multibyte = false;
     }
     $result = self::generateEditString($ov, $nv);
     // Smooth the string out, by replacing short runs of similar characters
     // with 'x' operations. This makes the result more readable to humans, since
     // there are fewer choppy runs of short added and removed substrings.
     // Each replacement keeps the string the same length, so positions in the
     // edit string still line up with the character vectors.
     do {
         $original = $result;
         $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result);
         $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result);
         $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result);
     } while ($result != $original);
     // Now we have a character-based description of the edit. We need to
     // convert into a byte-based description. Walk through the edit string and
     // adjust each operation to reflect the number of bytes in the underlying
     // character.
     $o_pos = 0;
     $n_pos = 0;
     $result_len = strlen($result);
     $o_run = array();
     $n_run = array();
     for ($ii = 0; $ii < $result_len; $ii++) {
         $c = $result[$ii];
         // Reset per operation so an unexpected operation character contributes
         // nothing instead of reusing values from the previous iteration.
         $byte_o = 0;
         $byte_n = 0;
         // Only look up the character on a side the operation actually
         // consumes. Reading both sides unconditionally indexes past the end of
         // one vector when the edit ends with inserts or deletes.
         switch ($c) {
             case 's':
             case 'x':
                 $byte_o = $multibyte ? strlen($ov[$o_pos]) : 1;
                 $byte_n = $multibyte ? strlen($nv[$n_pos]) : 1;
                 $o_pos++;
                 $n_pos++;
                 break;
             case 'i':
                 $byte_n = $multibyte ? strlen($nv[$n_pos]) : 1;
                 $n_pos++;
                 break;
             case 'd':
                 $byte_o = $multibyte ? strlen($ov[$o_pos]) : 1;
                 $o_pos++;
                 break;
         }
         // 's' marks an unchanged character; every other operation is a change.
         $changed = $c === 's' ? 0 : 1;
         if ($byte_o) {
             $o_run[] = array($changed, $byte_o);
         }
         if ($byte_n) {
             $n_run[] = array($changed, $byte_n);
         }
     }
     $o_run = self::collapseIntralineRuns($o_run);
     $n_run = self::collapseIntralineRuns($n_run);
     return array($o_run, $n_run);
 }
示例#8
0
/**
 * Find the console display length of a UTF-8 string. This may differ from the
 * character length of the string if it contains double-width characters, like
 * many Chinese characters.
 *
 * ANSI SGR escape sequences (ESC "[" parameters "m", used for colors and
 * formatting) are removed before measuring, since they occupy no columns.
 *
 * This method is based on a C implementation here, which is based on the IEEE
 * standards. The source has more discussion and addresses more considerations
 * than this implementation does.
 *
 *   http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 *
 * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
 *
 * NOTE: This function is VERY slow.
 *
 * @param   string  $string  A valid UTF-8 string.
 * @return  int     The console display length of the string.
 */
function phutil_utf8_console_strlen($string)
{
    // Formatting and colors don't contribute any width in the console. The
    // pattern is anchored on the ESC byte so that literal text such as "[1m"
    // is not removed, and it accepts ";"-separated parameters ("\x1B[1;31m").
    $string = preg_replace("/\x1B\\[[\\d;]*m/", '', $string);
    // In the common case of an ASCII string, just return the string length.
    if (preg_match('/^[\\x01-\\x7F]*\\z/', $string)) {
        return strlen($string);
    }
    $len = 0;
    // NOTE: To deal with combining characters, we're splitting the string into
    // glyphs first (characters with combiners) and then counting just the width
    // of the first character in each glyph.
    $display_glyphs = phutil_utf8v_combined($string);
    foreach ($display_glyphs as $display_glyph) {
        $glyph_codepoints = phutil_utf8v_codepoints($display_glyph);
        foreach ($glyph_codepoints as $c) {
            // A NUL codepoint contributes no width.
            if ($c == 0) {
                break;
            }
            // Width is 1, plus 1 more when the codepoint falls in one of the
            // double-width ranges below.
            $len += 1 + ($c >= 0x1100 && ($c <= 0x115f || $c == 0x2329 || $c == 0x232a || $c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f || $c >= 0xac00 && $c <= 0xd7a3 || $c >= 0xf900 && $c <= 0xfaff || $c >= 0xfe10 && $c <= 0xfe19 || $c >= 0xfe30 && $c <= 0xfe6f || $c >= 0xff00 && $c <= 0xff60 || $c >= 0xffe0 && $c <= 0xffe6 || $c >= 0x20000 && $c <= 0x2fffd || $c >= 0x30000 && $c <= 0x3fffd));
            // Only the first codepoint of each glyph is measured.
            break;
        }
    }
    return $len;
}
示例#9
0
/**
 * Produce a shortened summary of a UTF-8 string without splitting a glyph,
 * cutting at a word or sentence boundary where one is available.
 *
 * NOTE: The boundary rules are a best effort aimed at latin-style text and
 * will not give good results for every language.
 *
 * @param   string  $string    UTF-8 string to shorten.
 * @param   int     $length    Maximum length of the result, in glyphs.
 * @param   string  $terminal  Appended when the string is cut at a word
 *                             boundary or at the hard limit. Defaults to a
 *                             horizontal ellipsis.
 * @return  string  The original string when it already fits, otherwise a
 *                  shortened version of it.
 *
 * @group utf8
 */
function phutil_utf8_shorten($string, $length, $terminal = "…")
{
    // A glyph is never smaller than one byte, so a string whose byte count is
    // within the limit is certainly within the glyph limit too.
    if (strlen($string) <= $length) {
        return $string;
    }
    $glyphs = phutil_utf8v_combined($string);
    if (count($glyphs) <= $length) {
        // Over the limit in bytes, but not in glyphs: nothing to do.
        return $string;
    }
    // Characters we would rather cut on than cut through the middle of a word.
    // This list is far from exhaustive; without guaranteed PCRE+Unicode
    // support there is no good way to look up character attributes.
    static $soft_breaks = array(' ' => true, "\n" => true, ';' => true, ':' => true, '[' => true, '(' => true, ',' => true, '-' => true);
    // Characters that end the result outright, with no terminal appended.
    static $hard_stops = array('.' => true, '!' => true, '?' => true);
    $terminal_len = phutil_utf8_strlen($terminal);
    // Number of glyphs available to the text once the terminal has been
    // accounted for. This is zero when the terminal alone fills the limit.
    $room = $length - min($length, $terminal_len);
    $word_cut = null;
    $stop_cut = null;
    // Scan from the limit back toward the start of the string.
    $pos = $length;
    while ($pos >= 0) {
        $glyph = $glyphs[$pos];
        if (isset($soft_breaks[$glyph]) && $pos <= $room) {
            // Usable break; keep going left so a run of break characters
            // collapses to its first position.
            $word_cut = $pos;
            $pos--;
            continue;
        }
        if (isset($hard_stops[$glyph]) && $pos < $length) {
            // Keep the stop character itself in the result.
            $stop_cut = $pos + 1;
            break;
        }
        if ($word_cut !== null) {
            // Ordinary character to the left of a break we already recorded.
            break;
        }
        $pos--;
    }
    if ($stop_cut !== null) {
        // Ended on something like ".": return through it without the terminal.
        return implode('', array_slice($glyphs, 0, $stop_cut));
    }
    // With no break found, or a break only at the very start of the string,
    // fall back to the hard limit.
    if ($word_cut === null || $word_cut === 0) {
        $word_cut = $room;
    }
    return implode('', array_slice($glyphs, 0, $word_cut)) . $terminal;
}