/**
 * Gets the starting position of each line.
 *
 * The first line always starts at position 0. Every "\n" found at
 * position `i` adds `i + 1` as the start of the following line, so a
 * trailing newline yields one extra entry pointing just past the end.
 *
 * @param string|UtfString $str String to be analyzed.
 *
 * @return int[] Starting positions, in ascending order.
 */
public static function getLines($str)
{
    if (!$str instanceof UtfString && defined('USE_UTF_STRINGS') && USE_UTF_STRINGS) {
        // If the lexer uses UtfString for processing then the position will
        // represent the position of the character and not the position of
        // the byte.
        $str = new UtfString($str);
    }

    // A plain string is indexed byte by byte (`$str[$i]`), so the loop
    // bound must be the length in bytes, hence the '8bit' encoding. Using
    // the character count would stop too early on multi-byte input and
    // miss newlines located near the end of the string.
    $len = $str instanceof UtfString
        ? $str->length()
        : mb_strlen($str, '8bit');

    $lines = array(0);
    for ($i = 0; $i < $len; ++$i) {
        if ($str[$i] === "\n") {
            $lines[] = $i + 1;
        }
    }

    return $lines;
}
/**
 * Checks the length reported by UtfString::getCharLength() for a pair of
 * lead bytes per expected length, from 1 up to 6.
 */
public function testGetCharLength()
{
    // Expected length => lead bytes probed (bit patterns in the comments).
    $leadBytesByLength = array(
        1 => array(0x00, 0x7f), // 00000000, 01111111
        2 => array(0xc0, 0xdf), // 11000000, 11011111
        3 => array(0xe0, 0xef), // 11100000, 11101111
        4 => array(0xf0, 0xf7), // 11110000, 11110111
        5 => array(0xf8, 0xfb), // 11111000, 11111011
        6 => array(0xfc, 0xfd), // 11111100, 11111101
    );

    foreach ($leadBytesByLength as $expectedLength => $leadBytes) {
        foreach ($leadBytes as $leadByte) {
            $this->assertEquals(
                $expectedLength,
                UtfString::getCharLength(chr($leadByte))
            );
        }
    }
}
/**
 * Constructor.
 *
 * Stores the query, its length and the strict flag, resets the delimiter
 * to the class default and then calls `lex()`.
 *
 * A plain string is wrapped in a `UtfString` only when the
 * `USE_UTF_STRINGS` constant is defined and truthy and the string contains
 * multi-byte characters (its byte length differs from its UTF-8 character
 * length).
 *
 * @param string|UtfString $str    The query to be lexed.
 * @param bool             $strict Whether strict mode should be enabled or not.
 */
public function __construct($str, $strict = false)
{
    if ($str instanceof UtfString) {
        $len = $str->length();
    } else {
        // `strlen` is used instead of `mb_strlen` because the lexer needs
        // to parse each byte of the input.
        $len = strlen($str);

        // The constant is checked with `defined()` first: reading an
        // undefined constant throws an Error on PHP 8 and evaluates to a
        // truthy string on older versions.
        $useUtfStrings = defined('USE_UTF_STRINGS') && USE_UTF_STRINGS;

        // For multi-byte strings, a new instance of `UtfString` is
        // initialized (only if `UtfString` usage is forced).
        if ($useUtfStrings && $len !== mb_strlen($str, 'UTF-8')) {
            $str = new UtfString($str);
            // The length must now be the one reported by the wrapper.
            $len = $str->length();
        }
    }

    $this->str = $str;
    $this->len = $len;
    $this->strict = $strict;

    // Setting the delimiter.
    $this->delimiter = static::$DEFAULT_DELIMITER;

    $this->lex();
}
/**
 * Constructor.
 *
 * Records the query, its length and the strict flag, then calls `lex()`.
 *
 * @param string|UtfString $str    The query to be lexed.
 * @param bool             $strict Whether strict mode should be enabled or not.
 */
public function __construct($str, $strict = false)
{
    $this->str = $str;

    // A UtfString reports its own length; a plain string is measured with
    // `strlen`.
    if ($str instanceof UtfString) {
        $this->len = $str->length();
    } else {
        $this->len = strlen($str);
    }

    $this->strict = $strict;

    $this->lex();
}
/**
 * Constructor.
 *
 * Records the query, its length and the strict flag, resets the delimiter
 * to the class default and then calls `lex()`.
 *
 * @param string|UtfString $str    The query to be lexed.
 * @param bool             $strict Whether strict mode should be enabled or not.
 */
public function __construct($str, $strict = false)
{
    $this->str = $str;

    // A UtfString reports its own length; a plain string is measured with
    // `strlen`.
    if ($str instanceof UtfString) {
        $this->len = $str->length();
    } else {
        $this->len = strlen($str);
    }

    $this->strict = $strict;

    // Start from the default delimiter of the (late-bound) class.
    $this->delimiter = static::$DEFAULT_DELIMITER;

    $this->lex();
}