/**
 * Validate a string field read from an input line.
 *
 * The value is rejected when its byte length (as reported by strlen())
 * exceeds the limit, or when phutil_is_utf8_with_only_bmp_characters()
 * does not accept it. The length check runs first, so an over-long value
 * is always reported as "too long" regardless of its encoding.
 *
 * @param  string  $value       Value to validate.
 * @param  string  $field_name  Human-readable field name used in messages.
 * @param  int     $line_no     Input line number used in messages.
 * @param  int     $max_length  Largest accepted strlen() of the value.
 * @return void
 * @throws Exception  When the value is too long or fails the UTF-8 check.
 */
function check_string_value($value, $field_name, $line_no, $max_length) {
  $byte_count = strlen($value);
  if ($byte_count > $max_length) {
    $too_long =
      "{$field_name} '{$value}' defined on line #{$line_no} is too long, ".
      "maximum {$field_name} length is {$max_length} characters.";
    throw new Exception($too_long);
  }

  $is_acceptable = phutil_is_utf8_with_only_bmp_characters($value);
  if ($is_acceptable) {
    return;
  }

  $bad_encoding =
    "{$field_name} '{$value}' defined on line #{$line_no} is not a valid ".
    "UTF-8 string, ".
    "it should contain only UTF-8 characters.";
  throw new Exception($bad_encoding);
}
/**
 * Validate a string field read from an input line, raising a translated
 * error message on failure.
 *
 * The value is rejected when its byte length (as reported by strlen())
 * exceeds the limit, or when phutil_is_utf8_with_only_bmp_characters()
 * does not accept it. The length check runs first, so an over-long value
 * is always reported as "too long" regardless of its encoding.
 *
 * @param  string  $value       Value to validate.
 * @param  string  $field_name  Human-readable field name used in messages.
 * @param  int     $line_no     Input line number used in messages.
 * @param  int     $max_length  Largest accepted strlen() of the value.
 * @return void
 * @throws Exception  When the value is too long or fails the UTF-8 check.
 */
function check_string_value($value, $field_name, $line_no, $max_length) {
  $byte_count = strlen($value);
  if ($byte_count > $max_length) {
    $too_long = pht(
      "%s '%s' defined on line #%d is too long, ".
      "maximum %s length is %d characters.",
      $field_name,
      $value,
      $line_no,
      $field_name,
      $max_length);
    throw new Exception($too_long);
  }

  $is_acceptable = phutil_is_utf8_with_only_bmp_characters($value);
  if ($is_acceptable) {
    return;
  }

  $bad_encoding = pht(
    "%s '%s' defined on line #%d is not a valid ".
    "UTF-8 string, it should contain only UTF-8 characters.",
    $field_name,
    $value,
    $line_no);
  throw new Exception($bad_encoding);
}
/**
 * Round-trip a string containing every byte sequence our BMP check accepts
 * through MySQL and verify it comes back unchanged.
 *
 * Getting the same string out that we put in demonstrates that strings
 * which pass our BMP checks are also valid in MySQL, so no silent data
 * truncation will occur.
 */
public function testMySQLAgreesWithUsAboutBMP() {
  // Every continuation byte: \x80 through \xBF.
  $continuation = range(0x80, 0xbf);

  $payload = '';

  // One-byte sequences: \x01 through \x7F.
  foreach (range(0x01, 0x7f) as $byte) {
    $payload .= chr($byte);
  }

  // Two-byte sequences: \xC2\x80 through \xDF\xBF.
  foreach (range(0xc2, 0xdf) as $lead) {
    foreach ($continuation as $second) {
      $payload .= chr($lead).chr($second);
    }
  }

  // Three-byte sequences with a \xE0 lead byte. Here the second byte only
  // runs from \xA0 through \xBF; the third covers all continuation bytes.
  foreach (range(0xa0, 0xbf) as $second) {
    foreach ($continuation as $third) {
      $payload .= chr(0xe0).chr($second).chr($third);
    }
  }

  // Three-byte sequences \xE1\xZZ\xZZ through \xEF\xZZ\xZZ.
  foreach (range(0xe1, 0xef) as $lead) {
    foreach ($continuation as $second) {
      foreach ($continuation as $third) {
        $payload .= chr($lead).chr($second).chr($third);
      }
    }
  }

  // 127 + (30 * 64 * 2) + (32 * 64 * 3) + (15 * 64 * 64 * 3) bytes.
  $this->assertEqual(194431, strlen($payload));
  $this->assertTrue(phutil_is_utf8_with_only_bmp_characters($payload));

  $written = id(new HarbormasterScratchTable())
    ->setData('all.utf8.bmp')
    ->setBigData($payload)
    ->save();

  $loaded = id(new HarbormasterScratchTable())->load($written->getID());

  $this->assertEqual($payload, $loaded->getBigData());
}
/**
 * Check phutil_is_utf8() and phutil_is_utf8_with_only_bmp_characters()
 * against a table of valid, invalid and beyond-BMP byte sequences.
 *
 * Each table entry maps an input string to a triple of: expected result of
 * the plain UTF-8 check, expected result of the BMP-only check, and a label
 * used in assertion messages.
 *
 * All non-ASCII inputs are written as "\x" escapes rather than raw bytes.
 * Raw invalid bytes do not survive editors or transcoding: once they are
 * replaced with U+FFFD they become valid BMP text, and distinct inputs
 * collapse into the same array key.
 */
public function testUTF8BMP() {
  $tests = array(
    "" => array(true, true, "empty string"),
    "a" => array(true, true, "a"),

    // "a" followed by U+0360 twice.
    "a\xCD\xA0\xCD\xA0" => array(true, true, "a with combining"),

    // U+2603.
    "\xE2\x98\x83" => array(true, true, "snowman"),

    // The last code point in the BMP.
    "\xEF\xBF\xBF" => array(true, true, "U+FFFF"),

    // The final byte is not a continuation byte.
    "\xEF\xBF\xC0" => array(false, false, "Invalid, byte range."),

    // The first code point above the BMP.
    "\xF0\x90\x80\x80" => array(true, false, "U+10000"),

    // U+1D11E, a four-byte character.
    "\xF0\x9D\x84\x9E" => array(true, false, "gclef"),
    "musical \xF0\x9D\x84\x9E g-clef" => array(true, false, "gclef text"),

    // A four-byte character with its last byte missing.
    "\xF0\x9D\x84" => array(false, false, "Invalid, truncated."),

    // An overlong three-byte encoding.
    "\xE0\x80\x80" => array(false, false, "Nonminimal 3-byte character."),

    // Lead bytes with one or more continuation bytes missing.
    "\xCD" => array(false, false, "Partial 2-byte character."),
    "\xE0\xA0" => array(false, false, "Partial BMP 0xE0 character."),
    "\xE2\x98" => array(false, false, "Partial BMP cahracter."),
  );

  foreach ($tests as $input => $test) {
    list($expect_utf8, $expect_bmp, $test_name) = $test;

    $this->assertEqual(
      $expect_utf8,
      phutil_is_utf8($input),
      pht('is_utf(%s)', $test_name));

    $this->assertEqual(
      $expect_bmp,
      phutil_is_utf8_with_only_bmp_characters($input),
      pht('is_utf_bmp(%s)', $test_name));
  }
}
/**
 * Reject strings that fail the BMP-only UTF-8 check before they are used in
 * a query.
 *
 * Even with the strictest settings, MySQL will silently truncate data when
 * it encounters characters outside of the BMP, which can lead to data loss
 * and security problems.
 *
 * @param  string  $string  Text about to be placed into a query.
 * @return void
 * @throws AphrontQueryCharacterSetException  When the string is not
 *         accepted by phutil_is_utf8_with_only_bmp_characters().
 */
protected function validateUTF8String($string) {
  $is_safe = phutil_is_utf8_with_only_bmp_characters($string);

  if (!$is_safe) {
    $message = pht(
      'Attempting to construct a query containing characters outside of '.
      'the Unicode Basic Multilingual Plane. MySQL will silently truncate '.
      'this data if it is inserted into a `utf8` column. Use the `%%B` '.
      'conversion to escape binary strings data.');

    throw new AphrontQueryCharacterSetException($message);
  }
}
/**
 * Check phutil_is_utf8(), phutil_is_utf8_slowly() and
 * phutil_is_utf8_with_only_bmp_characters() against a table of valid,
 * invalid and beyond-BMP byte sequences.
 *
 * Each table entry maps an input string to a triple of: expected result of
 * the plain UTF-8 checks, expected result of the BMP-only check, and a
 * label used in assertion messages.
 *
 * All non-ASCII inputs are written as "\x" escapes rather than raw bytes.
 * Raw invalid bytes do not survive editors or transcoding: once they are
 * replaced with U+FFFD they become valid BMP text, and distinct inputs
 * collapse into the same array key.
 */
public function testUTF8BMP() {
  $tests = array(
    '' => array(true, true, pht('empty string')),
    'a' => array(true, true, 'a'),

    // "a" followed by U+0360 twice.
    "a\xCD\xA0\xCD\xA0" => array(true, true, pht('%s with combining', 'a')),

    // U+2603.
    "\xE2\x98\x83" => array(true, true, pht('snowman')),

    // The last code point in the BMP.
    "\xEF\xBF\xBF" => array(true, true, 'U+FFFF'),

    // The final byte is not a continuation byte.
    "\xEF\xBF\xC0" => array(false, false, pht('Invalid, byte range.')),

    // An overlong four-byte encoding.
    "\xF0\x81\x80\x80" => array(
      false,
      false,
      pht('Nonminimal 4-byte character.'),
    ),

    // The first code point above the BMP.
    "\xF0\x90\x80\x80" => array(true, false, 'U+10000'),

    // U+1D11E, a four-byte character.
    "\xF0\x9D\x84\x9E" => array(true, false, 'gclef'),
    "musical \xF0\x9D\x84\x9E g-clef" => array(
      true,
      false,
      pht('gclef text'),
    ),

    // A four-byte character with its last byte missing.
    "\xF0\x9D\x84" => array(false, false, pht('Invalid, truncated.')),

    // An overlong three-byte encoding.
    "\xE0\x80\x80" => array(
      false,
      false,
      pht('Nonminimal 3-byte character.'),
    ),

    // Lead bytes with one or more continuation bytes missing.
    "\xCD" => array(false, false, pht('Partial 2-byte character.')),
    "\xE0\xA0" => array(false, false, pht('Partial BMP 0xE0 character.')),
    "\xE2\x98" => array(false, false, pht('Partial BMP cahracter.')),
  );

  foreach ($tests as $input => $test) {
    list($expect_utf8, $expect_bmp, $test_name) = $test;

    // Depending on what's installed on the system, this may use an
    // extension.
    $this->assertEqual(
      $expect_utf8,
      phutil_is_utf8($input),
      pht('is_utf(%s)', $test_name));

    // Also test this against the pure PHP implementation, explicitly.
    $this->assertEqual(
      $expect_utf8,
      phutil_is_utf8_slowly($input),
      pht('is_utf_slowly(%s)', $test_name));

    $this->assertEqual(
      $expect_bmp,
      phutil_is_utf8_with_only_bmp_characters($input),
      pht('is_utf_bmp(%s)', $test_name));
  }
}