/** * Initialize the entityNames array with all possible named entities * * @return does not return a value. */ private function _initializeMaps() { $entityNames = array("quot", "amp", "lt", "gt", "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml", "OElig", "oelig", "Scaron", "scaron", "Yuml", "fnof", "circ", "tilde", "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", "thetasym", "upsih", "piv", "ensp", "emsp", "thinsp", "zwnj", "zwj", "lrm", "rlm", "ndash", "mdash", "lsquo", "rsquo", "sbquo", "ldquo", "rdquo", "bdquo", "dagger", "Dagger", "bull", "hellip", "permil", "prime", "Prime", "lsaquo", "rsaquo", "oline", "frasl", "euro", "image", "weierp", "real", "trade", "alefsym", "larr", "uarr", "rarr", "darr", "harr", "crarr", "lArr", "uArr", "rArr", "dArr", "hArr", "forall", "part", "exist", "empty", "nabla", "isin", "notin", "ni", "prod", "sum", "minus", "lowast", "radic", "prop", "infin", "ang", "and", "or", "cap", "cup", "int", "there4", "sim", "cong", "asymp", "ne", "equiv", "le", "ge", "sub", "sup", "nsub", "sube", "supe", "oplus", "otimes", "perp", "sdot", "lceil", "rceil", "lfloor", "rfloor", "lang", "rang", "loz", "spades", "clubs", "hearts", "diams"); $entityValues = array(34, 38, 60, 62, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 338, 339, 352, 353, 376, 402, 710, 732, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 931, 932, 933, 934, 935, 936, 937, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 977, 978, 982, 8194, 8195, 8201, 8204, 8205, 8206, 8207, 8211, 8212, 8216, 8217, 8218, 8220, 8221, 8222, 8224, 8225, 8226, 8230, 8240, 8242, 8243, 8249, 8250, 8254, 8260, 8364, 8465, 8472, 8476, 8482, 8501, 8592, 8593, 8594, 8595, 8596, 8629, 8656, 8657, 8658, 8659, 8660, 8704, 8706, 8707, 8709, 8711, 8712, 8713, 8715, 8719, 8721, 8722, 8727, 8730, 8733, 8734, 8736, 8743, 8744, 8745, 8746, 8747, 8756, 8764, 8773, 8776, 8800, 8801, 8804, 8805, 8834, 8835, 8836, 8838, 8839, 8853, 8855, 8869, 8901, 8968, 8969, 8970, 8971, 9001, 9002, 9674, 9824, 9827, 9829, 9830); for ($i = 0; $i < count($entityNames); $i++) { $character = html_entity_decode('&' . $entityNames[$i] . ';', ENT_QUOTES, 'UTF-8'); // Normalize encoding to UTF-32 $character = mb_convert_encoding($character, 'UTF-32', 'UTF-8'); self::$_characterToEntityMap[$character] = $entityNames[$i]; self::$_entityToCharacterMap[$entityNames[$i]] = $character; // get the length of the longest entity name $len = mb_strlen($entityNames[$i], 'UTF-8'); if ($len > self::$_longestEntity) { self::$_longestEntity = $len; } } self::$_longestEntity += 2; self::$_mapIsInitialized = true; }
public function testDecodeDoesNotProduceMixedCharacterEncoding() { $codec = new HTMLEntityCodec(); // expecting a UTF-8 encoded string $expected = mb_convert_encoding("a b c d e f\tg h i j k¡l¢m", 'UTF-8', 'ISO-8859-1'); // check that the encoding conversion went well and the expected string is correct $expected_unpacked = array(1 => 0x61, 2 => 0x20, 3 => 0x62, 4 => 0x20, 5 => 0x63, 6 => 0x20, 7 => 0x64, 8 => 0x20, 9 => 0x65, 10 => 0x20, 11 => 0x66, 12 => 0x9, 13 => 0x67, 14 => 0x20, 15 => 0x68, 16 => 0x20, 17 => 0x69, 18 => 0x20, 19 => 0x6a, 20 => 0xc2, 21 => 0xa0, 22 => 0x6b, 23 => 0xc2, 24 => 0xa1, 25 => 0x6c, 26 => 0xc2, 27 => 0xa2, 28 => 0x6d); $unpacked = unpack('C*', $expected); $this->assertSame($expected_unpacked, $unpacked, 'Ensuring expected value was correctly encoded to UTF-8 - %s'); // decode and hope we get $expected! $this->assertEquals($expected, $codec->decode('a b c d e f	g h i j k¡l¢m')); }