/**
 * Recursively cleans a value: strips ASCII control characters from strings
 * and drops byte sequences that are not valid in the target character set.
 *
 * Arrays and objects are walked recursively; both keys and values are
 * cleaned. When a cleaned key differs from the original key the cleaned
 * entry is added and the original entry is left in place.
 *
 * @param   mixed   $var      value to clean (array, object, string, or any other type)
 * @param   string  $charset  character set to validate against; defaults to
 *                            JsonApiApplication::$charset when empty
 * @return  mixed   the cleaned value; non-string scalars are returned unchanged
 */
public static function clean($var, $charset = NULL)
{
    if (!$charset) {
        // Use the application character set
        $charset = JsonApiApplication::$charset;
    }

    if (is_array($var) or is_object($var)) {
        // Plain objects cannot be written with array syntax (that is a fatal
        // error), so only arrays and ArrayAccess implementations use offsets.
        $by_offset = (is_array($var) or $var instanceof \ArrayAccess);

        foreach ($var as $key => $val) {
            // Recursion! The caller's charset must follow the data down.
            $clean_key = UTF8::clean($key, $charset);
            $clean_val = UTF8::clean($val, $charset);

            if ($by_offset) {
                $var[$clean_key] = $clean_val;
            } else {
                $var->{$clean_key} = $clean_val;
            }
        }
    } elseif (is_string($var) and $var !== "") {
        // Remove control characters
        $var = UTF8::strip_ascii_ctrl($var);

        if (!UTF8::is_ascii($var)) {
            // Remember the current substitute character so it can be put back
            $previous_substitute = mb_substitute_character();

            // Drop illegal characters instead of replacing them with '?'
            mb_substitute_character("none");

            try {
                // Converting a charset to itself discards invalid sequences.
                // This is expensive, so it only runs for non-ASCII input.
                $var = mb_convert_encoding($var, $charset, $charset);
            } finally {
                // Restore the global setting even if the conversion throws
                mb_substitute_character($previous_substitute);
            }
        }
    }

    return $var;
}
/**
 * Prepares multibyte string support for UTF-8 operation.
 *
 * When the mbstring extension is loaded, its runtime settings are checked and
 * normalised (regex/internal encoding UTF-8, no substitute character, HTTP
 * output "pass", language "uni"); incompatible php.ini settings are reported
 * with E_USER_WARNING. When the extension is missing, the bundled
 * replacement in Bootup/mbstring.php is loaded instead, unless
 * MB_OVERLOAD_MAIL is already defined.
 *
 * @return void
 */
static function initMbstring()
{
    if (!extension_loaded('mbstring')) {
        if (!defined('MB_OVERLOAD_MAIL')) {
            // The replacement needs iconv, or its stand-in set up by initIconv()
            extension_loaded('iconv') or static::initIconv();

            require __DIR__ . '/Bootup/mbstring.php';
        }

        return;
    }

    $translation = ini_get('mbstring.encoding_translation');
    $translationOn = (int) $translation
        || in_array(strtolower($translation), array('on', 'yes', 'true'));
    $httpInputSafe = in_array(
        strtolower(ini_get('mbstring.http_input')),
        array('pass', '8bit', 'utf-8')
    );

    if ($translationOn && !$httpInputSafe) {
        user_error('php.ini settings: Please disable mbstring.encoding_translation or set mbstring.http_input to "pass"', E_USER_WARNING);
    }

    // MB_OVERLOAD_STRING was removed together with mbstring.func_overload in
    // PHP 8; referencing the bare constant there throws, so test for it first.
    if (defined('MB_OVERLOAD_STRING')
        && (MB_OVERLOAD_STRING & (int) ini_get('mbstring.func_overload'))
    ) {
        user_error('php.ini settings: Please disable mbstring.func_overload', E_USER_WARNING);
    }

    mb_regex_encoding('UTF-8');
    ini_set('mbstring.script_encoding', 'pass');

    if ('utf-8' !== strtolower(mb_internal_encoding())) {
        mb_internal_encoding('UTF-8');
        ini_set('mbstring.internal_encoding', 'UTF-8');
    }

    if ('none' !== strtolower(mb_substitute_character())) {
        mb_substitute_character('none');
        ini_set('mbstring.substitute_character', 'none');
    }

    if (!in_array(strtolower(mb_http_output()), array('pass', '8bit'))) {
        mb_http_output('pass');
        ini_set('mbstring.http_output', 'pass');
    }

    if (!in_array(strtolower(mb_language()), array('uni', 'neutral'))) {
        mb_language('uni');
        ini_set('mbstring.language', 'uni');
    }
}
/**
 * Builds the table of known character sets and records which of them the
 * mbstring and iconv functions available on this host can handle.
 *
 * Fills three properties:
 *  - $this->charsets   : charset name => translated display label
 *  - $this->mb_sets    : entries of $this->charsets whose name appears in the mbstring encoding list
 *  - $this->iconv_sets : entries of $this->charsets that iconv accepts as a conversion target
 */
function __construct()
{
    // NOTE(review): "ISO-2022-JP" appears twice below; PHP keeps the later
    // label, so "Japanese (JIS)" is never used. Confirm which label is intended.
    // NOTE(review): the "UNICODEFFFE" label contains a line break inside the
    // msgid. It is reproduced as found; confirm whether it is intentional.
    $this->charsets = array(
        "ASMO-708" => gettext("Arabic"),
        "BIG5" => gettext("Chinese Traditional"),
        "CP1026" => gettext("IBM EBCDIC (Turkish Latin-5)"),
        "cp866" => gettext("Cyrillic (DOS)"),
        "CP870" => gettext("IBM EBCDIC (Multilingual Latin-2)"),
        "CISO2022JP" => gettext("Japanese (JIS-Allow 1 byte Kana)"),
        "DOS-720" => gettext("Arabic (DOS)"),
        "DOS-862" => gettext("Hebrew (DOS)"),
        "EBCDIC-CP-US" => gettext("IBM EBCDIC (US-Canada)"),
        "EUC-CN" => gettext("Chinese Simplified (EUC)"),
        "EUC-JP" => gettext("Japanese (EUC)"),
        "EUC-KR" => gettext("Korean (EUC)"),
        "GB2312" => gettext("Chinese Simplified (GB2312)"),
        "HZ-GB-2312" => gettext("Chinese Simplified (HZ)"),
        "IBM437" => gettext("OEM United States"),
        "IBM737" => gettext("Greek (DOS)"),
        "IBM775" => gettext("Baltic (DOS)"),
        "IBM850" => gettext("Western European (DOS)"),
        "IBM852" => gettext("Central European (DOS)"),
        "IBM857" => gettext("Turkish (DOS)"),
        "IBM861" => gettext("Icelandic (DOS)"),
        "IBM869" => gettext("Greek, Modern (DOS)"),
        "ISO-2022-JP" => gettext("Japanese (JIS)"),
        "ISO-2022-JP" => gettext("Japanese (JIS-Allow 1 byte Kana - SO/SI)"),
        "ISO-2022-KR" => gettext("Korean (ISO)"),
        "ISO-8859-1" => gettext("Western European (ISO)"),
        "ISO-8859-15" => gettext("Latin 9 (ISO)"),
        "ISO-8859-2" => gettext("Central European (ISO)"),
        "ISO-8859-3" => gettext("Latin 3 (ISO)"),
        "ISO-8859-4" => gettext("Baltic (ISO)"),
        "ISO-8859-5" => gettext("Cyrillic (ISO)"),
        "ISO-8859-6" => gettext("Arabic (ISO)"),
        "ISO-8859-7" => gettext("Greek (ISO)"),
        "ISO-8859-8" => gettext("Hebrew (ISO-Visual)"),
        "ISO-8859-8-i" => gettext("Hebrew (ISO-Logical)"),
        "ISO-8859-9" => gettext("Turkish (ISO)"),
        "JOHAB" => gettext("Korean (Johab)"),
        "KOi8-R" => gettext("Cyrillic (KOI8-R)"),
        "KOi8-U" => gettext("Cyrillic (KOI8-U)"),
        "KS_C_5601-1987" => gettext("Korean"),
        "MACINTOSH" => gettext("Western European (MAC)"),
        "SHIFT_JIS" => gettext("Japanese (Shift-JIS)"),
        "UNICODE" => gettext("Unicode"),
        "UNICODEFFFE" => gettext("Unicode 
(Big-Endian)"),
        "US-ASCII" => gettext("US-ASCII"),
        "UTF-7" => gettext("Unicode (UTF-7)"),
        "UTF-8" => gettext("Unicode (UTF-8)"),
        "WINDOWS-1250" => gettext("Central European (Windows)"),
        "WINDOWS-1251" => gettext("Cyrillic (Windows)"),
        "WINDOWS-1252" => gettext("Western European (Windows)"),
        "WINDOWS-1253" => gettext("Greek (Windows)"),
        "WINDOWS-1254" => gettext("Turkish (Windows)"),
        "WINDOWS-1255" => gettext("Hebrew (Windows)"),
        "WINDOWS-1256" => gettext("Arabic (Windows)"),
        "WINDOWS-1257" => gettext("Baltic (Windows)"),
        "WINDOWS-1258" => gettext("Vietnamese (Windows)"),
        "WINDOWS-874" => gettext("Thai (Windows)")
    );

    // Nothing is assumed supported; each backend below adds what it can handle.
    $this->iconv_sets = array();
    $this->mb_sets = array();

    if (function_exists('mb_convert_encoding')) {
        @mb_substitute_character('none');

        // Ask mbstring for its encodings when possible; otherwise fall back
        // to a fixed list of encoding names.
        $mbNames = function_exists('mb_list_encodings')
            ? mb_list_encodings()
            : array(
                "pass", "auto", "byte2be", "byte2le", "byte4be", "byte4le",
                "BASE64", "UUENCODE", "HTML-ENTITIES", "Quoted-Printable", "7bit", "8bit",
                "UCS-4", "UCS-4BE", "UCS-4LE", "UCS-2", "UCS-2BE", "UCS-2LE",
                "UTF-32", "UTF-32BE", "UTF-32LE", "UTF-16", "UTF-16BE", "UTF-16LE",
                "UTF-8", "UTF-7", "UTF7-IMAP", "ASCII",
                "EUC-JP", "SJIS", "eucJP-win", "SJIS-win", "CP51932", "JIS",
                "ISO-2022-JP", "ISO-2022-JP-MS", "Windows-1252",
                "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5",
                "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", "ISO-8859-9", "ISO-8859-10",
                "ISO-8859-13", "ISO-8859-14", "ISO-8859-15", "ISO-8859-16",
                "EUC-CN", "CP936", "HZ", "EUC-TW", "BIG-5", "EUC-KR", "UHC",
                "ISO-2022-KR", "Windows-1251", "CP866", "KOI8-R", "ArmSCII-8"
            );

        // The name comparison is case-sensitive, exactly as spelled in both lists.
        foreach ($this->charsets as $name => $label) {
            if (!in_array($name, $mbNames)) {
                continue;
            }
            $this->mb_sets[$name] = $label;
        }
    }

    if (function_exists('iconv')) {
        foreach ($this->charsets as $name => $label) {
            // Probe iconv with a trivial conversion; false means it rejected the name.
            $probe = @iconv("UTF-8", $name, "UTF-8");
            if ($probe === false) {
                continue;
            }
            $this->iconv_sets[$name] = $label;
        }
    }
}
/**
 * Smarty modifier: sanitises an HTML fragment with HTML Purifier.
 *
 * HTML Purifier is configured for UTF-8 whenever mbstring can convert the
 * input; the fragment is converted to UTF-8 before purification and back to
 * the requested encoding afterwards. Without mbstring, or when the encoding
 * already is UTF-8, the encoding is handed to HTML Purifier directly.
 *
 * @param string      $html    HTML fragment to purify
 * @param string|null $ecoding character encoding of $html; _CHARSET when empty
 *                             (the misspelled name is part of the existing signature)
 * @param string|null $doctype one of the supported doctype names; any other
 *                             value leaves the HTML Purifier default in place
 * @return string purified HTML in the original encoding
 */
function smarty_modifier_xoops_html_purifier($html, $ecoding = null, $doctype = null)
{
    require_once XOOPS_LIBRARY_PATH . '/htmlpurifier/library/HTMLPurifier.auto.php';

    // The body previously read an undefined $encoding, so the caller's
    // argument was silently ignored and _CHARSET was always used.
    $encoding = $ecoding ? $ecoding : _CHARSET;

    $doctypeArr = array(
        "HTML 4.01 Strict",
        "HTML 4.01 Transitional",
        "XHTML 1.0 Strict",
        "XHTML 1.0 Transitional",
        "XHTML 1.1",
    );

    $config = HTMLPurifier_Config::createDefault();
    if (in_array($doctype, $doctypeArr)) {
        $config->set('HTML.Doctype', $doctype);
    }

    // Convert through UTF-8 only when the input is not UTF-8 and mbstring is available
    $_conv = ($encoding !== 'UTF-8' && function_exists('mb_convert_encoding'));
    if ($_conv) {
        // Drop unconvertible characters instead of inserting '?'
        $_substitute = mb_substitute_character();
        mb_substitute_character('none');
        $html = mb_convert_encoding($html, 'UTF-8', $encoding);
        $config->set('Core.Encoding', 'UTF-8');
    } else {
        $config->set('Core.Encoding', $encoding);
    }

    $purifier = new HTMLPurifier($config);
    $html = $purifier->purify($html);

    if ($_conv) {
        $html = mb_convert_encoding($html, $encoding, 'UTF-8');
        // Put the global substitute character back
        mb_substitute_character($_substitute);
    }

    return $html;
}
/**
 * Converts submitted text from Shift_JIS to UTF-8, with carrier-specific
 * handling chosen by $this->is_ezweb() / $this->is_softbank().
 *
 * Arrays are filtered element by element (recursively). Strings are first
 * normalised with mb_convert_kana() and the result is trimmed.
 *
 * @param string|array $str raw input
 * @return string|array converted, trimmed value(s)
 */
function inputFilter($str)
{
    if (is_array($str)) {
        return array_map(array($this, "inputFilter"), $str);
    }

    // Submitted pictograms are stored as Unicode, hence SJIS-win
    $str = mb_convert_kana($str, 'KVrns', 'SJIS-win');

    if ($this->is_ezweb()) {
        // Numeric-entity conversion maps; each group of four is
        // (start, end, offset, mask) as defined by mb_encode_numericentity().
        $sjisRanges = array(
            0xe234, 0xe272, 0xa0c, 0xffff,
            0xe273, 0xe2ef, 0xa0d, 0xffff,
            0xe2f0, 0xe32e, 0xa50, 0xffff,
            0xe32f, 0xe342, 0xa51, 0xffff,
            0xe468, 0xe4a6, 0xad8, 0xffff,
            0xe4a7, 0xe523, 0xad9, 0xffff,
            0xe524, 0xe562, 0xb1c, 0xffff,
            0xe563, 0xe5df, 0xb1d, 0xffff,
        );
        $utf8Ranges = array(
            0xec40, 0xecfc, 0x0, 0xffff,
            0xed40, 0xed93, 0x0, 0xffff,
            0xef40, 0xeffc, 0x0, 0xffff,
            0xf040, 0xf0fc, 0x0, 0xffff,
        );

        // Protect the mapped code points as entities, convert, then decode them
        $asEntities = mb_encode_numericentity($str, $sjisRanges, 'SJIS-win');
        $asUtf8 = mb_convert_encoding($asEntities, "UTF-8", "SJIS-win");

        return trim(mb_decode_numericentity($asUtf8, $utf8Ranges, 'UTF-8'));
    }

    if ($this->is_softbank()) {
        // 'long' makes mbstring emit a textual marker for characters it
        // cannot convert; the markers are then passed to the fallback callback.
        $previous = mb_substitute_character();
        mb_substitute_character('long');
        $asUtf8 = mb_convert_encoding($str, 'UTF-8', 'SJIS');
        mb_substitute_character($previous);

        $resolved = preg_replace_callback(
            '/BAD\\+([0-9A-F]{4})/',
            array($this, '_softbank_fallbackSjisToUtf8'),
            $asUtf8
        );

        return trim($resolved);
    }

    return trim(mb_convert_encoding($str, "UTF-8", "SJIS-win"));
}
/**
 * Converts a UTF-8 string to another character set (BIG5 by default),
 * turning characters the target cannot represent into decimal HTML
 * numeric character references.
 *
 * Side effects kept from the original: the mbstring regex encoding is set to
 * $charset and the mbstring substitute character is left at 'long'.
 *
 * @param string $str     UTF-8 input
 * @param string $charset target character set
 * @return string converted string with "&#NNNN;" for unrepresentable characters
 */
function u2b($str, $charset = 'BIG5')
{
    // Use $charset as the encoding for multibyte regex functions
    mb_regex_encoding($charset);

    // Mark characters missing from the target charset as "U+<hex>"
    mb_substitute_character('long');

    $str = mb_convert_encoding($str, $charset, 'UTF-8');

    // Turn each "U+<hex>" marker into a decimal HTML reference. The original
    // used the /e modifier, which no longer exists in PHP 7+ (preg_replace
    // then fails and returns NULL), so a callback does the work.
    return preg_replace_callback(
        '/U\\+([0-9A-F]{4})/',
        function ($match) {
            return '&#' . intval($match[1], 16) . ';';
        },
        $str
    );
}
/**
 * Constructor.
 *
 * Switches the mbstring and PHP charset settings to Todo::ENCODING and
 * stores the encoded category.
 *
 * @param string $cat category
 */
public function __construct($cat)
{
    $encoding = Todo::ENCODING;

    mb_internal_encoding($encoding);
    mb_regex_encoding($encoding);

    // Charset announced in the HTTP header
    ini_set('default_charset', $encoding);
    ini_set('mbstring.strict_detection', true);

    // Characters that cannot be converted become "_"
    mb_substitute_character(0x5f);

    $encodedCategory = $this->_encode($cat);
    $this->cat = $encodedCategory;
}
/**
 * Parses the given XML into $this->doc, retrying with repaired input when
 * libxml reports specific errors, and records fatal parse errors.
 *
 * Retry 1: libxml error code 32 -> re-parse after normalize_encoding().
 * Retry 2: any error list containing code 9 -> normalise, drop invalid
 *          UTF-8 bytes and characters outside the XML character range,
 *          then re-parse.
 * Remaining fatal errors are formatted into $this->libxml_errors; the first
 * one is also stored in $this->error.
 *
 * @param string $data raw XML document
 */
function __construct($data)
{
    libxml_use_internal_errors(true);
    libxml_clear_errors();

    $this->doc = new DOMDocument();
    $this->doc->loadXML($data);

    // Later mb_convert_encoding() calls should drop invalid bytes silently
    mb_substitute_character("none");

    $error = libxml_get_last_error();

    // libxml compiled without iconv?
    if ($error && $error->code == 32) {
        $data = $this->normalize_encoding($data);

        if ($data) {
            libxml_clear_errors();
            $this->doc = new DOMDocument();
            $this->doc->loadXML($data);
            $error = libxml_get_last_error();
        }
    }

    // some terrible invalid unicode entity?
    if ($error) {
        foreach (libxml_get_errors() as $reported) {
            if ($reported->code != 9) {
                continue;
            }

            // if the source feed is not in utf8, the next conversion will fail
            $data = $this->normalize_encoding($data);

            // remove dangling bytes
            $data = mb_convert_encoding($data, 'UTF-8', 'UTF-8');

            // apparently not all UTF-8 characters are valid for XML
            $data = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $data);

            if ($data) {
                libxml_clear_errors();
                $this->doc = new DOMDocument();
                $this->doc->loadXML($data);
                $error = libxml_get_last_error();
            }

            // Only the first such error triggers a repair attempt
            break;
        }
    }

    if ($error) {
        foreach (libxml_get_errors() as $error) {
            if ($error->level != LIBXML_ERR_FATAL) {
                continue;
            }

            // currently only the first error is reported
            if (!isset($this->error)) {
                $this->error = $this->format_error($error);
            }

            $this->libxml_errors[] = $this->format_error($error);
        }
    }

    libxml_clear_errors();

    $this->items = array();
}
/**
 * Class constructor
 *
 * Applies the configured charset to PHP, mbstring and iconv, and defines the
 * MB_ENABLED, ICONV_ENABLED and UTF8_ENABLED constants so that later code
 * does not have to repeat extension_loaded() / function_exists() checks.
 *
 * @return void
 */
public function __construct()
{
    $charset = strtoupper(Config::get('main')->charset);
    ini_set('default_charset', $charset);

    if (!extension_loaded('mbstring')) {
        define('MB_ENABLED', FALSE);
    } else {
        define('MB_ENABLED', TRUE);

        // mbstring.internal_encoding is deprecated starting with PHP 5.6
        // and its usage triggers E_DEPRECATED messages, so newer versions
        // go through mb_internal_encoding() instead.
        if (Core::isPHP('5.6')) {
            mb_internal_encoding($charset);
        } else {
            @ini_set('mbstring.internal_encoding', $charset);
        }

        // This is required for mb_convert_encoding() to strip invalid characters.
        // That's utilized by Utf8, but it's also done for consistency with iconv.
        mb_substitute_character('none');
    }

    // There's an ICONV_IMPL constant, but the PHP manual says that using
    // iconv's predefined constants is "strongly discouraged".
    if (!extension_loaded('iconv')) {
        define('ICONV_ENABLED', FALSE);
    } else {
        define('ICONV_ENABLED', TRUE);

        // iconv.internal_encoding is deprecated starting with PHP 5.6
        // and its usage triggers E_DEPRECATED messages.
        if (Core::isPHP(5.6)) {
            // NOTE(review): 'default_encoding' does not look like a recognised
            // ini directive, so this call may have no effect; confirm intent.
            ini_set('default_encoding', $charset);
        } else {
            @ini_set('iconv.internal_encoding', $charset);
        }
    }

    if (Core::isPHP('5.6')) {
        // NOTE(review): the documented directive name appears to be
        // 'internal_encoding'; verify that 'php.internal_encoding' is honoured.
        ini_set('php.internal_encoding', $charset);
    }

    // UTF-8 support needs PCRE's UTF-8 error constant, at least one of the
    // two conversion extensions, and a UTF-8 application charset.
    $utf8Supported = defined('PREG_BAD_UTF8_ERROR')
        && (ICONV_ENABLED === TRUE or MB_ENABLED === TRUE)
        && strtoupper($charset) === 'UTF-8';

    if ($utf8Supported) {
        define('UTF8_ENABLED', TRUE);
        Logger::log('UTF-8 Support Enabled');
    } else {
        define('UTF8_ENABLED', FALSE);
        Logger::log('UTF-8 Support Disabled');
    }
}
/**
 * AbstractDiff constructor.
 *
 * Creates a default HtmlDiffConfig for the given encoding, applies the
 * optional overrides, and stores the two texts to compare. Also sets the
 * global mbstring substitute character to a space (0x20).
 *
 * @param string     $oldText
 * @param string     $newText
 * @param string     $encoding
 * @param null|array $specialCaseTags overrides the config's special-case tags when not null
 * @param null|bool  $groupDiffs      overrides the config's group-diffs flag when not null
 */
public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
{
    // Unconvertible characters are replaced by a plain space
    mb_substitute_character(0x20);

    $defaultConfig = HtmlDiffConfig::create()->setEncoding($encoding);
    $this->setConfig($defaultConfig);

    if (null !== $specialCaseTags) {
        $this->config->setSpecialCaseTags($specialCaseTags);
    }

    if (null !== $groupDiffs) {
        $this->config->setGroupDiffs($groupDiffs);
    }

    $this->oldText = $oldText;
    $this->newText = $newText;
    $this->content = '';
}
/**
 * Formats the response rows into a temporary file and attaches that file to
 * the response as its stream.
 *
 * Reads 'inputCharset', 'outputCharset' and 'rows' from $response->data;
 * either charset falls back to Yii::$app->charset when missing or empty.
 * Every row is rendered with formatRow() and terminated with CRLF.
 *
 * @param object $response response whose data is formatted; its content is
 *                         cleared and its stream is set to the temporary file
 */
public function format($response)
{
    $requestedInput = @$response->data['inputCharset'];
    $this->inputCharset = $requestedInput ? $requestedInput : Yii::$app->charset;

    $requestedOutput = @$response->data['outputCharset'];
    $this->outputCharset = $requestedOutput ? $requestedOutput : Yii::$app->charset;

    // Substitute character: hand the current value and a restoring callback
    // to Resource. Presumably Resource invokes the callback when it is
    // released at the end of this method; confirm against the Resource class.
    // The variable must stay alive until the method returns.
    $substituteGuard = new Resource(mb_substitute_character(), function ($previous) {
        mb_substitute_character($previous);
    });
    mb_substitute_character(0x3013);

    $buffer = tmpfile();
    foreach ($response->data['rows'] as $row) {
        $line = $this->formatRow($row) . "\r\n";
        fwrite($buffer, $line);
    }

    // Rewind so that the consumer of the stream starts at the first row
    fseek($buffer, 0, SEEK_SET);

    $response->content = null;
    $response->stream = $buffer;
}
/**
 * Returns the file's source code as UTF-8.
 *
 * The encoding is detected with finfo; anything other than UTF-8 is
 * converted with iconv (transliterating where possible). The result is then
 * re-encoded UTF-8 -> UTF-8 with invalid bytes dropped, and any difference
 * is treated as an error.
 *
 * Side effect: leaves the mbstring substitute character set to 'none'.
 *
 * @return string
 *
 * @throws Backend\SourceFileException when the conversion to UTF-8 fails or
 *                                     the data contains invalid UTF-8 bytes
 */
public function getSource()
{
    $code = file_get_contents($this->fileInfo->getPathname());

    $info = new \finfo();
    $encoding = $info->file($this->fileInfo, FILEINFO_MIME_ENCODING);

    if (strtolower($encoding) != 'utf-8') {
        try {
            $converted = iconv($encoding, 'UTF-8//TRANSLIT', $code);
        } catch (\ErrorException $e) {
            // Reached only if an error handler turns iconv's notice into an
            // ErrorException; presumably the project installs one.
            throw new SourceFileException('Encoding error - conversion to UTF-8 failed', SourceFileException::BadEncoding, $e);
        }

        // Without such a handler iconv() just returns false. That must not be
        // passed on, otherwise the method would return an empty string.
        if ($converted === false) {
            throw new SourceFileException('Encoding error - conversion to UTF-8 failed', SourceFileException::BadEncoding);
        }

        $code = $converted;
    }

    // This is a workaround to filter out leftover invalid UTF-8 byte sets
    // even if the source looks like it's UTF-8 already
    mb_substitute_character('none');
    $cleanCode = mb_convert_encoding($code, 'UTF-8', 'UTF-8');

    if ($cleanCode != $code) {
        throw new SourceFileException('Encoding error - invalid UTF-8 bytes found', SourceFileException::InvalidDataBytes);
    }

    return $cleanCode;
}
/**
 * Perform initialization required for the string wrapper library.
 *
 * Defines ENABLE_MBSTRING (only when mbstring is available) and PCRE_UTF8
 * (the regex modifier used by the regexp_* routines: 'u' or '').
 *
 * @return null
 */
static function init()
{
    $clientCharset = strtolower_codesafe(Config::getVar('i18n', 'client_charset'));

    // Check if mbstring is installed (requires PHP >= 4.3.0)
    if (String::hasMBString()) {
        // mbstring routines are available
        define('ENABLE_MBSTRING', true);

        // Set up required ini settings for mbstring
        // FIXME Do any other mbstring settings need to be set?
        mb_internal_encoding($clientCharset);

        // Question mark. This must be an integer code point: PHP 8 accepts
        // only "none", "long", "entity" or an int and rejects the string '63'.
        mb_substitute_character(63);
    }

    // Define modifier to be used in regexp_* routines
    // FIXME Should non-UTF-8 encodings be supported with mbstring?
    if ($clientCharset == 'utf-8' && String::hasPCREUTF8()) {
        define('PCRE_UTF8', 'u');
    } else {
        define('PCRE_UTF8', '');
    }
}
/**
 * Normalises a string to UTF-8 that is safe to embed in XML.
 *
 * Invalid byte sequences are replaced with U+FFFD, characters outside the
 * XML character range are removed, and so are the "compatibility" and
 * "permanently undefined" characters listed in the XML recommendation.
 *
 * Side effect: sets the global mbstring substitute character to U+FFFD.
 *
 * @param string $text modified in place
 * @return string the same cleaned text; returned as well because callers
 *                passing temporaries cannot use the by-reference result
 */
function safeUTF8(&$text)
{
    // Characters allowed by <www.w3.org/TR/REC-xml/#charsets>; everything else is removed
    $notXmlChar = '/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}\\x{10000}-\\x{10FFFF}]+/u';

    // "Compatibility characters" and "permanently undefined Unicode characters",
    // see the note following <www.w3.org/TR/REC-xml/#charsets>
    $discouraged = '/[\\x{007f}-\\x{0084}\\x{0086}-\\x{009f}\\x{FDD0}-\\x{FDEF}'
        . '\\x{200E}\\x{200F}\\x{202A}-\\x{202E}'
        . '\\x{1FFFE}\\x{1FFFF}\\x{2FFFE}\\x{2FFFF}\\x{3FFFE}\\x{3FFFF}\\x{4FFFE}\\x{4FFFF}'
        . '\\x{5FFFE}\\x{5FFFF}\\x{6FFFE}\\x{6FFFF}\\x{7FFFE}\\x{7FFFF}\\x{8FFFE}\\x{8FFFF}'
        . '\\x{9FFFE}\\x{9FFFF}\\x{AFFFE}\\x{AFFFF}\\x{BFFFE}\\x{BFFFF}\\x{CFFFE}\\x{CFFFF}'
        . '\\x{DFFFE}\\x{DFFFF}\\x{EFFFE}\\x{EFFFF}\\x{FFFFE}\\x{FFFFF}\\x{10FFFE}\\x{10FFFF}]+/u';

    // Use the Unicode replacement character rather than "?" when
    // mb_convert_encoding meets bytes it cannot convert <stackoverflow.com/a/13695364>
    mb_substitute_character(0xfffd);

    // Round-trip through UTF-32 so the result is well-formed UTF-8
    // (adapted from <php.net/mb_check_encoding#89286>)
    $asUtf32 = mb_convert_encoding($text, 'UTF-32', 'UTF-8');
    $text = mb_convert_encoding($asUtf32, 'UTF-8', 'UTF-32');

    $text = preg_replace($notXmlChar, '', $text);
    $text = preg_replace($discouraged, '', $text);

    return $text;
}
/**
 * Initialises the multibyte environment and the byte translation tables.
 *
 * Sets the mbstring defaults (UTF-8 internal/regex encoding, language "uni",
 * detection order UTF-8 then ISO-8859-1, U+FFFD as substitute character) and
 * loads the translation tables from the cache, building and caching them
 * for one day when they are missing.
 *
 * Table layout:
 *  - iso_to_utf8 : ISO-8859-1 byte  => UTF-8 sequence (bytes 0x80-0xFF)
 *  - win_to_utf8 : Windows-1252 byte => UTF-8 sequence (bytes 0x80-0xFF)
 *  - from_utf8   : UTF-8 sequence   => single byte
 *
 * @return void
 */
public static function init()
{
    self::$utf8validator = (bool) extension_loaded('utf8validator');

    mb_internal_encoding('UTF-8');
    mb_language('uni');
    mb_regex_encoding('UTF-8');
    mb_detect_order(array('UTF-8', 'ISO-8859-1'));
    mb_substitute_character(0xfffd);

    MCached::connect();
    $trans = MCached::get(self::TRANSKEY);

    if ($trans === MCached::NO_RESULT) {
        // The byte sets are written as explicit escapes. The literal
        // characters used previously only produce single bytes when the
        // source file itself is stored in Windows-1252, and they break as
        // soon as the file is saved or transcoded as UTF-8.

        // Bytes that Windows-1252 defines in the 0x80-0xFF range
        $win = "\x80"
            . implode('', range("\x82", "\x8C"))
            . "\x8E"
            . implode('', range("\x91", "\x9C"))
            . implode('', range("\x9E", "\xFF"));

        // Bytes Windows-1252 leaves undefined; these are mapped like ISO-8859-1
        $win_iso = "\x81\x8D\x8F\x90\x9D";

        // The whole upper half of ISO-8859-1
        $iso = implode('', range("\x80", "\xFF"));

        $winlen = strlen($win);
        $winisolen = strlen($win_iso);
        $isolen = strlen($iso);

        $trans = array(
            'iso_to_utf8' => array(),
            'win_to_utf8' => array(),
            'from_utf8' => array(),
        );

        for ($i = 0; $i < $isolen; $i++) {
            $utf8 = mb_convert_encoding($iso[$i], 'UTF-8', 'ISO-8859-1');
            $trans['iso_to_utf8'][$iso[$i]] = $utf8;
            $trans['from_utf8'][$utf8] = $iso[$i];
        }

        for ($i = 0; $i < $winlen; $i++) {
            $utf8 = mb_convert_encoding($win[$i], 'UTF-8', 'Windows-1252');
            $trans['win_to_utf8'][$win[$i]] = $utf8;
            $trans['from_utf8'][$utf8] = $win[$i];
        }

        for ($i = 0; $i < $winisolen; $i++) {
            $utf8 = mb_convert_encoding($win_iso[$i], 'UTF-8', 'ISO-8859-1');
            $trans['win_to_utf8'][$win_iso[$i]] = $utf8;
        }

        // Cache for one day
        MCached::add(self::TRANSKEY, $trans, 86400);
    }

    self::$trans_table = $trans;
}
/**
 * Controller constructor: loads the main configuration, configures mbstring
 * and the timezone, loads helpers and the MobileMyClass library, extends the
 * include path with the bundled PEAR directory and runs init_APP().
 */
function __construct()
{
    parent::__construct();
    $this->config->load('config_main', TRUE);

    mb_language(LANG);
    ini_set('mbstring.detect_order', 'auto');
    ini_set('mbstring.http_input', 'auto');
    ini_set('mbstring.http_output', 'pass');
    ini_set('mbstring.internal_encoding', 'UTF-8');
    ini_set('mbstring.script_encoding', 'UTF-8');
    // The directive name was misspelled ("substitude"), so this call used to
    // be a no-op.
    ini_set('mbstring.substitute_character', 'none');
    mb_regex_encoding("UTF-8");

    // Effective runtime setting: unconvertible characters become U+3013.
    // A preceding mb_substitute_character("long") call was dropped because
    // this call overwrote it immediately.
    mb_substitute_character(0x3013);

    // For PHP 5.3
    ini_set('date.timezone', 'Asia/Ho_Chi_Minh');

    $this->load->helper(array('url', 'path', 'form', 'main'));

    // MobileMyClass library: rebuild $_GET from the raw query string first.
    // QUERY_STRING is absent on the command line, so check before reading it.
    if (isset($_SERVER['QUERY_STRING'])) {
        parse_str($_SERVER['QUERY_STRING'], $_GET);
    }
    $this->load->library('MobileMyClass');

    $path = APPPATH . 'pear';
    set_include_path(get_include_path() . PATH_SEPARATOR . $path);

    $this->init_APP();
}
/**
 * Splits a small HTML fragment into styled text chunks.
 *
 * Plain text is escaped and its line breaks are turned into <br /> tags;
 * markup is repaired just enough to be parsed as XML. The resulting tree is
 * walked in document order: every text node becomes one chunk, <b>/<i>/<u>
 * add "B"/"I"/"U" to the chunk style, <br> adds one newline and paragraph
 * boundaries add two newlines to the preceding chunk.
 *
 * @param string $html      HTML fragment or plain text (UTF-8)
 * @param array  $fontstyle base style copied into every chunk
 * @return array list of array("text" => string, "style" => array, "newlines" => int);
 *               an empty array when the fragment cannot be parsed as XML
 */
private function SplitHTMLChunks($html, $fontstyle)
{
    // Turn non-breaking spaces into ordinary spaces. In UTF-8 the character
    // is the two bytes C2 A0; replacing the lone byte A0 would corrupt every
    // character that merely contains that byte (for example "à" = C3 A0).
    // The byte-wise replacement is kept only for input that is not valid UTF-8.
    if (mb_check_encoding($html, "UTF-8")) {
        $html = str_replace("\xC2\xA0", " ", $html);
    } else {
        $html = str_replace(chr(160), " ", $html);
    }

    if (strip_tags($html) == $html) {
        // No markup at all: escape the text and keep its line breaks
        $html = nl2br(htmlspecialchars($html));
    } else {
        // Escape ampersands that do not start an entity; a bare "&" makes
        // appendXML() fail. The replacement must be the entity itself.
        $html = preg_replace("/&(?!([a-z\\d]+|#\\d+|#x[a-f\\d]+);)/i", "&amp;", $html);
        // Self-close <br> so the fragment is well-formed XML
        $html = preg_replace("/<br\\s*>/i", "<br/>", $html);
    }

    // Drop any remaining invalid UTF-8 bytes
    mb_substitute_character("none");
    $html = mb_convert_encoding($html, "UTF-8", "UTF-8");
    if ($html == "") {
        return array(array("text" => "", "style" => $fontstyle, "newlines" => 0));
    }

    $doc = new DOMDocument();
    $doc->loadXML("<root/>");
    $f = $doc->createDocumentFragment();
    if (!$f->appendXML($html)) {
        return array();
    }
    $doc->documentElement->appendChild($f);

    $cur = $doc->documentElement;
    $hs = array();      // stack of active style letters ("B", "I", "U")
    $inpara = null;     // null: nothing seen yet, 0: paragraph just closed, 1: text seen, 2: paragraph just opened
    $chunks = array();

    // Adds newlines to the most recent chunk. A line or paragraph break that
    // precedes all text has no chunk to attach to; an empty chunk is created
    // for it instead of writing to the non-existent index -1.
    $addNewlines = function ($count) use (&$chunks, $fontstyle) {
        if (count($chunks) == 0) {
            $chunks[] = array("text" => "", "style" => $fontstyle, "newlines" => 0);
        }
        $chunks[count($chunks) - 1]["newlines"] += $count;
    };

    while ($cur != null) {
        if ($cur->nodeType == XML_TEXT_NODE) {
            if ($inpara === 0) {
                // Text following a closed paragraph starts after a blank line
                $addNewlines(2);
            }
            $inpara = 1;

            if (count($hs) > 0) {
                $style = array_merge($fontstyle, array("style" => implode("", $hs)));
                if (isset($fontstyle["style"])) {
                    $style["style"] .= $fontstyle["style"];
                }
            } else {
                $style = $fontstyle;
            }

            $chunks[] = array("text" => $cur->nodeValue, "style" => $style, "newlines" => 0);
        } elseif ($cur->nodeType == XML_ELEMENT_NODE) {
            switch (strtolower($cur->nodeName)) {
                case "b":
                    array_push($hs, "B");
                    break;
                case "i":
                    array_push($hs, "I");
                    break;
                case "u":
                    array_push($hs, "U");
                    break;
                case "br":
                    $addNewlines(1);
                    break;
                case "p":
                    if ($inpara !== null && $inpara < 2) {
                        $addNewlines(2);
                    }
                    $inpara = 2;
                    break;
            }
        }

        // Depth-first traversal: child first, then sibling, then climb
        if ($cur->firstChild) {
            $cur = $cur->firstChild;
        } elseif ($cur->nextSibling) {
            $cur = $cur->nextSibling;
        } else {
            while ($cur != null && $cur->nextSibling == null) {
                $cur = $cur->parentNode;
                if ($cur != null) {
                    // Leaving an element: undo its effect
                    switch (strtolower($cur->nodeName)) {
                        case "b":
                        case "i":
                        case "u":
                            array_pop($hs);
                            break;
                        case "p":
                            $inpara = 0;
                    }
                }
            }
            if ($cur != null) {
                $cur = $cur->nextSibling;
            }
        }
    }

    return $chunks;
}
}

# Pin the mbstring runtime settings. Each call is made only when the function
# exists, and @ silences any warning it might raise.
if (function_exists('mb_regex_encoding')) {
    @mb_regex_encoding('UTF-8');
}
if (function_exists('mb_regex_set_options')) {
    @mb_regex_set_options('pr');
}
# default: "pr"
if (function_exists('mb_http_output')) {
    @mb_http_output('pass');
}
if (function_exists('mb_language')) {
    @mb_language('uni');
}
if (function_exists('mb_substitute_character')) {
    @mb_substitute_character(0xfffd);
}
# Unicode Replacement Character:
# U+FFFD = 0xFFFD (utf16 hex) = 65533 (dec) = "\xEF\xBF\xBD" (utf8 hex)
if (function_exists('mb_detect_order')) {
    @mb_detect_order('auto');
}

# Refuse to run with mbstring function overloading: the setting is read as a
# lower-cased, trimmed string and the script exits with status 1 when it
# compares >= '1' or equals 'on'.
$tmp = strToLower(trim(@ini_get('mbstring.func_overload')));
if ($tmp >= '1' || $tmp === 'on') {
    echo "mbstring.func_overload must not be enabled in php.ini\n";
    exit(1);
}
# other php.ini settings
# ini_set('display_errors', true); # to be changed when our error handler is installed
/**
 * Convert character encoding from ISO-2022-JP to UTF-8.
 *
 * Characters mbstring cannot convert are emitted as "JIS+XXXX" markers
 * (substitute mode 'long') and each marker is then handed to
 * _fallbackJisToUtf8(). The previous substitute character is restored right
 * after the conversion.
 *
 * @param string $text
 * @return string
 */
function _convertJisToUtf8($text)
{
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('long');
    $utf8 = mb_convert_encoding($text, 'UTF-8', 'JIS');
    mb_substitute_character($previousSubstitute);

    return preg_replace_callback(
        '/JIS\\+([0-9A-F]{4})/',
        array($this, '_fallbackJisToUtf8'),
        $utf8
    );
}
/**
 * Rewrites carrier-specific pictogram bytes in an EUC-JP string into
 * textual placeholder tokens.
 *
 * - SoftBank source ($this->from === MPC_FROM_SOFTBANK): every escape
 *   sequence ESC $ <pairs> [SI] is replaced by one "((s:hhhh))" token per
 *   two-byte pair, where hhhh is the pair in lower-case hex.
 * - Other sources: bytes that are not valid EUC-JP are marked by mbstring as
 *   "BAD+XXXX" and rewritten to "((i:xxxx))" for MPC_FROM_FOMA or
 *   "((e:xxxx))" otherwise.
 *
 * @param string $str EUC-JP input
 * @return string input with pictograms replaced by placeholder tokens
 */
function euc2ktaimod($str)
{
    // Both replacements used the /e (eval) modifier, which no longer exists
    // in PHP 7+; callbacks produce the same output.
    if ($this->from === MPC_FROM_SOFTBANK) {
        return preg_replace_callback(
            '/[\\x1B][\\x24]((?:[G|E|F|O|P|Q][\\x21-\\x7E])+)[\\x0F]?/',
            function ($match) {
                // Hex-encode the matched bytes and cut them into 4-digit
                // groups, i.e. one group per two-byte pictogram code
                $hex = strtolower(bin2hex($match[1]));
                $codes = explode(' ', rtrim(chunk_split($hex, 4, ' ')));

                return '((s:' . join('))((s:', $codes) . '))';
            },
            $str
        );
    }

    $prefix = $this->from === MPC_FROM_FOMA ? 'i' : 'e';

    // Converting EUC-JP to itself with 'long' substitution marks every
    // invalid sequence as "BAD+XXXX" while leaving valid text untouched
    $old = mb_substitute_character();
    mb_substitute_character('long');
    $str = mb_convert_encoding($str, 'EUC-JP', 'EUC-JP');
    mb_substitute_character($old);

    return preg_replace_callback(
        '/BAD\\+([0-9A-F]{4})/i',
        function ($match) use ($prefix) {
            return '((' . $prefix . ':' . strtolower($match[1]) . '))';
        },
        $str
    );
}
/**
 * Makes sure the data is using valid utf8, invalid characters are discarded.
 *
 * Strings are cleaned with iconv when it works and with mbstring otherwise;
 * arrays and objects are processed recursively (objects are cloned first).
 *
 * Note: this function is not intended for full objects with methods and private properties.
 *
 * @param mixed $value
 * @return mixed with proper utf-8 encoding
 */
function fix_utf8($value)
{
    if (is_null($value) or $value === '') {
        return $value;
    }

    if (is_string($value)) {
        if ((string) (int) $value === $value) {
            // Shortcut: a canonical integer string is always valid.
            return $value;
        }

        // Lower error reporting because glibc throws bogus notices.
        $olderror = error_reporting();
        if ($olderror & E_NOTICE) {
            error_reporting($olderror ^ E_NOTICE);
        }

        // Note: this duplicates min_fix_utf8() intentionally.
        // Probe once whether iconv really ignores invalid bytes.
        static $buggyiconv = null;
        if ($buggyiconv === null) {
            $buggyiconv = (!function_exists('iconv') or iconv('UTF-8', 'UTF-8//IGNORE', '100' . chr(130) . '\\80') !== '100\\80');
        }

        if ($buggyiconv) {
            if (function_exists('mb_convert_encoding')) {
                $subst = mb_substitute_character();
                // 'none' is the value that makes mbstring drop invalid
                // characters. The empty string used before is not a valid
                // setting: older PHP ignores it with a warning (so '?' was
                // inserted) and PHP 8 throws a ValueError.
                mb_substitute_character('none');
                $result = mb_convert_encoding($value, 'utf-8', 'utf-8');
                mb_substitute_character($subst);
            } else {
                // Warn admins on admin/index.php page.
                $result = $value;
            }
        } else {
            $result = iconv('UTF-8', 'UTF-8//IGNORE', $value);
        }

        if ($olderror & E_NOTICE) {
            error_reporting($olderror);
        }

        return $result;
    }

    if (is_array($value)) {
        foreach ($value as $k => $v) {
            $value[$k] = fix_utf8($v);
        }
        return $value;
    }

    if (is_object($value)) {
        // Do not modify the original.
        $value = clone $value;
        foreach ($value as $k => $v) {
            $value->{$k} = fix_utf8($v);
        }
        return $value;
    }

    // This is some other type, no utf-8 here.
    return $value;
}
/**
 * Convert a foreign charset encoding from or to UTF-8
 *
 * Uses mbstring when it knows both encodings, iconv when it knows both, and
 * otherwise a two-step conversion through UTF-8 that mixes the two
 * extensions. When the destination is known to neither extension the input
 * is returned unchanged.
 *
 * @param string      $string      text to convert
 * @param string|null $encoding    source encoding; detected with utf8::detect() when empty
 * @param string      $destination target encoding
 * @return string|false converted text (iconv may return false on failure)
 */
function convert($string, $encoding = NULL, $destination = 'UTF-8')
{
    if (!$encoding) {
        $encoding = utf8::detect($string);
    }
    if ($encoding == $destination) {
        return $string;
    }

    if (!empty($this->mb_sets)) {
        $encode_mb = array_key_exists($encoding, $this->mb_sets);
        $dest_mb = array_key_exists($destination, $this->mb_sets);
        if ($encode_mb && $dest_mb) {
            @mb_substitute_character('none');
            return mb_convert_encoding($string, $destination, $encoding);
        }
    } else {
        $encode_mb = $dest_mb = false;
    }

    $encode_iconv = array_key_exists($encoding, $this->iconv_sets);
    $dest_iconv = array_key_exists($destination, $this->iconv_sets);
    if ($encode_iconv && $dest_iconv) {
        return @iconv($encoding, $destination . '//IGNORE', $string);
    }

    // Must use mixed conversion: source -> UTF-8 with one extension,
    // UTF-8 -> destination with the other.
    @mb_substitute_character('none');

    if ($encode_mb) {
        $instring = mb_convert_encoding($string, 'UTF-8', $encoding);
    } elseif ($encode_iconv) {
        $instring = @iconv($encoding, 'UTF-8' . '//IGNORE', $string);
    } else {
        $instring = $string;
    }

    // The second step must start from the UTF-8 intermediate. It used to
    // re-read the original $string, which discarded the first step and
    // decoded source-encoded bytes as if they were UTF-8.
    if ($dest_mb) {
        $outstring = mb_convert_encoding($instring, $destination, 'UTF-8');
    } elseif ($dest_iconv) {
        $outstring = @iconv('UTF-8', $destination . '//IGNORE', $instring);
    } else {
        // Destination unsupported by both extensions: hand back the input as is
        $outstring = $string;
    }

    return $outstring;
}
/**
 * Prepares text for json_encode(): replaces invalid UTF-8 with U+FFFD,
 * strips HTML tags and removes every character that is not a letter,
 * number, punctuation mark, symbol, space or paragraph separator. The
 * arrow character "➔" is removed as well.
 *
 * Side effect: sets the global mbstring substitute character to U+FFFD.
 *
 * @param string $ps_text text to sanitise
 * @return string|null sanitised text (null if the regular expression fails)
 */
function caSanitizeStringForJsonEncode($ps_text)
{
    // Remove invalid UTF-8
    mb_substitute_character(0xfffd);
    $ps_text = mb_convert_encoding($ps_text, 'UTF-8', 'UTF-8');

    // @see http://php.net/manual/regexp.reference.unicode.php
    // The "u" modifier is required: without it the pattern is applied to
    // single bytes, so parts of multibyte characters can be removed and the
    // result is no longer valid UTF-8.
    return preg_replace(
        '/[^\p{Ll}\p{Lm}\p{Lo}\p{Lt}\p{Lu}\p{N}\p{P}\p{Zp}\p{Zs}\p{S}]|➔/u',
        '',
        strip_tags($ps_text)
    );
}
// Output-based checks for a handful of mbstring functions. Every result is
// printed with var_dump(), so the exact call sequence is part of the fixture.

// Case conversion: via a variable and via a literal
$str = "Mary Had A Little Lamb and She LOVED It So";
$str = mb_strtolower($str);
var_dump($str);
var_dump(mb_strtolower("ABC"));
$str = "Mary Had A Little Lamb and She LOVED It So";
$str = mb_strtoupper($str);
var_dump($str);
var_dump(mb_strtoupper("abc"));

// Display width of a string containing a non-ASCII character
var_dump(mb_strwidth("PrÜ" . "fung"));

/* Set with Unicode U+3013 (GETA MARK) */
mb_substitute_character(0x3013);
var_dump(mb_substitute_character() === 0x3013);
/* Set hex format */
mb_substitute_character("long");
/* Display current setting */
var_dump(mb_substitute_character());

// Substring counting: with literals and with a variable haystack
var_dump(mb_substr_count("This is a test", "is"));
$text = "This is a test";
var_dump(mb_substr_count($text, "is"));
// different from substr_count
// The original note here names mb_strrchr, but the call below is
// mb_substr_count: results differ between libmbfl versions
// (https://github.com/facebook/hiphop-php/issues/68), so both 2 and 1 are accepted.
var_dump(mb_substr_count("gcdgcdgcd", "gcdgcd") === 2 || mb_substr_count("gcdgcdgcd", "gcdgcd") === 1);

// Substring extraction: open-ended, bounded, over-long and negative offsets
var_dump(mb_substr("abcdef", 1));
var_dump(mb_substr("abcdef", 1, 3));
var_dump(mb_substr("abcdef", 0, 4));
var_dump(mb_substr("abcdef", 0, 8));
var_dump(mb_substr("abcdef", -1, 1));

// The same with a multibyte first character, which must count as one position
var_dump(mb_substr("Ü" . "bcdef", 1));
var_dump(mb_substr("Ü" . "bcdef", 1, 3));
var_dump(mb_substr("Ü" . "bcdef", 0, 4) === "Ü" . "bcd");
* It is recommended to not enable this unless absolutely necessary. */ spl_autoload_register(array('Kohana', 'auto_load_lowercase')); /** * Enable the Kohana auto-loader for unserialization. * * @link http://php.net/spl_autoload_call * @link http://php.net/manual/var.configuration.php#unserialize-callback-func */ ini_set('unserialize_callback_func', 'spl_autoload_call'); /** * Set the mb_substitute_character to "none" * * @link http://www.php.net/manual/function.mb-substitute-character.php */ mb_substitute_character('none'); // -- Configuration and initialization ----------------------------------------- /** * Set Kohana::$environment if a 'GLEEZ_ENV' environment variable has been supplied. * * @todo In the future Kohana::$environment should be moved to Gleez Core as Gleez::$environment * * @link https://github.com/gleez/cms/wiki/Apache * @link https://github.com/gleez/cms/wiki/Nginx */ if (isset($_SERVER['GLEEZ_ENV'])) { // Get environment variable from $_SERVER, .htaccess, apache.conf, nginx.conf, etc. $env = 'Kohana::' . strtoupper($_SERVER['GLEEZ_ENV']); } elseif (get_cfg_var('GLEEZ_ENV')) { // Get environment variable from php.ini or from ini_get('user_ini.filename') $env = 'Kohana::' . strtoupper(get_cfg_var('GLEEZ_ENV'));
/**
 * Makes sure the data is using valid utf8, invalid characters are discarded.
 *
 * Strings additionally have NUL bytes removed. Arrays and objects are
 * processed recursively (objects are cloned first).
 *
 * Note: this function is not intended for full objects with methods and private properties.
 *
 * @param mixed $value
 * @return mixed with proper utf-8 encoding
 */
function fix_utf8($value)
{
    if (is_null($value) or $value === '') {
        return $value;
    }

    if (is_string($value)) {
        if ((string) (int) $value === $value) {
            // Shortcut: a canonical integer string is always valid.
            return $value;
        }

        // No null bytes expected in our data, so let's remove it. The search
        // string must be spelled as an escape: with an empty search string
        // str_replace() does nothing at all.
        $value = str_replace("\0", '', $value);

        // Note: this duplicates min_fix_utf8() intentionally.
        // Probe once whether iconv really ignores invalid bytes.
        static $buggyiconv = null;
        if ($buggyiconv === null) {
            $buggyiconv = (!function_exists('iconv') or @iconv('UTF-8', 'UTF-8//IGNORE', '100' . chr(130) . '€') !== '100€');
        }

        if ($buggyiconv) {
            if (function_exists('mb_convert_encoding')) {
                $subst = mb_substitute_character();
                // 'none' is the value that makes mbstring drop invalid
                // characters. The empty string used before is not a valid
                // setting: older PHP ignores it with a warning (so '?' was
                // inserted) and PHP 8 throws a ValueError.
                mb_substitute_character('none');
                $result = mb_convert_encoding($value, 'utf-8', 'utf-8');
                mb_substitute_character($subst);
            } else {
                // Warn admins on admin/index.php page.
                $result = $value;
            }
        } else {
            $result = @iconv('UTF-8', 'UTF-8//IGNORE', $value);
        }

        return $result;
    }

    if (is_array($value)) {
        foreach ($value as $k => $v) {
            $value[$k] = fix_utf8($v);
        }
        return $value;
    }

    if (is_object($value)) {
        // Do not modify original.
        $value = clone $value;
        foreach ($value as $k => $v) {
            $value->{$k} = fix_utf8($v);
        }
        return $value;
    }

    // This is some other type, no utf-8 here.
    return $value;
}
/**
 * Fetches a URL and returns the page title converted to the wiki's source
 * encoding ($this->cont['SOURCE_ENCODING']).
 *
 * Falls back to the decoded URL when the page has no <title>. Characters
 * the source encoding cannot represent are emitted as HTML numeric entities.
 *
 * @param string $url page to fetch
 * @return string the title, or an error text containing the HTTP status
 *                when the request does not return 200
 */
function plugin_urlbookmark_get_title($url)
{
    $ht = new Hyp_HTTP_Request();
    $ht->init();
    $ht->ua = 'Mozilla/5.0';
    $ht->url = $url;
    $ht->get();
    if ($ht->rc !== 200) {
        return 'The page not found. (' . $ht->rc . ')';
    }
    $data = $ht->data;
    $ht = NULL;

    // Flatten the document so the title can be matched on a single line
    $buf = preg_replace('/[\\x00\\r\\n]+/', '', $data);
    if (preg_match('/<title[^>]*>(.+?)<\\/title>/i', $buf, $tmpary)) {
        $title = trim($tmpary[1]);
    } else {
        $title = rawurldecode($url);
    }

    // Escape angle brackets. This has to produce entities: the code below
    // uses a literal "<" as the marker for "character could not be
    // converted", which only works if the title contains no "<" of its own.
    // The previous replacement mapped each character onto itself.
    $title = str_replace(array('<', '>'), array('&lt;', '&gt;'), $title);

    $enc = $this->get_encoding($buf);
    if ($enc !== 'auto') {
        $this->func->encode_numericentity($title, $this->cont['SOURCE_ENCODING'], $enc);
        $title = mb_convert_encoding($title, $this->cont['SOURCE_ENCODING'], $enc);
    } else {
        if (extension_loaded('mbstring')) {
            $enc = $this->get_encoding($buf);
            if (strtoupper($this->cont['SOURCE_ENCODING']) === 'UTF-8') {
                $title = mb_convert_encoding($title, $this->cont['SOURCE_ENCODING'], $enc);
            } else {
                // Trial conversion with "<" (0x3c) as substitute character
                $_sub = mb_substitute_character();
                mb_substitute_character(0x3c);
                $_title = @mb_convert_encoding($title, $this->cont['SOURCE_ENCODING'], $enc);
                if (strpos($_title, '<') !== FALSE) {
                    // Something was not representable: go through UTF-8 and
                    // express the title as HTML entities instead
                    $title = @mb_convert_encoding($title, 'UTF-8', $enc);
                    $title = mb_convert_encoding($title, 'HTML-ENTITIES', 'UTF-8');
                } else {
                    $title = $_title;
                }
                mb_substitute_character($_sub);
            }
        }
    }

    return trim($title);
}
/**
 * Converts UTF-8 text to Windows-1252.
 *
 * Side effect: the global mbstring substitute character is set to U+00A0
 * and is not restored afterwards.
 *
 * @param string $text UTF-8 input
 * @return string the text encoded as Windows-1252
 */
public static function csv($text)
{
    // Use a non-breaking space instead of "?" for characters that cannot be converted.
    mb_substitute_character(0x00A0);

    $converted = mb_convert_encoding($text, 'Windows-1252', 'UTF-8');

    return $converted;
}
/**
 * Converts a string between encodings with mbstring, dropping characters
 * that cannot be converted.
 *
 * The substitute character found on the first call is remembered for the
 * lifetime of the process and re-applied after every conversion.
 *
 * @param string $sInputString
 * @param string $sInputFromEncoding
 * @param string $sInputToEncoding
 *
 * @return string|bool
 */
public static function MbConvertEncoding($sInputString, $sInputFromEncoding, $sInputToEncoding)
{
    // Captured once, on the first call only
    static $sRememberedSubstitute = null;
    if (null === $sRememberedSubstitute) {
        $sRememberedSubstitute = \mb_substitute_character();
    }

    $sFrom = \strtoupper($sInputFromEncoding);
    $sTo = \strtoupper($sInputToEncoding);

    \mb_substitute_character('none');
    $mConverted = @\mb_convert_encoding($sInputString, $sTo, $sFrom);
    \mb_substitute_character($sRememberedSubstitute);

    return $mConverted;
}
/**
 * Serialises a dictionary into the format selected by $this->type.
 *
 * The text part is encoded as Windows-31J; characters that encoding cannot
 * represent are replaced by self::SUBSTITUTE_CHARACTER. The result is either
 * a plain text file or, for the quiz type with attached files, a ZIP archive
 * containing the files plus the text.
 *
 * @param Dictionary $dictionary
 * @throws \BadMethodCallException When $this->textFileOnly is false and the dictionary, which should
 *      use the "file format for dictionaries containing image, audio or video files", was parsed
 *      from a CSV file alone.
 * @throws EmptyOutputException When not a single word could be converted to the target dictionary format.
 * @return string[] array with the keys 'bytes', 'type' and 'name'
 */
public function serialize(Dictionary $dictionary) : array
{
    $directoryName = (new \esperecyan\dictionary_php\validator\FilenameValidator())
        ->convertToValidFilenameWithoutExtensionInArchives($dictionary->getTitle());

    $words = [];
    foreach ($dictionary->getWords() as $word) {
        if ($this->type === 'Inteligenceω しりとり') {
            $serialized = $this->serializeWordAsShiritori($word);
        } else {
            $serialized = $this->serializeWordAsQuiz($word, $directoryName);
        }

        // Words that cannot be expressed in the target format come back empty
        if ($serialized === '') {
            continue;
        }
        $words[] = $serialized;
    }

    if (empty($words)) {
        throw new EmptyOutputException(sprintf(_('%sの辞書形式に変換可能なお題が見つかりませんでした。'), $this->type));
    }

    // Encode with the class's own substitute character, then restore the global setting
    $previousSubstituteCharacter = mb_substitute_character();
    mb_substitute_character(\IntlChar::ord(self::SUBSTITUTE_CHARACTER));
    $text = $this->serializeMetadata($dictionary, '%') . implode('', $words);
    $bytes = mb_convert_encoding($text, 'Windows-31J', 'UTF-8');
    mb_substitute_character($previousSubstituteCharacter);

    $files = $dictionary->getFiles();

    // File names are known but the files themselves are missing
    if (!$files && !$this->textFileOnly && $dictionary->getFilenames()) {
        throw new \BadMethodCallException();
    }

    if ($this->type === 'Inteligenceω クイズ' && $files && !$this->textFileOnly) {
        $archive = $this->generateArchive();
        foreach ($files as $file) {
            $archive->addFile($file, "{$directoryName}/" . $file->getFilename());
        }
        $archive->addFromString("{$directoryName}.txt", $bytes);

        // The path has to be read before close()
        $archivePath = $archive->filename;
        $archive->close();

        return [
            'bytes' => file_get_contents($archivePath),
            'type' => 'application/zip',
            'name' => $this->getFilename($dictionary, 'zip'),
        ];
    }

    return [
        'bytes' => $bytes,
        'type' => 'text/plain; charset=Shift_JIS',
        'name' => $this->getFilename($dictionary, 'txt'),
    ];
}