/**
 * Detects the encoding of XML-formatted text.
 *
 * Resolution order:
 *  1. the value captured by the _PCRE_XML_ENCODING pattern (presumably the
 *     encoding attribute of the XML declaration - confirm against the
 *     constant's definition);
 *  2. 'UTF-8' when the text is valid UTF-8;
 *  3. the supplied default, or the internal encoding when no default is given.
 *
 * @param string $string The input XML-formatted text.
 * @param string $default_encoding The encoding to return when nothing can be
 * detected. If it is not specified, the value of _api_mb_internal_encoding()
 * is used instead.
 * @return string Returns the detected encoding.
 * @todo The second parameter is to be eliminated. See api_detect_encoding_html().
 */
function api_detect_encoding_xml($string, $default_encoding = null)
{
    $declaration = array();
    if (preg_match(_PCRE_XML_ENCODING, $string, $declaration) === 1) {
        // An explicit declaration in the text wins over any content sniffing.
        return api_refine_encoding_id($declaration[1]);
    }

    if (api_is_valid_utf8($string)) {
        return 'UTF-8';
    }

    $fallback = empty($default_encoding)
        ? _api_mb_internal_encoding()
        : $default_encoding;

    return api_refine_encoding_id($fallback);
}
/**
 * Detects the encoding of a given manifest (an XML text).
 *
 * The encoding of a manifest may be wrongly declared or not declared at all,
 * so the declaration is only trusted when it names a non-UTF-8 encoding.
 * Otherwise the human-readable text found inside <langstring> and <title>
 * elements is sampled and passed to api_detect_encoding().
 *
 * @param string $xml The input XML text (taken by reference; it is not modified here).
 * @return string The detected encoding of the input XML.
 */
private function detect_manifest_encoding(&$xml)
{
    if (api_is_valid_utf8($xml)) {
        return 'UTF-8';
    }

    $declared_encoding = '';
    if (preg_match(_PCRE_XML_ENCODING, $xml, $matches)) {
        $declared_encoding = api_refine_encoding_id($matches[1]);
    }

    // The text is already known not to be valid UTF-8, so a UTF-8 declaration
    // is wrong and gets ignored; any other declared encoding is accepted.
    if (!empty($declared_encoding) && !api_is_utf8($declared_encoding)) {
        return $declared_encoding;
    }

    // Collect the element contents only. The quantifier is lazy so that several
    // elements on the same physical line (e.g. a manifest without line breaks)
    // yield one capture per element instead of one capture that runs from the
    // first opening tag to the last closing tag and drags markup along.
    $test_string = '';
    if (preg_match_all('/<langstring[^>]*>(.*?)<\\/langstring>/', $xml, $matches)) {
        $test_string = implode("\n", $matches[1]);
        unset($matches);
    }
    if (preg_match_all('/<title[^>]*>(.*?)<\\/title>/', $xml, $matches)) {
        $test_string .= "\n" . implode("\n", $matches[1]);
        unset($matches);
    }

    // No usable elements found: fall back to analysing the whole document.
    if (empty($test_string)) {
        $test_string = $xml;
    }

    return api_detect_encoding($test_string);
}
/**
 * Reads the 'videoconf.inc.php' language file of the given language and
 * returns its variables as a sequence of XML <labelfield> elements.
 *
 * Every line of the form `$Name = "value";` becomes
 * `<labelfield><labelid>Name</labelid><labelvalue>value</labelvalue></labelfield>`.
 * Values that are not valid UTF-8 are converted from the language's
 * non-UTF-8 encoding and have their HTML entities decoded.
 *
 * When the language directory or file is missing/unreadable, or yields no
 * labels, the English file is used instead; for English itself an empty
 * string is returned.
 *
 * NOTE(review): $language is concatenated into a filesystem path and label
 * values are emitted without XML escaping - confirm that callers only pass
 * trusted language identifiers and that consumers expect raw values.
 *
 * @param string $language The language folder name.
 * @return string The generated XML fragment, possibly empty.
 */
function get_language_file_as_xml($language = 'english')
{
    $path = api_get_path(SYS_LANG_PATH) . $language . '/';
    $file = $path . 'videoconf.inc.php';

    $is_usable = is_dir($path) && is_readable($path)
        && is_file($file) && is_readable($file);
    if (!$is_usable) {
        return $language != 'english' ? get_language_file_as_xml('english') : '';
    }

    // file() returns false on failure; treat that as "no lines" so the
    // fallback below applies instead of iterating over a non-array.
    $list = file($file);
    if ($list === false) {
        $list = array();
    }

    $non_utf8_encoding = api_get_non_utf8_encoding($language);
    $xml = '';
    foreach ($list as $line) {
        // Strip the line terminator so files saved with CRLF endings match
        // the end-of-line anchor of the pattern as well.
        $line = rtrim($line, "\r\n");
        if (substr($line, 0, 1) != '$') {
            continue;
        }
        if (!preg_match('/^\\$([^\\s]*)\\s*=\\s*"(.*)";$/', $line, $items)) {
            continue;
        }

        $string = $items[2];
        if (!api_is_valid_utf8($string)) {
            $string = api_html_entity_decode(
                api_utf8_encode($string, $non_utf8_encoding),
                ENT_QUOTES,
                'UTF-8'
            );
        }
        $xml .= '<labelfield><labelid>' . $items[1] . '</labelid><labelvalue>'
            . stripslashes($string) . '</labelvalue></labelfield>' . "\n";
    }

    if (empty($xml) && $language != 'english') {
        return get_language_file_as_xml('english');
    }

    return $xml;
}
/**
 * Detects the encoding of plain text.
 *
 * Valid UTF-8 input is reported as 'UTF-8'; anything else is handed to
 * mb_detect_encoding() and its result is returned unchanged.
 *
 * @param string $string The input text.
 * @param string $language (optional) The language of the input text, provided
 * if it is known. It is accepted for interface compatibility and not used by
 * this implementation.
 * @return string Returns the detected encoding.
 */
function api_detect_encoding($string, $language = null)
{
    // The UTF-8 validity test takes precedence over mbstring's own detection.
    return api_is_valid_utf8($string)
        ? 'UTF-8'
        : mb_detect_encoding($string);
}
/**
 * Detects the encoding of plain text.
 *
 * Valid UTF-8 input is reported as 'UTF-8' immediately. Otherwise every
 * supported non-UTF-8 encoding is scored through n-gram comparison, and the
 * encoding with the lowest delta below LANGUAGE_DETECT_MAX_DELTA is chosen,
 * provided the text survives an encode/decode round trip in that encoding.
 * When the language is known, a final pass lets "broken" UTF-8 text still be
 * detected as UTF-8 if its best n-gram match is that language in UTF-8.
 *
 * @param string $string The input text.
 * @param string $language (optional) The language of the input text, provided if it is known.
 * @return string|null Returns the detected encoding, or null when no candidate qualified.
 */
function api_detect_encoding($string, $language = null)
{
    // Testing against valid UTF-8 first.
    if (api_is_valid_utf8($string)) {
        return 'UTF-8';
    }

    $result = null;
    $delta_points_min = LANGUAGE_DETECT_MAX_DELTA;

    // Testing non-UTF-8 encodings.
    // Iterating by value: nothing is written back to the list, and a by-reference
    // loop variable would stay bound to the last element after the loop.
    foreach (api_get_valid_encodings() as $encoding) {
        if (!api_is_encoding_supported($encoding) || api_is_utf8($encoding)) {
            continue;
        }

        $sample = api_substr($string, 0, LANGUAGE_DETECT_MAX_LENGTH, $encoding);
        $n_grams = _api_generate_n_grams($sample, $encoding);
        $result_array = _api_compare_n_grams($n_grams, $encoding);
        if (empty($result_array)) {
            continue;
        }

        // Take the first entry of the comparison result. reset()/key()/current()
        // replace each(), which no longer exists as of PHP 8.0.
        reset($result_array);
        $key = key($result_array);
        $delta_points = current($result_array);

        if ($delta_points < $delta_points_min) {
            // Keys are parsed as "<language>:<encoding>".
            $pos = strpos($key, ':');
            $result_encoding = api_refine_encoding_id(substr($key, $pos + 1));
            if (api_equal_encodings($encoding, $result_encoding)) {
                // Accept the candidate only if the text round-trips unchanged.
                if ($string === api_utf8_decode(api_utf8_encode($string, $encoding), $encoding)) {
                    $delta_points_min = $delta_points;
                    $result = $encoding;
                }
            }
        }
    }

    // "Broken" UTF-8 texts are to be detected as UTF-8.
    // This functionality is enabled when language of the text is known.
    $language = api_purify_language_id((string) $language);
    if (!empty($language)) {
        $encoding = 'UTF-8';
        // Intermediate variables mirror the loop above and avoid handing a
        // temporary to a parameter that may be declared by reference.
        $sample = api_substr($string, 0, LANGUAGE_DETECT_MAX_LENGTH, $encoding);
        $n_grams = _api_generate_n_grams($sample, $encoding);
        $result_array = _api_compare_n_grams($n_grams, $encoding);

        if (!empty($result_array)) {
            reset($result_array);
            $key = key($result_array);
            $delta_points = current($result_array);

            if ($delta_points < $delta_points_min) {
                $pos = strpos($key, ':');
                $result_encoding = api_refine_encoding_id(substr($key, $pos + 1));
                $result_language = substr($key, 0, $pos);
                if ($language == $result_language && api_is_utf8($result_encoding)) {
                    $delta_points_min = $delta_points;
                    $result = $encoding;
                }
            }
        }
    }

    return $result;
}