/**
 * Compares the n-grams of an input text with the stored language profiles and
 * ranks the matching profiles by their distance (in "delta-points") to the input.
 *
 * The value $max_delta = 80000 is good enough for speed and detection accuracy.
 * If you set the value of $max_delta too low, no language will be recognized.
 * $max_delta = 400 * 350 = 140000 is the best detection with lowest speed.
 *
 * @param array $n_grams    The n-grams of the input text; the array keys are used as ranks.
 * @param string $encoding  Only the profiles stored in an encoding equal to this one are compared.
 * @param int $max_delta    The distance above which a profile is abandoned as "too different".
 * @return array            Distances keyed by "language:encoding", sorted ascending (best match first).
 *                          An empty array is returned when the input is empty or nothing matched.
 */
function &_api_compare_n_grams(&$n_grams, $encoding, $max_delta = LANGUAGE_DETECT_MAX_DELTA)
{
    static $language_profiles;

    // The function returns by reference, so every exit path has to return a
    // real variable; returning a literal array() would raise a notice.
    $result = array();

    if (!isset($language_profiles)) {
        // Initialized up front, so that ksort() and the comparison loop below
        // always receive an array, even when no profile could be loaded.
        $language_profiles = array();

        // Reading the language profile files from the internationalization database.
        $exceptions = array('.', '..', 'CVS', '.htaccess', '.svn', '_svn', 'index.html');
        $path = str_replace("\\", '/', dirname(__FILE__) . '/internationalization_database/language_detection/language_profiles/');
        $non_utf8_encodings =& _api_non_utf8_encodings();

        if (is_dir($path) && ($handle = @opendir($path))) {
            while (($dir_entry = @readdir($handle)) !== false) {
                if (api_in_array_nocase($dir_entry, $exceptions)) {
                    continue;
                }
                if (strpos($dir_entry, '.txt') === false) {
                    continue;
                }
                $dir_entry_full_path = $path . '/' . $dir_entry;
                if (@filetype($dir_entry_full_path) == 'dir') {
                    continue;
                }
                $data = @file_get_contents($dir_entry_full_path);
                if ($data === false) {
                    continue;
                }

                // The file name (without ".txt") is the language identifier.
                $language = basename($dir_entry_full_path, '.txt');

                // Every profile is registered once for UTF-8 and once for each
                // non-UTF-8 encoding that is listed for the language.
                $encodings = array('UTF-8');
                if (!empty($non_utf8_encodings[$language])) {
                    $encodings = array_merge($encodings, $non_utf8_encodings[$language]);
                }
                foreach ($encodings as $enc) {
                    $data_enc = api_utf8_decode($data, $enc);
                    if (empty($data_enc)) {
                        continue;
                    }
                    $key = $language . ':' . $enc;
                    // array_flip() turns the list of lines into a lookup table: n-gram => line index (rank).
                    $language_profiles[$key]['data'] = array_flip(explode("\n", $data_enc));
                    $language_profiles[$key]['language'] = $language;
                    $language_profiles[$key]['encoding'] = $enc;
                }
            }
            // The handle is closed only here, where it is known to be valid.
            closedir($handle);
        }

        ksort($language_profiles);
    }

    if (!is_array($n_grams) || empty($n_grams)) {
        return $result;
    }

    // Comparison between the input n-grams and the language profiles.
    // Both loops iterate by value: nothing is modified, and no dangling references are left behind.
    foreach ($language_profiles as $key => $language_profile) {
        if (!api_is_language_supported($language_profile['language']) || !api_equal_encodings($encoding, $language_profile['encoding'])) {
            continue;
        }

        // This is a summary measurement for matching between the input text and the current language profile.
        $delta = 0;

        // Searching each n-gram from the input text into the language profile.
        foreach ($n_grams as $rank => $n_gram) {
            if (isset($language_profile['data'][$n_gram])) {
                // The n-gram has been found, the difference between places in both
                // arrays is calculated (so called delta-points are adopted for
                // measuring distances between n-gram ranks).
                $delta += abs($rank - $language_profile['data'][$n_gram]);
            } else {
                // The n-gram has not been found in the profile. We add then
                // a large enough "distance" in delta-points.
                $delta += 400;
            }
            // Abort: This language already differs too much.
            if ($delta > $max_delta) {
                break;
            }
        }

        // Include only non-aborted languages in result array.
        if ($delta < $max_delta - 400) {
            $result[$key] = $delta;
        }
    }

    asort($result);
    return $result;
}
/**
 * Returns the most-probably used non-UTF-8 encoding for the given language.
 * The first (leading) value in the list of encodings for the language is the one that is returned.
 * @param string $language (optional)   The specified language. When it is empty or not supported,
 *                                      the user interface language is used instead.
 * @return string|null                  The encoding that corresponds to the specified language,
 *                                      or null when no non-UTF-8 encoding is registered for it.
 * Note: See the file chamilo/main/inc/lib/internationalization_database/non_utf8_encodings.php
 * if you wish to revise the leading non-UTF-8 encoding for your language.
 */
function api_get_non_utf8_encoding($language = null)
{
    $language_is_supported = api_is_language_supported($language);
    if (!$language_is_supported || empty($language)) {
        $language = api_get_interface_language(false, true);
    }
    $language = api_purify_language_id($language);

    $encodings =& _api_non_utf8_encodings();

    // isset() comes first: a language without an entry must not trigger
    // an "undefined index" notice, it simply has no non-UTF-8 encoding.
    if (isset($encodings[$language]) && is_array($encodings[$language]) && !empty($encodings[$language][0])) {
        return $encodings[$language][0];
    }

    return null;
}