$result = array(); $path = realpath($path); if (!is_dir($path)) { return $result; } if (!($handle = opendir($path))) { return $result; } while (($dir_entry = readdir($handle)) !== false) { if (api_in_array_nocase($dir_entry, $exceptions)) { continue; } $dir_entry_full_path = $path . '/' . $dir_entry; if (filetype($dir_entry_full_path) != 'dir') { $result[] = str_replace("\\", '/', $dir_entry_full_path); } } closedir($handle); asort($result); return $result; } $files = get_directory_content($_current_dir . 'sample_texts/'); echo 'Updating language profiles...<br />'; echo '<br />'; foreach ($files as $file) { $language = basename($file, '.txt'); echo $language . '<br />'; write_file($_current_dir . 'language_profiles/' . $language . '.txt', join("\n", _api_generate_n_grams(read_file($file), 'UTF-8', 400, 4))); } echo '<br />'; echo 'Done.<br />';
/**
 * Detects encoding of plain text.
 *
 * Valid UTF-8 input is reported as 'UTF-8' immediately. Otherwise each
 * supported non-UTF-8 encoding from api_get_valid_encodings() is tried:
 * n-grams are generated from the leading part of the text (limited by
 * LANGUAGE_DETECT_MAX_LENGTH) and compared through _api_compare_n_grams().
 * A candidate is accepted when the first entry of the comparison result
 * has a delta below the best one seen so far, names the same encoding,
 * and an api_utf8_encode()/api_utf8_decode() round trip reproduces the
 * input unchanged.
 *
 * When the language is known, a final pass treats the text as UTF-8 so
 * that "broken" UTF-8 can still be reported as UTF-8.
 *
 * @param string $string The input text.
 * @param string $language (optional) The language of the input text, provided if it is known.
 * @return string|null Returns the detected encoding, or null when no candidate matched.
 */
function api_detect_encoding($string, $language = null)
{
    // Testing against valid UTF-8 first.
    if (api_is_valid_utf8($string)) {
        return 'UTF-8';
    }

    $result = null;
    $delta_points_min = LANGUAGE_DETECT_MAX_DELTA;

    // Testing non-UTF-8 encodings.
    // Iterating by value: a by-reference loop variable would stay bound to
    // the last array element and be overwritten by the assignment below.
    $encodings = api_get_valid_encodings();
    foreach ($encodings as $encoding) {
        if (!api_is_encoding_supported($encoding) || api_is_utf8($encoding)) {
            continue;
        }

        // Intermediate variables are kept on purpose, so the helpers can be
        // called safely even if they declare by-reference parameters.
        $stringToParse = api_substr($string, 0, LANGUAGE_DETECT_MAX_LENGTH, $encoding);
        $nGrams = _api_generate_n_grams($stringToParse, $encoding);
        $result_array = _api_compare_n_grams($nGrams, $encoding);
        if (empty($result_array)) {
            continue;
        }

        // First entry of the comparison result (each() is gone since PHP 8.0).
        // Presumably keys look like "language:encoding" - TODO confirm
        // against _api_compare_n_grams().
        reset($result_array);
        $key = key($result_array);
        $delta_points = current($result_array);
        if ($delta_points >= $delta_points_min) {
            continue;
        }

        $pos = strpos($key, ':');
        $result_encoding = api_refine_encoding_id(substr($key, $pos + 1));
        if (!api_equal_encodings($encoding, $result_encoding)) {
            continue;
        }

        // Accept the candidate only if the conversion round trip is lossless.
        if ($string === api_utf8_decode(api_utf8_encode($string, $encoding), $encoding)) {
            $delta_points_min = $delta_points;
            $result = $encoding;
        }
    }

    // "Broken" UTF-8 texts are to be detected as UTF-8.
    // This functionality is enabled when language of the text is known.
    $language = api_purify_language_id((string) $language);
    if (!empty($language)) {
        $encoding = 'UTF-8';
        $stringToParse = api_substr($string, 0, LANGUAGE_DETECT_MAX_LENGTH, $encoding);
        $nGrams = _api_generate_n_grams($stringToParse, $encoding);
        // Plain assignment: binding a function result with =& raises a notice
        // unless the callee returns by reference, and gains nothing here.
        $result_array = _api_compare_n_grams($nGrams, $encoding);
        if (!empty($result_array)) {
            reset($result_array);
            $key = key($result_array);
            $delta_points = current($result_array);
            if ($delta_points < $delta_points_min) {
                $pos = strpos($key, ':');
                $result_encoding = api_refine_encoding_id(substr($key, $pos + 1));
                $result_language = substr($key, 0, $pos);
                if ($language === $result_language && api_is_utf8($result_encoding)) {
                    $delta_points_min = $delta_points;
                    $result = $encoding;
                }
            }
        }
    }

    return $result;
}