/**
 * Checks that language detection on a plain English sentence yields equal
 * results with the unicode-blocks setting switched on and switched off.
 *
 * Note: this does not prove that unicode narrowing was actually applied in
 * either run; it only compares the two outcomes.
 */
function test_unicode_off()
{
    $detector = new Text_LanguageDetect();
    $sample = 'This is a delightful sample of English text';

    // First pass: unicode blocks enabled.
    $detector->useUnicodeBlocks(true);
    $withBlocks = $detector->detectConfidence($sample);

    // Second pass: same detector, unicode blocks disabled.
    $detector->useUnicodeBlocks(false);
    $withoutBlocks = $detector->detectConfidence($sample);

    $this->assertEquals($withBlocks, $withoutBlocks);
}
/**
 * @brief Takes a string and tries to identify the language.
 *
 * It uses the pear library Text_LanguageDetect and it can identify 52 human languages.
 * Only the language code of the detection result is returned; the confidence
 * score is used internally to decide whether the result is trustworthy.
 *
 * Strings need to have a min length config['system']['language_detect_min_length']
 * and you can influence the confidence that must be met before a result will get
 * returned through config['system']['language_detect_min_confidence'].
 * When either setting is not configured (get_config() returns false), the
 * LANGUAGE_DETECT_MIN_LENGTH / LANGUAGE_DETECT_MIN_CONFIDENCE constants are used.
 *
 * @see http://pear.php.net/package/Text_LanguageDetect
 * @param string $s A string to examine
 * @return string Language code in 2-letter ISO 639-1 (en, de, fr) format, or an
 *                empty string when the text is too short, detection fails or
 *                throws, or the confidence is below the configured minimum.
 */
function detect_language($s) {
    require_once 'Text/LanguageDetect.php';

    $min_length = get_config('system', 'language_detect_min_length');
    if ($min_length === false) {
        $min_length = LANGUAGE_DETECT_MIN_LENGTH;
    }

    $min_confidence = get_config('system', 'language_detect_min_confidence');
    if ($min_confidence === false) {
        $min_confidence = LANGUAGE_DETECT_MIN_CONFIDENCE;
    }

    // embedded apps have long base64 strings which will trip up the detector.
    $naked_body = preg_replace('/\\[app\\](.*?)\\[\\/app\\]/', '', $s);

    // strip off bbcode
    $naked_body = preg_replace('/\\[(.+?)\\]/', '', $naked_body);

    if (mb_strlen($naked_body) < intval($min_length)) {
        logger('string length less than ' . intval($min_length), LOGGER_DATA);
        return '';
    }

    $l = new Text_LanguageDetect();

    // Initialise the result up front: if the detector throws, the checks below
    // must see a defined (empty) value instead of an undefined variable.
    $lng = null;

    try {
        // return 2-letter ISO 639-1 (en) language code
        $l->setNameMode(2);
        $lng = $l->detectConfidence($naked_body);
        logger('detect language: ' . print_r($lng, true) . $naked_body, LOGGER_DATA);
    } catch (Text_LanguageDetect_Exception $e) {
        logger('detect language exception: ' . $e->getMessage(), LOGGER_DATA);
    }

    // No usable result (nothing detected, or an exception was caught above).
    // NOTE(review): x() is a project helper defined elsewhere; presumably it
    // checks that the 'language' key is present and non-empty - confirm.
    if (!$lng || !x($lng, 'language')) {
        return '';
    }

    if ($lng['confidence'] < (double) $min_confidence) {
        logger('detect language: confidence less than ' . (double) $min_confidence, LOGGER_DATA);
        return '';
    }

    return $lng['language'];
}
Enter text to identify language (at least a couple of sentences):<br />
<textarea name="q" wrap="virtual" cols="80" rows="8"><?php
// Re-display the submitted text. It is HTML-escaped so that markup in the
// input (e.g. "</textarea><script>...") is shown as text instead of being
// interpreted by the browser, and guarded with isset() because $q may not be
// defined when the page is first loaded.
// NOTE(review): assumes $q holds the raw, unescaped request value - confirm
// against the part of the file that assigns it.
echo isset($q) ? htmlspecialchars($q, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') : '';
?></textarea>
<br />
<input type="submit" value="Submit" />
</form>
<?php
// $l is the detector object created earlier in this file (outside this view).
if (isset($q) && strlen($q)) {
    $len = $l->utf8strlen($q);
    if ($len < 20) { // this value picked somewhat arbitrarily
        echo "Warning: string not very long ({$len} chars)<br />\n";
    }

    // Best-guess language together with its similarity and confidence values.
    $result = $l->detectConfidence($q);

    if ($result == null) {
        echo "Text_LanguageDetect cannot identify this piece of text. <br /><br />\n";
    } else {
        echo "Text_LanguageDetect thinks this text is written in <b>{$result['language']}</b> ({$result['similarity']}, {$result['confidence']})<br /><br />\n";
    }

    // List the unicode blocks found in the input, highest value first.
    $result = $l->detectUnicodeBlocks($q, false);
    if (!empty($result)) {
        arsort($result);
        echo "Unicode blocks present: ", join(', ', array_keys($result)), "\n<br /><br />";
    }
}

unset($l);

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>
</body></html>