Exemplo n.º 1
0
 function test_unicode_off()
 {
     // Run the same English sample through the detector twice, once with the
     // unicode-block setting switched on and once with it switched off, and
     // require both runs to produce the same confidence result.
     $sample = 'This is a delightful sample of English text';
     $detector = new Text_LanguageDetect();

     $outcomes = array();
     foreach (array(true, false) as $useBlocks) {
         $detector->useUnicodeBlocks($useBlocks);
         $outcomes[] = $detector->detectConfidence($sample);
     }

     $this->assertEquals($outcomes[0], $outcomes[1]);
     // Caveat: equal results do not prove whether unicode narrowing was
     // actually applied in either run.
 }
Exemplo n.º 2
0
/**
 * @brief Takes a string and tries to identify the language.
 *
 * It uses the PEAR library Text_LanguageDetect (documented upstream as able to
 * identify 52 human languages) and returns the single best match, provided the
 * detector's confidence for that match is high enough.
 *
 * Embedded [app]...[/app] sections and all other bbcode tags are stripped
 * before the text is measured and examined.
 *
 * Strings need to have a min length config['system']['language_detect_min_length']
 * (falling back to LANGUAGE_DETECT_MIN_LENGTH when unset), and you can influence
 * the confidence that must be met before a result will get returned through
 * config['system']['language_detect_min_confidence'] (falling back to
 * LANGUAGE_DETECT_MIN_CONFIDENCE when unset).
 *
 * @see http://pear.php.net/package/Text_LanguageDetect
 * @param string $s A string to examine
 * @return string Language code in 2-letter ISO 639-1 (en, de, fr) format, or an
 *                empty string when the text is too short, the detector throws,
 *                no language is found, or the confidence is below the minimum
 */
function detect_language($s)
{
    require_once 'Text/LanguageDetect.php';

    $min_length = get_config('system', 'language_detect_min_length');
    if ($min_length === false) {
        $min_length = LANGUAGE_DETECT_MIN_LENGTH;
    }
    $min_length = intval($min_length);

    $min_confidence = get_config('system', 'language_detect_min_confidence');
    if ($min_confidence === false) {
        $min_confidence = LANGUAGE_DETECT_MIN_CONFIDENCE;
    }
    $min_confidence = (float) $min_confidence;

    // embedded apps have long base64 strings which will trip up the detector.
    $naked_body = preg_replace('/\\[app\\](.*?)\\[\\/app\\]/', '', $s);
    // strip off bbcode
    $naked_body = preg_replace('/\\[(.+?)\\]/', '', $naked_body);

    if (mb_strlen($naked_body) < $min_length) {
        logger('string length less than ' . $min_length, LOGGER_DATA);
        return '';
    }

    $l = new Text_LanguageDetect();
    try {
        // return 2-letter ISO 639-1 (en) language code
        $l->setNameMode(2);
        $lng = $l->detectConfidence($naked_body);
        logger('detect language: ' . print_r($lng, true) . $naked_body, LOGGER_DATA);
    } catch (Text_LanguageDetect_Exception $e) {
        logger('detect language exception: ' . $e->getMessage(), LOGGER_DATA);
        // Bail out here: $lng was never assigned, so falling through would
        // read an undefined variable in the checks below.
        return '';
    }

    if (!$lng || !x($lng, 'language')) {
        return '';
    }

    if ($lng['confidence'] < $min_confidence) {
        logger('detect language: confidence less than ' . $min_confidence, LOGGER_DATA);
        return '';
    }

    return $lng['language'];
}
Exemplo n.º 3
0
Enter text to identify language (at least a couple of sentences):<br />
<textarea name="q" wrap="virtual" cols="80" rows="8"><?php 
// Escape the submitted text before echoing it back into the page. Left raw,
// input such as "</textarea><script>..." would break out of the field and run
// in the visitor's browser (reflected XSS). $q may be unset when no text has
// been submitted yet (the script below guards it with isset), so fall back to
// an empty string instead of reading an undefined variable.
// NOTE(review): assumes $q holds the raw, not-yet-escaped submission - confirm
// against the code that populates it earlier in this file.
echo htmlspecialchars(isset($q) ? $q : '', ENT_QUOTES, 'UTF-8');
?>
</textarea>
<br />
<input type="submit" value="Submit" />
</form>
<?php
// Report on the submitted text, if any: a length warning, the best-guess
// language, and the Unicode blocks found in it. $l (the detector) and $q (the
// submitted text) are set up earlier in the file.
if (isset($q) && strlen($q) > 0) {
    // Warn when the sample is short; the threshold of 20 characters was
    // picked somewhat arbitrarily.
    $len = $l->utf8strlen($q);
    if ($len < 20) {
        echo "Warning: string not very long ({$len} chars)<br />\n";
    }

    // Best-guess language with its similarity and confidence figures.
    $result = $l->detectConfidence($q);
    if ($result != null) {
        echo "Text_LanguageDetect thinks this text is written in <b>{$result['language']}</b> ({$result['similarity']}, {$result['confidence']})<br /><br />\n";
    } else {
        echo "Text_LanguageDetect cannot identify this piece of text. <br /><br />\n";
    }

    // List the detected Unicode blocks, ordered by descending value.
    $result = $l->detectUnicodeBlocks($q, false);
    if (!empty($result)) {
        arsort($result);
        echo "Unicode blocks present: " . implode(', ', array_keys($result)) . "\n<br /><br />";
    }
}

// The detector is not needed past this point.
unset($l);
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>
</body></html>