Esempio n. 1
0
/**
 * Builds the list of most frequently used words for every contact.
 *
 * For each key of the global $contacts array the function looks for
 * "$up/content/<sanitized key>.txt". When that file exists it:
 *   1. runs a shell pipeline that lower-cases the text, splits it into words,
 *      drops the English stopwords and words shorter than three letters, and
 *      writes the last ($thresholdWords * 4) lines to "<name>_words.txt";
 *   2. runs LangDetect on that word file and stores the first reported
 *      language in $contacts[$email]['language'];
 *   3. when the working language is not English, filters the word file a
 *      second time and keeps the last $thresholdWords lines;
 *   4. stores the resulting words in $contacts[$email]['words'];
 *   5. for English and Swedish, runs cstlemma and stores its output in
 *      $contacts[$email]['wordsStem'].
 *
 * Reads the globals $thresholdWords and $up; reads and writes $contacts.
 * Emits a progress marker to the output for every contact.
 *
 * @return void
 */
function topWords()
{
    global $contacts, $thresholdWords, $up;
    logMsg('USER', "Finding top {$thresholdWords} most often used words for each contact..");

    // Cast once so that only an integer can ever reach "tail -n".
    $limit = (int) $thresholdWords;

    // Shared pipeline pieces. The grep pattern is quoted so the shell cannot
    // apply brace or glob expansion to it; double quotes are used to stay
    // consistent with the other quoted arguments of the pipeline.
    $splitWords = ' | tr "A-Z" "a-z" | tr -c "[:alpha:]" " " | tr " " "\\n" | sort | uniq -c | sort | grep -v -w -f ';
    $keepWords = ' | grep -E "[a-z]{3,}" | tr -d " *[:digit:]*\\t" | tail -n ';

    foreach (array_keys($contacts) as $email) {
        // Progress marker for the user watching the page.
        echo ' <span>.</span>';
        flush();

        $f = sanitizeFilename($email);
        $filename = $up . '/content/' . $f . '.txt';
        $filenameWords = $up . '/content/' . $f . '_words.txt';
        $filenameWordsStem = $up . '/content/' . $f . '_words_stem.txt';
        chdir($up);
        if (!file_exists($filename)) {
            continue;
        }

        // First pass: word list with English stopwords removed.
        $cmd = 'cat ' . escapeshellarg($filename)
            . $splitWords . escapeshellarg($up . '/stopwords/english.txt')
            . $keepWords . ($limit * 4)
            . ' > ' . escapeshellarg($filenameWords);
        logMsg('DEBUG', "Running CMD: {$cmd}");
        shell_exec($cmd);

        // Detect language.
        $detector = new LangDetect($filenameWords, -1);
        $lang = $detector->Analyze();
        if (!is_array($lang) || count($lang) === 0) {
            // Nothing was detected: keep the first-pass word list and leave
            // the contact's language untouched.
            logMsg('DEBUG', "No language detected for {$email}");
            $contacts[$email]['words'] = array_reverse(array_trim(file($filenameWords)));
            continue;
        }

        $languages = array_keys($lang);
        $contacts[$email]['language'] = $languages[0];
        $language = $languages[0];

        // Take the first entry's score and remove that entry, leaving only
        // the remaining candidates in $lang.
        $score = array_shift($lang);
        foreach ($lang as $l => $lscore) {
            // Stop as soon as a candidate is more than 7000 away from the
            // first score.
            if ($lscore - $score > 7000) {
                break;
            }
            // The first non-English candidate within that distance becomes
            // the working language.
            if ($l != 'english') {
                $language = $l;
                break;
            }
        }

        if ($language != 'english') {
            logMsg('DEBUG', "Language for {$email} is " . $contacts[$email]['language'] . " (but removing also {$language} stopwords)");
        } else {
            logMsg('DEBUG', "Language for {$email} is " . $contacts[$email]['language']);
        }

        if ($language != 'english') {
            // Second pass. The output goes to a temporary file first: a
            // pipeline that reads and redirects to the same file can have its
            // input truncated before it is read.
            // NOTE(review): the stopword file is chosen from the detected
            // language, while the log message above names $language - confirm
            // which one is intended.
            $tmpWords = $filenameWords . '.tmp';
            $cmd = 'cat ' . escapeshellarg($filenameWords)
                . $splitWords . escapeshellarg($up . '/stopwords/' . $contacts[$email]['language'] . '.txt')
                . $keepWords . $limit
                . ' > ' . escapeshellarg($tmpWords);
            logMsg('DEBUG', "Running CMD: {$cmd}");
            shell_exec($cmd);
            if (!file_exists($tmpWords) || !rename($tmpWords, $filenameWords)) {
                logMsg('DEBUG', "Could not replace {$filenameWords} with the filtered word list");
            }
        }

        $contacts[$email]['words'] = array_reverse(array_trim(file($filenameWords)));

        if ($language == 'english' || $language == 'swedish') {
            // The rule file name uses the first two letters of the language.
            $languageShort = substr($language, 0, 2);
            // NOTE(review): the executable path is deliberately left as in
            // the original (unquoted); only its file arguments are escaped.
            $cmd = $up . '/cstlemma/bin/vc2008/cstlemma.exe -e1 -L -f '
                . escapeshellarg($up . '/cstlemma/flexrules_' . $languageShort)
                . ' -t- -c"$B" -B"$w\\n" < ' . escapeshellarg($filenameWords)
                . ' > ' . escapeshellarg($filenameWordsStem);
            logMsg('DEBUG', "Running CMD: {$cmd}");
            shell_exec($cmd);
            $contacts[$email]['wordsStem'] = array_reverse(array_trim(file($filenameWordsStem)));
            // Drop the last element of the reversed stem list.
            array_pop($contacts[$email]['wordsStem']);
        }
    }
    logMsg('USER', "Done!");
}
Esempio n. 2
0
/**
 * Selects the locale and gettext text domain for the current request.
 *
 * Locales are read from $config['config']['locales'], where every entry
 * whose key contains "name_" defines a locale name and the sibling keys
 * "file_<suffix>" and "enable_<suffix>" give its text-domain file and its
 * enabled flag.
 *
 * Selection order:
 *   1. built-in fallback: locale "ru_RU", text domain "original";
 *   2. $config['config']['locales_options']['default'], if it names a
 *      configured locale;
 *   3. with $config['config']['locales_options']['autoDetect'] == 1, the
 *      locale returned by LangDetect::getBestMatch(), if it is enabled;
 *   4. the "lang" value from the cookie or the query string (the query
 *      string wins and is stored in the cookie), if that locale is enabled.
 *
 * The chosen locale and text domain are passed to the T_* gettext wrappers.
 *
 * @param array $config Application configuration as described above.
 * @return void
 */
function setLanguage($config)
{
    $locales = array();
    $lang_cookie = '';

    // Read the locale from the cookie. Non-string values (e.g. "lang[]=x")
    // are ignored because they cannot name a locale.
    if (isset($_COOKIE['lang']) && is_string($_COOKIE['lang'])) {
        $lang_cookie = $_COOKIE['lang'];
    }

    // A language chosen through the query string overrides the cookie and is
    // stored in it for 1000 days.
    if (isset($_GET['lang']) && is_string($_GET['lang'])) {
        $lang_cookie = $_GET['lang'];
        setcookie("lang", $lang_cookie, time() + 1000 * 24 * 60 * 60);
    }

    // Build the locale table: locale name => array('file' => ..., 'enable' => ...).
    if (isset($config['config']['locales']) && is_array($config['config']['locales'])) {
        $localeConfig = $config['config']['locales'];
        foreach ($localeConfig as $k => $v) {
            if (stristr($k, 'name_') !== false) {
                $key_name = str_replace('name_', '', $k);
                $val_en = 'enable_' . $key_name;
                $val_file = 'file_' . $key_name;
                // Missing sibling keys become null instead of raising
                // undefined-index notices.
                $locales[$v] = array(
                    'file' => isset($localeConfig[$val_file]) ? $localeConfig[$val_file] : null,
                    'enable' => isset($localeConfig[$val_en]) ? $localeConfig[$val_en] : null,
                );
            }
        }
    }

    // Base language ru_RU, used when the configured default is not available.
    $defaultNameLocale = 'ru_RU';
    $defaultFileLocale = 'original';

    // Configured default locale.
    if (isset($config['config']['locales_options']['default']) && $config['config']['locales_options']['default'] != '') {
        $temp_default = $config['config']['locales_options']['default'];
        // NOTE(review): only the presence of the locale is checked here, not
        // its 'enable' flag - confirm whether a disabled default should be
        // accepted.
        if (isset($locales[$temp_default])) {
            $defaultNameLocale = $temp_default;
            $defaultFileLocale = $locales[$temp_default]['file'];
        }
    }

    $nameLocale = $defaultNameLocale;
    $fileLocale = $defaultFileLocale;

    // Auto-detection, when switched on.
    if (isset($config['config']['locales_options']['autoDetect']) && $config['config']['locales_options']['autoDetect'] == 1) {
        $langDetect = new LangDetect();
        $langs = array('ru_RU' => array('ru'), 'uk_UA' => array('uk'), 'en_GB' => array('en'));
        $detectNameLocale = $langDetect->getBestMatch($defaultNameLocale, $langs);
        // Use the detected locale only if it is configured and enabled, and
        // take the text-domain file from that same locale.
        if (isset($locales[$detectNameLocale]) && $locales[$detectNameLocale]['enable'] == 1) {
            $nameLocale = $detectNameLocale;
            $fileLocale = $locales[$detectNameLocale]['file'];
        }
    }

    // An enabled language from the cookie / query string takes precedence;
    // its text-domain file is switched together with the locale name.
    if (isset($locales[$lang_cookie]) && $locales[$lang_cookie]['enable'] == 1) {
        $nameLocale = $lang_cookie;
        $fileLocale = $locales[$lang_cookie]['file'];
    }

    T_setlocale(LC_MESSAGES, $nameLocale);
    T_bindtextdomain($fileLocale, './data/locale');
    T_bind_textdomain_codeset($fileLocale, 'UTF-8');
    T_textdomain($fileLocale);
}