Esempio n. 1
0
/**
 * Computes the most often used words for every contact.
 *
 * For each key of the global $contacts array the contact's text file
 * ($up/content/<sanitized email>.txt) is processed as follows:
 *   1. A shell pipeline lower-cases the text, splits it into words, counts
 *      them, removes English stopwords and words shorter than three letters,
 *      and writes the last ($thresholdWords * 4) entries to <name>_words.txt.
 *   2. LangDetect ranks the languages of that word list. The first-ranked
 *      language is stored in $contacts[$email]['language'].
 *   3. If a non-English language is in play, its stopword list(s) are applied
 *      as well and the list is cut down to $thresholdWords entries.
 *   4. The word list is loaded (reversed) into $contacts[$email]['words'].
 *   5. For English and Swedish the words are additionally run through
 *      cstlemma and stored in $contacts[$email]['wordsStem'].
 *
 * Contacts without a content file are skipped. A progress dot is echoed and
 * flushed for every contact.
 *
 * Globals used: $contacts (read/write), $thresholdWords (read), $up (read,
 * base directory holding content/, stopwords/ and cstlemma/).
 *
 * @return void
 */
function topWords()
{
    global $contacts, $thresholdWords, $up;
    logMsg('USER', "Finding top {$thresholdWords} most often used words for each contact..");

    $limit = (int) $thresholdWords;
    $stopwordsDir = $up . '/stopwords/';

    // Reads a file into a list of lines; a missing or unreadable file gives an empty list
    // instead of passing false on to array_trim()/array_reverse().
    $readLines = function ($path) {
        $lines = is_readable($path) ? file($path) : false;
        return $lines === false ? array() : $lines;
    };

    foreach (array_keys($contacts) as $email) {
        echo ' <span>.</span>';
        flush();

        $f = sanitizeFilename($email);
        $filename = $up . '/content/' . $f . '.txt';
        $filenameWords = $up . '/content/' . $f . '_words.txt';
        $filenameWordsStem = $up . '/content/' . $f . '_words_stem.txt';
        chdir($up);
        if (!file_exists($filename)) {
            continue;
        }

        // Pass 1: count words, drop English stopwords and short words, strip the counts.
        // Four times the threshold is kept so that later stopword filtering has headroom.
        $cmd = 'cat ' . escapeshellarg($filename)
            . ' | tr "A-Z" "a-z" | tr -c "[:alpha:]" " " | tr " " "\\n" | sort | uniq -c | sort'
            . ' | grep -v -w -f ' . escapeshellarg($stopwordsDir . 'english.txt')
            . ' | grep -E ' . escapeshellarg('[a-z]{3,}')
            . ' | tr -d " *[:digit:]*\\t"'
            . ' | tail -n ' . ($limit * 4)
            . ' > ' . escapeshellarg($filenameWords);
        logMsg('DEBUG', "Running CMD: {$cmd}");
        shell_exec($cmd);

        // Detect language. Analyze() is expected to return language => score, best first
        // (the loop below treats a larger score as a worse match) -- TODO confirm.
        $detector = new LangDetect($filenameWords, -1);
        $lang = $detector->Analyze();
        $ranked = is_array($lang) ? $lang : array();
        $languages = array_keys($ranked);
        $detected = count($languages) > 0 ? $languages[0] : null;
        $contacts[$email]['language'] = $detected;

        // $language is the language whose stopwords/stemmer are used: the first
        // non-English runner-up scoring within 7000 of the best match, otherwise
        // the best match itself.
        $language = $detected;
        if ($detected !== null) {
            $score = $ranked[$detected];
            foreach (array_slice($ranked, 1) as $l => $lscore) {
                if ($lscore - $score > 7000) {
                    break;
                }
                if ($l !== 'english') {
                    $language = $l;
                    break;
                }
            }
        }

        if ($detected === null) {
            logMsg('DEBUG', "Language for {$email} could not be detected");
        } elseif ($language !== 'english') {
            logMsg('DEBUG', "Language for {$email} is " . $contacts[$email]['language'] . " (but removing also {$language} stopwords)");
        } else {
            logMsg('DEBUG', "Language for {$email} is " . $contacts[$email]['language']);
        }

        // NOTE(review): when only English is in play the list keeps up to
        // 4 * $thresholdWords entries, exactly as before -- confirm this is intended.
        if ($language !== 'english') {
            // Pass 2: apply the stopword lists of the detected and the fallback language
            // (English was already applied in pass 1). The list is filtered in place
            // order-wise -- re-counting an already de-duplicated list would discard the
            // frequency ranking.
            $filters = '';
            foreach (array_unique(array($detected, $language)) as $name) {
                if ($name === null || $name === 'english') {
                    continue;
                }
                $stopFile = $stopwordsDir . $name . '.txt';
                if (file_exists($stopFile)) {
                    $filters .= ' | grep -v -w -f ' . escapeshellarg($stopFile);
                } else {
                    logMsg('DEBUG', "No stopword list found for {$name}, skipping that filter");
                }
            }

            // Write to a temporary file first: redirecting a pipeline into the very file
            // it reads from truncates the input before it is consumed.
            $tmpWords = $filenameWords . '.tmp';
            $cmd = 'cat ' . escapeshellarg($filenameWords)
                . $filters
                . ' | tail -n ' . $limit
                . ' > ' . escapeshellarg($tmpWords);
            logMsg('DEBUG', "Running CMD: {$cmd}");
            shell_exec($cmd);
            if (!file_exists($tmpWords) || !rename($tmpWords, $filenameWords)) {
                logMsg('DEBUG', "Could not replace {$filenameWords} with the filtered word list");
            }
        }

        // The file ends with the highest-ranked entries, so reverse it.
        $contacts[$email]['words'] = array_reverse(array_trim($readLines($filenameWords)));

        if ($language === 'english' || $language === 'swedish') {
            // cstlemma rule files are named after the first two letters of the language.
            $languageShort = substr($language, 0, 2);
            // NOTE(review): the -c"$B" / -B"$w\n" format arguments are passed through
            // unchanged; a POSIX shell would expand $B and $w inside double quotes.
            $cmd = escapeshellarg($up . '/cstlemma/bin/vc2008/cstlemma.exe')
                . ' -e1 -L -f ' . escapeshellarg($up . '/cstlemma/flexrules_' . $languageShort)
                . ' -t- -c"$B" -B"$w\\n" < ' . escapeshellarg($filenameWords)
                . ' > ' . escapeshellarg($filenameWordsStem);
            logMsg('DEBUG', "Running CMD: {$cmd}");
            shell_exec($cmd);
            $contacts[$email]['wordsStem'] = array_reverse(array_trim($readLines($filenameWordsStem)));
            // Drop the last entry of the reversed list (kept from the original implementation).
            array_pop($contacts[$email]['wordsStem']);
        }
    }
    logMsg('USER', "Done!");
}