/**
 * Build the list of most frequently used words for every contact.
 *
 * For each key of the global $contacts array (an e-mail address) this:
 *   1. tokenises "{$up}/content/<sanitized e-mail>.txt" with a shell pipeline,
 *      drops English stopwords and words shorter than three letters, and
 *      writes the 4 * $thresholdWords most frequent words to "<name>_words.txt"
 *      (least frequent first, as produced by `sort | tail`);
 *   2. runs LangDetect on that word file and stores the best match in
 *      $contacts[$email]['language'];
 *   3. for non-English results, additionally removes the stopwords of the
 *      detected language and of the closest non-English runner-up, then keeps
 *      only the top $thresholdWords words;
 *   4. stores the words (most frequent first) in $contacts[$email]['words'];
 *   5. for English and Swedish, lemmatises the words with cstlemma and stores
 *      the result in $contacts[$email]['wordsStem'].
 *
 * Contacts without a content file are skipped. Uses the globals $contacts
 * (read/write), $thresholdWords and $up (read only). Emits a progress dot per
 * contact and changes the working directory to $up.
 *
 * @return void
 */
function topWords()
{
    global $contacts, $thresholdWords, $up;

    logMsg('USER', "Finding top {$thresholdWords} most often used words for each contact..");

    $limit = (int) $thresholdWords;
    $stopwordsDir = $up . '/stopwords/';

    foreach (array_keys($contacts) as $email) {
        // Progress indicator for the browser.
        echo ' <span>.</span>';
        flush();

        $f = sanitizeFilename($email);
        $filename = $up . '/content/' . $f . '.txt';
        $filenameWords = $up . '/content/' . $f . '_words.txt';
        $filenameWordsStem = $up . '/content/' . $f . '_words_stem.txt';

        chdir($up);

        if (!file_exists($filename)) {
            continue;
        }

        // Pass 1: lowercase, split into one word per line, count, sort by
        // count, remove English stopwords and short words, strip the counts.
        // The regex is quoted so the shell cannot brace/glob-expand it.
        $cmd = 'cat ' . escapeshellarg($filename)
            . ' | tr "A-Z" "a-z" | tr -c "[:alpha:]" " " | tr " " "\\n"'
            . ' | sort | uniq -c | sort'
            . ' | grep -v -w -f ' . escapeshellarg($stopwordsDir . 'english.txt')
            . ' | grep -E "[a-z]{3,}" | tr -d " *[:digit:]*\\t"'
            . ' | tail -n ' . ($limit * 4)
            . ' > ' . escapeshellarg($filenameWords);
        logMsg('DEBUG', "Running CMD: {$cmd}");
        shell_exec($cmd);

        // Detect the language from the word list.
        // NOTE(review): the 7000 threshold below assumes Analyze() returns
        // language => score with the best match first and lower scores being
        // better; confirm against LangDetect.
        $detector = new LangDetect($filenameWords, -1);
        $lang = $detector->Analyze();

        $detected = null; // best match, stored on the contact
        $language = null; // language whose stopwords/stemmer are applied

        if (is_array($lang) && count($lang) > 0) {
            $languages = array_keys($lang);
            $detected = $languages[0];
            $contacts[$email]['language'] = $detected;
            $language = $detected;

            // Take the best score off the list, then look for the first
            // non-English runner-up that is close enough to the best match.
            $score = array_shift($lang);
            foreach ($lang as $l => $lscore) {
                if ($lscore - $score > 7000) {
                    break;
                }
                if ($l !== 'english') {
                    $language = $l;
                    break;
                }
            }

            if ($language !== 'english') {
                logMsg('DEBUG', "Language for {$email} is " . $detected . " (but removing also {$language} stopwords)");
            } else {
                logMsg('DEBUG', "Language for {$email} is " . $detected);
            }
        } else {
            // Nothing detected: keep the pass-1 words and leave the contact's
            // language untouched instead of indexing into an empty result.
            logMsg('DEBUG', "Language for {$email} could not be detected");
        }

        if ($language !== null && $language !== 'english') {
            // Pass 2: filter the existing word list (already ordered by
            // frequency) against the extra stopword lists. English was
            // handled in pass 1; missing stopword files are skipped because
            // `grep -f` on a missing file would discard every word.
            $stopwordArgs = '';
            foreach (array_unique(array($detected, $language)) as $candidate) {
                $stopFile = $stopwordsDir . $candidate . '.txt';
                if ($candidate !== 'english' && file_exists($stopFile)) {
                    $stopwordArgs .= ' -f ' . escapeshellarg($stopFile);
                }
            }

            // Write to a temporary file first: redirecting a pipeline into
            // the file it is reading from truncates the input.
            $filenameTmp = $filenameWords . '.tmp';
            $cmd = 'cat ' . escapeshellarg($filenameWords)
                . ($stopwordArgs !== '' ? ' | grep -v -w' . $stopwordArgs : '')
                . ' | tail -n ' . $limit
                . ' > ' . escapeshellarg($filenameTmp);
            logMsg('DEBUG', "Running CMD: {$cmd}");
            shell_exec($cmd);

            if (!file_exists($filenameTmp) || !rename($filenameTmp, $filenameWords)) {
                logMsg('DEBUG', "Could not replace {$filenameWords} with the filtered word list");
            }
        }

        // Most frequent word first.
        $wordLines = file_exists($filenameWords) ? file($filenameWords) : false;
        if ($wordLines === false) {
            $wordLines = array();
        }
        $contacts[$email]['words'] = array_reverse(array_trim($wordLines));

        if ($language === 'english' || $language === 'swedish') {
            // cstlemma flex rules are named after the two-letter prefix.
            $languageShort = substr($language, 0, 2);
            // NOTE(review): -c"$B" and -B"$w\n" are cstlemma format strings;
            // they stay literal under cmd.exe but a POSIX shell would expand
            // $B and $w - confirm the target platform before porting.
            $cmd = escapeshellarg($up . '/cstlemma/bin/vc2008/cstlemma.exe')
                . ' -e1 -L -f ' . escapeshellarg($up . '/cstlemma/flexrules_' . $languageShort)
                . ' -t- -c"$B" -B"$w\\n"'
                . ' < ' . escapeshellarg($filenameWords)
                . ' > ' . escapeshellarg($filenameWordsStem);
            logMsg('DEBUG', "Running CMD: {$cmd}");
            shell_exec($cmd);

            $stemLines = file_exists($filenameWordsStem) ? file($filenameWordsStem) : false;
            if ($stemLines === false) {
                $stemLines = array();
            }
            $contacts[$email]['wordsStem'] = array_reverse(array_trim($stemLines));
            // Drop the entry that came from the first line of the stem file.
            array_pop($contacts[$email]['wordsStem']);
        }
    }

    logMsg('USER', "Done!");
}