/**
 * Executes this filter.
 *
 * Builds keyword candidates from the article's title and text and stores the
 * phrases whose score exceeds self::WORD_SCORE_THRESHOLD in $article->keywords.
 * Returns without touching the article when $article->language is not set,
 * because the stop-word list is selected by language.
 *
 * @param article $article reference directly to the article object to filter;
 *                         reads ->language, ->title and ->text, and
 *                         overwrites ->keywords
 *
 * @return void
 */
public static function filter(&$article)
{
    // Can't perform this filter unless language is known
    if (!isset($article->language)) {
        return;
    }

    // Stop words are language dependent, so this filter relies on a previously detected language
    $stopWordResource = new \WebArticleExtractor\ResourceProvider("stop_words/" . $article->language . ".lst");

    $article->keywords = array();

    // Prepend the article title; it is likely that keywords will be in here.
    $content = $article->title . ". " . $article->text;

    // Turn Windows line breaks into sentence terminators.
    $content = str_replace("\r\n", '.', $content);

    // Replace non-breaking spaces with plain spaces. The single-quoted literal
    // only matches the six-character escape text "\u00a0"; the second entry
    // matches an actual U+00A0 character (assumes the text is UTF-8 encoded).
    $content = str_replace(array('\\u00a0', "\xC2\xA0"), " ", $content);

    $sentences = KeywordFilter::getSentences($content);
    $phrases = KeywordFilter::getPhrases($sentences, $stopWordResource);
    $wordScores = KeywordFilter::getWordScores($phrases);
    $candidatePhrases = KeywordFilter::getPhraseScores($phrases, $wordScores);

    // Clean candidates: keep only the keys whose value is above the threshold
    foreach ($candidatePhrases as $key => $value) {
        if ($value > self::WORD_SCORE_THRESHOLD) {
            $article->keywords[] = trim($key);
        }
    }

    //TODO: Weight by candidate phrase's word occurrences in other top candidate phrases
}