Apply the transformation
`public applyTransformation ( TextAnalysis\Interfaces\ITokenTransformation $transformer, $removeNulls = true )`
| Name | Type | Description |
|------|------|-------------|
| $transformer | TextAnalysis\Interfaces\ITokenTransformation | |
| return | | |
/**
 * Verifies that the collocations reported by
 * CollocationFinder::getCollocationsByPmi() for the shared fixture text
 * contain the key 'outlying cottages'.
 */
public function testGetCollocationsByPmi()
{
    // Pre-process the raw fixture text, then split on whitespace characters only.
    $spacedText = (new SpacePunctuationFilter())->transform(self::$text);
    $tokenizer = new GeneralTokenizer(" \n\t\r");
    $document = new TokensDocument($tokenizer->tokenize($spacedText));

    // Filters run in this exact order; the punctuation pass hands `false`
    // as the second argument (presumably $removeNulls -- verify against
    // TokensDocument::applyTransformation).
    $document
        ->applyTransformation(new LowerCaseFilter())
        ->applyTransformation(new PunctuationFilter([]), false)
        ->applyTransformation(new StopWordsFilter([]))
        ->applyTransformation(new QuotesFilter())
        ->applyTransformation(new CharFilter());

    $finder = new CollocationFinder($document->toArray(), 2);
    $collocations = $finder->getCollocationsByPmi();

    $this->assertArrayHasKey('outlying cottages', $collocations);
}
/**
 * Runs Rake over the test data and checks that two expected phrases
 * appear as keys in the keyword scores.
 */
public function testRake()
{
    // Stop words come from the bundled English list, one entry per line,
    // with surrounding whitespace trimmed from each entry.
    $stopwordLines = file(VENDOR_DIR . 'yooper/stop-words/data/stop-words_english_1_en.txt');
    $stopwords = array_map('trim', $stopwordLines);

    // Punctuation has to be shifted one position over before tokenizing;
    // this fixes issues with sentences.
    $spacingFilter = new SpacePunctuationFilter([':', '\\/']);
    $spacedText = $spacingFilter->transform($this->getTestData());

    // Rake requires tokens split on whitespace and new lines only.
    $tokenizer = new GeneralTokenizer(" \n\t\r");
    $document = new TokensDocument($tokenizer->tokenize($spacedText));

    // Every filter after lower-casing passes `false` as the second argument
    // (presumably $removeNulls -- verify against applyTransformation).
    $document
        ->applyTransformation(new LowerCaseFilter())
        ->applyTransformation(new StopWordsFilter($stopwords), false)
        ->applyTransformation(new PunctuationFilter(['@', ':', '\\/']), false)
        ->applyTransformation(new CharFilter(), false);

    $rake = new Rake($document, 3);
    $keywordScores = $rake->getKeywordScores();

    $this->assertArrayHasKey('minimal generating sets', $keywordScores);
    $this->assertArrayHasKey('8/8/2016 5:51 pm', $keywordScores);
}
/**
 * Returns the stop words and their frequencies.
 *
 * If the stop word list is already populated it is returned untouched.
 * Otherwise every configured file is read, tokenized, and normalized,
 * and, in frequency mode, fed to computeUsingFreqDist(). The list is then
 * sorted with arsort() (descending by value, keys preserved) and returned.
 *
 * NOTE(review): the description implies a word => frequency map, which
 * does not obviously match the string[] tag -- confirm the element type.
 *
 * @return string[]
 */
public function getStopwords()
{
    if (empty($this->stopWords)) {
        foreach ($this->getFilePaths() as $path) {
            $rawText = $this->getFileContent($path);

            $tokenizer = new GeneralTokenizer();
            $document = new TokensDocument($tokenizer->tokenize($rawText));

            // Normalize tokens: lower-case, then the possessive-noun,
            // punctuation and character filters, in that order.
            $document
                ->applyTransformation(new LowerCaseFilter())
                ->applyTransformation(new PossessiveNounFilter())
                ->applyTransformation(new PunctuationFilter())
                ->applyTransformation(new CharFilter());

            // Only the frequency mode contributes to the stop word list.
            if ($this->mode !== self::MODE_FREQ) {
                continue;
            }

            $this->computeUsingFreqDist($document->getDocumentData());
        }

        arsort($this->stopWords);
    }

    return $this->stopWords;
}