/**
 * Verifies that the PMI collocation result contains the key 'outlying cottages'.
 *
 * The fixture text is passed through SpacePunctuationFilter, tokenized on
 * spaces, newlines, tabs and carriage returns, run through a chain of token
 * filters, and then handed to a CollocationFinder constructed with 2 as its
 * second argument (presumably the n-gram size -- confirm against
 * CollocationFinder).
 */
public function testGetCollocationsByPmi()
{
    $spacedText = (new SpacePunctuationFilter())->transform(self::$text);
    $tokenizer = new GeneralTokenizer(" \n\t\r");
    $document = new TokensDocument($tokenizer->tokenize($spacedText));

    // Filters run in exactly this order; only the punctuation filter is
    // given an explicit `false` second argument.
    $document
        ->applyTransformation(new LowerCaseFilter())
        ->applyTransformation(new PunctuationFilter([]), false)
        ->applyTransformation(new StopWordsFilter([]))
        ->applyTransformation(new QuotesFilter())
        ->applyTransformation(new CharFilter());

    $collocationFinder = new CollocationFinder($document->toArray(), 2);

    $this->assertArrayHasKey(
        'outlying cottages',
        $collocationFinder->getCollocationsByPmi()
    );
}
/**
 * Adds every term of a document to the index.
 *
 * For each term in the document data the term's frequency counter is
 * incremented and the document id is appended to the term's postings list.
 * A term that occurs several times in the same document therefore has the
 * document id appended once per occurrence.
 *
 * @param TokensDocument $document document whose terms are indexed
 * @return void
 */
public function addDocument(TokensDocument $document)
{
    foreach ($document->getDocumentData() as $term) {
        $documentId = $document->getId();

        // First sighting of the term: create its entry and move on.
        if (!isset($this->index[$term])) {
            $this->index[$term] = [
                self::FREQ => 1,
                self::POSTINGS => [$documentId],
            ];
            continue;
        }

        // Known term: bump the counter and record another posting.
        $this->index[$term][self::FREQ]++;
        $this->index[$term][self::POSTINGS][] = $documentId;
    }
}
/**
 * Runs RAKE keyword scoring over the test data and checks that two expected
 * phrases are present as keys of the returned score array.
 *
 * The English stop-word list is read from the yooper/stop-words package under
 * VENDOR_DIR, one entry per line, with surrounding whitespace trimmed.
 */
public function testRake()
{
    $stopwords = array_map(
        'trim',
        file(VENDOR_DIR . 'yooper/stop-words/data/stop-words_english_1_en.txt')
    );

    // All punctuation must be moved 1 over. Fixes issues with sentences.
    $testData = (new SpacePunctuationFilter([':', '\\/']))->transform($this->getTestData());

    // Rake MUST be split on whitespace and new lines only.
    $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
    $tokenDoc = new TokensDocument($tokens);

    // Every filter after lower-casing is applied with an explicit `false`
    // second argument.
    $tokenDoc
        ->applyTransformation(new LowerCaseFilter())
        ->applyTransformation(new StopWordsFilter($stopwords), false)
        ->applyTransformation(new PunctuationFilter(['@', ':', '\\/']), false)
        ->applyTransformation(new CharFilter(), false);

    $rake = new Rake($tokenDoc, 3);
    $results = $rake->getKeywordScores();

    $this->assertArrayHasKey('minimal generating sets', $results);
    $this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
}
/**
 * Tags the fixture text with the Stanford POS tagger and checks the output.
 *
 * The test is skipped when the SKIP_TEST environment variable is set or when
 * JAVA_HOME is not set. It asserts that the tagger's temporary file exists and
 * is 138 bytes long, and that the entry at index 15 of the output is
 * ['Michigan', 'NNP'].
 */
public function testStanfordPos()
{
    if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
        // Report the test as skipped instead of letting it pass silently
        // without having executed a single assertion.
        $this->markTestSkipped('SKIP_TEST is set or JAVA_HOME is not set; Stanford POS tagger test not run.');
    }

    $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));

    $jarPath = get_storage_path('corpora/stanford_pos_tagger') . 'stanford-postagger-3.6.0.jar';
    $modelPath = get_storage_path('corpora/stanford_pos_tagger' . DIRECTORY_SEPARATOR . "models")
        . "english-left3words-distsim.tagger";

    $tagger = new StanfordPosTagger($jarPath, $modelPath);
    $output = $tagger->tag($document->getDocumentData());

    $this->assertFileExists($tagger->getTmpFilePath());
    $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
    $this->assertEquals(['Michigan', 'NNP'], $output[15], "Did you set JAVA_HOME env variable?");
}
/**
 * Tags the fixture text with the Stanford NER tagger and checks the output.
 *
 * The test is skipped when the SKIP_TEST environment variable is set or when
 * JAVA_HOME is not set. It asserts that the tagger's temporary file exists and
 * is 138 bytes long, and that the entry at index 15 of the output is
 * ['Michigan', 'LOCATION'].
 */
public function testStanfordNer()
{
    if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
        // Report the test as skipped instead of letting it pass silently
        // without having executed a single assertion.
        $this->markTestSkipped('SKIP_TEST is set or JAVA_HOME is not set; Stanford NER tagger test not run.');
    }

    $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));

    $jarPath = get_storage_path('ner') . 'stanford-ner.jar';
    $classifierPath = get_storage_path('ner' . DIRECTORY_SEPARATOR . "classifiers")
        . "english.all.3class.distsim.crf.ser.gz";

    $tagger = new StanfordNerTagger($jarPath, $classifierPath);
    $output = $tagger->tag($document->getDocumentData());

    $this->assertFileExists($tagger->getTmpFilePath());
    $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
    $this->assertEquals(['Michigan', 'LOCATION'], $output[15], "Did you set JAVA_HOME env variable?");
}
/**
 * Returns the stop words, computing them on first use.
 *
 * If $this->stopWords is already non-empty it is returned as is. Otherwise
 * every file from getFilePaths() is read, tokenized, lower-cased and stripped
 * of possessives, punctuation and filtered characters; in MODE_FREQ the
 * resulting tokens are passed to computeUsingFreqDist(). The collected array
 * is finally sorted with arsort(), i.e. descending by value with keys
 * preserved.
 *
 * NOTE(review): the array is presumably keyed by stop word with its
 * frequency as value -- confirm against computeUsingFreqDist().
 *
 * @return array contents of $this->stopWords after sorting
 */
public function getStopwords()
{
    if (empty($this->stopWords)) {
        foreach ($this->getFilePaths() as $filePath) {
            $text = $this->getFileContent($filePath);
            $tokenizer = new GeneralTokenizer();
            $document = new TokensDocument($tokenizer->tokenize($text));

            $document
                ->applyTransformation(new LowerCaseFilter())
                ->applyTransformation(new PossessiveNounFilter())
                ->applyTransformation(new PunctuationFilter())
                ->applyTransformation(new CharFilter());

            // Only the frequency mode contributes to the result here; in any
            // other mode the document is prepared but nothing is accumulated.
            if ($this->mode === self::MODE_FREQ) {
                $this->computeUsingFreqDist($document->getDocumentData());
            }
        }

        arsort($this->stopWords);
    }

    return $this->stopWords;
}
/**
 * Removes a document from the index.
 *
 * Sets the dirty flag, asks the inverted index to remove the document's id,
 * and drops the metadata entry stored under that id.
 *
 * @param TokensDocument $document document to remove
 */
public function removeDocument(TokensDocument $document)
{
    $this->isDirty = true;

    $invertedIndex = $this->getInvertedIndex();
    $invertedIndex->removeDocument($document->getId());

    unset($this->metadata[$document->getId()]);
}