Author: yooper (yooper)
Inheritance: extends TextAnalysis\Documents\DocumentAbstract
コード例 #1
0
 /**
  * Asserts that CollocationFinder::getCollocationsByPmi() returns an array
  * containing the key 'outlying cottages' for the shared fixture text.
  */
 public function testGetCollocationsByPmi()
 {
     // Pre-process the raw fixture, then tokenize using space, newline,
     // tab and carriage return as the tokenizer argument.
     $spacedText = (new SpacePunctuationFilter())->transform(self::$text);
     $tokenizer = new GeneralTokenizer(" \n\t\r");
     $document = new TokensDocument($tokenizer->tokenize($spacedText));

     // Kept as a single fluent chain: each call operates on whatever the
     // previous applyTransformation() returned.
     $document
         ->applyTransformation(new LowerCaseFilter())
         ->applyTransformation(new PunctuationFilter([]), false)
         ->applyTransformation(new StopWordsFilter([]))
         ->applyTransformation(new QuotesFilter())
         ->applyTransformation(new CharFilter());

     $collocations = (new CollocationFinder($document->toArray(), 2))->getCollocationsByPmi();

     $this->assertArrayHasKey('outlying cottages', $collocations);
 }
コード例 #2
0
 /**
  * Index every term of the given document.
  *
  * For each term yielded by the document's data, the term's frequency
  * counter is incremented and the document id is appended to the term's
  * postings list. This happens once per occurrence, so a term that appears
  * several times in one document contributes the same id several times.
  *
  * @param TokensDocument $document document whose terms are added to the index
  * @return void
  */
 public function addDocument(TokensDocument $document)
 {
     foreach ($document->getDocumentData() as $token) {
         // First sighting of this term: start from an empty entry so the
         // update below is the same for new and existing terms.
         if (!isset($this->index[$token])) {
             $this->index[$token] = [self::FREQ => 0, self::POSTINGS => []];
         }

         $this->index[$token][self::FREQ]++;
         $this->index[$token][self::POSTINGS][] = $document->getId();
     }
 }
コード例 #3
0
ファイル: RakeTest.php プロジェクト: yooper/php-text-analysis
 /**
  * Runs Rake over the test data and asserts that two expected phrases are
  * present as keys of the returned keyword scores.
  */
 public function testRake()
 {
     // Stop words are read one per line; trim() strips the trailing newline.
     $stopWordLines = file(VENDOR_DIR . 'yooper/stop-words/data/stop-words_english_1_en.txt');
     $stopwords = array_map('trim', $stopWordLines);

     // all punctuation must be moved 1 over. Fixes issues with sentences
     $spacingFilter = new SpacePunctuationFilter([':', '\\/']);
     $spacedText = $spacingFilter->transform($this->getTestData());

     //rake MUST be split on whitespace and new lines only
     $tokenizer = new GeneralTokenizer(" \n\t\r");
     $document = new TokensDocument($tokenizer->tokenize($spacedText));

     // Kept as a single fluent chain: each call operates on whatever the
     // previous applyTransformation() returned.
     $document
         ->applyTransformation(new LowerCaseFilter())
         ->applyTransformation(new StopWordsFilter($stopwords), false)
         ->applyTransformation(new PunctuationFilter(['@', ':', '\\/']), false)
         ->applyTransformation(new CharFilter(), false);

     $keywordScores = (new Rake($document, 3))->getKeywordScores();

     $this->assertArrayHasKey('minimal generating sets', $keywordScores);
     $this->assertArrayHasKey('8/8/2016 5:51 pm', $keywordScores);
 }
コード例 #4
0
 /**
  * Tags the fixture text with the Stanford POS tagger and checks the
  * temporary input file and one tagged token.
  *
  * The test is reported as skipped (rather than silently returning, which
  * shows up as a pass or a risky test with no assertions) when the
  * SKIP_TEST environment variable is set or JAVA_HOME is not set.
  */
 public function testStanfordPos()
 {
     if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
         // markTestSkipped() aborts the test by throwing, so nothing below runs.
         $this->markTestSkipped('Stanford POS tagger test skipped: SKIP_TEST is set or JAVA_HOME is not set.');
     }

     $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));

     $jarPath = get_storage_path('corpora/stanford_pos_tagger') . 'stanford-postagger-3.6.0.jar';
     $modelPath = get_storage_path('corpora/stanford_pos_tagger' . DIRECTORY_SEPARATOR . "models") . "english-left3words-distsim.tagger";

     $tagger = new StanfordPosTagger($jarPath, $modelPath);
     $output = $tagger->tag($document->getDocumentData());

     // The tagger is expected to leave its temporary file behind, with a
     // known size for the fixture text.
     $this->assertFileExists($tagger->getTmpFilePath());
     $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
     $this->assertEquals(['Michigan', 'NNP'], $output[15], "Did you set JAVA_HOME env variable?");
 }
コード例 #5
0
 /**
  * Tags the fixture text with the Stanford NER tagger and checks the
  * temporary input file and one tagged token.
  *
  * The test is reported as skipped (rather than silently returning, which
  * shows up as a pass or a risky test with no assertions) when the
  * SKIP_TEST environment variable is set or JAVA_HOME is not set.
  */
 public function testStanfordNer()
 {
     if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
         // markTestSkipped() aborts the test by throwing, so nothing below runs.
         $this->markTestSkipped('Stanford NER tagger test skipped: SKIP_TEST is set or JAVA_HOME is not set.');
     }

     $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));

     $jarPath = get_storage_path('ner') . 'stanford-ner.jar';
     $classifierPath = get_storage_path('ner' . DIRECTORY_SEPARATOR . "classifiers") . "english.all.3class.distsim.crf.ser.gz";

     $tagger = new StanfordNerTagger($jarPath, $classifierPath);
     $output = $tagger->tag($document->getDocumentData());

     // The tagger is expected to leave its temporary file behind, with a
     // known size for the fixture text.
     $this->assertFileExists($tagger->getTmpFilePath());
     $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
     $this->assertEquals(['Michigan', 'LOCATION'], $output[15], "Did you set JAVA_HOME env variable?");
 }
コード例 #6
0
 /**
  * Returns the stop words, building them from the configured files on
  * first use.
  *
  * A non-empty $this->stopWords is returned as-is. Otherwise every file
  * path is read, tokenized and normalized, and, in MODE_FREQ only, passed
  * to computeUsingFreqDist(). The result is then sorted by value in
  * descending order with keys preserved.
  *
  * @return array NOTE(review): presumably stop word => frequency; confirm
  *               against computeUsingFreqDist()
  */
 public function getStopwords()
 {
     // Already computed: reuse the stored result.
     if (!empty($this->stopWords)) {
         return $this->stopWords;
     }

     foreach ($this->getFilePaths() as $path) {
         $tokenizer = new GeneralTokenizer();
         $tokens = $tokenizer->tokenize($this->getFileContent($path));
         $document = new TokensDocument($tokens);

         // Kept as a single fluent chain: each call operates on whatever
         // the previous applyTransformation() returned.
         $document
             ->applyTransformation(new LowerCaseFilter())
             ->applyTransformation(new PossessiveNounFilter())
             ->applyTransformation(new PunctuationFilter())
             ->applyTransformation(new CharFilter());

         // Frequency mode is the only mode handled here.
         if (self::MODE_FREQ === $this->mode) {
             $this->computeUsingFreqDist($document->getDocumentData());
         }
     }

     // Highest values first; arsort() keeps the key association intact.
     arsort($this->stopWords);

     return $this->stopWords;
 }
コード例 #7
0
 /**
  * Remove a document from the index.
  *
  * Sets the dirty flag, removes the document's id from the inverted index
  * and drops the metadata entry stored under that id.
  *
  * @param TokensDocument $document document to remove, identified by its id
  */
 public function removeDocument(TokensDocument $document)
 {
     // The flag is set before the inverted index is fetched; keep this order.
     $this->isDirty = true;

     $invertedIndex = $this->getInvertedIndex();
     $invertedIndex->removeDocument($document->getId());

     unset($this->metadata[$document->getId()]);
 }