getDocumentData() 공개 메소드

Returns an array of tokens.
public getDocumentData ( ) : array
리턴 array
예제 #1
0
 /**
  * Indexes every term of the given document.
  *
  * Each term yielded by getDocumentData() increases that term's FREQ counter
  * by one and appends the document id to its POSTINGS list, so a term that
  * occurs several times in one document records the id once per occurrence.
  *
  * @param TokensDocument $document Document whose terms are added to the index
  * @return void
  */
 public function addDocument(TokensDocument $document)
 {
     foreach ($document->getDocumentData() as $token) {
         $documentId = $document->getId();

         // First sighting of this term: create its entry and move on.
         if (!isset($this->index[$token])) {
             $this->index[$token] = [
                 self::FREQ => 1,
                 self::POSTINGS => [$documentId],
             ];
             continue;
         }

         // Known term: count the occurrence and record where it was seen.
         $this->index[$token][self::FREQ] += 1;
         $this->index[$token][self::POSTINGS][] = $documentId;
     }
 }
 /**
  * Runs the Stanford POS tagger over the whitespace-tokenized $this->text and
  * checks the temp file it writes and the tag assigned to token 15.
  *
  * The test is reported as skipped (rather than silently passing without any
  * assertion) when SKIP_TEST is set or JAVA_HOME is not defined.
  */
 public function testStanfordPos()
 {
     if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
         // markTestSkipped() aborts the test by throwing, so no return is needed.
         $this->markTestSkipped('Stanford POS tagger test skipped: SKIP_TEST is set or JAVA_HOME is missing.');
     }

     $tokens = (new WhitespaceTokenizer())->tokenize($this->text);
     $document = new TokensDocument($tokens);

     $jarPath = get_storage_path('corpora/stanford_pos_tagger') . 'stanford-postagger-3.6.0.jar';
     $modelPath = get_storage_path('corpora/stanford_pos_tagger' . DIRECTORY_SEPARATOR . "models") . "english-left3words-distsim.tagger";

     $tagger = new StanfordPosTagger($jarPath, $modelPath);
     $output = $tagger->tag($document->getDocumentData());

     // The tagger is expected to leave its input in a temp file of a known size.
     $this->assertFileExists($tagger->getTmpFilePath());
     $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
     $this->assertEquals(['Michigan', 'NNP'], $output[15], "Did you set JAVA_HOME env variable?");
 }
 /**
  * Runs the Stanford NER tagger over the whitespace-tokenized $this->text and
  * checks the temp file it writes and the entity label assigned to token 15.
  *
  * The test is reported as skipped (rather than silently passing without any
  * assertion) when SKIP_TEST is set or JAVA_HOME is not defined.
  */
 public function testStanfordNer()
 {
     if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
         // markTestSkipped() aborts the test by throwing, so no return is needed.
         $this->markTestSkipped('Stanford NER tagger test skipped: SKIP_TEST is set or JAVA_HOME is missing.');
     }

     $tokens = (new WhitespaceTokenizer())->tokenize($this->text);
     $document = new TokensDocument($tokens);

     $jarPath = get_storage_path('ner') . 'stanford-ner.jar';
     $classifierPath = get_storage_path('ner' . DIRECTORY_SEPARATOR . "classifiers") . "english.all.3class.distsim.crf.ser.gz";

     $tagger = new StanfordNerTagger($jarPath, $classifierPath);
     $output = $tagger->tag($document->getDocumentData());

     // The tagger is expected to leave its input in a temp file of a known size.
     $this->assertFileExists($tagger->getTmpFilePath());
     $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
     $this->assertEquals(['Michigan', 'LOCATION'], $output[15], "Did you set JAVA_HOME env variable?");
 }
 /**
  * Builds (once) and returns the stop-word data gathered from the source files.
  *
  * A non-empty $this->stopWords is returned as-is. Otherwise every file from
  * getFilePaths() is tokenized and normalised, and in MODE_FREQ its tokens are
  * handed to computeUsingFreqDist(). The result is then sorted with arsort(),
  * i.e. by value in descending order with keys preserved.
  *
  * @return array Presumably word => frequency pairs; confirm against
  *               computeUsingFreqDist(), which is not visible here.
  */
 public function getStopwords()
 {
     // Reuse the previously computed result when there is one.
     if (!empty($this->stopWords)) {
         return $this->stopWords;
     }

     foreach ($this->getFilePaths() as $path) {
         $text = $this->getFileContent($path);
         $tokens = (new GeneralTokenizer())->tokenize($text);
         $tokensDocument = new TokensDocument($tokens);

         // Normalise tokens: lower-case, strip possessives, punctuation and stray characters.
         $tokensDocument
             ->applyTransformation(new LowerCaseFilter())
             ->applyTransformation(new PossessiveNounFilter())
             ->applyTransformation(new PunctuationFilter())
             ->applyTransformation(new CharFilter());

         // Only the frequency mode contributes anything; other modes are ignored here.
         if ($this->mode !== self::MODE_FREQ) {
             continue;
         }

         $this->computeUsingFreqDist($tokensDocument->getDocumentData());
     }

     arsort($this->stopWords);

     return $this->stopWords;
 }