/**
 * Record every term of a document in the index.
 *
 * Each occurrence of a term bumps that term's FREQ counter by one and appends
 * the document id to the term's POSTINGS list, so a term that appears several
 * times in the same document contributes the id once per occurrence.
 *
 * @param TokensDocument $document document whose terms are added to the index
 * @return void
 */
public function addDocument(TokensDocument $document)
{
    foreach ($document->getDocumentData() as $token) {
        // First sighting of the term: seed its entry and move on.
        if (!isset($this->index[$token])) {
            $this->index[$token] = [
                self::FREQ => 1,
                self::POSTINGS => [$document->getId()],
            ];
            continue;
        }

        // Known term: count the occurrence and record where it was seen.
        $this->index[$token][self::FREQ]++;
        $this->index[$token][self::POSTINGS][] = $document->getId();
    }
}
/**
 * Tags the fixture text with the Stanford POS tagger and checks the result.
 *
 * The tagger jar and model are looked up under the storage path
 * (corpora/stanford_pos_tagger). The test is reported as skipped, rather than
 * silently passing, when SKIP_TEST is set or JAVA_HOME is not defined.
 */
public function testStanfordPos()
{
    if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
        // markTestSkipped() aborts the test, so the runner shows it as skipped
        // instead of as a passing (or risky, assertion-less) test.
        $this->markTestSkipped('Stanford POS tagger test skipped: SKIP_TEST is set or JAVA_HOME is not defined.');
    }

    $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));

    $jarPath = get_storage_path('corpora/stanford_pos_tagger') . 'stanford-postagger-3.6.0.jar';
    $modelPath = get_storage_path('corpora/stanford_pos_tagger' . DIRECTORY_SEPARATOR . "models") . "english-left3words-distsim.tagger";

    $tagger = new StanfordPosTagger($jarPath, $modelPath);
    $output = $tagger->tag($document->getDocumentData());

    // The tagger is expected to leave its temporary input file behind.
    $this->assertFileExists($tagger->getTmpFilePath());
    $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
    $this->assertEquals(['Michigan', 'NNP'], $output[15], "Did you set JAVA_HOME env variable?");
}
/**
 * Tags the fixture text with the Stanford NER tagger and checks the result.
 *
 * The tagger jar and classifier are looked up under the storage path (ner).
 * The test is reported as skipped, rather than silently passing, when
 * SKIP_TEST is set or JAVA_HOME is not defined.
 */
public function testStanfordNer()
{
    if (getenv('SKIP_TEST') || !getenv('JAVA_HOME')) {
        // markTestSkipped() aborts the test, so the runner shows it as skipped
        // instead of as a passing (or risky, assertion-less) test.
        $this->markTestSkipped('Stanford NER tagger test skipped: SKIP_TEST is set or JAVA_HOME is not defined.');
    }

    $document = new TokensDocument((new WhitespaceTokenizer())->tokenize($this->text));

    $jarPath = get_storage_path('ner') . 'stanford-ner.jar';
    $classifierPath = get_storage_path('ner' . DIRECTORY_SEPARATOR . "classifiers") . "english.all.3class.distsim.crf.ser.gz";

    $tagger = new StanfordNerTagger($jarPath, $classifierPath);
    $output = $tagger->tag($document->getDocumentData());

    // The tagger is expected to leave its temporary input file behind.
    $this->assertFileExists($tagger->getTmpFilePath());
    $this->assertEquals(138, filesize($tagger->getTmpFilePath()));
    $this->assertEquals(['Michigan', 'LOCATION'], $output[15], "Did you set JAVA_HOME env variable?");
}
/**
 * Build the stop word list, or return the one built earlier.
 *
 * A non-empty $this->stopWords is returned untouched. Otherwise each file
 * from getFilePaths() is read, tokenized and passed through the lower-case,
 * possessive-noun, punctuation and char filters; in MODE_FREQ the resulting
 * tokens are handed to computeUsingFreqDist(). The list is finally sorted
 * with arsort(), i.e. descending by value with keys preserved.
 *
 * NOTE(review): presumably computeUsingFreqDist() fills $this->stopWords as
 * word => frequency; confirm against its implementation.
 *
 * @return array stop words and their frequencies
 */
public function getStopwords()
{
    if (!empty($this->stopWords)) {
        return $this->stopWords;
    }

    foreach ($this->getFilePaths() as $filePath) {
        $raw = $this->getFileContent($filePath);
        $tokenizer = new GeneralTokenizer();
        $doc = new TokensDocument($tokenizer->tokenize($raw));

        // Same fluent chain as before, one step per line: each filter is
        // applied to whatever the previous applyTransformation() returned.
        $stage = $doc->applyTransformation(new LowerCaseFilter());
        $stage = $stage->applyTransformation(new PossessiveNounFilter());
        $stage = $stage->applyTransformation(new PunctuationFilter());
        $stage->applyTransformation(new CharFilter());

        // Only the frequency mode contributes anything to the list.
        if ($this->mode !== self::MODE_FREQ) {
            continue;
        }

        $this->computeUsingFreqDist($doc->getDocumentData());
    }

    arsort($this->stopWords);

    return $this->stopWords;
}