/**
 * Exercises a DictionaryStemmer built from a PspellAdapter and a SnowballStemmer.
 *
 * The test returns without asserting anything when the SKIP_TEST environment
 * variable holds a truthy value.
 */
public function testPspell()
{
    $skipRequested = getenv('SKIP_TEST');
    if ($skipRequested) {
        return;
    }

    $spellChecker = new PspellAdapter();
    $snowball = new SnowballStemmer();
    $dictionaryStemmer = new DictionaryStemmer($spellChecker, $snowball);

    // a plural is expected to be reduced to its singular form
    $this->assertEquals("judge", $dictionaryStemmer->stem("judges"));

    // the approach does not always work: the stem of "university" is
    // expected to differ from the word itself
    $this->assertNotEquals('university', $dictionaryStemmer->stem("university"));

    // a misspelled word is expected to come back with the corrected spelling
    $this->assertEquals('hammock', $dictionaryStemmer->stem("hammok"));
}
/**
 * Reads text piped in on STDIN and lists tokens whose stem differs strongly from the token.
 *
 * The text is tokenized with GeneralTokenizer, each token is passed through
 * $this->filter, duplicates are removed, and every remaining token is stemmed
 * with a DictionaryStemmer and lower-cased. Every pair whose Levenshtein
 * distance is 4 or more is echoed as "token,stem" on its own line, so the
 * original token can be mapped manually.
 *
 * @param InputInterface  $input  Not used by this method.
 * @param OutputInterface $output Not used by this method; results are written with echo.
 *
 * @return int Always 0 when the input was processed.
 *
 * @throws \RuntimeException When ftell(STDIN) does not report position 0, or STDIN cannot be read.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    // NOTE(review): a position of 0 is used as the signal that data was piped in — confirm on all platforms
    if (ftell(STDIN) !== 0) {
        throw new \RuntimeException("Please pipe in STDIN");
    }

    // read the whole stream at once instead of looping on feof()/fread(),
    // which would never terminate if fread() kept failing
    $contents = stream_get_contents(STDIN);
    if ($contents === false) {
        throw new \RuntimeException("Unable to read from STDIN");
    }

    // filtered tokens; array_unique() keeps the original keys, so reindex to
    // make sure every unique token is visited below
    $tokens = array_map([$this, 'filter'], (new GeneralTokenizer())->tokenize($contents));
    $tokens = array_values(array_unique($tokens));

    // stem the tokens
    $stemmer = new DictionaryStemmer(new PspellAdapter(), new SnowballStemmer());
    $stemmedTokens = array_map(function ($token) use ($stemmer) {
        return $stemmer->stem($token);
    }, $tokens);

    // use a dictionary to catch all stemmed words that must be fixed or ignored in this data set
    $stemmedTokens = array_map('mb_strtolower', $stemmedTokens);

    $comparison = new LevenshteinComparison();
    foreach ($tokens as $index => $token) {
        $stemmedToken = isset($stemmedTokens[$index]) ? $stemmedTokens[$index] : null;
        if ($token === null || $stemmedToken === null) {
            continue;
        }

        // the stemmed word is not a word in the dictionary. The original token
        // will need to be manually mapped
        if ($comparison->distance($token, $stemmedToken) >= 4) {
            echo "{$token},{$stemmedToken}" . PHP_EOL;
        }
    }

    // report success as an exit status
    return 0;
}