/**
 * Reduce a word to its stem using the stemmer selected by $stem_words.
 *
 * Reads the globals $stem_words (two-letter language code), $stem_dir
 * (directory containing the "<lang>_stem.php" files), $min_word_length and
 * $common (array keyed by common words).
 *
 * @param string $word Word to stem.
 * @param mixed  $type Not used by this function; kept so existing callers keep working.
 *
 * @return string|null The word unchanged when it is shorter than
 *                     $min_word_length, contains an asterisk, exclamation
 *                     mark, colon or digit, or when $stem_words names no known
 *                     stemmer. Otherwise the stemmed word, or null when the
 *                     stem is shorter than $min_word_length or is listed in
 *                     $common (the word is discarded, not un-stemmed).
 */
function stem_word($word, $type)
{
    global $stem_words, $stem_dir, $min_word_length, $common;

    // No stemming for too short words or words containing some special characters.
    if (strlen($word) < $min_word_length || preg_match("/[\\*\\!:]|[0-9]/si", $word)) {
        return $word;
    }

    // Languages whose stemmer class is called statically.
    $static_stemmers = array('bg', 'cz', 'de', 'es', 'fi', 'fr', 'hu', 'nl', 'pt', 'ru', 'se');
    // Languages whose stemmer must be called on an instance. 'en' is in this
    // list because en_stemmer::stem() is declared as a non-static method, and
    // calling such a method statically is a fatal error on PHP 8.
    // NOTE(review): assumes en_stemmer has no required constructor arguments - confirm.
    $instance_stemmers = array('el', 'en', 'it');

    $use_instance = in_array($stem_words, $instance_stemmers, true);
    if (!$use_instance && !in_array($stem_words, $static_stemmers, true)) {
        // No stemmer for this language: hand the word back untouched instead of
        // reading an undefined variable and silently discarding every word.
        return $word;
    }

    // The language code was checked against the lists above, so it is safe to
    // build the file and class names from it.
    require_once "{$stem_dir}/{$stem_words}_stem.php";
    $stemmer_class = "{$stem_words}_stemmer";

    if ($stem_words === 'en') {
        // Make all special characters ASCII (for English only).
        setlocale(LC_CTYPE, 'en_GB');
        // Does most of the work. If iconv() fails it returns false, which the
        // steps below turn into an empty stem, so the word ends up discarded.
        $word = iconv('UTF-8', 'ASCII//TRANSLIT', $word);
        // Clean up a few leftovers, like umlauts becoming double quotes.
        $word = preg_replace("/[^\\w\\s]/", "", $word);
    }

    if ($use_instance) {
        $stemmer = new $stemmer_class();
        $word1 = $stemmer->stem($word);
    } else {
        $word1 = $stemmer_class::stem($word);
    }

    // If the stem became too short or turned into a common word, discard the
    // whole word rather than undoing the stemming.
    if (strlen($word1) < $min_word_length || !empty($common[$word1])) {
        return null;
    }

    return $word1;
}
<?php error_reporting(E_ERROR | E_PARSE); echo "Hello\n"; echo "'" . en_stemmer::stem("informal") . "'\n"; echo "Goodbye\n"; /* o------------------------------------------------------------------------------o * * This script is based on Martin Porter's stemming algorithm. * First PHP implementation by Jon Abernathy * Improvements, PHP5 implementation and adapted for Sphider-plus application * by Rolf Kellner [Tec] March 2010 * * o------------------------------------------------------------------------------o */ class en_stemmer { public function stem($word) { if (strlen($word) > 2) { //$word = lower_case($word); $word = self::step_1($word); $word = self::step_2($word); $word = self::step_3($word); $word = self::step_4($word); $word = self::step_5($word); } return $word; } // Step1, if the word is in plural form, it is reduced to singular form. // Then, any -ed or -ing endings are removed as appropriate, and finally, // words ending in "y" with a vowel in the stem have the "y" changed to "i". function step_1($word)