/**
 * Build a word => occurrence-count map from raw text and pass it to the
 * parent constructor.
 *
 * Iterators and arrays are flattened into one space-separated string. The
 * text is lower-cased, stripped of every character other than letters,
 * digits, whitespace and the separators , . : / \ and then split on runs of
 * those separators.
 *
 * Each token is skipped when "<token>|" occurs in self::$stopwords (checked
 * both before and after stemming) or when its length is outside 3..30.
 * Tokens of six or more characters are additionally counted under their
 * soundex() key unless the disable_soundex option is set.
 *
 * @param mixed $data    String, array or iterator of text fragments.
 * @param mixed $options Passed to GLU; disable_stemmer and disable_soundex are read from it.
 */
public function __construct( $data = NULL, $options = NULL ){
    $options = new GLU( $options );

    // Collapse an iterator into a single string, one leading space per item.
    if( $data instanceof iterator ) {
        $joined = '';
        foreach( $data as $piece ) {
            $joined .= ' ' . $piece;
        }
        $data = $joined;
    }

    if( is_array( $data ) ) {
        $data = implode(' ', $data );
    }

    // Normalize, drop unwanted characters, then tokenize on separators.
    $lowered = strtolower( $data );
    $cleaned = trim( preg_replace('/[^a-z0-9\s,\.\:\/\\\]/i', '', $lowered ) );
    $tokens = preg_split("/[\s,\.\:\/\\\]+/", $cleaned );

    $counts = array();
    foreach( $tokens as $token ){
        // Stop-word check on the raw token.
        if( strpos(self::$stopwords, $token . '|') !== FALSE ) {
            continue;
        }

        if( ! $options->disable_stemmer ) {
            $token = PorterStemmer::stem( $token );
        }

        $size = strlen( $token );
        if( $size < 3 || $size > 30 ) {
            continue;
        }

        // Stemming may have produced a stop word, so check again.
        if( strpos(self::$stopwords, $token . '|') !== FALSE ) {
            continue;
        }

        $counts[ $token ] = isset( $counts[ $token ] ) ? $counts[ $token ] + 1 : 1;

        // Longer words are also indexed phonetically.
        if( $size >= 6 && ! $options->disable_soundex ) {
            $phonetic = soundex( $token );
            $counts[ $phonetic ] = isset( $counts[ $phonetic ] ) ? $counts[ $phonetic ] + 1 : 1;
        }
    }

    parent::__construct( $counts );
}
/**
 * Turn a phrase into a list of stemmed words.
 *
 * The phrase is lower-cased and split with str_word_count(), stop words are
 * removed through myTools::removeStopWordsFromArray(), words of one or two
 * characters are dropped and the rest are passed to PorterStemmer::stem().
 *
 * @param string $phrase Text to process.
 * @return array Stemmed words, in their original order.
 */
public static function stemPhrase($phrase)
{
    $tokens = str_word_count(strtolower($phrase), 1);
    $tokens = myTools::removeStopWordsFromArray($tokens);

    $result = array();
    foreach ($tokens as $token) {
        // Only words longer than two characters are stemmed and kept.
        if (strlen($token) > 2) {
            $result[] = PorterStemmer::stem($token, true);
        }
    }

    return $result;
}
/**
 * Break a phrase into search words, optionally stemming each one.
 *
 * Smarty tags and HTML/PHP markup are stripped, regex meta characters are
 * escaped, ASCII upper-case letters are lower-cased, and the text is split on
 * whitespace and punctuation. Stop words are then removed through the module
 * and each remaining word is trimmed of surrounding spaces and quotes; empty
 * words are dropped.
 *
 * @param object $module Module providing RemoveStopWordsFromArray() and GetPreference().
 * @param string $phrase Raw text to tokenize.
 * @return array Words in order, stemmed when the 'usestemming' preference equals 'true'.
 */
function search_StemPhrase(&$module, $phrase)
{
    // strip out smarty tags
    $phrase = preg_replace('/\\{.*?\\}/', '', $phrase);

    // add spaces between tags so adjacent text nodes do not fuse together
    $phrase = str_replace("<", " <", $phrase);
    $phrase = str_replace(">", "> ", $phrase);

    // strip out html and php stuff
    $phrase = strip_tags($phrase);

    // escape meta characters
    $phrase = preg_quote($phrase);

    // Lower-case only the ASCII letters A-Z; strtolower() isn't friendly to
    // other charsets. This used to be preg_replace() with the /e modifier,
    // which PHP 7 removed (the call then returns NULL and the whole phrase is
    // lost). A byte-wise strtr() mapping does the same thing on every version.
    $phrase = strtr($phrase, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');

    // split into words
    $words = preg_split('/[\\s,!.;:\\?()+-\\/\\\\]+/', $phrase);

    // ignore stop words
    $words = $module->RemoveStopWordsFromArray($words);

    $stemmer = new PorterStemmer();
    $stem_pref = $module->GetPreference('usestemming', 'false');

    // stem words
    $stemmed_words = array();
    foreach ($words as $word) {
        // trim words, get rid of wrapping quotes
        $word = trim($word, ' \'"');
        if (strlen($word) <= 0) {
            continue;
        }

        if ($stem_pref == 'true') {
            $stemmed_words[] = $stemmer->stem($word, true);
        } else {
            $stemmed_words[] = $word;
        }
    }

    return $stemmed_words;
}
/**
 * Produce a stemmed copy of a token.
 *
 * The term text is run through PorterStemmer::stem(); the start offset, end
 * offset and position increment are copied from the source token unchanged.
 *
 * @param Zend_Search_Lucene_Analysis_Token $srcToken
 * @return Zend_Search_Lucene_Analysis_Token
 */
public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
{
    $stemmedText = PorterStemmer::stem($srcToken->getTermText());
    $startOffset = $srcToken->getStartOffset();
    $endOffset = $srcToken->getEndOffset();

    $normalized = new Zend_Search_Lucene_Analysis_Token($stemmedText, $startOffset, $endOffset);
    $normalized->setPositionIncrement($srcToken->getPositionIncrement());

    return $normalized;
}
<?php require_once 'db.php'; include 'classes/stem.php'; include 'classes/cleaner.php'; if (!empty($_POST['search'])) { $string = $_POST['search']; $main_url = 'http://www.roscripts.com/'; $stemmer = new PorterStemmer(); $stemmed_string = $stemmer->stem($string); $clean_string = new jSearchString(); $stemmed_string = $clean_string->parseString($stemmed_string); $new_string = ''; foreach (array_unique(split(" ", $stemmed_string)) as $array => $value) { if (strlen($value) >= 3) { $new_string .= '' . $value . ' '; } } $new_string = substr($new_string, 0, strLen($new_string) - 1); if (strlen($new_string) > 3) { $split_stemmed = split(" ", $new_string); mysql_select_db($database); $sql = "SELECT DISTINCT COUNT(*) as occurences, title, subtitle FROM articles WHERE ("; while (list($key, $val) = each($split_stemmed)) { if ($val != '' && strlen($val) > 0) { $sql .= "((title LIKE '%" . $val . "%' OR subtitle LIKE '%" . $val . "%' OR content LIKE '%" . $val . "%')) OR"; } } $sql = substr($sql, 0, strLen($sql) - 3); //this will eat the last OR $sql .= ") GROUP BY title ORDER BY occurences DESC LIMIT 10";
/**
 * Stem a single word by delegating to PorterStemmer::stem().
 *
 * @see xfLuceneStemmer
 */
public function doStem($word)
{
    $stemmed = PorterStemmer::stem($word);

    return $stemmed;
}
/**
 * Convert a search phrase into a list of stemmed words.
 *
 * A phrase equal to "%" is treated as a wildcard-only search and returned
 * as a single-element array. Otherwise hyphens become spaces, the phrase is
 * lower-cased and split into words, stop words are removed, words of one or
 * two characters are dropped, and every remaining word is stemmed unless it
 * contains a "%" wildcard.
 *
 * @param string $strPhrase Phrase to process.
 * @return array Stemmed (or wildcard) words.
 */
private function stemPhrase($strPhrase)
{
    //*** Wildcard only search.
    if ($strPhrase == "%") {
        return array($strPhrase);
    }

    //*** Split into words and drop the stop words.
    $strNormalized = str_replace('-', ' ', mb_strtolower($strPhrase));
    $arrCandidates = $this->mb_str_word_count($strNormalized, 1);
    $arrCandidates = $this->removeStopWordsFromArray($arrCandidates);

    $arrResult = array();
    foreach ($arrCandidates as $strCandidate) {
        //*** Ignore 1 and 2 letter words.
        if (mb_strlen($strCandidate) <= 2) {
            continue;
        }

        //*** Wildcard words are kept verbatim, everything else is stemmed.
        $blnHasWildcard = (stripos($strCandidate, "%") !== false);
        $arrResult[] = $blnHasWildcard
            ? $strCandidate
            : PorterStemmer::stem($strCandidate, true);
    }

    return $arrResult;
}
/**
 * Build the weighted word list for this object.
 *
 * The tag-stripped HTML body and the title are each repeated according to
 * their configured weight (app_search_body_weight / app_search_title_weight),
 * stemmed with myTools::stemPhrase() and counted. Each of the 20 popular tags
 * then adds ceil(count / first tag's count * app_search_tag_weight) to the
 * entry for its stemmed form.
 *
 * @return array Map of stemmed word => weight.
 */
public function getWords()
{
    // Body first, then title, each repeated by its weight.
    $bodyText = str_repeat(' ' . strip_tags($this->getHtmlBody()), sfConfig::get('app_search_body_weight'));
    $titleText = str_repeat(' ' . $this->getTitle(), sfConfig::get('app_search_title_weight'));

    // Stem everything and count occurrences of each stem.
    $words = array_count_values(myTools::stemPhrase($bodyText . $titleText));

    // The first non-zero tag count becomes the reference for scaling.
    $referenceCount = 0;
    foreach ($this->getPopularTags(20) as $tag => $count) {
        if (!$referenceCount) {
            $referenceCount = $count;
        }

        $key = PorterStemmer::stem($tag);
        $boost = ceil($count / $referenceCount * sfConfig::get('app_search_tag_weight'));
        $words[$key] = (isset($words[$key]) ? $words[$key] : 0) + $boost;
    }

    return $words;
}