Ejemplo n.º 1
0
 /**
  * Build a term => count map from raw text and hand it to the parent constructor.
  *
  * The text is lower-cased, stripped of everything except letters, digits,
  * whitespace and the separators , . : / \ and then split on those separators.
  * Each word is checked against self::$stopwords, optionally stemmed, dropped
  * when its (stemmed) length is below 3 or above 30, and counted. Words of
  * length 6 or more are additionally counted under their soundex() code unless
  * that is disabled.
  *
  * @param mixed $data    A string, an array of strings, or any Traversable of
  *                       string-castable values. NULL is treated as empty text.
  * @param mixed $options Passed to GLU; the flags `disable_stemmer` and
  *                       `disable_soundex` are read from the resulting object.
  */
 public function __construct( $data  = NULL, $options = NULL ){
     $options = new GLU( $options );

     // Accept every Traversable (Iterator and IteratorAggregate alike).
     if( $data instanceof Traversable ) {
         $str = '';
         foreach( $data as $v ) $str .= ' ' . $v;
         $data = $str;
     }
     if( is_array( $data ) ) $data = implode(' ', $data );

     // The explicit cast keeps a NULL argument from reaching strtolower().
     $data = strtolower( (string) $data );
     $data = preg_replace('/[^a-z0-9\s,\.\:\/\\\]/i', '', $data);
     $data = preg_split("/[\s,\.\:\/\\\]+/", trim($data));

     // self::$stopwords is searched for "word|". Prefixing the list and the
     // needle with '|' anchors the match at both ends, so a word that is merely
     // the tail of a stop word (e.g. "hat" in "that|") is no longer discarded.
     $stopwords = '|' . self::$stopwords;

     $words = array();
     foreach( $data as $word ){
         // preg_split() yields an empty string for empty input; nothing to index.
         if( $word === '' ) continue;
         if( strpos($stopwords, '|' . $word . '|') !== FALSE ) continue;
         if( ! $options->disable_stemmer ) $word = PorterStemmer::stem( $word );
         $len = strlen( $word );
         if( $len < 3 || $len > 30 ) continue;
         // Stemming may have turned the word into a stop word, so check again.
         if( strpos($stopwords, '|' . $word . '|') !== FALSE ) continue;
         if( ! isset( $words[ $word ] ) ) $words[ $word ] = 0;
         $words[$word]++;
         if( $len < 6 || $options->disable_soundex ) continue;
         // Longer words are also counted under their phonetic (soundex) key.
         $word = soundex( $word );
         if( ! isset( $words[ $word ] ) ) $words[ $word ] = 0;
         $words[$word]++;
     }
     parent::__construct( $words );
 }
Ejemplo n.º 2
0
 /**
  * Turn a phrase into a list of stemmed words.
  *
  * The phrase is lower-cased and split with str_word_count(), stop words are
  * removed through myTools::removeStopWordsFromArray(), words of one or two
  * characters are skipped, and every remaining word is passed to
  * PorterStemmer::stem().
  *
  * @param string $phrase Text to process.
  * @return array Stemmed words, in their original order (duplicates kept).
  */
 public static function stemPhrase($phrase)
 {
     $candidates = myTools::removeStopWordsFromArray(
         str_word_count(strtolower($phrase), 1)
     );

     $stems = array();
     foreach ($candidates as $candidate) {
         // only words longer than two characters are stemmed and kept
         if (strlen($candidate) > 2) {
             $stems[] = PorterStemmer::stem($candidate, true);
         }
     }

     return $stems;
 }
Ejemplo n.º 3
0
/**
 * Split a phrase into searchable words, optionally stemming each one.
 *
 * Smarty tags ({...}) and HTML/PHP tags are removed, regex meta characters are
 * escaped with preg_quote(), ASCII upper-case letters are lower-cased, and the
 * result is split on whitespace and punctuation. Stop words are removed via
 * $module->RemoveStopWordsFromArray(). Words are stemmed only when the module
 * preference 'usestemming' equals 'true'.
 *
 * @param object $module Object providing RemoveStopWordsFromArray() and GetPreference().
 * @param string $phrase Raw text to process.
 * @return array List of non-empty (optionally stemmed) words.
 */
function search_StemPhrase(&$module, $phrase)
{
    // strip out smarty tags
    $phrase = preg_replace('/\\{.*?\\}/', '', $phrase);
    // add spaces between tags so adjacent words do not merge once tags are stripped
    $phrase = str_replace("<", " <", $phrase);
    $phrase = str_replace(">", "> ", $phrase);
    // strip out html and php stuff
    $phrase = strip_tags($phrase);
    // escape meta characters
    $phrase = preg_quote($phrase);
    // Lower-case ASCII letters only; a plain strtolower() on the whole string
    // isn't friendly to other charsets. The former /e modifier is gone in
    // PHP 7+, so a callback performs the replacement.
    $phrase = preg_replace_callback(
        '/[A-Z]+/',
        function ($match) {
            return strtolower($match[0]);
        },
        $phrase
    );
    // split into words
    $words = preg_split('/[\\s,!.;:\\?()+-\\/\\\\]+/', $phrase);
    // ignore stop words
    $words = $module->RemoveStopWordsFromArray($words);
    $stemmer = new PorterStemmer();
    // stem words
    $stemmed_words = array();
    $stem_pref = $module->GetPreference('usestemming', 'false');
    foreach ($words as $word) {
        // trim words, get rid of wrapping quotes
        $word = trim($word, ' \'"');
        if (strlen($word) <= 0) {
            continue;
        }
        if ($stem_pref == 'true') {
            $stemmed_words[] = $stemmer->stem($word, true);
        } else {
            $stemmed_words[] = $word;
        }
    }
    return $stemmed_words;
}
 /**
  * Produce a stemmed copy of a token.
  *
  * The term text is passed through PorterStemmer::stem(); start offset, end
  * offset and position increment are copied from the source token. The
  * filter contract allows returning null to remove a token, but this
  * implementation always returns a token.
  *
  * @param Zend_Search_Lucene_Analysis_Token $srcToken
  * @return Zend_Search_Lucene_Analysis_Token
  */
 public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
 {
     $stemmedText = PorterStemmer::stem($srcToken->getTermText());
     $startOffset = $srcToken->getStartOffset();
     $endOffset = $srcToken->getEndOffset();

     $stemmedToken = new Zend_Search_Lucene_Analysis_Token($stemmedText, $startOffset, $endOffset);
     $stemmedToken->setPositionIncrement($srcToken->getPositionIncrement());

     return $stemmedToken;
 }
<?php

require_once 'db.php';
include 'classes/stem.php';
include 'classes/cleaner.php';
// Search handler: runs only when a non-empty 'search' field was posted.
// It stems and cleans the query, keeps the distinct words of 3+ characters,
// and builds a LIKE-based SQL query over the articles table.
if (!empty($_POST['search'])) {
    $string = $_POST['search'];
    $main_url = 'http://www.roscripts.com/';
    // Stem the raw query, then pass it through jSearchString::parseString().
    $stemmer = new PorterStemmer();
    $stemmed_string = $stemmer->stem($string);
    $clean_string = new jSearchString();
    $stemmed_string = $clean_string->parseString($stemmed_string);
    $new_string = '';
    // Keep each distinct space-separated word that is at least 3 characters long.
    // NOTE(review): split() was removed in PHP 7; explode() is the usual replacement.
    foreach (array_unique(split(" ", $stemmed_string)) as $array => $value) {
        if (strlen($value) >= 3) {
            $new_string .= '' . $value . ' ';
        }
    }
    // Drop the trailing space appended by the loop above.
    $new_string = substr($new_string, 0, strLen($new_string) - 1);
    if (strlen($new_string) > 3) {
        $split_stemmed = split(" ", $new_string);
        // NOTE(review): the mysql_* extension was removed in PHP 7 - confirm the target runtime.
        mysql_select_db($database);
        $sql = "SELECT DISTINCT COUNT(*) as occurences, title, subtitle FROM articles WHERE (";
        // Add one "title/subtitle/content LIKE '%word%'" group per word, joined with OR.
        // NOTE(review): each() was removed in PHP 8.
        // NOTE(review): $val is concatenated into the SQL text unescaped here; unless
        // parseString() already escapes it, this is open to SQL injection - verify.
        while (list($key, $val) = each($split_stemmed)) {
            if ($val != '' && strlen($val) > 0) {
                $sql .= "((title LIKE '%" . $val . "%' OR subtitle LIKE '%" . $val . "%' OR content LIKE '%" . $val . "%')) OR";
            }
        }
        $sql = substr($sql, 0, strLen($sql) - 3);
        // The substr() above removes the last 3 characters, i.e. the trailing " OR".
        $sql .= ") GROUP BY title ORDER BY occurences DESC LIMIT 10";
 /**
  * Stem a single word by delegating to PorterStemmer::stem().
  *
  * @see xfLuceneStemmer
  *
  * @param string $word Word to stem.
  * @return mixed Whatever PorterStemmer::stem() returns for the word.
  */
 public function doStem($word)
 {
     $stem = PorterStemmer::stem($word);

     return $stem;
 }
Ejemplo n.º 7
0
 /**
  * Convert a search phrase into a list of stemmed words.
  *
  * A phrase that is only "%" is returned unchanged as a one-element array.
  * Otherwise the phrase is lower-cased, hyphens become spaces, the text is
  * split into words and stop words are removed. Words of one or two
  * characters are skipped, words containing "%" are kept verbatim, and all
  * other words are passed to PorterStemmer::stem().
  *
  * @param string $strPhrase Phrase to process.
  * @return array Resulting words.
  */
 private function stemPhrase($strPhrase)
 {
     //*** Wildcard only search: nothing to split or stem.
     if ($strPhrase == "%") {
         return array($strPhrase);
     }

     //*** Normalise, split into words and drop the stop words.
     $strNormalized = str_replace('-', ' ', mb_strtolower($strPhrase));
     $arrCandidates = $this->removeStopWordsFromArray(
         $this->mb_str_word_count($strNormalized, 1)
     );

     $arrResult = array();
     foreach ($arrCandidates as $strCandidate) {
         //*** Only words longer than two characters are kept.
         if (mb_strlen($strCandidate) > 2) {
             //*** Wildcard words are kept as typed; everything else is stemmed.
             $arrResult[] = (stripos($strCandidate, "%") !== false)
                 ? $strCandidate
                 : PorterStemmer::stem($strCandidate, true);
         }
     }

     return $arrResult;
 }
Ejemplo n.º 8
0
 /**
  * Build the weighted word list for this object.
  *
  * The tag-stripped HTML body and the title are each repeated according to
  * the configured weights (app_search_body_weight, app_search_title_weight),
  * stemmed via myTools::stemPhrase() and counted. The tags returned by
  * getPopularTags(20) are then added: each tag is stemmed and weighted by its
  * count relative to the first non-zero count seen, scaled by
  * app_search_tag_weight and rounded up.
  *
  * @return array Map of stemmed word => weight.
  */
 public function getWords()
 {
     // body first, then title, each repeated by its configured weight
     $bodyPart = ' ' . strip_tags($this->getHtmlBody());
     $text = str_repeat($bodyPart, sfConfig::get('app_search_body_weight'));
     $titlePart = ' ' . $this->getTitle();
     $text .= str_repeat($titlePart, sfConfig::get('app_search_title_weight'));

     // occurrences of each stemmed word serve as its weight
     $weights = array_count_values(myTools::stemPhrase($text));

     // tags: the first non-zero count becomes the reference for scaling
     $reference = 0;
     foreach ($this->getPopularTags(20) as $tag => $count) {
         $reference = $reference ? $reference : $count;
         $key = PorterStemmer::stem($tag);
         $current = isset($weights[$key]) ? $weights[$key] : 0;
         $weights[$key] = $current + ceil($count / $reference * sfConfig::get('app_search_tag_weight'));
     }

     return $weights;
 }