/**
 * Asserts that the stemmer maps every word to the stem at the same position.
 *
 * $words is walked with foreach; $stems is advanced by hand in lockstep and
 * is not rewound by this method.
 *
 * @param Stemmer   $stemmer Stemmer under test.
 * @param \Iterator $words   Words to stem.
 * @param \Iterator $stems   Expected stems, in the same order as $words.
 */
protected function checkStemmer(Stemmer $stemmer, \Iterator $words, \Iterator $stems)
 {
     foreach ($words as $word) {
         $expected = $stems->current();
         // Stem once and reuse the value for both the comparison and the message.
         $actual = $stemmer->stem($word);
         // assertEquals() takes the expected value first, then the actual one.
         $this->assertEquals($expected, $actual, "The stem for '{$word}' should be '{$expected}' not '{$actual}'");
         $stems->next();
     }
 }
Exemple #2
0
 /**
  * Checks Stemmer::getWordBase() against a fixed table of Russian words.
  *
  * Each key is an input word and each value is the base it must reduce to.
  */
 public function testStemming()
 {
     $expectedBases = [
         'результаты' => 'результат',
         'в' => 'в',
         'вавиловка' => 'вавиловк',
         'вагнера' => 'вагнер',
         'вагон' => 'вагон',
         'вагона' => 'вагон',
         'вагоне' => 'вагон',
         'вагонов' => 'вагон',
         'вагоном' => 'вагон',
         'вагоны' => 'вагон',
         'важная' => 'важн',
         'важнее' => 'важн',
         'важнейшие' => 'важн',
         'важнейшими' => 'важн',
         'важничал' => 'важнича',
         'важно' => 'важн',
         'важного' => 'важн',
         'важное' => 'важн',
         'важной' => 'важн',
         'важном' => 'важн',
         'важному' => 'важн',
         'важности' => 'важност',
         'важностию' => 'важност',
         'важность' => 'важност',
         'важностью' => 'важност',
         'важную' => 'важн',
         'важны' => 'важн',
         'важные' => 'важн',
         'важный' => 'важн',
         'важным' => 'важн',
         'важных' => 'важн',
         'вазах' => 'ваз',
         'вазы' => 'ваз',
         'вакса' => 'вакс',
         'вакханка' => 'вакханк',
         'вал' => 'вал',
         'валандался' => 'валанда',
         'валентина' => 'валентин',
         'валериановых' => 'валерианов',
         'валерию' => 'валер',
         'валетами' => 'валет',
         'вали' => 'вал',
         'валил' => 'вал',
         'валился' => 'вал',
         'валится' => 'вал',
         'валов' => 'вал',
         'вальдшнепа' => 'вальдшнеп',
         'вальс' => 'вальс',
         'вальса' => 'вальс',
         'вальсе' => 'вальс',
         'вальсишку' => 'вальсишк',
         'вальтера' => 'вальтер',
         'валяется' => 'валя',
         'валялась' => 'валя',
         'валялись' => 'валя',
         'валялось' => 'валя',
         'валялся' => 'валя',
         'валять' => 'валя',
         'валяются' => 'валя',
         'вам' => 'вам',
         'вами' => 'вам',
     ];
     $subject = new Stemmer();
     foreach ($expectedBases as $input => $expectedBase) {
         $this->assertEquals($expectedBase, $subject->getWordBase($input));
     }
 }
 /**
  * Builds a frequency table of repeated words and two-word phrases in a text.
  *
  * The text is run through Stemmer::stem_words(), stripped of tags and
  * backslashes, lower-cased, and commas, double quotes and periods are
  * replaced by spaces. The result contains:
  *  - adjacent word pairs that occur at least twice, then
  *  - single words of at least 4 characters that occur at least 3 times,
  * each group sorted by count, highest first.
  *
  * @param string $text Raw input text (may contain HTML).
  * @return array Map of "word" or "word word" => number of occurrences.
  */
 public function getAnalysis($text)
 {
     $an = new Stemmer();
     $text = $an->stem_words($text);
     // Remove HTML/PHP tags.
     $text = strip_tags($text);
     // Remove backslashes; this can expose new tags, hence the second strip_tags().
     $text = stripslashes($text);
     $text = mb_strtolower(strip_tags($text), "utf-8");
     // NOTE(review): both patterns match runs of plain spaces and look like
     // mangled whitespace escapes -- kept as-is, confirm the intent.
     $patterns = array('/\\  /s', '/\\       /s');
     // An empty string instead of null: a null replacement is deprecated in PHP 8.1.
     $text = preg_replace($patterns, '', $text);
     // Treat basic punctuation as a word separator.
     $text = str_replace(array(",", '"', '.'), " ", $text);

     // Count whole space-separated tokens. The previous substring search for
     // "$word " also counted longer tokens that merely ended with $word.
     $counts = array_count_values(explode(' ', $text));
     $result = array();
     foreach ($counts as $word => $cnt) {
         // Numeric tokens come back as integer keys; compare them as text.
         $word = (string) $word;
         // Keep only words of at least 4 characters that occur at least 3 times.
         if (mb_strlen($word, "utf-8") < 4 || $cnt < 3) {
             continue;
         }
         $result[$word] = $cnt;
     }
     arsort($result);

     // Look for two-word phrases: every pair of adjacent words.
     $words = preg_split('#\\s+#', trim($text));
     $pair_words = array();
     foreach ($words as $i => $word) {
         if (isset($words[$i + 1])) {
             $pair_words[] = $word . ' ' . $words[$i + 1];
         }
     }
     $pair_words = array_count_values($pair_words);
     // A pair seen only once is not a repeated phrase.
     foreach ($pair_words as $key => $value) {
         if ($value == 1) {
             unset($pair_words[$key]);
         }
     }
     arsort($pair_words);

     // Array union rather than array_merge(): array_merge() renumbers integer
     // keys, which would replace a numeric word such as "2015" by 0, 1, ...
     // Pair keys always contain a space and word keys never do, so the two
     // key sets cannot collide.
     return $pair_words + $result;
 }
 /**
  * Stems every word of $this->input_user_query in place.
  *
  * The query is split into tokens matching \w+, each token is passed through
  * Stemmer::stem(), and the tokens are re-joined with single spaces. The
  * upper-case operators NOT, OR and AND are left untouched so that boolean
  * search keeps working.
  *
  * @return void The result is written back to $this->input_user_query.
  */
 public function stem_string()
 {
     // Stemmer is an open source stemming class.
     $stemmer = new Stemmer();
     preg_match_all('@\\w+\\b@', $this->input_user_query, $matches);
     // The pattern has no capture groups, so index 0 holds all the tokens.
     $tokens = isset($matches[0]) ? $matches[0] : array();

     $stemmed = array();
     foreach ($tokens as $token) {
         // Stemming would lower-case the boolean operators and break the search.
         if (in_array($token, array('NOT', 'OR', 'AND'), true)) {
             $stemmed[] = $token;
         } else {
             $stemmed[] = $stemmer->stem($token);
         }
     }

     // Separator first: the legacy implode($array, $separator) order is
     // rejected by PHP 8.
     $this->input_user_query = implode(' ', $stemmed);
 }
Exemple #5
0
 /**
  * Returns the shared Stemmer, creating it on the first call.
  *
  * @return Stemmer The instance stored in self::$_instance.
  */
 public static function getInstance()
 {
     if (null !== self::$_instance) {
         return self::$_instance;
     }
     self::$_instance = new Stemmer();
     return self::$_instance;
 }
/**
 * Prints an HTML suggestion list of open jobs matching the posted query.
 *
 * Reads $_POST['queryString'], splits it on spaces and looks for published
 * posts of type "job" whose "closed" meta value is '0' and whose title or
 * content contains any of the terms. Up to 10 rows are echoed, ordered by
 * match count; when nothing matches a "No results found" item is echoed.
 * Nothing is printed for an empty query.
 *
 * @return void Output is echoed directly.
 */
function PricerrTheme_autosuggest_it()
{
    // The stemmer and cleaner classes are still loaded as before. The original
    // code stemmed the query and then overwrote the result with the raw query,
    // so that discarded computation has been dropped. *_once avoids a fatal
    // redeclaration when the handler runs more than once per request.
    include_once 'classes/stem.php';
    include_once 'classes/cleaner.php';
    global $wpdb;

    // WordPress adds slashes to $_POST values; remove them so that each output
    // context below can apply its own escaping.
    $raw = '';
    if (isset($_POST['queryString']) && is_string($_POST['queryString'])) {
        $raw = wp_unslash($_POST['queryString']);
    }

    // The search runs on the HTML-encoded query, as it always did.
    $new_string = htmlspecialchars($raw);
    if (strlen($new_string) === 0) {
        return;
    }

    $posts = $wpdb->prefix . 'posts';
    $postmeta = $wpdb->prefix . 'postmeta';

    // One "(title LIKE ? OR content LIKE ?)" clause per non-empty term, with
    // the terms bound as parameters instead of being concatenated into the SQL.
    $clauses = array();
    $args = array();
    foreach (explode(' ', $new_string) as $term) {
        if ($term === '') {
            continue;
        }
        $like = '%' . $wpdb->esc_like($term) . '%';
        $clauses[] = "({$posts}.post_title LIKE %s OR {$posts}.post_content LIKE %s)";
        $args[] = $like;
        $args[] = $like;
    }

    // A query made only of spaces yields no clauses; treat it as "no results"
    // instead of sending malformed SQL.
    $rows = array();
    if (count($clauses) > 0) {
        $sql = "SELECT DISTINCT COUNT(*) as occurences, {$posts}.post_title, {$posts}.ID"
            . " FROM {$posts}, {$postmeta}"
            . " WHERE {$posts}.post_status='publish'"
            . " AND {$posts}.post_type='job'"
            . " AND {$posts}.ID = {$postmeta}.post_id"
            . " AND {$postmeta}.meta_key = 'closed'"
            . " AND {$postmeta}.meta_value = '0'"
            . " AND (" . implode(' OR ', $clauses) . ")"
            . " GROUP BY {$posts}.post_title ORDER BY occurences DESC LIMIT 10";
        $rows = $wpdb->get_results($wpdb->prepare($sql, $args), ARRAY_A);
    }

    if (!empty($rows)) {
        // A single list for all rows: the id must be unique in the document.
        echo '<ul id="sk_auto_suggest">';
        foreach ($rows as $row) {
            $prm = get_permalink($row['ID']);
            echo '<li onClick="window.location=\'' . $prm . '\';">' . PricerrTheme_wrap_the_title($row['post_title'], $row['ID']) . '</li>';
        }
        echo '</ul>';
    } else {
        echo '<ul>';
        // esc_js() makes the query safe inside the quoted inline-JS argument.
        echo '<li onClick="fill(\'' . esc_js($raw) . '\');">' . __('No results found', 'PricerrTheme') . '</li>';
        echo '</ul>';
    }
}
 /**
  * Reduces a text to a space-separated string of word stems.
  *
  * The text is split on anything that is not an ASCII letter, apostrophe,
  * double quote or hyphen. Words of fewer than three characters and words
  * found in self::$stopwords are dropped; the rest are lower-cased and stemmed.
  *
  * @param string $text Text to process.
  * @return string Stems joined by single spaces; '' when no word qualifies.
  */
 public static function stems($text)
 {
     // split sentence into words
     $words = preg_split('/[^a-zA-Z\'"-]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
     // stemmer plugin
     require_once BASEPATH . '/application/plugins/class.stemmer.inc';
     $stemmer = new Stemmer();
     $stems = array();
     foreach ($words as $word) {
         // needs at least three characters
         if (strlen($word) <= 2) {
             continue;
         }
         $lower = strtolower($word);
         // Check the word as written and lower-cased, so a capitalised word
         // ("The") is still recognised when the list holds its lower-case form.
         if (in_array($word, self::$stopwords) || in_array($lower, self::$stopwords)) {
             continue;
         }
         $stems[] = $stemmer->stem($lower);
     }
     // implode() always returns a string. Trimming a trailing space with
     // substr('', 0, -1) returned false before PHP 8 when nothing qualified.
     return implode(' ', $stems);
 }
Exemple #8
0
 /**
  * Compares Stemmer::stem() with the word/stem pairs listed in tests/data.txt.
  *
  * Each line of the data file holds a word and its expected stem, separated
  * by whitespace. The test is currently disabled; it is reported as skipped
  * rather than silently returning, so that the gap shows up in the test run.
  */
 public function testAgainstDictionary()
 {
     // Previously a bare "return;". markTestSkipped() aborts the test here as
     // well, so the comparison below stays disabled until this line is removed.
     $this->markTestSkipped('Dictionary comparison against tests/data.txt is disabled.');

     $data = file("tests/data.txt", FILE_IGNORE_NEW_LINES);
     $this->assertNotFalse($data, 'tests/data.txt could not be read');
     foreach ($data as $row) {
         // Column 0 is the word, column 1 the expected stem.
         $line = preg_split('#\\s+#', $row);
         $this->assertEquals($line[1], Stemmer::stem($line[0]));
     }
 }
<html>
<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
	<title>porter stemmer</title> 
</head>
<body>
	<form action="porter_stemmer_test.php" method="get">
		<input name="word" size="100">
		<button type="submit">GO</button>
	</form>
</body>
</html>
 
<?php 
// Stems the submitted word with both stemmer implementations and prints the
// two results for comparison.
include 'class.stemmer.inc.php';
include 'porter_stemmer.php';
// Only accept a plain string; "?word[]=x" would otherwise pass an array on.
$word = (isset($_GET['word']) && is_string($_GET['word'])) ? $_GET['word'] : '';
$stemmer = new Stemmer();
// The stems are derived from request input, so HTML-escape them before output.
echo "class.stemmer.inc.php:  " . htmlspecialchars((string) $stemmer->stem($word), ENT_QUOTES, 'UTF-8');
echo "<br>";
echo "porter_stemmer.php:   " . htmlspecialchars((string) PorterStemmer::Stem($word), ENT_QUOTES, 'UTF-8');
Exemple #10
0
 /**
  * Looks up the lemma (stem) of a value.
  *
  * Values of three characters or fewer (as measured by mb_strlen) are
  * returned unchanged; longer ones are passed to stem_word() on the shared
  * Stemmer instance.
  *
  * @static
  * @param $value
  * @return mixed
  */
 public static function stemm($value)
 {
     if (mb_strlen($value) <= 3) {
         return $value;
     }
     return Stemmer::getInstance()->stem_word($value);
 }
<?php

require '../modules/quizrooDB.php';
require '../modules/inc/class.stemmer.inc';
// check if query is blank
if (isset($_GET['searchQuery'])) {
    if ($_GET['searchQuery'] != "") {
        $currentPage = $_SERVER["PHP_SELF"];
        $searchQuery = $_GET['searchQuery'];
        // split it up into tokens
        // $tokenArray = explode(' ', $searchQuery);
        // prepare the Porter Stemmer
        $stemmer = new Stemmer();
        $tokenArray = $stemmer->stem_list($searchQuery);
        // search modifiers
        if (isset($_GET['searchType'])) {
            $searchType = $_GET['searchType'];
        } else {
            $searchType = 0;
        }
        if (isset($_GET['question_option'])) {
            $searchOption = $_GET['question_option'];
        } else {
            $searchOption = 0;
        }
        $maxRows_listQuiz = 10;
        $pageNum_listQuiz = 0;
        $maxPage_listQuiz = 10;
        // Maximum paging value
        if (isset($_GET['pageNum_listQuiz'])) {
            $pageNum_listQuiz = $_GET['pageNum_listQuiz'];
Exemple #12
0
<?php

global $wpdb;
include 'classes/stem.php';
include 'classes/cleaner.php';
$string = $_POST['queryString'];
$stemmer = new Stemmer();
$stemmed_string = $stemmer->stem($string);
$clean_string = new jSearchString();
$stemmed_string = $clean_string->parseString($stemmed_string);
$new_string = '';
foreach (array_unique(split(" ", $stemmed_string)) as $array => $value) {
    if (strlen($value) >= 1) {
        $new_string .= '' . $value . ' ';
    }
}
$new_string = substr($new_string, 0, strLen($new_string) - 1);
$new_string = htmlspecialchars($_POST['queryString']);
if (strlen($new_string) > 0) {
    $split_stemmed = split(" ", $new_string);
    $sql = "SELECT DISTINCT COUNT(*) as occurences, " . $wpdb->prefix . "posts.post_title FROM " . $wpdb->prefix . "posts,\n\t\t\t" . $wpdb->prefix . "postmeta WHERE " . $wpdb->prefix . "posts.post_status='publish' and \n\t\t\t" . $wpdb->prefix . "posts.post_type='project' \n\t\t\t\n\t\t\t\t\tAND " . $wpdb->prefix . "posts.ID = " . $wpdb->prefix . "postmeta.post_id \n\t\t\t\t\tAND " . $wpdb->prefix . "postmeta.meta_key = 'closed' \n\t\t\t\t\tAND " . $wpdb->prefix . "postmeta.meta_value = '0' \n\t\t\t\n\t\t\tAND (";
    while (list($key, $val) = each($split_stemmed)) {
        if ($val != '' && strlen($val) > 0) {
            $sql .= "(" . $wpdb->prefix . "posts.post_title LIKE '%" . $val . "%' OR " . $wpdb->prefix . "posts.post_content LIKE '%" . $val . "%') OR";
        }
    }
    $sql = substr($sql, 0, strLen($sql) - 3);
    //this will eat the last OR
    $sql .= ") GROUP BY " . $wpdb->prefix . "posts.post_title ORDER BY occurences DESC LIMIT 10";
    $query = mysql_query($sql) or die(mysql_error());
    //$row_sql = mysql_fetch_assoc ( $query );
Exemple #13
0
 /**
  * Initialises the last stamp and the category, site and keyword filters.
  *
  * Filters are read from $this->attrs. 'filter_cats' and 'filter_sites' are
  * comma-separated lists; 'filter_keys' is passed through
  * Stemmer::ExtractWords() and Stemmer::Stem(). A filter that is present but
  * ends up with no entries is set to false.
  *
  * @return void
  */
 public function init()
 {
     // getting last stamp from application instance
     $this->laststamp = $this->appInstance->getLastStamp();

     // TODO refactor
     $listFilters = array('filter_cats' => 'fcats', 'filter_sites' => 'fsites');
     foreach ($listFilters as $attr => $property) {
         if (!isset($this->attrs[$attr])) {
             continue;
         }
         // explode() never returns an empty array ('' gives array('')), so the
         // old "0 === sizeof()" check could not fire. Drop empty entries first.
         $items = array();
         foreach (explode(',', $this->attrs[$attr]) as $item) {
             if ($item !== '') {
                 $items[] = $item;
             }
         }
         $this->{$property} = count($items) > 0 ? $items : false;
     }

     if (isset($this->attrs['filter_keys'])) {
         $this->fkeys = Stemmer::Stem(Stemmer::ExtractWords($this->attrs['filter_keys']));
         if (0 === count($this->fkeys)) {
             $this->fkeys = false;
         }
     }
 }
Exemple #14
0
 /**
  * Stores the description and derives the stem list and short description.
  *
  * The text is passed through Beautifier::HTMLPrepare() and stored. The stem
  * list is built from that prepared text plus the title. When the description
  * contains at least 30 whitespace-separated words, the first 30 followed by
  * '...' become the short description; otherwise it is left as it was.
  *
  * @param string $text Raw description text.
  * @return $this
  */
 public function setDescription($text = '')
 {
     $prepared = Beautifier::HTMLPrepare($text);
     $this->description = $prepared;

     // Stems come from the description and the title together.
     $stemSource = $prepared . ' ' . $this->getTitle();
     $this->stem = Stemmer::Stem(Stemmer::ExtractWords($stemSource));

     // Capture the first 30 words along with the whitespace that follows them.
     $hasThirtyWords = preg_match('/([^ \\n\\r]+[ \\n\\r]+){30}/s', $this->description, $match);
     if ($hasThirtyWords) {
         $excerpt = str_replace("\n\n", "\n", $match[0]);
         $this->description_short = trim($excerpt) . '...';
     }

     return $this;
 }
Exemple #15
0
 /**
  * Counts how often each stem occurs in a list of words.
  *
  * Words containing a word character are lower-cased, then stemmed with
  * Stemmer::Stem(). A word is counted when it contains a non-word character,
  * or when it is longer than two characters and not in
  * self::$CORPUS_SKIP_WORDS.
  *
  * @param array $words Words to count.
  * @return array Map of stem => number of occurrences.
  */
 private function _wordArrayForWords($words)
 {
     $counts = array();
     foreach ($words as $token) {
         if (preg_match('/[\\w]+/', $token)) {
             $token = strtolower($token);
         }
         $stem = Stemmer::Stem($token);

         // Tokens made purely of word characters must pass the skip-list and
         // length filters; tokens with any other character are always counted.
         if (!preg_match('/[^\\w]/', $token)) {
             if (in_array($token, self::$CORPUS_SKIP_WORDS) || strlen($token) <= 2) {
                 continue;
             }
         }

         $counts[$stem] = isset($counts[$stem]) ? $counts[$stem] + 1 : 1;
     }
     return $counts;
 }
Exemple #16
0
 /**
  * The main method to return jobs array.
  *
  * Selects up to 25 jobs, newest stamp first. A negative $stamp selects jobs
  * with a stamp below -$stamp and returns them in reversed order; otherwise
  * jobs with a stamp above $stamp are selected. Optional POST filters:
  *  - filter_sites: comma-separated integers, kept when within 0..20
  *  - filter_cats:  comma-separated integers, kept when within 0..30
  *  - filter_keys:  free text, turned into stems via Stemmer
  * When only the stamp filter applies, results are read from and written to
  * the cache under the key 'j' . $stamp with a lifetime argument of 30.
  *
  * @param int $stamp Reference stamp; the sign selects the direction.
  * @return array|false Prepared jobs, or false when none match.
  */
 private function getJobs($stamp)
 {
     $c = Database::jobs();
     $jobs = array();
     if ($stamp < 0) {
         $mod = -1;
         $st = array('$lt' => -$stamp);
     } else {
         $mod = 1;
         $st = array('$gt' => $stamp);
     }
     $filter = array('stamp' => $st);

     // The lists are rebuilt instead of edited through a foreach reference.
     // The old "&$val" stayed bound to the last list element, so the later
     // "$val = Stemmer::..." assignment overwrote an element of the $in list.
     // Appending also keeps the keys sequential, so the operand stays a list
     // (unset() left gaps in the keys).
     if (isset($_POST['filter_sites'])) {
         $sites = array();
         foreach (explode(',', $_POST['filter_sites']) as $rawSite) {
             $site = intval($rawSite);
             if ($site >= 0 && $site <= 20) {
                 $sites[] = $site;
             }
         }
         if (count($sites)) {
             $filter['site'] = array('$in' => $sites);
         }
     }
     if (isset($_POST['filter_cats'])) {
         $cats = array();
         foreach (explode(',', $_POST['filter_cats']) as $rawCat) {
             $cat = intval($rawCat);
             if ($cat >= 0 && $cat <= 30) {
                 $cats[] = $cat;
             }
         }
         if (count($cats)) {
             $filter['cats'] = array('$in' => $cats);
         }
     }
     if (isset($_POST['filter_keys'])) {
         $stems = Stemmer::Stem(Stemmer::ExtractWords($_POST['filter_keys']));
         if (count($stems)) {
             $filter['stem'] = array('$in' => $stems);
         }
     }

     // With only the stamp condition the result is cacheable.
     $stampOnly = (1 == count($filter));
     if ($stampOnly) {
         $res = Cache::get('j' . $stamp);
         if ($res) {
             return $res;
         }
     }

     $cursor = $c->find($filter, array('site', 'id', 'stamp', 'title', 'cats', 'short', 'desc', 'money'));
     $cursor->sort(array('stamp' => -1));
     $cursor->limit(25);
     while ($job = $cursor->getNext()) {
         $jobs[] = Job::prepareJSON($job, $mod);
     }
     if (0 == count($jobs)) {
         return false;
     }
     if ($stamp < 0) {
         $jobs = array_reverse($jobs);
     }
     if ($stampOnly) {
         Cache::set('j' . $stamp, $jobs, 30);
     }
     return $jobs;
 }
 /**
  * Detects "short" words.
  *
  * A word is reported as short when it ends with one of the syllables
  * returned by Stemmer::getShortSyllables() and Stemmer::getR1() yields an
  * empty (falsy) value for it.
  *
  * @param string $word
  * @return bool True when the word is short, false otherwise.
  * @author John Anderson
  * @see http://snowball.tartarus.org/algorithms/english/stemmer.html
  */
 public static function isShort($word)
 {
     // The word has to end on a short syllable...
     $syllables = Stemmer::getShortSyllables($word);
     if (count($syllables) === 0) {
         return false;
     }

     $endsOnShortSyllable = false;
     foreach ($syllables as $syllable) {
         if (substr($word, -strlen($syllable)) == $syllable) {
             $endsOnShortSyllable = true;
             break;
         }
     }

     // ...and its R1 region has to be empty as well.
     return $endsOnShortSyllable ? !Stemmer::getR1($word) : false;
 }
 /**
  * Looks up the stored correlation between a term and a document.
  *
  * The term is stemmed with Stemmer::stem() and the document is cast to a
  * string; both are then resolved through the term and document indexes.
  * NOTE(review): the guard tests l_termCorrelation and l_documentCorrelation
  * while the value is read from l_termDocumentCorrelation -- confirm that
  * this is intended.
  *
  * @param mixed $_term     Term to look up (stemmed before the lookup).
  * @param mixed $_document Document identifier (cast to string).
  * @return mixed|null The stored value, or null when the correlations are
  *                    not set or either index lacks an entry.
  */
 public function termDocumentCorrelation($_term, $_document)
 {
     if ($this->l_termCorrelation === null || $this->l_documentCorrelation === null) {
         return null;
     }

     $term = Stemmer::stem($_term);
     if (!isset($this->l_termIndex[$term])) {
         return null;
     }

     $document = (string) $_document;
     if (!isset($this->l_documentIndex[$document])) {
         return null;
     }

     $termRow = $this->l_termIndex[$term];
     $documentColumn = $this->l_documentIndex[$document];
     return $this->l_termDocumentCorrelation[$termRow][$documentColumn];
 }