/**
 * Asserts that the stemmer maps each word to the stem at the same position.
 *
 * $words and $stems are walked in lockstep: $words drives the loop and
 * $stems is advanced manually once per word.
 *
 * @param Stemmer   $stemmer stemmer under test
 * @param \Iterator $words   input words
 * @param \Iterator $stems   expected stems, in the same order as $words
 * @return void
 */
protected function checkStemmer(Stemmer $stemmer, \Iterator $words, \Iterator $stems)
{
    foreach ($words as $word) {
        // An exhausted iterator has no meaningful current() value, so fail
        // explicitly instead of comparing against whatever it returns.
        $this->assertTrue($stems->valid(), "No expected stem was supplied for '{$word}'");

        $stem = $stems->current();
        // Stem once and reuse the value for both the assertion and the message.
        $actual = $stemmer->stem($word);

        // PHPUnit's signature is assertEquals($expected, $actual, $message).
        $this->assertEquals($stem, $actual, "The stem for '{$word}' should be '{$stem}' not '{$actual}'");

        $stems->next();
    }
}
/**
 * Checks Stemmer::getWordBase() against a fixed table of words and their
 * expected bases.
 *
 * @return void
 */
public function testStemming()
{
    // word => expected base
    $expectedBases = [
        'результаты' => 'результат',
        'в' => 'в',
        'вавиловка' => 'вавиловк',
        'вагнера' => 'вагнер',
        'вагон' => 'вагон',
        'вагона' => 'вагон',
        'вагоне' => 'вагон',
        'вагонов' => 'вагон',
        'вагоном' => 'вагон',
        'вагоны' => 'вагон',
        'важная' => 'важн',
        'важнее' => 'важн',
        'важнейшие' => 'важн',
        'важнейшими' => 'важн',
        'важничал' => 'важнича',
        'важно' => 'важн',
        'важного' => 'важн',
        'важное' => 'важн',
        'важной' => 'важн',
        'важном' => 'важн',
        'важному' => 'важн',
        'важности' => 'важност',
        'важностию' => 'важност',
        'важность' => 'важност',
        'важностью' => 'важност',
        'важную' => 'важн',
        'важны' => 'важн',
        'важные' => 'важн',
        'важный' => 'важн',
        'важным' => 'важн',
        'важных' => 'важн',
        'вазах' => 'ваз',
        'вазы' => 'ваз',
        'вакса' => 'вакс',
        'вакханка' => 'вакханк',
        'вал' => 'вал',
        'валандался' => 'валанда',
        'валентина' => 'валентин',
        'валериановых' => 'валерианов',
        'валерию' => 'валер',
        'валетами' => 'валет',
        'вали' => 'вал',
        'валил' => 'вал',
        'валился' => 'вал',
        'валится' => 'вал',
        'валов' => 'вал',
        'вальдшнепа' => 'вальдшнеп',
        'вальс' => 'вальс',
        'вальса' => 'вальс',
        'вальсе' => 'вальс',
        'вальсишку' => 'вальсишк',
        'вальтера' => 'вальтер',
        'валяется' => 'валя',
        'валялась' => 'валя',
        'валялись' => 'валя',
        'валялось' => 'валя',
        'валялся' => 'валя',
        'валять' => 'валя',
        'валяются' => 'валя',
        'вам' => 'вам',
        'вами' => 'вам',
    ];

    $stemmer = new Stemmer();

    foreach ($expectedBases as $input => $expected) {
        $actual = $stemmer->getWordBase($input);
        $this->assertEquals($expected, $actual);
    }
}
/**
 * Builds a frequency table of repeated words and repeated adjacent word
 * pairs found in a text.
 *
 * The text is passed through Stemmer::stem_words(), stripped of tags and
 * backslashes, lower-cased, and has commas, double quotes and periods
 * replaced by spaces before counting.
 *
 * @param string $text raw text
 * @return array phrase => occurrence count; word pairs seen more than once
 *               come first (descending), followed by words of at least four
 *               characters counted three or more times (descending)
 */
public function getAnalysis($text)
{
    $an = new Stemmer();
    $text = $an->stem_words($text);
    $text = strip_tags($text); // remove HTML and PHP tags
    $text = stripslashes($text); // remove backslashes
    $text = mb_strtolower(strip_tags($text), "utf-8");

    // NOTE(review): both patterns are identical as written and match the single
    // character that follows the backslash; this looks like two different
    // entities were lost in an earlier edit — confirm the intended patterns.
    $patterns = array('/\\ /s', '/\\ /s');
    // An empty string replaces the former null, which is deprecated as a
    // replacement argument since PHP 8.1 and behaves identically.
    $text = preg_replace($patterns, '', $text);
    $text = str_replace(array(",", '"', '.'), " ", $text);

    // Keep only unique tokens that are at least four characters long.
    $words = array_unique(explode(' ', $text));
    foreach ($words as $key => $value) {
        if (mb_strlen($value, "utf-8") < 4) {
            unset($words[$key]);
        }
    }

    // Trailing space so the last word can be matched as "word ".
    $text .= ' ';
    $result = array();
    foreach ($words as $word) {
        // The appended space keeps longer words that merely start with $word
        // from being counted; words seen fewer than three times are skipped.
        if (($cnt = substr_count($text, $word . ' ')) < 3) {
            continue;
        }
        $result["{$word}"] = $cnt;
    }
    arsort($result);

    // Look for adjacent word pairs.
    $words = preg_split('#\\s+#', trim($text));
    $pair_words = array();
    foreach ($words as $i => $word) {
        if (isset($words[$i + 1])) {
            $pair_words[] = $word . ' ' . $words[$i + 1];
        }
    }
    $pair_words = array_count_values($pair_words);
    foreach ($pair_words as $key => $value) {
        if ($value == 1) {
            unset($pair_words[$key]);
        }
    }
    arsort($pair_words);

    // Array union instead of array_merge(): array_merge() renumbers
    // integer-like keys, so a word such as "2020" would lose its key. Pair keys
    // always contain a space and word keys never do, so the two sets cannot
    // collide and the pairs-then-words order is unchanged.
    return $pair_words + $result;
}
/**
 * Replaces $this->input_user_query with a space-separated list of its
 * stemmed words.
 *
 * Words are extracted with the pattern \w+\b. The boolean operators NOT, OR
 * and AND are passed through unchanged so they are not stemmed.
 *
 * @return void
 */
public function stem_string()
{
    $stemming = new Stemmer();

    preg_match_all('@\\w+\\b@', $this->input_user_query, $output);
    // The pattern has no capture groups, so index 0 holds every match.
    $tokens = isset($output[0]) ? $output[0] : array();

    $simple_array_stemmed = array();
    foreach ($tokens as $token) {
        if ($token != 'NOT' && $token != 'OR' && $token != 'AND') {
            $simple_array_stemmed[] = $stemming->stem($token);
        } else {
            $simple_array_stemmed[] = $token;
        }
    }

    // Separator first: the legacy implode($array, $separator) order was
    // removed in PHP 8.0.
    $this->input_user_query = implode(' ', $simple_array_stemmed);
}
/**
 * Returns the shared Stemmer instance, creating it on first use.
 *
 * @return Stemmer
 */
public static function getInstance()
{
    if (self::$_instance !== null) {
        return self::$_instance;
    }

    self::$_instance = new Stemmer();

    return self::$_instance;
}
/**
 * Echoes an HTML suggestion list for the search text posted as "queryString".
 *
 * Looks up at most ten published posts of type "job" whose "closed" meta value
 * is '0' and whose title or content contains any of the space-separated
 * terms. When nothing matches, a single "No results found" entry is echoed.
 * Nothing is echoed for an empty query.
 *
 * @return void
 */
function PricerrTheme_autosuggest_it()
{
    // The previous version ran the query through Stemmer and jSearchString but
    // overwrote the result before using it, so that dead computation was
    // dropped. The includes stay in case other code relies on these classes
    // being loaded here — TODO confirm.
    include 'classes/stem.php';
    include 'classes/cleaner.php';

    global $wpdb;

    // NOTE(review): assumes WordPress has slash-escaped $_POST (its usual
    // behaviour), hence wp_unslash() before parameter binding — confirm for
    // the hook that calls this function.
    $query = '';
    if (isset($_POST['queryString']) && is_string($_POST['queryString'])) {
        $query = wp_unslash($_POST['queryString']);
    }

    // Search terms are still derived from the HTML-escaped text, as before.
    $new_string = htmlspecialchars($query);
    if (strlen($new_string) === 0) {
        return;
    }

    $posts = $wpdb->prefix . 'posts';
    $postmeta = $wpdb->prefix . 'postmeta';

    // One "(title LIKE ? OR content LIKE ?)" clause per non-empty term.
    // explode()/foreach replace split()/each(), which no longer exist in PHP.
    $clauses = array();
    $args = array();
    foreach (explode(' ', $new_string) as $term) {
        if ($term === '') {
            continue;
        }
        $like = '%' . $wpdb->esc_like($term) . '%';
        $clauses[] = "({$posts}.post_title LIKE %s OR {$posts}.post_content LIKE %s)";
        $args[] = $like;
        $args[] = $like;
    }

    $r = array();
    // A whitespace-only query has no terms; skip the query rather than send
    // SQL with an empty "AND ()" group.
    if (count($clauses) > 0) {
        $sql = "SELECT DISTINCT COUNT(*) as occurences, {$posts}.post_title, {$posts}.ID"
            . " FROM {$posts}, {$postmeta}"
            . " WHERE {$posts}.post_status='publish' and {$posts}.post_type='job'"
            . " AND {$posts}.ID = {$postmeta}.post_id"
            . " AND {$postmeta}.meta_key = 'closed'"
            . " AND {$postmeta}.meta_value = '0'"
            . " AND (" . implode(' OR ', $clauses) . ")"
            . " GROUP BY {$posts}.post_title ORDER BY occurences DESC LIMIT 10";

        // User input is bound through prepare() instead of being concatenated.
        $r = $wpdb->get_results($wpdb->prepare($sql, $args), ARRAY_A);
    }

    if (is_array($r) && count($r) > 0) {
        foreach ($r as $row) {
            // Markup kept as before: one <ul> per row.
            echo '<ul id="sk_auto_suggest">';
            $prm = get_permalink($row['ID']);
            echo '<li onClick="window.location=\'' . $prm . '\';">' . PricerrTheme_wrap_the_title($row['post_title'], $row['ID']) . '</li>';
            echo '</ul>';
        }
    } else {
        echo '<ul>';
        // esc_js() makes the text safe inside the quoted inline-JS argument.
        echo '<li onClick="fill(\'' . esc_js($query) . '\');">' . __('No results found', 'PricerrTheme') . '</li>';
        echo '</ul>';
    }
}
/**
 * Returns the stems of the significant words in a text, separated by spaces.
 *
 * The text is split on every run of characters other than ASCII letters,
 * apostrophes, double quotes and hyphens. Words of fewer than three bytes and
 * words listed in self::$stopwords are dropped; the rest are lower-cased and
 * stemmed.
 *
 * @param string $text input text
 * @return string space-separated stems; an empty string when no word qualifies
 */
public static function stems($text)
{
    // Split the sentence into words.
    $words = preg_split('/[^a-zA-Z\'"-]+/', $text, -1, PREG_SPLIT_NO_EMPTY);

    // Stemmer plugin.
    require_once BASEPATH . '/application/plugins/class.stemmer.inc';
    $stemmer = new Stemmer();

    $stems = array();
    foreach ($words as $word) {
        // NOTE(review): the stop-word check runs before lower-casing, so a
        // capitalised stop word passes unless the list holds that spelling —
        // confirm this is intended.
        if (strlen($word) > 2 && !in_array($word, self::$stopwords)) {
            $stems[] = $stemmer->stem(strtolower($word));
        }
    }

    // implode() yields the same text as appending "stem " and trimming the last
    // space, but returns '' for an empty list, where substr('', 0, -1) returned
    // false before PHP 8.
    return implode(' ', $stems);
}
/**
 * Compares Stemmer::stem() against the word/stem pairs in tests/data.txt.
 *
 * Each line of the file holds a word and its expected stem separated by
 * whitespace. The check is currently disabled and reported as skipped.
 *
 * @return void
 */
public function testAgainstDictionary()
{
    // This test used to be disabled with a bare `return;`, which made it pass
    // with zero assertions. Skipping keeps it visible in the test report.
    $this->markTestSkipped('Dictionary comparison against tests/data.txt is disabled.');

    // Unreachable until the skip above is removed; kept so the check can be
    // re-enabled without rewriting it.
    $data = file("tests/data.txt", FILE_IGNORE_NEW_LINES);
    $this->assertTrue(is_array($data), 'tests/data.txt could not be read');

    foreach ($data as $row) {
        $line = preg_split('#\\s+#', $row);
        $this->assertEquals($line[1], Stemmer::stem($line[0]));
    }
}
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>porter stemmer</title>
</head>
<body>
<form action="porter_stemmer_test.php" method="get">
<input name="word" size="100">
<button type="submit">GO</button>
</form>
</body>
</html>
<?php
// Stems the "word" query parameter with both stemmer implementations and
// prints the two results for comparison.
include 'class.stemmer.inc.php';
include 'porter_stemmer.php';

// Accept plain strings only; "word[]=x" would otherwise pass an array on.
$word = isset($_GET['word']) && is_string($_GET['word']) ? $_GET['word'] : '';

$stemmer = new Stemmer();

// The stems come from user input, so escape them before echoing into the page.
echo "class.stemmer.inc.php: " . htmlspecialchars((string) $stemmer->stem($word), ENT_QUOTES, 'UTF-8');
echo "<br>";
echo "porter_stemmer.php: " . htmlspecialchars((string) PorterStemmer::Stem($word), ENT_QUOTES, 'UTF-8');
/**
 * Lemma lookup.
 *
 * Values of three characters or fewer (per mb_strlen) are returned unchanged;
 * longer values are passed to stem_word() on the shared Stemmer instance.
 *
 * @static
 * @param $value
 * @return mixed
 */
public static function stemm($value)
{
    if (mb_strlen($value) <= 3) {
        return $value;
    }

    return Stemmer::getInstance()->stem_word($value);
}
<?php require '../modules/quizrooDB.php'; require '../modules/inc/class.stemmer.inc'; // check if query is blank if (isset($_GET['searchQuery'])) { if ($_GET['searchQuery'] != "") { $currentPage = $_SERVER["PHP_SELF"]; $searchQuery = $_GET['searchQuery']; // split it up into tokens // $tokenArray = explode(' ', $searchQuery); // prepare the Porter Stemmer $stemmer = new Stemmer(); $tokenArray = $stemmer->stem_list($searchQuery); // search modifiers if (isset($_GET['searchType'])) { $searchType = $_GET['searchType']; } else { $searchType = 0; } if (isset($_GET['question_option'])) { $searchOption = $_GET['question_option']; } else { $searchOption = 0; } $maxRows_listQuiz = 10; $pageNum_listQuiz = 0; $maxPage_listQuiz = 10; // Maximum paging value if (isset($_GET['pageNum_listQuiz'])) { $pageNum_listQuiz = $_GET['pageNum_listQuiz'];
<?php global $wpdb; include 'classes/stem.php'; include 'classes/cleaner.php'; $string = $_POST['queryString']; $stemmer = new Stemmer(); $stemmed_string = $stemmer->stem($string); $clean_string = new jSearchString(); $stemmed_string = $clean_string->parseString($stemmed_string); $new_string = ''; foreach (array_unique(split(" ", $stemmed_string)) as $array => $value) { if (strlen($value) >= 1) { $new_string .= '' . $value . ' '; } } $new_string = substr($new_string, 0, strLen($new_string) - 1); $new_string = htmlspecialchars($_POST['queryString']); if (strlen($new_string) > 0) { $split_stemmed = split(" ", $new_string); $sql = "SELECT DISTINCT COUNT(*) as occurences, " . $wpdb->prefix . "posts.post_title FROM " . $wpdb->prefix . "posts,\n\t\t\t" . $wpdb->prefix . "postmeta WHERE " . $wpdb->prefix . "posts.post_status='publish' and \n\t\t\t" . $wpdb->prefix . "posts.post_type='project' \n\t\t\t\n\t\t\t\t\tAND " . $wpdb->prefix . "posts.ID = " . $wpdb->prefix . "postmeta.post_id \n\t\t\t\t\tAND " . $wpdb->prefix . "postmeta.meta_key = 'closed' \n\t\t\t\t\tAND " . $wpdb->prefix . "postmeta.meta_value = '0' \n\t\t\t\n\t\t\tAND ("; while (list($key, $val) = each($split_stemmed)) { if ($val != '' && strlen($val) > 0) { $sql .= "(" . $wpdb->prefix . "posts.post_title LIKE '%" . $val . "%' OR " . $wpdb->prefix . "posts.post_content LIKE '%" . $val . "%') OR"; } } $sql = substr($sql, 0, strLen($sql) - 3); //this will eat the last OR $sql .= ") GROUP BY " . $wpdb->prefix . "posts.post_title ORDER BY occurences DESC LIMIT 10"; $query = mysql_query($sql) or die(mysql_error()); //$row_sql = mysql_fetch_assoc ( $query );
/**
 * Loads the last stamp from the application instance and parses the optional
 * filter attributes into $fcats, $fsites and $fkeys.
 *
 * Each filter property is set only when its attribute is present, and becomes
 * false when the attribute yields no usable entries.
 *
 * @return void
 */
public function init()
{
    // Last stamp comes from the application instance.
    $this->laststamp = $this->appInstance->getLastStamp();

    if (isset($this->attrs['filter_cats'])) {
        $this->fcats = $this->splitFilterList($this->attrs['filter_cats']);
    }

    if (isset($this->attrs['filter_sites'])) {
        $this->fsites = $this->splitFilterList($this->attrs['filter_sites']);
    }

    if (isset($this->attrs['filter_keys'])) {
        $this->fkeys = Stemmer::Stem(Stemmer::ExtractWords($this->attrs['filter_keys']));
        if (0 === sizeof($this->fkeys)) {
            $this->fkeys = false;
        }
    }
}

/**
 * Splits a comma-separated attribute into its non-empty entries.
 *
 * explode() always returns at least one element ('' for an empty string), so
 * the former "0 === sizeof(...)" check could never fire; empty entries are
 * removed here so an empty attribute really yields false.
 *
 * @param string $csv comma-separated list
 * @return array|false list of entries, or false when none remain
 */
private function splitFilterList($csv)
{
    $items = array_values(array_filter(explode(',', $csv), 'strlen'));

    return 0 === sizeof($items) ? false : $items;
}
/**
 * Stores the prepared description, rebuilds the stem list from description
 * plus title, and derives a short description when the text is long enough.
 *
 * The short description is the first 30 whitespace-terminated words of the
 * description with doubled newlines collapsed, followed by '...'. It is left
 * untouched when the description has fewer than 30 such words.
 *
 * @param string $text raw description
 * @return $this
 */
public function setDescription($text = '')
{
    $prepared = Beautifier::HTMLPrepare($text);
    $this->description = $prepared;

    // Stems are built from the description followed by the title.
    $stemSource = $prepared . ' ' . $this->getTitle();
    $this->stem = Stemmer::Stem(Stemmer::ExtractWords($stemSource));

    $hasThirtyWords = preg_match('/([^ \\n\\r]+[ \\n\\r]+){30}/s', $this->description, $match);
    if ($hasThirtyWords) {
        $excerpt = str_replace("\n\n", "\n", $match[0]);
        $this->description_short = trim($excerpt) . '...';
    }

    return $this;
}
/**
 * Counts occurrences of each stem in a list of words.
 *
 * Words containing at least one word character are lower-cased first. A word
 * is counted when it contains a non-word character, or when it is longer than
 * two bytes and not listed in self::$CORPUS_SKIP_WORDS.
 *
 * @param array $words list of words
 * @return array stem => number of counted occurrences
 */
private function _wordArrayForWords($words)
{
    $counts = array();

    foreach ($words as $word) {
        if (preg_match('/[\\w]+/', $word)) {
            $word = strtolower($word);
        }

        $stem = Stemmer::Stem($word);

        // Skip plain words that are either on the skip list or too short.
        $hasNonWordChar = preg_match('/[^\\w]/', $word);
        if (!$hasNonWordChar && (in_array($word, self::$CORPUS_SKIP_WORDS) || strlen($word) <= 2)) {
            continue;
        }

        $counts[$stem] = isset($counts[$stem]) ? $counts[$stem] + 1 : 1;
    }

    return $counts;
}
/**
 * The main method to return the jobs array.
 *
 * A negative $stamp selects jobs with a stamp below -$stamp and returns them
 * in reversed order; otherwise jobs with a stamp above $stamp are selected.
 * Optional POST fields filter_sites, filter_cats and filter_keys narrow the
 * query. Results for unfiltered requests are cached under 'j' . $stamp.
 *
 * @param int $stamp reference stamp; the sign selects the direction
 * @return array|false up to 25 prepared jobs, or false when none match
 */
private function getJobs($stamp)
{
    $c = Database::jobs();
    $jobs = array();

    if ($stamp < 0) {
        $mod = -1;
        $st = array('$lt' => -$stamp);
    } else {
        $mod = 1;
        $st = array('$gt' => $stamp);
    }
    $filter = array('stamp' => $st);

    if (isset($_POST['filter_sites'])) {
        $sites = self::parseIdList($_POST['filter_sites'], 20);
        if (count($sites)) {
            $filter['site'] = array('$in' => $sites);
        }
    }

    if (isset($_POST['filter_cats'])) {
        $cats = self::parseIdList($_POST['filter_cats'], 30);
        if (count($cats)) {
            $filter['cats'] = array('$in' => $cats);
        }
    }

    if (isset($_POST['filter_keys'])) {
        // Uses its own variable: the old code reused $val, which was still a
        // reference into the last site/category id and overwrote it in $filter.
        $stems = Stemmer::Stem(Stemmer::ExtractWords($_POST['filter_keys']));
        if (count($stems)) {
            $filter['stem'] = array('$in' => $stems);
        }
    }

    // Only unfiltered requests (stamp condition alone) go through the cache.
    $stampOnly = (1 == count($filter));
    if ($stampOnly) {
        $res = Cache::get('j' . $stamp);
        if ($res) {
            return $res;
        }
    }

    $cursor = $c->find($filter, array('site', 'id', 'stamp', 'title', 'cats', 'short', 'desc', 'money'));
    $cursor->sort(array('stamp' => -1));
    $cursor->limit(25);

    while ($job = $cursor->getNext()) {
        $jobs[] = Job::prepareJSON($job, $mod);
    }

    if (0 == count($jobs)) {
        return false;
    }

    if ($stamp < 0) {
        $jobs = array_reverse($jobs);
    }

    if ($stampOnly) {
        Cache::set('j' . $stamp, $jobs, 30);
    }

    return $jobs;
}

/**
 * Parses a comma-separated id list, keeping integers within 0..$max.
 *
 * Builds a fresh, sequentially indexed array. The previous by-reference loop
 * with unset() left gaps in the keys and a dangling reference to the last
 * element.
 *
 * @param string $csv comma-separated ids
 * @param int    $max highest accepted id
 * @return array list of accepted integer ids
 */
private static function parseIdList($csv, $max)
{
    $ids = array();
    foreach (explode(',', $csv) as $raw) {
        $id = intval($raw);
        if ($id >= 0 && $id <= $max) {
            $ids[] = $id;
        }
    }

    return $ids;
}
/**
 * Detects "short" words.
 *
 * A word is reported as short when it ends with one of the suffixes returned
 * by Stemmer::getShortSyllables() and Stemmer::getR1() is empty for it.
 *
 * @param string $word
 * @return bool
 * @author John Anderson
 * @see http://snowball.tartarus.org/algorithms/english/stemmer.html
 */
public static function isShort($word)
{
    $suffixes = Stemmer::getShortSyllables($word);
    if (!count($suffixes)) {
        return false;
    }

    // The word has to end on a short syllable.
    $endsOnShortSyllable = false;
    foreach ($suffixes as $suffix) {
        $tail = substr($word, -strlen($suffix));
        if ($tail == $suffix) {
            $endsOnShortSyllable = true;
            break;
        }
    }

    if (!$endsOnShortSyllable) {
        return false;
    }

    // R1 has to be empty as well.
    return !Stemmer::getR1($word);
}
/**
 * Looks up the stored correlation between a term and a document.
 *
 * The term is stemmed and the document is cast to string before both are
 * resolved through the term and document indexes.
 *
 * @param string $_term     term to look up
 * @param mixed  $_document document identifier
 * @return mixed the stored correlation, or null when the correlations are not
 *               available or the term or document is not indexed
 */
public function termDocumentCorrelation($_term, $_document)
{
    if ($this->l_termCorrelation === null || $this->l_documentCorrelation === null) {
        return null;
    }

    $term = Stemmer::stem($_term);
    if (!isset($this->l_termIndex[$term])) {
        return null;
    }

    $document = (string) $_document;
    if (!isset($this->l_documentIndex[$document])) {
        return null;
    }

    $row = $this->l_termIndex[$term];
    $column = $this->l_documentIndex[$document];

    return $this->l_termDocumentCorrelation[$row][$column];
}