/**
 * Splits a string into words and returns the stem of each accepted word.
 *
 * Tokens are separated by runs of whitespace and/or commas. A token is skipped
 * when it is empty, when it is 50 bytes or longer, or when it appears in the
 * common-word list. Every remaining token is lower-cased and passed to
 * stemm_es::stemm().
 *
 * @param string $string Text to tokenise.
 * @return array Stems in input order; duplicates are kept.
 */
public function words($string)
{
    // The previous code read an undefined local $COMMON_WORDS, so the filter
    // never worked (and in_array() throws a TypeError on PHP 8).
    // NOTE(review): assumes the list lives in a global array named
    // $COMMON_WORDS -- confirm where it is really defined. When it is absent
    // no word is treated as common.
    $commonWords = (isset($GLOBALS['COMMON_WORDS']) && is_array($GLOBALS['COMMON_WORDS']))
        ? $GLOBALS['COMMON_WORDS']
        : array();

    // PREG_SPLIT_NO_EMPTY drops the empty tokens produced by leading or
    // trailing separators, which were previously stemmed and returned.
    $tokens = preg_split("/[\\s,]+/", $string, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        return array();
    }

    $stems = array();
    foreach ($tokens as $token) {
        if (strlen($token) >= 50 || in_array($token, $commonWords, true)) {
            continue;
        }
        $stems[] = stemm_es::stemm(strtolower($token));
    }

    return $stems;
}
/**
 * Returns the stem of a word, using the 'stemming' cache to avoid re-stemming.
 *
 * The word is lower-cased first and that lower-cased form is the cache key.
 * On a cache miss the word is stemmed with stemm_es::stemm(), the result is
 * stored in the cache and then returned.
 *
 * @param string $word Word to stem.
 * @return mixed Whatever the cache or stemm_es::stemm() yields for the word.
 */
function word_stem($word)
{
    $key = strtolower($word);
    $store = Cache::getCache('stemming');

    if (!$store->has($key)) {
        // Errors raised by the stemmer are silenced with @, as before.
        $stem = @stemm_es::stemm($key);
        $store->set($key, $stem);

        return $stem;
    }

    return $store->get($key);
}
/*
Stemm_es a stemming class for spanish / Un lexemador para español
Copyright (C) 2007 Paolo Ragone

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
or go to: http://www.gnu.org/licenses/lgpl.txt

You may contact me at pragone@gmail.com
*/

// Regression script: stems the first field of every corpus line and reports
// each word whose stem differs from the second field of that line.
require_once 'stemm_es.php';

// FILE_IGNORE_NEW_LINES is required: without it the expected stem keeps its
// trailing newline and can never equal the computed stem.
$lines = file('stemm_test_corpus.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($lines === false) {
    print "Could not read stemm_test_corpus.txt";
    exit(1);
}

$now = time();
foreach ($lines as $line) {
    // explode() replaces split(), which was removed in PHP 7; the old code also
    // split the undefined variable $linea instead of $line.
    $part = explode(' ', $line);
    // A line without a second field is compared against an empty expected stem
    // so it is reported as BAD instead of raising an undefined-offset notice.
    $expected = isset($part[1]) ? $part[1] : '';

    $st = stemm_es::stemm($part[0]);
    if ((string) $st !== $expected) {
        print "Word: " . $part[0] . ", stem: " . $st . ", ";
        print "expected: " . $expected;
        print " -- BAD<HR>";
    }
}
print "<BR>Stemmed: " . count($lines) . " words in " . (time() - $now) . " secs";
/**
 * Decides whether a feed post passes the keyword filter.
 *
 * The post's 'name' and 'message' fields are joined, split into words and
 * reduced to a set of stems. The search list and black list obtained from
 * $keywords are stemmed the same way and matched against that set:
 *
 *  - no keywords, or both lists empty: the post is kept;
 *  - any black-listed stem present:    the post is dropped;
 *  - search list empty:                the post is kept;
 *  - otherwise the post is kept only when at least one search keyword's stem
 *    is present.
 *
 * @param array $feed     Post data; the 'name' and 'message' keys are read.
 * @param mixed $keywords Keyword specification handed to getSearchList() and
 *                        getBlackList(); a value loosely equal to null disables
 *                        filtering.
 * @return bool True to keep the post, false to omit it.
 */
function checkKeywords($feed, $keywords)
{
    if ($keywords == null) {
        return true;
    }

    $searchList = getSearchList($keywords);
    $blackList = getBlackList($keywords);

    // Nothing to filter on: skip the stemming work entirely.
    if (empty($searchList) && empty($blackList)) {
        return true;
    }

    // Title and body are merged into one text (other fields could be added).
    $text = $feed['name'] . ' ' . $feed['message'];

    // Index the post by word stem. Words are separated by whitespace, commas,
    // periods and semicolons (more symbols may need adding to the pattern).
    $stems = array();
    foreach (preg_split("/[\\s,.;]+/", $text) as $word) {
        $stems[stemm_es::stemm(strtolower($word))] = 1;
    }

    // A single black-listed stem rejects the post. This used to be re-run
    // inside the search-keyword loop; once is enough.
    if (!empty($blackList)) {
        foreach ($blackList as $blackKeyword) {
            if (isset($stems[stemm_es::stemm(strtolower($blackKeyword))])) {
                return false;
            }
        }
    }

    // Only a black list was given and none of its words matched.
    if (empty($searchList)) {
        return true;
    }

    // Keep the post when ANY search keyword matches. The previous code returned
    // false from inside this loop, so only the first keyword was ever tested.
    // NOTE(review): "any" (rather than "all") is the smallest change that is
    // consistent with the old single-keyword behaviour -- confirm intent.
    foreach ($searchList as $keyword) {
        if (isset($stems[stemm_es::stemm(strtolower($keyword))])) {
            return true;
        }
    }

    return false;
}
/**
 * Prints a "|"-separated search string and its stemmed "&"-joined form.
 *
 * The input is URL-decoded and trimmed, then echoed. Each non-empty term
 * (per empty(), so the term "0" is skipped as well) is stemmed with
 * stemm_es::stemm(); the stems are joined with "&" and echoed after a <br>.
 * Both values are HTML-escaped because they derive from URL-decoded input.
 *
 * @param string $cadena URL-encoded search terms separated by "|".
 * @return void
 */
public function query_pr($cadena)
{
    $cadena = trim(urldecode($cadena));
    // Escaped to avoid reflecting request data into the page as markup.
    echo htmlspecialchars($cadena, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    $stems = array();
    foreach (explode("|", $cadena) as $term) {
        if (!empty($term)) {
            $stems[] = stemm_es::stemm($term);
        }
    }
    $cadena2 = implode('&', $stems);

    echo '<br>';
    echo htmlspecialchars($cadena2, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    // Disabled full-text search, kept for reference.
    // WARNING: it interpolates $cadena2 straight into SQL; bind it as a query
    // parameter before re-enabling.
    //
    // $query = "SELECT idlom, idrepository, ts_rank_cd(search_index_col, query) AS rank
    //     FROM general_description, to_tsquery('spanish',lower(quitar_acento('$cadena2'))) query
    //     WHERE query @@ search_index_col
    //     ORDER BY rank DESC;";
    //
    // $id_prin = $this->busqueda_model->get_id($query);
    //
    // foreach ($id_prin as $key) {
    //     $result[] = $this->busqueda_model->consulta(strval($key['idlom']), $key['idrepository']);
    //     $words[] = $this->lom_model->get_keyword(strval($key['idlom']), $key['idrepository']);
    // }
    //
    // print_r($result);
}
/**
 * Stems a word with stemm_es::stemm() and appends a trailing asterisk.
 *
 * @param string $palabra Word to stem.
 * @return string The stem followed by '*'.
 */
function __stemm($palabra)
{
    return stemm_es::stemm($palabra) . '*';
}
/**
 * Thin wrapper around stemm_es::stemm().
 *
 * @param string $word Word to stem.
 * @return mixed The value returned by stemm_es::stemm() for $word.
 */
function GetStem($word)
{
    $stem = stemm_es::stemm($word);

    return $stem;
}