예제 #1
0
/**
 * Derives automatic tags for a post from its title and content.
 *
 * The title and content are split on single spaces, limited to the first
 * WP_RP_RECOMMENDATIONS_AUTO_TAGS_MAX_WORDS tokens, lower-cased, stripped of
 * non-word characters and underscores, and stemmed. Each stem that is also a
 * key of the array returned by wp_rp_get_unigrams() is scored as
 * (unigram value) * sqrt(number of occurrences); the stems with the highest
 * scores are returned.
 *
 * @param object $post Object exposing post_title and post_content strings.
 * @return array Up to WP_RP_RECOMMENDATIONS_AUTO_TAGS_MAX_TAGS stems, best score first.
 */
function wp_rp_generate_auto_tags($post)
{
    $unigram_weights = wp_rp_get_unigrams();

    $title_tokens = explode(' ', $post->post_title);
    $content_tokens = explode(' ', $post->post_content);
    $tokens = array_slice(
        array_merge($title_tokens, $content_tokens),
        0,
        WP_RP_RECOMMENDATIONS_AUTO_TAGS_MAX_WORDS
    );

    // Count how often each stem occurs; tokens that stem to a falsy value are dropped.
    $stem_counts = array();
    foreach ($tokens as $token) {
        $normalized = preg_replace('/[\\W_]+/', '', strtolower($token));
        $stem = PorterStemmer::Stem($normalized);
        if (!$stem) {
            continue;
        }
        $stem_counts[$stem] = isset($stem_counts[$stem]) ? $stem_counts[$stem] + 1 : 1;
    }

    // Score only the stems that appear in the unigram table.
    $scores = array();
    foreach ($stem_counts as $stem => $occurrences) {
        if (!isset($unigram_weights[$stem])) {
            continue;
        }
        $scores[$stem] = $unigram_weights[$stem] * sqrt($occurrences);
    }

    // Ascending sort followed by a key-preserving reversal yields descending scores.
    asort($scores);
    $ranked = array_reverse($scores, true);

    return array_keys(array_slice($ranked, 0, WP_RP_RECOMMENDATIONS_AUTO_TAGS_MAX_TAGS, true));
}
 /**
  * Replaces every filtered token with its Porter stem.
  *
  * Reads $this->filteredTokens and overwrites it with a freshly indexed list
  * holding PorterStemmer::Stem() of each token, in the original order.
  */
 private function Stem()
 {
     $stemmedTokens = array();
     foreach ($this->filteredTokens as $token) {
         array_push($stemmedTokens, PorterStemmer::Stem($token));
     }
     $this->filteredTokens = $stemmedTokens;
 }
예제 #3
0
 /**
  * Runs the Porter stemmer over every word of a tokenised document.
  *
  * @param array $docment Words of the document; keys are preserved.
  * @return array The same keys, each mapped to PorterStemmer::Stem() of its word.
  */
 function applyPorterStemming($docment)
 {
     $stemmed = $docment;
     foreach ($docment as $position => $token) {
         $stemmed[$position] = PorterStemmer::Stem($token);
     }
     return $stemmed;
 }
예제 #4
0
/**
 * Stems a word, first normalising irregular plurals that Porter's algorithm
 * does not reduce to the same stem as their singular.
 *
 * If the word is a key of the irregular-plural table it is replaced by the
 * mapped singular; the (possibly replaced) word is then passed through
 * PorterStemmer::Stem() in every case.
 *
 * @param string $word Word to stem; compared against the table as-is (no case folding).
 * @return mixed Whatever PorterStemmer::Stem() returns for the normalised word.
 */
function GetStem($word)
{
    # A list of irregular plurals not supported by Porter's algorithm.
    # Every entry must be 'plural' => 'singular': a bare value would become a
    # numeric-index element and never match as a key.
    $irregular = array(
        'alumni' => 'alumnus', 'cacti' => 'cactus', 'foci' => 'focus', 'focuses' => 'focus',
        'fungi' => 'fungus', 'nuclei' => 'nucleus', 'radii' => 'radius', 'stimuli' => 'stimulus',
        'axes' => 'axis', 'analyses' => 'analysis', 'bases' => 'basis', 'crises' => 'crisis',
        'diagnoses' => 'diagnosis', 'ellipses' => 'ellipsis', 'hypotheses' => 'hypothesis',
        'oases' => 'oasis', 'paralyses' => 'paralysis', 'parentheses' => 'parenthesis',
        'syntheses' => 'synthesis', 'synopses' => 'synopsis', 'theses' => 'thesis',
        'appendices' => 'appendix', 'indeces' => 'index', 'indices' => 'index', 'indexes' => 'index',
        'matrices' => 'matrix', 'matrixes' => 'matrix',
        'beaux' => 'beau', 'bureaus' => 'bureau', 'bureaux' => 'bureau',
        'tableaux' => 'tableau', 'tableaus' => 'tableau',
        'children' => 'child', 'men' => 'man', 'oxen' => 'ox', 'women' => 'woman',
        'bacteria' => 'bacterium', 'corpora' => 'corpus', 'criteria' => 'criterion',
        'curricula' => 'curriculum', 'data' => 'datum', 'genera' => 'genus', 'media' => 'medium',
        'memoranda' => 'memorandum', 'phenomena' => 'phenomenon', 'strata' => 'stratum',
        'deer' => 'deer', 'feet' => 'foot', 'geese' => 'goose', 'teeth' => 'tooth',
        'antennae' => 'antenna', 'antennas' => 'antenna', 'formulae' => 'formula',
        'formulas' => 'formula', 'nebulae' => 'nebula', 'vertebrae' => 'vertebra', 'vitae' => 'vita',
        'lice' => 'louse', 'mice' => 'mouse',
    );

    $lookup = (string) $word;
    if (array_key_exists($lookup, $irregular)) {
        $word = $irregular[$lookup];
    }

    # Replace, then still go through Porter
    return PorterStemmer::Stem($word);
}
/**
 * Removes stop words from a list of search words and stems what is left.
 *
 * Each remaining word is passed through PorterStemmer::Stem() and a newline
 * is appended to the stem (existing behaviour, kept for callers that rely on
 * it). Duplicate stems are then dropped.
 *
 * @param array $searchArray Words to process; handed to stopwordRemoval().
 * @return array Unique newline-terminated stems. Keys are those array_unique()
 *               keeps, so they may be non-contiguous. Empty when nothing is left.
 */
function applyPorterStemming(array $searchArray)
{
    // Initialise up front so an empty word list yields an empty array rather
    // than passing an undefined variable to array_unique().
    $stems = array();

    foreach (stopwordRemoval($searchArray) as $word) {
        $stems[] = PorterStemmer::Stem($word) . "\n";
    }

    // Remove duplicate words
    return array_unique($stems);
}
예제 #6
0
 /**
  * Normalises a document and returns its terms as a string of stems.
  *
  * The text is lower-cased, stripped of tags, has every non-word character
  * replaced by a space and runs of whitespace collapsed, then is split on
  * single spaces. Each term is stemmed through PorterStemmer::Stem(), with
  * results memoised in $this->stemmed keyed by the unstemmed term.
  *
  * @param string $contents Raw document text.
  * @return string The stems, each preceded by a single space.
  */
 public function _cleanString($contents)
 {
     $text = strip_tags(strtolower($contents));
     $text = preg_replace(array('/\\W/i', '/\\s\\s+/'), ' ', $text);

     $stems = array();
     foreach (explode(' ', $text) as $term) {
         // Stem each distinct term only once; later occurrences hit the cache.
         if (!array_key_exists($term, $this->stemmed)) {
             $this->stemmed[$term] = PorterStemmer::Stem($term);
         }
         $stems[] = $this->stemmed[$term];
     }

     return ' ' . implode(' ', $stems);
 }
/**
 * Builds the "tags" document-term matrix and stores it on disk.
 *
 * For every GUID in the unserialized "guids" file, the tags of the matching
 * entry in the unserialized "lr" file are stripped of punctuation and stop
 * words, split on spaces, trimmed, lower-cased and Porter-stemmed. Each
 * distinct non-empty stem becomes an entry set to 1 in that document's row.
 * The raw matrix is serialized to "tags_dt_raw"; after the optional synonym
 * merge the matrix is serialized to "tags_dt". Progress lines are appended
 * to output.log.
 *
 * NOTE(review): $IOdir, $enable_synonyms and $keys are never assigned in this
 * function. Presumably they are defined by one of the files loaded with
 * require_once below — confirm, because require_once does not re-run a file
 * that was already loaded elsewhere, which would leave them undefined here.
 *
 * NOTE(review): unserialize() is applied to file contents; this is only safe
 * if those files cannot be written by untrusted parties.
 *
 * @param array $req Request options; only $req["dt_useold"] is read. When it
 *                   equals "true" and the "old_lr" / "old_tags_dt_raw" files
 *                   exist, rows whose tags are unchanged are copied from the
 *                   old raw matrix instead of being recomputed.
 * @return string Always "OK".
 */
function doctermTags($req)
{
    global $CONFIG;
    set_time_limit(0);
    //this avoids timeouts
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/classes.php";
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/stemming.php";
    $outputfile = $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/output.log";
    file_put_contents($outputfile, "Starting creating TAGS Doc-term matrix...\n", FILE_APPEND);
    // Load the list of document GUIDs and the resources indexed by GUID.
    $guids = unserialize(file_get_contents($IOdir . "guids"));
    $lr_array = unserialize(file_get_contents($IOdir . "lr"));
    // $tags_dt[guid][stem] = 1 for every stem present in that document's tags.
    $tags_dt = array();
    $stop_words = set_stop_words_tags($CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/stop_words_eng.txt");
    //create an array containing "stop words", in order to eliminate them from the text
    if ($req["dt_useold"] == "true" && file_exists($IOdir . "old_lr") && file_exists($IOdir . "old_tags_dt_raw")) {
        $old_lr = unserialize(file_get_contents($IOdir . "old_lr"));
        $old_tags_dt_raw = unserialize(file_get_contents($IOdir . "old_tags_dt_raw"));
        //we need the raw version of the doc-term matrix, before applying IDF or synonyms
    }
    //create an array containing all the tags of each document
    foreach ($guids as $guid) {
        if (!empty($lr_array[$guid]->tags)) {
            // Reuse the previously computed row when the tags did not change.
            if ($req["dt_useold"] == "true" && isset($old_lr[$guid]->tags) && $lr_array[$guid]->tags == $old_lr[$guid]->tags && isset($old_tags_dt_raw[$guid])) {
                $tags_dt[$guid] = $old_tags_dt_raw[$guid];
                continue;
            }
            //find all the tags for the current resource
            $tags = array();
            foreach ($lr_array[$guid]->tags as $sentence) {
                if ($sentence == "") {
                    continue;
                }
                $sentence = strip_punctuation($sentence);
                //strip punctuation
                $sentence_clean = str_replace($stop_words, " ", $sentence);
                //eliminate stop words
                // NOTE(review): str_replace works on substrings, so a stop word
                // embedded in a longer word is replaced too unless $stop_words
                // carries delimiters — confirm against set_stop_words_tags().
                $tags_sentence = explode(" ", $sentence_clean);
                $tags = array_merge($tags, $tags_sentence);
            }
            //stem each tag
            foreach ($tags as $num => $element) {
                $tags[$num] = PorterStemmer::Stem(strtolower(trim($element)));
                //stem elements
            }
            $tags = array_filter(array_unique($tags));
            //delete duplicates and empty elements
            //create the entry for the current document in the doc-term tags matrix
            foreach ($tags as $tag) {
                $tags_dt[$guid][$tag] = 1;
            }
        }
    }
    file_put_contents($IOdir . "tags_dt_raw", serialize($tags_dt));
    //it saves the raw version of the doc-term matrix
    if (PHP_OS == "Linux") {
        chmod($IOdir . "tags_dt_raw", 0666);
    }
    //set rw permissions for everybody for this file
    // Optional synonym merge: when two keys are synonyms, the second is folded
    // into the first in every document row and removed from $keys.
    if ($enable_synonyms) {
        foreach ($keys as $num => $key) {
            if (!isset($keys[$num])) {
                continue;
            }
            //since there is an unset on this array into the foreach, we have to check if the present key is still available or not
            foreach ($keys as $num2 => $key2) {
                if (!isset($keys[$num2])) {
                    continue;
                }
                //since there is an unset on this array into the foreach, we have to check if the present key is still available or not
                if ($key != $key2 && check_synonyms($key, $key2)) {
                    file_put_contents($outputfile, "\n{$key} and {$key2} are synonyms\n", FILE_APPEND);
                    foreach ($tags_dt as $guid => $element) {
                        if (isset($tags_dt[$guid][$key2])) {
                            unset($tags_dt[$guid][$key2]);
                            $tags_dt[$guid][$key] = 1;
                        }
                    }
                    unset($keys[$num2]);
                }
            }
        }
    }
    file_put_contents($IOdir . "tags_dt", serialize($tags_dt));
    // Unlike the raw file above, chmod is attempted only when the current
    // process owns the file.
    if (PHP_OS == "Linux" && posix_getuid() == fileowner($IOdir . 'tags_dt')) {
        chmod($IOdir . 'tags_dt', 0666);
    }
    //set rw permissions for everybody for this file
    file_put_contents($outputfile, "TAGS Doc-term matrix created\n\n", FILE_APPEND);
    return "OK";
}
/**
 * Turns the posted search query into a binary term vector and hands it to
 * query_similarity().
 *
 * The query ($_POST['query']) is lower-cased, split on single spaces and
 * stemmed; duplicate stems and stems that appear in the stop-word list are
 * dropped. Every row of the rownum_terms table then contributes one element
 * to the vector: 1 if its 'term' column is among the query stems, else 0.
 *
 * NOTE(review): the stop-word list is matched against stems, not the original
 * words, so a stop word whose stem differs from its spelling is not filtered.
 *
 * NOTE(review): the mysql_* functions used here were removed in PHP 7.0. They
 * are kept because code outside this function may rely on the connection
 * opened here as the implicit default link; migrate the whole file to mysqli
 * or PDO together. The connection settings below are blank placeholders.
 *
 * @return void The vector is passed to query_similarity(); nothing is returned.
 */
function generate_query_array()
{
    $stopwords = array('!', '@', '#', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=', '`', '~', '{', '[', ']', '}', ':', ';', '"', ',', '.', '/', '|', '<', '>', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'able', 'about', 'above', 'abroad', 'according', 'accordingly', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'against', 'ago', 'ahead', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'alongside', 'already', 'also', 'although', 'always', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'backward', 'backwards', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'came', 'can', 'cannot', 'cant', 'caption', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'co.', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'course', 'currently', 'dare', 'definitely', 'described', 'despite', 'did', 'different', 'directly', 'do', 'does', 'doing', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'evermore', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'fairly', 'far', 'farther', 'few', 'fewer', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'forever', 'former', 'formerly', 'forth', 'forward', 'found', 'four', 
'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'half', 'happens', 'hardly', 'has', 'have', 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'inc.', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'inside', 'insofar', 'instead', 'into', 'inward', 'is', 'it', 'its', 'itself', 'just', 'k', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'like', 'liked', 'likely', 'likewise', 'little', 'look', 'looking', 'looks', 'low', 'lower', 'ltd', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'meantime', 'meanwhile', 'merely', 'might', 'mine', 'minus', 'miss', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'neverf', 'neverless', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'no-one', 'nor', 'normally', 'not', 'nothing', 'notwithstanding', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'opposite', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provided', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'recent', 'recently', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'round', 'said', 'same', 'saw', 'say', 'saying', 'says', 
'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', 'since', 'six', 'so', 'some', 'somebody', 'someday', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'take', 'taken', 'taking', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'thing', 'things', 'think', 'third', 'thirty', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'till', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'underneath', 'undoing', 'unfortunately', 'unless', 'unlike', 'unlikely', 'until', 'unto', 'up', 'upon', 'upwards', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'v', 'value', 'various', 'versus', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'way', 'we', 'welcome', 'well', 'went', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'while', 'whereupon', 'wherever', 'whether', 'which', 'whichever', 'whilst', 'whither', 'who', 'whoever', 'whole', 'whom', 'whomever', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', 'would', 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', 'zero');

    // A missing or non-string 'query' field is treated as an empty query
    // instead of raising a notice or a type error.
    $query_string = (isset($_POST['query']) && is_string($_POST['query'])) ? strtolower($_POST['query']) : '';

    // Stem every word of the query.
    $query_stem_words = array();
    foreach (explode(" ", $query_string) as $word) {
        $query_stem_words[] = PorterStemmer::Stem($word);
    }

    // De-duplicate once, after the loop, rather than on every iteration.
    $query_array0 = array_unique($query_stem_words);

    // Drop stems that are stop words.
    $query_array1 = array();
    foreach ($query_array0 as $val) {
        if (!in_array($val, $stopwords)) {
            $query_array1[] = $val;
        }
    }

    $host = "";
    // Host name
    $username = "";
    // Mysql username
    $password = "";
    // Mysql password
    $db_name = "";
    // Database name
    $con = mysql_connect($host, $username, $password) or die(mysql_error());
    mysql_select_db($db_name, $con) or die(mysql_error());

    // Load every indexed term, in table order.
    $term_array = array();
    $result = mysql_query("select * from rownum_terms");
    while ($data = mysql_fetch_array($result)) {
        $term_array[] = $data['term'];
    }

    // One element per indexed term: 1 when the query contains it, 0 otherwise.
    $query_array = array();
    foreach ($term_array as $val) {
        $query_array[] = in_array($val, $query_array1) ? 1 : 0;
    }

    query_similarity($query_array);
}
예제 #9
0
 /**
  * Finds indexed documents matching every keyword of a free-text query.
  *
  * The query is split on <p>/<br> tags, whitespace, commas, hyphens and
  * slashes. Each piece is lower-cased, stripped of everything except a-z,
  * 0-9 and apostrophes, and stemmed. A self-join over stemsIndex (one alias
  * per stem: A, B, C, ...) selects the files that contain a matching stem
  * for every keyword; the stems are bound as statement parameters.
  *
  * @param string $sQuery Free-text query.
  * @return array Result rows as objects with the properties id and relevance.
  * @throws InvalidArgumentException When the query contains no usable keyword.
  */
 function getDocuments($sQuery)
 {
     // NOTE(review): $file_db was not defined in this scope in the original
     // code, so prepare() could never succeed. It is assumed here to be the
     // global PDO handle — confirm against the rest of the file.
     global $file_db;

     $aKeyWords = preg_split('/(<\\s*p\\s*\\/?>)|(<\\s*br\\s*\\/?>)|[\\s,\\-\\/]/i', $sQuery);
     $aQuery = array();
     foreach ($aKeyWords as $sKeyWord) {
         $sKeyWord = preg_replace('/[^a-z0-9\']+/', '', strtolower($sKeyWord));
         if (strlen($sKeyWord) == 0) {
             continue;
         }
         $aQuery[] = PorterStemmer::Stem($sKeyWord);
     }
     if (count($aQuery) == 0) {
         // PHP can only throw objects; the original threw a plain string
         // built from an undefined variable.
         throw new InvalidArgumentException("no keywords left for " . $sQuery);
     }

     $sSelect = "SELECT A.sFile AS id, COUNT(*) AS relevance FROM ";
     $sWhere = " WHERE ";
     $sLetter = 'A';
     foreach ($aQuery as $sKeyWord) {
         if ($sLetter != 'A') {
             // Every additional keyword joins another alias on the same file.
             $sSelect .= " ,";
             $sWhere .= " AND A.sFile = " . $sLetter . ".sFile AND ";
         }
         $sSelect .= 'stemsIndex as ' . $sLetter;
         $sWhere .= $sLetter . ".sStem LIKE '%' || ? || '%'";
         // String increment: 'A' -> 'B' -> ... -> 'Z' -> 'AA'.
         $sLetter++;
     }
     $sSQL = $sSelect . ' ' . $sWhere . " GROUP BY 1 ORDER BY relevance DESC";
     $oStmt = $file_db->prepare($sSQL);
     $oStmt->execute($aQuery);
     return $oStmt->fetchAll(PDO::FETCH_OBJ);
 }
include 'pdf2text.php';
include 'stem_code.php';
// Extract the text of a PDF, stem every space-separated word and count how
// often each stemmed word occurs, then look up how many rows the `terms`
// table already holds.
// NOTE(review): $file_name, $host, $username, $password and $db_name are not
// assigned in the visible part of this script — presumably set earlier.
$a = new PDF2Text();
$a->setFilename($file_name);
$a->decodePDF();
$stringput = $a->output();
$stringput = strtolower($stringput);
// Split on single spaces only; other whitespace stays inside the tokens.
$allword_count = explode(" ", $stringput);
$wordarray = array();
$wordarrays = array();
// Copy the tokens into a freshly indexed array.
foreach ($allword_count as $key => $val) {
    array_push($wordarrays, $val);
}
/*Stemming Code*/
foreach ($wordarrays as $key => $word) {
    $stem = PorterStemmer::Stem($word);
    array_push($wordarray, $stem);
}
// NOTE(review): $stopwords is not used anywhere in the visible part of the
// script; it may be consumed further down.
$stopwords = array('on', 'us', 'xc', 'be', 'by', 'at', 'but', 'e', 'i', 'be', 'by', 'g', 'j', 'and', 'is', 'f', 'are', 'p', 'can', 'each', 'we', 'x', 'in', 'b', 'as', 'c', 'd', 'for', 'also', 'an', 'all', '-', 'a', 'any', 'in', 'the', 'thesis', 'to', 'of', 'dammalapati');
// Join the stems with "=>" and immediately turn that separator into spaces,
// giving one space-separated string of stems.
$useful_words = implode("=>", $wordarray);
$useful_words = str_replace("=>", " ", $useful_words);
// str_word_count(..., 1) re-tokenises the string into words; the counts are
// then keyed by word and sorted by key.
$word_count = array_count_values(str_word_count($useful_words, 1));
ksort($word_count);
// NOTE(review): the mysql_* extension was removed in PHP 7.0; this code needs
// mysqli or PDO on current PHP versions.
$con = mysql_connect($host, $username, $password) or die(mysql_error());
mysql_select_db($db_name, $con) or die(mysql_error());
$query = "SELECT COUNT(*) as totalno FROM terms";
$result = mysql_query($query);
// The COUNT(*) query yields a single row; $count ends up as its totalno value.
while ($data = mysql_fetch_array($result)) {
    $count = $data['totalno'];
}
if ($count == 0) {
예제 #11
0
 /**
  * Reduce a word to its stem using the bundled PorterStemmer class.
  *
  * The stemmer class file is loaded on demand (at most once) before use.
  *
  * @param string $word The word to stem.
  * @return string The stemmed word, as returned by PorterStemmer::Stem().
  */
 public static function stemWord($word)
 {
     $stemmer_file = THINKUP_WEBAPP_PATH . '_lib/extlib/Stemmer/class.PorterStemmer.php';
     require_once $stemmer_file;

     $stem = PorterStemmer::Stem($word);
     return $stem;
 }
 /**
  * Stems a keyword with the best stemmer that is available
  *
  * Background on stemming can be found in the Wikipedia article on
  * {@link http://en.wikipedia.org/wiki/Stemming Stemming}.
  *
  * Stemmers are tried in this order:
  *
  * 1. If the PECL <code>stem</code> extension is loaded, English stemming is
  *    applied to <code>$keyword</code>. See
  *    {@link http://pecl.php.net/package/stem/} for details about the
  *    extension.
  * 2. Otherwise, if <code>PorterStemmer::Stem</code> is callable, it is
  *    applied to <code>$keyword</code>. The most commonly available PHP
  *    implementation of the Porter-stemmer algorithm is licenced under the
  *    GPL, and is thus not distributable with the LGPL licensed
  *    NateGoSearch.
  * 3. Otherwise no stemming is performed and the keyword is returned
  *    unchanged.
  *
  * @param string $keyword the keyword to stem.
  *
  * @return string the stemmed keyword.
  */
 public static function stemKeyword($keyword)
 {
     if (extension_loaded('stem')) {
         return stem($keyword, STEM_ENGLISH);
     }

     if (is_callable(array('PorterStemmer', 'Stem'))) {
         return PorterStemmer::Stem($keyword);
     }

     return $keyword;
 }
 /**
  * Builds and stores word-frequency indexes for the text files in BOOKS_PATH.
  *
  * Every directory entry is passed to display(). An entry is indexed only
  * when it yields a non-empty file id, is a regular file, has the file type
  * 'txt' and has no indexes defined yet. Its sanitised content is split into
  * words (digits count as word characters), each word is stemmed, ignored
  * words are skipped, and the remaining stems are counted under the key
  * 'indx-' . stem. The counts, sorted from most to least frequent, and the
  * total number of counted words are handed to saveIndexes().
  *
  * Does nothing when BOOKS_PATH is not a directory or cannot be opened.
  *
  * @return void
  */
 public function createIndexes()
 {
     if (!is_dir(BOOKS_PATH)) {
         return;
     }
     $directory = opendir(BOOKS_PATH);
     if (!$directory) {
         return;
     }

     while (($file = readdir($directory)) !== false) {
         $file_id = $this->getFileId($this->getFileName($file));
         display($file);

         if ($file_id == '' || filetype(BOOKS_PATH . $file) != 'file' || $this->getFileType($file) != 'txt' || $this->areIndexesDefined($file_id)) {
             continue;
         }

         $data = $this->readFile($file);
         $data = $this->sanitizeWithSpace($data);
         $indexes = str_word_count($data, 1, '0123456789');
         $total_count = 0;
         $index_count = array();
         $stemmer = new PorterStemmer();
         foreach ($indexes as $index) {
             $word = $stemmer->Stem($index);
             if (in_array($word, $this->getIgnoredWords())) {
                 continue;
             }
             // Prefix the stem so it cannot be mistaken for a numeric array key.
             $word = 'indx-' . $word;
             if (!isset($index_count[$word])) {
                 $index_count[$word] = 1;
             } else {
                 $index_count[$word]++;
             }
             $total_count++;
         }
         arsort($index_count);
         $this->saveIndexes($file_id, $index_count, $total_count);
     }

     // Release the directory handle; the original never closed it.
     closedir($directory);
 }
예제 #14
0
       第二步,如果单词中包含元音,并且以y结尾,将y改为i。
       第三步,将双后缀的单词映射为单后缀。
       第四步,处理-ic-,-full,-ness等等后缀。
       第五步,在<c>vcvc<v>情形下,去除-ant,-ence等后缀。
       第六步,也就是最后一步,在m()>1的情况下,移除末尾的“e”。
   算法使用说明:
      传入的单词必须是小写
   参考学习网站:
       http://tartarus.org/~martin/PorterStemmer/
       http://snowball.tartarus.org/algorithms/english/stemmer.html
       http://blog.csdn.net/noobzc1/article/details/8902881
    * */
 require_once './PorterStemmer.php';
 // Stem every token (trailing whitespace removed first) and collect the stems.
 // NOTE(review): $token_array is not assigned in the visible part of this
 // script, and $token_stem_array is appended to without being initialised here.
 $p_stemmer = new PorterStemmer();
 foreach ($token_array as $token) {
     $token_stem_array[] = $p_stemmer->Stem(rtrim($token));
 }
 // $token_stem_array[]= $p_stemmer->Stem( rtrim('news'));// check a specific word
 //var_dump($token_stem_array);
 /* Stopword removal, and elimination of duplicate keywords
   Approach 1: remove stopwords according to the stopword list; to improve accuracy, keep the stopword list as small as possible
   Approach 2: assign a weight to each stopword according to the stopword list, and use the weights to rank the returned results when matching
   * */
 // Read the stopword list
 // NOTE(review): the path is built with backslashes, which only act as
 // directory separators on Windows.
 $stopwords_en_file = dirname(__FILE__) . "\\stop_words" . "\\stop_words_eng.txt";
 $lines = file($stopwords_en_file);
 // Read the file contents: one stopword per line, trailing whitespace removed
 foreach ($lines as $line) {
     $stopwords_en_array[] = rtrim($line);
 }
 //  var_dump($stopwords_en_array);
예제 #15
0
 /**
  * Appends the stems of the longer words in a text to that text
  *
  * Words of one to three word characters are removed before stemming. The
  * rest is split on spaces, and every token whose stem differs from the
  * token itself contributes that stem.
  *
  * @author Jonathan Davis
  * @since 1.1
  *
  * @param string $text The text to stem
  * @return string The text followed by a space and the space-separated stems,
  *                or the unchanged text when no stem differs from its token
  **/
 public static function StemFilter($text)
 {
     // Blank out short words so they are never stemmed
     $longer_words = preg_replace("/\\b\\w{1,3}\\b/", '', $text);

     $stems = array();
     for ($token = strtok($longer_words, ' '); $token; $token = strtok(' ')) {
         $stem = PorterStemmer::Stem($token);
         if ($stem != $token) {
             $stems[] = $stem;
         }
     }

     if (empty($stems)) {
         return $text;
     }
     return "{$text} " . join(' ', $stems);
 }
예제 #16
0
 /**
  * Produces an extractive summary: the highest-rated sentences of a text,
  * returned in their original order.
  *
  * Each sentence is tokenised; stop words (checked both before and after
  * stemming) are skipped and the remaining stems are counted per sentence
  * and in $this->word_stats. Only the 20 most frequent stems are kept, every
  * sentence is rated via calculate_rating() and the best sentences are
  * selected.
  *
  * Note that $this->word_stats is not reset here, so counts left over from
  * earlier calls on the same object are included.
  *
  * @param string $text          The text to summarise.
  * @param float  $percent       What percentage of text should be used as the summary (in sentences).
  * @param int    $min_sentences The minimum length of the summary.
  * @param int    $max_sentences The maximum length of the summary; 0 means no upper limit.
  * @return array The selected sentences, in the order they appear in $text.
  */
 function summary($text, $percent = 0.2, $min_sentences = 1, $max_sentences = 0)
 {
     $sentences = $this->sentence_tokenize($text);
     $sentence_total = count($sentences);
     $sentence_bag = array();
     for ($i = 0; $i < $sentence_total; $i++) {
         $words = $this->word_tokenize($sentences[$i]);
         $word_stats = array();
         foreach ($words as $word) {
             //skip stopwords
             if (in_array($word, $this->stopwords)) {
                 continue;
             }
             //stem
             $word = PorterStemmer::Stem($word);
             //skip stopwords by stem
             if (in_array($word, $this->stopwords)) {
                 continue;
             }
             //per-sentence word counts
             if (!isset($word_stats[$word])) {
                 $word_stats[$word] = 1;
             } else {
                 $word_stats[$word]++;
             }
             //global word counts
             if (!isset($this->word_stats[$word])) {
                 $this->word_stats[$word] = 1;
             } else {
                 $this->word_stats[$word]++;
             }
         }
         $sentence_bag[] = array('sentence' => $sentences[$i], 'word_stats' => $word_stats, 'ord' => $i);
     }
     //sort words by frequency
     arsort($this->word_stats);
     //only consider top 20 most common words. Throw away the rest.
     //Keys must be preserved: PHP stores numeric stems such as "2020" as
     //integer keys, and array_slice() renumbers integer keys by default,
     //which would detach those counts from their words.
     $this->word_stats = array_slice($this->word_stats, 0, 20, true);
     $bag_size = count($sentence_bag);
     for ($i = 0; $i < $bag_size; $i++) {
         $sentence_bag[$i]['rating'] = $this->calculate_rating($sentence_bag[$i]['word_stats']);
     }
     //Sort sentences by importance rating
     usort($sentence_bag, array($this, 'cmp_arrays_rating'));
     //How many sentences do we need?
     if ($max_sentences == 0) {
         $max_sentences = $bag_size;
     }
     //round() returns a float; cast so the slice offset below is an integer
     $summary_count = (int) min($max_sentences, max(min($min_sentences, $bag_size), round($percent * $bag_size)));
     if ($summary_count < 1) {
         $summary_count = 1;
     }
     //Take the X highest rated sentences (from the end of the array)
     $summary_bag = array_slice($sentence_bag, -$summary_count);
     //Restore the original sentence order
     usort($summary_bag, array($this, 'cmp_arrays_ord'));
     $summary_sentences = array();
     foreach ($summary_bag as $sentence) {
         $summary_sentences[] = $sentence['sentence'];
     }
     return $summary_sentences;
 }
예제 #17
0
/**
 * Replaces every token with its Porter stem.
 *
 * @param array $tokens Tokens to stem; keys are preserved.
 * @return array The same keys, each mapped to PorterStemmer::Stem() of its token.
 */
function STEMMING($tokens)
{
    // include_once: a plain include would load the stemmer file again on
    // every call and redeclare whatever it defines.
    include_once 'porter_stemmer.php';
    foreach ($tokens as $key => $word) {
        $tokens[$key] = PorterStemmer::Stem($word);
    }
    return $tokens;
}
예제 #18
0
// Build the stop-word list: tokenise the cleaned excluded-words string and
// store the trimmed, lower-cased stem of every token.
// NOTE(review): $excluded_words, $delim and $data are not assigned in the
// visible part of this script, and $stopWords is appended to without being
// initialised here.
$stop_tok = strtok(clean_string($excluded_words), $delim);
while ($stop_tok !== false) {
    $stopWords[] = trim(strtolower(PorterStemmer::Stem($stop_tok)));
    $stop_tok = strtok($delim);
}
// $words maps a stem to the WordCount object that tracks it.
$words = array();
foreach ($data as $data_item) {
    // 	echo "Original : " . $data_item["text"] . "<br>";
    // Concatenate the cleaned value of every field of the item.
    $clean_text = "";
    foreach ($data_item as $field => $val) {
        $clean_text .= " " . clean_string($val);
    }
    // 	echo "Clean : " . $clean_text . "<hr>";
    $tok = strtok($clean_text, $delim);
    while ($tok !== false) {
        $stem = trim(strtolower(PorterStemmer::Stem($tok)));
        // Skip stop words and stems shorter than three bytes.
        if (in_array($stem, $stopWords) || strlen($stem) < 3) {
            $tok = strtok($delim);
            continue;
        }
        if (array_key_exists($stem, $words)) {
            // Known stem: pass the surface form to checkWord() and bump the count.
            $words[$stem]->checkWord($tok);
            $words[$stem]->increment();
        } else {
            // First occurrence of this stem.
            $words[$stem] = new WordCount($tok);
        }
        $tok = strtok($delim);
    }
}
usort($words, function ($a, $b) {
    return $a->count < $b->count;
예제 #19
0
<html>
<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
	<title>porter stemmer</title> 
</head>
<body>
	<form action="porter_stemmer_test.php" method="get">
		<input name="word" size="100">
		<button type="submit">GO</button>
	</form>
</body>
</html>
 
<?php 
include 'class.stemmer.inc.php';
include 'porter_stemmer.php';
// Stem the word submitted through the form with both implementations and
// print the results side by side. A missing or non-string parameter is
// treated as an empty word.
$word = (isset($_GET['word']) && is_string($_GET['word'])) ? $_GET['word'] : '';
$stemmer = new Stemmer();
// The stems are derived from request input, so they are HTML-escaped before
// being echoed into the page.
echo "class.stemmer.inc.php:  " . htmlspecialchars((string) $stemmer->stem($word), ENT_QUOTES, 'UTF-8');
echo "<br>";
echo "porter_stemmer.php:   " . htmlspecialchars((string) PorterStemmer::Stem($word), ENT_QUOTES, 'UTF-8');
예제 #20
0
/**
 * Applies the Porter stemmer to every word of a tweet.
 *
 * @param array $tweet Words of the tweet.
 * @return array The stemmed words, stored under the keys of the input array.
 */
function applyPorterStemming($tweet)
{
    // array_map() with a single array keeps the original keys and order
    return array_map(array('PorterStemmer', 'Stem'), $tweet);
}
예제 #21
0
 /**
  * Turns a raw search string into a list of stemmed search terms.
  *
  * The string is lower-cased, split on every run of non-word characters and each
  * remaining word is passed through PorterStemmer::Stem().
  *
  * @param string $searchterms Raw search input.
  * @return array Stemmed terms in input order; empty when the input holds no word characters.
  */
 function _cleanSearchTerms($searchterms)
 {
     $cleansearchterms = strtolower($searchterms);
     // PREG_SPLIT_NO_EMPTY keeps empty or punctuation-only input from producing
     // a single empty term that would then be stemmed and searched for.
     $terms = preg_split('/\\W+/', $cleansearchterms, -1, PREG_SPLIT_NO_EMPTY);
     $toreturn = array();
     foreach ($terms as $term) {
         $toreturn[] = PorterStemmer::Stem($term);
     }
     return $toreturn;
 }
예제 #22
0
 /**
  * Executes the search data source.
  *
  * Reads the search parameters from the request, builds and runs the search SQL
  * (FULLTEXT, LIKE or REGEXP mode, as configured) and returns the matching entries
  * together with pagination, the searched sections and alternative spellings as XML.
  *
  * Side effects: may overwrite $_GET, updates $this->dsParamSTARTPAGE and
  * $this->dsParamLIMIT, writes the matched entry IDs into $param_pool and, when
  * keyword logging is enabled, inserts a row into `tbl_search_index_logs`.
  *
  * @param array|null $param_pool Parameter pool; receives the matched entry IDs.
  * @return mixed The result XMLElement, or the value of errorXML() when no valid
  *               section could be resolved.
  */
 public function execute(array &$param_pool = null)
 {
     $result = new XMLElement($this->dsParamROOTELEMENT);
     $config = (object) Symphony::Configuration()->get('search_index');
     // Setup
     /*-----------------------------------------------------------------------*/
     // look for key in GET array if it's specified
     if (!empty($config->{'get-param-prefix'})) {
         if ($config->{'get-param-prefix'} == 'param_pool') {
             $_GET = $this->_env['param'];
         } else {
             $_GET = $_GET[$config->{'get-param-prefix'}];
         }
     }
     // get input parameters from GET request
     $param_keywords = isset($_GET[$config->{'get-param-keywords'}]) ? trim($_GET[$config->{'get-param-keywords'}]) : '';
     $param_sort = isset($_GET[$config->{'get-param-sort'}]) ? $_GET[$config->{'get-param-sort'}] : $config->{'default-sort'};
     $param_direction = isset($_GET[$config->{'get-param-direction'}]) ? strtolower($_GET[$config->{'get-param-direction'}]) : $config->{'default-direction'};
     // The direction is interpolated into ORDER BY below, where escaping quotes offers
     // no protection. Anything other than asc/desc is dropped, which leaves the
     // database's default ordering (the same SQL an empty direction produced before).
     if (!in_array(strtolower((string) $param_direction), array('asc', 'desc'), true)) {
         $param_direction = '';
     }
     // set pagination on the data source
     $this->dsParamSTARTPAGE = isset($_GET[$config->{'get-param-page'}]) ? (int) $_GET[$config->{'get-param-page'}] : $this->dsParamSTARTPAGE;
     $this->dsParamLIMIT = isset($_GET[$config->{'get-param-per-page'}]) && (int) $_GET[$config->{'get-param-per-page'}] > 0 ? (int) $_GET[$config->{'get-param-per-page'}] : $config->{'default-per-page'};
     // build ORDER BY statement for later
     switch ($param_sort) {
         case 'date':
             $sql_order_by = "e.creation_date {$param_direction}";
             break;
         case 'id':
             $sql_order_by = "e.id {$param_direction}";
             break;
         default:
             $sql_order_by = "score {$param_direction}";
             break;
     }
     // Find valid sections to query
     /*-----------------------------------------------------------------------*/
     if (isset($_GET[$config->{'get-param-sections'}]) && !empty($_GET[$config->{'get-param-sections'}])) {
         $param_sections = $_GET[$config->{'get-param-sections'}];
         // allow sections to be sent as an array if the user wishes (multi-select or checkboxes);
         // the joined string has to be kept because explode() below needs a string
         if (is_array($param_sections)) {
             $param_sections = implode(',', $param_sections);
         }
     } elseif (!empty($config->{'default-sections'})) {
         $param_sections = $config->{'default-sections'};
     } else {
         $param_sections = '';
     }
     // resolve every requested handle to a section row; unknown handles are ignored
     $sections = array();
     foreach (array_map('trim', explode(',', $param_sections)) as $handle) {
         $section = Symphony::Database()->fetchRow(0, sprintf("SELECT `id`, `name` FROM `tbl_sections` WHERE handle = '%s' LIMIT 1", Symphony::Database()->cleanValue($handle)));
         if ($section) {
             $sections[$section['id']] = array('handle' => $handle, 'name' => $section['name']);
         }
     }
     if (count($sections) == 0) {
         return $this->errorXML('Invalid search sections');
     }
     // Set up and manipulate keywords
     /*-----------------------------------------------------------------------*/
     // should we apply word stemming?
     $do_stemming = $config->{'stem-words'} == 'yes' ? TRUE : FALSE;
     // replace synonyms
     $keywords = SearchIndex::applySynonyms($param_keywords);
     $keywords_boolean = SearchIndex::parseKeywordString($keywords, $do_stemming);
     $keywords_highlight = trim(implode(' ', $keywords_boolean['highlight']), '"');
     // Set up weighting
     /*-----------------------------------------------------------------------*/
     // map each index's weighting setting (0 = highest ... 4 = lowest) to a score multiplier
     $sql_weighting = '';
     foreach (SearchIndex::getIndexes() as $section_id => $index) {
         $weight = isset($index['weighting']) ? $index['weighting'] : 2;
         switch ($weight) {
             case 0:
                 // highest
                 $weight = 4;
                 break;
             case 1:
                 // high
                 $weight = 2;
                 break;
             case 2:
                 // none
                 $weight = 1;
                 break;
             case 3:
                 // low
                 $weight = 0.5;
                 break;
             case 4:
                 // lowest
                 $weight = 0.25;
                 break;
         }
         // %F, not %d: an integer conversion would turn the 0.5 and 0.25 multipliers into 0
         $sql_weighting .= sprintf("WHEN e.section_id = %d THEN %F \n", $section_id, $weight);
     }
     // Build search SQL
     /*-----------------------------------------------------------------------*/
     $mode = !is_null($config->{'mode'}) ? $config->{'mode'} : 'like';
     $mode = strtoupper($mode);
     switch ($mode) {
         case 'FULLTEXT':
             $sql = sprintf("SELECT\n\t\t\t\t\t\t\tSQL_CALC_FOUND_ROWS\n\t\t\t\t\t\t\te.id as `entry_id`,\n\t\t\t\t\t\t\tdata,\n\t\t\t\t\t\t\te.section_id as `section_id`,\n\t\t\t\t\t\t\tUNIX_TIMESTAMP(e.creation_date) AS `creation_date`,\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\tMATCH(index.data) AGAINST ('%1\$s') *\n\t\t\t\t\t\t\t\tCASE\n\t\t\t\t\t\t\t\t\t%2\$s\n\t\t\t\t\t\t\t\t\tELSE 1\n\t\t\t\t\t\t\t\tEND\n\t\t\t\t\t\t\t\t%3\$s\n\t\t\t\t\t\t\t) AS `score`\n\t\t\t\t\t\tFROM\n\t\t\t\t\t\t\ttbl_search_index as `index`\n\t\t\t\t\t\t\tJOIN tbl_entries as `e` ON (index.entry_id = e.id)\n\t\t\t\t\t\tWHERE\n\t\t\t\t\t\t\tMATCH(index.data) AGAINST ('%4\$s' IN BOOLEAN MODE)\n\t\t\t\t\t\t\tAND e.section_id IN ('%5\$s')\n\t\t\t\t\t\tORDER BY\n\t\t\t\t\t\t\t%6\$s\n\t\t\t\t\t\tLIMIT %7\$d, %8\$d", Symphony::Database()->cleanValue($keywords), $sql_weighting, $param_sort == 'score-recency' ? '/ SQRT(GREATEST(1, DATEDIFF(NOW(), creation_date)))' : '', Symphony::Database()->cleanValue($keywords), implode("','", array_keys($sections)), Symphony::Database()->cleanValue($sql_order_by), max(0, ($this->dsParamSTARTPAGE - 1) * $this->dsParamLIMIT), (int) $this->dsParamLIMIT);
             break;
         case 'LIKE':
         case 'REGEXP':
             $sql_locate = '';
             $sql_replace = '';
             $sql_where = '';
             // by default, no wildcard separators
             $prefix = '';
             $suffix = '';
             // append wildcard for LIKE
             if ($mode == 'LIKE') {
                 $prefix = $suffix = '%';
             }
             // apply word boundary separator
             if ($mode == 'REGEXP') {
                 $prefix = '[[:<:]]';
                 $suffix = '[[:>:]]';
             }
             // all words to include in the query (single words and phrases)
             foreach ($keywords_boolean['include-words-all'] as $keyword) {
                 $keyword_stem = NULL;
                 $keyword = Symphony::Database()->cleanValue($keyword);
                 if ($do_stemming) {
                     $keyword_stem = Symphony::Database()->cleanValue(PorterStemmer::Stem($keyword));
                 }
                 // if the word can be stemmed, look for the word or the stem version
                 if ($do_stemming && $keyword_stem != $keyword) {
                     $sql_where .= "(index.data {$mode} '{$prefix}{$keyword}{$suffix}' OR index.data {$mode} '{$prefix}{$keyword_stem}{$suffix}') AND ";
                 } else {
                     $sql_where .= "index.data {$mode} '{$prefix}{$keyword}{$suffix}' AND ";
                 }
                 // if this keyword exists in the entry contents, add 1 to "keywords_matched"
                 // which represents number of unique keywords in the search string that are found
                 $sql_locate .= "IF(LOCATE('{$keyword}', LOWER(`data`)) > 0, 1, 0) + ";
                 // see how many times this word is found in the entry contents by removing it from
                 // the column text then compare length to see how many times it was removed
                 $sql_replace .= "(LENGTH(`data`) - LENGTH(REPLACE(LOWER(`data`),LOWER('{$keyword}'),''))) / LENGTH('{$keyword}') + ";
             }
             // all words or phrases that we do not want
             foreach ($keywords_boolean['exclude-words-all'] as $keyword) {
                 $keyword = Symphony::Database()->cleanValue($keyword);
                 $sql_where .= "index.data NOT {$mode} '{$prefix}{$keyword}{$suffix}' AND ";
             }
             // complete the SQL fragments: close the trailing "+ " with a 0,
             // or fall back to a constant when there were no keywords
             $sql_locate = $sql_locate == '' ? '1' : $sql_locate . '0';
             $sql_replace = $sql_replace == '' ? '1' : $sql_replace . '0';
             $sql_where = $sql_where == '' ? 'NOT 1' : $sql_where;
             // trim unnecessary boolean conditions from SQL
             $sql_where = preg_replace("/ OR \$/", "", $sql_where);
             $sql_where = preg_replace("/ AND \$/", "", $sql_where);
             // if ordering by score, use a function of the two columns
             // we are calculating rather than just "score"
             if (preg_match("/^score/", $sql_order_by)) {
                 $sql_order_by = preg_replace("/^score/", "(keywords_matched * score)", $sql_order_by);
             }
             $sql = sprintf("SELECT\n\t\t\t\t\t\t\tSQL_CALC_FOUND_ROWS\n\t\t\t\t\t\t\te.id as `entry_id`,\n\t\t\t\t\t\t\tdata,\n\t\t\t\t\t\t\te.section_id as `section_id`,\n\t\t\t\t\t\t\tUNIX_TIMESTAMP(e.creation_date) AS `creation_date`,\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t%1\$s\n\t\t\t\t\t\t\t) AS keywords_matched,\n\t\t\t\t\t\t\t(\n\t\t\t\t\t\t\t\t(%2\$s)\n\t\t\t\t\t\t\t\t*\n\t\t\t\t\t\t\t\tCASE\n\t\t\t\t\t\t\t\t\t%3\$s\n\t\t\t\t\t\t\t\t\tELSE 1\n\t\t\t\t\t\t\t\tEND\n\t\t\t\t\t\t\t\t%4\$s\n\t\t\t\t\t\t\t) AS score\n\t\t\t\t\t\tFROM\n\t\t\t\t\t\t\ttbl_search_index as `index`\n\t\t\t\t\t\t\tJOIN tbl_entries as `e` ON (index.entry_id = e.id)\n\t\t\t\t\t\tWHERE\n\t\t\t\t\t\t\t%5\$s\n\t\t\t\t\t\t\tAND e.section_id IN ('%6\$s')\n\t\t\t\t\t\tORDER BY\n\t\t\t\t\t\t\t%7\$s\n\t\t\t\t\t\tLIMIT\n\t\t\t\t\t\t\t%8\$d, %9\$d", $sql_locate, $sql_replace, $sql_weighting, $param_sort == 'score-recency' ? '/ SQRT(GREATEST(1, DATEDIFF(NOW(), creation_date)))' : '', $sql_where, implode("','", array_keys($sections)), Symphony::Database()->cleanValue($sql_order_by), max(0, ($this->dsParamSTARTPAGE - 1) * $this->dsParamLIMIT), (int) $this->dsParamLIMIT);
             break;
     }
     // Add soundalikes ("did you mean?") to XML
     /*-----------------------------------------------------------------------*/
     // we have search words, check for soundalikes
     if (count($keywords_boolean['include-words-all']) > 0) {
         $sounds_like = array();
         foreach ($keywords_boolean['include-words-all'] as $word) {
             $soundalikes = Symphony::Database()->fetchCol('keyword', sprintf("SELECT keyword FROM tbl_search_index_keywords WHERE SOUNDEX(keyword) = SOUNDEX('%s')", Symphony::Database()->cleanValue($word)));
             foreach ($soundalikes as $i => &$soundalike) {
                 // the word itself is not an alternative
                 if ($soundalike == $word) {
                     unset($soundalikes[$i]);
                     continue;
                 }
                 $soundalike = array('word' => $soundalike, 'distance' => levenshtein($soundalike, $word));
             }
             // break the reference so later assignments to $soundalike cannot write into the array
             unset($soundalike);
             usort($soundalikes, array('datasourcesearch', 'sortWordDistance'));
             // first candidate after sorting; NULL when no other keyword sounds alike
             $sounds_like[$word] = isset($soundalikes[0]) ? $soundalikes[0]['word'] : NULL;
         }
         // add words to XML
         if (count($sounds_like) > 0) {
             $alternative_spelling = new XMLElement('alternative-keywords');
             foreach ($sounds_like as $word => $soundalike) {
                 // casts: the alternative may be NULL and array keys that look numeric arrive as integers
                 $alternative_spelling->appendChild(new XMLElement('keyword', NULL, array('original' => $word, 'alternative' => $soundalike, 'distance' => levenshtein((string) $soundalike, (string) $word))));
             }
             $result->appendChild($alternative_spelling);
         }
     }
     // Run search SQL!
     /*-----------------------------------------------------------------------*/
     // get our entries, returns entry IDs
     $entries = Symphony::Database()->fetch($sql);
     $total_entries = Symphony::Database()->fetchVar('total', 0, 'SELECT FOUND_ROWS() AS `total`');
     // append input values
     $result->setAttributeArray(array('keywords' => General::sanitize($keywords), 'sort' => General::sanitize($param_sort), 'direction' => General::sanitize($param_direction)));
     // append pagination
     $result->appendChild(General::buildPaginationElement($total_entries, ceil($total_entries * (1 / $this->dsParamLIMIT)), $this->dsParamLIMIT, $this->dsParamSTARTPAGE));
     // append list of sections
     $sections_xml = new XMLElement('sections');
     foreach ($sections as $id => $section) {
         $sections_xml->appendChild(new XMLElement('section', General::sanitize($section['name']), array('id' => $id, 'handle' => $section['handle'])));
     }
     $result->appendChild($sections_xml);
     // Append entries to XML, build if desired
     /*-----------------------------------------------------------------------*/
     // if true then the entire entry will be appended to the XML. If not, only
     // a "stub" of the entry ID is provided, allowing other data sources to
     // supplement with the necessary fields
     $build_entries = $config->{'build-entries'} == 'yes' ? TRUE : FALSE;
     if ($build_entries) {
         $field_pool = array();
     }
     // container for entry ID output parameter
     $param_output = array();
     foreach ($entries as $entry) {
         $param_output[] = $entry['entry_id'];
         $entry_xml = new XMLElement('entry', NULL, array('id' => $entry['entry_id'], 'section' => $sections[$entry['section_id']]['handle']));
         // add excerpt with highlighted search terms
         $excerpt = SearchIndex::parseExcerpt($keywords_highlight, $entry['data']);
         $excerpt = $this->fixEncoding($excerpt);
         $entry_xml->appendChild(new XMLElement('excerpt', $excerpt));
         // build and append entry data
         if ($build_entries) {
             // reset() takes its argument by reference, so the fetch result needs a variable
             $fetched = EntryManager::fetch($entry['entry_id']);
             $e = reset($fetched);
             $data = $e->getData();
             foreach ($data as $field_id => $values) {
                 // fetch each field object once and reuse it for later entries
                 if (!isset($field_pool[$field_id]) || !is_object($field_pool[$field_id])) {
                     $field_pool[$field_id] = FieldManager::fetch($field_id);
                 }
                 $field_pool[$field_id]->appendFormattedElement($entry_xml, $values, FALSE, !empty($values['value_formatted']) ? 'formatted' : null, $e->get('id'));
             }
         }
         $result->appendChild($entry_xml);
     }
     // send entry IDs as Output Parameters
     $param_pool['ds-' . $this->dsParamROOTELEMENT . '.id'] = $param_output;
     $param_pool['ds-' . $this->dsParamROOTELEMENT] = $param_output;
     // Log query
     /*-----------------------------------------------------------------------*/
     if ($config->{'log-keywords'} == 'yes' && trim($keywords)) {
         // collect the handles directly; reset() cannot be used as an array_map()
         // callback because it needs its argument by reference
         $section_handles = array();
         foreach ($sections as $section) {
             $section_handles[] = $section['handle'];
         }
         // has this search (keywords+sections) already been logged this session?
         // NOTE(review): the result is not used below, so every request is logged — confirm intent
         $already_logged = Symphony::Database()->fetch(sprintf("SELECT * FROM `tbl_search_index_logs` WHERE keywords='%s' AND sections='%s' AND session_id='%s'", Symphony::Database()->cleanValue($param_keywords), Symphony::Database()->cleanValue(implode(',', $section_handles)), session_id()));
         $log_sql = sprintf("INSERT INTO `tbl_search_index_logs`\n\t\t\t\t\t(date, keywords, keywords_manipulated, sections, page, results, session_id)\n\t\t\t\t\tVALUES('%s', '%s', '%s', '%s', %d, %d, '%s')", date('Y-m-d H:i:s', time()), Symphony::Database()->cleanValue($param_keywords), Symphony::Database()->cleanValue($keywords), Symphony::Database()->cleanValue(implode(',', $section_handles)), $this->dsParamSTARTPAGE, $total_entries, session_id());
         Symphony::Database()->query($log_sql);
     }
     return $result;
 }
예제 #23
0
                }
            }
            $return_array = $new_return_array;
            // Replace the previous return array by the next version
            $d_count++;
            //echo $d_count . PHP_EOL ;
        }
        return $return_array;
        // Return the exploded elements
    }
}
// Command-line check: stems every word of the sentence given as first argument
// and reports which words are present in the whitelist dictionary.
$wlw = new whitelistWords();
$wlw->init();
// fall back to an empty sentence when no argument was passed
$sentence = isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : '';
$sentenceArray = explode(" ", $sentence);
// start with an empty string so the ".=" below never appends to an undefined variable
$wordsNotFound = '';
echo time() . PHP_EOL;
echo "Going to look for words in dictionary .. " . PHP_EOL;
echo "Words passed:   ";
foreach ($sentenceArray as $word) {
    $stemmedWord = PorterStemmer::Stem($word);
    // a stem missing from the dictionary counts as "not found" instead of raising a warning
    if (isset($wlw->dictionary[$stemmedWord]) && $wlw->dictionary[$stemmedWord] == 1) {
        echo $word . "  ";
    } else {
        $wordsNotFound .= " " . $word;
    }
}
echo PHP_EOL;
echo "Words failed: " . $wordsNotFound . PHP_EOL;
echo time() . PHP_EOL;
예제 #24
0
 /**
  * Handles a search request.
  *
  * Reads the query from $_GET['q'], records every search term in the searches
  * table, scores indexed files per stemmed term and exposes the scored files
  * ('results'), the raw query ('search') and the ten most frequent search terms
  * ('popular') through $this->set().
  *
  * @return void
  */
 public function base()
 {
     $results = array();
     $query = isset($_GET['q']) ? $_GET['q'] : false;
     $search = $query;
     if ($query !== false) {
         $stemmer = new PorterStemmer();
         $index = new indexes($this->getDb());
         $file = new files($this->getDb());
         $scores = array();
         // Drop empty terms (produced by repeated spaces) once, up front: they must
         // neither be looked up in the index nor advance the position counter that
         // lowers the weight of later terms.
         $terms = array();
         foreach (explode(' ', $query) as $term) {
             if ($term != '') {
                 $terms[] = $term;
             }
         }
         $db_searches = new searches($this->getDb());
         // record each term: bump its counter when already known, insert it otherwise
         foreach ($terms as $term) {
             // NOTE(review): $term is raw request input concatenated into the WHERE clause;
             // switch to the DB layer's parameter binding/escaping (API not visible here).
             $old = $db_searches->select('*', 'WHERE term="' . $term . '"');
             if (isset($old[0]['id'])) {
                 $db_searches->update(array('count' => ++$old[0]['count'], 'date' => time()), 'WHERE term="' . $term . '"');
             } else {
                 $db_searches->insert(array('term' => $term, 'count' => 1, 'date' => time()));
             }
         }
         $term_weight = 0.05;
         $wpm_weight = 5;
         //7
         $count_weight = 150;
         //135
         $count = 0;
         foreach ($terms as $term) {
             $term = 'indx-' . $stemmer->Stem($term);
             $data = $index->select('*', 'WHERE stem="' . $term . '" ORDER BY wpm DESC, count DESC LIMIT 0, 100');
             // each later term contributes $term_weight less than the one before it
             $position_factor = 1 - $term_weight * $count;
             foreach ($data as $file_data) {
                 $file_id = $file_data['file'];
                 $wpm = $file_data['wpm'];
                 $index_count = $file_data['count'];
                 $weight = $wpm * $wpm_weight * $position_factor + $index_count * $count_weight * $position_factor;
                 if (isset($scores[$file_id])) {
                     $scores[$file_id] += $weight;
                 } else {
                     $scores[$file_id] = $weight;
                 }
             }
             $count++;
         }
         // highest score first; $results keeps this order
         arsort($scores);
         foreach ($scores as $key => $score) {
             $results[$key] = array();
             $results[$key]['score'] = $score;
         }
         $ids = array_keys($scores);
         $files = array();
         if (count($ids) > 0) {
             $files = $file->select('*', 'WHERE id IN (' . implode(', ', $ids) . ')');
         }
         // attach the file details to the scored entries
         foreach ($files as $selected) {
             $results[$selected['id']]['id'] = $selected['id'];
             $results[$selected['id']]['name'] = $selected['name'];
             $results[$selected['id']]['link'] = $selected['link'];
         }
     }
     // empty when no query was given
     $this->set('results', $results);
     $this->set('search', $search);
     $searches = new searches($this->getDb());
     $this->set('popular', $searches->select('*', 'ORDER BY count DESC LIMIT 0, 10'));
 }
예제 #25
0
 if (strlen($SearchWords[$sw]) < $MinWordLen) {
     SkipSearchWord($sw);
     continue;
 }
 $ExactPhrase = 0;
 $ExcludeTerm = 0;
 // Check exclusion searches
 if ($SearchWords[$sw][0] == "-") {
     $SearchWords[$sw] = substr($SearchWords[$sw], 1);
     $ExcludeTerm = 1;
     $exclude_count++;
 }
 // Stem the words if necessary (only AFTER stripping exclusion char)
 if ($UseStemming == 1) {
     if ($AllowExactPhrase == 0 || strpos($SearchWords[$sw], " ") === false) {
         $SearchWords[$sw] = $porterStemmer->Stem($SearchWords[$sw]);
     }
 }
 if ($AllowExactPhrase == 1 && strpos($SearchWords[$sw], " ") !== false) {
     // Initialise exact phrase matching for this search term
     $ExactPhrase = 1;
     $phrase_terms = explode(" ", $SearchWords[$sw]);
     //$phrase_terms = preg_split("/\W+/", $SearchWords[$sw], -1, 0 /*PREG_SPLIT_DELIM_CAPTURE*/);
     $num_phrase_terms = count($phrase_terms);
     if ($num_phrase_terms > $context_maxgoback) {
         $context_maxgoback = $num_phrase_terms;
     }
     $phrase_terms_data = array();
     if ($UseStemming == 1) {
         for ($j = 0; $j < $num_phrase_terms; $j++) {
             $phrase_terms[$j] = $porterStemmer->Stem($phrase_terms[$j]);
예제 #26
0
	/**
	 * Generates word stems that are added to the text
	 *
	 * Words of one to three characters are removed first; the remaining text is
	 * split on whitespace and every token whose stem differs from the token
	 * itself contributes that stem.
	 *
	 * @since 1.1
	 *
	 * @param string $text The text to stem
	 * @return string The text plus the generated word stems
	 **/
	static function StemFilter ($text) {
		// Filter out short words for stemming
		$source = preg_replace("/\b\w{1,3}\b/",'',$text);
		// Split on any whitespace, not just spaces, so that words separated by
		// newlines or tabs are stemmed individually; empty pieces are discarded
		$tokens = preg_split('/\s+/', $source, -1, PREG_SPLIT_NO_EMPTY);
		$_ = array();
		foreach ($tokens as $token) {
			$stem = PorterStemmer::Stem($token);
			if ($stem != $token) $_[] = $stem;
		}
		return $text.' '.join(' ',$_);
	}
 //(IF CLUSTERING IS CHOSEN:)
 if ($_POST['result_type'] === 'Clustered') {
     //AGGREGATE the Results using Borda-Fuse
     //NOTE THAT THE ORDER THE ARRAYS ARE SENT IN TO BE AGGREGATED MATTERS
     //FOR ANY RESULTS THAT HAVE THE SAME COMBINED SCORE/RANK
     aggregateResults($googleResults, $bingResults, $blekkoResults, $aggregatedResults);
     //Remove Stop Words
     foreach ($aggregatedResults as $key => &$value) {
         $value[5] = prepareSnippet($value[2]);
     }
     //Stem the results
     //$stem = PorterStemmer::Stem($value);
     foreach ($aggregatedResults as $key => &$value) {
         $tempArray = explode(" ", $value[5]);
         foreach ($tempArray as &$tempValue) {
             $tempValue = PorterStemmer::Stem($tempValue);
         }
         $value[5] = implode(" ", $tempArray);
     }
     //Get Coordinates
     $wordCollection = array();
     getCoordinates($aggregatedResults, $wordCollection);
     //Cluster the results
     $cluster1 = array();
     $cluster2 = array();
     $cluster3 = array();
     $cluster4 = array();
     fillClusters($aggregatedResults, $cluster1, $cluster2, $cluster3, $cluster4);
     //*********************************************************************
     //EXPERIMENTAL ATTEMPT AT NAMING CLUSTERS
     //Remove Stop Words from clusters
예제 #28
0
    $charset = $result['charset'];
    echo "This site is encoded with" . " " . "<b>" . $charset . "</b>" . " " . "format" . "<br>";
    $utf8_text = $result['utf8_text'];
    $text = strip_html_tags($page);
    $utf8_text = html_entity_decode($text, ENT_QUOTES, "utf-8");
    $utf8_text = strip_punctuation($utf8_text);
    $utf8_text = strip_symbols($utf8_text);
    $utf8_text = strip_numbers($utf8_text);
    mb_regex_encoding("utf-8");
    $words = mb_split(' +', $utf8_text);
    foreach ($words as $key => $word) {
        $words[$key] = PorterStemmer::Stem($word, true);
    }
    $stopWords = mb_split('[ \\n]+', mb_strtolower($words[$key], 'utf-8'));
    foreach ($stopWords as $key => $word) {
        $stopWords[$key] = PorterStemmer::Stem($word, true);
    }
    $words = array_diff($words, $stopWords);
    $keywordCounts = array_count_values($words);
    arsort($keywordCounts, SORT_NUMERIC);
    $uniqueKeywords = array_keys($keywordCounts);
    echo "The keywords are" . "<br>";
    foreach ($uniqueKeywords as $value) {
        echo "-" . $value;
        echo "<br>";
    }
}
?>

<body>
	<div id="content">
예제 #29
0
function macro_Keywords($formatter, $value, $options = array())
{
    global $DBInfo;
    $supported_lang = array('ko');
    $limit = isset($options['limit']) ? $options['limit'] : 40;
    $opts = explode(',', $value);
    $sort = '';
    foreach ($opts as $opt) {
        $opt = trim($opt);
        if ($opt == 'delicious' or $opt == 'del.icio.us') {
            $tag_link = 'http://del.icio.us/tag/$TAG';
        } else {
            if ($opt == 'technorati') {
                $tag_link = 'http://www.technorati.com/tag/$TAG';
            } else {
                if ($opt == 'flickr') {
                    $tag_link = 'http://www.flickr.com/photos/tags/$TAG';
                } else {
                    if ($opt == 'all') {
                        $options['all'] = 1;
                        $limit = 0;
                    } else {
                        if ($opt == 'random') {
                            $options['random'] = $options['all'] = 1;
                        } else {
                            if ($opt == 'suggest') {
                                $options['suggest'] = 1;
                            } else {
                                if ($opt == 'tour') {
                                    $options['tour'] = 1;
                                } else {
                                    if ($opt == 'cloud') {
                                        $options['cloud'] = 1;
                                    } else {
                                        if ($opt == 'freq') {
                                            $sort = 'freq';
                                        } else {
                                            if (($p = strpos($opt, '=')) !== false) {
                                                $k = substr($opt, 0, $p);
                                                $v = substr($opt, $p + 1);
                                                if ($k == 'limit') {
                                                    $limit = $v;
                                                } else {
                                                    if ($k == 'random') {
                                                        $options['all'] = 1;
                                                        $v = (int) $v;
                                                        $v = $v > 0 ? $v : 1;
                                                        $options['random'] = $v;
                                                    } else {
                                                        if ($k == 'sort' and in_array($v, array('freq', 'alpha'))) {
                                                            $sort = $v;
                                                        } else {
                                                            if ($k == 'type' and in_array($v, array('full', 'title'))) {
                                                                $search = $v . 'search';
                                                            } else {
                                                                if ($k == 'url') {
                                                                    $tag_link = $v;
                                                                    if (preg_match('/\\$TAG/', $tag_link) === false) {
                                                                        $tag_link .= '$TAG';
                                                                    }
                                                                }
                                                            }
                                                        }
                                                    }
                                                }
                                                // else ignore
                                            } else {
                                                $pagename = $opt;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    if (isset($options['random']) and empty($limit)) {
        $limit = 0;
    }
    if (isset($options['sort']) and $options['sort'] == 'freq') {
        $sort = 'freq';
    }
    if (empty($pagename)) {
        $pagename = $formatter->page->name;
    }
    # get cached keywords
    $cache = new Cache_text('keyword');
    $pkey = $pagename;
    $mc = new Cache_text('macro');
    $mkey = 'Keywords.' . md5($pagename . $value);
    $mykeys = array();
    # check cache mtime
    $cmt = $mc->mtime($mkey);
    $pmt = $cache->mtime($pkey);
    if ($cmt > $pmt) {
        # check update or not
        $dmt = $cache->mtime();
        if ($dmt > $cmt) {
            # XXX crude method
            $mykeys = array();
        } else {
            $mykeys = $mc->fetch($mkey);
        }
    } else {
        $mc->remove($mkey);
    }
    if (!$mykeys) {
        if (!empty($options['all'])) {
            $pages = $DBInfo->getPageLists();
        } else {
            $pages = array($pagename);
        }
        foreach ($pages as $pn) {
            if ($keys = $cache->fetch($pn)) {
                $mykeys = array_merge($mykeys, $keys);
            }
        }
        $mc->update($mkey, $mykeys);
    }
    if (!empty($options['all'])) {
        $use_sty = 1;
        $words = array_count_values($mykeys);
        unset($words['']);
        $ncount = array_sum($words);
        // total count
        arsort($words);
        $max = current($words);
        // get max hit number
        if (!empty($options['random'])) {
            $rws = array();
            $selected = array_rand($words, min($options['random'], count($words)));
            foreach ($selected as $k) {
                $rws[$k] = $words[$k];
            }
            $words =& $rws;
        }
        if ($sort != 'freq') {
            ksort($words);
        }
        #sort($words);
        #print $sort." $value";
        #print "<pre>";
        #print_r($words);
        #print "</pre>";
    } else {
        $max = 3;
        // default weight
        $words = array();
        foreach ($mykeys as $key) {
            $words[$key] = $max;
            // give weight to all selected keywords
        }
    }
    # automatically generate list of keywords
    if (empty($options['all']) and (empty($words) or isset($options['suggest']))) {
        $common = <<<EOF
am an a b c d e f g h i j k l m n o p q r s t u v w x y z
0 1 2 3 4 5 6 7 8 9
if on in by it at up as down over into for from to of he his him she her back
is are be being been or no not nor and all through under until
these there the top
with here only has had both did faw few little most almost much off on out
also each were was too any very more within then
across before behind beneath beyond after again against around among
so such since because but yet however ever during
it its the this that what where how when who whoever which their them
you your will shall may might we us our
get got would could have
can't won't didn't don't
aiff arj arts asp au avi bin biz css cgi com doc edu exe firm gif gz gzip
htm html info jpeg jpg js jsp mp3 mpeg mpg mov
nom pdf php pl qt ra ram rec shop sit tar tgz tiff txt wav web zip
one two three four five six seven eight nine ten eleven twelve
ftp http https www web net org or kr co us de
EOF;
        $page = $DBInfo->getPage($pagename);
        if (!$page->exists()) {
            return '';
        }
        $raw = $page->get_raw_body();
        $raw = rtrim($raw);
        // strip macros, entities
        $raw = preg_replace("/&[^;\\s]+;|\\[\\[[^\\[]+\\]\\]/", ' ', $raw);
        $raw = preg_replace("/^##.*\$/m", ' ', $raw);
        $raw = preg_replace("/([;\"',`\\\\\\/\\.:@#\\!\\?\$%\\^&\\*\\(\\)\\{\\}\\[\\]\\-_\\+=\\|<>])/", ' ', strip_tags($raw . ' ' . $pagename));
        // pagename also
        $raw = preg_replace("/((?<=[a-z0-9]|[B-Z]{2})([A-Z][a-z]))/", " \\1", $raw);
        $raw = strtolower($raw);
        $raw = preg_replace("/\\b/", ' ', $raw);
        //$raw=preg_replace("/\b([0-9a-zA-Z'\"])\\1+\s*/",' ',$raw);
        $words = preg_split("/\\s+|\n/", $raw);
        // remove common words
        $common_word_page0 = LOCAL_KEYWORDS . '/CommonWords';
        $lines0 = array();
        if ($DBInfo->hasPage($common_word_page0)) {
            $p = $DBInfo->getPage($common_word_page0);
            $lines0 = explode("\n", $p->get_raw_body());
        }
        $lang = isset($formatter->pi['#language']) ? $formatter->pi['#language'] : $DBInfo->default_language;
        if ($lang and in_array($lang, $supported_lang)) {
            $common_word_page = LOCAL_KEYWORDS . '/CommonWords' . ucfirst($lang);
            if ($DBInfo->hasPage($common_word_page)) {
                $p = $DBInfo->getPage($common_word_page);
                $lines = explode("\n", $p->get_raw_body());
                $lines = array_merge($lines, $lines0);
                foreach ($lines as $line) {
                    if (isset($line[0]) and $line[0] == '#') {
                        continue;
                    }
                    $common .= "\n" . $line;
                }
                $common = rtrim($common);
            }
        }
        $words = array_diff($words, preg_split("/\\s+|\n/", $common));
        while (!empty($DBInfo->use_stemmer)) {
            include_once dirname(__FILE__) . '/../lib/stemmer.ko.php';
            include_once dirname(__FILE__) . '/../lib/stemmer.php';
            $indexer = new KoreanStemmer();
            if (!is_resource($indexer->_dict)) {
                break;
            }
            $founds = array();
            foreach ($words as $key) {
                if (preg_match('/^[a-zA-Z0-9]+$/', $key)) {
                    // ignore alphanumeric
                    $stem = PorterStemmer::Stem($key);
                    $founds[] = $stem;
                    continue;
                }
                $match = null;
                $stem = $indexer->getStem(trim($key), $match, $type);
                if (!empty($stem)) {
                    $founds[] = $stem;
                } else {
                    if (!empty($last)) {
                        //print_r($match);
                    }
                }
            }
            $words = $founds;
            $indexer->close();
            break;
        }
        $preword = '';
        $bigwords = array();
        foreach ($words as $word) {
            if (strlen($word) > 2 and strlen($preword) > 2) {
                if ($word == $preword) {
                    continue;
                }
                $key = $preword . ' ' . $word;
                $rkey = $word . ' ' . $preword;
                if (isset($bigwords[$key])) {
                    $bigwords[$key]++;
                } else {
                    if (isset($bigwords[$rkey])) {
                        $bigwords[$rkey]++;
                    } else {
                        $bigwords[$key] = 1;
                    }
                }
            }
            $preword = $word;
        }
        $words = array_count_values($words);
        unset($words['']);
        $ncount = array_sum($words);
        // total count
        /*   
            $words=array_diff(array_keys($counts),preg_split("/\s+|\n/",$common));
        
            if (function_exists('array_intersect_key')) {
                $words=array_intersect_key($counts,$words);
            } else {
                $ret = array();
                foreach($words as $key) {
                    if(array_key_exists($key, $counts))
                        $ret[$key] = $counts[$key];
                }
                $words=&$ret;
            }
        */
        if ($bigwords) {
            //
            $bigwords = array_filter($bigwords, create_function('$a', 'return ($a != 1);'));
            foreach ($bigwords as $k => $v) {
                $words["{$k}"] = $v;
            }
        }
        arsort($words);
        $max = current($words);
        // get max hit number
        $nwords = array();
        if (isset($options['merge'])) {
            foreach ($mykeys as $key) {
                $nwords[$key] = $max;
                // give weight to all selected keywords
            }
        }
        if ($nwords) {
            foreach ($nwords as $k => $v) {
                $words[$k] = $v;
            }
        }
        $use_sty = 1;
    }
    //
    if (!empty($options['call'])) {
        return $words;
    }
    if ($limit and ($sz = sizeof($words)) > $limit) {
        arsort($words);
        $mywords = array_keys($words);
        $mywords = array_slice($mywords, 0, $limit);
        $nwords = array();
        foreach ($mywords as $k) {
            $nwords[$k] = $words[$k];
        }
        $words =& $nwords;
    }
    // make criteria list
    $fz = 0;
    $min = 0;
    $sty = array();
    if (!empty($use_sty)) {
        $fact = array();
        $weight = $max;
        // $ncount
        #print 'max='.$max.' ratio='.$weight/$ncount.':';
        $test = array(0.8, 0.6, 0.4, 0.5, 0.5, 0.5);
        // six level
        for ($i = 0; $i < 6 and $weight > 0; $i++) {
            $weight = (int) ($weight * $test[$i]);
            if ($weight > 0) {
                $fact[] = $weight;
            }
            #print $weight.'--';
        }
        $max = current($fact);
        $min = $limit ? max(1, end($fact)) - 1 : 0;
        // XXX
        // make font-size style
        $fz = max(sizeof($fact), 2);
        $fsh = (MAX_FONT_SZ - MIN_FONT_SZ) / ($fz - 1);
        $fs = MAX_FONT_SZ;
        // max font-size:24px;
        for ($i = 0; $i < $fz; $i++) {
            $ifs = (int) ($fs + 0.5);
            $sty[] = " style='font-size:{$ifs}px;'";
            #print '/'.$ifs;
            $fs -= $fsh;
            $fs = max($fs, 9);
            // min font-size:9px
        }
    }
    if (empty($sort) or $sort != 'freq') {
        ksort($words);
    }
    $link = $formatter->link_url(_rawurlencode($pagename), '');
    if (!isset($tag_link)) {
        if (empty($search)) {
            $search = 'fullsearch&amp;keywords=1';
        }
        if (!empty($options['tour'])) {
            $search = 'tour&amp;arena=keylinks';
        }
        $tag_link = $formatter->link_url(_rawurlencode($pagename), '?action=' . $search . '&amp;value=$TAG');
    }
    $out = '';
    if (!empty($options['add'])) {
        $out = "<form method='post' action='{$link}'>\n";
        $out .= "<input type='hidden' name='action' value='keywords' />\n";
    }
    if (isset($options['cloud'])) {
        $out = '';
        foreach ($words as $key => $val) {
            $style = $sty[$fz - 1];
            for ($i = 0; $i < $fz; $i++) {
                if ($val > $fact[$i]) {
                    $style = $sty[$i];
                    break;
                }
            }
            if ($val > $min) {
                $out .= "<a href='" . qualifiedUrl(str_replace('$TAG', $key, $tag_link)) . "'";
                if ($use_sty) {
                    $out .= ' ' . $style;
                } else {
                    $out .= " style='12'";
                }
                $out .= ">" . $key . "</a>";
            }
        }
        $out = preg_replace('/&amp;/', urlencode('&'), $out);
        $tout = "<a href='http://www.roytanck.com/tag1' style='font-size:20px'>Tag name</a><a href='http://www.roytanck.com/tag2' style='font-size:10px'>Tag two</a>";
        $formatter->register_javascripts(array('js/swfobject.js'));
        $_swf_prefix = qualifiedUrl("{$DBInfo->url_prefix}/local/wp-cumulus");
        // FIXME
        return <<<SWF
<script type="text/javascript">
var flashvars = {
   mode : "tags",
   distr : "true",
   tcolor : "0xffffff",
   tcolor2 : "0x86B9F2",
   hicolor : "0xBAD8F8",
   tagcloud : "<tags>{$out}</tags>"
};

var params = {
   wmode: "opaque",
   bgcolor: "#333333"
};

var attrs = {
   id: "myCloudContent"
};

swfobject.embedSWF("{$_swf_prefix}/tagcloud.swf", "myCloud", "200", "200", "9.0.0","expressInstall.swf", flashvars, params, attrs);
</script>
<div id="myCloud">
</div>
SWF;
    }
    $out .= '<ul>';
    $checkbox = '';
    foreach ($words as $key => $val) {
        $style = '';
        if ($fz > 0) {
            $style = $sty[$fz - 1];
            for ($i = 0; $i < $fz; $i++) {
                if ($val > $fact[$i]) {
                    $style = $sty[$i];
                    break;
                }
            }
        }
        if ($val > $min) {
            $checked = '';
            if ($val >= $max) {
                $checked = 'checked="checked"';
                $ok = 1;
            }
            if (!empty($options['add'])) {
                $checkbox = "<input type='checkbox' {$checked} name='key[]' " . "value='{$key}' />";
            }
            $out .= " <li class=\"tag-item\"";
            if (!empty($use_sty)) {
                $out .= " {$style} title=\"{$val} " . _("hits") . '"';
            }
            $out .= ">{$checkbox}" . "<a href='" . str_replace('$TAG', $key, $tag_link) . "' rel='nofollow'>" . $key . "</a></li>\n";
        }
    }
    $inp = '';
    $form_close = '';
    if (!empty($options['add'])) {
        $msg = _("add keywords");
        $inp = "<li><input type='text' name='keywords' size='12' />: {$msg}</li>";
        if ($ok) {
            $btn = _("Update keywords");
        } else {
            $btn = _("Add keywords");
        }
        $btn1 = _("Add as common words");
        $btn2 = _("Unselect all");
        $btnc = _("Suggest new Keywords");
        $form_close = "<input type='submit' value='{$btn}'/>\n";
        $form_close .= "<input type='submit' name='suggest' value='{$btnc}' />\n";
        $form_close .= "<input type='submit' name='common' value='{$btn1}' />\n";
        $form_close .= "<input type='button' value='{$btn2}' onClick='UncheckAll(this)' />\n";
        $form_close .= "<select name='lang'><option>---</option>\n";
        foreach ($supported_lang as $l) {
            $form_close .= "<option value='{$l}'>{$l}</option>\n";
        }
        $langmsg = _("select language");
        $form_close .= "</select>: {$langmsg}\n</form>\n";
        $form_close .= <<<EOF
<script type='text/javascript' src='{$DBInfo->url_prefix}/local/checkbox.js'>
</script>
EOF;
    }
    return "<div class='cloudView'>" . $out . "{$inp}</ul></div>{$form_close}";
}
예제 #30
0
 /**
  * Reduce every usable token to its stem with the Porter stemmer.
  *
  * Each token is right-trimmed BEFORE it is tested for emptiness, so a token
  * consisting only of trailing whitespace is dropped rather than stemmed as
  * an empty string. The emptiness test is a strict comparison against '',
  * which keeps the token "0" (a plain empty() check treats "0" as empty and
  * silently discarded it).
  *
  * @param array $token_array Tokens to stem. Null and array entries are skipped.
  * @return array Values returned by the stemmer, in input order, indexed from 0.
  */
 public function stemming($token_array)
 {
     // NOTE(review): a path starting with './' is resolved against the current
     // working directory, not this file's directory — confirm callers always
     // run from the directory that holds PorterStemmer.php.
     require_once './PorterStemmer.php';
     $p_stemmer = new PorterStemmer();
     $token_stem_array = array();
     foreach ($token_array as $token) {
         // Entries that cannot be meaningfully turned into a word are ignored.
         if ($token === null || is_array($token)) {
             continue;
         }
         // Trim first so the emptiness test sees what would actually be stemmed.
         $token = rtrim((string) $token);
         if ($token === '') {
             continue;
         }
         $token_stem_array[] = $p_stemmer->Stem($token);
     }
     return $token_stem_array;
 }