Exemplo n.º 1
0
/**
 * Decide whether a word is acceptable for indexing or for searching.
 *
 * Keywords and CJK words are rejected for indexing but accepted for searching.
 * Stopwords are always rejected. Any other word is accepted when its length,
 * not counting the % and * wildcard characters, lies between
 * PUN_SEARCH_MIN_WORD and PUN_SEARCH_MAX_WORD (inclusive).
 *
 * The stopword list is memoised in a static variable. It is read from $cache
 * and, on a cache miss, rebuilt from every lang/<dir>/stopwords.txt file.
 *
 * @param string $word The word to validate.
 * @param bool   $idx  Truthy when validating for indexing, falsy when validating a search term.
 *
 * @return bool True when the word is valid in the requested mode.
 */
function validate_search_word($word, $idx)
{
    global $cache;
    static $stopwords;

    // If the word is a keyword we don't want to index it, but we do want to be allowed to search it
    if (is_keyword($word)) {
        return !$idx;
    }

    if (!isset($stopwords)) {
        $cache_id = generate_stopwords_cache_id();
        $stopwords = $cache->get('stopwords.' . $cache_id);

        if ($stopwords === Flux_Cache::NOT_FOUND) {
            $stopwords = array();
            $d = dir(PUN_ROOT . 'lang');

            // dir() returns false when the directory cannot be opened; in that
            // case fall back to an empty list and do not cache the failure.
            if ($d !== false) {
                while (($entry = $d->read()) !== false) {
                    // Skip ".", ".." and hidden entries
                    if ($entry[0] === '.') {
                        continue;
                    }

                    $stopwords_file = PUN_ROOT . 'lang/' . $entry . '/stopwords.txt';
                    if (is_dir(PUN_ROOT . 'lang/' . $entry) && file_exists($stopwords_file)) {
                        // file() returns false on a read error; merging false would raise an error
                        $lines = file($stopwords_file);
                        if ($lines !== false) {
                            $stopwords = array_merge($stopwords, $lines);
                        }
                    }
                }
                $d->close();

                // Tidy up and filter the stopwords
                $stopwords = array_map('pun_trim', $stopwords);
                $stopwords = array_filter($stopwords);

                $cache->set('stopwords.' . $cache_id, $stopwords);
            }
        }
    }

    // If it is a stopword it isn't valid. Compare strictly as strings so that
    // numeric-looking words (e.g. "1e3" vs "1000") are not treated as equal.
    if (in_array((string) $word, $stopwords, true)) {
        return false;
    }

    // If the word is CJK we don't want to index it, but we do want to be allowed to search it
    if (is_cjk($word)) {
        return !$idx;
    }

    // Exclude % and * when checking whether current word is valid
    $word = str_replace(array('%', '*'), '', $word);

    // Check the word is within the min/max length
    $num_chars = pun_strlen($word);

    return $num_chars >= PUN_SEARCH_MIN_WORD && $num_chars <= PUN_SEARCH_MAX_WORD;
}
Exemplo n.º 2
0
/**
 * Build the combined stopword list and write it to the stopwords cache file.
 *
 * Collects the lines of every lang/<dir>/stopwords.txt file, trims them with
 * pun_trim, drops empty entries, and writes the result as PHP code to
 * 'cache_stopwords.php' via fluxbb_write_cache_file(). The generated file
 * records the current generate_stopwords_cache_id() value and returns early
 * when that value no longer matches.
 *
 * If the lang directory or an individual stopwords file cannot be read, the
 * unreadable part is skipped and the cache file is still written with
 * whatever was collected.
 *
 * @return void
 */
function generate_stopwords_cache()
{
    $stopwords = array();

    $d = dir(PUN_ROOT . 'lang');

    // dir() returns false when the directory cannot be opened
    if ($d !== false) {
        while (($entry = $d->read()) !== false) {
            // Skip ".", ".." and hidden entries
            if ($entry[0] === '.') {
                continue;
            }

            $stopwords_file = PUN_ROOT . 'lang/' . $entry . '/stopwords.txt';
            if (is_dir(PUN_ROOT . 'lang/' . $entry) && file_exists($stopwords_file)) {
                // file() returns false on a read error; merging false would raise an error
                $lines = file($stopwords_file);
                if ($lines !== false) {
                    $stopwords = array_merge($stopwords, $lines);
                }
            }
        }
        $d->close();
    }

    // Tidy up and filter the stopwords
    $stopwords = array_map('pun_trim', $stopwords);
    $stopwords = array_filter($stopwords);

    // Output stopwords as PHP code
    $content = '<?php' . "\n\n" . '$cache_id = \'' . generate_stopwords_cache_id() . '\';' . "\n" . 'if ($cache_id != generate_stopwords_cache_id()) return;' . "\n\n" . 'define(\'PUN_STOPWORDS_LOADED\', 1);' . "\n\n" . '$stopwords = ' . var_export($stopwords, true) . ';' . "\n\n" . '?>';
    fluxbb_write_cache_file('cache_stopwords.php', $content);
}
Exemplo n.º 3
0
<?php

// Identifier recorded when this cache file was generated.
$cache_id = '85b4ff4713ce34ba200ed0c8a5b1180bd9e86c3c';

// Mark the stopwords as loaded only while the recorded identifier still
// matches the current one; otherwise stop processing this file here.
if ($cache_id == generate_stopwords_cache_id()) {
    define('PUN_STOPWORDS_LOADED', 1);
} else {
    return;
}
$stopwords = array(0 => 'about', 1 => 'after', 2 => 'ago', 3 => 'all', 4 => 'almost', 5 => 'along', 6 => 'also', 7 => 'any', 8 => 'anybody', 9 => 'anywhere', 10 => 'are', 11 => 'arent', 12 => 'aren\'t', 13 => 'around', 14 => 'ask', 15 => 'been', 16 => 'before', 17 => 'being', 18 => 'between', 19 => 'but', 20 => 'came', 21 => 'can', 22 => 'cant', 23 => 'can\'t', 24 => 'come', 25 => 'could', 26 => 'couldnt', 27 => 'couldn\'t', 28 => 'did', 29 => 'didnt', 30 => 'didn\'t', 31 => 'does', 32 => 'doesnt', 33 => 'doesn\'t', 34 => 'dont', 35 => 'don\'t', 36 => 'each', 37 => 'either', 38 => 'else', 39 => 'even', 40 => 'every', 41 => 'everybody', 42 => 'everyone', 43 => 'find', 44 => 'for', 45 => 'from', 46 => 'get', 47 => 'going', 48 => 'gone', 49 => 'got', 50 => 'had', 51 => 'has', 52 => 'have', 53 => 'havent', 54 => 'haven\'t', 55 => 'having', 56 => 'her', 57 => 'here', 58 => 'hers', 59 => 'him', 60 => 'his', 61 => 'how', 62 => 'ill', 63 => 'i\'ll', 64 => 'i\'m', 65 => 'into', 66 => 'isnt', 67 => 'isn\'t', 68 => 'itll', 69 => 'it\'ll', 70 => 'its', 71 => 'it\'s', 72 => 'ive', 73 => 'i\'ve', 74 => 'just', 75 => 'know', 76 => 'less', 77 => 'like', 78 => 'make', 79 => 'many', 80 => 'may', 81 => 'more', 82 => 'most', 83 => 'much', 84 => 'must', 85 => 'near', 86 => 'never', 87 => 'none', 88 => 'nothing', 89 => 'now', 90 => 'off', 91 => 'often', 92 => 'once', 93 => 'one', 94 => 'only', 95 => 'other', 96 => 'our', 97 => 'ours', 98 => 'our\'s', 99 => 'out', 100 => 'over', 101 => 'please', 102 => 'rather', 103 => 'really', 104 => 'said', 105 => 'see', 106 => 'she', 107 => 'should', 108 => 'small', 109 => 'some', 110 => 'something', 111 => 'sometime', 112 => 'somewhere', 113 => 'take', 114 => 'than', 115 => 'thank', 116 => 'thanks', 117 => 'that', 118 => 'thats', 119 => 'that\'s', 120 => 'the', 121 => 'their', 122 => 'theirs', 123 => 'them', 124 => 'then', 125 => 'there', 126 => 'these', 127 => 'they', 128 => 'thing', 129 => 'think', 130 => 'this', 131 => 'those', 132 => 'though', 
133 => 'through', 134 => 'thus', 135 => 'too', 136 => 'true', 137 => 'two', 138 => 'under', 139 => 'until', 140 => 'upon', 141 => 'use', 142 => 'very', 143 => 'want', 144 => 'was', 145 => 'way', 146 => 'well', 147 => 'were', 148 => 'what', 149 => 'when', 150 => 'where', 151 => 'which', 152 => 'who', 153 => 'whom', 154 => 'whose', 155 => 'why', 156 => 'will', 157 => 'with', 158 => 'within', 159 => 'without', 160 => 'would', 161 => 'yes', 162 => 'yet', 163 => 'you', 164 => 'your', 165 => 'youre', 166 => 'you\'re', 167 => 'yours', 168 => 'http', 169 => 'https', 170 => 'ftp', 171 => 'www', 172 => 'com', 173 => 'net', 174 => 'org', 175 => 'afin', 176 => 'ainsi', 177 => 'alors', 178 => 'après', 179 => 'aucun', 180 => 'aucune', 181 => 'auprès', 182 => 'auquel', 183 => 'aussi', 184 => 'autant', 185 => 'aux', 186 => 'avec', 187 => 'car', 188 => 'ceci', 189 => 'cela', 190 => 'celle', 191 => 'celles', 192 => 'celui', 193 => 'cependant', 194 => 'ces', 195 => 'cet', 196 => 'cette', 197 => 'ceux', 198 => 'chacun', 199 => 'chacune', 200 => 'chaque', 201 => 'chez', 202 => 'comme', 203 => 'comment', 204 => 'dans', 205 => 'des', 206 => 'donc', 207 => 'donné', 208 => 'dont', 209 => 'duquel', 210 => 'dès', 211 => 'déjà', 212 => 'elle', 213 => 'elles', 214 => 'encore', 215 => 'entre', 216 => 'étant', 217 => 'etc', 218 => 'été', 219 => 'eux', 220 => 'furent', 221 => 'grâce', 222 => 'hors', 223 => 'ici', 224 => 'ils', 225 => 'jusqu', 226 => 'les', 227 => 'leur', 228 => 'leurs', 229 => 'lors', 230 => 'lui', 231 => 'mais', 232 => 'malgré', 233 => 'mes', 234 => 'mien', 235 => 'mienne', 236 => 'miennes', 237 => 'miens', 238 => 'moins', 239 => 'moment', 240 => 'mon', 241 => 'même', 242 => 'mêmes', 243 => 'non', 244 => 'nos', 245 => 'notre', 246 => 'notres', 247 => 'nous', 248 => 'notre', 249 => 'oui', 250 => 'par', 251 => 'parce', 252 => 'parmi', 253 => 'plus', 254 => 'pour', 255 => 'près', 256 => 'puis', 257 => 'puisque', 258 => 'quand', 259 => 'quant', 260 => 'que', 261 => 'quel', 262 => 
'quelle', 263 => 'quelque', 264 => 'quelquun', 265 => 'quelques', 266 => 'quels', 267 => 'qui', 268 => 'quoi', 269 => 'sans', 270 => 'sauf', 271 => 'selon', 272 => 'ses', 273 => 'sien', 274 => 'sienne', 275 => 'siennes', 276 => 'siens', 277 => 'soi', 278 => 'soit', 279 => 'sont', 280 => 'sous', 281 => 'suis', 282 => 'sur', 283 => 'tandis', 284 => 'tant', 285 => 'tes', 286 => 'tienne', 287 => 'tiennes', 288 => 'tiens', 289 => 'toi', 290 => 'ton', 291 => 'tous', 292 => 'tout', 293 => 'toute', 294 => 'toutes', 295 => 'trop', 296 => 'très', 297 => 'une', 298 => 'vos', 299 => 'votre', 300 => 'vous', 301 => 'étaient', 302 => 'était', 303 => 'étant', 304 => 'être');