Example #1
/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup with the fulltext index, then
 * evaluates the instructions of the found pages
 *
 * Aborts after $max found results
 */
function ft_mediause($id, $max)
{
    // need to find at least one
    if (!$max) {
        $max = 1;
    }
    $result = array();
    // quick lookup of the mediafile
    // FIXME use metadata key lookup
    $media = noNS($id);
    $matches = idx_lookup(idx_tokenizer($media));
    $docs = array_keys(ft_resultCombine(array_values($matches)));
    if (!count($docs)) {
        return $result;
    }
    // go through all found pages
    $found = 0;
    $pcre = preg_quote($media, '/');
    foreach ($docs as $doc) {
        $ns = getNS($doc);
        preg_match_all('/\\{\\{([^|}]*' . $pcre . '[^|}]*)(\\|[^}]+)?\\}\\}/i', rawWiki($doc), $matches);
        foreach ($matches[1] as $img) {
            $img = trim($img);
            // skip external images
            if (preg_match('/^https?:\\/\\//i', $img)) {
                continue;
            }
            // remove any parameters
            list($img) = explode('?', $img);
            // resolve the possibly relative img
            resolve_mediaid($ns, $img, $exists);
            if ($img == $id) {
                // we have a match
                $result[] = $doc;
                $found++;
                break;
            }
        }
        if ($found >= $max) {
            break;
        }
    }
    sort($result);
    return $result;
}
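A minimal usage sketch (the media ID 'wiki:logo.png' and the limit of 10 are hypothetical values, not from the original source):

// list up to 10 pages that embed the hypothetical media file 'wiki:logo.png'
$pages = ft_mediause('wiki:logo.png', 10);
foreach ($pages as $page) {
    echo $page . "\n"; // page IDs, sorted alphabetically by the sort() above
}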
Example #2
/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * @author Kazutaka Miyasaka <*****@*****.**>
 */
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false)
{
    $parsed = '';
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $words = preg_split('/(' . IDX_ASIAN . '+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        foreach ($words as $word) {
            if (preg_match('/' . IDX_ASIAN . '/u', $word)) {
                $phrase_mode = true;
            }
            $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
        }
    } else {
        $term_noparen = str_replace(array('(', ')'), ' ', $term);
        $words = idx_tokenizer($term_noparen, $stopwords, true);
        if (empty($words)) {
            $parsed = '()'; // important: do not remove
        } elseif ($words[0] === $term) {
            $parsed = '(W+:' . $words[0] . ')';
        } elseif ($phrase_mode) {
            // W_: no need to highlight
            $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
            $parsed = '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))';
        } else {
            $parsed = '((W+:' . implode(')(W+:', $words) . '))';
        }
    }
    return $parsed;
}
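A rough illustration of the intermediate representation, assuming an empty stopword list and that idx_tokenizer() splits 'foo-bar' into the tokens 'foo' and 'bar' (actual tokens depend on the tokenizer configuration):

$stopwords = array();
echo ft_termParser('wiki', $stopwords, false);
// -> (W+:wiki)                         single token identical to the term
echo ft_termParser('foo-bar', $stopwords, false);
// -> ((W+:foo)(W+:bar))                term split into several word tokens
echo ft_termParser('foo-bar', $stopwords, false, true);
// -> ((W_:foo)(W_:bar)(P+:foo-bar))    phrase mode adds the original term as P+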
Example #3
/**
 * Split a page into words
 *
 * Returns an array of word counts, false if an error occurred.
 * Array is keyed on the word length, then the word index.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @author Christopher Smith <*****@*****.**>
 */
function idx_getPageWords($page)
{
    global $conf;
    $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    if (@file_exists($swfile)) {
        $stopwords = file($swfile);
    } else {
        $stopwords = array();
    }
    $body = '';
    $data = array($page, $body);
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        $data[1] .= rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);
    list($page, $body) = $data;
    $body = strtr($body, "\r\n\t", '   ');
    $tokens = explode(' ', $body);
    // count the frequency of each token
    $tokens = array_count_values($tokens);

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there may be a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page, 'relation references');
        if (!empty($links)) {
            $tmp = join(' ', array_keys($links));            // make a single string
            $tmp = strtr($tmp, ':', ' ');                    // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp)); // break into tokens
            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) {
                    continue;
                }
                $tokens[$link_token] = 1;
            }
        }
    }
    $words = array();
    foreach ($tokens as $word => $count) {
        $arr = idx_tokenizer($word, $stopwords);
        $arr = array_count_values($arr);
        foreach ($arr as $w => $c) {
            $l = wordlen($w);
            if (isset($words[$l])) {
                $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            } else {
                $words[$l] = array($w => $c * $count);
            }
        }
    }
    // arrive here with $words = array(wordlen => array(word => frequency))
    $index = array(); // resulting index
    foreach (array_keys($words) as $wlen) {
        $word_idx = idx_getIndex('w', $wlen);
        foreach ($words[$wlen] as $word => $freq) {
            $wid = array_search("{$word}\n", $word_idx);
            if (!is_int($wid)) {
                $wid = count($word_idx);
                $word_idx[] = "{$word}\n";
            }
            if (!isset($index[$wlen])) {
                $index[$wlen] = array();
            }
            $index[$wlen][$wid] = $freq;
        }
        // save back word index
        if (!idx_saveIndex('w', $wlen, $word_idx)) {
            trigger_error("Failed to write word index", E_USER_ERROR);
            return false;
        }
    }
    return $index;
}
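A sketch of the returned structure ('wiki:syntax' and the IDs/frequencies below are made up; real values depend on the page content and the existing word index):

$index = idx_getPageWords('wiki:syntax');
if ($index === false) {
    // a word index file could not be written
} else {
    // e.g. array(
    //     4 => array(12 => 3, 57 => 1),  // 4-char words: word ID => frequency on the page
    //     7 => array(3 => 2),            // 7-char words
    // )
}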
/**
 * Builds an array of search words from a query
 *
 * @todo support OR and parentheses?
 */
function ft_queryParser($query)
{
    global $conf;
    $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    if (@file_exists($swfile)) {
        $stopwords = file($swfile);
    } else {
        $stopwords = array();
    }
    $q = array();
    $q['query'] = $query;
    $q['ns'] = array();
    $q['phrases'] = array();
    $q['words'] = array();
    $q['and'] = array();
    $q['not'] = array();
    // strip namespace from query
    if (preg_match('/([^@]*)@(.*)/', $query, $match)) {
        $query = $match[1];
        $q['ns'] = explode('@', preg_replace("/ /", '', $match[2]));
    }
    // handle phrase searches
    while (preg_match('/"(.*?)"/', $query, $match)) {
        $q['phrases'][] = $match[1];
        $q['and'] = array_merge($q['and'], idx_tokenizer($match[0], $stopwords));
        $query = preg_replace('/"(.*?)"/', '', $query, 1);
    }
    $words = explode(' ', $query);
    foreach ($words as $w) {
        if ($w[0] == '-') {
            $token = idx_tokenizer($w, $stopwords, true);
            if (count($token)) {
                $q['not'] = array_merge($q['not'], $token);
            }
        } else {
            // asian "words" need to be searched as phrases
            if (@preg_match_all('/((' . IDX_ASIAN . ')+)/u', $w, $matches)) {
                $q['phrases'] = array_merge($q['phrases'], $matches[1]);
            }
            $token = idx_tokenizer($w, $stopwords, true);
            if (count($token)) {
                $q['and'] = array_merge($q['and'], $token);
                $q['words'] = array_merge($q['words'], $token);
            }
        }
    }
    return $q;
}
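A rough illustration of the parsed structure for a made-up query, assuming an empty stopword list and that idx_tokenizer() drops the leading '-' and the quotes (actual tokens depend on the tokenizer):

$q = ft_queryParser('dokuwiki -plugin "full text" @wiki');
// $q['query']   == 'dokuwiki -plugin "full text" @wiki'
// $q['ns']      == array('wiki')
// $q['phrases'] == array('full text')
// $q['and']     == array('full', 'text', 'dokuwiki')   phrase tokens are merged in first
// $q['words']   == array('dokuwiki')
// $q['not']     == array('plugin')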