/**
 * Returns the pages that use a given media file
 *
 * Does a quick lookup with the fulltext index, then
 * evaluates the instructions of the found pages
 *
 * Aborts after $max found results
 */
function ft_mediause($id, $max) {
    if (!$max) { $max = 1; } // need to find at least one

    $result = array();

    // quick lookup of the mediafile
    // FIXME use metadata key lookup
    $media   = noNS($id);
    $matches = idx_lookup(idx_tokenizer($media));
    $docs    = array_keys(ft_resultCombine(array_values($matches)));
    if (!count($docs)) { return $result; }

    // go through all found pages
    $found = 0;
    $pcre  = preg_quote($media, '/');
    foreach ($docs as $doc) {
        $ns = getNS($doc);
        preg_match_all('/\{\{([^|}]*' . $pcre . '[^|}]*)(\|[^}]+)?\}\}/i', rawWiki($doc), $matches);
        foreach ($matches[1] as $img) {
            $img = trim($img);
            if (preg_match('/^https?:\/\//i', $img)) { continue; } // skip external images
            list($img) = explode('?', $img);                       // remove any parameters
            resolve_mediaid($ns, $img, $exists);                   // resolve the possibly relative img

            if ($img == $id) { // we have a match
                $result[] = $doc;
                $found++;
                break;
            }
        }
        if ($found >= $max) { break; }
    }

    sort($result);
    return $result;
}
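/*
 * Example usage (not part of the upstream file): a minimal sketch that lists
 * the pages embedding a given media file. The media ID, the limit, and the
 * function name example_ft_mediause() are placeholder assumptions, and a
 * bootstrapped DokuWiki environment is assumed.
 */
function example_ft_mediause() {
    $pages = ft_mediause('wiki:dokuwiki-128.png', 10); // up to 10 referencing pages
    foreach ($pages as $page) {
        echo $page . "\n"; // page IDs come back sorted alphabetically
    }
}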
/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * @author Kazutaka Miyasaka <*****@*****.**>
 */
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
    $parsed = '';
    if ($consider_asian) {
        // successive asian characters need to be searched as a phrase
        $words = preg_split('/(' . IDX_ASIAN . '+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        foreach ($words as $word) {
            if (preg_match('/' . IDX_ASIAN . '/u', $word)) {
                $phrase_mode = true;
            }
            $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
        }
    } else {
        $term_noparen = str_replace(array('(', ')'), ' ', $term);
        $words = idx_tokenizer($term_noparen, $stopwords, true);

        // W_: no need to highlight
        if (empty($words)) {
            $parsed = '()'; // important: do not remove
        } elseif ($words[0] === $term) {
            $parsed = '(W+:' . $words[0] . ')';
        } elseif ($phrase_mode) {
            $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
            $parsed = '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))';
        } else {
            $parsed = '((W+:' . implode(')(W+:', $words) . '))';
        }
    }
    return $parsed;
}
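/*
 * Example usage (not part of the upstream file): a minimal sketch showing the
 * intermediate representation ft_termParser() builds for a plain term. The
 * sample term, the expected output in the comment, and the function name
 * example_ft_termParser() are illustrative assumptions only.
 */
function example_ft_termParser() {
    global $conf;
    $swfile    = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    $stopwords = @file_exists($swfile) ? file($swfile) : array();

    // a single non-stopword term typically comes back as something like "(W+:wiki)"
    echo ft_termParser('wiki', $stopwords) . "\n";
}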
/**
 * Split a page into words
 *
 * Returns an array of word counts, false if an error occurred.
 * Array is keyed on the word length, then the word index.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @author Christopher Smith <*****@*****.**>
 */
function idx_getPageWords($page) {
    global $conf;
    $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    if (@file_exists($swfile)) {
        $stopwords = file($swfile);
    } else {
        $stopwords = array();
    }

    $body = '';
    $data = array($page, $body);
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        $data[1] .= rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);

    list($page, $body) = $data;

    $body   = strtr($body, "\r\n\t", '   '); // normalize line breaks and tabs to spaces
    $tokens = explode(' ', $body);
    $tokens = array_count_values($tokens);   // count the frequency of each token

    // ensure the deaccented or romanised page names of internal links are added to the token array
    // (this is necessary for the backlink function -- there may be a better way!)
    if ($conf['deaccent']) {
        $links = p_get_metadata($page, 'relation references');

        if (!empty($links)) {
            $tmp = join(' ', array_keys($links));            // make a single string
            $tmp = strtr($tmp, ':', ' ');                    // replace namespace separator with a space
            $link_tokens = array_unique(explode(' ', $tmp)); // break into tokens

            foreach ($link_tokens as $link_token) {
                if (isset($tokens[$link_token])) { continue; }
                $tokens[$link_token] = 1;
            }
        }
    }

    $words = array();
    foreach ($tokens as $word => $count) {
        $arr = idx_tokenizer($word, $stopwords);
        $arr = array_count_values($arr);
        foreach ($arr as $w => $c) {
            $l = wordlen($w);
            if (isset($words[$l])) {
                $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            } else {
                $words[$l] = array($w => $c * $count);
            }
        }
    }

    // arrive here with $words = array(wordlen => array(word => frequency))

    $index = array(); // resulting index
    foreach (array_keys($words) as $wlen) {
        $word_idx = idx_getIndex('w', $wlen);
        foreach ($words[$wlen] as $word => $freq) {
            $wid = array_search("{$word}\n", $word_idx);
            if (!is_int($wid)) {
                $wid = count($word_idx);
                $word_idx[] = "{$word}\n";
            }
            if (!isset($index[$wlen])) {
                $index[$wlen] = array();
            }
            $index[$wlen][$wid] = $freq;
        }

        // save back word index
        if (!idx_saveIndex('w', $wlen, $word_idx)) {
            trigger_error("Failed to write word index", E_USER_ERROR);
            return false;
        }
    }

    return $index;
}
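/*
 * Example usage (not part of the upstream file): a minimal sketch that indexes
 * one page and inspects the returned structure, array(wordlen => array(wid => freq)).
 * The page ID and the function name example_idx_getPageWords() are placeholders;
 * note that idx_getPageWords() also writes the word index files as a side effect.
 */
function example_idx_getPageWords() {
    $index = idx_getPageWords('wiki:syntax'); // placeholder page ID
    if ($index === false) { return; }         // word index could not be written

    foreach ($index as $wlen => $entries) {
        echo "length $wlen: " . count($entries) . " distinct words\n";
    }
}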
/**
 * Builds an array of search words from a query
 *
 * @todo support OR and parentheses?
 */
function ft_queryParser($query) {
    global $conf;
    $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    if (@file_exists($swfile)) {
        $stopwords = file($swfile);
    } else {
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['ns']      = array();
    $q['phrases'] = array();
    $q['words']   = array();
    $q['and']     = array();
    $q['not']     = array();

    // strip namespace from query
    if (preg_match('/([^@]*)@(.*)/', $query, $match)) {
        $query = $match[1];
        $q['ns'] = explode('@', preg_replace("/ /", '', $match[2]));
    }

    // handle phrase searches
    while (preg_match('/"(.*?)"/', $query, $match)) {
        $q['phrases'][] = $match[1];
        $q['and'] = array_merge($q['and'], idx_tokenizer($match[0], $stopwords));
        $query = preg_replace('/"(.*?)"/', '', $query, 1);
    }

    $words = explode(' ', $query);
    foreach ($words as $w) {
        if ($w === '') { continue; } // skip empty chunks left by phrase/namespace stripping
        if ($w[0] == '-') {
            // exclusion term
            $token = idx_tokenizer($w, $stopwords, true);
            if (count($token)) {
                $q['not'] = array_merge($q['not'], $token);
            }
        } else {
            // asian "words" need to be searched as phrases
            if (@preg_match_all('/((' . IDX_ASIAN . ')+)/u', $w, $matches)) {
                $q['phrases'] = array_merge($q['phrases'], $matches[1]);
            }
            $token = idx_tokenizer($w, $stopwords, true);
            if (count($token)) {
                $q['and']   = array_merge($q['and'], $token);
                $q['words'] = array_merge($q['words'], $token);
            }
        }
    }

    return $q;
}
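/*
 * Example usage (not part of the upstream file): a minimal sketch that parses a
 * search query into its components. The query string and the function name
 * example_ft_queryParser() are placeholders; the comments mirror the keys built above.
 */
function example_ft_queryParser() {
    $q = ft_queryParser('"quick start" install -windows @wiki');
    // $q['phrases'] : quoted phrases to match literally, e.g. "quick start"
    // $q['and']     : tokens every hit must contain
    // $q['not']     : tokens from "-"-prefixed words to exclude
    // $q['ns']      : namespaces given after "@"
    // $q['words']   : positive search tokens, kept separately from 'and'
    print_r($q);
}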