/**
 * Test a minimal value in case of wildcard search.
 *
 * Builds the expected list of index lengths by scanning the index
 * directory for files named i<length>.idx whose length is >= the filter,
 * then compares that reference list with what idx_indexLengths() returns.
 */
function test_minValue() {
    global $conf;
    $filter = 5;

    // construction of the list of the index to compare
    $dir = @opendir($conf['indexdir']);
    $ref = array();
    while (($f = readdir($dir)) !== false) {
        // only files named i<number>.idx are word indexes
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i) && $i >= $filter) {
                $ref[] = (int) $i;
            }
        }
    }
    closedir($dir);
    sort($ref);

    // BUG FIX: call-time pass-by-reference — idx_indexLengths(&$filter) —
    // was deprecated in PHP 5.3 and is a fatal error since PHP 5.4.
    // Whether the argument is by-reference is decided by the function's
    // own declaration, so the call site passes the variable plainly.
    $result = idx_indexLengths($filter);
    sort($result);
    $this->assertIdentical($result, $ref);
}
/**
 * Find the index number of each search term.
 *
 * Words that live in the same length index are grouped together, so each
 * index file is only opened once. In practice the gain is modest (disk
 * cache), and the extra sorting adds a little overhead in some cases.
 *
 * @param array $words The query terms. Words should only contain valid characters,
 *                     with a '*' at either the beginning or end of the word (or both)
 * @param arrayref $result Set to word => array("length*id" ...), use this to merge the
 *                     index locations with the appropriate query term.
 * @return array Set to length => array(id ...)
 *
 * @author Tom N Harris <*****@*****.**>
 */
function idx_getIndexWordsSorted($words, &$result) {
    // Parse the query terms into base tokens grouped by word length.
    $queries = array();      // base word => array( array(query word, grep pattern|null) ... )
    $byLength = array();     // base word length => array(base word ...)
    $wildLengths = array();  // base word (wildcards only) => base word length

    foreach ($words as $term) {
        $result[$term] = array();

        $base = $term;
        $len = wordlen($term);
        $wildcard = 0;

        // strip leading/trailing '*' markers, remembering which side(s)
        if (substr($base, 0, 1) == '*') {
            $base = substr($base, 1);
            $wildcard |= 1;
            $len -= 1;
        }
        if (substr($base, -1, 1) == '*') {
            $base = substr($base, 0, -1);
            $wildcard |= 2;
            $len -= 1;
        }

        // drop words that are too short, unless wildcarded or numeric
        if ($len < IDX_MINWORDLENGTH && $wildcard == 0 && !is_numeric($base)) {
            continue;
        }

        // record each distinct base word once in its length bucket
        if (!isset($queries[$base])) {
            $byLength[$len][] = $base;
        }

        if ($wildcard) {
            // anchor the pattern on whichever side has no '*'
            $pattern = preg_quote($base, '/');
            if (($wildcard & 1) == 0) {
                $pattern = '^' . $pattern;
            }
            if (($wildcard & 2) == 0) {
                $pattern .= '$';
            }
            $queries[$base][] = array($term, '/' . $pattern . '/');
            if (!isset($wildLengths[$base])) {
                $wildLengths[$base] = $len;
            }
        } else {
            $queries[$base][] = array($term, null);
        }
    }
    asort($wildLengths);
    // $queries     = array( base word => array( [ query word , grep pattern ] ... ) ... )
    // $byLength    = array( base word length => base word ... )
    // $wildLengths = array( base word => base word length ... )

    // With wildcards every index at least as long as the shortest base word
    // must be scanned; otherwise only the exact lengths are needed.
    $length_filter = empty($wildLengths) ? $byLength : min(array_keys($byLength));
    $indexes_known = idx_indexLengths($length_filter);
    if (!empty($wildLengths)) {
        sort($indexes_known);
    }

    // get word IDs
    $wids = array();
    foreach ($indexes_known as $ixlen) {
        $word_idx = idx_getIndex('w', $ixlen);

        // exact search: look each base word up directly in the word index
        if (isset($byLength[$ixlen])) {
            foreach ($byLength[$ixlen] as $base) {
                $wid = array_search("{$base}\n", $word_idx);
                if (is_int($wid)) {
                    $wids[$ixlen][] = $wid;
                    foreach ($queries[$base] as $q) {
                        $result[$q[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }

        // wildcard search: grep strictly-longer indexes for each pattern
        foreach ($wildLengths as $base => $len) {
            if ($len >= $ixlen) {
                break;  // $wildLengths is sorted ascending, nothing shorter follows
            }
            foreach ($queries[$base] as $q) {
                if (is_null($q[1])) {
                    continue;
                }
                foreach (array_keys(preg_grep($q[1], $word_idx)) as $wid) {
                    $wids[$ixlen][] = $wid;
                    $result[$q[0]][] = "{$ixlen}*{$wid}";
                }
            }
        }
    }
    return $wids;
}
/**
 * Build and return the sorted word cloud array.
 */
function _getWordCloud($num, &$min, &$max) {
    global $conf;

    // collect stopwords: language defaults first, then local additions
    $stopwords = array();
    $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    if (@file_exists($swfile)) {
        $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
    }
    $swfile = DOKU_CONF . 'stopwords.txt';
    if (@file_exists($swfile)) {
        $stopwords = array_merge($stopwords, file($swfile, FILE_IGNORE_NEW_LINES));
    }

    $cloud = array();
    if (@file_exists($conf['indexdir'] . '/page.idx')) {
        // new word-length based index: walk every index length
        require_once DOKU_INC . 'inc/indexer.php';
        $minlen = $this->getConf('minimum_word_length'); // minimum word length
        foreach (idx_indexLengths($minlen) as $len) {
            $idx = idx_getIndex('i', $len);
            $word_idx = idx_getIndex('w', $len);
            $this->_addWordsToCloud($cloud, $idx, $word_idx, $stopwords);
        }
    } else {
        // old single-file index
        $idx = file($conf['cachedir'] . '/index.idx');
        $word_idx = file($conf['cachedir'] . '/word.idx');
        $this->_addWordsToCloud($cloud, $idx, $word_idx, $stopwords);
    }

    return $this->_sortCloud($cloud, $num, $min, $max);
}
/**
 * Create a pagewords index from the existing index.
 *
 * @author Tom N Harris <*****@*****.**>
 */
function idx_upgradePageWords() {
    global $conf;

    $page_idx = idx_getIndex('page', '');
    if (empty($page_idx)) {
        return;
    }

    // start with one empty word list per page
    $pagewords = array();
    $npages = count($page_idx);
    for ($i = 0; $i < $npages; $i++) {
        $pagewords[] = array();
    }
    unset($page_idx);

    // walk every word-length index and invert it: page => "<len>*<wid>" keys
    $minlen = 0;
    foreach (idx_indexLengths($minlen) as $wlen) {
        $lines = idx_getIndex('i', $wlen);
        $nwords = count($lines);
        for ($wid = 0; $wid < $nwords; $wid++) {
            $wkey = "{$wlen}*{$wid}";
            foreach (explode(':', trim($lines[$wid])) as $part) {
                if ($part == '') {
                    continue;
                }
                // each part is "<page id>*<count>"; only the page id is used here
                list($doc, $cnt) = explode('*', $part);
                $pagewords[(int) $doc][] = $wkey;
            }
        }
    }

    // write the new index to a temp file, then rename it into place
    $fn = $conf['indexdir'] . '/pageword';
    $fh = @fopen($fn . '.tmp', 'w');
    if (!$fh) {
        trigger_error("Failed to write word index", E_USER_ERROR);
        return false;
    }
    foreach ($pagewords as $line) {
        fwrite($fh, join(':', $line) . "\n");
    }
    fclose($fh);

    if ($conf['fperm']) {
        chmod($fn . '.tmp', $conf['fperm']);
    }
    io_rename($fn . '.tmp', $fn . '.idx');
    return true;
}
/**
 * Test the time improvements of the new function.
 * Time reference for 10000 calls on minValue: 4.9s
 * Should be at least 65% faster.
 * Test fails with no cache.
 */
function test_minValue() {
    global $conf;
    $filter = 5;

    $start = microtime(true);
    for ($i = 0; $i < 10000; $i++) {
        // BUG FIX: call-time pass-by-reference — idx_indexLengths(&$filter) —
        // is a fatal error since PHP 5.4; pass the variable plainly and let
        // the function's own signature decide whether it is by-reference.
        $result = idx_indexLengths($filter);
    }
    $end = microtime(true);
    $time = $end - $start;

    // must beat 35% of the 4.9s reference time for 10000 calls
    $timeref = 4.9 * 0.35; // actual execution time of 4,9s for 10000 calls
    echo "3) 35% ref : {$timeref} -> {$time} \n";
    $this->assertTrue($time < $timeref);
}