コード例 #1
0
 /**
  * Test a minimal value in case of wildcard search.
  *
  * Builds the expected list by scanning the index directory for files named
  * i<number>.idx whose number is at least the filter value, then checks that
  * idx_indexLengths() returns the identical (sorted) list.
  */
 function test_minValue()
 {
     global $conf;
     $filter = 5;
     // construction of the list of the index lengths to compare against
     $dir = @opendir($conf['indexdir']);
     if ($dir === false) {
         // readdir()/closedir() must not be called on a failed handle
         $this->fail('Unable to open index directory: ' . $conf['indexdir']);
         return;
     }
     $ref = array();
     while (($f = readdir($dir)) !== false) {
         if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
             // the part between the leading 'i' and the '.idx' suffix
             $i = substr($f, 1, -4);
             if (is_numeric($i) && $i >= $filter) {
                 $ref[] = (int) $i;
             }
         }
     }
     closedir($dir);
     sort($ref);
     // Pass the variable plainly: call-time pass-by-reference (&$filter) is a
     // fatal error since PHP 5.4. NOTE(review): if idx_indexLengths() declares
     // its parameter by reference, it still receives one - confirm in indexer.php.
     $result = idx_indexLengths($filter);
     sort($result);
     $this->assertIdentical($result, $ref);
 }
コード例 #2
0
ファイル: indexer.php プロジェクト: pijoter/dokuwiki
/**
 * Find the index number of each search term.
 *
 * This will group together words that appear in the same index.
 * So it should perform better, because it only opens each index once.
 * Actually, it's not that great. (in my experience) Probably because of the disk cache.
 * And the sorted function does more work, making it slightly slower in some cases.
 *
 * Terms shorter than IDX_MINWORDLENGTH are skipped unless they carry a wildcard
 * or are numeric; they still get an (empty) entry in $result.
 *
 * @param array    $words   The query terms. Words should only contain valid characters,
 *                          with a '*' at either the beginning or end of the word (or both)
 * @param arrayref $result  Set to word => array("length*id" ...), use this to merge the
 *                          index locations with the appropriate query term.
 * @return array            Set to length => array(id ...)
 *
 * @author Tom N Harris <*****@*****.**>
 */
function idx_getIndexWordsSorted($words, &$result)
{
    // parse and sort tokens
    $tokens = array();
    $tokenlength = array();
    $tokenwild = array();
    foreach ($words as $word) {
        $result[$word] = array();
        // bit 1: leading '*', bit 2: trailing '*'
        $wild = 0;
        $xword = $word;
        $wlen = wordlen($word);
        // check for wildcards; each one is stripped and not counted in the length
        if (substr($xword, 0, 1) == '*') {
            $xword = substr($xword, 1);
            $wild |= 1;
            $wlen -= 1;
        }
        if (substr($xword, -1, 1) == '*') {
            $xword = substr($xword, 0, -1);
            $wild |= 2;
            $wlen -= 1;
        }
        if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) {
            continue;
        }
        // register each base word only once per length
        if (!isset($tokens[$xword])) {
            $tokenlength[$wlen][] = $xword;
        }
        if ($wild) {
            // anchor the pattern on every side that has no wildcard
            $ptn = preg_quote($xword, '/');
            if (($wild & 1) == 0) {
                $ptn = '^' . $ptn;
            }
            if (($wild & 2) == 0) {
                $ptn = $ptn . '$';
            }
            $tokens[$xword][] = array($word, '/' . $ptn . '/');
            if (!isset($tokenwild[$xword])) {
                $tokenwild[$xword] = $wlen;
            }
        } else {
            $tokens[$xword][] = array($word, null);
        }
    }
    // shortest wildcard base words first, so the loop below can stop early
    asort($tokenwild);
    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
    // $tokenlength = array( base word length => base word ... )
    // $tokenwild = array( base word => base word length ... )
    // without wildcards the filter is the length map itself, otherwise the smallest length
    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
    $indexes_known = idx_indexLengths($length_filter);
    if (!empty($tokenwild)) {
        sort($indexes_known);
    }
    // get word IDs
    $wids = array();
    foreach ($indexes_known as $ixlen) {
        $word_idx = idx_getIndex('w', $ixlen);
        // handle exact search
        if (isset($tokenlength[$ixlen])) {
            foreach ($tokenlength[$ixlen] as $xword) {
                // Strict comparison: with loose comparison numeric-looking words
                // (e.g. "1e3" and "1000") can be treated as equal and yield the
                // wrong word id.
                $wid = array_search("{$xword}\n", $word_idx, true);
                if (is_int($wid)) {
                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $w) {
                        $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        // handle wildcard search
        foreach ($tokenwild as $xword => $wlen) {
            // a wildcard term only matches words longer than its base word;
            // $tokenwild is sorted by length, so all remaining entries are too long
            if ($wlen >= $ixlen) {
                break;
            }
            foreach ($tokens[$xword] as $w) {
                // entries without a pattern are exact terms, handled above
                if (is_null($w[1])) {
                    continue;
                }
                foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                    $wids[$ixlen][] = $wid;
                    $result[$w[0]][] = "{$ixlen}*{$wid}";
                }
            }
        }
    }
    return $wids;
}
コード例 #3
0
ファイル: syntax.php プロジェクト: omusico/isle-web-framework
 /**
  * Build the word cloud from the search index and return it sorted.
  *
  * Stopwords are read from the language directory and from the local
  * configuration directory (when those files exist) and handed to
  * _addWordsToCloud(). $num, $min and $max are forwarded to _sortCloud(),
  * whose result is returned.
  */
 function _getWordCloud($num, &$min, &$max)
 {
     global $conf;

     // stopwords for the configured language
     $langStopwordFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
     $stopwords = @file_exists($langStopwordFile)
         ? file($langStopwordFile, FILE_IGNORE_NEW_LINES)
         : array();

     // extra local stopwords are appended to the list
     $localStopwordFile = DOKU_CONF . 'stopwords.txt';
     if (@file_exists($localStopwordFile)) {
         $localStopwords = file($localStopwordFile, FILE_IGNORE_NEW_LINES);
         $stopwords = array_merge($stopwords, $localStopwords);
     }

     $cloud = array();

     if (!@file_exists($conf['indexdir'] . '/page.idx')) {
         // old index: a single pair of files in the cache directory
         $idx = file($conf['cachedir'] . '/index.idx');
         $word_idx = file($conf['cachedir'] . '/word.idx');
         $this->_addWordsToCloud($cloud, $idx, $word_idx, $stopwords);

         return $this->_sortCloud($cloud, $num, $min, $max);
     }

     // new word-length based index: one pair of index files per word length
     require_once DOKU_INC . 'inc/indexer.php';
     // minimum word length, kept in a variable in case the callee takes it by reference
     $minLength = $this->getConf('minimum_word_length');
     foreach (idx_indexLengths($minLength) as $len) {
         $idx = idx_getIndex('i', $len);
         $word_idx = idx_getIndex('w', $len);
         $this->_addWordsToCloud($cloud, $idx, $word_idx, $stopwords);
     }

     return $this->_sortCloud($cloud, $num, $min, $max);
 }
コード例 #4
0
ファイル: indexer.php プロジェクト: Harvie/dokuwiki
/**
 * Create a pagewords index from the existing index.
 *
 * For every page in the page index, collects the "length*wordid" keys of the
 * words whose index entries reference that page, then writes one line per page
 * (keys joined by ':') to pageword.idx in the index directory. The data is
 * written to a temporary file first and renamed into place afterwards.
 *
 * @return bool|null true on success, false if the temporary file could not be
 *                   opened, nothing (null) when the page index is empty
 *
 * @author Tom N Harris <*****@*****.**>
 */
function idx_upgradePageWords()
{
    global $conf;
    $page_idx = idx_getIndex('page', '');
    if (empty($page_idx)) {
        return;
    }
    // one (initially empty) word list per page, indexed by page number
    $pagewords = array_fill(0, count($page_idx), array());
    unset($page_idx);
    // a filter of 0 is passed as a variable in case the callee takes it by reference
    $minLength = 0;
    foreach (idx_indexLengths($minLength) as $wlen) {
        $lines = idx_getIndex('i', $wlen);
        $len = count($lines);
        for ($wid = 0; $wid < $len; $wid++) {
            $wkey = "{$wlen}*{$wid}";
            // each line is a ':'-separated list of "doc*count" entries
            foreach (explode(':', trim($lines[$wid])) as $part) {
                if ($part == '') {
                    continue;
                }
                // only the document number is needed; taking just the first
                // piece also avoids a notice when an entry has no '*'
                list($doc) = explode('*', $part, 2);
                $pagewords[(int) $doc][] = $wkey;
            }
        }
    }
    $fn = $conf['indexdir'] . '/pageword';
    $fh = @fopen($fn . '.tmp', 'w');
    if (!$fh) {
        trigger_error("Failed to write word index", E_USER_ERROR);
        return false;
    }
    foreach ($pagewords as $line) {
        fwrite($fh, join(':', $line) . "\n");
    }
    fclose($fh);
    if ($conf['fperm']) {
        chmod($fn . '.tmp', $conf['fperm']);
    }
    io_rename($fn . '.tmp', $fn . '.idx');
    return true;
}
コード例 #5
0
 /**
  * Test the time improvements of the new function.
  * Time reference for 10000 calls on minValue: 4.9s
  * Should be at least 65% faster
  * Test fails with no cache
  */
 function test_minValue()
 {
     $filter = 5;
     $start = microtime(true);
     for ($i = 0; $i < 10000; $i++) {
         // Pass the variable plainly: call-time pass-by-reference (&$filter)
         // is a fatal error since PHP 5.4. The return value is not needed here.
         idx_indexLengths($filter);
     }
     $end = microtime(true);
     $time = $end - $start;
     // 35% of the reference execution time of 4.9s for 10000 calls
     $timeref = 4.9 * 0.35;
     echo "3) 35% ref : {$timeref} -> {$time} \n";
     $this->assertTrue($time < $timeref);
 }