Example #1
0
/**
 * Find the index number of each search term.
 *
 * Query terms are reduced to a "base word" by stripping a leading and/or
 * trailing '*' wildcard, then grouped by base-word length so that the word
 * index for a given length is loaded only once while every term of that
 * length is resolved against it.
 *
 * Note from the original author: in practice the gain is small (probably
 * because of the disk cache), and the extra sorting work can make this
 * slightly slower in some cases.
 *
 * Matching rules, as implemented below:
 *  - A term without wildcards that is shorter than IDX_MINWORDLENGTH and is
 *    not numeric is skipped; its entry in $result stays an empty array.
 *  - Exact lookups are done in the index whose length equals the base-word
 *    length. A hit is recorded for every query term sharing that base word,
 *    wildcard terms included.
 *  - Wildcard patterns are only applied to indexes whose length is strictly
 *    greater than the base-word length.
 *  - IDs are appended as they are found and are not de-duplicated.
 *
 * @param array $words   The query terms. Words should only contain valid characters,
 *                       with a '*' at either the beginning or end of the word (or both)
 * @param array $result  Passed by reference. Set to word => array("length*id" ...),
 *                       use this to merge the index locations with the appropriate
 *                       query term.
 * @return array         Set to length => array(id ...)
 *
 * @author Tom N Harris <*****@*****.**>
 */
function idx_getIndexWordsSorted($words, &$result)
{
    // parse and sort tokens
    $tokens = array();
    $tokenlength = array();
    $tokenwild = array();
    foreach ($words as $word) {
        $result[$word] = array();
        // bit flags: 1 = leading '*', 2 = trailing '*'
        $wild = 0;
        $xword = $word;
        $wlen = wordlen($word);
        // check for wildcards; each stripped '*' also shortens the counted length
        if (substr($xword, 0, 1) == '*') {
            $xword = substr($xword, 1);
            $wild |= 1;
            $wlen -= 1;
        }
        if (substr($xword, -1, 1) == '*') {
            $xword = substr($xword, 0, -1);
            $wild |= 2;
            $wlen -= 1;
        }
        // too-short plain words are ignored, but numbers and wildcards are kept
        if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) {
            continue;
        }
        // register each base word only once per length
        if (!isset($tokens[$xword])) {
            $tokenlength[$wlen][] = $xword;
        }
        if ($wild) {
            // anchor the pattern on every side that has no wildcard
            $ptn = preg_quote($xword, '/');
            if (($wild & 1) == 0) {
                $ptn = '^' . $ptn;
            }
            if (($wild & 2) == 0) {
                $ptn = $ptn . '$';
            }
            $tokens[$xword][] = array($word, '/' . $ptn . '/');
            if (!isset($tokenwild[$xword])) {
                $tokenwild[$xword] = $wlen;
            }
        } else {
            $tokens[$xword][] = array($word, null);
        }
    }
    // shortest wildcard base words first, so the wildcard loop below can stop early
    asort($tokenwild);
    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
    // $tokenlength = array( base word length => base word ... )
    // $tokenwild = array( base word => base word length ... )
    // Without wildcards the filter is the set of wanted lengths; with wildcards it
    // is the smallest base-word length.
    // NOTE(review): presumably idx_indexLengths() treats an integer as a minimum
    // length and an array as an exact set of lengths - confirm against its definition.
    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
    $indexes_known = idx_indexLengths($length_filter);
    if (!empty($tokenwild)) {
        // ascending order is required by the early 'break' in the wildcard loop
        sort($indexes_known);
    }
    // get word IDs
    $wids = array();
    foreach ($indexes_known as $ixlen) {
        $word_idx = idx_getIndex('w', $ixlen);
        // handle exact search
        if (isset($tokenlength[$ixlen])) {
            foreach ($tokenlength[$ixlen] as $xword) {
                // Strict search: a loose comparison treats numeric-looking strings
                // as numbers, so "100\n" could wrongly match an entry like "1e2\n".
                $wid = array_search("{$xword}\n", $word_idx, true);
                if (is_int($wid)) {
                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $w) {
                        $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        // handle wildcard search
        foreach ($tokenwild as $xword => $wlen) {
            // $tokenwild is sorted by length: no later base word can be shorter
            if ($wlen >= $ixlen) {
                break;
            }
            foreach ($tokens[$xword] as $w) {
                // entries without a pattern are plain terms sharing the base word
                if (is_null($w[1])) {
                    continue;
                }
                foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                    $wids[$ixlen][] = $wid;
                    $result[$w[0]][] = "{$ixlen}*{$wid}";
                }
            }
        }
    }
    return $wids;
}
Example #2
0
 /**
  * Resolve each search term to the IDs of the matching index words.
  *
  * A term may carry a '*' wildcard at its beginning, its end, or both; the
  * remaining characters are expected to be valid word characters. Stripping
  * the wildcards yields the term's base word, which is looked up exactly in
  * the word index of its own length and, for wildcard terms, by regular
  * expression in every longer word index.
  *
  * Terms that are shorter than IDX_MINWORDLENGTH, have no wildcard and are
  * not numeric are ignored; their entry in $result stays an empty array.
  *
  * @param array  $words  The query terms.
  * @param array  $result Set to word => array("length*id" ...); lets the caller
  *                       merge index locations with the matching query term.
  * @return array         Set to length => array(id ...)
  * @author Tom N Harris <*****@*****.**>
  */
 protected function getIndexWords(&$words, &$result)
 {
     // base word => list of [query term, regexp or null]
     $termsByBase = array();
     // base word length => list of base words
     $basesByLength = array();
     // base word => base word length, wildcard terms only
     $wildLengths = array();

     foreach ($words as $term) {
         $result[$term] = array();
         $base = $term;
         $length = wordlen($term);

         // strip wildcards; every removed '*' shortens the counted length
         $openStart = (substr($base, 0, 1) == '*');
         if ($openStart) {
             $base = substr($base, 1);
             $length = $length - 1;
         }
         $openEnd = (substr($base, -1, 1) == '*');
         if ($openEnd) {
             $base = substr($base, 0, -1);
             $length = $length - 1;
         }
         $isWildcard = ($openStart || $openEnd);

         // short plain words are dropped; numbers and wildcard terms are kept
         if ($length < IDX_MINWORDLENGTH && !$isWildcard && !is_numeric($base)) {
             continue;
         }

         // a base word is registered under its length only the first time it is seen
         if (!isset($termsByBase[$base])) {
             $basesByLength[$length][] = $base;
         }

         $regexp = null;
         if ($isWildcard) {
             // anchor every side of the pattern that carries no wildcard
             $regexp = '/' . ($openStart ? '' : '^') . preg_quote($base, '/') . ($openEnd ? '' : '$') . '/';
             if (!isset($wildLengths[$base])) {
                 $wildLengths[$base] = $length;
             }
         }
         $termsByBase[$base][] = array($term, $regexp);
     }

     // shortest wildcard base words first, which allows the early exit further down
     asort($wildLengths);
     $hasWildcards = !empty($wildLengths);

     // with wildcards: the smallest base word length; otherwise: the wanted lengths
     $lengthFilter = $hasWildcards ? min(array_keys($basesByLength)) : $basesByLength;
     $knownLengths = $this->indexLengths($lengthFilter);
     if ($hasWildcards) {
         sort($knownLengths);
     }

     // collect the word IDs, one word index at a time
     $wids = array();
     foreach ($knownLengths as $ixlen) {
         $indexWords = $this->getIndex('w', $ixlen);

         // exact lookups for base words of precisely this length
         $exactBases = isset($basesByLength[$ixlen]) ? $basesByLength[$ixlen] : array();
         foreach ($exactBases as $base) {
             $wid = array_search($base, $indexWords, true);
             if ($wid === false) {
                 continue;
             }
             $wids[$ixlen][] = $wid;
             foreach ($termsByBase[$base] as $entry) {
                 $result[$entry[0]][] = "{$ixlen}*{$wid}";
             }
         }

         // wildcard lookups, only for base words shorter than this index length
         foreach ($wildLengths as $base => $baseLength) {
             if ($baseLength >= $ixlen) {
                 // sorted by length, so every remaining base word is too long as well
                 break;
             }
             foreach ($termsByBase[$base] as $entry) {
                 list($term, $regexp) = $entry;
                 if ($regexp === null) {
                     // plain term that merely shares the base word
                     continue;
                 }
                 $matches = preg_grep($regexp, $indexWords);
                 foreach (array_keys($matches) as $wid) {
                     $wids[$ixlen][] = $wid;
                     $result[$term][] = "{$ixlen}*{$wid}";
                 }
             }
         }
     }

     return $wids;
 }