/**
 * Find the index number of each search term.
 *
 * Base words are grouped by their length so that each per-length word index
 * is fetched only once per call. The original author noted that in practice
 * this was not a big win (probably because of the disk cache) and that the
 * extra sorting work can make it slightly slower in some cases.
 *
 * @param array $words  The query terms. Words should only contain valid characters,
 *                      with a '*' at either the beginning or end of the word (or both)
 * @param array $result Passed by reference. Set to word => array("length*id" ...),
 *                      use this to merge the index locations with the appropriate
 *                      query term. Every term in $words gets an entry, even when
 *                      it is skipped or matches nothing.
 * @return array Set to length => array(id ...)
 *
 * @author Tom N Harris <*****@*****.**>
 */
function idx_getIndexWordsSorted($words, &$result)
{
    // parse and sort tokens
    $tokens = array();
    $tokenlength = array();
    $tokenwild = array();
    foreach ($words as $word) {
        $result[$word] = array();
        $wild = 0; // bit 1: leading '*', bit 2: trailing '*'
        $xword = $word;
        $wlen = wordlen($word);

        // check for wildcards; each stripped '*' shortens the base word by one
        if (substr($xword, 0, 1) == '*') {
            $xword = substr($xword, 1);
            $wild |= 1;
            $wlen -= 1;
        }
        if (substr($xword, -1, 1) == '*') {
            $xword = substr($xword, 0, -1);
            $wild |= 2;
            $wlen -= 1;
        }

        // too-short terms are dropped unless they are wildcards or numeric
        if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) {
            continue;
        }

        // register each base word once under its length (must run before
        // $tokens[$xword] is created below)
        if (!isset($tokens[$xword])) {
            $tokenlength[$wlen][] = $xword;
        }

        if ($wild) {
            // anchor the pattern on every side that has no wildcard
            $ptn = preg_quote($xword, '/');
            if (($wild & 1) == 0) {
                $ptn = '^' . $ptn;
            }
            if (($wild & 2) == 0) {
                $ptn = $ptn . '$';
            }
            $tokens[$xword][] = array($word, '/' . $ptn . '/');
            if (!isset($tokenwild[$xword])) {
                $tokenwild[$xword] = $wlen;
            }
        } else {
            $tokens[$xword][] = array($word, null);
        }
    }
    // shortest wildcard base words first, so the wildcard loop below can stop early
    asort($tokenwild);

    // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
    // $tokenlength = array( base word length => base word ... )
    // $tokenwild = array( base word => base word length ... )

    // Without wildcards the per-length token map itself is passed as the filter;
    // with wildcards the smallest base word length is passed instead.
    $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
    $indexes_known = idx_indexLengths($length_filter);
    if (!empty($tokenwild)) {
        sort($indexes_known);
    }

    // get word IDs
    $wids = array();
    foreach ($indexes_known as $ixlen) {
        $word_idx = idx_getIndex('w', $ixlen);

        // handle exact search
        if (isset($tokenlength[$ixlen])) {
            foreach ($tokenlength[$ixlen] as $xword) {
                // The needle carries a trailing newline; presumably the index
                // entries are raw file lines that still end in "\n" - confirm
                // against idx_getIndex().
                // The comparison must be strict: with loose comparison PHP 8
                // treats "100\n" and "1e2\n" as equal numbers, which would
                // return the ID of a different word.
                $wid = array_search("{$xword}\n", $word_idx, true);
                if (is_int($wid)) {
                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $w) {
                        $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }

        // handle wildcard search
        foreach ($tokenwild as $xword => $wlen) {
            // Only base words strictly shorter than this index length are
            // grepped here; $tokenwild is sorted by length, so all remaining
            // entries can be skipped. Equal-length base words were already
            // covered by the exact search above.
            if ($wlen >= $ixlen) {
                break;
            }
            foreach ($tokens[$xword] as $w) {
                // exact (non-wildcard) query terms sharing this base word have no pattern
                if (is_null($w[1])) {
                    continue;
                }
                foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                    $wids[$ixlen][] = $wid;
                    $result[$w[0]][] = "{$ixlen}*{$wid}";
                }
            }
        }
    }
    return $wids;
}
/**
 * Resolve each query term to the word IDs it matches in the word indexes.
 *
 * Query terms should consist of valid word characters only, optionally with
 * a '*' wildcard at the start, at the end, or at both ends. The $result
 * parameter receives one entry per query term and can be used to merge the
 * index locations with the query term they belong to.
 *
 * @param array $words  The query terms.
 * @param array $result Filled by reference: word => array("length*id" ...)
 * @return array Set to length => array(id ...)
 * @author Tom N Harris <*****@*****.**>
 */
protected function getIndexWords(&$words, &$result)
{
    $termsByBase = array();   // base word => array( [ query term , regexp or null ] ... )
    $basesByLength = array(); // base word length => array( base word ... )
    $wildLengths = array();   // base word => base word length (wildcard terms only)

    foreach ($words as $term) {
        $result[$term] = array();

        $base = $term;
        $length = wordlen($term);

        // strip wildcards, remembering which side(s) were open
        $openStart = (substr($base, 0, 1) === '*');
        if ($openStart) {
            $base = substr($base, 1);
            $length -= 1;
        }
        $openEnd = (substr($base, -1, 1) === '*');
        if ($openEnd) {
            $base = substr($base, 0, -1);
            $length -= 1;
        }
        $isWild = $openStart || $openEnd;

        // short terms are ignored unless they are wildcards or numeric
        if ($length < IDX_MINWORDLENGTH && !$isWild && !is_numeric($base)) {
            continue;
        }

        // the first time a base word is seen, file it under its length
        if (!isset($termsByBase[$base])) {
            $basesByLength[$length][] = $base;
        }

        if (!$isWild) {
            $termsByBase[$base][] = array($term, null);
            continue;
        }

        // anchor the expression on every side that is not open
        $regexp = ($openStart ? '' : '^') . preg_quote($base, '/') . ($openEnd ? '' : '$');
        $termsByBase[$base][] = array($term, '/' . $regexp . '/');
        if (!isset($wildLengths[$base])) {
            $wildLengths[$base] = $length;
        }
    }
    asort($wildLengths);

    // Pick the index lengths to scan: the per-length map when there are only
    // exact terms, otherwise everything from the shortest base word length on,
    // visited in ascending order.
    if (empty($wildLengths)) {
        $lengthFilter = $basesByLength;
        $knownLengths = $this->indexLengths($lengthFilter);
    } else {
        $lengthFilter = min(array_keys($basesByLength));
        $knownLengths = $this->indexLengths($lengthFilter);
        sort($knownLengths);
    }

    // collect word IDs
    $wordIds = array();
    foreach ($knownLengths as $ixlen) {
        $index = $this->getIndex('w', $ixlen);

        // exact matches: base words whose length equals this index length
        $exactBases = isset($basesByLength[$ixlen]) ? $basesByLength[$ixlen] : array();
        foreach ($exactBases as $base) {
            $wid = array_search($base, $index, true);
            if ($wid === false) {
                continue;
            }
            $wordIds[$ixlen][] = $wid;
            foreach ($termsByBase[$base] as $entry) {
                $result[$entry[0]][] = "{$ixlen}*{$wid}";
            }
        }

        // wildcard matches: only base words strictly shorter than the index length
        foreach ($wildLengths as $base => $length) {
            if ($length >= $ixlen) {
                // sorted ascending, so nothing further can qualify
                break;
            }
            foreach ($termsByBase[$base] as $entry) {
                $pattern = $entry[1];
                if ($pattern === null) {
                    continue;
                }
                $matched = preg_grep($pattern, $index);
                foreach (array_keys($matched) as $wid) {
                    $wordIds[$ixlen][] = $wid;
                    $result[$entry[0]][] = "{$ixlen}*{$wid}";
                }
            }
        }
    }

    return $wordIds;
}