/**
 * Call the global remote_open() function and wrap a resource result in an object.
 *
 * When invoked with two or three arguments the global function is called with
 * ($host_or_program, $port_or_args, $timeout); otherwise $connect_timeout is
 * passed as well.
 *
 * @param mixed $host_or_program Forwarded unchanged to the global remote_open().
 * @param mixed $port_or_args    Forwarded unchanged to the global remote_open().
 * @param mixed $timeout         Forwarded unchanged; defaults to 10000.
 * @param mixed $connect_timeout Forwarded only when more than three arguments were given.
 *
 * @return mixed If the result is a resource: an instance of the class named
 *               'Xapian' . <resource-type suffix> when such a class exists,
 *               else a XapianDatabase. Non-resource results are returned as-is.
 */
static function remote_open($host_or_program, $port_or_args, $timeout = 10000, $connect_timeout = null) {
    $argc = func_num_args();
    if ($argc === 2 || $argc === 3) {
        $handle = remote_open($host_or_program, $port_or_args, $timeout);
    } else {
        $handle = remote_open($host_or_program, $port_or_args, $timeout, $connect_timeout);
    }

    // Anything that is not a resource is handed straight back to the caller.
    if (!is_resource($handle)) {
        return $handle;
    }

    // The class suffix is whatever follows the first '__' in the resource type;
    // when '__' is absent (or at offset 0) the suffix starts at offset 3.
    $resource_type = get_resource_type($handle);
    $separator_at = strpos($resource_type, '__');
    $wrapper_class = 'Xapian' . substr($resource_type, $separator_at ? $separator_at + 2 : 3);

    if (class_exists($wrapper_class)) {
        return new $wrapper_class($handle);
    }

    return new XapianDatabase($handle);
}
/**
 * Parse a search query, hand it to Xapian, and build a human-readable description of it.
 *
 * NOTE(review): presumably the PHP4-style constructor of a class named
 * SEARCHENGINE -- the class header is not visible from here; confirm.
 *
 * On success this sets, among others, $this->query (the rewritten query string),
 * $this->words, $this->phrases, $this->prefixed, $this->query_desc and
 * $this->valid = true. It opens the global $xapiandb on first use.
 *
 * Returns null early (leaving $this->valid unset) when the XAPIANDB constant is
 * undefined or falsy, or when the Xapian query parser throws; in the latter case
 * $this->error holds the escaped exception message.
 *
 * @param string $query The raw search query.
 *
 * @return null|void
 */
public function SEARCHENGINE($query) {
    if (!defined('XAPIANDB') || !XAPIANDB) {
        return null;
    }

    global $xapiandb, $PAGE, $hansardmajors, $parties;
    if (!$xapiandb) {
        if (strstr(XAPIANDB, ":")) {
            // "host:port" form: connect to a remote Xapian backend.
            list($xapian_host, $xapian_port) = explode(":", XAPIANDB);
            twfy_debug("SEARCH", "Using Xapian remote backend: " . $xapian_host . " port " . $xapian_port);
            $xapiandb_remote = remote_open($xapian_host, intval($xapian_port));
            $xapiandb = new XapianDatabase($xapiandb_remote);
        } else {
            $xapiandb = new XapianDatabase(XAPIANDB);
        }
    }

    $this->query = $query;
    if (!isset($this->stemmer)) {
        $this->stemmer = new XapianStem('english');
    }
    if (!isset($this->enquire)) {
        $this->enquire = new XapianEnquire($xapiandb);
    }
    if (!isset($this->queryparser)) {
        $this->queryparser = new XapianQueryParser();
        $this->datevaluerange = new XapianDateValueRangeProcessor(1);
        $this->queryparser->set_stemmer($this->stemmer);
        $this->queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
        $this->queryparser->set_database($xapiandb);
        $this->queryparser->set_default_op(Query_OP_AND);
        $this->queryparser->add_boolean_prefix('speaker', 'S');
        $this->queryparser->add_boolean_prefix('major', 'M');
        $this->queryparser->add_boolean_prefix('date', 'D');
        $this->queryparser->add_boolean_prefix('batch', 'B');
        $this->queryparser->add_boolean_prefix('segment', 'U');
        $this->queryparser->add_boolean_prefix('department', 'G');
        $this->queryparser->add_boolean_prefix('party', 'P');
        $this->queryparser->add_boolean_prefix('column', 'C');
        $this->queryparser->add_boolean_prefix('gid', 'Q');
        $this->queryparser->add_valuerangeprocessor($this->datevaluerange);
    }

    # Force words to lower case.
    # A callback is used instead of the /e modifier: /e was removed in PHP 7.0
    # and evaluated text taken from the user's query as PHP code.
    $this->query = preg_replace_callback('#(department|party):.+?\\b#i', function ($match) {
        return strtolower($match[0]);
    }, $this->query);

    // Any characters other than this are treated as, basically, white space
    // (apart from quotes and minuses, special case below)
    // The colon is in here for prefixes speaker:10043 and so on.
    // NOTE(review): no /u modifier is used below, so the À-ÿ range is only a
    // single-byte range if this file is saved as ISO-8859-1 -- confirm encoding.
    $this->wordchars = "A-Za-z0-9,.'&:_À-ÿ";
    $this->wordcharsnodigit = "A-Za-z0-9'&_À-ÿ";

    // An array of normal words.
    $this->words = array();
    // All quoted phrases, as an (array of (arrays of words in each phrase)).
    $this->phrases = array();
    // Items prefixed with a colon (speaker:10024) as an (array of (name, value))
    $this->prefixed = array();

    // Split words up into individual words, and quoted phrases
    preg_match_all('/(' . '"|' . '(?:(?<![' . $this->wordchars . '])-)?' . '[' . $this->wordchars . ']+' . ')/', $this->query, $all_words);
    if ($all_words) {
        $all_words = $all_words[0];
    } else {
        $all_words = array();
    }

    // section:<name> aliases and the major number(s) they are rewritten to.
    $section_majors = array(
        'debates' => 1,
        'debate' => 1,
        'whall' => 2,
        'westminster' => 2,
        'westminhall' => 2,
        'wrans' => 3,
        'wran' => 3,
        'wms' => 4,
        'statements' => 4,
        'statement' => 4,
        'lordsdebates' => 101,
        'lords' => 101,
        'ni' => 5,
        'nidebates' => 5,
        'pbc' => 6,
        'standing' => 6,
        'sp' => 7,
        'spwrans' => 8,
        'spwran' => 8,
        'uk' => array(1, 2, 3, 4, 6, 101),
        'scotland' => array(7, 8),
        'future' => 'F',
    );

    $in_quote = false;
    $from = '';
    $to = '';
    foreach ($all_words as $word) {
        if ($word === '"') {
            $in_quote = !$in_quote;
            if ($in_quote) {
                array_push($this->phrases, array());
            }
            // A closing quote straight after an opening one leaves an empty phrase; drop it.
            if (!$in_quote && !count($this->phrases[count($this->phrases) - 1])) {
                array_pop($this->phrases);
            }
            continue;
        }
        if ($word === '') {
            continue;
        }

        if (strpos($word, ':') !== false) {
            $items = explode(":", strtolower($word));
            $type = $items[0];
            if (substr($type, 0, 1) === '-') {
                $type = substr($type, 1);
            }
            $value = strtolower(join(":", array_slice($items, 1)));
            if ($type === 'section') {
                // Unknown section names are passed through unchanged as the major value.
                $newv = isset($section_majors[$value]) ? $section_majors[$value] : $value;
                if (is_array($newv)) {
                    $newv = 'major:' . join(' major:', $newv);
                } else {
                    $newv = "major:{$newv}";
                }
                $this->query = str_ireplace("{$type}:{$value}", $newv, $this->query);
            } elseif ($type === 'groupby') {
                $newv = $value;
                if ($value === 'debates' || $value === 'debate') {
                    $newv = 'debate';
                }
                if ($value === 'speech' || $value === 'speeches') {
                    $newv = 'speech';
                }
                // groupby is handled by this class, not by Xapian: strip it from the query.
                $this->query = str_ireplace("{$type}:{$value}", '', $this->query);
                array_push($this->prefixed, array($type, $newv));
            } elseif ($type === 'from') {
                $from = $value;
            } elseif ($type === 'to') {
                $to = $value;
            }
        } elseif (strpos($word, '-') !== false) {
            // Words containing a minus are not recorded in $this->words.
        } elseif ($in_quote) {
            array_push($this->phrases[count($this->phrases) - 1], strtolower($word));
        } elseif (strpos($word, '..') !== false) {
            // Range expressions are not recorded in $this->words.
        } elseif ($word === 'OR' || $word === 'AND' || $word === 'XOR' || $word === 'NEAR') {
            // Boolean operators are not recorded in $this->words.
        } else {
            array_push($this->words, strtolower($word));
        }
    }

    // Turn from:/to: prefixes into a "start..end" range appended to the query.
    if ($from && $to) {
        $this->query = str_ireplace("from:{$from}", '', $this->query);
        $this->query = str_ireplace("to:{$to}", '', $this->query);
        $this->query .= " {$from}..{$to}";
    } elseif ($from) {
        $this->query = str_ireplace("from:{$from}", '', $this->query);
        $this->query .= " {$from}.." . date('Ymd');
    } elseif ($to) {
        $this->query = str_ireplace("to:{$to}", '', $this->query);
        $this->query .= " 19990101..{$to}";
    }

    # Merged people
    $db = new ParlDB();
    $merged = $db->query('SELECT * FROM gidredirect WHERE gid_from LIKE :gid_from', array(':gid_from' => "uk.org.publicwhip/person/%"));
    for ($n = 0; $n < $merged->rows(); $n++) {
        $from_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_from'));
        $to_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_to'));
        $this->query = preg_replace("#speaker:({$from_id}|{$to_id})#i", "(speaker:{$from_id} OR speaker:{$to_id})", $this->query);
    }

    twfy_debug("SEARCH", "prefixed: " . var_export($this->prefixed, true));
    twfy_debug("SEARCH", "query -- " . $this->query);

    $flags = XapianQueryParser::FLAG_BOOLEAN | XapianQueryParser::FLAG_LOVEHATE | XapianQueryParser::FLAG_WILDCARD | XapianQueryParser::FLAG_SPELLING_CORRECTION;
    $flags = $flags | XapianQueryParser::FLAG_PHRASE;
    try {
        $query = $this->queryparser->parse_query($this->query, $flags);
    } catch (Exception $e) {
        # Nothing we can really do with a bad query
        $this->error = _htmlspecialchars($e->getMessage());
        return null;
    }

    $this->enquire->set_query($query);

    # Now parse the parsed query back into a query string, yummy
    $qd = $query->get_description();
    twfy_debug("SEARCH", "queryparser original description -- " . $qd);
    $qd = substr($qd, 14, -1); # Strip Xapian::Query()
    $qd = preg_replace('#:\\(.*?\\)#', '', $qd); # Don't need pos or weight

    # Date range: show each YYYYMMDD bound as DD/MM/YYYY
    $qd = preg_replace_callback('#VALUE_RANGE 1 (\\d+) (\\d+)#', function ($match) {
        $ymd = '#(\\d{4})(\\d\\d)(\\d\\d)#';
        return preg_replace($ymd, '$3/$2/$1', $match[1]) . '..' . preg_replace($ymd, '$3/$2/$1', $match[2]);
    }, $qd);

    # Replace phrases with the phrase in quotes
    preg_match_all('#\\(([^(]*? PHRASE [^(]*?)\\)#', $qd, $m);
    foreach ($m[1] as $phrase) {
        $phrase_new = preg_replace('# PHRASE \\d+#', '', $phrase);
        $qd = str_replace("({$phrase})", '"' . $phrase_new . '"', $qd);
    }
    preg_match_all('#\\(([^(]*? NEAR [^(]*?)\\)#', $qd, $m);
    foreach ($m[1] as $mm) {
        $mmn = preg_replace('# NEAR \\d+ #', ' NEAR ', $mm);
        $qd = str_replace("({$mm})", "({$mmn})", $qd);
    }

    # Awesome regexes to get rid of superfluous matching brackets
    $qd = preg_replace('/( \\( ( (?: (?>[^ ()]+) | (?1) ) (?: [ ](?:AND|OR|XOR|FILTER|NEAR[ ]\\d+|PHRASE[ ]\\d+)[ ] (?: (?>[^ ()]+) | (?1) ) )* ) \\) ) [ ] (FILTER|AND_NOT)/x', '$2 $3', $qd);
    $qd = preg_replace('/(?:FILTER | 0 [ ] \\* ) [ ] ( \\( ( (?: (?>[^ ()]+) | (?1) ) (?: [ ](?:AND|OR|XOR)[ ] (?: (?>[^ ()]+) | (?1) ) )* ) \\) )/x', '$2', $qd);
    $qd = preg_replace('/(?:FILTER | 0 [ ] \\* ) [ ] ( [^()] )/x', '$1', $qd);
    $qd = str_replace('AND ', '', $qd); # AND is the default
    $qd = preg_replace('/^ ( \\( ( (?: (?>[^()]+) | (?1) )* ) \\) ) $/x', '$2', $qd);

    # Other prefixes
    $qd = preg_replace('#\\bU(\\d+)\\b#', 'segment:$1', $qd);
    $qd = preg_replace('#\\bC(\\d+)\\b#', 'column:$1', $qd);
    $qd = preg_replace('#\\bQ(.*?)\\b#', 'gid:$1', $qd);
    $qd = preg_replace_callback('#\\bP(.*?)\\b#', function ($match) use ($parties) {
        $party_key = ucfirst($match[1]);
        return 'party:' . (isset($parties[$party_key]) ? $parties[$party_key] : $match[1]);
    }, $qd);
    $qd = preg_replace('#\\bD(.*?)\\b#', 'date:$1', $qd);
    $qd = preg_replace('#\\bG(.*?)\\b#', 'department:$1', $qd); # XXX Lookup to show proper name of dept
    if (strstr($qd, 'M1 OR M2 OR M3 OR M4 OR M6 OR M101')) {
        $qd = str_replace('M1 OR M2 OR M3 OR M4 OR M6 OR M101', 'section:uk', $qd);
    } elseif (strstr($qd, 'M7 OR M8')) {
        $qd = str_replace('M7 OR M8', 'section:scotland', $qd);
    }
    # The quote is closed in both branches; the old /e expression left it open
    # when no title was known for the major.
    $qd = preg_replace_callback('#\\bM(\\d+)\\b#', function ($match) use ($hansardmajors) {
        $major = $match[1];
        $title = isset($hansardmajors[$major]["title"]) ? $hansardmajors[$major]["title"] : $major;
        return "in the '" . $title . "'";
    }, $qd);
    $qd = preg_replace('#\\bMF\\b#', 'in Future Business', $qd);

    # Replace stemmed things with their unstemmed terms from the query
    $used = array();
    preg_match_all('#Z[^\\s()]+#', $qd, $m);
    foreach ($m[0] as $mm) {
        $iter = $this->queryparser->unstem_begin($mm);
        $end = $this->queryparser->unstem_end($mm);
        // Reset per term so an empty iterator cannot reuse the previous term's value.
        $tt = null;
        while (!$iter->equals($end)) {
            $tt = $iter->get_term();
            if (!in_array($tt, $used)) {
                break;
            }
            $iter->next();
        }
        if ($tt === null) {
            // No unstemmed form available: leave the stemmed term as it is.
            continue;
        }
        $used[] = $tt;
        $qd = preg_replace('#' . preg_quote($mm, '#') . '#', $tt, $qd, 1);
    }

    # Speakers
    for ($n = 0; $n < $merged->rows(); $n++) {
        $from_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_from'));
        $to_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_to'));
        $qd = str_replace("(S{$from_id} OR S{$to_id})", "S{$to_id}", $qd);
        $qd = str_replace("S{$from_id} OR S{$to_id}", "S{$to_id}", $qd);
    }

    preg_match_all('#S(\\d+)#', $qd, $m);
    foreach ($m[1] as $mm) {
        $member = new MEMBER(array('person_id' => $mm));
        $name = iconv('iso-8859-1', 'utf-8//TRANSLIT', $member->full_name()); # Names are currently in ISO-8859-1
        $qd = str_replace("S{$mm}", "speaker:{$name}", $qd);
    }

    # Simplify display of excluded words
    $qd = preg_replace('#AND_NOT ([a-z0-9"]+)#', '-$1', $qd);
    preg_match_all('#AND_NOT \\((.*?)\\)#', $qd, $m);
    foreach ($m[1] as $mm) {
        $mmn = '-' . join(' -', explode(' OR ', $mm));
        $qd = str_replace("AND_NOT ({$mm})", $mmn, $qd);
    }

    foreach ($this->prefixed as $items) {
        if ($items[0] === 'groupby') {
            if ($items[1] === 'debate') {
                $qd .= ' grouped by debate';
            } elseif ($items[1] === 'speech') {
                $qd .= ' showing all speeches';
            } else {
                $PAGE->error_message("Unknown group by '{$items[1]}' ignored");
            }
        }
    }

    $qd = iconv('utf-8', 'iso-8859-1//TRANSLIT', $qd); # Xapian is UTF-8, site is ISO8859-1
    $this->query_desc = trim($qd);

    twfy_debug("SEARCH", "words: " . var_export($this->words, true));
    twfy_debug("SEARCH", "phrases: " . var_export($this->phrases, true));
    twfy_debug("SEARCH", "queryparser description -- " . $this->query_desc);

    $this->valid = true;
}
/**
 * Call the global remote_open() function and wrap a resource result in a XapianDatabase.
 *
 * When invoked with two or three arguments the global function is called with
 * ($host_or_program, $port_or_args, $timeout); otherwise $connect_timeout is
 * passed as well.
 *
 * @param mixed $host_or_program Forwarded unchanged to the global remote_open().
 * @param mixed $port_or_args    Forwarded unchanged to the global remote_open().
 * @param mixed $timeout         Forwarded unchanged; defaults to 10000.
 * @param mixed $connect_timeout Forwarded only when more than three arguments were given.
 *
 * @return mixed A XapianDatabase wrapping the result when it is a resource;
 *               otherwise the result itself, unchanged.
 */
static function remote_open($host_or_program, $port_or_args, $timeout = 10000, $connect_timeout = null) {
    $argc = func_num_args();
    if ($argc === 2 || $argc === 3) {
        $handle = remote_open($host_or_program, $port_or_args, $timeout);
    } else {
        $handle = remote_open($host_or_program, $port_or_args, $timeout, $connect_timeout);
    }

    if (is_resource($handle)) {
        return new XapianDatabase($handle);
    }

    return $handle;
}