function _get_data_by_search($args) {
    // Builds the $data structure for a page of search results (the original
    // notes say it will probably be rendered by the hansard_search.php
    // template).
    //
    // $args is an associative array:
    //   's'         => the search string. Required. It should already have
    //                  been tidied up by the caller, eg with
    //                  filter_user_input($s, 'strict').
    //   'p'         => (optional) page number of results to show. Anything
    //                  that is not a number >= 1 is treated as page 1.
    //   'num'       => (optional) results per page, mainly for glossary term
    //                  adding. Defaults to 20, capped at 1000.
    //   'pop'       => (optional) 1 if this came from a "popular" search
    //                  link, in which case the query is not logged.
    //   'e'         => (optional) if set, list URLs are built with 'url'
    //                  encoding rather than 'html'.
    //   'o'         => (optional) sort order: 'd' = date (the default),
    //                  'c' = created, 'r' = relevance.
    //   'threshold' => (optional) only consulted when sorting by created:
    //                  processing stops at the first result whose created
    //                  value is below it.
    //
    // The query itself is not passed to $SEARCHENGINE here; the global is
    // used as it has already been set up.
    //
    // Returns false (after reporting the error through $PAGE) if there is no
    // search string, otherwise an array with the keys 'info',
    // 'searchdescription' and 'rows'.

    global $PAGE, $hansardmajors, $SEARCHENGINE;

    if (!isset($args['s'])) {
        $PAGE->error_message("No search string");
        return false;
    }
    $searchstring = $args['s'];

    // What we'll return.
    $data = array();
    $data['info']['s'] = $searchstring;

    // How many results per page? Non-numeric or non-positive values fall
    // back to the default so they can't produce a nonsensical offset below.
    $results_per_page = 20;
    if (isset($args['num']) && is_numeric($args['num']) && $args['num'] >= 1) {
        $results_per_page = min((int) $args['num'], 1000);
    }
    $data['info']['results_per_page'] = $results_per_page;

    // What page are we on? Pages below 1 would give a negative offset.
    $page = 1;
    if (isset($args['p']) && is_numeric($args['p']) && $args['p'] >= 1) {
        $page = (int) $args['p'];
    }
    $data['info']['page'] = $page;

    $encode = isset($args['e']) ? 'url' : 'html';

    // Fetch count of number of matches.
    $data['searchdescription'] = $SEARCHENGINE->query_description_long();
    $count = $SEARCHENGINE->run_count();
    $data['info']['total_results'] = $count;

    // Log this query so we can improve them - if it wasn't a "popular
    // query" link.
    if (!isset($args['pop']) or $args['pop'] != 1) {
        global $SEARCHLOG;
        $SEARCHLOG->add(array('query' => $searchstring, 'page' => $page, 'hits' => $count));
    }

    // No results.
    if ($count <= 0) {
        $data['rows'] = array();
        return $data;
    }

    // For Xapian's equivalent of an SQL LIMIT clause.
    $first_result = ($page - 1) * $results_per_page;
    $data['info']['first_result'] = $first_result + 1; // Take account of LIMIT's 0 base.

    // Get the gids from Xapian.
    $sort_order = 'date';
    if (isset($args['o'])) {
        if ($args['o'] == 'd') {
            $sort_order = 'date';
        } elseif ($args['o'] == 'c') {
            $sort_order = 'created';
        } elseif ($args['o'] == 'r') {
            $sort_order = 'relevance';
        }
    }

    $SEARCHENGINE->run_search($first_result, $results_per_page, $sort_order);
    $gids = $SEARCHENGINE->get_gids();
    if ($sort_order == 'created') {
        $createds = $SEARCHENGINE->get_createds();
    }
    $relevances = $SEARCHENGINE->get_relevances();

    $num_gids = count($gids);
    if ($num_gids <= 0) {
        // No results.
        $data['rows'] = array();
        return $data;
    }

    // We'll put all the data in here before giving it to a template.
    $rows = array();

    // Cycle through each result, munge the data, get more, and put it all in $data.
    // $gids, $relevances and $createds are parallel arrays, hence the index loop.
    for ($n = 0; $n < $num_gids; $n++) {
        $gid = $gids[$n];

        // Without a threshold there is nothing to compare against, so the
        // cut-off only applies when the caller supplied one.
        if ($sort_order == 'created' && isset($args['threshold'])) {
            // The created value is whatever precedes the first ':'.
            $created = substr($createds[$n], 0, strpos($createds[$n], ':'));
            if ($created < $args['threshold']) {
                $data['info']['total_results'] = $n;
                break;
            }
        }

        // Get the data for the gid from the database.
        // NOTE(review): $gid is interpolated straight into the SQL. It comes
        // from $SEARCHENGINE->get_gids() rather than directly from the
        // request - confirm those values are safe, or switch to whatever
        // escaping/binding $this->db provides.
        $q = $this->db->query("SELECT hansard.gid,\n hansard.hdate,\n hansard.section_id,\n hansard.subsection_id,\n hansard.htype,\n hansard.major,\n hansard.speaker_id,\n\t\t\t\t hansard.hpos,\n epobject.body\n FROM hansard, epobject\n WHERE hansard.gid = '{$gid}'\n AND hansard.epobject_id = epobject.epobject_id");

        if ($q->rows() > 1) {
            $PAGE->error_message("Got more than one row getting data for {$gid}");
        }
        if ($q->rows() == 0) {
            # An "unexpected missing gid" error here is totally spurious,
            # so the result is silently skipped.
            continue;
        }

        $itemdata = array();
        $itemdata['gid'] = fix_gid_from_db($q->field(0, 'gid'));
        $itemdata['hdate'] = $q->field(0, 'hdate');
        $itemdata['htype'] = $q->field(0, 'htype');
        $itemdata['major'] = $q->field(0, 'major');
        $itemdata['section_id'] = $q->field(0, 'section_id');
        $itemdata['subsection_id'] = $q->field(0, 'subsection_id');
        $itemdata['relevance'] = $relevances[$n];
        $itemdata['speaker_id'] = $q->field(0, 'speaker_id');
        $itemdata['hpos'] = $q->field(0, 'hpos');

        //////////////////////////
        // 1. Trim and highlight the body text.

        // We want to trim the body to an extract that is centered
        // around the position of the first search word.
        // We don't use strip_tags as it doesn't replace tags with spaces,
        // which means some words end up stuck together.
        $extract = strip_tags_tospaces($q->field(0, 'body'));

        // $bestpos is the position of the first search word.
        $bestpos = $SEARCHENGINE->position_of_first_word($extract);

        // Where do we want the extract to start?
        $length_of_extract = 400;
        $startpos = $bestpos - $length_of_extract / 2;
        if ($startpos < 0) {
            $startpos = 0;
        }

        // Trim it to length and position, adding ellipses.
        $extract = trim_characters($extract, $startpos, $length_of_extract);

        // Highlight search words.
        $itemdata['body'] = $SEARCHENGINE->highlight($extract);

        //////////////////////////
        // 2. Create the URL to link to this bit of text.

        $id_data = array(
            'major' => $itemdata['major'],
            'htype' => $itemdata['htype'],
            'gid' => $itemdata['gid'],
            'section_id' => $itemdata['section_id'],
            'subsection_id' => $itemdata['subsection_id']
        );

        // We append the query onto the end of the URL as variable 's'
        // so we can highlight them on the debate/wrans list page.
        $url_args = array('s' => $searchstring);

        $itemdata['listurl'] = $this->_get_listurl($id_data, $url_args, $encode);

        //////////////////////////
        // 3. Get the speaker for this item, if applicable.
        if ($itemdata['speaker_id'] != 0) {
            $itemdata['speaker'] = $this->_get_speaker($itemdata['speaker_id'], $itemdata['hdate']);
        }

        //////////////////////////
        // 4. Get data about the parent (sub)section.
        // TODO: CHECK THIS for major==4
        if ($itemdata['major'] && $hansardmajors[$itemdata['major']]['type'] == 'debate') {
            // Debate
            if ($itemdata['htype'] != 10) {
                $section = $this->_get_section($itemdata);
                $itemdata['parent']['body'] = $section['body'];
                if ($itemdata['section_id'] != $itemdata['subsection_id']) {
                    $subsection = $this->_get_subsection($itemdata);
                    $itemdata['parent']['body'] .= ': ' . $subsection['body'];
                }
                if ($itemdata['major'] == 5) {
                    $itemdata['parent']['body'] = 'NIA: ' . $itemdata['parent']['body'];
                }
            } else {
                // It's a section, so it will be its own title.
                $itemdata['parent']['body'] = $itemdata['body'];
                $itemdata['body'] = '';
            }
        } else {
            // Wrans or WMS
            $section = $this->_get_section($itemdata);
            $subsection = $this->_get_subsection($itemdata);

            $parent_body = $hansardmajors[$itemdata['major']]['title'] . ' — ';
            if (isset($section['body'])) {
                $parent_body .= $section['body'];
            }
            if (isset($subsection['body'])) {
                $parent_body .= ': ' . $subsection['body'];
            }

            $listurl = isset($subsection['listurl']) ? $subsection['listurl'] : '';

            $itemdata['parent'] = array('body' => $parent_body, 'listurl' => $listurl);
        }

        // Add this item's data onto the main array we'll be returning.
        $rows[] = $itemdata;
    }

    $data['rows'] = $rows;

    return $data;
}
function trim_characters($text, $start, $length) {
    // Cuts $text down to a plain-text extract.
    //
    // $text   - the string to trim. HTML is always stripped first (via
    //           strip_tags_tospaces()), so trimming can't leave broken tags.
    // $start  - offset to start from, handled as a byte offset (as substr()
    //           did) but never splitting a multibyte character. If > 0 the
    //           text is cut there, advanced to the next word boundary and
    //           prefixed with '...'.
    // $length - if the text is then longer than this many characters it is
    //           cut to $length - 3 characters, backed up to a word boundary
    //           and suffixed with '...'.
    //
    // The text is handled as UTF-8. If it is not valid UTF-8 the function
    // degrades to byte-wise matching where it can, rather than discarding
    // the text.

    $text = strip_tags_tospaces($text);

    // Split long strings up so they don't go too long.
    // Mainly for URLs which are displayed, but aren't links when trimmed.
    # http://bugs.php.net/bug.php?id=42298 for why \S is repeated 60 times
    # rather than written as \S{60}.
    $split = preg_replace('/' . str_repeat('\\S', 60) . '/u', '$0 ', $text);
    if ($split !== null) {
        // preg_replace() returns null when the subject isn't valid UTF-8;
        // in that case keep the unsplit text instead of losing all of it.
        $text = $split;
    }
    $text = rtrim($text);

    // Otherwise the word boundary matching goes odd...
    $text = preg_replace("/[\n\r]/", " ", $text);

    // Trim start.
    if ($start > 0) {
        // mb_strcut() takes a byte offset like substr(), but if the offset
        // falls inside a multibyte character it starts from that character's
        // first byte, so the result is still valid UTF-8.
        $text = mb_strcut($text, (int) $start, null, 'UTF-8');

        // Word boundary. The /u flag makes \b treat accented letters as word
        // characters, so we don't restart in the middle of a word.
        $found = preg_match('/.+?\b(.*)/u', $text, $matches);
        if ($found === false) {
            // Not valid UTF-8: fall back to byte-wise matching.
            $found = preg_match('/.+?\b(.*)/', $text, $matches);
        }
        if ($found) {
            // Strip spare space at the start.
            $text = ltrim($matches[1]);
        }
        $text = '...' . $text;
    }

    // Trim end.
    if (mb_strlen($text, 'UTF-8') > $length) {

        // Allow space for ellipsis.
        $text = mb_substr($text, 0, $length - 3, 'UTF-8');

        // Word boundary.
        if (preg_match('/(.*)\b.+/u', $text, $matches)) {
            // Strip spare space at the end.
            $text = rtrim($matches[1]);
        }
        // We don't want to use the HTML entity for an ellipsis (…), because then
        // it screws up when we subsequently use htmlentities() to print the returned
        // string!
        $text .= '...';
    }

    return $text;
}
function prepare_search_result_for_display($body) {
    // Turns the body of a search hit into a short extract for display:
    // tags are replaced by spaces, the text is trimmed to a window around
    // the first search word, and the search words are highlighted.
    // Returns the extract as produced by $SEARCHENGINE->highlight().
    global $SEARCHENGINE;

    // Length of the extract handed to trim_characters().
    $extract_length = 400;

    // strip_tags() isn't used because it doesn't put spaces where the tags
    // were, which leaves some words stuck together.
    $plain_text = strip_tags_tospaces($body);

    // Centre the window on the first search word, without letting it begin
    // before the start of the text.
    $window_start = $SEARCHENGINE->position_of_first_word($plain_text) - $extract_length / 2;
    $window_start = ($window_start < 0) ? 0 : $window_start;

    // Trim to position and length (adding ellipses), then highlight.
    $trimmed = trim_characters($plain_text, $window_start, $extract_length);

    return $SEARCHENGINE->highlight($trimmed);
}
function trim_characters($text, $start, $length, $url_length = 60) {
    // Pass it a string, a numeric start position and a numeric length.
    // If the start position is > 0, the string is cut there and then trimmed
    // to start at the next word boundary after that position.
    // If the string is then longer than $length, it is trimmed back to the
    // nearest whitespace below that length.
    // If either end is trimmed, ellipses ('...') are added; the end trim
    // leaves room for them, so the result is at most $length long.
    // HTML is always stripped (must be for trimming to prevent broken tags).
    //
    // $url_length is the length of unbroken non-space text (mainly URLs)
    // after which a space is inserted. Values below 1 disable the splitting.
    //
    // Positions and lengths are measured in bytes (substr()/strlen()).
    // NOTE(review): if $text is UTF-8, a cut can therefore land inside a
    // multibyte character - confirm the encoding callers pass in.

    $text = strip_tags_tospaces($text);

    // Split long strings up so they don't go too long.
    // Mainly for URLs which are displayed, but aren't links when trimmed.
    // The length goes straight into the pattern, so force it to an integer
    // and skip the split for values that would break or degenerate the regex.
    $url_length = (int) $url_length;
    if ($url_length > 0) {
        $text = preg_replace('/(\\S{' . $url_length . '})/', "\$1 ", $text);
    }

    // Otherwise the word boundary matching goes odd...
    $text = preg_replace("/[\n\r]/", " ", $text);

    // Trim start.
    if ($start > 0) {
        $text = substr($text, (int) $start);

        // Word boundary.
        if (preg_match("/.+?\\b(.*)/", $text, $matches)) {
            // Strip all spare space at the start, not just one character.
            $text = preg_replace("/^\\s+/", '', $matches[1]);
        }
        $text = '...' . $text;
    }

    // Trim end.
    if (strlen($text) > $length) {

        // Allow space for ellipsis.
        $text = substr($text, 0, $length - 3);

        // Word boundary.
        if (preg_match("/(.*)\\s.+/", $text, $matches)) {
            // Strip all spare space at the end, not just one character.
            $text = preg_replace("/\\s+\$/", '', $matches[1]);
        }
        // We don't want to use the HTML entity for an ellipsis (…), because then
        // it screws up when we subsequently use htmlentities() to print the returned
        // string!
        $text .= '...';
    }

    return $text;
}