/**
 * Test that highlighting a single search word wraps the match in the
 * visible link text while leaving the link's title attribute untouched.
 *
 * @group xapian
 */
public function testSearchLink()
{
    // The word "Test" appears both in the title attribute and in the link text.
    $input = '<a href="/mp/?m=40584" title="Our page on Mr Test - \'the Member for Birmingham (Mr Test)\'">Mr Test</a>';

    // Only the occurrence in the link text should gain the highlight span.
    $expected = '<a href="/mp/?m=40584" title="Our page on Mr Test - \'the Member for Birmingham (Mr Test)\'">Mr <span class="hi">Test</span></a>';

    $engine = new SEARCHENGINE('test');
    $highlighted = $engine->highlight($input);

    $this->assertEquals($expected, $highlighted);
}
/**
 * Test that highlighting a quoted phrase only touches visible text: the
 * occurrences of the phrase inside a link's title attribute must be left
 * as they are.
 *
 * @group xapian
 */
public function testSearchPhraseHighlightingInTags()
{
    // Source markup: "Shabana" occurs twice in a title attribute and once in link text.
    $input = '<p pid="b.893.4/1">On a point of order, Mr <a href="/glossary/?gl=21" title="The Speaker is an MP who has been elected to act as Chairman during debates..." class="glossary">Speaker</a>. In yesterday’s Finance Bill debate, <a href="/mp/?m=40084" title="Our page on Shabana Mahmood - \'the hon. Member for Birmingham, Ladywood (Shabana Mahmood)\'">Shabana Mahmood</a> said that the tax gap was 32 billion when the previous Government left office and that it has now gone up to 35 billion. Official Her Majesty’s Revenue and Customs figures show the tax gap was actually 42 billion when Labour left office, so there has been a fall of 7 billion under this Government';

    // Same markup with the highlight span around the link-text occurrence only.
    $wanted = '<p pid="b.893.4/1">On a point of order, Mr <a href="/glossary/?gl=21" title="The Speaker is an MP who has been elected to act as Chairman during debates..." class="glossary">Speaker</a>. In yesterday’s Finance Bill debate, <a href="/mp/?m=40084" title="Our page on Shabana Mahmood - \'the hon. Member for Birmingham, Ladywood (Shabana Mahmood)\'"><span class="hi">Shabana</span> Mahmood</a> said that the tax gap was 32 billion when the previous Government left office and that it has now gone up to 35 billion. Official Her Majesty’s Revenue and Customs figures show the tax gap was actually 42 billion when Labour left office, so there has been a fall of 7 billion under this Government';

    $engine = new SEARCHENGINE('"Shabana"');

    $this->assertEquals($wanted, $engine->highlight($input));
}
/**
 * Build the view data for a single section or speech.
 *
 * Asks $this->list for the rows matching the requested gid, converts
 * <phrase> markup in each body into links, applies glossary links and
 * search-term highlighting, decorates every speech row (htype 12/13) with
 * voting, annotation, mention and video data, and finally fills in the
 * page-level fields (heading, intro, location, alert text, dates, links).
 *
 * Sends a 404 header and exits when the list returns no 'info' entry. A
 * \RedirectException from the list is turned into a call to redirect().
 *
 * @param array $args Optional pre-supplied 'gid', 's' and 'member_id'
 *                    values; anything missing is read from the request.
 *
 * @return array The data array returned by the list, with the extra keys
 *               described above added.
 */
protected function display_section_or_speech($args = array())
{
    global $DATA, $this_page, $THEUSER;

    # += as we *don't* want to override any already supplied argument
    $args += array(
        'gid' => get_http_var('id'),
        's' => get_http_var('s'),
        'member_id' => get_http_var('m'),
    );

    if (preg_match('/speaker:(\\d+)/', get_http_var('s'), $mmm)) {
        $args['person_id'] = $mmm[1];
    }

    try {
        $data = $this->list->display('gid', $args, 'none');
    } catch (\RedirectException $e) {
        $URL = new \URL($this->major_data['page_all']);
        if ($this->major == 6) {
            # Magically (as in I can't remember quite why), pbc_clause will
            # contain the new URL without any change...
            $URL->remove(array('id'));
        } else {
            $URL->insert(array('id' => $e->getMessage()));
        }
        # put the search term back in so highlighting works.
        # NB: as we don't see the # part of the URL we lose this :(
        if ($args['s'] !== '') {
            $URL->insert(array('s' => $args['s']));
        }
        # NOTE(review): presumably redirect() never returns; if it did,
        # $data would be undefined below - confirm.
        redirect($URL->generate('none'));
    }

    $data['individual_item'] = $this->list->commentspage == $this_page;

    if ($data['individual_item']) {
        $COMMENTLIST = new \COMMENTLIST();
        $args['user_id'] = get_http_var('u');
        $args['epobject_id'] = $this->list->epobject_id();
        $data['comments']['object'] = $COMMENTLIST;
        $data['comments']['args'] = $args;
        $data['comments']['commentdata'] = array(
            'epobject_id' => $this->list->epobject_id(),
            'gid' => get_http_var('id'),
            'return_page' => $this_page,
        );
    }

    if (!isset($data['info'])) {
        header("HTTP/1.0 404 Not Found");
        exit; # XXX
    }

    # Okay, let's set up highlighting and glossarisation
    $SEARCHENGINE = null;
    if (isset($data['info']['searchstring']) && $data['info']['searchstring'] != '') {
        $SEARCHENGINE = new \SEARCHENGINE($data['info']['searchstring']);
    }

    // Before we print the body text we need to insert glossary links
    // and highlight search string words.
    $speeches = 0;
    $bodies = array();
    foreach ($data['rows'] as $row) {
        $htype = $row['htype'];
        if ($htype == 12 || $htype == 13) {
            $speeches++;
        }
        $body = $row['body'];

        // "hon. Friend"-style references become links to the member's page.
        $body = preg_replace(
            '#<phrase class="honfriend" id="uk.org.publicwhip/member/(\\d+)" name="([^"]*?)">(.*?\\s*\\((.*?)\\))</phrase>#',
            '<a href="/mp/?m=$1" title="Our page on $2 - \'$3\'">$4</a>',
            $body
        );

        // Official Report references become date/column/section searches.
        $body = preg_replace_callback(
            '#<phrase class="offrep" id="(.*?)/(\\d+)-(\\d+)-(\\d+)\\.(.*?)">(.*?)</phrase>#',
            function ($matches) {
                return '<a href="/search/?pop=1&s=date:' . $matches[2] . $matches[3] . $matches[4]
                    . '+column:' . $matches[5] . '+section:' . $matches[1] . '">'
                    . str_replace("Official Report", "Hansard", $matches[6]) . '</a>';
            },
            $body
        );

        $bodies[] = $body;
    }

    // Do all this unless the glossary is turned off in the URL
    if (get_http_var('ug') != 1) {
        // And glossary phrases
        twfy_debug_timestamp('Before glossarise');

        $args['sort'] = "regexp_replace";
        $GLOSSARY = new \GLOSSARY($args);
        $bodies = $GLOSSARY->glossarise($bodies, 1);

        twfy_debug_timestamp('After glossarise');
    }

    if ($SEARCHENGINE) {
        // We have some search terms to highlight.
        twfy_debug_timestamp('Before highlight');
        $bodies = $SEARCHENGINE->highlight($bodies);
        twfy_debug_timestamp('After highlight');
    }

    $first_speech = null;
    $data['section_title'] = '';
    $subsection_title = '';

    // hpos of the most recent heading row; 0 until a heading has been seen.
    // This must live outside the loop: it is written on heading rows but
    // only read on the speech rows that follow, so resetting it on every
    // iteration would always hand 0 to get_video_html().
    $heading_hpos = 0;

    $row_count = count($data['rows']);
    for ($i = 0; $i < $row_count; $i++) {
        $row = $data['rows'][$i];
        $htype = $row['htype'];

        if ($htype == 10) {
            $data['section_title'] = $row['body'];
            $heading_hpos = $row['hpos'];
        } elseif ($htype == 11) {
            $subsection_title = $row['body'];
            $heading_hpos = $row['hpos'];
        } elseif ($htype == 12) {
            # Splitting out highlighting results back into individual bits
            $data['rows'][$i]['body'] = $bodies[$i];
        }

        if ($htype == 12 || $htype == 13) {
            if (!$first_speech) {
                $first_speech = $data['rows'][$i];
            }

            # Voting links
            $data['rows'][$i]['voting_data'] = '';
            if (isset($row['votes'])) {
                $data['rows'][$i]['voting_data'] = $this->generate_votes($row['votes'], $row['epobject_id'], $row['gid']);
            }

            # Annotation link
            if ($this->is_debate_section_page()) {
                // Build the 'Add an annotation' link.
                if (!$THEUSER->isloggedin()) {
                    $URL = new \URL('userprompt');
                    $URL->insert(array('ret' => $row['commentsurl']));
                    $data['rows'][$i]['annotation_url'] = $URL->generate();
                } else {
                    $data['rows'][$i]['annotation_url'] = $row['commentsurl'];
                }

                $data['rows'][$i]['commentteaser'] = $this->generate_commentteaser($row);
            }

            if (isset($row['mentions'])) {
                $data['rows'][$i]['mentions'] = $this->get_question_mentions_html($row['mentions']);
            }

            if ($this->major == 1) {
                $data['rows'][$i]['video'] = $this->get_video_html($row, $heading_hpos, $speeches);
            }
        }
    }

    if ($subsection_title) {
        $data['heading'] = $subsection_title;
    } else {
        $data['heading'] = $data['section_title'];
    }

    if ($subsection_title) {
        $data['intro'] = "{$data['section_title']}";
    } else {
        $data['intro'] = "";
    }

    $country = 'UK';
    if ($this->major == 1) {
        $data['location'] = '– in the House of Commons';
    } elseif ($this->major == 2) {
        $data['location'] = '– in Westminster Hall';
    } elseif ($this->major == 3) {
        $data['location'] = 'written question – answered';
    } elseif ($this->major == 4) {
        $data['location'] = 'written statement – made';
    } elseif ($this->major == 5) {
        $country = 'NORTHERN IRELAND';
        $data['location'] = '– in the Northern Ireland Assembly';
    } elseif ($this->major == 6) {
        $data['location'] = '– in a Public Bill Committee';
    } elseif ($this->major == 7) {
        $country = 'SCOTLAND';
        $data['location'] = '– in the Scottish Parliament';
    } elseif ($this->major == 8) {
        $country = 'SCOTLAND';
        $data['location'] = '– Scottish Parliament written question – answered';
    } elseif ($this->major == 101) {
        $data['location'] = '– in the House of Lords';
    }

    $data['current_assembly'] = "westminster--debate";
    switch ($country) {
        case "UK":
            $data['current_assembly'] = "westminster--debate";
            break;
        case "SCOTLAND":
            $data['current_assembly'] = "scotland";
            break;
        case "NORTHERN IRELAND":
            $data['current_assembly'] = "ni";
            break;
    }

    if (array_key_exists('text_heading', $data['info'])) {
        // avoid having Clause 1 etc as the alert text search string on PBC pages as it's
        // almost certainly not what the person wants
        if ($this->major == 6) {
            $data['email_alert_text'] = $data['section_title'];
        } else {
            $data['email_alert_text'] = $data['info']['text_heading'];
        }
    } else {
        // The user has requested only part of a debate, so find a suitable title
        if ($subsection_title) {
            $data['intro'] = "Part of {$data['section_title']}";
        } else {
            $data['intro'] = "Part of the debate";
        }
        foreach ($data['rows'] as $row) {
            if ($row['htype'] == 10 || $row['htype'] == 11) {
                $data['email_alert_text'] = $row['body'];
                $data['full_debate_url'] = $row['listurl'];
                break;
            }
        }
    }

    // A partial debate with no heading row leaves the alert text unset;
    // default it so the clean-up below never reads an undefined index.
    if (!isset($data['email_alert_text'])) {
        $data['email_alert_text'] = '';
    }

    // strip a couple of common characters that result in encode junk in the
    // search string
    $data['email_alert_text'] = preg_replace('/(?:[:()\\[\\]]|&#\\d+;)/', '', $data['email_alert_text']);

    // With no speech rows at all $first_speech is still null; pass null on
    // explicitly rather than indexing into null.
    $first_htime = $first_speech ? $first_speech['htime'] : null;
    $first_hdate = $first_speech ? $first_speech['hdate'] : null;

    $data['debate_time_human'] = format_time($first_htime, 'g:i a');
    $data['debate_day_human'] = format_date($first_hdate, 'jS F Y');

    $URL = new \URL($this->list->listpage);
    $URL->insert(array('d' => $first_hdate));
    $URL->remove(array('id'));
    $data['debate_day_link'] = $URL->generate();

    $data['nextprev'] = $DATA->page_metadata($this_page, 'nextprev');

    return $data;
}
// and highlight search string words. $bodies = array(); foreach ($data['rows'] as $row) { $bodies[] = $row['body']; } if (isset($data['info']['glossarise']) && $data['info']['glossarise']) { // And glossary phrases twfy_debug_timestamp('Before glossarise'); $bodies = $GLOSSARY->glossarise($bodies, $data['info']['glossarise']); twfy_debug_timestamp('After glossarise'); } if ($SEARCHENGINE) { // We have some search terms to highlight. twfy_debug_timestamp('Before highlight'); $bodies = $SEARCHENGINE->highlight($bodies); twfy_debug_timestamp('After highlight'); } if (isset($data['info']['glossarise']) && ($data['info']['glossarise'] == 1)) { // Now we replace the title attributes for the glossarised links // to avoid words being highlighted within them. twfy_debug_timestamp('Before glossarise_titletags'); $bodies = $GLOSSARY->glossarise_titletags($bodies, 1); twfy_debug_timestamp('After glossarise_titletags'); } $speeches = 0; for ($i=0; $i<count($data['rows']); $i++) { if ($data['rows'][$i]['htype'] == 12) $data['rows'][$i]['body'] = $bodies[$i]; if ($data['rows'][$i]['htype'] == 12 || $data['rows'][$i]['htype'] == 13)
/**
 * Produce the display-ready body for every row and count the speeches.
 *
 * Each row body has its <phrase> markup turned into links; the whole list
 * is then passed through search highlighting (when the data carries a
 * non-empty search string) and through the glossary (unless the 'ug'
 * request variable is 1), in that order.
 *
 * @param array $data Needs a 'rows' list whose entries have 'htype' and
 *                    'body'; $data['info']['searchstring'] is optional.
 *
 * @return array Two elements: the processed bodies in row order, and the
 *               number of rows whose htype is 12 or 13.
 */
private function highlightSpeeches($data)
{
    $search_string = isset($data['info']['searchstring']) ? $data['info']['searchstring'] : '';
    $highlighter = null;
    if ($search_string != '') {
        $highlighter = new \SEARCHENGINE($search_string);
    }

    // Member references come in two attribute layouts (member id first, or
    // name first with a person id); both become links to the MP page. The
    // patterns are applied one after the other, in this order.
    $honfriend_patterns = array(
        '#<phrase class="honfriend" id="uk.org.publicwhip/member/(\\d+)" name="([^"]*?)">(.*?\\s*\\((.*?)\\))</phrase>#',
        '#<phrase class="honfriend" name="([^"]*?)" person_id="uk.org.publicwhip/person/(\\d+)">(.*?\\s*\\((.*?)\\))</phrase>#',
    );
    $honfriend_links = array(
        '<a href="/mp/?m=$1" title="Our page on $2 - \'$3\'">$4</a>',
        '<a href="/mp/?p=$2" title="Our page on $1 - \'$3\'">$4</a>',
    );

    // Official Report references become a search on date, column and section.
    $offrep_pattern = '#<phrase class="offrep" id="(.*?)/(\\d+)-(\\d+)-(\\d+)\\.(.*?)">(.*?)</phrase>#';
    $offrep_link = function ($matches) {
        $date = $matches[2] . $matches[3] . $matches[4];
        $label = str_replace("Official Report", "Hansard", $matches[6]);

        return '<a href="/search/?pop=1&s=date:' . $date . '+column:' . $matches[5] . '+section:' . $matches[1] . '">' . $label . '</a>';
    };

    $speeches = 0;
    $bodies = array();
    foreach ($data['rows'] as $row) {
        $is_speech = ($row['htype'] == 12 || $row['htype'] == 13);
        $speeches += $is_speech ? 1 : 0;

        $linked = preg_replace($honfriend_patterns, $honfriend_links, $row['body']);
        $bodies[] = preg_replace_callback($offrep_pattern, $offrep_link, $linked);
    }

    if ($highlighter) {
        // We have some search terms to highlight.
        twfy_debug_timestamp('Before highlight');
        $bodies = $highlighter->highlight($bodies);
        twfy_debug_timestamp('After highlight');
    }

    // Do all this unless the glossary is turned off in the URL
    if (get_http_var('ug') != 1) {
        twfy_debug_timestamp('Before glossarise');

        // The glossary is only handed the sort mode here.
        $glossary_args = array('sort' => "regexp_replace");
        $glossary = new \GLOSSARY($glossary_args);
        $bodies = $glossary->glossarise($bodies, 1);

        twfy_debug_timestamp('After glossarise');
    }

    return array($bodies, $speeches);
}