/**
 * Filters search results to pick out only the ones that match the query.
 *
 * For every raw result this asks the owning plugin's
 * <name>_ousearch_get_document function for the document, then discards
 * the result if the document cannot be found (the index entry is wiped
 * via local_ousearch_document::wipe_document), is flagged ->hide, lacks
 * one of the multi-word phrases in $this->terms, or contains one of the
 * multi-word phrases in $this->negativeterms. Surviving results get
 * ->title, ->summary, ->activityname, ->activityurl, ->url and
 * (optionally) ->data filled in, and are finally passed through
 * $this->filter if one is set.
 *
 * @param array $results Array of results from internal_query
 * @param int $desired Number of desired results
 * @return object ->results containing actual results and ->dbnext
 *   containing database position of next set of results.
 * @throws coding_exception If the results contain unsupported plugin types
 */
private function internal_filter($results, $desired) {
    global $CFG;
    $required = array();
    $accepted = array();
    $count = 0;
    $return = new StdClass();
    $return->dbnext = 0;
    foreach ($results as $result) {
        // Count every examined row (accepted or not) so the caller knows
        // where to resume in the database result set.
        $return->dbnext++;

        if (substr($result->plugin, 0, 4) === 'mod_') {
            // Module plugins.
            $module = substr($result->plugin, 4);
            $function = $module . '_ousearch_get_document';
            if (!array_key_exists($module, $required)) {
                // First time we see this module: load its library once and
                // verify that it actually supports search.
                require_once $CFG->dirroot . '/mod/' . $module . '/lib.php';
                $required[$module] = true;
                if (!function_exists($function)) {
                    throw new coding_exception('Missing module search support ' . $function,
                            'Module is not searchable. Needs function ' . $function .
                            '. See local/ousearch/doc/usage.html.');
                }
            }
        } else {
            if (substr($result->plugin, 0, 5) === 'test_') {
                // Testing code, assumed to already be included.
                $function = substr($result->plugin, 5) . '_ousearch_get_document';
            } else {
                // Nothing else supported yet.
                throw new coding_exception('Unsupported search plugin type ' . $result->plugin,
                        'OU search only currently works for modules');
            }
        }

        // Let's request the document. Note that the 'document' fields of
        // $result are those used by this function to find the right one.
        $page = $function($result);

        // Ignore if we can't find the document.
        if (!$page) {
            // Output debug warning.
            debugging('Module ' . $result->plugin .
                    ' can\'t find search document, removing from results');
            $searchdoc = new local_ousearch_document();
            $searchdoc->wipe_document($result->id);
            continue;
        }

        // Page option can request that this result is not included.
        if (!empty($page->hide)) {
            continue;
        }

        // Strip XHTML from content (need this before phrase scan).
        $textcontent = local_ousearch_document::strip_xhtml($page->content);

        // Add extra strings to the content after a special don't-show-this
        // marker and with another special marker between each (to prevent
        // phrases). The marker is reset for every result so that a marker
        // from an earlier document is never applied to this one.
        $evilmarker = null;
        if (isset($page->extrastrings) && count($page->extrastrings) > 0) {
            $evilmarker = rand(); // This means people can't do it on purpose.
            $textcontent .= ' xxrealcontentends' . $evilmarker;
            foreach ($page->extrastrings as $string) {
                $textcontent .= ' ' . $string . ' xxsplit' . $evilmarker;
            }
        }

        // Do quick phrase scan that doesn't deal with Unicode,
        // or word-splitting but just discards results that
        // don't have the phrase words next to each other without
        // ASCII letters in between. This is intended to discard
        // results that (fairly) definitely don't have the phrase.
        // The further check below will make sure they really do
        // have it according to our standard (slow) word-splitting.
        $quickcheckcontent = $page->title . ' ' . $textcontent;
        $gap = '[^A-Za-z0-9]+';
        $ok = true;
        foreach ($this->terms as $term) {
            if (count($term->words) < 2) {
                continue;
            }
            // Escape each word so that regex metacharacters (or the '/'
            // delimiter) inside a search word cannot corrupt the pattern.
            $quotedwords = array();
            foreach ($term->words as $word) {
                $quotedwords[] = preg_quote($word, '/');
            }
            $pattern = '/(^|' . $gap . ')' . implode($gap, $quotedwords) .
                    '($|' . $gap . ')/i';
            if (!preg_match($pattern, $quickcheckcontent)) {
                $ok = false;
                break;
            }
        }
        if (!$ok) {
            continue;
        }

        // OK, obtain document as linear text.
        list($contentwords, $contentpositions) =
                local_ousearch_document::split_words($textcontent, false, true);
        list($titlewords, $titlepositions) =
                local_ousearch_document::split_words($page->title, false, true);
        $allwords = array_merge($titlewords, $contentwords);

        // Check it for phrases. Every multi-word positive term must occur
        // as a run of consecutive words; single words are only collected
        // for highlighting.
        $positivewords = array();
        foreach ($this->terms as $term) {
            foreach ($term->words as $word) {
                $positivewords[$word] = true;
            }
            if (count($term->words) < 2) {
                continue;
            }
            if (!self::internal_words_contain_phrase($allwords, $term->words)) {
                $ok = false;
                break;
            }
        }
        if ($ok) {
            // No multi-word negative term may occur as a phrase.
            foreach ($this->negativeterms as $term) {
                if (count($term->words) < 2) {
                    continue;
                }
                if (self::internal_words_contain_phrase($allwords, $term->words)) {
                    $ok = false;
                    break;
                }
            }
        }
        if (!$ok) {
            continue;
        }

        // Result passes! Make structure holding it...

        // We now have list of all positive words, let's mark these
        // in title and summary.
        $result->title = self::internal_highlight_words($page->title, $titlewords,
                $titlepositions, $positivewords);

        // Strip searchable-but-not-displayable content for summary.
        if ($evilmarker !== null) {
            $strippedwords = array();
            foreach ($contentwords as $word) {
                // Do not include extra strings in summary.
                if ($word === 'xxrealcontentends' . $evilmarker) {
                    break;
                }
                $strippedwords[] = $word;
            }
            $contentwords = $strippedwords;
        }

        // Pick a section to include in the summary. This algorithm works as follows:
        // * Compute the 'score' (number of highlight words in the previous
        //   SUMMARY_LENGTH words up to and including this one) at each
        //   position in the text.
        // * Observe where the maximum score is reached and where it is lost.
        // * A nice range that contains the most highlight words in the middle of the
        //   range will end at ($maxstart + $maxlength/2).
        $highlights = array();
        $pos = 0;
        $currentscore = 0;
        $maxscore = -1;
        $maxstart = 0;
        $maxlength = 0;
        $run = true;
        foreach ($contentwords as $word) {
            // Drop the highlight that has just slid out of the window.
            if (array_key_exists($pos - self::SUMMARY_LENGTH, $highlights)) {
                unset($highlights[$pos - self::SUMMARY_LENGTH]);
                $currentscore--;
            }
            if (array_key_exists($word, $positivewords)) {
                $highlights[$pos] = true;
                $currentscore++;
            }
            if ($currentscore > $maxscore) {
                // New best score: start measuring how long it is held.
                $maxscore = $currentscore;
                $maxstart = $pos;
                $maxlength = 1;
                $run = true;
            } else {
                if ($currentscore === $maxscore && $run) {
                    $maxlength++;
                } else {
                    $run = false;
                }
            }
            $pos++;
        }

        // Use integer arithmetic: a fractional $start would be used as an
        // array key below and would also make the "$start !== 0" test
        // true for a summary that really begins at the first word.
        $start = $maxstart + (int)($maxlength / 2) - self::SUMMARY_LENGTH;
        if ($start < 0) {
            $start = 0;
        }
        $end = $start + self::SUMMARY_LENGTH;
        if ($end > count($contentwords)) {
            $end = count($contentwords);
        }

        // The $contentpositions is in characters.
        // NOTE(review): $end may equal count($contentwords), so this assumes
        // split_words returns a position entry past the last word - confirm.
        $result->summary = core_text::substr($textcontent,
                $contentpositions[$start],
                $contentpositions[$end] - $contentpositions[$start]) .
                ($end < count($contentwords) ? '...' : '');

        // Highlighting works on positions relative to the summary string.
        $offset = -$contentpositions[$start];
        $result->summary = self::internal_highlight_words($result->summary, $contentwords,
                $contentpositions, $positivewords, $offset, $start, $end);
        if ($start !== 0) {
            $result->summary = '...' . $result->summary;
        }
        $result->summary = trim($result->summary);

        $result->activityname = $page->activityname;
        $result->activityurl = $page->activityurl;
        $result->url = $page->url;
        if (isset($page->data)) {
            $result->data = $page->data;
        }

        // Do user-specified filter if set.
        if ($this->filter) {
            $filter = $this->filter;
            if (!$filter($result)) {
                continue;
            }
        }

        $accepted[] = $result;
        $count++;
        if ($count == $desired) {
            break;
        }
    }

    $return->results = $accepted;
    return $return;
}

/**
 * Checks whether a list of words contains a phrase as a run of
 * consecutive, strictly equal words.
 *
 * Every possible start position is tried, so overlapping partial matches
 * (e.g. phrase "a b" inside "a a b") are found.
 *
 * @param array $allwords Words of the document, in order
 * @param array $phrasewords Words of the phrase, in order
 * @return bool True if the phrase occurs; false if not or if it is empty
 */
private static function internal_words_contain_phrase($allwords, $phrasewords) {
    $allwords = array_values($allwords);
    $phrasewords = array_values($phrasewords);
    $phraselength = count($phrasewords);
    if ($phraselength === 0) {
        return false;
    }
    $laststart = count($allwords) - $phraselength;
    for ($start = 0; $start <= $laststart; $start++) {
        $matched = 0;
        while ($matched < $phraselength &&
                $allwords[$start + $matched] === $phrasewords[$matched]) {
            $matched++;
        }
        if ($matched === $phraselength) {
            return true;
        }
    }
    return false;
}