/**
 * Exercises ousearch_document::split_words() in its three modes:
 * plain splitting (lower-cased words), query mode (second argument true)
 * and position mode (third argument true, which additionally returns offsets).
 *
 * All non-ASCII fixtures below are UTF-8 literals; this file must be saved
 * as UTF-8 for the byte-offset expectations to hold.
 */
function test_split_words() {
    // Standard usage and caps
    $this->assertEqual(
        ousearch_document::split_words('Hello I AM a basic test'),
        array('hello', 'i', 'am', 'a', 'basic', 'test'));

    // Numbers
    $this->assertEqual(
        ousearch_document::split_words('13 2by2'),
        array('13', '2by2'));

    // Ignored and accepted punctuation and whitespace
    $this->assertEqual(
        ousearch_document::split_words(' hello,testing!what\'s&up there-by '),
        array('hello', 'testing', 'what\'s', 'up', 'there', 'by'));

    // Unicode letters and nonletter
    $this->assertEqual(
        ousearch_document::split_words('café ßåřĉĕļōņä※tonight'),
        array('café', 'ßåřĉĕļōņä', 'tonight'));

    // Unicode caps
    $this->assertEqual(
        ousearch_document::split_words('ĀĒĪŌŪ'),
        array('āēīōū'));

    // Query mode (keeps " + -)
    $this->assertEqual(
        ousearch_document::split_words('"hello there" +frog -doughnut extra-special', true),
        array('"hello', 'there"', '+frog', '-doughnut', 'extra-special'));

    // Position mode: normal. The offsets list has one more entry than the
    // word list; the final entry is the length of the input.
    $this->assertEqual(
        ousearch_document::split_words('hello test', false, true),
        array(array('hello', 'test'), array(0, 6, 10)));

    // Position mode: whitespace (runs of four spaces before, between and after)
    $this->assertEqual(
        ousearch_document::split_words('    hello    test    ', false, true),
        array(array('hello', 'test'), array(4, 13, 21)));

    // Position mode: unicode. Positions are in bytes, and the two-byte
    // letter shifts every later offset by one.
    $this->assertEqual(
        ousearch_document::split_words('hĕllo tĕst', false, true),
        array(array('hĕllo', 'tĕst'), array(0, 7, 12)));
}
/**
 * Filters search results to pick out only the ones that match the query.
 *
 * For each candidate row the owning plugin is asked for the real document;
 * rows whose document is missing, hidden, fails a phrase term, contains a
 * negative phrase term, or is rejected by the user filter are dropped. The
 * remaining rows get ->title, ->summary (with highlighted words),
 * ->activityname, ->activityurl, ->url and optionally ->data filled in.
 *
 * @param array $results Array of results from internal_query
 * @param int $desired Number of desired results
 * @return object ->results containing actual results and ->dbnext containing
 *   database position of next set of results (the number of entries of
 *   $results that were examined before stopping).
 */
function internal_filter($results, $desired) {
    global $CFG;

    $required = array();
    $accepted = array();
    $count = 0;
    $return = new StdClass();
    $return->dbnext = 0;
    $tl = textlib_get_instance();

    foreach ($results as $result) {
        $return->dbnext++;

        // The marker must not leak from a previous result that had extra
        // strings into one that has none.
        $evilmarker = null;

        if (substr($result->plugin, 0, 4) === 'mod/') {
            // Module plugins
            $module = substr($result->plugin, 4);
            $function = $module . '_ousearch_get_document';
            if (!array_key_exists($module, $required)) {
                require_once $CFG->dirroot . '/mod/' . $module . '/lib.php';
                $required[$module] = true;
                if (!function_exists($function)) {
                    error('Missing module search support ' . $function);
                }
            }
        } else {
            if (substr($result->plugin, 0, 5) === 'test/') {
                // Testing code, assumed to already be included
                $function = substr($result->plugin, 5) . '_ousearch_get_document';
            } else {
                // Nothing else supported yet
                // NOTE(review): relies on error() not returning, otherwise
                // $function is undefined below - confirm.
                error('Unsupported search plugin type ' . $result->plugin);
            }
        }

        // Let's request the document. Note that the 'document' fields of
        // $result are those used by this function to find the right one.
        $page = $function($result);

        // Ignore if we can't find the document
        if (!$page) {
            debugging('Module ' . $result->plugin . ' can\'t find search document');
            ousearch_document::wipe_document($result->id);
            continue;
        }

        // Page option can request that this result is not included
        if (!empty($page->hide)) {
            continue;
        }

        // Strip XHTML from content (need this before phrase scan)
        $textcontent = ousearch_document::strip_xhtml($page->content);

        // Add extra strings to the content after a special don't-show-this
        // marker and with another special marker between each (to prevent
        // phrases)
        if (isset($page->extrastrings) && count($page->extrastrings) > 0) {
            $evilmarker = rand(); // This means people can't do it on purpose
            $textcontent .= ' xxrealcontentends' . $evilmarker;
            foreach ($page->extrastrings as $string) {
                $textcontent .= ' ' . $string . ' xxsplit' . $evilmarker;
            }
        }

        // Do quick phrase scan that doesn't do word-splitting but just
        // discards results that don't have the phrase words next to each
        // other without ASCII letters or digits in between. This is intended
        // to discard results that (fairly) definitely don't have the phrase.
        // The further check below will make sure they really do have it
        // according to our standard (slow) word-splitting.
        $quickcheckcontent = $page->title . ' ' . $textcontent;
        $ok = true;
        $gap = '[^A-Za-z0-9]+';
        foreach ($this->terms as $term) {
            if (count($term->words) < 2) {
                continue;
            }
            $quoted = array();
            foreach ($term->words as $word) {
                // Words are data, not regex syntax
                $quoted[] = preg_quote($word, '/');
            }
            // 'u' makes the case-insensitive match apply to non-ASCII
            // letters too, so that content differing from the (lower-cased)
            // query words only by case is not thrown away here.
            $pattern = '/(^|' . $gap . ')' . implode($gap, $quoted) .
                '($|' . $gap . ')/iu';
            // Only a definite "no match" (0) rejects the result. If the
            // regex engine reports an error (false, e.g. invalid UTF-8 in
            // the content) the decision is left to the slow check.
            if (preg_match($pattern, $quickcheckcontent) === 0) {
                $ok = false;
                break;
            }
        }
        if (!$ok) {
            continue;
        }

        // OK, obtain document as linear text
        list($contentwords, $contentpositions) =
            ousearch_document::split_words($textcontent, false, true);
        list($titlewords, $titlepositions) =
            ousearch_document::split_words($page->title, false, true);
        $allwords = array_merge($titlewords, $contentwords);

        // Check it for phrases, collecting every positive word on the way
        // so that they can be highlighted later
        $positivewords = array();
        $ok = true;
        foreach ($this->terms as $term) {
            foreach ($term->words as $word) {
                $positivewords[$word] = true;
            }
            if (count($term->words) < 2) {
                continue;
            }
            if (!self::internal_words_contain_phrase($allwords, $term->words)) {
                $ok = false;
                break;
            }
        }
        // Negative phrases must NOT be present
        foreach ($this->negativeterms as $term) {
            if (count($term->words) < 2) {
                continue;
            }
            if (self::internal_words_contain_phrase($allwords, $term->words)) {
                $ok = false;
                break;
            }
        }
        if (!$ok) {
            continue;
        }

        // Result passes! Make structure holding it...

        // We now have list of all positive words, let's mark these
        // in title and summary
        $result->title = self::internal_highlight_words($page->title,
            $titlewords, $titlepositions, $positivewords);

        // Strip searchable-but-not-displayable content for summary
        if (isset($evilmarker)) {
            $strippedwords = array();
            foreach ($contentwords as $word) {
                // Do not include extra strings in summary
                if ($word == 'xxrealcontentends' . $evilmarker) {
                    break;
                }
                $strippedwords[] = $word;
            }
            $contentwords = $strippedwords;
        }

        // Pick a section to include in the summary. This algorithm works as follows:
        // * Compute the 'score' (number of highlight words in the previous
        //   OUSEARCH_SUMMARYLENGTH words up to and including this one) at
        //   each position in the text
        // * Observe where the maximum score is reached and where it is lost.
        // * A nice range that contains the most highlight words in the middle of the
        //   range will end at ($maxstart + $maxlength/2).
        $highlights = array();
        $pos = 0;
        $currentscore = 0;
        $maxscore = -1;
        $maxstart = 0;
        $maxlength = 0;
        $run = true;
        foreach ($contentwords as $word) {
            // Drop the highlight that has just slid out of the window
            if (array_key_exists($pos - OUSEARCH_SUMMARYLENGTH, $highlights)) {
                unset($highlights[$pos - OUSEARCH_SUMMARYLENGTH]);
                $currentscore--;
            }
            if (array_key_exists($word, $positivewords)) {
                $highlights[$pos] = true;
                $currentscore++;
            }
            if ($currentscore > $maxscore) {
                $maxscore = $currentscore;
                $maxstart = $pos;
                $maxlength = 1;
                $run = true;
            } else {
                if ($currentscore === $maxscore && $run) {
                    $maxlength++;
                } else {
                    $run = false;
                }
            }
            $pos++;
        }

        // Keep $start an integer: it is used as an array index and compared
        // strictly against 0 below, so a fractional half-length would both
        // trigger float-key conversion and wrongly add a leading '...'.
        $start = $maxstart + (int) floor($maxlength / 2) - OUSEARCH_SUMMARYLENGTH;
        if ($start < 0) {
            $start = 0;
        }
        $end = $start + OUSEARCH_SUMMARYLENGTH;
        if ($end > count($contentwords)) {
            $end = count($contentwords);
        }

        // $contentpositions is in characters
        // NOTE(review): test_split_words expects split_words to return byte
        // offsets, while $tl->substr presumably counts characters - confirm
        // which unit applies for multi-byte content.
        $result->summary = $tl->substr($textcontent, $contentpositions[$start],
            $contentpositions[$end] - $contentpositions[$start]) .
            ($end < count($contentwords) ? '...' : '');
        $offset = -$contentpositions[$start];
        $result->summary = self::internal_highlight_words($result->summary,
            $contentwords, $contentpositions, $positivewords, $offset, $start, $end);
        if ($start !== 0) {
            $result->summary = '...' . $result->summary;
        }
        $result->summary = trim($result->summary);

        $result->activityname = $page->activityname;
        $result->activityurl = $page->activityurl;
        $result->url = $page->url;
        if (isset($page->data)) {
            $result->data = $page->data;
        }

        // Do user-specified filter if set
        if ($this->filter) {
            $filter = $this->filter;
            if (!$filter($result)) {
                continue;
            }
        }

        $accepted[] = $result;
        $count++;
        if ($count == $desired) {
            break;
        }
    }

    $return->results = $accepted;
    return $return;
}

/**
 * Tests whether a phrase occurs as a run of consecutive words in a word list.
 *
 * Every possible start position is tried, so a phrase is still found when
 * its first word is repeated just before it (e.g. 'a b' within 'a a b').
 * Words are compared with ===.
 *
 * @param array $words Words of the document, in order
 * @param array $phrase Words of the phrase, in order
 * @return bool True if all phrase words appear consecutively in $words;
 *   false otherwise, including when $phrase is empty
 */
private static function internal_words_contain_phrase($words, $phrase) {
    // Normalise keys so both arrays can be indexed from zero
    $words = array_values($words);
    $phrase = array_values($phrase);
    $phraselength = count($phrase);
    if ($phraselength === 0) {
        return false;
    }
    $laststart = count($words) - $phraselength;
    for ($start = 0; $start <= $laststart; $start++) {
        $matched = 0;
        while ($matched < $phraselength &&
                $words[$start + $matched] === $phrase[$matched]) {
            $matched++;
        }
        if ($matched === $phraselength) {
            return true;
        }
    }
    return false;
}