/**
 * Parse the user's search text into a search expression and record the
 * regular expressions used for result highlighting in $this->searchTerms.
 *
 * Each recognised term is appended (with its optional modifier) to the
 * expression; whitespace runs are then replaced with '&' and the result is
 * passed through $this->db->strencode().
 *
 * @param string $filteredText The search text to parse
 * @param mixed $fulltext Not referenced by this implementation
 * @return string The encoded search expression
 */
function parseQuery($filteredText, $fulltext) {
    global $wgContLang;
    $lc = SearchEngine::legalSearchChars();
    $searchon = '';
    $this->searchTerms = array();

    # FIXME: This doesn't handle parenthetical expressions.
    $m = array();
    if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\\*?)|"[^"]*")/', $filteredText, $m, PREG_SET_ORDER)) {
        foreach ($m as $terms) {
            if ($searchon !== '') {
                $searchon .= ' ';
            }
            if ($this->strictMatching && $terms[1] == '') {
                # No explicit modifier given: require the term.
                $terms[1] = '+';
            }
            $searchon .= $terms[1] . $wgContLang->stripForSearch($terms[2]);

            if (!empty($terms[3])) {
                # Bare word, optionally followed by a '*' wildcard.
                $regexp = preg_quote($terms[3], '/');
                if ($terms[4]) {
                    $regexp .= "[0-9A-Za-z_]+";
                }
            } else {
                # Quoted phrase: highlight it without the quotes.
                $regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
            }
            $this->searchTerms[] = $regexp;
        }
        wfDebug("Would search with '{$searchon}'\n");
        wfDebug("Match with /\\b" . implode('\\b|\\b', $this->searchTerms) . "\\b/\n");
    } else {
        # Report the text that was actually parsed (the parameter), not a
        # property of the same name.
        wfDebug("Can't understand search query '{$filteredText}'\n");
    }

    $searchon = preg_replace('/(\\s+)/', '&', $searchon);
    $searchon = $this->db->strencode($searchon);
    return $searchon;
}
/**
 * Build an SQL condition from the user's search text, one
 * MATCH (...) AGAINST ('...') clause per searchable word.
 *
 * The words 'and', 'or', 'not' and the parentheses are passed through as
 * upper-cased operators; consecutive search words are joined with AND.
 * Words shorter than $wgDBminWordLen or present in FulltextStoplist are
 * skipped. Highlighting regexes are collected in $this->searchTerms.
 *
 * @param string $filteredText The search text to parse
 * @param mixed $fulltext Passed to $this->getIndexField() to pick the field
 * @return string The parenthesised condition, or "0" when no searchable
 *   word remains
 */
function parseQuery($filteredText, $fulltext) {
    global $wgDBminWordLen, $wgContLang;

    # Start from a clean list so that terms left over from an earlier call
    # cannot make the "no searchable terms" check below pass by mistake.
    $this->searchTerms = array();

    $field = $this->getIndexField($fulltext);

    # on non mysql4 database: get list of words we don't want to search for
    require_once 'FulltextStoplist.php';

    # Isolate parentheses as separate tokens and collapse whitespace.
    $q = preg_replace("/([()])/", " \\1 ", $filteredText);
    $q = preg_replace("/\\s+/", " ", $q);
    $w = explode(' ', trim($q));

    $last = $cond = '';
    foreach ($w as $word) {
        $word = $wgContLang->stripForSearch($word);

        if ('and' == $word || 'or' == $word || 'not' == $word || '(' == $word || ')' == $word) {
            $cond .= ' ' . strtoupper($word);
            # An explicit operator was given; do not insert an implicit AND.
            $last = '';
            continue;
        }
        if (strlen($word) < $wgDBminWordLen) {
            continue;
        }
        if (FulltextStoplist::inList($word)) {
            continue;
        }

        if ('' != $last) {
            $cond .= ' AND';
        }
        $cond .= " (MATCH ({$field}) AGAINST ('" . $this->db->strencode($word) . "'))";
        $last = $word;
        $this->searchTerms[] = "\\b" . preg_quote($word, '/') . "\\b";
    }

    if (0 == count($this->searchTerms)) {
        # No searchable terms remaining.
        # We have to return a term for the query or we get an SQL error.
        return "0";
    }

    return '(' . $cond . ' )';
}
/**
 * Parse the user's search text and transform it into an SQL fragment of the
 * form MATCH(<field>) AGAINST('<terms>' IN BOOLEAN MODE).
 *
 * Highlighting regexes, each already wrapped in \b...\b, are collected in
 * $this->searchTerms.
 *
 * @param string $filteredText The search text to parse
 * @param mixed $fulltext Passed to $this->getIndexField() to pick the field
 * @return string SQL fragment
 */
function parseQuery($filteredText, $fulltext) {
    global $wgContLang;
    $lc = SearchEngine::legalSearchChars(); // Minus format chars
    $searchon = '';
    $this->searchTerms = array();

    # FIXME: This doesn't handle parenthetical expressions.
    $m = array();
    if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\\*?)|"[^"]*")/', $filteredText, $m, PREG_SET_ORDER)) {
        foreach ($m as $terms) {
            if ($searchon !== '') {
                $searchon .= ' ';
            }
            if ($this->strictMatching && $terms[1] == '') {
                # No explicit modifier given: require the term.
                $terms[1] = '+';
            }
            $searchon .= $terms[1] . $wgContLang->stripForSearch($terms[2]);

            if (!empty($terms[3])) {
                // Match individual terms in result highlighting...
                $regexp = preg_quote($terms[3], '/');
                if ($terms[4]) {
                    $regexp .= "[0-9A-Za-z_]+";
                }
            } else {
                // Match the quoted term in result highlighting...
                $regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
            }
            $this->searchTerms[] = "\\b{$regexp}\\b";
        }
        wfDebug("Would search with '{$searchon}'\n");
        # The stored terms already carry their \b anchors, so join them
        # as-is; adding the anchors again would misreport the regex.
        wfDebug('Match with /' . implode('|', $this->searchTerms) . "/\n");
    } else {
        wfDebug("Can't understand search query '{$filteredText}'\n");
    }

    $searchon = $this->db->strencode($searchon);
    $field = $this->getIndexField($fulltext);
    return " MATCH({$field}) AGAINST('{$searchon}' IN BOOLEAN MODE) ";
}
/**
 * Parse the user's query and transform it into an SQL fragment which will
 * become part of a WHERE clause
 *
 * Highlighting regexes produced by $this->regexTerm() are collected in
 * $this->searchTerms.
 *
 * @param $filteredText string
 * @param $fulltext string
 *
 * @return string
 */
function parseQuery($filteredText, $fulltext) {
    global $wgContLang;
    $lc = SearchEngine::legalSearchChars(); // Minus format chars
    $searchon = '';
    $this->searchTerms = array();

    # @todo FIXME: This doesn't handle parenthetical expressions.
    $m = array();
    if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\\*?)|"[^"]*")/', $filteredText, $m, PREG_SET_ORDER)) {
        foreach ($m as $bits) {
            // A quoted phrase leaves the last two capture groups unset, so
            // pad the match explicitly instead of silencing the notices.
            list(, $modifier, $term, $nonQuoted, $wildcard) = array_pad($bits, 5, null);

            if ($nonQuoted != '') {
                $term = $nonQuoted;
                $quote = '';
            } else {
                $term = str_replace('"', '', $term);
                $quote = '"';
            }

            if ($searchon !== '') {
                $searchon .= ' ';
            }
            if ($this->strictMatching && $modifier == '') {
                // If we leave this out, boolean op defaults to OR which is rarely helpful.
                $modifier = '+';
            }

            // Some languages such as Serbian store the input form in the search index,
            // so we may need to search for matches in multiple writing system variants.
            $convertedVariants = $wgContLang->autoConvertToAllVariants($term);
            if (is_array($convertedVariants)) {
                $variants = array_unique(array_values($convertedVariants));
            } else {
                $variants = array($term);
            }

            // The low-level search index does some processing on input to work
            // around problems with minimum lengths and encoding in MySQL's
            // fulltext engine.
            // For Chinese this also inserts spaces between adjacent Han characters.
            $strippedVariants = array_map(array($wgContLang, 'normalizeForSearch'), $variants);

            // Some languages such as Chinese force all variants to a canonical
            // form when stripping to the low-level search index, so to be sure
            // let's check our variants list for unique items after stripping.
            $strippedVariants = array_unique($strippedVariants);

            $searchon .= $modifier;
            if (count($strippedVariants) > 1) {
                $searchon .= '(';
            }
            foreach ($strippedVariants as $stripped) {
                $stripped = $this->normalizeText($stripped);
                if ($nonQuoted && strpos($stripped, ' ') !== false) {
                    // Hack for Chinese: we need to toss in quotes for
                    // multiple-character phrases since normalizeForSearch()
                    // added spaces between them to make word breaks.
                    $stripped = '"' . trim($stripped) . '"';
                }
                $searchon .= "{$quote}{$stripped}{$quote}{$wildcard} ";
            }
            if (count($strippedVariants) > 1) {
                $searchon .= ')';
            }

            // Match individual terms or quoted phrase in result highlighting...
            // Note that variants will be introduced in a later stage for highlighting!
            $regexp = $this->regexTerm($term, $wildcard);
            $this->searchTerms[] = $regexp;
        }
        wfDebug(__METHOD__ . ": Would search with '{$searchon}'\n");
        wfDebug(__METHOD__ . ': Match with /' . implode('|', $this->searchTerms) . "/\n");
    } else {
        wfDebug(__METHOD__ . ": Can't understand search query '{$filteredText}'\n");
    }

    $searchon = $this->db->strencode($searchon);
    $field = $this->getIndexField($fulltext);
    return " MATCH({$field}) AGAINST('{$searchon}' IN BOOLEAN MODE) ";
}
/**
 * Characters accepted in search terms: the parent's set with the
 * double-quote character prepended.
 *
 * @return string
 */
public static function legalSearchChars() {
    $inherited = parent::legalSearchChars();

    return '"' . $inherited;
}
/**
 * Turn the user's search text into a CONTAINS(...) SQL condition for use in
 * a WHERE clause.
 *
 * Every matched term (in each language variant when the content language
 * returns a variant array) is escaped with $this->escapeTerm() and prefixed
 * with ' & ', or with ' ~' when the term carries a '-' modifier.
 * Highlighting regexes are collected in $this->searchTerms.
 *
 * @param string $filteredText The search text to parse
 * @param mixed $fulltext Passed to $this->getIndexField() to pick the field
 * @return string
 */
function parseQuery($filteredText, $fulltext) {
    global $wgContLang;

    $legalChars = SearchEngine::legalSearchChars();
    $this->searchTerms = array();

    # @todo FIXME: This doesn't handle parenthetical expressions.
    $matches = array();
    $expression = '';
    $pattern = '/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/';

    if (preg_match_all($pattern, $filteredText, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            $operator = $match[1] == '-' ? ' ~' : ' & ';

            // Search terms in all variant forms; only wikis with a
            // LanguageConverter return an array here.
            $variants = $wgContLang->autoConvertToAllVariants($match[2]);
            if (is_array($variants)) {
                $variants = array_unique(array_values($variants));
            } else {
                $variants = array($match[2]);
            }
            foreach ($variants as $variant) {
                $expression .= $operator . $this->escapeTerm($variant);
            }

            // Regex used for highlighting this term in results.
            if (empty($match[3])) {
                // Quoted phrase: drop the quotes.
                $highlight = preg_quote(str_replace('"', '', $match[2]), '/');
            } else {
                $highlight = preg_quote($match[3], '/');
                if ($match[4]) {
                    // Trailing '*' wildcard.
                    $highlight .= "[0-9A-Za-z_]+";
                }
            }
            $this->searchTerms[] = $highlight;
        }
    }

    $quoted = $this->db->addQuotes(ltrim($expression, ' &'));
    $indexField = $this->getIndexField($fulltext);

    return " CONTAINS({$indexField}, {$quoted}, 1) > 0 ";
}
/**
 * Build the normalized form of this object's title for storage in a
 * search index.
 *
 * @param SearchEngine $search Supplies the legal character set and the
 *   final normalizeText() pass
 * @return string A stripped-down title string ready for the search index
 */
private function getNormalizedTitle(SearchEngine $search) {
    global $wgContLang;

    $namespace = $this->title->getNamespace();
    $rawTitle = $this->title->getText();

    $legal = $search->legalSearchChars() . '&#;';

    $text = $wgContLang->normalizeForSearch($rawTitle);
    // Anything outside the legal set becomes a separator.
    $text = preg_replace("/[^{$legal}]+/", ' ', $text);
    $text = $wgContLang->lc($text);

    // Applied in order: possessive 's, plural possessive s', then
    // whitespace collapsing.
    $rewrites = array(
        "/([{$legal}]+)'s( |\$)/" => "\\1 \\1's ",
        "/([{$legal}]+)s'( |\$)/" => "\\1s ",
        "/\\s+/" => ' ',
    );
    foreach ($rewrites as $pattern => $replacement) {
        $text = preg_replace($pattern, $replacement, $text);
    }

    if ($ns = ($namespace == NS_FILE)) {
        // Drop a trailing file-type word from file titles.
        $text = preg_replace("/ (png|gif|jpg|jpeg|ogg)\$/", "", $text);
    }

    return $search->normalizeText(trim($text));
}
/**
 * Produce the search-index form of a page title.
 *
 * @param int $ns a namespace index
 * @param string $title text-form main part
 * @return string a stripped-down title string ready for the
 *   search index
 */
public static function indexTitle($ns, $title) {
    global $wgContLang;

    $legal = SearchEngine::legalSearchChars() . '&#;';

    $stripped = $wgContLang->stripForSearch($title);
    // Anything outside the legal set becomes a separator.
    $separated = preg_replace("/[^{$legal}]+/", ' ', $stripped);
    $lowered = $wgContLang->lc($separated);

    # Handle 's, s'
    $possessive = preg_replace("/([{$legal}]+)'s( |\$)/", "\\1 \\1's ", $lowered);
    $plural = preg_replace("/([{$legal}]+)s'( |\$)/", "\\1s ", $possessive);

    $result = preg_replace("/\\s+/", ' ', $plural);

    if ($ns != NS_IMAGE) {
        return trim($result);
    }

    // Image titles: drop a trailing file-type word.
    return trim(preg_replace("/ (png|gif|jpg|jpeg|ogg)\$/", "", $result));
}
/**
 * Turn the user's search text into a case-insensitive LIKE condition.
 *
 * Every matched term (in each language variant when the content language
 * returns a variant array) is normalized with normalizeForSearch(), prefixed
 * with its modifier, and the pieces are joined with commas. Highlighting
 * regexes are collected in $this->searchTerms.
 *
 * @param string $filteredText The search text to parse
 * @param mixed $fulltext Passed to $this->getIndexField() to pick the field
 * @return string SQL condition
 */
function parseQuery($filteredText, $fulltext) {
    global $wgContLang;

    $legalChars = SearchEngine::legalSearchChars();
    $this->searchTerms = array();

    # @todo FIXME: This doesn't handle parenthetical expressions.
    $matches = array();
    $pieces = array();
    $pattern = '/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/';

    if (preg_match_all($pattern, $filteredText, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // Search terms in all variant forms; only wikis with a
            // LanguageConverter return an array here.
            $variants = $wgContLang->autoConvertToAllVariants($match[2]);
            if (is_array($variants)) {
                $variants = array_unique(array_values($variants));
            } else {
                $variants = array($match[2]);
            }
            foreach ($variants as $variant) {
                $pieces[] = $match[1] . $wgContLang->normalizeForSearch($variant);
            }

            // Regex used for highlighting this term in results.
            if (empty($match[3])) {
                // Quoted phrase: drop the quotes.
                $highlight = preg_quote(str_replace('"', '', $match[2]), '/');
            } else {
                $highlight = preg_quote($match[3], '/');
                if ($match[4]) {
                    // Trailing '*' wildcard.
                    $highlight .= "[0-9A-Za-z_]+";
                }
            }
            $this->searchTerms[] = $highlight;
        }
    }

    $encoded = $this->db->strencode(implode(',', $pieces));
    $indexField = $this->getIndexField($fulltext);

    // A CONTAINS($field, '$searchon') > 0 condition would require
    // Net Search Extender or equivalent, so LIKE is used instead.
    return " lcase({$indexField}) LIKE lcase('%{$encoded}%')";
}
/**
 * Parse the user's search text into a "<field>, '<terms>'" string.
 *
 * Each matched term is normalized with normalizeForSearch(), prefixed with
 * its modifier, and the pieces are joined with commas. Highlighting regexes
 * are collected in $this->searchTerms.
 *
 * @param string $filteredText The search text to parse
 * @param mixed $fulltext Passed to $this->getIndexField() to pick the field
 * @return string
 */
function parseQuery($filteredText, $fulltext) {
    global $wgContLang;

    $legalChars = SearchEngine::legalSearchChars();
    $this->searchTerms = array();

    # @todo FIXME: This doesn't handle parenthetical expressions.
    $matches = array();
    $pieces = array();
    $pattern = '/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/';

    if (preg_match_all($pattern, $filteredText, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            $pieces[] = $match[1] . $wgContLang->normalizeForSearch($match[2]);

            // Regex used for highlighting this term in results.
            if (empty($match[3])) {
                // Quoted phrase: drop the quotes.
                $highlight = preg_quote(str_replace('"', '', $match[2]), '/');
            } else {
                $highlight = preg_quote($match[3], '/');
                if ($match[4]) {
                    // Trailing '*' wildcard.
                    $highlight .= "[0-9A-Za-z_]+";
                }
            }
            $this->searchTerms[] = $highlight;
        }
    }

    $encoded = $this->db->strencode(implode(',', $pieces));
    $indexField = $this->getIndexField($fulltext);

    return "{$indexField}, '{$encoded}'";
}
/**
 * Push this page's title, and its text when available, to the search engine.
 *
 * When $this->mText is false only the title is updated. Otherwise the text
 * is reduced by a fixed sequence of regex passes (markup, URLs, image
 * links, possessives, quote markup) before the 'SearchUpdate' hook runs and
 * the engine's update() is called. The order of the passes matters: each
 * one works on the output of the previous one.
 *
 * @return bool|null false when $wgDisableSearchUpdate is set or $this->mId
 *   is empty; otherwise nothing is returned
 */
function doUpdate() {
    global $wgContLang, $wgDisableSearchUpdate;
    if ($wgDisableSearchUpdate || !$this->mId) {
        return false;
    }
    wfProfileIn(__METHOD__);
    $search = SearchEngine::create();
    # Legal search characters plus the characters that make up HTML-style
    # entities (&, #, ;)
    $lc = SearchEngine::legalSearchChars() . '&#;';
    if ($this->mText === false) {
        # No text supplied: update only the title
        $search->updateTitle($this->mId, $search->normalizeText(Title::indexTitle($this->mNamespace, $this->mTitle)));
        wfProfileOut(__METHOD__);
        return;
    }
    # Language-specific strip/conversion
    $text = $wgContLang->normalizeForSearch($this->mText);
    wfProfileIn(__METHOD__ . '-regexps');
    # Strip HTML markup (the text is lower-cased and padded with a space on
    # each side first)
    $text = preg_replace("/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $wgContLang->lc(" " . $text . " "));
    # Emphasize headings: the text of a "== heading ==" line is repeated
    # three times
    $text = preg_replace("/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD", "\\1\\2 \\2 \\2\\3", $text);
    # Strip external URLs
    # $uc: characters treated as part of a URL; $protos: recognised schemes
    $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
    $protos = "http|https|ftp|mailto|news|gopher";
    # Bare URL not preceded by '['
    $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|\$)/";
    $text = preg_replace($pat, "\\1 \\3", $text);
    # Bracketed URL without a label ($p1) and with a label ($p2); for $p2
    # the label text is kept
    $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
    $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
    $text = preg_replace($p1, "\\1 ", $text);
    $text = preg_replace($p2, "\\1 \\3 ", $text);
    # Internal image links: keep the name, drop "[[image:" and the extension
    $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
    $text = preg_replace($pat2, " \\1 \\3", $text);
    # Handle [[game]]s: index both "game" and "games"
    $text = preg_replace("/([^{$lc}])([{$lc}]+)]]([a-z]+)/", "\\1\\2 \\2\\3", $text);
    # Strip all remaining non-search characters
    $text = preg_replace("/[^{$lc}]+/", " ", $text);
    # Handle 's, s'
    #
    #   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
    #   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
    #
    # These tail-anchored regexps are insanely slow. The worst case comes
    # when Japanese or Chinese text (ie, no word spacing) is written on
    # a wiki configured for Western UTF-8 mode. The Unicode characters are
    # expanded to hex codes and the "words" are very long paragraph-length
    # monstrosities. On a large page the above regexps may take over 20
    # seconds *each* on a 1GHz-level processor.
    #
    # Following are reversed versions which are consistently fast
    # (about 3 milliseconds on 1GHz-level processor).
    #
    # The text is reversed, matched with the mirrored pattern so the
    # apostrophe suffix comes first, and reversed back.
    $text = strrev(preg_replace("/ s'([{$lc}]+)/", " s'\\1 \\1", strrev($text)));
    $text = strrev(preg_replace("/ 's([{$lc}]+)/", " s\\1", strrev($text)));
    # Strip wiki '' and '''
    $text = preg_replace("/''[']*/", " ", $text);
    wfProfileOut(__METHOD__ . '-regexps');
    # $text is passed by reference, so hook handlers can modify it
    wfRunHooks('SearchUpdate', array($this->mId, $this->mNamespace, $this->mTitle, &$text));
    # Perform the actual update
    $search->update($this->mId, $search->normalizeText(Title::indexTitle($this->mNamespace, $this->mTitle)), $search->normalizeText($text));
    wfProfileOut(__METHOD__);
}
/**
 * Build the list of regular expressions used to highlight the terms of
 * $this->mQuery in search results.
 *
 * Bracketed "[...]:" prefixes and the generic "all:" prefix are blanked out
 * before the query is split into terms.
 *
 * @return array Regex fragments, one per recognised term (empty when the
 *   query could not be parsed)
 */
function termMatches() {
    // Blank out "[...]:" prefixes first, then the generic "all:" prefix.
    $query = preg_replace(array("/\\[.*?\\]:/", "/all:/"), " ", $this->mQuery);

    // Fixme: this is ripped from SearchMySQL and probably kind of sucks,
    // but it handles quoted phrase searches more or less correctly.
    // Should encapsulate this stuff better.
    // FIXME: This doesn't handle parenthetical expressions.
    $highlights = array();
    $matches = array();
    $legalChars = SearchEngine::legalSearchChars();
    $pattern = '/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/';

    if (!preg_match_all($pattern, $query, $matches, PREG_SET_ORDER)) {
        wfDebug("Can't understand search query '{$query}'\n");
        return $highlights;
    }

    foreach ($matches as $match) {
        if (empty($match[3])) {
            // Quoted phrase: highlight it without the quotes.
            $highlights[] = preg_quote(str_replace('"', '', $match[2]), '/');
            continue;
        }
        // Individual term, optionally followed by a '*' wildcard.
        $fragment = preg_quote($match[3], '/');
        if ($match[4]) {
            $fragment .= "[0-9A-Za-z_]+";
        }
        $highlights[] = $fragment;
    }
    wfDebug(__METHOD__ . ': Match with /' . implode('|', $highlights) . "/\n");

    return $highlights;
}