Beispiel #1
0
 /**
  * Parse the user's query into a search expression string.
  *
  * Every term matched in $filteredText is run through
  * $wgContLang->stripForSearch() and prefixed with its modifier; whitespace
  * runs in the assembled expression are then replaced by '&'. A regex
  * fragment for each term is collected in $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Not used by this implementation
  *
  * @return string The expression after $this->db->strencode()
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgContLang;
     $lc = SearchEngine::legalSearchChars();
     $searchon = '';
     $this->searchTerms = array();
     $m = array();
     # FIXME: This doesn't handle parenthetical expressions.
     if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\\*?)|"[^"]*")/', $filteredText, $m, PREG_SET_ORDER)) {
         foreach ($m as $terms) {
             if ($searchon !== '') {
                 $searchon .= ' ';
             }
             if ($this->strictMatching && $terms[1] == '') {
                 $terms[1] = '+';
             }
             $searchon .= $terms[1] . $wgContLang->stripForSearch($terms[2]);
             if (!empty($terms[3])) {
                 // Bare word; a trailing '*' wildcard widens the regex.
                 $regexp = preg_quote($terms[3], '/');
                 if ($terms[4]) {
                     $regexp .= "[0-9A-Za-z_]+";
                 }
             } else {
                 // Quoted phrase: drop the quotes and match it literally.
                 $regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
             }
             $this->searchTerms[] = $regexp;
         }
         wfDebug("Would search with '{$searchon}'\n");
         wfDebug("Match with /\\b" . implode('\\b|\\b', $this->searchTerms) . "\\b/\n");
     } else {
         // Log the text that was actually parsed (the parameter), not a
         // property that this method never assigns.
         wfDebug("Can't understand search query '{$filteredText}'\n");
     }
     // Join the terms with '&' in place of whitespace.
     $searchon = preg_replace('/(\\s+)/', '&', $searchon);
     $searchon = $this->db->strencode($searchon);
     return $searchon;
 }
 /**
  * Parse the user's query into a parenthesised chain of
  * MATCH ... AGAINST conditions, one per searchable word.
  *
  * The tokens and, or, not, ( and ) are passed through as upper-cased
  * operators. Any other word is dropped when it is shorter than
  * $wgDBminWordLen or is reported by FulltextStoplist::inList(); two
  * consecutive kept words are joined with AND. A word-boundary regex for
  * each kept word is collected in $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Passed to getIndexField() to choose the column
  *
  * @return string "0" when no searchable word remains, otherwise the condition
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgDBminWordLen, $wgContLang;
     // Start from a clean list so terms from an earlier call cannot leak into
     // this one, and so count() below always sees an array.
     $this->searchTerms = array();
     $field = $this->getIndexField($fulltext);
     # on non mysql4 database: get list of words we don't want to search for
     require_once 'FulltextStoplist.php';
     // Give parentheses their own tokens, then split on single spaces.
     $q = preg_replace("/([()])/", " \\1 ", $filteredText);
     $q = preg_replace("/\\s+/", " ", $q);
     $w = explode(' ', trim($q));
     $last = $cond = '';
     foreach ($w as $word) {
         $word = $wgContLang->stripForSearch($word);
         if ('and' == $word || 'or' == $word || 'not' == $word || '(' == $word || ')' == $word) {
             $cond .= ' ' . strtoupper($word);
             // An explicit operator suppresses the implicit AND before the next word.
             $last = '';
             continue;
         }
         if (strlen($word) < $wgDBminWordLen || FulltextStoplist::inList($word)) {
             continue;
         }
         if ('' != $last) {
             $cond .= ' AND';
         }
         $cond .= " (MATCH ({$field}) AGAINST ('" . $this->db->strencode($word) . "'))";
         $last = $word;
         $this->searchTerms[] = "\\b" . preg_quote($word, '/') . "\\b";
     }
     if (0 == count($this->searchTerms)) {
         # No searchable terms remaining.
         # We have to return a term for the query or we get an SQL error.
         return "0";
     }
     return '(' . $cond . ' )';
 }
 /**
  * Turn the user's query into a boolean-mode MATCH ... AGAINST clause.
  *
  * Each matched term keeps its modifier (or receives '+' when
  * $this->strictMatching is set and no modifier was given) and is passed
  * through $wgContLang->stripForSearch(). A word-boundary regex for each
  * term is stored in $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Passed to getIndexField() to choose the column
  *
  * @return string SQL fragment
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgContLang;
     // Minus format chars
     $legalChars = SearchEngine::legalSearchChars();
     $this->searchTerms = array();
     $booleanQuery = '';
     # FIXME: This doesn't handle parenthetical expressions.
     $matches = array();
     $found = preg_match_all('/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/', $filteredText, $matches, PREG_SET_ORDER);
     if (!$found) {
         wfDebug("Can't understand search query '{$filteredText}'\n");
     } else {
         foreach ($matches as $match) {
             if ($booleanQuery !== '') {
                 $booleanQuery .= ' ';
             }
             $modifier = $match[1];
             if ($this->strictMatching && $modifier == '') {
                 $modifier = '+';
             }
             $booleanQuery .= $modifier . $wgContLang->stripForSearch($match[2]);
             if (empty($match[3])) {
                 // Quoted phrase: highlight it literally, without the quotes.
                 $pattern = preg_quote(str_replace('"', '', $match[2]), '/');
             } else {
                 // Single word, optionally widened by a trailing wildcard.
                 $pattern = preg_quote($match[3], '/');
                 if ($match[4]) {
                     $pattern .= "[0-9A-Za-z_]+";
                 }
             }
             $this->searchTerms[] = "\\b{$pattern}\\b";
         }
         wfDebug("Would search with '{$booleanQuery}'\n");
         wfDebug('Match with /\\b' . implode('\\b|\\b', $this->searchTerms) . "\\b/\n");
     }
     $escaped = $this->db->strencode($booleanQuery);
     $column = $this->getIndexField($fulltext);
     return " MATCH({$column}) AGAINST('{$escaped}' IN BOOLEAN MODE) ";
 }
Beispiel #4
0
 /**
  * Parse the user's query and transform it into an SQL fragment which will
  * become part of a WHERE clause.
  *
  * Each term is expanded to its language variants, normalized, and added to
  * a boolean-mode expression; a highlighting regex for each term is stored
  * in $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Passed to getIndexField() to choose the column
  *
  * @return string
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgContLang;
     $lc = SearchEngine::legalSearchChars();
     // Minus format chars
     $searchon = '';
     $this->searchTerms = array();
     # @todo FIXME: This doesn't handle parenthetical expressions.
     $m = array();
     if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\\*?)|"[^"]*")/', $filteredText, $m, PREG_SET_ORDER)) {
         foreach ($m as $bits) {
             // A quoted phrase leaves the two trailing groups out of the match
             // array; pad them with '' instead of silencing the resulting
             // notices with the @ operator.
             list(, $modifier, $term, $nonQuoted, $wildcard) = array_pad($bits, 5, '');
             if ($nonQuoted != '') {
                 $term = $nonQuoted;
                 $quote = '';
             } else {
                 $term = str_replace('"', '', $term);
                 $quote = '"';
             }
             if ($searchon !== '') {
                 $searchon .= ' ';
             }
             if ($this->strictMatching && $modifier == '') {
                 // If we leave this out, boolean op defaults to OR which is rarely helpful.
                 $modifier = '+';
             }
             // Some languages such as Serbian store the input form in the search index,
             // so we may need to search for matches in multiple writing system variants.
             $convertedVariants = $wgContLang->autoConvertToAllVariants($term);
             if (is_array($convertedVariants)) {
                 $variants = array_unique(array_values($convertedVariants));
             } else {
                 $variants = array($term);
             }
             // The low-level search index does some processing on input to work
             // around problems with minimum lengths and encoding in MySQL's
             // fulltext engine.
             // For Chinese this also inserts spaces between adjacent Han characters.
             $strippedVariants = array_map(array($wgContLang, 'normalizeForSearch'), $variants);
             // Some languages such as Chinese force all variants to a canonical
             // form when stripping to the low-level search index, so to be sure
             // let's check our variants list for unique items after stripping.
             $strippedVariants = array_unique($strippedVariants);
             // Several distinct variants are grouped in parentheses behind the modifier.
             $hasAlternatives = count($strippedVariants) > 1;
             $searchon .= $modifier;
             if ($hasAlternatives) {
                 $searchon .= '(';
             }
             foreach ($strippedVariants as $stripped) {
                 $stripped = $this->normalizeText($stripped);
                 if ($nonQuoted && strpos($stripped, ' ') !== false) {
                     // Hack for Chinese: we need to toss in quotes for
                     // multiple-character phrases since normalizeForSearch()
                     // added spaces between them to make word breaks.
                     $stripped = '"' . trim($stripped) . '"';
                 }
                 $searchon .= "{$quote}{$stripped}{$quote}{$wildcard} ";
             }
             if ($hasAlternatives) {
                 $searchon .= ')';
             }
             // Match individual terms or quoted phrase in result highlighting...
             // Note that variants will be introduced in a later stage for highlighting!
             $regexp = $this->regexTerm($term, $wildcard);
             $this->searchTerms[] = $regexp;
         }
         wfDebug(__METHOD__ . ": Would search with '{$searchon}'\n");
         wfDebug(__METHOD__ . ': Match with /' . implode('|', $this->searchTerms) . "/\n");
     } else {
         wfDebug(__METHOD__ . ": Can't understand search query '{$filteredText}'\n");
     }
     $searchon = $this->db->strencode($searchon);
     $field = $this->getIndexField($fulltext);
     return " MATCH({$field}) AGAINST('{$searchon}' IN BOOLEAN MODE) ";
 }
Beispiel #5
0
 /**
  * Legal search characters for this engine: the parent's set with a
  * double quote prepended.
  *
  * @return string
  */
 public static function legalSearchChars()
 {
     $inherited = parent::legalSearchChars();
     return '"' . $inherited;
 }
 /**
  * Convert a user search string into a CONTAINS(...) condition for use in
  * a WHERE clause.
  *
  * Every variant form of each term is escaped with escapeTerm() and chained
  * with ' & ', or with ' ~' when the term carries a '-' modifier. A regex
  * fragment for each term is stored in $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Passed to getIndexField() to choose the column
  *
  * @return string
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgContLang;
     $legalChars = SearchEngine::legalSearchChars();
     $this->searchTerms = array();
     # @todo FIXME: This doesn't handle parenthetical expressions.
     $matches = array();
     $expression = '';
     if (preg_match_all('/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/', $filteredText, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             // Search terms in all variant forms, only
             // apply on wiki with LanguageConverter
             $converted = $wgContLang->autoConvertToAllVariants($match[2]);
             if (is_array($converted)) {
                 $forms = array_unique(array_values($converted));
             } else {
                 $forms = array($match[2]);
             }
             if ($match[1] == '-') {
                 $operator = ' ~';
             } else {
                 $operator = ' & ';
             }
             foreach ($forms as $form) {
                 $expression .= $operator . $this->escapeTerm($form);
             }
             if (empty($match[3])) {
                 // Quoted phrase: highlight it without the quotes.
                 $pattern = preg_quote(str_replace('"', '', $match[2]), '/');
             } else {
                 $pattern = preg_quote($match[3], '/');
                 if ($match[4]) {
                     $pattern .= "[0-9A-Za-z_]+";
                 }
             }
             $this->searchTerms[] = $pattern;
         }
     }
     // Drop the leading spaces/ampersands left by the first operator.
     $quoted = $this->db->addQuotes(ltrim($expression, ' &'));
     $column = $this->getIndexField($fulltext);
     return " CONTAINS({$column}, {$quoted}, 1) > 0 ";
 }
 /**
  * Produce the normalized form of $this->title's text for the search index.
  *
  * @param SearchEngine $search Engine supplying legalSearchChars() and normalizeText()
  * @return string A stripped-down title string ready for the search index
  */
 private function getNormalizedTitle(SearchEngine $search)
 {
     global $wgContLang;
     $namespace = $this->title->getNamespace();
     $normalized = $wgContLang->normalizeForSearch($this->title->getText());
     $chars = $search->legalSearchChars() . '&#;';
     // Replace every run of non-search characters with a space, then lower-case.
     $normalized = $wgContLang->lc(preg_replace("/[^{$chars}]+/", ' ', $normalized));
     # Handle 's, s'
     $normalized = preg_replace("/([{$chars}]+)'s( |\$)/", "\\1 \\1's ", $normalized);
     $normalized = preg_replace("/([{$chars}]+)s'( |\$)/", "\\1s ", $normalized);
     $normalized = preg_replace("/\\s+/", ' ', $normalized);
     if ($namespace == NS_FILE) {
         // Drop a trailing file-type word from file titles.
         $normalized = preg_replace("/ (png|gif|jpg|jpeg|ogg)\$/", "", $normalized);
     }
     return $search->normalizeText(trim($normalized));
 }
Beispiel #8
0
 /**
  * Produce the form of a title that goes into the search index.
  *
  * @param int $ns a namespace index
  * @param string $title text-form main part
  * @return string a stripped-down title string ready for the
  * 	search index
  */
 public static function indexTitle($ns, $title)
 {
     global $wgContLang;
     $chars = SearchEngine::legalSearchChars() . '&#;';
     $stripped = $wgContLang->stripForSearch($title);
     // Replace every run of non-search characters with a space, then lower-case.
     $stripped = $wgContLang->lc(preg_replace("/[^{$chars}]+/", ' ', $stripped));
     # Handle 's, s'
     $stripped = preg_replace("/([{$chars}]+)'s( |\$)/", "\\1 \\1's ", $stripped);
     $stripped = preg_replace("/([{$chars}]+)s'( |\$)/", "\\1s ", $stripped);
     $stripped = preg_replace("/\\s+/", ' ', $stripped);
     if ($ns == NS_IMAGE) {
         // Drop a trailing file-type word from image titles.
         $stripped = preg_replace("/ (png|gif|jpg|jpeg|ogg)\$/", "", $stripped);
     }
     return trim($stripped);
 }
 /**
  * Convert the user's query into a case-insensitive LIKE condition.
  *
  * Every variant form of each term is normalized, prefixed with the term's
  * modifier, and the results are joined with commas into the LIKE pattern.
  * A regex fragment for each term is stored in $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Passed to getIndexField() to choose the column
  *
  * @return string SQL fragment
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgContLang;
     $legalChars = SearchEngine::legalSearchChars();
     $this->searchTerms = array();
     # @todo FIXME: This doesn't handle parenthetical expressions.
     $matches = array();
     $parts = array();
     if (preg_match_all('/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/', $filteredText, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             // Search terms in all variant forms, only
             // apply on wiki with LanguageConverter
             $converted = $wgContLang->autoConvertToAllVariants($match[2]);
             if (is_array($converted)) {
                 $forms = array_unique(array_values($converted));
             } else {
                 $forms = array($match[2]);
             }
             foreach ($forms as $form) {
                 $parts[] = $match[1] . $wgContLang->normalizeForSearch($form);
             }
             if (empty($match[3])) {
                 // Quoted phrase: highlight it without the quotes.
                 $pattern = preg_quote(str_replace('"', '', $match[2]), '/');
             } else {
                 $pattern = preg_quote($match[3], '/');
                 if ($match[4]) {
                     $pattern .= "[0-9A-Za-z_]+";
                 }
             }
             $this->searchTerms[] = $pattern;
         }
     }
     $escaped = $this->db->strencode(implode(',', $parts));
     $column = $this->getIndexField($fulltext);
     // A CONTAINS($field, '$searchon') > 0 condition would require
     // Net Search Extender or equivalent, so LIKE is used instead.
     return " lcase({$column}) LIKE lcase('%{$escaped}%')";
 }
 /**
  * Convert the user's query into a "field, 'terms'" argument pair.
  *
  * Each term is normalized, prefixed with its modifier, and the results are
  * joined with commas. A regex fragment for each term is stored in
  * $this->searchTerms.
  *
  * @param $filteredText string Query text to parse
  * @param $fulltext string Passed to getIndexField() to choose the column
  *
  * @return string
  */
 function parseQuery($filteredText, $fulltext)
 {
     global $wgContLang;
     $legalChars = SearchEngine::legalSearchChars();
     $this->searchTerms = array();
     # @todo FIXME: This doesn't handle parenthetical expressions.
     $matches = array();
     $parts = array();
     if (preg_match_all('/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/', $filteredText, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             $parts[] = $match[1] . $wgContLang->normalizeForSearch($match[2]);
             if (empty($match[3])) {
                 // Quoted phrase: highlight it without the quotes.
                 $pattern = preg_quote(str_replace('"', '', $match[2]), '/');
             } else {
                 $pattern = preg_quote($match[3], '/');
                 if ($match[4]) {
                     $pattern .= "[0-9A-Za-z_]+";
                 }
             }
             $this->searchTerms[] = $pattern;
         }
     }
     $escaped = $this->db->strencode(implode(',', $parts));
     $column = $this->getIndexField($fulltext);
     return "{$column}, '{$escaped}'";
 }
Beispiel #11
0
 /**
  * Push this page's title and text into the search index.
  *
  * Does nothing when $wgDisableSearchUpdate is set or $this->mId is falsy.
  * When $this->mText is exactly false only the title is updated; otherwise
  * the text is put through a series of regex clean-ups before
  * $search->update() is called with the normalized title and text.
  *
  * @return bool|null false when the update is skipped; no value otherwise
  */
 function doUpdate()
 {
     global $wgContLang, $wgDisableSearchUpdate;
     if ($wgDisableSearchUpdate || !$this->mId) {
         return false;
     }
     wfProfileIn(__METHOD__);
     $search = SearchEngine::create();
     // Character-class body used by the regexes below: the engine's legal
     // search characters plus '&', '#' and ';'.
     $lc = SearchEngine::legalSearchChars() . '&#;';
     if ($this->mText === false) {
         // No text supplied: update the title only.
         $search->updateTitle($this->mId, $search->normalizeText(Title::indexTitle($this->mNamespace, $this->mTitle)));
         wfProfileOut(__METHOD__);
         return;
     }
     # Language-specific strip/conversion
     $text = $wgContLang->normalizeForSearch($this->mText);
     wfProfileIn(__METHOD__ . '-regexps');
     # Strip HTML markup (the text is lower-cased and padded with a space on
     # each side first)
     $text = preg_replace("/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $wgContLang->lc(" " . $text . " "));
     # Emphasize headings: the heading text is written out three times
     $text = preg_replace("/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD", "\\1\\2 \\2 \\2\\3", $text);
     # Strip external URLs
     $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
     $protos = "http|https|ftp|mailto|news|gopher";
     // Bare URLs that are not preceded by '['.
     $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|\$)/";
     $text = preg_replace($pat, "\\1 \\3", $text);
     // Bracketed links: $p1 has no label, $p2 keeps the label text.
     $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
     $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
     $text = preg_replace($p1, "\\1 ", $text);
     $text = preg_replace($p2, "\\1 \\3 ", $text);
     # Internal image links: keep the name, drop the file extension
     $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
     $text = preg_replace($pat2, " \\1 \\3", $text);
     # Handle [[game]]s: index both the linked word and the word with its suffix
     $text = preg_replace("/([^{$lc}])([{$lc}]+)]]([a-z]+)/", "\\1\\2 \\2\\3", $text);
     # Strip all remaining non-search characters
     $text = preg_replace("/[^{$lc}]+/", " ", $text);
     # Handle 's, s'
     #
     #   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
     #   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
     #
     # These tail-anchored regexps are insanely slow. The worst case comes
     # when Japanese or Chinese text (ie, no word spacing) is written on
     # a wiki configured for Western UTF-8 mode. The Unicode characters are
     # expanded to hex codes and the "words" are very long paragraph-length
     # monstrosities. On a large page the above regexps may take over 20
     # seconds *each* on a 1GHz-level processor.
     #
     # Following are reversed versions which are consistently fast
     # (about 3 milliseconds on 1GHz-level processor).
     #
     // Both calls reverse the text, apply the mirrored pattern, and reverse
     // the result back.
     $text = strrev(preg_replace("/ s'([{$lc}]+)/", " s'\\1 \\1", strrev($text)));
     $text = strrev(preg_replace("/ 's([{$lc}]+)/", " s\\1", strrev($text)));
     # Strip wiki '' and '''
     $text = preg_replace("/''[']*/", " ", $text);
     wfProfileOut(__METHOD__ . '-regexps');
     // $text is passed by reference, so hook handlers can modify it before
     // it is indexed.
     wfRunHooks('SearchUpdate', array($this->mId, $this->mNamespace, $this->mTitle, &$text));
     # Perform the actual update
     $search->update($this->mId, $search->normalizeText(Title::indexTitle($this->mNamespace, $this->mTitle)), $search->normalizeText($text));
     wfProfileOut(__METHOD__);
 }
 /**
  * Build the highlighting regex fragments for the terms in $this->mQuery.
  *
  * Bracketed "[...]:" prefixes and the generic "all:" prefix are blanked
  * out before the terms are extracted.
  *
  * @return array One regex fragment per term; empty when nothing matched
  */
 function termMatches()
 {
     // The patterns are applied in order: bracketed prefixes first, then
     // the generic "all:" prefix.
     $query = preg_replace(array("/\\[.*?\\]:/", "/all:/"), " ", $this->mQuery);
     // Fixme: this is ripped from SearchMySQL and probably kind of sucks,
     // but it handles quoted phrase searches more or less correctly.
     // Should encapsulate this stuff better.
     // FIXME: This doesn't handle parenthetical expressions.
     $patterns = array();
     $hits = array();
     $legalChars = SearchEngine::legalSearchChars();
     $found = preg_match_all('/([-+<>~]?)(([' . $legalChars . ']+)(\\*?)|"[^"]*")/', $query, $hits, PREG_SET_ORDER);
     if (!$found) {
         wfDebug("Can't understand search query '{$query}'\n");
         return $patterns;
     }
     foreach ($hits as $hit) {
         if (empty($hit[3])) {
             // Match the quoted term in result highlighting...
             $pattern = preg_quote(str_replace('"', '', $hit[2]), '/');
         } else {
             // Match individual terms in result highlighting...
             $pattern = preg_quote($hit[3], '/');
             if ($hit[4]) {
                 $pattern .= "[0-9A-Za-z_]+";
             }
         }
         $patterns[] = $pattern;
     }
     wfDebug(__METHOD__ . ': Match with /' . implode('|', $patterns) . "/\n");
     return $patterns;
 }