/**
 * Build the stripped-down form of this page's title that is stored in
 * the search index.
 *
 * The title text is passed through the content language's search
 * normalization, every run of characters outside the engine's legal
 * search set (plus '&', '#' and ';') is replaced by a single space, the
 * result is lower-cased, English possessives are expanded, whitespace is
 * collapsed, and finally the engine's own normalizeText() is applied.
 * Titles in the file namespace additionally lose a trailing
 * png/gif/jpg/jpeg/ogg word.
 *
 * @param SearchEngine $search Engine providing legalSearchChars() and
 *   normalizeText()
 * @return string A stripped-down title string ready for the search index
 */
private function getNormalizedTitle(SearchEngine $search)
{
    global $wgContLang;

    $isFileTitle = ($this->title->getNamespace() == NS_FILE);
    $rawTitle = $this->title->getText();

    // Character-class body of everything allowed to survive in the index.
    $lc = $search->legalSearchChars() . '&#;';

    $normalized = $wgContLang->normalizeForSearch($rawTitle);
    $legalOnly = preg_replace("/[^{$lc}]+/", ' ', $normalized);
    $lowered = $wgContLang->lc($legalOnly);

    // "foo's" is indexed as both "foo" and "foo's".
    $possessive = preg_replace("/([{$lc}]+)'s( |\$)/", "\\1 \\1's ", $lowered);
    // "foos'" is indexed as plain "foos".
    $plural = preg_replace("/([{$lc}]+)s'( |\$)/", "\\1s ", $possessive);

    // The substitutions above can leave runs of spaces behind.
    $collapsed = preg_replace("/\\s+/", ' ', $plural);

    if ($isFileTitle) {
        // By this point the extension is a separate trailing word
        // whenever '.' was not among the legal characters.
        $collapsed = preg_replace("/ (png|gif|jpg|jpeg|ogg)\$/", "", $collapsed);
    }

    return $search->normalizeText(trim($collapsed));
}
/**
 * Rewrite text into a form MySQL's fulltext indexing handles correctly.
 *
 * On top of the parent class's normalization this:
 *  - lower-cases the text and passes every non-ASCII byte sequence
 *    (a lead byte 0xC0-0xFF plus its continuation bytes) to
 *    stripForSearchCallback(), because the fulltext index does not
 *    understand UTF-8;
 *  - appends "u800" to words shorter than minSearchLength(), so the
 *    server's minimum-word-length setting does not drop them;
 *  - replaces a period sitting between word characters with "u82e", so
 *    hostnames and IP addresses are indexed as a single token.
 *
 * @param string $string Text to normalize
 * @return string Text prepared for the fulltext index
 */
function normalizeText($string)
{
    global $wgContLang;

    wfProfileIn(__METHOD__);

    $base = parent::normalizeText($string);

    // Fold case first, then let the callback re-encode each multi-byte
    // sequence into something the index can store.
    $encoded = preg_replace_callback(
        "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
        array($this, 'stripForSearchCallback'),
        $wgContLang->lc($base)
    );

    // The default indexing ignores short words; pad them so they get
    // through without reconfiguring the server.
    $minLength = $this->minSearchLength();
    if ($minLength > 1) {
        $n = $minLength - 1;
        $encoded = preg_replace("/\\b(\\w{1,{$n}})\\b/", "\$1u800", $encoded);
    }

    // MySQL's search seems to ignore periods, so a query for
    // "example.com" or "192.168.1.1" would otherwise also match
    // "example.wikipedia.com" and "192.168.83.1". Encode the dot so it
    // becomes part of the word. A '*' after the dot is accepted as well.
    $encoded = preg_replace("/(\\w)\\.(\\w|\\*)/u", "\$1u82e\$2", $encoded);

    wfProfileOut(__METHOD__);

    return $encoded;
}