/**
 * Reduce a string to plain 7-bit characters.
 *
 * The string is converted to UTF-8, special characters are replaced by '_',
 * the remaining text is romanized, and whatever is still non-ASCII is turned
 * into HTML numeric entities whose decimal code is then replaced by its hex
 * representation. Finally a few leftover markup characters are removed.
 *
 * If the converted string is not valid UTF-8 (according to utf8_check()) it
 * is returned as it came out of charset_to_utf8(), without further changes.
 *
 * @param string $str the text to convert
 * @return string the converted text
 */
function entities_to_7bit($str)
{
    require_once LEPTON_PATH . '/framework/summary.utf8.php';

    // convert to UTF-8
    $str = charset_to_utf8($str);
    if (!utf8_check($str)) {
        return $str;
    }

    // replace some specials
    $str = utf8_stripspecials($str, '_');

    // translate non-ASCII characters to ASCII
    $str = utf8_romanize($str);

    // missed some? - many UTF-8 chars can't be romanized:
    // convert to HTML entities, and replace the entities by hex numbers
    $str = utf8_fast_umlauts_to_entities($str, false);

    // Turn the numeric apostrophe entity into its named form first, so the
    // hex conversion below does not rewrite it; the named form is removed
    // further down.
    $str = str_replace('&#039;', '&apos;', $str);

    // A closure replaces both the /e modifier (removed in PHP 7.0) and
    // create_function() (removed in PHP 8.0).
    $str = preg_replace_callback(
        '/&#([0-9]+);/',
        function ($aMatches) {
            return dechex((int) $aMatches[1]);
        },
        $str
    );

    // maybe there are some > < ' " & left, remove them too
    // NOTE(review): the single-character needles below may originally have
    // been the entity forms (&gt; &lt; &quot; &amp;) - confirm against upstream.
    $str = str_replace(array('>', '<', '&apos;', '\'', '"', '&'), '', $str);
    $str = str_replace('&', '', $str);

    return $str;
}
/**
 * Runs utf8_stripspecials() against a table of cases.
 *
 * Each row holds: input string, replacement, additional characters to strip,
 * expected result.
 */
function test1()
{
    $cases = array(
        array('asciistring', '', '', 'asciistring'),
        array('asciistring', '', '\\._\\-:', 'asciistring'),
        array('ascii.string', '', '\\._\\-:', 'asciistring'),
        array('ascii.string', ' ', '\\._\\-:', 'ascii string'),
        array('2.1.14', ' ', '\\._\\-:', '2 1 14'),
        array('ascii.string', '', '\\._\\-:\\*', 'asciistring'),
        array('ascii.string', ' ', '\\._\\-:\\*', 'ascii string'),
        array('2.1.14', ' ', '\\._\\-:\\*', '2 1 14'),
    );

    foreach ($cases as $case) {
        $input = $case[0];
        $replacement = $case[1];
        $additional = $case[2];
        $expected = $case[3];

        $actual = utf8_stripspecials($input, $replacement, $additional);
        $this->assertEqual($actual, $expected);
    }
}
/**
 * Clean a raw ID.
 *
 * The ID is trimmed and lowercased, ';' becomes ':' and '/' becomes the
 * separator character '_'. It is then passed through utf8_romanize(),
 * utf8_deaccent(), utf8_stripspecials() and utf8_strip(), and finally runs of
 * separators and colons are collapsed.
 *
 * @param mixed $raw_id the ID to clean (cast to string)
 * @return string the cleaned ID
 */
function cleanID($raw_id)
{
    $separator = "_";

    $cleaned = utf8_strtolower(trim((string) $raw_id));

    // alternative namespace separator
    $cleaned = strtr($cleaned, ';', ':');
    $cleaned = strtr($cleaned, '/', $separator);

    $cleaned = utf8_deaccent(utf8_romanize($cleaned), -1);

    // remove specials
    $cleaned = utf8_strip(utf8_stripspecials($cleaned, $separator, '\\*'));

    // clean up: applied one after another, in this order
    $cleanups = array(
        '#\\' . $separator . '+#' => $separator,
        '#:+#' => ':',
        '#:[:\\._\\-]+#' => ':',
    );
    foreach ($cleanups as $pattern => $replacement) {
        $cleaned = preg_replace($pattern, $replacement, $cleaned);
    }

    return $cleaned;
}
/**
 * Send the wanted code block to the browser
 *
 * Every call counts one code block. When the running counter equals the
 * 'codeblock' request value, the text is sent as a download and the script
 * exits; otherwise only the counter is advanced.
 *
 * @param string      $text     content of the code block
 * @param string|null $language language name, used as extension of the default file name
 * @param string      $filename file name to offer for the download
 */
function code($text, $language = NULL, $filename = '')
{
    global $INPUT;

    $language = $language ? $language : 'txt';
    $filename = $filename ? $filename : 'snippet.' . $language;
    $filename = utf8_stripspecials(utf8_basename($filename), '_');

    if ($this->_codeblock != $INPUT->str('codeblock')) {
        // not the requested block: count it and move on
        $this->_codeblock++;
        return;
    }

    header("Content-Type: text/plain; charset=utf-8");
    header("Content-Disposition: attachment; filename={$filename}");
    header("X-Robots-Tag: noindex");
    echo trim($text, "\r\n");
    exit;
}
/**
 * Send the wanted code block to the browser
 *
 * Every call counts one code block. When the running counter equals the
 * 'codeblock' request value, the text is sent as a download and the script
 * exits; otherwise only the counter is advanced. Clients whose user agent
 * contains "Windows" get CRLF line endings.
 *
 * @param string      $text     content of the code block
 * @param string|null $language language name, used as extension of the default file name
 * @param string      $filename file name to offer for the download
 */
function code($text, $language = null, $filename = '')
{
    global $INPUT;

    $language = $language ? $language : 'txt';
    $filename = $filename ? $filename : 'snippet.' . $language;
    $filename = utf8_stripspecials(utf8_basename($filename), '_');

    // send CRLF to Windows clients
    $userAgent = $INPUT->server->str('HTTP_USER_AGENT');
    if (false !== strpos($userAgent, 'Windows')) {
        $text = str_replace("\n", "\r\n", $text);
    }

    if ($this->_codeblock != $INPUT->str('codeblock')) {
        // not the requested block: count it and move on
        $this->_codeblock++;
        return;
    }

    header("Content-Type: text/plain; charset=utf-8");
    header("Content-Disposition: attachment; filename={$filename}");
    header("X-Robots-Tag: noindex");
    echo trim($text, "\r\n");
    exit;
}
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * Words shorter than IDX_MINWORDLENGTH are dropped unless they are numeric.
 * Words are lowercased and then looked up in the stopword list with a
 * trailing newline appended; matches are dropped. An empty or unset stopword
 * list disables the stopword check in both code paths.
 *
 * @param string  $string    the query as given by the user
 * @param array   $stopwords array of stopwords (passed by reference)
 * @param boolean $wc        are wildcards allowed?
 * @return array list of lowercased words
 */
function idx_tokenizer($string, &$stopwords, $wc = false)
{
    $words = array();

    // when wildcards are not allowed, '*' is added to the characters to strip
    $wc = $wc ? '' : '\\*';

    if (preg_match('/[^0-9A-Za-z]/u', $string)) {
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/(' . IDX_ASIAN . ')/u', ' \\1 ', $string);
        if (!is_null($asia)) {
            // recover from regexp failure: only use the result when it worked
            $string = $asia;
        }

        $arr = explode(' ', utf8_stripspecials($string, ' ', '\\._\\-:' . $wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) {
                continue;
            }
            $w = utf8_strtolower($w);
            if ($stopwords && is_int(array_search("{$w}\n", $stopwords))) {
                continue;
            }
            $words[] = $w;
        }

        return $words;
    }

    // plain alphanumeric input: the whole string is a single word
    $w = $string;
    if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) {
        return $words;
    }
    $w = strtolower($w);
    // same guard as in the branch above, so an empty/unset list is not
    // handed to array_search()
    if ($stopwords && is_int(array_search("{$w}\n", $stopwords))) {
        return $words;
    }
    $words[] = $w;

    return $words;
}
/**
 * Split the text into words for fulltext search
 *
 * TODO: does this also need &$stopwords ?
 *
 * Line breaks and tabs are turned into spaces and soft hyphens are removed
 * before the text is split on spaces. Words are lowercased; words shorter
 * than IDX_MINWORDLENGTH (unless numeric) and stopwords are dropped.
 *
 * @triggers INDEXER_TEXT_PREPARE
 * This event allows plugins to modify the text before it gets tokenized.
 * Plugins intercepting this event should also intercept INDEX_VERSION_GET
 *
 * @param string  $text plain text
 * @param boolean $wc   are wildcards allowed?
 * @return array list of words in the text
 * @author Tom N Harris <*****@*****.**>
 * @author Andreas Gohr <*****@*****.**>
 */
public function tokenizer($text, $wc = false)
{
    // when wildcards are not allowed, '*' is added to the characters to strip
    $wc = $wc ? '' : '\\*';
    $stopwords =& idx_get_stopwords();

    // prepare the text to be tokenized
    $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
    if ($evt->advise_before(true)) {
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            // handle asian chars as single words (may fail on older PHP version)
            $asia = @preg_replace('/(' . IDX_ASIAN . ')/u', ' \\1 ', $text);
            if (!is_null($asia)) {
                // recover from regexp failure: only use the result when it worked
                $text = $asia;
            }
        }
    }
    $evt->advise_after();
    unset($evt);

    $text = strtr($text, array(
        "\r" => ' ',
        "\n" => ' ',
        "\t" => ' ',
        // soft hyphen (U+00AD), spelled as a byte escape: an empty-string key
        // makes strtr() return false before PHP 8.0 and is ignored afterwards
        "\xC2\xAD" => '',
    ));
    if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
        $text = utf8_stripspecials($text, ' ', '\\._\\-:' . $wc);
    }

    $wordlist = explode(' ', $text);
    foreach ($wordlist as $i => $word) {
        // the multibyte-aware lowercase is only needed for non-ASCII words
        $wordlist[$i] = preg_match('/[^0-9A-Za-z]/u', $word)
            ? utf8_strtolower($word)
            : strtolower($word);
    }

    foreach ($wordlist as $i => $word) {
        $tooShort = !is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH;
        if ($tooShort || array_search($word, $stopwords, true) !== false) {
            unset($wordlist[$i]);
        }
    }

    return array_values($wordlist);
}
/**
 * Remove unwanted chars from ID
 *
 * Cleans a given ID to only use allowed characters. Accented characters are
 * converted to unaccented ones
 *
 * Results are kept in the global $cache_cleanid, keyed by the raw ID. Only
 * results of calls without $ascii are cached, because the forced-ASCII result
 * for the same raw ID can differ.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @param string  $raw_id The pageid to clean
 * @param boolean $ascii  Force ASCII
 * @return string cleaned id
 */
function cleanID($raw_id, $ascii = false)
{
    global $conf;
    global $cache_cleanid;
    static $sepcharpat = null;

    $cache =& $cache_cleanid;
    $cacheKey = (string) $raw_id;

    if ($conf['syslog']) {
        syslog(LOG_WARNING, '[pageutils.php] cleanID: raw_id: ' . $raw_id);
    }

    // check if it's already in the memory cache
    if (!$ascii && isset($cache[$cacheKey])) {
        return $cache[$cacheKey];
    }

    $sepchar = $conf['sepchar'];
    if ($sepcharpat === null) {
        // build string only once to save clock cycles
        $sepcharpat = '#\\' . $sepchar . '+#';
    }

    $id = trim((string) $raw_id);
    if ($conf['mixedcase'] == 0) {
        $id = utf8_strtolower($id);
    }

    // alternative namespace separator
    if ($conf['useslash']) {
        $id = strtr($id, ';/', '::');
    } else {
        $id = strtr($id, ';/', ':' . $sepchar);
    }

    if ($conf['deaccent'] == 2 || $ascii) {
        $id = utf8_romanize($id);
    }
    if ($conf['deaccent'] || $ascii) {
        $id = utf8_deaccent($id, -1);
    }

    // remove specials if specialcharacters is set to 0
    if ($conf['specialcharacters'] == 0) {
        $id = utf8_stripspecials($id, $sepchar, '\\*');
    }

    if ($ascii) {
        $id = utf8_strip($id);
    }

    // clean up
    $id = preg_replace($sepcharpat, $sepchar, $id);
    $id = preg_replace('#:+#', ':', $id);
    $id = trim($id, ':._-');
    $id = preg_replace('#:[:\\._\\-]+#', ':', $id);
    $id = preg_replace('#[:\\._\\-]+:#', ':', $id);

    // do not let a forced-ASCII result shadow the regular one
    if (!$ascii) {
        $cache[$cacheKey] = $id;
    }

    if ($conf['syslog']) {
        syslog(LOG_WARNING, '[pageutils.php] cleanID: id to be returned: ' . $id);
    }

    return $id;
}
/**
 * Remove unwanted chars from ID
 *
 * Cleans a given ID to only use allowed characters. Accented characters are
 * converted to unaccented ones
 *
 * Results are kept in the global $cache_cleanid, keyed by the raw ID. Only
 * results of calls with both flags off are cached, because $ascii and $media
 * can change the result for the same raw ID.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @param string  $raw_id The pageid to clean
 * @param boolean $ascii  Force ASCII
 * @param boolean $media  Allow leading or trailing _ for media files
 * @return string cleaned id
 */
function cleanID($raw_id, $ascii = false, $media = false)
{
    global $conf;
    global $cache_cleanid;
    static $sepcharpat = null;

    $cache =& $cache_cleanid;
    $cacheKey = (string) $raw_id;
    // the cache key does not encode the flags, so only the default variant
    // may be read from or written to the cache
    $cacheable = !$ascii && !$media;

    // check if it's already in the memory cache
    if ($cacheable && isset($cache[$cacheKey])) {
        return $cache[$cacheKey];
    }

    $sepchar = $conf['sepchar'];
    if ($sepcharpat === null) {
        // build string only once to save clock cycles
        $sepcharpat = '#\\' . $sepchar . '+#';
    }

    $id = trim((string) $raw_id);
    $id = utf8_strtolower($id);

    // alternative namespace separator
    $id = strtr($id, ';', ':');
    if ($conf['useslash']) {
        $id = strtr($id, '/', ':');
    } else {
        $id = strtr($id, '/', $sepchar);
    }

    if ($conf['deaccent'] == 2 || $ascii) {
        $id = utf8_romanize($id);
    }
    if ($conf['deaccent'] || $ascii) {
        $id = utf8_deaccent($id, -1);
    }

    // remove specials
    $id = utf8_stripspecials($id, $sepchar, '\\*');

    if ($ascii) {
        $id = utf8_strip($id);
    }

    // clean up
    $id = preg_replace($sepcharpat, $sepchar, $id);
    $id = preg_replace('#:+#', ':', $id);
    // media IDs keep a leading or trailing underscore
    $id = $media ? trim($id, ':.-') : trim($id, ':._-');
    $id = preg_replace('#:[:\\._\\-]+#', ':', $id);

    if ($cacheable) {
        $cache[$cacheKey] = $id;
    }

    return $id;
}
/**
 * Log external search queries
 *
 * Will not write anything if the referer isn't a search engine
 *
 * The referer's host is matched against the patterns in $SEARCHENGINES
 * (expected to be defined by the included searchengines.php). Each entry
 * maps a host regex to a list whose first element is the engine name and
 * whose remaining elements are query parameter names to look at. If no
 * engine matches, a few generic parameter names are tried and the name is
 * derived from the host.
 *
 * @param string $referer the referer URL
 * @param string $type    passed by reference; set to 'search' when a search
 *                        referer was recognised
 * @return void
 */
public function log_externalsearch($referer, &$type)
{
    $referer = utf8_strtolower($referer);
    include dirname(__FILE__) . '/searchengines.php';
    /** @var array $SEARCHENGINES */
    $query = '';
    $name = '';

    // parse the referer; parse_url() returns false for seriously malformed
    // URLs and omits components that are not present
    $urlparts = parse_url($referer);
    if (!is_array($urlparts)) {
        $urlparts = array();
    }
    $domain = isset($urlparts['host']) ? $urlparts['host'] : '';
    $qpart = isset($urlparts['query']) ? $urlparts['query'] : '';
    if (!$qpart) {
        // google does this
        $qpart = isset($urlparts['fragment']) ? $urlparts['fragment'] : '';
    }

    $params = array();
    parse_str($qpart, $params);

    // check domain against common search engines
    foreach ($SEARCHENGINES as $regex => $info) {
        if (preg_match('/' . $regex . '/', $domain)) {
            $type = 'search';
            $name = array_shift($info);
            // check the known parameters for content
            foreach ($info as $k) {
                if (empty($params[$k])) {
                    continue;
                }
                $query = $params[$k];
                break;
            }
            break;
        }
    }

    // try some generic search engine parameters
    if ($type != 'search') {
        foreach (array('search', 'query', 'q', 'keywords', 'keyword') as $k) {
            if (empty($params[$k])) {
                continue;
            }
            $query = $params[$k];
            // we seem to have found some generic search, generate name from domain
            $name = preg_replace('/(\\.co)?\\.([a-z]{2,5})$/', '', $domain); //strip tld
            $name = explode('.', $name);
            $name = array_pop($name);
            $type = 'search';
            break;
        }
    }

    // still no hit? return
    if ($type != 'search') {
        return;
    }

    // clean the query
    $query = preg_replace('/^(cache|related):[^\\+]+/', '', $query); // non-search queries
    $query = preg_replace('/ +/', ' ', $query); // ws compact
    $query = trim($query);
    if (!utf8_check($query)) {
        // assume latin1 if not utf8
        $query = utf8_encode($query);
    }

    // no query? no log
    if (!$query) {
        return;
    }

    // log it!
    $words = explode(' ', utf8_stripspecials($query, ' ', '\\._\\-:\\*'));
    $this->log_search($_REQUEST['p'], $query, $words, $name);
}
/**
 * Thin wrapper: forwards its three arguments unchanged to a function called
 * utf8_stripspecials() and returns whatever that call returns.
 *
 * NOTE(review): if this definition lives at file or namespace scope, the call
 * below resolves to this very function and recurses without end. It only
 * delegates to a different implementation when it is declared as a class
 * method - confirm the enclosing scope.
 *
 * @param string $string passed through as first argument
 * @param string $repl   passed through as second argument
 * @param string $keep   passed through as third argument
 * @return mixed result of the delegated call
 */
function utf8_stripspecials($string, $repl = '', $keep = '') { return utf8_stripspecials($string, $repl, $keep); }
/**
 * Reduce a string to plain 7-bit characters.
 *
 * The string is converted to UTF-8, special characters are replaced by '_',
 * the remaining text is romanized, and whatever is still non-ASCII is turned
 * into HTML numeric entities whose decimal code is then replaced by its hex
 * representation. Finally a few leftover markup characters are removed.
 *
 * If the converted string is not valid UTF-8 (according to utf8_check()) it
 * is returned as it came out of charset_to_utf8(), without further changes.
 *
 * @param string $str the text to convert
 * @return string the converted text
 */
function entities_to_7bit($str)
{
    // convert to UTF-8
    $str = charset_to_utf8($str);
    if (!utf8_check($str)) {
        return $str;
    }

    // replace some specials
    $str = utf8_stripspecials($str, '_');

    // translate non-ASCII characters to ASCII
    $str = utf8_romanize($str);

    // missed some? - many UTF-8 chars can't be romanized:
    // convert to HTML entities, and replace the entities by hex numbers
    $str = utf8_fast_umlauts_to_entities($str, false);

    // Turn the numeric apostrophe entity into its named form first, so the
    // hex conversion below does not rewrite it; the named form is removed
    // further down.
    $str = str_replace('&#039;', '&apos;', $str);

    // preg_replace_callback() replaces the /e modifier, which was removed in
    // PHP 7.0 (preg_replace() then fails and returns null).
    $str = preg_replace_callback(
        '/&#([0-9]+);/',
        function ($aMatches) {
            return dechex((int) $aMatches[1]);
        },
        $str
    );

    // maybe there are some > < ' " & left, remove them too
    // NOTE(review): the single-character needles below may originally have
    // been the entity forms (&gt; &lt; &quot; &amp;) - confirm against upstream.
    $str = str_replace(array('>', '<', '&apos;', '\'', '"', '&'), '', $str);
    $str = str_replace('&', '', $str);

    return $str;
}