/**
 * Strip wiki markup from a text fragment so it can be shown as plaintext.
 *
 * Only the first 4096 bytes of the input are examined. Quote markup,
 * magic words, internal links, bracketed external links, HTML-style tags,
 * tables and lines starting with ':' are removed or reduced to their
 * visible text, and character references are decoded at the end.
 *
 * @param string $text
 * @return string
 * @access private
 */
function _stripMarkup($text) {
    // don't bother with long text...
    // NOTE(review): this is a byte-wise cut and may split a multi-byte character.
    $text = substr($text, 0, 4096);

    // Bold / italic quote markup
    $text = str_replace("'''", "", $text);
    $text = str_replace("''", "", $text);

    $text = preg_replace('#__[a-z0-9_]+__#i', '', $text); // magic words

    // Internal links; the replacement text is chosen by $this->_stripLink().
    $cleanChar = "[^|\\[\\]]";
    $subLink = "\\[\\[{$cleanChar}*(?:\\|{$cleanChar}*)*\\]\\]";
    $pipeContents = "(?:{$cleanChar}|{$subLink})*";
    $text = preg_replace_callback("#\n\t\t\t\\[\\[\n\t\t\t\t({$cleanChar}*)\n\t\t\t\t(?:\\|({$pipeContents}))?\n\t\t\t\t(?:\\|{$pipeContents})*\n\t\t\t\\]\\]\n\t\t\t#six", array($this, '_stripLink'), $text);

    // URL links: keep only the label. The pattern must be double-quoted so
    // that the protocol alternation is actually interpolated; in a
    // single-quoted string "$protocols" is literal text and never matches.
    // NOTE(review): assumes wfUrlProtocols() returns a regex fragment that
    // does not contain the '#' delimiter -- confirm.
    $protocols = wfUrlProtocols();
    $text = preg_replace("#\\[(?:{$protocols}).*? (.*?)\\]#s", '$1', $text);

    $text = preg_replace('#</?[a-z0-9]+.*?>#s', '', $text); // HTML-style tags
    $text = preg_replace('#\\{\\|.*?\\|\\}#s', '', $text); // tables
    $text = preg_replace('#^:.*$#m', '', $text); // indented lines near start are usually disambigs or notices

    $text = Sanitizer::decodeCharReferences($text);
    return trim($text);
}
/**
 * Turn a page title into an SQL-escaped database key.
 *
 * Slashes added by magic quotes are removed, character references are
 * decoded, the first character is upper-cased, spaces become underscores
 * and the result is escaped with mysql_real_escape_string().
 *
 * @param string $title
 * @param resource $conn link passed through to mysql_real_escape_string()
 * @return string
 */
private static function generateDBKey($title, $conn) {
    $raw = get_magic_quotes_gpc() ? stripslashes($title) : $title;
    $decoded = Sanitizer::decodeCharReferences($raw);

    // Upper-case only the first character, multi-byte aware.
    $initial = mb_strtoupper(mb_substr($decoded, 0, 1));
    $remainder = mb_substr($decoded, 1);

    $dbKey = str_replace(' ', '_', $initial . $remainder);
    return mysql_real_escape_string($dbKey, $conn);
}
/**
 * Static constructor: build a GlobalTitle from a page name.
 *
 * Character references in $text are decoded first; the db-key, URL and
 * text forms are all derived from that decoded text.
 *
 * @param string $text page name
 * @param int $namespace namespace stored on the title
 * @param int $city_id wiki id stored on the title
 * @return GlobalTitle
 */
public static function newFromText($text, $namespace, $city_id) {
    $decoded = Sanitizer::decodeCharReferences($text);
    $dbKey = str_replace(' ', '_', $decoded);

    $globalTitle = new GlobalTitle();
    $globalTitle->mText = $decoded;
    $globalTitle->mDbkeyform = $dbKey;
    $globalTitle->mUrlform = wfUrlencode($globalTitle->mDbkeyform);
    $globalTitle->mTextform = str_replace('_', ' ', $globalTitle->mText);
    $globalTitle->mNamespace = $namespace;
    $globalTitle->mCityId = $city_id;

    return $globalTitle;
}
/**
 * Escape a fragment for use in a URL; compatibility shim for versions
 * older than 1.9.0.
 *
 * When $mgVersion is not below 10900 the call is delegated to
 * Title::escapeFragmentForURL(); otherwise the escaping is done locally.
 *
 * @param string $fragment
 * @return string
 */
static function escapeFragmentForURL($fragment) {
    global $mgVersion;

    if (!($mgVersion < 10900)) {
        return Title::escapeFragmentForURL($fragment);
    }

    // Local fallback: underscores for spaces, then URL-encode the decoded text.
    $underscored = str_replace(' ', '_', $fragment);
    $encoded = urlencode(Sanitizer::decodeCharReferences($underscored));

    // Keep ':' and '/' readable; every remaining '%' becomes '.'.
    return strtr($encoded, array('%3A' => ':', '%2F' => '/', '%' => '.'));
}
/**
 * Produce the output of the CSV result printer.
 *
 * For SMW_OUTPUT_FILE the query result is serialised as CSV text, using
 * $this->m_sep as the field separator; multiple values of one field are
 * joined with ','. For any other output mode a link to the CSV feed is
 * returned instead.
 *
 * @param SMWQueryResult $res
 * @param mixed $outputmode one of the SMW_OUTPUT_* constants
 * @return string
 */
protected function getResultText(SMWQueryResult $res, $outputmode) {
    $result = '';
    if ($outputmode == SMW_OUTPUT_FILE) { // make CSV file
        // fputcsv() needs a stream, so write into a temporary one and read it back.
        $csv = fopen('php://temp', 'r+');
        if ($this->mShowHeaders) {
            $header_items = array();
            foreach ($res->getPrintRequests() as $pr) {
                $header_items[] = $pr->getLabel();
            }
            fputcsv($csv, $header_items, $this->m_sep);
        }
        while ($row = $res->getNext()) {
            $row_items = array();
            foreach ($row as $field) {
                $growing = array();
                while (($object = $field->getNextDataValue()) !== false) {
                    $growing[] = Sanitizer::decodeCharReferences($object->getWikiValue());
                }
                $row_items[] = implode(',', $growing);
            }
            fputcsv($csv, $row_items, $this->m_sep);
        }
        rewind($csv);
        $result .= stream_get_contents($csv);
        // Release the temporary stream; it was previously left open.
        fclose($csv);
    } else { // just make link to feed
        if ($this->getSearchLabel($outputmode)) {
            $label = $this->getSearchLabel($outputmode);
        } else {
            $label = wfMsgForContent('smw_csv_link');
        }
        $link = $res->getQueryLink($label);
        $link->setParameter('csv', 'format');
        $link->setParameter($this->m_sep, 'sep');
        if (array_key_exists('mainlabel', $this->params) && $this->params['mainlabel'] !== false) {
            $link->setParameter($this->params['mainlabel'], 'mainlabel');
        }
        $link->setParameter($this->mShowHeaders ? 'show' : 'hide', 'headers');
        if (array_key_exists('limit', $this->params)) {
            $link->setParameter($this->params['limit'], 'limit');
        } else { // use a reasonable default limit
            $link->setParameter(100, 'limit');
        }
        $result .= $link->getText($outputmode, $this->mLinker);
        // yes, our code can be viewed as HTML if requested, no more parsing needed
        $this->isHTML = $outputmode == SMW_OUTPUT_HTML;
    }
    return $result;
}
/**
 * Extract the translated text from a web service reply.
 *
 * The reply body is decoded as JSON. The first entry of the response's
 * "text" list has its character references decoded, is passed through
 * unwrapUntranslatable() and is returned trimmed.
 *
 * @param TranslationQueryResponse $reply
 * @return string
 * @throws TranslationWebServiceException if the body does not decode to an
 *   object, or the response code is not 200
 */
protected function parseResponse(TranslationQueryResponse $reply) {
    $rawBody = $reply->getBody();
    $decodedReply = FormatJson::decode($rawBody);

    if (!is_object($decodedReply)) {
        throw new TranslationWebServiceException('Invalid json: ' . serialize($rawBody));
    }
    if ($decodedReply->code !== 200) {
        throw new TranslationWebServiceException($decodedReply->message);
    }

    $translated = Sanitizer::decodeCharReferences($decodedReply->text[0]);
    return trim($this->unwrapUntranslatable($translated));
}
/**
 * Return the original filename of the uploaded file.
 *
 * Character references in the reported name are decoded and the result is
 * normalized by the content language before being returned.
 *
 * @return string|null Filename or null if non-existent
 */
public function getName() {
    global $wgContLang;

    if (!$this->exists()) {
        return null;
    }

    $reported = $this->fileInfo['name'];

    # Safari sends filenames in HTML-encoded Unicode form D...
    # Horrid and evil! Let's try to make some kind of sense of it.
    $name = $wgContLang->normalize(Sanitizer::decodeCharReferences($reported));

    wfDebug(__METHOD__ . ": {$reported} normalized to '{$name}'\n");
    return $name;
}
/**
 * @desc Static constructor: build a GlobalTitle from a page name
 *
 * @param String $text page name; character references are decoded
 * @param Integer $namespace (default NS_MAIN)
 * @param Integer|null $city_id a wiki id; null is accepted in the signature
 *   only for compatibility with Title::newFromText() and is rejected at runtime
 *
 * @throws Exception when $city_id parameter is null
 *
 * @return GlobalTitle
 */
public static function newFromText($text, $namespace = NS_MAIN, $city_id = null) {
    // The null default exists only so the signature stays compatible with
    // Title::newFromText(); a wiki id is in fact mandatory.
    if ($city_id === null) {
        throw new \Exception('Invalid $city_id.');
    }

    $decoded = Sanitizer::decodeCharReferences($text);
    $dbKey = str_replace(' ', '_', $decoded);

    $globalTitle = new GlobalTitle();
    $globalTitle->mText = $decoded;
    $globalTitle->mDbkeyform = $dbKey;
    $globalTitle->mUrlform = wfUrlencode($globalTitle->mDbkeyform);
    $globalTitle->mTextform = str_replace('_', ' ', $globalTitle->mText);
    $globalTitle->mNamespace = $namespace;
    $globalTitle->mCityId = $city_id;

    return $globalTitle;
}
/**
 * Check one image row and repair its name if it is not in canonical form.
 *
 * Video rows are skipped and rows with an empty name are killed. Otherwise
 * the name is cleaned up and compared with the db key of the resulting
 * title; on a mismatch (or an illegal title) the file is handed to
 * pokeFile() together with the replacement name.
 *
 * @param object $row row providing img_name, img_media_type,
 *   img_major_mime and img_minor_mime
 * @return mixed whatever progress() returns
 */
protected function processRow($row) {
    global $wgContLang;

    $source = $row->img_name;

    // do nothing for videos, regardless of anything else
    $isVideo = $row->img_media_type == 'VIDEO'
        && $row->img_major_mime == 'video'
        && $row->img_minor_mime == 'swf';
    if ($isVideo) {
        $this->log("omitting video row '{$source}'");
        return $this->progress(0);
    }

    // Ye olde empty rows. Just kill them.
    if ($source == '') {
        $this->killRow($source);
        return $this->progress(1);
    }

    // About half of old bad image names have percent-codes
    $candidate = rawurldecode($source);
    // We also have some HTML entities there
    $candidate = Sanitizer::decodeCharReferences($candidate);
    // Some are old latin-1
    $candidate = $wgContLang->checkTitleEncoding($candidate);
    // Many of remainder look like non-normalized unicode
    $candidate = $wgContLang->normalize($candidate);

    $title = Title::makeTitleSafe(NS_FILE, $candidate);

    if (is_null($title)) {
        $this->output("page {$source} ({$candidate}) is illegal.\n");
        $safe = $this->buildSafeTitle($candidate);
        if ($safe === false) {
            return $this->progress(0);
        }
        $this->pokeFile($source, $safe);
        return $this->progress(1);
    }

    $munged = $title->getDBkey();
    if ($munged === $source) {
        // Already canonical; nothing to fix.
        return $this->progress(0);
    }

    $this->output("page {$source} ({$munged}) doesn't match self.\n");
    $this->pokeFile($source, $munged);
    return $this->progress(1);
}
/**
 * Convert one query result row into template parameters and, if it has at
 * least one "To" value, store it as a new row in the e-mails table.
 *
 * Each field is keyed by its print request label, or by its 1-based
 * position when the label is empty. Values are the decoded wiki values.
 *
 * @param array $row iterable of result fields
 */
protected function handleRow($row) {
    $templateParams = array();
    $position = 0;

    foreach ($row as $field) {
        $position++;

        $label = $field->getPrintRequest()->getLabel();
        $key = empty($label) ? $position : $label;

        $values = array();
        while (($dataValue = $field->getNextDataValue()) !== false) {
            $values[] = Sanitizer::decodeCharReferences($dataValue->getWikiValue());
        }
        $templateParams[$key] = $values;
    }

    // Nothing is stored unless there is at least one recipient.
    if (!isset($templateParams['To']) || count($templateParams['To']) <= 0) {
        return;
    }

    $emailRow = $this->emailsTable->newRow(array(
        'title' => $this->pageTitle,
        'params' => $templateParams,
        'template' => $this->params['template'],
    ));
    $emailRow->save();
}
/**
 * Link hook that registers a category on the parser output.
 *
 * @param object $parser provides getDefaultSort() and mOutput
 * @param mixed $holders unused here
 * @param object $markers provides findMarker() and expand()
 * @param Title $title category title
 * @param mixed $titleText unused here
 * @param string|null &$sortText sort key; rewritten in place
 * @param bool &$leadingColon true when the link started with ':'
 * @return bool|string true to treat the link as a normal link, false to
 *   revert it to wikitext, '' once the category has been added
 */
static function categoryLinkHook($parser, $holders, $markers, Title $title, $titleText, &$sortText = null, &$leadingColon = false) {
    global $wgContLang;

    # When a category link starts with a : treat it as a normal link
    if ($leadingColon) {
        return true;
    }

    $hasSortText = isset($sortText);
    if ($hasSortText && $markers->findMarker($sortText)) {
        # There are links inside of the sortText
        # For backwards compatibility the deepest links are dominant so this
        # link should not be handled
        $sortText = $markers->expand($sortText);
        # Return false so that this link is reverted back to WikiText
        return false;
    }

    if (!$hasSortText) {
        $sortText = $parser->getDefaultSort();
    }

    // Decode entities and drop newlines before the language converts the key.
    $sortText = str_replace("\n", '', Sanitizer::decodeCharReferences($sortText));
    $sortText = $wgContLang->convertCategoryKey($sortText);

    $parser->mOutput->addCategory($title->getDBkey(), $sortText);
    return '';
}
/**
 * Check one image row and repair its name if it is not in canonical form.
 *
 * Rows with an empty name are killed. Otherwise the name is cleaned up
 * and compared with the db key of the resulting title; on a mismatch (or
 * an illegal title) the file is handed to pokeFile() together with the
 * replacement name.
 *
 * @param object $row row providing img_name
 * @return mixed whatever progress() returns
 */
function processPage($row) {
    global $wgContLang;

    $source = $row->img_name;
    if ($source == '') {
        // Ye olde empty rows. Just kill them.
        $this->killRow($source);
        return $this->progress(1);
    }

    $cleaned = $source;

    // About half of old bad image names have percent-codes
    $cleaned = rawurldecode($cleaned);

    // We also have some HTML entities there
    $cleaned = Sanitizer::decodeCharReferences($cleaned);

    // Some are old latin-1
    $cleaned = $wgContLang->checkTitleEncoding($cleaned);

    // Many of remainder look like non-normalized unicode
    $cleaned = UtfNormal::cleanUp($cleaned);

    $title = Title::makeTitleSafe(NS_FILE, $cleaned);

    if (is_null($title)) {
        $this->log("page {$source} ({$cleaned}) is illegal.");
        $safe = $this->buildSafeTitle($cleaned);
        if ($safe === false) {
            return $this->progress(0);
        }
        $this->pokeFile($source, $safe);
        return $this->progress(1);
    }

    if ($title->getDBkey() !== $source) {
        $munged = $title->getDBkey();
        $this->log("page {$source} ({$munged}) doesn't match self.");
        $this->pokeFile($source, $munged);
        return $this->progress(1);
    }

    // Return the progress() result here too, as every other exit path does.
    return $this->progress(0);
}
/**
 * Build the HTML for a link or button pointing at a form special page.
 *
 * Parameters may be given as "name=value" pairs (form, link text,
 * link type, query string, tooltip, target, or any other name, which is
 * added to the query) or, for backwards compatibility, positionally
 * (form, link text, link type, query string). The bare value "popup"
 * turns the link into a popup form link.
 *
 * @param Parser &$parser used to parse each parameter value
 * @param string $specialPageName name of the special page to link to
 * @param array $params raw parameter strings
 * @return string HTML of an <a> element, or of a <form> with a button
 *   when the link type is 'button' or 'post button'
 */
static function createFormLink(&$parser, $specialPageName, $params) {
    // Set defaults.
    $inFormName = $inLinkStr = $inLinkType = $inTooltip = $inQueryStr = $inTargetName = '';
    if ($specialPageName == 'RunQuery') {
        $inLinkStr = wfMessage('runquery')->text();
    }
    $classStr = "";
    $inQueryArr = array();
    $positionalParameters = false;

    // assign params
    // - support unlabelled params, for backwards compatibility
    // - parse and sanitize all parameter values
    foreach ($params as $i => $param) {
        $elements = explode('=', $param, 2);

        // set param_name and value
        if (count($elements) > 1 && !$positionalParameters) {
            $param_name = trim($elements[0]);
            // parse (and sanitize) parameter values
            $value = trim($parser->recursiveTagParse($elements[1]));
        } else {
            $param_name = null;
            // parse (and sanitize) parameter values
            $value = trim($parser->recursiveTagParse($param));
        }

        if ($param_name == 'form') {
            $inFormName = $value;
        } elseif ($param_name == 'link text') {
            $inLinkStr = $value;
        } elseif ($param_name == 'link type') {
            $inLinkType = $value;
        } elseif ($param_name == 'query string') {
            // Change HTML-encoded ampersands directly to
            // URL-encoded ampersands, so that the string
            // doesn't get split up on the '&'.
            // Only the entity is rewritten: replacing every bare '&'
            // would prevent parse_str() from separating parameters.
            $inQueryStr = str_replace('&amp;', '%26', $value);
            parse_str($inQueryStr, $arr);
            $inQueryArr = self::array_merge_recursive_distinct($inQueryArr, $arr);
        } elseif ($param_name == 'tooltip') {
            $inTooltip = Sanitizer::decodeCharReferences($value);
        } elseif ($param_name == 'target') {
            $inTargetName = $value;
        } elseif ($param_name == null && $value == 'popup') {
            self::loadScriptsForPopupForm($parser);
            $classStr = 'popupformlink';
        } elseif ($param_name !== null && !$positionalParameters) {
            // Any other named parameter becomes part of the query.
            $value = urlencode($value);
            parse_str("{$param_name}={$value}", $arr);
            $inQueryArr = self::array_merge_recursive_distinct($inQueryArr, $arr);
        } elseif ($i == 0) {
            $inFormName = $value;
            $positionalParameters = true;
        } elseif ($i == 1) {
            $inLinkStr = $value;
        } elseif ($i == 2) {
            $inLinkType = $value;
        } elseif ($i == 3) {
            // Change HTML-encoded ampersands directly to
            // URL-encoded ampersands, so that the string
            // doesn't get split up on the '&'.
            $inQueryStr = str_replace('&amp;', '%26', $value);
            parse_str($inQueryStr, $arr);
            $inQueryArr = self::array_merge_recursive_distinct($inQueryArr, $arr);
        }
    }

    $ad = SFUtils::getSpecialPage($specialPageName);
    $link_url = $ad->getTitle()->getLocalURL() . "/{$inFormName}";
    if (!empty($inTargetName)) {
        $link_url .= "/{$inTargetName}";
    }
    $link_url = str_replace(' ', '_', $link_url);

    $hidden_inputs = "";
    if (!empty($inQueryArr)) {
        // Special handling for the buttons - query string
        // has to be turned into hidden inputs.
        if ($inLinkType == 'button' || $inLinkType == 'post button') {
            $query_components = explode('&', http_build_query($inQueryArr, '', '&'));
            foreach ($query_components as $query_component) {
                $var_and_val = explode('=', $query_component, 2);
                if (count($var_and_val) == 2) {
                    $hidden_inputs .= Html::hidden(urldecode($var_and_val[0]), urldecode($var_and_val[1]));
                }
            }
        } else {
            $link_url .= strstr($link_url, '?') ? '&' : '?';
            $link_url .= str_replace('+', '%20', http_build_query($inQueryArr, '', '&'));
        }
    }

    if ($inLinkType == 'button' || $inLinkType == 'post button') {
        $formMethod = $inLinkType == 'button' ? 'get' : 'post';
        $str = Html::rawElement(
            'form',
            array('action' => $link_url, 'method' => $formMethod, 'class' => $classStr),
            '<button ' . Html::expandAttributes(array('type' => 'submit', 'value' => $inLinkStr)) . '>' . $inLinkStr . '</button>' . $hidden_inputs
        );
    } else {
        // If a target page has been specified but it doesn't
        // exist, make it a red link.
        if (!empty($inTargetName)) {
            $targetTitle = Title::newFromText($inTargetName);
            if (is_null($targetTitle) || !$targetTitle->exists()) {
                $classStr .= " new";
            }
        }
        $str = Html::rawElement('a', array('href' => $link_url, 'class' => $classStr, 'title' => $inTooltip), $inLinkStr);
    }

    return $str;
}
/**
 * Guess the anchor of a section from the wikitext of its heading, for
 * example "Header" from "== Header ==".
 *
 * @param string $text wikitext fragment, presumably taken from a heading
 * @return string anchor including the leading '#'
 */
public function guessSectionNameFromWikiText($text) {
    # Strip out wikitext links(they break the anchor)
    $withoutLinks = $this->stripSectionName($text);
    $decoded = Sanitizer::decodeCharReferences($withoutLinks);

    # strip out HTML
    $plain = trim(StringUtils::delimiterReplace('<', '>', '', $decoded));

    $anchor = '#' . urlencode(str_replace(' ', '_', $plain));

    // Keep ':' readable, then turn every remaining '%' into '.'.
    return str_replace(array('%3A', '%'), array(':', '.'), $anchor);
}
/**
 * Escape a text fragment, say from a link, for a URL.
 *
 * Spaces become underscores, character references are decoded and the
 * result is URL-encoded; ':' is kept readable and every other '%' is
 * written as '.'.
 *
 * @param string $fragment
 * @return string
 */
static function escapeFragmentForURL($fragment) {
    $underscored = str_replace(' ', '_', $fragment);
    $encoded = urlencode(Sanitizer::decodeCharReferences($underscored));

    return strtr($encoded, array('%3A' => ':', '%' => '.'));
}
/**
 * Normalize a URL before it is turned into an external link.
 *
 * Character references are decoded, a fixed set of special and control
 * characters is rewritten through cleanUrlCallback(), characters that are
 * ignored in IDNs are stripped from the host part, and a percent-encoded
 * bracketed IPv6 host is decoded again.
 *
 * @param string $url
 * @return mixed|string
 */
static function cleanUrl($url) {
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = Sanitizer::decodeCharReferences($url);

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback('/[\\][<>"\\x00-\\x20\\x7F\\|]/', array(__CLASS__, 'cleanUrlCallback'), $url);

    # Validate hostname portion
    $matches = array();
    if (preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
        list(, $protocol, $host, $rest) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        // The code points are written as \x{...} escapes instead of literal
        // characters: most of them are invisible and are easily lost or
        // altered when the source file is edited or transcoded.
        $strip = '/
            \\s|                    # general whitespace
            \\x{00ad}|              # 00ad SOFT HYPHEN
            \\x{1806}|              # 1806 MONGOLIAN TODO SOFT HYPHEN
            \\x{200b}|              # 200b ZERO WIDTH SPACE
            \\x{2060}|              # 2060 WORD JOINER
            \\x{feff}|              # feff ZERO WIDTH NO-BREAK SPACE
            \\x{034f}|              # 034f COMBINING GRAPHEME JOINER
            \\x{180b}|              # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
            \\x{180c}|              # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
            \\x{180d}|              # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
            \\x{200c}|              # 200c ZERO WIDTH NON-JOINER
            \\x{200d}|              # 200d ZERO WIDTH JOINER
            [\\x{fe00}-\\x{fe0f}]   # fe00-fe0f VARIATION SELECTOR-1-16
            /xuD';

        $host = preg_replace($strip, '', $host);

        // IPv6 host names are bracketed with []. Url-decode these.
        if (substr_compare("//%5B", $host, 0, 5) === 0
            && preg_match('!^//%5B([0-9A-Fa-f:.]+)%5D((:\\d+)?)$!', $host, $matches)
        ) {
            $host = '//[' . $matches[1] . ']' . $matches[2];
        }

        // @todo FIXME: Validate hostnames here

        return $protocol . $host . $rest;
    } else {
        return $url;
    }
}
/**
 * Create a section edit link. This supersedes editSectionLink() and
 * editSectionLinkForOther().
 *
 * @param $nt Title The title being linked to (may not be the same as
 *   $wgTitle, if the section is included from a template)
 * @param string $section The designation of the section being pointed to,
 *   to be included in the link, like "&section=$section"
 * @param string $tooltip The tooltip to use for the link: will be escaped
 *   and wrapped in the 'editsectionhint' message
 * @param $lang string Language code
 * @return string HTML to use for edit link
 */
public function doEditSectionLink(Title $nt, $section, $tooltip = null, $lang = false) {
    // HTML generated here should probably have userlangattributes
    // added to it for LTR text on RTL pages

    $lang = wfGetLangObj($lang);

    $linkAttribs = array();
    if (!is_null($tooltip)) {
        # Bug 25462: undo double-escaping.
        $tooltip = Sanitizer::decodeCharReferences($tooltip);
        $linkAttribs['title'] = wfMessage('editsectionhint')
            ->rawParams($tooltip)
            ->inLanguage($lang)
            ->text();
    }

    $linkText = wfMessage('editsection')->inLanguage($lang)->text();
    $link = Linker::link(
        $nt,
        $linkText,
        $linkAttribs,
        array('action' => 'edit', 'section' => $section),
        array('noclasses', 'known')
    );

    # Run the old hook. This takes up half of the function . . . hopefully
    # we can rid of it someday.
    $attribs = '';
    if ($tooltip) {
        $escapedHint = wfMessage('editsectionhint')
            ->rawParams($tooltip)
            ->inLanguage($lang)
            ->escaped();
        $attribs = " title=\"{$escapedHint}\"";
    }

    $result = null;
    wfRunHooks('EditSectionLink', array(&$this, $nt, $section, $attribs, $link, &$result, $lang));

    if (!is_null($result)) {
        # For reverse compatibility, add the brackets *after* the hook is
        # run, and even add them to hook-provided text. (This is the main
        # reason that the EditSectionLink hook is deprecated in favor of
        # DoEditSectionLink: it can't change the brackets or the span.)
        $bracketed = wfMessage('editsection-brackets')
            ->rawParams($result)
            ->inLanguage($lang)
            ->escaped();
        return "<span class=\"editsection\">{$bracketed}</span>";
    }

    # Add the brackets and the span, and *then* run the nice new hook, with
    # clean and non-redundant arguments.
    $bracketedLink = wfMessage('editsection-brackets')
        ->rawParams($link)
        ->inLanguage($lang)
        ->escaped();
    $result = "<span class=\"editsection\">{$bracketedLink}</span>";

    wfRunHooks('DoEditSectionLink', array($this, $nt, $section, $tooltip, &$result, $lang));
    return $result;
}
/**
 * Heuristic for detecting files that *could* contain JavaScript instructions or
 * things that may look like HTML to a browser and are thus
 * potentially harmful. The present implementation will produce false
 * positives in some situations.
 *
 * @param string $file pathname to the temporary upload file
 * @param string $mime the mime type of the file
 * @param string $extension the extension of the file
 * @return Boolean: true if the file contains something looking like embedded scripts
 */
public static function detectScript($file, $mime, $extension) {
    global $wgAllowTitlesInSVG;
    wfProfileIn(__METHOD__);

    # ugly hack: for text files, always look at the entire file.
    # For binary field, just check the first K.
    if (strpos($mime, 'text/') === 0) {
        $chunk = file_get_contents($file);
    } else {
        $fp = fopen($file, 'rb');
        $chunk = fread($fp, 1024);
        fclose($fp);
    }

    $chunk = strtolower($chunk);

    if (!$chunk) {
        wfProfileOut(__METHOD__);
        return false;
    }

    # decode from UTF-16 if needed (could be used for obfuscation).
    # The byte order marks are spelled as byte escapes: written as literal
    # characters they are two bytes each in a UTF-8 source file, so they
    # could never equal the two-byte prefix taken from the chunk.
    if (substr($chunk, 0, 2) == "\xfe\xff") {
        $enc = 'UTF-16BE';
    } elseif (substr($chunk, 0, 2) == "\xff\xfe") {
        $enc = 'UTF-16LE';
    } else {
        $enc = null;
    }

    if ($enc) {
        $chunk = iconv($enc, "ASCII//IGNORE", $chunk);
    }

    $chunk = trim($chunk);

    # @todo FIXME: Convert from UTF-16 if necessary!
    wfDebug(__METHOD__ . ": checking for embedded scripts and HTML stuff\n");

    # check for HTML doctype
    if (preg_match("/<!DOCTYPE *X?HTML/i", $chunk)) {
        wfProfileOut(__METHOD__);
        return true;
    }

    // Some browsers will interpret obscure xml encodings as UTF-8, while
    // PHP/expat will interpret the given encoding in the xml declaration (bug 47304)
    if ($extension == 'svg' || strpos($mime, 'image/svg') === 0) {
        if (self::checkXMLEncodingMissmatch($file)) {
            wfProfileOut(__METHOD__);
            return true;
        }
    }

    /**
     * Internet Explorer for Windows performs some really stupid file type
     * autodetection which can cause it to interpret valid image files as HTML
     * and potentially execute JavaScript, creating a cross-site scripting
     * attack vectors.
     *
     * Apple's Safari browser also performs some unsafe file type autodetection
     * which can cause legitimate files to be interpreted as HTML if the
     * web server is not correctly configured to send the right content-type
     * (or if you're really uploading plain text and octet streams!)
     *
     * Returns true if IE is likely to mistake the given file for HTML.
     * Also returns true if Safari would mistake the given file for HTML
     * when served with a generic content-type.
     */
    $tags = array('<a href', '<body', '<head', '<html', '<img', '<pre', '<script', '<table');

    if (!$wgAllowTitlesInSVG && $extension !== 'svg' && $mime !== 'image/svg') {
        $tags[] = '<title';
    }

    foreach ($tags as $tag) {
        if (false !== strpos($chunk, $tag)) {
            wfDebug(__METHOD__ . ": found something that may make it be mistaken for html: {$tag}\n");
            wfProfileOut(__METHOD__);
            return true;
        }
    }

    /*
     * look for JavaScript
     */

    # resolve entity-refs to look at attributes. may be harsh on big files... cache result?
    $chunk = Sanitizer::decodeCharReferences($chunk);

    # look for script-types
    if (preg_match('!type\\s*=\\s*[\'"]?\\s*(?:\\w*/)?(?:ecma|java)!sim', $chunk)) {
        wfDebug(__METHOD__ . ": found script types\n");
        wfProfileOut(__METHOD__);
        return true;
    }

    # look for html-style script-urls
    if (preg_match('!(?:href|src|data)\\s*=\\s*[\'"]?\\s*(?:ecma|java)script:!sim', $chunk)) {
        wfDebug(__METHOD__ . ": found html-style script urls\n");
        wfProfileOut(__METHOD__);
        return true;
    }

    # look for css-style script-urls
    if (preg_match('!url\\s*\\(\\s*[\'"]?\\s*(?:ecma|java)script:!sim', $chunk)) {
        wfDebug(__METHOD__ . ": found css-style script urls\n");
        wfProfileOut(__METHOD__);
        return true;
    }

    wfDebug(__METHOD__ . ": no scripts found\n");
    wfProfileOut(__METHOD__);
    return false;
}
/**
 * Replace character references in a text with the Unicode characters they
 * stand for. Thin wrapper around Sanitizer::decodeCharReferences().
 *
 * @param string $text
 * @return string whatever Sanitizer::decodeCharReferences() returns
 */
function smwfHTMLtoUTF8($text) {
    $decoded = Sanitizer::decodeCharReferences($text);
    return $decoded;
}
/**
 * Return the original filename of the uploaded file, as reported by
 * the submitting user agent. HTML-style character entities are
 * interpreted and normalized to Unicode normalization form C, in part
 * to deal with weird input from Safari with non-ASCII filenames.
 *
 * Other than this the name is not verified for being a safe filename.
 *
 * @param $key String: key of the upload in $_FILES
 * @return string or NULL if no such file.
 */
public function getFileName($key) {
    global $wgContLang;

    if (!isset($_FILES[$key])) {
        return null;
    }

    $reported = $_FILES[$key]['name'];

    # Safari sends filenames in HTML-encoded Unicode form D...
    # Horrid and evil! Let's try to make some kind of sense of it.
    $name = $wgContLang->normalize(Sanitizer::decodeCharReferences($reported));

    wfDebug("WebRequest::getFileName() '" . $reported . "' normalized to '{$name}'\n");
    return $name;
}
/**
 * Extract options from a blob of text.
 *
 * Every line of the form "name=value" is read (names are lower-cased and
 * trimmed, values are trimmed and have their character references
 * decoded); recognised names are copied onto the matching member
 * variable. Afterwards the width classes and the element size are
 * validated.
 *
 * @param string $text Tag contents
 */
public function extractOptions($text) {
    // Parse all possible options
    $values = array();
    foreach (explode("\n", $text) as $line) {
        $pair = explode('=', $line, 2);
        if (count($pair) < 2) {
            // No '=' on this line.
            continue;
        }
        $optionName = strtolower(trim($pair[0]));
        $values[$optionName] = Sanitizer::decodeCharReferences(trim($pair[1]));
    }

    // Validate the dir value.
    if (isset($values['dir']) && !in_array($values['dir'], array('ltr', 'rtl'))) {
        unset($values['dir']);
    }

    // Build list of options, with local member names
    $memberByOption = array(
        'type' => 'mType',
        'default' => 'mDefaultText',
        'placeholder' => 'mPlaceholderText',
        'buttonlabel' => 'mButtonLabel',
        'fancybutton' => 'mFancyButton',
        'labeltext' => 'mLabelText',
        'id' => 'mID',
        'category' => 'mCategory',
        'dir' => 'mDir',
        'internal' => 'mInternal',
        'mobile' => 'mMobile',
        'elementsize' => 'mElementSize',
        'hiddenbuttonlabel' => 'mHiddenButtonLabel',
        'widthclasses' => 'mWidthClasses',
    );
    foreach ($memberByOption as $optionName => $member) {
        if (isset($values[$optionName])) {
            $this->{$member} = $values[$optionName];
        }
    }

    // Validate css classes: keep only those matching the allowed pattern.
    $accepted = array();
    foreach (explode(' ', $this->mWidthClasses) as $candidate) {
        if (preg_match($this->validExtraClassesRegEx, $candidate) === 1) {
            $accepted[] = $candidate;
        }
    }
    $this->mWidthClasses = $accepted;

    // Fall back to the default size for anything unrecognised.
    if (!in_array($this->mElementSize, $this->validElementSizes)) {
        $this->mElementSize = 'normal';
    }
}
/**
 * Process [[ ]] wikilinks (RIL)
 *
 * The text is split on "[[" and each piece is examined in turn. Depending
 * on the target, a piece is emitted unchanged, registered on the parser
 * output (language links, categories), rendered directly (images, media
 * links, self links), or replaced by a link holder that is resolved later.
 *
 * @param string &$s wikitext to process; rewritten in place
 * @throws MWException if $this->mTitle is null
 * @return LinkHolderArray holders created while processing $s
 *
 * @private
 */
function replaceInternalLinks2(&$s) {
    wfProfileIn(__METHOD__);

    wfProfileIn(__METHOD__ . '-setup');
    // The patterns are built once and reused across calls.
    static $tc = false, $e1, $e1_img;
    # the % is needed to support urlencoded titles as well
    if (!$tc) {
        $tc = Title::legalChars() . '#%';
        # Match a link having the form [[namespace:link|alternate]]trail
        $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD";
        # Match cases where there is no "]]", which might still be images
        $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD";
    }

    $holders = new LinkHolderArray($this);

    # split the entire text string on occurrences of [[
    $a = StringUtils::explode('[[', ' ' . $s);
    # get the first element (all text up to first [[), and remove the space we added
    $s = $a->current();
    $a->next();
    $line = $a->current(); # Workaround for broken ArrayIterator::next() that returns "void"
    $s = substr($s, 1);

    $useLinkPrefixExtension = $this->getTargetLanguage()->linkPrefixExtension();
    $e2 = null;
    if ($useLinkPrefixExtension) {
        # Match the end of a line for a word that's not followed by whitespace,
        # e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched
        global $wgContLang;
        $charset = $wgContLang->linkPrefixCharset();
        $e2 = "/^((?>.*[^{$charset}]|))(.+)\$/sDu";
    }

    if (is_null($this->mTitle)) {
        // Close both profiling sections before bailing out.
        wfProfileOut(__METHOD__ . '-setup');
        wfProfileOut(__METHOD__);
        throw new MWException(__METHOD__ . ": \$this->mTitle is null\n");
    }
    $nottalk = !$this->mTitle->isTalkPage();

    if ($useLinkPrefixExtension) {
        $m = array();
        if (preg_match($e2, $s, $m)) {
            $first_prefix = $m[2];
        } else {
            $first_prefix = false;
        }
    } else {
        $prefix = '';
    }

    $useSubpages = $this->areSubpagesAllowed();
    wfProfileOut(__METHOD__ . '-setup');

    # Loop for each link
    for (; $line !== false && $line !== null; $a->next(), $line = $a->current()) {
        # Check for excessive memory usage
        if ($holders->isBig()) {
            # Too big
            # Do the existence check, replace the link holders and clear the array
            $holders->replace($s);
            $holders->clear();
        }

        if ($useLinkPrefixExtension) {
            wfProfileIn(__METHOD__ . '-prefixhandling');
            if (preg_match($e2, $s, $m)) {
                $prefix = $m[2];
                $s = $m[1];
            } else {
                $prefix = '';
            }
            # first link
            if ($first_prefix) {
                $prefix = $first_prefix;
                $first_prefix = false;
            }
            wfProfileOut(__METHOD__ . '-prefixhandling');
        }

        $might_be_img = false;

        wfProfileIn(__METHOD__ . "-e1");
        if (preg_match($e1, $line, $m)) { # page with normal text or alt
            $text = $m[2];
            # If we get a ] at the beginning of $m[3] that means we have a link that's something like:
            # [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row breaks things,
            # the real problem is with the $e1 regex
            # See bug 1300.
            #
            # Still some problems for cases where the ] is meant to be outside punctuation,
            # and no image is in sight. See bug 2095.
            #
            if ($text !== '' && substr($m[3], 0, 1) === ']' && strpos($text, '[') !== false) {
                $text .= ']'; # so that replaceExternalLinks($text) works later
                $m[3] = substr($m[3], 1);
            }
            # fix up urlencoded title texts
            if (strpos($m[1], '%') !== false) {
                # Should anchors '#' also be rejected?
                # NOTE(review): search and replacement arrays are identical here, so this
                # str_replace() changes nothing; presumably the replacements were meant to
                # be the entity forms of '<' and '>' -- confirm against upstream.
                $m[1] = str_replace(array('<', '>'), array('<', '>'), rawurldecode($m[1]));
            }
            $trail = $m[3];
        } elseif (preg_match($e1_img, $line, $m)) { # Invalid, but might be an image with a link in its caption
            $might_be_img = true;
            $text = $m[2];
            if (strpos($m[1], '%') !== false) {
                $m[1] = rawurldecode($m[1]);
            }
            $trail = "";
        } else { # Invalid form; output directly
            $s .= $prefix . '[[' . $line;
            wfProfileOut(__METHOD__ . "-e1");
            continue;
        }
        wfProfileOut(__METHOD__ . "-e1");
        wfProfileIn(__METHOD__ . "-misc");

        # Don't allow internal links to pages containing
        # PROTO: where PROTO is a valid URL protocol; these
        # should be external links.
        if (preg_match('/^(?i:' . $this->mUrlProtocols . ')/', $m[1])) {
            $s .= $prefix . '[[' . $line;
            wfProfileOut(__METHOD__ . "-misc");
            continue;
        }

        # Make subpage if necessary
        if ($useSubpages) {
            $link = $this->maybeDoSubpageLink($m[1], $text);
        } else {
            $link = $m[1];
        }

        // $noforce is true unless the link target starts with ':'.
        $noforce = substr($m[1], 0, 1) !== ':';
        if (!$noforce) {
            # Strip off leading ':'
            $link = substr($link, 1);
        }

        wfProfileOut(__METHOD__ . "-misc");
        wfProfileIn(__METHOD__ . "-title");
        $nt = Title::newFromText($this->mStripState->unstripNoWiki($link));
        if ($nt === null) {
            // Not a valid title: emit the original text unchanged.
            $s .= $prefix . '[[' . $line;
            wfProfileOut(__METHOD__ . "-title");
            continue;
        }

        $ns = $nt->getNamespace();
        $iw = $nt->getInterwiki();
        wfProfileOut(__METHOD__ . "-title");

        if ($might_be_img) { # if this is actually an invalid link
            wfProfileIn(__METHOD__ . "-might_be_img");
            if ($ns == NS_FILE && $noforce) { # but might be an image
                $found = false;
                while (true) {
                    # look at the next 'line' to see if we can close it there
                    $a->next();
                    $next_line = $a->current();
                    if ($next_line === false || $next_line === null) {
                        break;
                    }
                    $m = explode(']]', $next_line, 3);
                    if (count($m) == 3) {
                        # the first ]] closes the inner link, the second the image
                        $found = true;
                        $text .= "[[{$m[0]}]]{$m[1]}";
                        $trail = $m[2];
                        break;
                    } elseif (count($m) == 2) {
                        # if there's exactly one ]] that's fine, we'll keep looking
                        $text .= "[[{$m[0]}]]{$m[1]}";
                    } else {
                        # if $next_line is invalid too, we need look no further
                        $text .= '[[' . $next_line;
                        break;
                    }
                }
                if (!$found) {
                    # we couldn't find the end of this imageLink, so output it raw
                    # but don't ignore what might be perfectly normal links in the text we've examined
                    $holders->merge($this->replaceInternalLinks2($text));
                    $s .= "{$prefix}[[{$link}|{$text}";
                    # note: no $trail, because without an end, there *is* no trail
                    wfProfileOut(__METHOD__ . "-might_be_img");
                    continue;
                }
            } else { # it's not an image, so output it raw
                $s .= "{$prefix}[[{$link}|{$text}";
                # note: no $trail, because without an end, there *is* no trail
                wfProfileOut(__METHOD__ . "-might_be_img");
                continue;
            }
            wfProfileOut(__METHOD__ . "-might_be_img");
        }

        // A link without an alternate part uses its target as text.
        $wasblank = $text == '';
        if ($wasblank) {
            $text = $link;
        } else {
            # Bug 4598 madness. Handle the quotes only if they come from the alternate part
            # [[Lista d''e paise d''o munno]] -> <a href="...">Lista d''e paise d''o munno</a>
            # [[Criticism of Harry Potter|Criticism of ''Harry Potter'']]
            #    -> <a href="Criticism of Harry Potter">Criticism of <i>Harry Potter</i></a>
            $text = $this->doQuotes($text);
        }

        # Link not escaped by : , create the various objects
        if ($noforce) {
            # Interwikis
            wfProfileIn(__METHOD__ . "-interwiki");
            if ($iw && $this->mOptions->getInterwikiMagic() && $nottalk && Language::fetchLanguageName($iw, null, 'mw')) {
                // XXX: the above check prevents links to sites with identifiers that are not language codes

                # Bug 24502: filter duplicates
                if (!isset($this->mLangLinkLanguages[$iw])) {
                    $this->mLangLinkLanguages[$iw] = true;
                    $this->mOutput->addLanguageLink($nt->getFullText());
                }

                $s = rtrim($s . $prefix);
                $s .= trim($trail, "\n") == '' ? '' : $prefix . $trail;
                wfProfileOut(__METHOD__ . "-interwiki");
                continue;
            }
            wfProfileOut(__METHOD__ . "-interwiki");

            if ($ns == NS_FILE) {
                wfProfileIn(__METHOD__ . "-image");
                if (!wfIsBadImage($nt->getDBkey(), $this->mTitle)) {
                    if ($wasblank) {
                        # if no parameters were passed, $text
                        # becomes something like "File:Foo.png",
                        # which we don't want to pass on to the
                        # image generator
                        $text = '';
                    } else {
                        # recursively parse links inside the image caption
                        # actually, this will parse them in any other parameters, too,
                        # but it might be hard to fix that, and it doesn't matter ATM
                        $text = $this->replaceExternalLinks($text);
                        $holders->merge($this->replaceInternalLinks2($text));
                    }
                    # cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them
                    $s .= $prefix . $this->armorLinks($this->makeImage($nt, $text, $holders)) . $trail;
                } else {
                    // Bad image: drop the link and keep only the surrounding text.
                    $s .= $prefix . $trail;
                }
                wfProfileOut(__METHOD__ . "-image");
                continue;
            }

            if ($ns == NS_CATEGORY) {
                wfProfileIn(__METHOD__ . "-category");
                $s = rtrim($s . "\n"); # bug 87

                if ($wasblank) {
                    $sortkey = $this->getDefaultSort();
                } else {
                    $sortkey = $text;
                }
                $sortkey = Sanitizer::decodeCharReferences($sortkey);
                $sortkey = str_replace("\n", '', $sortkey);
                $sortkey = $this->getConverterLanguage()->convertCategoryKey($sortkey);
                $this->mOutput->addCategory($nt->getDBkey(), $sortkey);

                /**
                 * Strip the whitespace Category links produce, see bug 87
                 */
                $s .= trim($prefix . $trail, "\n") == '' ? '' : $prefix . $trail;

                wfProfileOut(__METHOD__ . "-category");
                continue;
            }
        }

        # Self-link checking. For some languages, variants of the title are checked in
        # LinkHolderArray::doVariants() to allow batching the existence checks necessary
        # for linking to a different variant.
        if ($ns != NS_SPECIAL && $nt->equals($this->mTitle) && !$nt->hasFragment()) {
            $s .= $prefix . Linker::makeSelfLinkObj($nt, $text, '', $trail);
            continue;
        }

        # NS_MEDIA is a pseudo-namespace for linking directly to a file
        # @todo FIXME: Should do batch file existence checks, see comment below
        if ($ns == NS_MEDIA) {
            wfProfileIn(__METHOD__ . "-media");
            # Give extensions a chance to select the file revision for us
            $options = array();
            $descQuery = false;
            wfRunHooks('BeforeParserFetchFileAndTitle', array($this, $nt, &$options, &$descQuery));
            # Fetch and register the file (file title may be different via hooks)
            list($file, $nt) = $this->fetchFileAndTitle($nt, $options);
            # Cloak with NOPARSE to avoid replacement in replaceExternalLinks
            $s .= $prefix . $this->armorLinks(Linker::makeMediaLinkFile($nt, $file, $text)) . $trail;
            wfProfileOut(__METHOD__ . "-media");
            continue;
        }

        wfProfileIn(__METHOD__ . "-always_known");
        # Some titles, such as valid special pages or files in foreign repos, should
        # be shown as bluelinks even though they're not included in the page table
        #
        # @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do
        # batch file existence checks for NS_FILE and NS_MEDIA
        if ($iw == '' && $nt->isAlwaysKnown()) {
            $this->mOutput->addLink($nt);
            $s .= $this->makeKnownLinkHolder($nt, $text, array(), $trail, $prefix);
        } else {
            # Links will be added to the output link list after checking
            $s .= $holders->makeHolder($nt, $text, array(), $trail, $prefix);
        }
        wfProfileOut(__METHOD__ . "-always_known");
    }
    wfProfileOut(__METHOD__);
    return $holders;
}
/**
 * Format an anchor fragment as it would appear for a given section name.
 *
 * Character references are decoded, anything between '<' and '>' is
 * removed, spaces become underscores and the result is URL-encoded with
 * ':' kept readable and every other '%' written as '.'.
 *
 * @param string $text
 * @return string anchor including the leading '#'
 * @access private
 */
function sectionAnchor($text) {
    $decoded = Sanitizer::decodeCharReferences($text);

    # strip out HTML
    $plain = trim(preg_replace('/<.*?' . '>/', '', $decoded));

    $anchor = '#' . urlencode(str_replace(' ', '_', $plain));

    // Keep ':' readable, then turn every remaining '%' into '.'.
    return str_replace(array('%3A', '%'), array(':', '.'), $anchor);
}
/**
 * Render timeline source into an image plus image map and return the HTML
 * that embeds them.
 *
 * Output files live in "$wgUploadDirectory/timeline/" and are named after an
 * MD5 hash of the source (re-hashed with $wgRenderHashAppend when that is set).
 * The external renderer runs only when no previous result (.png or .err) exists,
 * or when the existing .png is older than $wgTimelineSettings->epochTimestamp.
 *
 * @param string $timelinesrc Timeline source text
 * @return string HTML: either an error box or the <map> followed by the <img>
 */
function renderTimeline( $timelinesrc ){
    global $wgUploadDirectory, $wgUploadPath, $IP, $wgTimelineSettings;
    global $wgArticlePath, $wgTmpDirectory, $wgRenderHashAppend;

    $hash = md5( $timelinesrc );
    if ( $wgRenderHashAppend != "" ) {
        $hash = md5( $hash . $wgRenderHashAppend );
    }

    // Make sure both the output directory and the temp directory exist
    $dest = $wgUploadDirectory . "/timeline/";
    foreach ( array( $dest, $wgTmpDirectory ) as $dir ) {
        if ( !is_dir( $dir ) ) {
            mkdir( $dir, 0777 );
        }
    }

    $fname = $dest . $hash;
    $pngFile = $fname . ".png";

    $previouslyFailed = file_exists( $fname . ".err" );
    $previouslyRendered = file_exists( $pngFile );
    $expired = false;
    if ( $previouslyRendered ) {
        $expired = filemtime( $pngFile ) < wfTimestamp( TS_UNIX, $wgTimelineSettings->epochTimestamp );
    }

    if ( $expired || !( $previouslyRendered || $previouslyFailed ) ) {
        // The source is handed to the renderer through a file named after the hash
        $handle = fopen( $fname, "w" );
        fwrite( $handle, $timelinesrc );
        fclose( $handle );

        $cmdline = wfEscapeShellArg( $wgTimelineSettings->perlCommand, $wgTimelineSettings->timelineFile );
        $cmdline .= " -i " . wfEscapeShellArg( $fname );
        $cmdline .= " -m -P " . wfEscapeShellArg( $wgTimelineSettings->ploticusCommand );
        $cmdline .= " -T " . wfEscapeShellArg( $wgTmpDirectory );
        $cmdline .= " -A " . wfEscapeShellArg( $wgArticlePath );
        $cmdline .= " -f " . wfEscapeShellArg( $wgTimelineSettings->fontFile );

        wfDebug( "Timeline cmd: $cmdline\n" );
        $ret = `{$cmdline}`;

        // The temporary source file is removed whether or not rendering worked
        unlink( $fname );

        if ( $ret == "" ) {
            // Message not localized, only relevant during install
            return '<div id="toc" dir="ltr"><tt>Timeline error. Command line was: '
                . htmlspecialchars( $cmdline ) . '</tt></div>';
        }
    }

    // A non-empty .err file means the renderer reported a problem
    $err = @file_get_contents( $fname . ".err" );
    if ( $err != "" ) {
        // Convert the error from poorly-sanitized HTML to plain text
        $paragraphMarkup = array(
            '</p><p>' => "\n\n",
            '<p>' => '',
            '</p>' => '',
            '<b>' => '',
            '</b>' => '',
            '<br>' => "\n",
        );
        $err = Sanitizer::decodeCharReferences( strtr( $err, $paragraphMarkup ) );

        // Now convert back to HTML again
        $encErr = nl2br( htmlspecialchars( $err ) );
        return "<div id=\"toc\" dir=\"ltr\"><tt>$encErr</tt></div>";
    }

    // Success path: wrap the generated map and point an <img> at the rendered file
    $escapedHash = htmlspecialchars( $hash );
    $map = @file_get_contents( $fname . ".map" );
    $map = str_replace( ' >', ' />', $map );
    $map = easyTimelineFixMap( "<map name=\"timeline_" . $escapedHash . "\">{$map}</map>" );

    $ext = wfIsWindows() ? "gif" : "png";
    $url = "{$wgUploadPath}/timeline/{$hash}.{$ext}";

    $txt = $map . "<img usemap=\"#timeline_" . $escapedHash . "\" src=\"" . htmlspecialchars( $url ) . "\">";

    if ( $expired ) {
        // Replacing an older file, we may need to purge the old one.
        global $wgUseSquid;
        if ( $wgUseSquid ) {
            $u = new SquidUpdate( array( $url ) );
            $u->doUpdate();
        }
    }

    return $txt;
}
/**
 * Heuristic for detecting files that *could* contain JavaScript instructions or
 * things that may look like HTML to a browser and are thus potentially harmful.
 * The present implementation will produce false positives in some situations.
 *
 * @param string $file Pathname to the temporary upload file
 * @param string $mime The mime type of the file
 * @param string $extension The extension of the file
 * @return bool true if the file contains something looking like embedded scripts
 */
function detectScript($file, $mime, $extension) {
    global $wgAllowTitlesInSVG;

    # Ugly hack: for text files, always look at the entire file.
    # For binary files, just check the first K.
    if (strpos($mime, 'text/') === 0) {
        $chunk = file_get_contents($file);
    } else {
        $fp = fopen($file, 'rb');
        if ($fp === false) {
            # Nothing could be read; same outcome as the empty-chunk case below
            return false;
        }
        $chunk = fread($fp, 1024);
        fclose($fp);
    }

    # A failed read yields false; the cast turns that into an empty chunk
    $chunk = strtolower((string)$chunk);
    if (!$chunk) {
        return false;
    }

    # Decode from UTF-16 if needed (could be used for obfuscation).
    # The byte order mark is written with explicit byte escapes so it cannot be
    # damaged by the encoding of this source file.
    $bom = substr($chunk, 0, 2);
    if ($bom == "\xfe\xff") {
        $enc = "UTF-16BE";
    } elseif ($bom == "\xff\xfe") {
        $enc = "UTF-16LE";
    } else {
        $enc = NULL;
    }
    if ($enc) {
        $chunk = iconv($enc, "ASCII//IGNORE", $chunk);
    }
    $chunk = trim($chunk);

    wfDebug("SpecialUpload::detectScript: checking for embedded scripts and HTML stuff\n");

    # Check for HTML doctype (preg_match replaces eregi(), which no longer exists in PHP 7+)
    if (preg_match('/<!DOCTYPE *X?HTML/i', $chunk)) {
        return true;
    }

    /**
     * Internet Explorer for Windows performs some really stupid file type
     * autodetection which can cause it to interpret valid image files as HTML
     * and potentially execute JavaScript, creating a cross-site scripting
     * attack vector.
     *
     * Apple's Safari browser also performs some unsafe file type autodetection
     * which can cause legitimate files to be interpreted as HTML if the
     * web server is not correctly configured to send the right content-type
     * (or if you're really uploading plain text and octet streams!)
     *
     * Returns true if IE is likely to mistake the given file for HTML.
     * Also returns true if Safari would mistake the given file for HTML
     * when served with a generic content-type.
     */
    $tags = array('<body', '<head', '<html', '<img', '<pre', '<script', '<table');
    if (!$wgAllowTitlesInSVG && $extension !== 'svg' && $mime !== 'image/svg') {
        $tags[] = '<title';
    }
    foreach ($tags as $tag) {
        if (false !== strpos($chunk, $tag)) {
            return true;
        }
    }

    /*
     * Look for javascript
     */

    # Resolve entity-refs to look at attributes. May be harsh on big files... cache result?
    $chunk = Sanitizer::decodeCharReferences($chunk);

    # Look for script-types
    if (preg_match("!type\\s*=\\s*['\"]?\\s*(\\w*/)?(ecma|java)!sim", $chunk)) {
        return true;
    }

    # Look for html-style script-urls
    if (preg_match("!(href|src|data)\\s*=\\s*['\"]?\\s*(ecma|java)script:!sim", $chunk)) {
        return true;
    }

    # Look for css-style script-urls
    if (preg_match("!url\\s*\\(\\s*['\"]?\\s*(ecma|java)script:!sim", $chunk)) {
        return true;
    }

    wfDebug("SpecialUpload::detectScript: no scripts found\n");
    return false;
}
/**
 * Build the HTML for a section edit link. Replaces editSectionLink() and
 * editSectionLinkForOther().
 *
 * @param Title $nt Title the link points at (may differ from the current
 *   page when the section comes from a template)
 * @param string $section Section designation, placed in the link query
 *   like "&section=$section"
 * @param string $tooltip Tooltip for the link; it is wrapped in the
 *   'editsectionhint' message
 * @param string $lang Language code
 * @return string HTML to use for edit link
 */
public function doEditSectionLink(Title $nt, $section, $tooltip = null, $lang = false) {
    // HTML generated here should probably have userlangattributes
    // added to it for LTR text on RTL pages
    $lang = wfGetLangObj($lang);

    $attribs = array();
    if ($tooltip !== null) {
        # Bug 25462: undo double-escaping.
        $tooltip = Sanitizer::decodeCharReferences($tooltip);
        $hint = wfMessage('editsectionhint')->rawParams($tooltip)->inLanguage($lang);
        $attribs['title'] = $hint->text();
    }

    // One entry by default; the hook below may add, change or remove entries
    $links = array(
        'editsection' => array(
            'text' => wfMessage('editsection')->inLanguage($lang)->escaped(),
            'targetTitle' => $nt,
            'attribs' => $attribs,
            'query' => array('action' => 'edit', 'section' => $section),
            'options' => array('noclasses', 'known'),
        ),
    );
    Hooks::run('SkinEditSectionLinks', array($this, $nt, $section, $tooltip, &$links, $lang));

    $linksHtml = array();
    foreach ($links as $spec) {
        $linksHtml[] = Linker::link(
            $spec['targetTitle'],
            $spec['text'],
            $spec['attribs'],
            $spec['query'],
            $spec['options']
        );
    }

    // Links are separated by the localized pipe separator inside a divider span
    $divider = '<span class="mw-editsection-divider">'
        . wfMessage('pipe-separator')->inLanguage($lang)->text()
        . '</span>';

    $result = '<span class="mw-editsection"><span class="mw-editsection-bracket">[</span>'
        . implode($divider, $linksHtml)
        . '<span class="mw-editsection-bracket">]</span></span>';

    // Deprecated, use SkinEditSectionLinks hook instead
    Hooks::run('DoEditSectionLink', array($this, $nt, $section, $tooltip, &$result, $lang), '1.25');

    return $result;
}
/**
 * Extract options from a blob of text.
 *
 * Each line of the form "name=value" is read; names are lower-cased and
 * trimmed, values are trimmed and have character references decoded.
 * Recognized names are copied onto the matching member variable, after
 * which the line-break, width and background-color members are normalized.
 *
 * @param string $text Tag contents
 */
public function extractOptions($text) {
    wfProfileIn(__METHOD__);

    // Parse all possible options
    $values = array();
    foreach (explode("\n", $text) as $line) {
        if (strpos($line, '=') === false) {
            continue;
        }
        list($name, $value) = explode('=', $line, 2);
        $values[strtolower(trim($name))] = Sanitizer::decodeCharReferences(trim($value));
    }

    // Validate the dir value.
    if (isset($values['dir']) && !in_array($values['dir'], array('ltr', 'rtl'), true)) {
        unset($values['dir']);
    }

    // Build list of options, with local member names
    $options = array(
        'type' => 'mType',
        'width' => 'mWidth',
        'preload' => 'mPreload',
        'page' => 'mPage',
        'editintro' => 'mEditIntro',
        'summary' => 'mSummary',
        'nosummary' => 'mNosummary',
        'minor' => 'mMinor',
        'break' => 'mBR',
        'default' => 'mDefaultText',
        'placeholder' => 'mPlaceholderText',
        'bgcolor' => 'mBGColor',
        'buttonlabel' => 'mButtonLabel',
        'searchbuttonlabel' => 'mSearchButtonLabel',
        'fulltextbutton' => 'mFullTextButton',
        'namespaces' => 'mNamespaces',
        'labeltext' => 'mLabelText',
        'hidden' => 'mHidden',
        'id' => 'mID',
        'inline' => 'mInline',
        'prefix' => 'mPrefix',
        'dir' => 'mDir',
    );
    foreach ($options as $name => $var) {
        if (isset($values[$name])) {
            $this->{$var} = $values[$name];
        }
    }

    // Insert a line break if configured to do so
    $this->mBR = strtolower($this->mBR) == "no" ? ' ' : '<br />';

    // Validate the width; make sure it's a valid, positive integer.
    // Convert first and test afterwards: comparing the raw string with 0 lets
    // values such as "0.5" (and, under PHP 8 comparison rules, non-numeric
    // text) through, which then collapse to a width of 0.
    $width = intval($this->mWidth);
    $this->mWidth = $width > 0 ? $width : 50;

    // Validate background color
    if (!$this->isValidColor($this->mBGColor)) {
        $this->mBGColor = 'transparent';
    }

    wfProfileOut(__METHOD__);
}
/**
 * Serialize a query result as DSV text, one line per result row.
 *
 * When header output is enabled, the first line holds the labels of the
 * print requests.
 *
 * @since 1.6
 *
 * @param SMWQueryResult $res
 *
 * @return string
 */
protected function getResultFileContents(SMWQueryResult $res) {
    $lines = array();

    // Optional header line built from the print request labels
    if ($this->mShowHeaders) {
        $labels = array();
        foreach ($res->getPrintRequests() as $printRequest) {
            $labels[] = $printRequest->getLabel();
        }
        $lines[] = $this->getDSVLine($labels);
    }

    // One DSV line per result object (page)
    while ($resultRow = $res->getNext()) {
        $cells = array();

        // One cell per field (property)
        foreach ($resultRow as $field) {
            $parts = array();

            // Collect every value of the field until the iterator reports false
            for (;;) {
                $dataValue = $field->getNextDataValue();
                if ($dataValue === false) {
                    break;
                }
                $parts[] = Sanitizer::decodeCharReferences($dataValue->getWikiValue());
            }

            // Multiple values share one cell, separated by commas
            $cells[] = implode(',', $parts);
        }

        $lines[] = $this->getDSVLine($cells);
    }

    return implode("\n", $lines);
}
/**
 * Strip line feeds and carriage returns from the text, then decode its
 * HTML character references.
 *
 * @param $text String
 * @return String
 */
public static function cleanupForIRC($text) {
    $singleLine = str_replace(array("\n", "\r"), '', $text);

    return Sanitizer::decodeCharReferences($singleLine);
}
/**
 * Normalize a URL before it is used as an external link target.
 *
 * Character references are decoded, a fixed set of unsafe characters is
 * URL-encoded, and characters that are ignored in internationalized domain
 * names are removed from the host portion.
 *
 * @param string $url URL to clean; may contain HTML character references
 * @param boolean $hostname Not read by this implementation; kept so existing
 *   callers that pass it continue to work
 * @return string The cleaned URL, or the encoded input when it does not
 *   look like "protocol:..."
 */
static function cleanUrl($url, $hostname = true) {
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = Sanitizer::decodeCharReferences($url);

    # Escape any control characters introduced by the above step.
    # A callback is used instead of the /e modifier: /e is gone in PHP 7+, and
    # it ran addslashes() on the match, which mangled '"' and NUL before encoding.
    $url = preg_replace_callback(
        '/[\\][<>"\\x00-\\x20\\x7F]/',
        function ($match) {
            return urlencode($match[0]);
        },
        $url
    );

    # Validate hostname portion
    $matches = array();
    if (!preg_match('!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches)) {
        return $url;
    }
    list(, $protocol, $host, $rest) = $matches;

    // Characters that will be ignored in IDNs (RFC 3454, section 3.1).
    // Strip them before further processing so blacklists and such work.
    // Code points are spelled out as escapes so that the invisible characters
    // cannot be lost or altered when this file is edited or re-encoded.
    $strip = '/
        \s|                  # general whitespace
        \x{00ad}|            # 00ad SOFT HYPHEN
        \x{1806}|            # 1806 MONGOLIAN TODO SOFT HYPHEN
        \x{200b}|            # 200b ZERO WIDTH SPACE
        \x{2060}|            # 2060 WORD JOINER
        \x{feff}|            # feff ZERO WIDTH NO-BREAK SPACE
        \x{034f}|            # 034f COMBINING GRAPHEME JOINER
        \x{180b}|            # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
        \x{180c}|            # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
        \x{180d}|            # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
        \x{200c}|            # 200c ZERO WIDTH NON-JOINER
        \x{200d}|            # 200d ZERO WIDTH JOINER
        [\x{fe00}-\x{fe0f}]  # fe00-fe0f VARIATION SELECTOR-1-16
        /xuD';

    // With the u modifier preg_replace() yields null for invalid UTF-8;
    // keep the host unchanged in that case rather than dropping it.
    $strippedHost = preg_replace($strip, '', $host);
    if ($strippedHost !== null) {
        $host = $strippedHost;
    }

    // @fixme: validate hostnames here

    return $protocol . $host . $rest;
}