/**
 * Preprocess some wikitext and return the document tree.
 * This is the ghost of Parser::replace_variables().
 *
 * @param string $text The text to parse
 * @param int $flags Bitwise combination of:
 *     Parser::PTD_FOR_INCLUSION  Handle "<noinclude>" and "<includeonly>" as if the text is
 *                                being included. Default is to assume a direct page view.
 *
 * The generated DOM tree must depend only on the input text and the flags.
 * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 *
 * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 * change in the DOM tree for a given text, must be passed through the section identifier
 * in the section edit link and thus back to extractSections().
 *
 * The output of this function is currently only cached in process memory, but a persistent
 * cache may be implemented at a later date which takes further advantage of these strict
 * dependency requirements.
 *
 * @throws MWException
 * @return PPNode_DOM
 */
public function preprocessToObj($text, $flags = 0) {
	$xml = $this->cacheGetTree($text, $flags);
	if ($xml === false) {
		$xml = $this->preprocessToXml($text, $flags);
		$this->cacheSetTree($text, $flags, $xml);
	}

	// Fail if the number of elements exceeds acceptable limits
	// Do not attempt to generate the DOM
	$this->parser->mGeneratedPPNodeCount += substr_count($xml, '<');
	$max = $this->parser->mOptions->getMaxGeneratedPPNodeCount();
	if ($this->parser->mGeneratedPPNodeCount > $max) {
		// if ( $cacheable ) { ... }
		throw new MWException(__METHOD__ . ': generated node count limit exceeded');
	}

	$dom = new DOMDocument();
	MediaWiki\suppressWarnings();
	$result = $dom->loadXML($xml);
	MediaWiki\restoreWarnings();
	if (!$result) {
		// Try running the XML through UtfNormal to get rid of invalid characters
		$xml = UtfNormal\Validator::cleanUp($xml);
		// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
		// don't barf when the XML is >256 levels deep.
		$result = $dom->loadXML($xml, 1 << 19);
	}

	if ($result) {
		$obj = new PPNode_DOM($dom->documentElement);
	}

	// if ( $cacheable ) { ... }

	if (!$result) {
		throw new MWException(__METHOD__ . ' generated invalid XML');
	}

	return $obj;
}
function pageTextCallback($matches) {
	# Get rid of invalid UTF-8, strip control characters
	$val = htmlspecialchars(UtfNormal\Validator::cleanUp(stripcslashes($matches[1])));
	$val = str_replace(["\n", '�'], [' ', ''], $val);

	return '<PAGE value="' . $val . '" />';
}
/**
 * Preprocess some wikitext and return the document tree.
 * This is the ghost of Parser::replace_variables().
 *
 * @param string $text The text to parse
 * @param int $flags Bitwise combination of:
 *     Parser::PTD_FOR_INCLUSION  Handle "<noinclude>" and "<includeonly>" as if the text is
 *                                being included. Default is to assume a direct page view.
 *
 * The generated DOM tree must depend only on the input text and the flags.
 * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 *
 * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 * change in the DOM tree for a given text, must be passed through the section identifier
 * in the section edit link and thus back to extractSections().
 *
 * The output of this function is currently only cached in process memory, but a persistent
 * cache may be implemented at a later date which takes further advantage of these strict
 * dependency requirements.
 *
 * @throws MWException
 * @return PPNode_DOM
 */
public function preprocessToObj($text, $flags = 0) {
	global $wgMemc, $wgPreprocessorCacheThreshold;

	$xml = false;
	$cacheable = $wgPreprocessorCacheThreshold !== false
		&& strlen($text) > $wgPreprocessorCacheThreshold;

	if ($cacheable) {
		$cacheKey = wfMemcKey('preprocess-xml', md5($text), $flags);
		$cacheValue = $wgMemc->get($cacheKey);
		if ($cacheValue) {
			$version = substr($cacheValue, 0, 8);
			if (intval($version) == self::CACHE_VERSION) {
				$xml = substr($cacheValue, 8);
				// From the cache
				wfDebugLog("Preprocessor", "Loaded preprocessor XML from memcached (key {$cacheKey})");
			}
		}
		if ($xml === false) {
			$xml = $this->preprocessToXml($text, $flags);
			$cacheValue = sprintf("%08d", self::CACHE_VERSION) . $xml;
			$wgMemc->set($cacheKey, $cacheValue, 86400);
			wfDebugLog("Preprocessor", "Saved preprocessor XML to memcached (key {$cacheKey})");
		}
	} else {
		$xml = $this->preprocessToXml($text, $flags);
	}

	// Fail if the number of elements exceeds acceptable limits
	// Do not attempt to generate the DOM
	$this->parser->mGeneratedPPNodeCount += substr_count($xml, '<');
	$max = $this->parser->mOptions->getMaxGeneratedPPNodeCount();
	if ($this->parser->mGeneratedPPNodeCount > $max) {
		if ($cacheable) {
		}
		throw new MWException(__METHOD__ . ': generated node count limit exceeded');
	}

	$dom = new DOMDocument();
	wfSuppressWarnings();
	$result = $dom->loadXML($xml);
	wfRestoreWarnings();
	if (!$result) {
		// Try running the XML through UtfNormal to get rid of invalid characters
		$xml = UtfNormal\Validator::cleanUp($xml);
		// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
		// don't barf when the XML is >256 levels deep.
		$result = $dom->loadXML($xml, 1 << 19);
	}

	if ($result) {
		$obj = new PPNode_DOM($dom->documentElement);
	}

	if ($cacheable) {
	}

	if (!$result) {
		throw new MWException(__METHOD__ . ' generated invalid XML');
	}

	return $obj;
}
/**
 * Convert a UTF-8 string to normal form C. In Malayalam and Arabic, this
 * also cleans up certain backwards-compatible sequences, converting them
 * to the modern Unicode equivalent.
 *
 * This is language-specific for performance reasons only.
 *
 * @param string $s
 *
 * @return string
 */
function normalize($s) {
	global $wgAllUnicodeFixes;
	$s = UtfNormal\Validator::cleanUp($s);
	if ($wgAllUnicodeFixes) {
		$s = $this->transformUsingPairFile('normalize-ar.ser', $s);
		$s = $this->transformUsingPairFile('normalize-ml.ser', $s);
	}

	return $s;
}
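// Hedged usage sketch, not part of the class above: shows what UtfNormal\Validator::cleanUp()
// (the workhorse behind normalize()) does to its input. Assumes the wikimedia/utfnormal
// package is autoloaded; the autoload path and sample strings are invented for illustration.
require_once __DIR__ . '/vendor/autoload.php';

// Decomposed "é" (U+0065 U+0301) is converted to the precomposed NFC form (U+00E9).
$decomposed = "Caf\u{0065}\u{0301}";
$nfc = UtfNormal\Validator::cleanUp($decomposed);
var_dump($nfc === "Caf\u{00E9}"); // bool(true)

// Invalid UTF-8 bytes are replaced with U+FFFD rather than being passed through.
$cleaned = UtfNormal\Validator::cleanUp("bad byte: \xC0 end");
var_dump($cleaned); // the stray \xC0 becomes the replacement character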
/**
 * This is a method to pass messages from wfDebug to the pretty debugger.
 * Do NOT use this method, use MWDebug::log or wfDebug()
 *
 * @since 1.19
 * @param string $str
 * @param array $context
 */
public static function debugMsg($str, $context = array()) {
	global $wgDebugComments, $wgShowDebug;

	if (self::$enabled || $wgDebugComments || $wgShowDebug) {
		if ($context) {
			$prefix = '';
			if (isset($context['prefix'])) {
				$prefix = $context['prefix'];
			} elseif (isset($context['channel']) && $context['channel'] !== 'wfDebug') {
				$prefix = "[{$context['channel']}] ";
			}
			if (isset($context['seconds_elapsed']) && isset($context['memory_used'])) {
				$prefix .= "{$context['seconds_elapsed']} {$context['memory_used']} ";
			}
			$str = $prefix . $str;
		}
		self::$debug[] = rtrim(UtfNormal\Validator::cleanUp($str));
	}
}
/**
 * Recursively normalizes UTF-8 strings in the given array.
 *
 * @param string|array $data
 * @return array|string Cleaned-up version of the given data
 * @private
 */
function normalizeUnicode($data) {
	if (is_array($data)) {
		foreach ($data as $key => $val) {
			$data[$key] = $this->normalizeUnicode($val);
		}
	} else {
		global $wgContLang;
		$data = isset($wgContLang)
			? $wgContLang->normalize($data)
			: UtfNormal\Validator::cleanUp($data);
	}

	return $data;
}
if (isset($options['comment-file'])) {
	$comment = file_get_contents($options['comment-file']);
	if ($comment === false || $comment === null) {
		die("failed to read comment file: {$options['comment-file']}\n");
	}
} elseif (isset($options['comment'])) {
	$comment = $options['comment'];
}

$commentExt = isset($options['comment-ext']) ? $options['comment-ext'] : false;

$summary = isset($options['summary']) ? $options['summary'] : '';

# Get the license specifier
$license = isset($options['license']) ? $options['license'] : '';

# Batch "upload" operation
$count = count($files);
if ($count > 0) {
	foreach ($files as $file) {
		$base = UtfNormal\Validator::cleanUp(wfBaseName($file));

		# Validate a title
		$title = Title::makeTitleSafe(NS_FILE, $base);
		if (!is_object($title)) {
			echo "{$base} could not be imported; a valid title cannot be produced\n";
			continue;
		}

		if ($from) {
			if ($from == $title->getDBkey()) {
				$from = null;
			} else {
				$ignored++;
				continue;
			}
		}

		if ($checkUserBlock && $processed % $checkUserBlock == 0) {
/**
 * Returns the normalized form of the given page title, using the
 * normalization rules of the given site. If the given title is a redirect,
 * the redirect will be resolved and the redirect target is returned.
 *
 * @note This actually makes an API request to the remote site, so beware
 * that this function is slow and depends on an external service.
 *
 * @note If MW_PHPUNIT_TEST is defined, the call to the external site is
 * skipped, and the title is normalized using the local normalization
 * rules as implemented by the Title class.
 *
 * @see Site::normalizePageName
 *
 * @since 1.21
 *
 * @param string $pageName
 *
 * @return string
 * @throws MWException
 */
public function normalizePageName($pageName) {
	// Check that we were given a string.
	if (!is_string($pageName)) {
		throw new MWException('$pageName must be a string');
	}

	if (defined('MW_PHPUNIT_TEST')) {
		// If the code is under test, don't call out to other sites, just
		// normalize locally.
		// Note: this may cause results to be inconsistent with the actual
		// normalization used by the respective remote site!
		$t = Title::newFromText($pageName);

		return $t->getPrefixedText();
	} else {
		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = UtfNormal\Validator::cleanUp($pageName);

		// Build the args for the specific call
		$args = array(
			'action' => 'query',
			'prop' => 'info',
			'redirects' => true,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
		);

		$url = wfAppendQuery($this->getFileUrl('api.php'), $args);

		// Call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = Http::get($url, array(), __METHOD__);
	}

	if ($ret === false) {
		wfDebugLog("MediaWikiSite", "call to external site failed: {$url}");

		return false;
	}

	$data = FormatJson::decode($ret, true);

	if (!is_array($data)) {
		wfDebugLog("MediaWikiSite", "call to <{$url}> returned bad json: " . $ret);

		return false;
	}

	$page = static::extractPageRecord($data, $pageName);

	if (isset($page['missing'])) {
		wfDebugLog("MediaWikiSite", "call to <{$url}> returned a marker for a missing page title! " . $ret);

		return false;
	}

	if (isset($page['invalid'])) {
		wfDebugLog("MediaWikiSite", "call to <{$url}> returned a marker for an invalid page title! " . $ret);

		return false;
	}

	if (!isset($page['title'])) {
		wfDebugLog("MediaWikiSite", "call to <{$url}> did not return a page title! " . $ret);

		return false;
	}

	return $page['title'];
}
/**
 * Do userComment tags and similar. See pg. 34 of the Exif standard:
 * basically the first 8 bytes are the character code, the rest is the value.
 * This has not been tested on any Shift-JIS strings.
 * @param string $prop Prop name
 */
private function charCodeString($prop) {
	if (isset($this->mFilteredExifData[$prop])) {
		if (strlen($this->mFilteredExifData[$prop]) <= 8) {
			// invalid. Must be at least 9 bytes long.
			$this->debug($this->mFilteredExifData[$prop], __FUNCTION__, false);
			unset($this->mFilteredExifData[$prop]);

			return;
		}
		$charCode = substr($this->mFilteredExifData[$prop], 0, 8);
		$val = substr($this->mFilteredExifData[$prop], 8);

		switch ($charCode) {
			case "JIS\x00\x00\x00\x00\x00":
				// "JIS" padded with NUL bytes to 8 bytes
				$charset = "Shift-JIS";
				break;
			case "UNICODE\x00":
				$charset = "UTF-16" . $this->byteOrder;
				break;
			default:
				// ascii or undefined.
				$charset = "";
				break;
		}

		if ($charset) {
			MediaWiki\suppressWarnings();
			$val = iconv($charset, 'UTF-8//IGNORE', $val);
			MediaWiki\restoreWarnings();
		} else {
			// if valid utf-8, assume that, otherwise assume windows-1252
			$valCopy = $val;
			UtfNormal\Validator::quickIsNFCVerify($valCopy); // validates $valCopy.
			if ($valCopy !== $val) {
				MediaWiki\suppressWarnings();
				$val = iconv('Windows-1252', 'UTF-8//IGNORE', $val);
				MediaWiki\restoreWarnings();
			}
		}

		// trim and check to make sure not only whitespace.
		$val = trim($val);
		if (strlen($val) === 0) {
			// only whitespace.
			$this->debug($this->mFilteredExifData[$prop], __FUNCTION__, "{$prop}: Is only whitespace");
			unset($this->mFilteredExifData[$prop]);

			return;
		}

		// all's good.
		$this->mFilteredExifData[$prop] = $val;
	}
}
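// Hedged illustration, not MediaWiki code: how the 8-byte EXIF UserComment character-code
// header is split from the payload, as charCodeString() above does. The byte string is an
// invented sample, and the byte order is hard-coded to big-endian for the example.
$userComment = "UNICODE\x00" . "\x00H\x00i"; // "Hi" as UTF-16BE, prefixed with its charset code
$charCode = substr($userComment, 0, 8);       // "UNICODE\x00"
$val = substr($userComment, 8);               // raw UTF-16 payload
if ($charCode === "UNICODE\x00") {
	$val = iconv('UTF-16BE', 'UTF-8//IGNORE', $val);
}
var_dump($val); // string(2) "Hi"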
/**
 * Helper function of a helper function to convert charset for iptc values.
 * @param string|array $data The IPTC string
 * @param string $charset The charset
 *
 * @return string
 */
private static function convIPTCHelper($data, $charset) {
	if ($charset) {
		MediaWiki\suppressWarnings();
		$data = iconv($charset, "UTF-8//IGNORE", $data);
		MediaWiki\restoreWarnings();
		if ($data === false) {
			$data = "";
			wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset {$charset} to utf-8");
		}
	} else {
		// Treat as UTF-8 if it is valid UTF-8; otherwise pretend it's Windows-1252.
		// Most of the time, if there is no 1:90 tag, it is either ASCII, Latin-1, or UTF-8.
		$oldData = $data;
		UtfNormal\Validator::quickIsNFCVerify($data); // make $data valid utf-8
		if ($data === $oldData) {
			return $data; // if validation didn't change $data
		} else {
			return self::convIPTCHelper($oldData, 'Windows-1252');
		}
	}

	return trim($data);
}
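// Hedged sketch of the detection idiom used by convIPTCHelper() and several other snippets
// here: quickIsNFCVerify() rewrites its by-reference argument so that it becomes valid UTF-8;
// if the copy changed, the input was not valid UTF-8 and gets re-decoded as Windows-1252.
// Assumes the wikimedia/utfnormal package; $raw is an invented sample value.
$raw = "na\xEFve"; // "naïve" encoded as Windows-1252, not valid UTF-8
$probe = $raw;
UtfNormal\Validator::quickIsNFCVerify($probe); // mutates $probe into valid UTF-8
if ($probe === $raw) {
	$text = $raw; // already valid UTF-8, keep as-is
} else {
	$text = iconv('Windows-1252', 'UTF-8//IGNORE', $raw); // reinterpret legacy encoding
}
var_dump($text); // string(6) "naïve"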
/**
 * Really format a diff for the newsfeed
 *
 * @param Title $title Title object
 * @param int $oldid Old revision's id
 * @param int $newid New revision's id
 * @param int $timestamp New revision's timestamp
 * @param string $comment New revision's comment
 * @param string $actiontext Text of the action; in case of log event
 * @return string
 */
public static function formatDiffRow($title, $oldid, $newid, $timestamp, $comment, $actiontext = '') {
	global $wgFeedDiffCutoff, $wgLang;

	// log entries
	$completeText = '<p>' . implode(' ', array_filter(array(
		$actiontext,
		Linker::formatComment($comment),
	))) . "</p>\n";

	// NOTE: Check permissions for anonymous users, not current user.
	// No "privileged" version should end up in the cache.
	// Most feed readers will not log in anyway.
	$anon = new User();
	$accErrors = $title->getUserPermissionsErrors('read', $anon, true);

	// Can't diff special pages, unreadable pages or pages with no new revision
	// to compare against: just return the text.
	if ($title->getNamespace() < 0 || $accErrors || !$newid) {
		return $completeText;
	}

	if ($oldid) {
		#$diffText = $de->getDiff( wfMessage( 'revisionasof',
		#	$wgLang->timeanddate( $timestamp ),
		#	$wgLang->date( $timestamp ),
		#	$wgLang->time( $timestamp ) )->text(),
		#	wfMessage( 'currentrev' )->text() );

		$diffText = '';
		// Don't bother generating the diff if we won't be able to show it
		if ($wgFeedDiffCutoff > 0) {
			$rev = Revision::newFromId($oldid);

			if (!$rev) {
				$diffText = false;
			} else {
				$context = clone RequestContext::getMain();
				$context->setTitle($title);

				$contentHandler = $rev->getContentHandler();
				$de = $contentHandler->createDifferenceEngine($context, $oldid, $newid);
				$diffText = $de->getDiff(
					wfMessage('previousrevision')->text(),
					wfMessage('revisionasof',
						$wgLang->timeanddate($timestamp),
						$wgLang->date($timestamp),
						$wgLang->time($timestamp))->text());
			}
		}

		if ($wgFeedDiffCutoff <= 0 || strlen($diffText) > $wgFeedDiffCutoff) {
			// Omit large diffs
			$diffText = self::getDiffLink($title, $newid, $oldid);
		} elseif ($diffText === false) {
			// Error in diff engine, probably a missing revision
			$diffText = "<p>Can't load revision {$newid}</p>";
		} else {
			// Diff output fine, clean up any illegal UTF-8
			$diffText = UtfNormal\Validator::cleanUp($diffText);
			$diffText = self::applyDiffStyle($diffText);
		}
	} else {
		$rev = Revision::newFromId($newid);
		if ($wgFeedDiffCutoff <= 0 || is_null($rev)) {
			$newContent = ContentHandler::getForTitle($title)->makeEmptyContent();
		} else {
			$newContent = $rev->getContent();
		}

		if ($newContent instanceof TextContent) {
			// only textual content has a "source view".
			$text = $newContent->getNativeData();

			if ($wgFeedDiffCutoff <= 0 || strlen($text) > $wgFeedDiffCutoff) {
				$html = null;
			} else {
				$html = nl2br(htmlspecialchars($text));
			}
		} else {
			//XXX: we could get an HTML representation of the content via getParserOutput, but that may
			//     contain JS magic and generally may not be suitable for inclusion in a feed.
			//     Perhaps Content should have a getDescriptiveHtml method and/or a getSourceText method.
			//Compare also ApiFeedContributions::feedItemDesc
			$html = null;
		}

		if ($html === null) {
			// Omit large new page diffs, bug 29110
			// Also use diff link for non-textual content
			$diffText = self::getDiffLink($title, $newid);
		} else {
			$diffText = '<p><b>' . wfMessage('newpage')->text() . '</b></p>' .
				'<div>' . $html . '</div>';
		}
	}
	$completeText .= $diffText;

	return $completeText;
}
/**
 * Function to extract metadata segments of interest from jpeg files
 * based on GIFMetadataExtractor.
 *
 * We can almost use getimagesize to do this,
 * but gis doesn't support having multiple app1 segments
 * and it can't extract xmp on files containing both exif and xmp data.
 *
 * @param string $filename Name of jpeg file
 * @return array Array of interesting segments.
 * @throws MWException If given invalid file.
 */
static function segmentSplitter($filename) {
	$showXMP = XMPReader::isSupported();

	$segmentCount = 0;

	$segments = ['XMP_ext' => [], 'COM' => [], 'PSIR' => []];

	if (!$filename) {
		throw new MWException("No filename specified for " . __METHOD__);
	}
	if (!file_exists($filename) || is_dir($filename)) {
		throw new MWException("Invalid file {$filename} passed to " . __METHOD__);
	}

	$fh = fopen($filename, "rb");

	if (!$fh) {
		throw new MWException("Could not open file {$filename}");
	}

	$buffer = fread($fh, 2);
	if ($buffer !== "\xFF\xD8") {
		throw new MWException("Not a jpeg, no SOI");
	}
	while (!feof($fh)) {
		$buffer = fread($fh, 1);
		$segmentCount++;
		if ($segmentCount > self::MAX_JPEG_SEGMENTS) {
			// this is just a sanity check
			throw new MWException('Too many jpeg segments. Aborting');
		}
		while ($buffer !== "\xFF") {
			// In theory JPEG files are not allowed to contain anything between the sections,
			// but in practice they sometimes do. It's customary to ignore the garbage data.
			$buffer = fread($fh, 1);
		}

		$buffer = fread($fh, 1);
		while ($buffer === "\xFF" && !feof($fh)) {
			// Skip through any 0xFF padding bytes.
			$buffer = fread($fh, 1);
		}
		if ($buffer === "\xFE") {
			// COM section -- file comment
			// First see if valid utf-8,
			// if not try to convert it to windows-1252.
			$com = $oldCom = trim(self::jpegExtractMarker($fh));
			UtfNormal\Validator::quickIsNFCVerify($com);
			// turns $com to valid utf-8.
			// thus if no change, its utf-8, otherwise its something else.
			if ($com !== $oldCom) {
				MediaWiki\suppressWarnings();
				$com = $oldCom = iconv('windows-1252', 'UTF-8//IGNORE', $oldCom);
				MediaWiki\restoreWarnings();
			}
			// Try it again, if its still not a valid string, then probably
			// binary junk or some really weird encoding, so don't extract.
			UtfNormal\Validator::quickIsNFCVerify($com);
			if ($com === $oldCom) {
				$segments["COM"][] = $oldCom;
			} else {
				wfDebug(__METHOD__ . " Ignoring JPEG comment as is garbage.\n");
			}
		} elseif ($buffer === "\xE1") {
			// APP1 section (Exif, XMP, and XMP extended)
			// only extract if XMP is enabled.
			$temp = self::jpegExtractMarker($fh);
			// check what type of app segment this is.
			if (substr($temp, 0, 29) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
				$segments["XMP"] = substr($temp, 29);
			} elseif (substr($temp, 0, 35) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP) {
				$segments["XMP_ext"][] = substr($temp, 35);
			} elseif (substr($temp, 0, 29) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP) {
				// Some images (especially flickr images) seem to have this.
				// I really have no idea what the deal is with them, but
				// whatever...
				$segments["XMP"] = substr($temp, 29);
				wfDebug(__METHOD__ . ' Found XMP section with wrong app identifier '
					. "Using anyways.\n");
			} elseif (substr($temp, 0, 6) === "Exif\x00\x00") {
				// Just need to find out what the byte order is.
				// because php's exif plugin sucks...
				// This is a II for little Endian, MM for big. Not a unicode BOM.
				$byteOrderMarker = substr($temp, 6, 2);
				if ($byteOrderMarker === 'MM') {
					$segments['byteOrder'] = 'BE';
				} elseif ($byteOrderMarker === 'II') {
					$segments['byteOrder'] = 'LE';
				} else {
					wfDebug(__METHOD__ . " Invalid byte ordering?!\n");
				}
			}
		} elseif ($buffer === "\xED") {
			// APP13 - PSIR. IPTC and some photoshop stuff
			$temp = self::jpegExtractMarker($fh);
			if (substr($temp, 0, 14) === "Photoshop 3.0\x00") {
				$segments["PSIR"][] = $temp;
			}
		} elseif ($buffer === "\xD9" || $buffer === "\xDA") {
			// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
			return $segments;
		} else {
			// segment we don't care about, so skip
			$size = wfUnpack("nint", fread($fh, 2), 2);
			if ($size['int'] < 2) {
				throw new MWException("invalid marker size in jpeg");
			}
			fseek($fh, $size['int'] - 2, SEEK_CUR);
		}
	}
	// shouldn't get here.
	throw new MWException("Reached end of jpeg file unexpectedly");
}
/**
 * Begins profiling on a database query
 *
 * @since 1.19
 * @param string $sql
 * @param string $function
 * @param bool $isMaster
 * @param float $runTime Query run time
 * @return int ID number of the query to pass to queryTime or -1 if the
 *  debugger is disabled
 */
public static function query($sql, $function, $isMaster, $runTime) {
	if (!self::$enabled) {
		return -1;
	}

	// Replace invalid UTF-8 chars with a square UTF-8 character
	// This prevents json_encode from erroring out due to binary SQL data
	$sql = preg_replace(
		'/(
				[\\xC0-\\xC1] # Invalid UTF-8 Bytes
			|	[\\xF5-\\xFF] # Invalid UTF-8 Bytes
			|	\\xE0[\\x80-\\x9F] # Overlong encoding of prior code point
			|	\\xF0[\\x80-\\x8F] # Overlong encoding of prior code point
			|	[\\xC2-\\xDF](?![\\x80-\\xBF]) # Invalid UTF-8 Sequence Start
			|	[\\xE0-\\xEF](?![\\x80-\\xBF]{2}) # Invalid UTF-8 Sequence Start
			|	[\\xF0-\\xF4](?![\\x80-\\xBF]{3}) # Invalid UTF-8 Sequence Start
			|	(?<=[\\x0-\\x7F\\xF5-\\xFF])[\\x80-\\xBF] # Invalid UTF-8 Sequence Middle
			|	(?<![\\xC2-\\xDF]|[\\xE0-\\xEF]|[\\xE0-\\xEF][\\x80-\\xBF]|[\\xF0-\\xF4]
				|[\\xF0-\\xF4][\\x80-\\xBF]|[\\xF0-\\xF4][\\x80-\\xBF]{2})[\\x80-\\xBF] # Overlong Sequence
			|	(?<=[\\xE0-\\xEF])[\\x80-\\xBF](?![\\x80-\\xBF]) # Short 3 byte sequence
			|	(?<=[\\xF0-\\xF4])[\\x80-\\xBF](?![\\x80-\\xBF]{2}) # Short 4 byte sequence
			|	(?<=[\\xF0-\\xF4][\\x80-\\xBF])[\\x80-\\xBF](?![\\x80-\\xBF]) # Short 4 byte sequence (2)
		)/x',
		'■',
		$sql
	);

	// last check for invalid utf8
	$sql = UtfNormal\Validator::cleanUp($sql);

	self::$query[] = [
		'sql' => $sql,
		'function' => $function,
		'master' => (bool) $isMaster,
		'time' => $runTime,
	];

	return count(self::$query) - 1;
}
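// Hedged example, not part of MWDebug: why the query text is scrubbed before being queued for
// JSON output. json_encode() refuses strings containing invalid UTF-8, so a binary SQL argument
// would otherwise make the whole debug payload fail. The sample query is invented.
$sql = "INSERT INTO blobs VALUES (X'00', '\xC0\xAF')"; // contains invalid UTF-8 bytes
var_dump(json_encode(['sql' => $sql])); // bool(false), json_last_error() === JSON_ERROR_UTF8

$clean = UtfNormal\Validator::cleanUp($sql); // invalid bytes become U+FFFD
var_dump(json_encode(['sql' => $clean]) !== false); // bool(true)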