/**
 * API entry point: fetch a raw text blob from external storage.
 *
 * Reads the 'cluster' and 'blobid' request parameters, builds a
 * DB://<cluster>/<blobid> URL and adds the fetched text (with a
 * text/plain mime hint) to the raw API result.
 */
public function execute() {
	global $wgRevisionCacheExpiry, $wgMemc;
	wfProfileIn( __METHOD__ );

	// Read the parameters explicitly. The previous extract() call
	// injected every request key into the local scope, which is a
	// well-known PHP security footgun.
	$params = $this->extractRequestParams();
	$cluster = isset( $params['cluster'] ) ? $params['cluster'] : null;
	$blobid = isset( $params['blobid'] ) ? $params['blobid'] : null;

	if ( empty( $blobid ) ) {
		$this->dieUsage( 'Invalid blobid', 1, 404 );
	}
	if ( empty( $cluster ) ) {
		$this->dieUsage( 'Invalid cluster', 2, 404 );
	}

	$url = sprintf( "DB://%s/%d", $cluster, $blobid );
	$text = ExternalStore::fetchFromURL( $url );
	if ( $text === false ) {
		$this->dieUsage( 'Text not found', 3, 404 );
	}

	// Raw mode with the size check disabled: blobs may exceed the
	// normal API result limits.
	$result = $this->getResult();
	$result->setRawMode();
	$result->disableSizeCheck();
	$result->reset();
	$result->addValue( null, 'text', $text );
	$result->addValue( null, 'mime', 'text/plain' );
	$result->enableSizeCheck();

	wfProfileOut( __METHOD__ );
}
function testExternalStoreDoesNotFetchIncorrectURL() {
	global $wgExternalStores;
	$wgExternalStores = true;

	// Regression checks for r68900: malformed or non-store URLs
	// must never be fetched.
	$badUrls = array( 'http://', 'ftp.wikimedia.org', '/super.txt' );
	foreach ( $badUrls as $badUrl ) {
		$this->assertFalse( ExternalStore::fetchFromURL( $badUrl ) );
	}
}
/**
 * @covers ExternalStore::fetchFromURL
 */
public function testExternalFetchFromURL() {
	// With no external stores configured, every fetch is refused.
	$this->setMwGlobals( 'wgExternalStores', false );
	$this->assertFalse(
		ExternalStore::fetchFromURL( 'FOO://cluster1/200' ),
		'Deny if wgExternalStores is not set to a non-empty array'
	);

	// Once the FOO protocol is registered, its URLs resolve.
	$this->setMwGlobals( 'wgExternalStores', array( 'FOO' ) );
	$this->assertEquals(
		ExternalStore::fetchFromURL( 'FOO://cluster1/200' ),
		'Hello',
		'Allow FOO://cluster1/200'
	);
	$this->assertEquals(
		ExternalStore::fetchFromURL( 'FOO://cluster1/300/0' ),
		'Hello',
		'Allow FOO://cluster1/300/0'
	);

	# Assertions for r68900: malformed URLs are always rejected.
	$denied = array(
		'ftp.example.org' => 'Deny domain ftp.example.org',
		'/example.txt' => 'Deny path /example.txt',
		'http://' => 'Deny protocol http://',
	);
	foreach ( $denied as $badUrl => $message ) {
		$this->assertFalse( ExternalStore::fetchFromURL( $badUrl ), $message );
	}
}
/**
 * Move externally-stored revision text back into the text table.
 *
 * Scans every text row flagged 'external', fetches the blob from the
 * external store, and rewrites the row with the text inlined and the
 * 'external' flag removed.
 *
 * Fixes vs. the previous version:
 *  - the text is now fetched BEFORE the object-flag handling; the old
 *    code called unserialize($text) while $text was still undefined
 *    (or stale from the previous loop iteration), and then overwrote
 *    the re-deflated historyblobcurstub text with the raw fetch;
 *  - a fetch failure no longer writes a false/empty blob to the DB;
 *  - removed the duplicate ExternalStoreDB instantiation and the
 *    unused $numStubs counter.
 */
function moveToExternal() {
	$fname = __METHOD__;
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );
	$numMoved = 0;

	$res = $dbr->query( "SELECT * FROM revision r1 FORCE INDEX (PRIMARY), text t2\n\t\tWHERE old_id = rev_text_id\n\t\tAND old_flags LIKE '%external%'\n\t\tORDER BY rev_timestamp, rev_id", $fname );

	while ( $row = $dbr->fetchObject( $res ) ) {
		$url = $row->old_text;
		$id = $row->old_id;

		// Fetch the stored text up front so the object handling below
		// operates on the real payload.
		$text = ExternalStore::fetchFromURL( $url );
		if ( $text === false ) {
			print "Warning: could not fetch {$url} for old_id {$id}, skipping\n";
			continue;
		}

		/**
		 * do the trick with splitting the flag string and rejoining it
		 * without the external flag
		 */
		$flags = explode( ",", $row->old_flags );
		$ftmp = array();
		foreach ( $flags as $f ) {
			$f = trim( $f );
			if ( $f === "external" ) {
				continue;
			}
			$ftmp[] = $f;
		}
		$flags = implode( ",", $ftmp );

		if ( strpos( $flags, 'object' ) !== false ) {
			$obj = unserialize( $text );
			$className = strtolower( get_class( $obj ) );
			if ( $className == 'historyblobstub' ) {
				// Stubs point into another row; nothing to move here.
				continue;
			} elseif ( $className == 'historyblobcurstub' ) {
				// Inline the current-revision text, gzip-compressed.
				// NOTE(review): the old code tagged this 'utf-8,gzip,external',
				// which would have mislabelled the now-inline blob as external.
				$text = gzdeflate( $obj->getText() );
				$flags = 'utf-8,gzip';
			} elseif ( $className == 'concatenatedgziphistoryblob' ) {
				// Keep the serialized CGZ object as-is.
			} else {
				print "Warning: unrecognised object class \"{$className}\"\n";
				continue;
			}
		}

		echo "moved url {$url} back to {$id} with flags {$flags}\n";
		$dbw->update( 'text',
			array( 'old_flags' => $flags, 'old_text' => $text ),
			array( 'old_id' => $id ),
			$fname );
		$numMoved++;
	}
	$dbr->freeResult( $res );
}
/**
 * Main entry point, take a job from the queue and run it.
 *
 * Validates the shared secret token, then (when 'store' and 'id' are
 * supplied) fetches a blob from the named external storage cluster and
 * returns it hex-encoded together with its md5 hash.
 *
 * @access public
 */
public function execute() {
	global $wgUser, $wgTheSchwartzSecretToken, $wgLBFactoryConf;
	wfProfileIn( __METHOD__ );

	// Jobs may be large and slow; lift the usual limits.
	ini_set( "memory_limit", -1 );
	ini_set( "max_execution_time", 0 );

	$params = $this->extractRequestParams();
	$result = array();

	#
	# check token first
	#
	// Strict comparison: loose '==' on a secret allows type-juggling
	// matches. Also close the profiler before dying, consistent with
	// the other die paths below.
	if ( !( isset( $params["token"] ) && $params["token"] === $wgTheSchwartzSecretToken ) ) {
		wfProfileOut( __METHOD__ );
		$this->dieUsageMsg( array( "cantrunjobs" ) );
	}

	$blob = null;
	$hash = null;

	#
	# check for store and id parameters
	#
	if ( isset( $params["store"] ) && isset( $params["id"] ) ) {
		$store = $params["store"];
		$id = $params["id"];
		#
		# check if store defined in loadbalancer file
		#
		if ( isset( $wgLBFactoryConf["externalLoads"][$store] ) ) {
			wfDebug( __METHOD__ . ": getting {$id} from {$store}\n" );
			$url = sprintf( "DB://%s/%d", $store, $id );
			$blob = ExternalStore::fetchFromURL( $url );
			if ( $blob === false ) {
				wfProfileOut( __METHOD__ );
				$this->dieUsage( 'Text not found', 3, 404 );
			}
			$hash = md5( $blob );
			// Hex-encode the raw blob for transport. Avoid function
			// array dereferencing (unpack(...)[1], PHP 5.4+) for
			// compatibility with the rest of this legacy codebase.
			$unpacked = unpack( "H*", $blob );
			$blob = $unpacked[1];
		} else {
			wfDebug( __METHOD__ . ": store {$store} is not defined in wgLBFactoryConf\n" );
			wfProfileOut( __METHOD__ );
			$this->dieUsage( 'Text not found', 3, 404 );
		}
	}

	$result["blob"] = $blob;
	$result["hash"] = $hash;
	$this->getResult()->addValue( null, $this->getModuleName(), $result );
	wfProfileOut( __METHOD__ );
}
/**
 * Dump the (decoded) text of the revision given as the script argument.
 * Resolves external pointers, CGZ blobs, gzip compression and object
 * wrappers, then prints the text length and its first 100 bytes.
 */
public function execute() {
	$dbr = wfGetDB( DB_SLAVE );
	$row = $dbr->selectRow(
		array( 'text', 'revision' ),
		array( 'old_flags', 'old_text' ),
		array( 'old_id=rev_text_id', 'rev_id' => $this->getArg() )
	);
	if ( !$row ) {
		$this->error( "Row not found", true );
	}

	$flags = explode( ',', $row->old_flags );
	$text = $row->old_text;

	if ( in_array( 'external', $flags ) ) {
		$this->output( "External {$text}\n" );
		if ( preg_match( '!^DB://(\\w+)/(\\w+)/(\\w+)$!', $text, $matches ) ) {
			$store = ExternalStore::getStoreObject( 'DB' );
			$blob = $store->fetchBlob( $matches[1], $matches[2], $matches[3] );
			if ( strtolower( get_class( $blob ) ) == 'concatenatedgziphistoryblob' ) {
				$this->output( "Found external CGZ\n" );
				$blob->uncompress();
				$this->output( "Items: (" . implode( ', ', array_keys( $blob->mItems ) ) . ")\n" );
				$text = $blob->getItem( $matches[3] );
			} else {
				$this->output( "CGZ expected at {$text}, got " . gettype( $blob ) . "\n" );
				$text = $blob;
			}
		} else {
			$this->output( "External plain {$text}\n" );
			$text = ExternalStore::fetchFromURL( $text );
		}
	}

	if ( in_array( 'gzip', $flags ) ) {
		$text = gzinflate( $text );
	}
	if ( in_array( 'object', $flags ) ) {
		$obj = unserialize( $text );
		$text = $obj->getText();
	}

	if ( is_object( $text ) ) {
		$this->error( "Unexpectedly got object of type: " . get_class( $text ) );
	} else {
		$this->output( "Text length: " . strlen( $text ) . "\n" );
		$this->output( substr( $text, 0, 100 ) . "\n" );
	}
}
/**
 * Repair text rows that hold serialized HistoryBlobStub objects whose
 * secondary (pointed-to) row has been moved to external storage.
 *
 * Batches over the text table, collects candidate stubs, resolves each
 * via blob_tracking, verifies the external copy by hash, then (unless
 * --dry-run) rewrites the stub row as a direct external URL and records
 * the reference in blob_tracking.
 */
function execute() {
	$dbr = wfGetDB( DB_SLAVE );
	$dbw = wfGetDB( DB_MASTER );
	$dryRun = $this->getOption( 'dry-run' );
	if ( $dryRun ) {
		print "Dry run only.\n";
	}
	$startId = $this->getOption( 'start', 0 );
	$numGood = 0;
	$numFixed = 0;
	$numBad = 0;
	$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
	if ( $dbr->getType() == 'mysql' && version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) ) {
		// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
		$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
	} else {
		// No CONVERT() in MySQL 4.0
		$lowerLeft = 'LOWER(LEFT(old_text,22))';
	}
	while ( true ) {
		print "ID: {$startId} / {$totalRevs}\r";
		// Batch query: object-flagged, non-external rows whose serialized
		// prefix looks like a HistoryBlobStub.
		$res = $dbr->select( 'text', array( 'old_id', 'old_flags', 'old_text' ), array( 'old_id > ' . intval( $startId ), 'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'', "{$lowerLeft} = 'o:15:\"historyblobstub\"'" ), __METHOD__, array( 'ORDER BY' => 'old_id', 'LIMIT' => $this->batchSize ) );
		if ( !$res->numRows() ) {
			break;
		}
		$secondaryIds = array();
		$stubs = array();
		foreach ( $res as $row ) {
			$startId = $row->old_id;
			// Basic sanity checks
			$obj = unserialize( $row->old_text );
			if ( $obj === false ) {
				print "{$row->old_id}: unrecoverable: cannot unserialize\n";
				++$numBad;
				continue;
			}
			if ( !is_object( $obj ) ) {
				print "{$row->old_id}: unrecoverable: unserialized to type " . gettype( $obj ) . ", possible double-serialization\n";
				++$numBad;
				continue;
			}
			if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
				print "{$row->old_id}: unrecoverable: unexpected object class " . get_class( $obj ) . "\n";
				++$numBad;
				continue;
			}
			// Process flags: no utf-8/utf8 flag means the row is still in
			// the legacy encoding.
			$flags = explode( ',', $row->old_flags );
			if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
				$legacyEncoding = false;
			} else {
				$legacyEncoding = true;
			}
			// Queue the stub for future batch processing
			$id = intval( $obj->mOldId );
			$secondaryIds[] = $id;
			$stubs[$row->old_id] = array( 'legacyEncoding' => $legacyEncoding, 'secondaryId' => $id, 'hash' => $obj->mHash );
		}
		$secondaryIds = array_unique( $secondaryIds );
		if ( !count( $secondaryIds ) ) {
			continue;
		}
		// Run the batch query on blob_tracking
		$res = $dbr->select( 'blob_tracking', '*', array( 'bt_text_id' => $secondaryIds ), __METHOD__ );
		$trackedBlobs = array();
		foreach ( $res as $row ) {
			$trackedBlobs[$row->bt_text_id] = $row;
		}
		// Process the stubs
		foreach ( $stubs as $primaryId => $stub ) {
			$secondaryId = $stub['secondaryId'];
			if ( !isset( $trackedBlobs[$secondaryId] ) ) {
				// No tracked blob. Work out what went wrong
				$secondaryRow = $dbr->selectRow( 'text', array( 'old_flags', 'old_text' ), array( 'old_id' => $secondaryId ), __METHOD__ );
				if ( !$secondaryRow ) {
					print "{$primaryId}: unrecoverable: secondary row is missing\n";
					++$numBad;
				} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
					// Not broken yet, and not in the tracked clusters so it won't get
					// broken by the current RCT run.
					++$numGood;
				} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
					print "{$primaryId}: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
					++$numBad;
				} else {
					print "{$primaryId}: unrecoverable: miscellaneous corruption of secondary row\n";
					++$numBad;
				}
				unset( $stubs[$primaryId] );
				continue;
			}
			$trackRow = $trackedBlobs[$secondaryId];
			// Check that the specified text really is available in the tracked source row
			$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				print "{$primaryId}: unrecoverable: source text missing\n";
				++$numBad;
				unset( $stubs[$primaryId] );
				continue;
			}
			if ( md5( $text ) !== $stub['hash'] ) {
				print "{$primaryId}: unrecoverable: content hashes do not match\n";
				++$numBad;
				unset( $stubs[$primaryId] );
				continue;
			}
			// Find the page_id and rev_id
			// The page is probably the same as the page of the secondary row
			$pageId = intval( $trackRow->bt_page );
			if ( !$pageId ) {
				$revId = $pageId = 0;
			} else {
				$revId = $this->findTextIdInPage( $pageId, $primaryId );
				if ( !$revId ) {
					// Actually an orphan
					$pageId = $revId = 0;
				}
			}
			$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';
			if ( !$dryRun ) {
				// Reset the text row to point to the original copy
				$dbw->begin( __METHOD__ );
				$dbw->update( 'text', array( 'old_flags' => $newFlags, 'old_text' => $url ), array( 'old_id' => $primaryId ), __METHOD__ );
				// Add a blob_tracking row so that the new reference can be recompressed
				// without needing to run trackBlobs.php again
				$dbw->insert( 'blob_tracking', array( 'bt_page' => $pageId, 'bt_rev_id' => $revId, 'bt_text_id' => $primaryId, 'bt_cluster' => $trackRow->bt_cluster, 'bt_blob_id' => $trackRow->bt_blob_id, 'bt_cgz_hash' => $stub['hash'], 'bt_new_url' => null, 'bt_moved' => 0 ), __METHOD__ );
				$dbw->commit( __METHOD__ );
				$this->waitForSlaves();
			}
			print "{$primaryId}: resolved to {$url}\n";
			++$numFixed;
		}
	}
	print "\n";
	print "Fixed: {$numFixed}\n";
	print "Unrecoverable: {$numBad}\n";
	print "Good stubs: {$numGood}\n";
}
/**
 * Retrieve a var dump from External Storage or the text table.
 * Some of this code is stolen from Revision::loadText et al.
 *
 * @param $stored_dump
 *
 * @return object|AbuseFilterVariableHolder|bool
 */
public static function loadVarDump( $stored_dump ) {
	wfProfileIn( __METHOD__ );

	// Back-compat: old rows hold the serialized dump directly rather
	// than a 'stored-text:' reference.
	if ( strpos( $stored_dump, 'stored-text:' ) === false ) {
		wfProfileOut( __METHOD__ );
		return unserialize( $stored_dump );
	}

	$textId = substr( $stored_dump, strlen( 'stored-text:' ) );
	$dbr = wfGetDB( DB_SLAVE );
	$row = $dbr->selectRow(
		'text',
		array( 'old_text', 'old_flags' ),
		array( 'old_id' => $textId ),
		__METHOD__
	);
	if ( !$row ) {
		wfProfileOut( __METHOD__ );
		return new AbuseFilterVariableHolder();
	}

	// Unwrap external-storage pointers and gzip compression.
	$flags = explode( ',', $row->old_flags );
	$text = $row->old_text;
	if ( in_array( 'external', $flags ) ) {
		$text = ExternalStore::fetchFromURL( $text );
	}
	if ( in_array( 'gzip', $flags ) ) {
		$text = gzinflate( $text );
	}

	$data = unserialize( $text );
	if ( !in_array( 'nativeDataArray', $flags ) ) {
		wfProfileOut( __METHOD__ );
		return $data;
	}

	// Native data arrays are rebuilt into a fresh variable holder.
	$holder = new AbuseFilterVariableHolder();
	foreach ( $data as $key => $value ) {
		$holder->setVar( $key, $value );
	}
	wfProfileOut( __METHOD__ );
	return $holder;
}
/**
 * Get revision text associated with an old or archive row
 * $row is usually an object from wfFetchRow(), both the flags and the text
 * field must be included
 *
 * @param $row Object: the text data
 * @param string $prefix table prefix (default 'old_')
 * @param string|false $wiki the name of the wiki to load the revision text from
 *   (same as the wiki $row was loaded from) or false to indicate the local
 *   wiki (this is the default). Otherwise, it must be a symbolic wiki database
 *   identifier as understood by the LoadBalancer class.
 * @return String: text the text requested or false on failure
 */
public static function getRevisionText( $row, $prefix = 'old_', $wiki = false ) {
	wfProfileIn( __METHOD__ );

	# Get data
	$textField = $prefix . 'text';
	$flagsField = $prefix . 'flags';
	if ( isset( $row->{$flagsField} ) ) {
		$flags = explode( ',', $row->{$flagsField} );
	} else {
		$flags = array();
	}
	if ( isset( $row->{$textField} ) ) {
		$text = $row->{$textField};
	} else {
		// No text column at all: nothing to return.
		wfProfileOut( __METHOD__ );
		return false;
	}

	# Use external methods for external objects, text in table is URL-only then
	if ( in_array( 'external', $flags ) ) {
		$url = $text;
		// Require a 'proto://path' shape with a non-empty path.
		$parts = explode( '://', $url, 2 );
		if ( count( $parts ) == 1 || $parts[1] == '' ) {
			wfProfileOut( __METHOD__ );
			return false;
		}
		$text = ExternalStore::fetchFromURL( $url, array( 'wiki' => $wiki ) );
	}

	// If the text was fetched without an error, convert it
	if ( $text !== false ) {
		if ( in_array( 'gzip', $flags ) ) {
			# Deal with optional compression of archived pages.
			# This can be done periodically via maintenance/compressOld.php, and
			# as pages are saved if $wgCompressRevisions is set.
			$text = gzinflate( $text );
		}
		if ( in_array( 'object', $flags ) ) {
			# Generic compressed storage
			$obj = unserialize( $text );
			if ( !is_object( $obj ) ) {
				// Invalid object
				wfProfileOut( __METHOD__ );
				return false;
			}
			$text = $obj->getText();
		}
		global $wgLegacyEncoding;
		if ( $text !== false && $wgLegacyEncoding && !in_array( 'utf-8', $flags ) && !in_array( 'utf8', $flags ) ) {
			# Old revisions kept around in a legacy encoding?
			# Upconvert on demand.
			# ("utf8" checked for compatibility with some broken
			# conversion scripts 2008-12-30)
			global $wgContLang;
			$text = $wgContLang->iconv( $wgLegacyEncoding, 'UTF-8', $text );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
/**
 * Get revision text associated with an old or archive row.
 * $row is usually an object from wfFetchRow(); both the flags and the
 * text field must be included.
 *
 * @param stdClass $row The text data
 * @param string $prefix Table prefix (default 'old_')
 * @param string|bool $wiki The name of the wiki to load the revision text from
 *   (same as the wiki $row was loaded from) or false to indicate the local
 *   wiki (this is the default). Otherwise, it must be a symbolic wiki database
 *   identifier as understood by the LoadBalancer class.
 * @return string Text the text requested or false on failure
 */
public static function getRevisionText( $row, $prefix = 'old_', $wiki = false ) {
	# Column names for the given prefix.
	$textField = $prefix . 'text';
	$flagsField = $prefix . 'flags';

	$flags = isset( $row->{$flagsField} )
		? explode( ',', $row->{$flagsField} )
		: array();

	if ( !isset( $row->{$textField} ) ) {
		return false;
	}
	$text = $row->{$textField};

	# 'external' means the column stores only a blob URL; the URL must
	# have a 'proto://path' shape with a non-empty path.
	if ( in_array( 'external', $flags ) ) {
		$url = $text;
		$parts = explode( '://', $url, 2 );
		if ( count( $parts ) == 1 || $parts[1] == '' ) {
			return false;
		}
		$text = ExternalStore::fetchFromURL( $url, array( 'wiki' => $wiki ) );
	}

	// A failed fetch propagates as false; otherwise decode per flags.
	if ( $text === false ) {
		return false;
	}
	return self::decompressRevisionText( $text, $flags );
}
/**
 * Get revision text associated with an old or archive row
 * $row is usually an object from wfFetchRow(), both the flags and the text
 * field must be included
 * @static
 * @param integer $row Id of a row
 * @param string $prefix table prefix (default 'old_')
 * @return string $text|false the text requested
 */
function getRevisionText( $row, $prefix = 'old_' ) {
	$fname = 'Revision::getRevisionText';
	wfProfileIn( $fname );

	# Get data
	$textField = $prefix . 'text';
	$flagsField = $prefix . 'flags';
	if ( isset( $row->{$flagsField} ) ) {
		$flags = explode( ',', $row->{$flagsField} );
	} else {
		$flags = array();
	}
	if ( isset( $row->{$textField} ) ) {
		$text = $row->{$textField};
	} else {
		wfProfileOut( $fname );
		return false;
	}

	# Use external methods for external objects, text in table is URL-only then
	if ( in_array( 'external', $flags ) ) {
		$url = $text;
		// Validate the URL shape explicitly instead of the old
		// error-suppressed list() destructuring: '@' hid the notice
		// emitted when the URL had no '://' separator and left $path
		// undefined. count()-based check matches the behavior exactly
		// (missing separator or empty path => failure).
		$parts = explode( '://', $url, 2 );
		if ( count( $parts ) == 1 || $parts[1] == "" ) {
			wfProfileOut( $fname );
			return false;
		}
		require_once 'ExternalStore.php';
		$text = ExternalStore::fetchFromURL( $url );
	}

	// If the text was fetched without an error, convert it
	if ( $text !== false ) {
		if ( in_array( 'gzip', $flags ) ) {
			# Deal with optional compression of archived pages.
			# This can be done periodically via maintenance/compressOld.php, and
			# as pages are saved if $wgCompressRevisions is set.
			$text = gzinflate( $text );
		}
		if ( in_array( 'object', $flags ) ) {
			# Generic compressed storage
			$obj = unserialize( $text );
			if ( !is_object( $obj ) ) {
				// Invalid object
				wfProfileOut( $fname );
				return false;
			}
			$text = $obj->getText();
		}
		global $wgLegacyEncoding;
		if ( $wgLegacyEncoding && !in_array( 'utf-8', $flags ) ) {
			# Old revisions kept around in a legacy encoding?
			# Upconvert on demand.
			global $wgInputEncoding, $wgContLang;
			$text = $wgContLang->iconv( $wgLegacyEncoding, $wgInputEncoding . '//IGNORE', $text );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
// NOTE(review): fragment of a larger loop — the enclosing foreach/if
// and the counters ($totalNullContentWithParent etc.) are declared
// before this chunk; the closing braces below match constructs opened
// outside this view.
$uuid = Flow\Model\UUID::create( $rev->rev_id );
echo "\n********************\n\nProcessing revision " . $uuid->getAlphadecimal() . "\n";
++$totalNullContentWithParent;
// Look up the parent revision's content and flags by its UUID.
$res = iterator_to_array( $dbr->select( 'flow_revision', array( 'rev_content', 'rev_flags' ), array( 'rev_id' => new \Flow\Model\UUIDBlob( $rev->rev_parent_id ) ), __FILE__ ) );
// not likely ... but lets be careful
if ( !$res ) {
	echo "No parent found?\n";
	$totalBadQueryResult++;
	continue;
} elseif ( count( $res ) > 1 ) {
	echo "Multiple parents found?\n";
	$totalBadQueryResult++;
	continue;
}
$parent = reset( $res );
// If the parent's content resolves in external storage, record the
// match to the CSV for later repair.
$parentItem = ExternalStore::fetchFromURL( $parent->rev_content );
if ( $parentItem ) {
	echo "MATCHED\n";
	fputcsv( $csvOutput, array( $uuid->getAlphadecimal(), $parent->rev_content, $parent->rev_flags ) );
	++$totalMatched;
} else {
	echo "Parent item is null\n";
	++$totalNullParentContent;
}
} // NOTE(review): closes a loop opened before this chunk
} // NOTE(review): closes an enclosing block opened before this chunk
// Final summary of the scan.
echo "Considered {$totalNullContentWithParent} revisions with parents and no content\n";
if ( $totalNullContentWithParent > 0 ) {
	echo "Could not fix {$totalNullParentContent} (" . number_format( 100 * $totalNullParentContent / $totalNullContentWithParent ) . "%) due to parent not having content\n";
	echo "Could not fix {$totalBadQueryResult} (" . number_format( 100 * $totalBadQueryResult / $totalNullContentWithParent ) . "%) due to not finding the parent revision\n";
	echo "Found matches for {$totalMatched} (" . number_format( 100 * $totalMatched / $totalNullContentWithParent ) . "%)\n";
/**
 * This is based on part of HistoryBlobStub::getText().
 * Determine if the text can be retrieved from the row in the normal way.
 *
 * @param array $stub
 * @param stdClass $secondaryRow
 * @return bool
 */
function isUnbrokenStub( $stub, $secondaryRow ) {
	$flags = explode( ',', $secondaryRow->old_flags );
	$text = $secondaryRow->old_text;

	// Resolve an external pointer down to the raw blob first.
	if ( in_array( 'external', $flags ) ) {
		$url = $text;
		MediaWiki\suppressWarnings();
		list( , $path ) = explode( '://', $url, 2 );
		MediaWiki\restoreWarnings();
		if ( $path == "" ) {
			return false;
		}
		$text = ExternalStore::fetchFromURL( $url );
	}

	// A stub can only point into a serialized blob object.
	if ( !in_array( 'object', $flags ) ) {
		return false;
	}

	$blob = in_array( 'gzip', $flags )
		? unserialize( gzinflate( $text ) )
		: unserialize( $text );

	if ( !is_object( $blob ) ) {
		// Correct for old double-serialization bug.
		$blob = unserialize( $blob );
	}
	if ( !is_object( $blob ) ) {
		return false;
	}

	$blob->uncompress();
	return $blob->getItem( $stub['hash'] ) !== false;
}
// NOTE(review): fragment — $esIdsForCluster, $cluster, $lastId,
// $invalid, $changeType and $plaintextChangeTypes come from code
// before this chunk, and the trailing blocks are closed after it.
// Walk known external-store ids for this cluster and probe the gaps
// between consecutive ids for orphaned content.
foreach ( $esIdsForCluster[$cluster] as $id ) {
	// Contiguous ids leave no gap to check.
	if ( $id === $lastId || $id === $lastId + 1 ) {
		$lastId = $id;
		continue;
	}
	$range = range( $lastId + 1, $id - 1 );
	$lastId = $id;
	echo "Checking " . count( $range ) . " es urls\n";
	if ( count( $range ) > 100 ) {
		echo "More than 100 potential es urls, skipping\n";
		$invalid = true;
		continue;
	}
	foreach ( $range as $possible ) {
		$url = "DB://{$cluster}/{$possible}";
		// NOTE(review): fetchFromURL() can return false; gzinflate(false)
		// would then emit a warning — confirm ids in gaps always resolve.
		$content = gzinflate( ExternalStore::fetchFromURL( $url ) );
		if ( false !== @unserialize( $content ) ) {
			// if it unserializes, its not our content
			continue;
		}
		$json = @json_decode( $content, true );
		if ( $json && count( $json ) === 1 && isset( $json['flow-workflow'] ) ) {
			// while technically possible to be a topic title, i'm almost
			// certain this is a core revisions inserted by flow in the form
			// of: {"flow-workflow":"sbk26yv6cpcxxm87"}
			continue;
		}
		if ( !in_array( $changeType, $plaintextChangeTypes ) ) {
			if ( false === strpos( $content, 'data-parsoid' ) ) {
				continue;
			}
// NOTE(review): fragment of a larger routine — $flags and $text are
// assumed to be the old_flags array and old_text column of the row
// being inspected; confirm against the enclosing function.
if ( in_array( 'external', $flags ) ) {
	print "External {$text}\n";
	// A DB://cluster/id/hash pointer is expected to resolve to a CGZ blob.
	if ( preg_match( '!^DB://(\\w+)/(\\w+)/(\\w+)$!', $text, $m ) ) {
		$es = ExternalStore::getStoreObject( 'DB' );
		$blob = $es->fetchBlob( $m[1], $m[2], $m[3] );
		if ( strtolower( get_class( $blob ) ) == 'concatenatedgziphistoryblob' ) {
			print "Found external CGZ\n";
			$blob->uncompress();
			print "Items: (" . implode( ', ', array_keys( $blob->mItems ) ) . ")\n";
			$text = $blob->getItem( $m[3] );
		} else {
			print "CGZ expected at {$text}, got " . gettype( $blob ) . "\n";
			$text = $blob;
		}
	} else {
		// Plain external URL: fetch the raw text directly.
		print "External plain {$text}\n";
		$text = ExternalStore::fetchFromURL( $text );
	}
}
if ( in_array( 'gzip', $flags ) ) {
	$text = gzinflate( $text );
}
if ( in_array( 'object', $flags ) ) {
	// NOTE(review): unlike the sibling dump script above, this variant
	// keeps the unserialized object itself (no ->getText() call) and
	// reports it below.
	$text = unserialize( $text );
}
if ( is_object( $text ) ) {
	print "Unexpectedly got object of type: " . get_class( $text ) . "\n";
} else {
	print "Text length: " . strlen( $text ) . "\n";
	print substr( $text, 0, 100 ) . "\n";
}
/**
 * Resolve this stub to the text of the item it references.
 *
 * Loads the referenced text row (through a one-entry static blob
 * cache), unwraps external / gzip layers, and returns the item
 * matching this stub's hash.
 *
 * @return string|false
 */
function getText() {
	// Fast path: the blob for this text row was the last one loaded.
	if ( isset( self::$blobCache[$this->mOldId] ) ) {
		return self::$blobCache[$this->mOldId]->getItem( $this->mHash );
	}

	$dbr = wfGetDB( DB_REPLICA );
	$row = $dbr->selectRow( 'text', [ 'old_flags', 'old_text' ], [ 'old_id' => $this->mOldId ] );
	if ( !$row ) {
		return false;
	}

	$flags = explode( ',', $row->old_flags );

	// External rows hold only a URL; fetch the real payload.
	if ( in_array( 'external', $flags ) ) {
		$url = $row->old_text;
		$parts = explode( '://', $url, 2 );
		if ( !isset( $parts[1] ) || $parts[1] == '' ) {
			return false;
		}
		$row->old_text = ExternalStore::fetchFromURL( $url );
	}

	if ( !in_array( 'object', $flags ) ) {
		return false;
	}

	// This shouldn't happen, but a bug in the compress script
	// may at times gzip-compress a HistoryBlob object row.
	$obj = in_array( 'gzip', $flags )
		? unserialize( gzinflate( $row->old_text ) )
		: unserialize( $row->old_text );

	if ( !is_object( $obj ) ) {
		// Correct for old double-serialization bug.
		$obj = unserialize( $obj );
	}

	$obj->uncompress();
	// Save this item for reference; if pulling many items in a row
	// we'll likely use it again. (Single-entry cache by design.)
	self::$blobCache = [ $this->mOldId => $obj ];

	return $obj->getItem( $this->mHash );
}