/**
  * Compress the text in chunks after concatenating the revisions.
  *
  * @param int $startId First page_id to process
  * @param int $maxChunkSize Maximum number of revisions per concatenated chunk
  * @param string $beginDate Only compress revisions with rev_timestamp after this 14-digit timestamp
  * @param string $endDate Only compress revisions with rev_timestamp before this 14-digit timestamp
  * @param string $extdb External storage cluster to write to, or '' to store blobs locally
  * @param bool|int $maxPageId Last page_id to process, or false to use the current maximum
  * @return bool
  */
 private function compressWithConcat($startId, $maxChunkSize, $beginDate, $endDate, $extdb = "", $maxPageId = false)
 {
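      # LS_CHUNKED fetches revision and text rows with a single joined query;
      # LS_INDIVIDUAL would instead fetch each text row separately inside the
      # chunk loop below.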
     $loadStyle = self::LS_CHUNKED;
     $dbr = wfGetDB(DB_SLAVE);
     $dbw = wfGetDB(DB_MASTER);
     # Set up external storage
     if ($extdb != '') {
         $storeObj = new ExternalStoreDB();
     }
     # Get all articles by page_id
     if (!$maxPageId) {
         $maxPageId = $dbr->selectField('page', 'max(page_id)', '', __METHOD__);
     }
     $this->output("Starting from {$startId} of {$maxPageId}\n");
     $pageConds = array();
      /*
      if ( $exclude_ns0 ) {
          print "Excluding main namespace\n";
          $pageConds[] = 'page_namespace<>0';
      }
      if ( $queryExtra ) {
          $pageConds[] = $queryExtra;
      }
      */
     # For each article, get a list of revisions which fit the criteria
     # No recompression, use a condition on old_flags
     # Don't compress object type entities, because that might produce data loss when
     # overwriting bulk storage concat rows. Don't compress external references, because
     # the script doesn't yet delete rows from external storage.
      $conds = array(
          'old_flags NOT ' . $dbr->buildLike($dbr->anyString(), 'object', $dbr->anyString())
              . ' AND old_flags NOT ' . $dbr->buildLike($dbr->anyString(), 'external', $dbr->anyString())
      );
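      # Date bounds are MediaWiki 14-digit timestamps (yyyymmddhhmmss), so a
      # lexical string comparison is also a chronological one.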
     if ($beginDate) {
         if (!preg_match('/^\\d{14}$/', $beginDate)) {
             $this->error("Invalid begin date \"{$beginDate}\"\n");
             return false;
         }
         $conds[] = "rev_timestamp>'" . $beginDate . "'";
     }
     if ($endDate) {
         if (!preg_match('/^\\d{14}$/', $endDate)) {
             $this->error("Invalid end date \"{$endDate}\"\n");
             return false;
         }
         $conds[] = "rev_timestamp<'" . $endDate . "'";
     }
     if ($loadStyle == self::LS_CHUNKED) {
         $tables = array('revision', 'text');
         $fields = array('rev_id', 'rev_text_id', 'old_flags', 'old_text');
         $conds[] = 'rev_text_id=old_id';
         $revLoadOptions = 'FOR UPDATE';
     } else {
         $tables = array('revision');
         $fields = array('rev_id', 'rev_text_id');
         $revLoadOptions = array();
     }
     # Don't work with current revisions
     # Don't lock the page table for update either -- TS 2006-04-04
     #$tables[] = 'page';
     #$conds[] = 'page_id=rev_page AND rev_id != page_latest';
     for ($pageId = $startId; $pageId <= $maxPageId; $pageId++) {
         wfWaitForSlaves();
         # Wake up
         $dbr->ping();
         # Get the page row
          $pageRes = $dbr->select(
              'page',
              array('page_id', 'page_namespace', 'page_title', 'page_latest'),
              $pageConds + array('page_id' => $pageId),
              __METHOD__
          );
         if ($pageRes->numRows() == 0) {
             continue;
         }
         $pageRow = $dbr->fetchObject($pageRes);
         # Display progress
         $titleObj = Title::makeTitle($pageRow->page_namespace, $pageRow->page_title);
         $this->output("{$pageId}\t" . $titleObj->getPrefixedDBkey() . " ");
         # Load revisions
          $revRes = $dbw->select(
              $tables,
              $fields,
              array_merge(array('rev_page' => $pageRow->page_id, 'rev_id < ' . $pageRow->page_latest), $conds),
              __METHOD__,
              $revLoadOptions
          );
         $revs = array();
         foreach ($revRes as $revRow) {
             $revs[] = $revRow;
         }
         if (count($revs) < 2) {
             # No revisions matching, no further processing
             $this->output("\n");
             continue;
         }
         # For each chunk
         $i = 0;
         while ($i < count($revs)) {
             if ($i < count($revs) - $maxChunkSize) {
                 $thisChunkSize = $maxChunkSize;
             } else {
                 $thisChunkSize = count($revs) - $i;
             }
             $chunk = new ConcatenatedGzipHistoryBlob();
             $stubs = array();
             $dbw->begin(__METHOD__);
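              # All text-row rewrites for this chunk happen in one transaction,
              # committed once the chunk has been stored.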
             $usedChunk = false;
             $primaryOldid = $revs[$i]->rev_text_id;
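              # When storing locally, this first text row is overwritten with the
              # whole concatenated blob and later revisions become stubs pointing
              # at it; with external storage, the blob is moved out to a URL.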
             // @codingStandardsIgnoreStart Ignore avoid function calls in a FOR loop test part warning
             # Get the text of each revision and add it to the object
             for ($j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++) {
                 // @codingStandardsIgnoreEnd
                 $oldid = $revs[$i + $j]->rev_text_id;
                 # Get text
                 if ($loadStyle == self::LS_INDIVIDUAL) {
                      $textRow = $dbw->selectRow(
                          'text',
                          array('old_flags', 'old_text'),
                          array('old_id' => $oldid),
                          __METHOD__,
                          'FOR UPDATE'
                      );
                     $text = Revision::getRevisionText($textRow);
                 } else {
                     $text = Revision::getRevisionText($revs[$i + $j]);
                 }
                 if ($text === false) {
                      $this->error("\nError, unable to get text for old_id {$oldid}");
                     #$dbw->delete( 'old', array( 'old_id' => $oldid ) );
                 }
                 if ($extdb == "" && $j == 0) {
                     $chunk->setText($text);
                     $this->output('.');
                 } else {
                     # Don't make a stub if it's going to be longer than the article
                     # Stubs are typically about 100 bytes
                     if (strlen($text) < 120) {
                         $stub = false;
                         $this->output('x');
                     } else {
                         $stub = new HistoryBlobStub($chunk->addItem($text));
                         $stub->setLocation($primaryOldid);
                         $stub->setReferrer($oldid);
                         $this->output('.');
                         $usedChunk = true;
                     }
                     $stubs[$j] = $stub;
                 }
             }
             $thisChunkSize = $j;
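              # isHappy() can end the loop early, so $j (not $maxChunkSize) is the
              # number of revisions actually consumed from $revs.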
             # If we couldn't actually use any stubs because the pages were too small, do nothing
             if ($usedChunk) {
                 if ($extdb != "") {
                     # Move blob objects to External Storage
                     $stored = $storeObj->store($extdb, serialize($chunk));
                     if ($stored === false) {
                         $this->error("Unable to store object");
                         return false;
                     }
                     # Store External Storage URLs instead of Stub placeholders
                     foreach ($stubs as $stub) {
                         if ($stub === false) {
                             continue;
                         }
                         # $stored should provide base path to a BLOB
                         $url = $stored . "/" . $stub->getHash();
                          $dbw->update(
                              'text',
                              array('old_text' => $url, 'old_flags' => 'external,utf-8'),
                              array('old_id' => $stub->getReferrer())
                          );
                     }
                 } else {
                     # Store the main object locally
                      $dbw->update(
                          'text',
                          array('old_text' => serialize($chunk), 'old_flags' => 'object,utf-8'),
                          array('old_id' => $primaryOldid)
                      );
                     # Store the stub objects
                     for ($j = 1; $j < $thisChunkSize; $j++) {
                          # Skip stubs that weren't compressed, and never
                          # overwrite the primary revision's text row
                          if ($stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid) {
                              $dbw->update(
                                  'text',
                                  array('old_text' => serialize($stubs[$j]), 'old_flags' => 'object,utf-8'),
                                  array('old_id' => $revs[$i + $j]->rev_text_id)
                              );
                         }
                     }
                 }
             }
             # Done, next
             $this->output("/");
             $dbw->commit(__METHOD__);
             $i += $thisChunkSize;
             wfWaitForSlaves();
         }
         $this->output("\n");
     }
     return true;
 }
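
 /**
  * Usage sketch, not part of the original source: compressWithConcat() is
  * normally driven from the maintenance script's execute() method. The
  * option names and defaults below are illustrative assumptions, not the
  * script's confirmed interface.
  */
 public function execute()
 {
     $startId = intval($this->getOption('startid', 0));      // assumed option name
     $success = $this->compressWithConcat(
         $startId,
         intval($this->getOption('chunksize', 20)),          // assumed default
         $this->getOption('begin-date', ''),
         $this->getOption('end-date', ''),
         $this->getOption('extdb', ''),
         $this->getOption('endid', false)
     );
     if ($success) {
         $this->output("Done.\n");
     } else {
         $this->error("Compression failed.\n");
     }
 }
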
 /**
  * @return string|bool The item's text, or false on failure
  */
 function getText()
 {
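      // A stub records the old_id of the text row that holds the full
      // concatenated blob ($this->mOldId) and the hash of this revision's
      // item inside it ($this->mHash).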
     if (isset(self::$blobCache[$this->mOldId])) {
         $obj = self::$blobCache[$this->mOldId];
     } else {
         $dbr = wfGetDB(DB_SLAVE);
          $row = $dbr->selectRow('text', array('old_flags', 'old_text'), array('old_id' => $this->mOldId), __METHOD__);
         if (!$row) {
             return false;
         }
         $flags = explode(',', $row->old_flags);
         if (in_array('external', $flags)) {
             $url = $row->old_text;
             $parts = explode('://', $url, 2);
              if (!isset($parts[1]) || $parts[1] == '') {
                  return false;
              }
             $row->old_text = ExternalStore::fetchFromUrl($url);
         }
         if (!in_array('object', $flags)) {
             return false;
         }
         if (in_array('gzip', $flags)) {
             // This shouldn't happen, but a bug in the compress script
             // may at times gzip-compress a HistoryBlob object row.
             $obj = unserialize(gzinflate($row->old_text));
         } else {
             $obj = unserialize($row->old_text);
         }
         if (!is_object($obj)) {
             // Correct for old double-serialization bug.
             $obj = unserialize($obj);
         }
         if (!is_object($obj)) {
             return false;
         }
         // Save this item for reference; if pulling many
         // items in a row we'll likely use it again.
         $obj->uncompress();
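          // The cache is replaced wholesale, so at most one uncompressed blob
          // is held in memory at a time.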
         self::$blobCache = array($this->mOldId => $obj);
     }
     return $obj->getItem($this->mHash);
 }