/**
 * Compress the text in chunks after concatenating the revisions.
 *
 * @param int $startId
 * @param int $maxChunkSize
 * @param string $beginDate
 * @param string $endDate
 * @param string $extdb
 * @param bool|int $maxPageId
 * @return bool
 */
private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
	$endDate, $extdb = "", $maxPageId = false
) {
	$loadStyle = self::LS_CHUNKED;

	$dbr = wfGetDB( DB_SLAVE );
	$dbw = wfGetDB( DB_MASTER );

	# Set up external storage
	if ( $extdb != '' ) {
		$storeObj = new ExternalStoreDB();
	}

	# Get all articles by page_id
	if ( !$maxPageId ) {
		$maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
	}
	$this->output( "Starting from {$startId} of {$maxPageId}\n" );
	$pageConds = array();

	/*
	if ( $exclude_ns0 ) {
		print "Excluding main namespace\n";
		$pageConds[] = 'page_namespace<>0';
	}
	if ( $queryExtra ) {
		$pageConds[] = $queryExtra;
	}
	*/

	# For each article, get a list of revisions which fit the criteria

	# No recompression, use a condition on old_flags
	# Don't compress object type entities, because that might produce data loss when
	# overwriting bulk storage concat rows. Don't compress external references, because
	# the script doesn't yet delete rows from external storage.
	$conds = array(
		'old_flags NOT ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
			. ' AND old_flags NOT '
			. $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() )
	);

	if ( $beginDate ) {
		if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
			$this->error( "Invalid begin date \"{$beginDate}\"\n" );

			return false;
		}
		$conds[] = "rev_timestamp>'" . $beginDate . "'";
	}
	if ( $endDate ) {
		if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
			$this->error( "Invalid end date \"{$endDate}\"\n" );

			return false;
		}
		$conds[] = "rev_timestamp<'" . $endDate . "'";
	}

	if ( $loadStyle == self::LS_CHUNKED ) {
		$tables = array( 'revision', 'text' );
		$fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' );
		$conds[] = 'rev_text_id=old_id';
		$revLoadOptions = 'FOR UPDATE';
	} else {
		$tables = array( 'revision' );
		$fields = array( 'rev_id', 'rev_text_id' );
		$revLoadOptions = array();
	}

	# Don't work with current revisions
	# Don't lock the page table for update either -- TS 2006-04-04
	#$tables[] = 'page';
	#$conds[] = 'page_id=rev_page AND rev_id != page_latest';

	for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
		wfWaitForSlaves();

		# Wake up
		$dbr->ping();

		# Get the page row
		$pageRes = $dbr->select( 'page',
			array( 'page_id', 'page_namespace', 'page_title', 'page_latest' ),
			$pageConds + array( 'page_id' => $pageId ),
			__METHOD__
		);
		if ( $pageRes->numRows() == 0 ) {
			continue;
		}
		$pageRow = $dbr->fetchObject( $pageRes );

		# Display progress
		$titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
		$this->output( "{$pageId}\t" . $titleObj->getPrefixedDBkey() . " " );

		# Load revisions
		$revRes = $dbw->select( $tables, $fields,
			array_merge( array(
				'rev_page' => $pageRow->page_id,
				'rev_id < ' . $pageRow->page_latest
			), $conds ),
			__METHOD__,
			$revLoadOptions
		);
		$revs = array();
		foreach ( $revRes as $revRow ) {
			$revs[] = $revRow;
		}

		if ( count( $revs ) < 2 ) {
			# No revisions matching, no further processing
			$this->output( "\n" );
			continue;
		}

		# For each chunk
		$i = 0;
		while ( $i < count( $revs ) ) {
			if ( $i < count( $revs ) - $maxChunkSize ) {
				$thisChunkSize = $maxChunkSize;
			} else {
				$thisChunkSize = count( $revs ) - $i;
			}

			$chunk = new ConcatenatedGzipHistoryBlob();
			$stubs = array();
			$dbw->begin( __METHOD__ );
			$usedChunk = false;
			$primaryOldid = $revs[$i]->rev_text_id;

			// @codingStandardsIgnoreStart Ignore avoid function calls in a FOR loop test part warning
			# Get the text of each revision and add it to the object
			for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
				// @codingStandardsIgnoreEnd
				$oldid = $revs[$i + $j]->rev_text_id;

				# Get text
				if ( $loadStyle == self::LS_INDIVIDUAL ) {
					$textRow = $dbw->selectRow( 'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $oldid ),
						__METHOD__,
						'FOR UPDATE'
					);
					$text = Revision::getRevisionText( $textRow );
				} else {
					$text = Revision::getRevisionText( $revs[$i + $j] );
				}

				if ( $text === false ) {
					$this->error( "\nError, unable to get text in old_id {$oldid}" );
					#$dbw->delete( 'old', array( 'old_id' => $oldid ) );
				}

				if ( $extdb == "" && $j == 0 ) {
					$chunk->setText( $text );
					$this->output( '.' );
				} else {
					# Don't make a stub if it's going to be longer than the article
					# Stubs are typically about 100 bytes
					if ( strlen( $text ) < 120 ) {
						$stub = false;
						$this->output( 'x' );
					} else {
						$stub = new HistoryBlobStub( $chunk->addItem( $text ) );
						$stub->setLocation( $primaryOldid );
						$stub->setReferrer( $oldid );
						$this->output( '.' );
						$usedChunk = true;
					}
					$stubs[$j] = $stub;
				}
			}
			$thisChunkSize = $j;

			# If we couldn't actually use any stubs because the pages were too small, do nothing
			if ( $usedChunk ) {
				if ( $extdb != "" ) {
					# Move blob objects to External Storage
					$stored = $storeObj->store( $extdb, serialize( $chunk ) );
					if ( $stored === false ) {
						$this->error( "Unable to store object" );

						return false;
					}
					# Store External Storage URLs instead of Stub placeholders
					foreach ( $stubs as $stub ) {
						if ( $stub === false ) {
							continue;
						}
						# $stored should provide base path to a BLOB
						$url = $stored . "/" . $stub->getHash();
						$dbw->update( 'text',
							array(
								'old_text' => $url,
								'old_flags' => 'external,utf-8'
							),
							array( 'old_id' => $stub->getReferrer() )
						);
					}
				} else {
					# Store the main object locally
					$dbw->update( 'text',
						array(
							'old_text' => serialize( $chunk ),
							'old_flags' => 'object,utf-8'
						),
						array( 'old_id' => $primaryOldid )
					);

					# Store the stub objects
					for ( $j = 1; $j < $thisChunkSize; $j++ ) {
						# Skip if not compressing and don't overwrite the first revision
						if ( $stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid ) {
							$dbw->update( 'text',
								array(
									'old_text' => serialize( $stubs[$j] ),
									'old_flags' => 'object,utf-8'
								),
								array( 'old_id' => $revs[$i + $j]->rev_text_id )
							);
						}
					}
				}
			}

			# Done, next
			$this->output( "/" );
			$dbw->commit( __METHOD__ );
			$i += $thisChunkSize;
			wfWaitForSlaves();
		}
		$this->output( "\n" );
	}

	return true;
}
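
/*
 * Minimal usage sketch (not part of this file): a maintenance script's
 * execute() method would typically call this helper with values taken from
 * the command-line options, roughly as below. The option names shown
 * ('startid', 'chunksize', 'begin', 'end', 'extdb', 'endid') are illustrative
 * assumptions rather than this script's documented options; getOption() is
 * the standard Maintenance-class accessor with a default fallback.
 *
 *     $success = $this->compressWithConcat(
 *         intval( $this->getOption( 'startid', 0 ) ),     // $startId
 *         intval( $this->getOption( 'chunksize', 20 ) ),  // $maxChunkSize
 *         $this->getOption( 'begin', '' ),                // $beginDate, YYYYMMDDHHMMSS
 *         $this->getOption( 'end', '' ),                  // $endDate, YYYYMMDDHHMMSS
 *         $this->getOption( 'extdb', '' ),                // external storage cluster, '' for local
 *         $this->getOption( 'endid', false )              // $maxPageId, false = use max(page_id)
 *     );
 */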