00001 <?php
00002
// Script entry point.
// $optionsWithArgs is set before commandLine.inc is included; presumably the
// include reads it to learn which options take a value (TODO confirm).
$optionsWithArgs = RecompressTracked::getOptionsWithArgs();
require( dirname( __FILE__ ) .'/../commandLine.inc' );

// At least one destination cluster is required; otherwise print usage and fail.
if ( !count( $args ) ) {
	$usage = "Usage: php recompressTracked.php [options] <cluster> [... <cluster>...]
Moves blobs indexed by trackBlobs.php to a specified list of destination clusters, and recompresses them in the process. Restartable.

Options:
--procs <procs> Set the number of child processes (default 1)
--copy-only Copy only, do not update the text table. Restart without this option to complete.
--debug-log <file> Log debugging data to the specified file
--info-log <file> Log progress messages to the specified file
--critical-log <file> Log error messages to the specified file
";
	echo $usage;
	exit( 1 );
}

// Build the job from the parsed command line and run it (parent or child mode).
$job = RecompressTracked::newFromCommandLine( $args, $options );
$job->execute();
00022
00023 class RecompressTracked {
/** Destination cluster names, taken from the positional command-line arguments */
var $destClusters;
/** Row limit used for the batched SELECT queries */
var $batchSize = 1000;
/** Maximum number of text IDs sent in a single doOrphanList command */
var $orphanBatchSize = 1000;
/** report() prints progress once every this many calls */
var $reportingInterval = 10;
/** Number of child processes to start */
var $numProcs = 1;
/** Set in the constructor: whether xdiff is available, and the blob classes to use */
var $useDiff, $pageBlobClass, $orphanBlobClass;
/** Parent-side state: child stdin pipes, child process handles, last child dispatched to */
var $slavePipes, $slaveProcs, $prevSlaveId;
/** When true, blobs are copied but the text table is not updated */
var $copyOnly = false;
/** True when running as a child process (--child) */
var $isChild = false;
/** Numeric ID of this child, or false in the parent */
var $slaveId = false;
/** Optional log file paths for debug(), info() and critical() */
var $debugLog, $infoLog, $criticalLog;
/** ExternalStoreDB instance created in the constructor */
var $store;
/**
 * Number of report() calls since progress was last printed.
 * Declared here so that report() does not increment an undefined property.
 */
var $numBatches = 0;

/** Command-line options that take a value */
static $optionsWithArgs = array( 'procs', 'slave-id', 'debug-log', 'info-log', 'critical-log' );
/** Map of command-line option name => name of the property it sets */
static $cmdLineOptionMap = array(
	'procs' => 'numProcs',
	'copy-only' => 'copyOnly',
	'child' => 'isChild',
	'slave-id' => 'slaveId',
	'debug-log' => 'debugLog',
	'info-log' => 'infoLog',
	'critical-log' => 'criticalLog',
);
00047
/**
 * Get the list of command-line options that expect a value.
 *
 * @return array Option names, without the leading dashes
 */
static function getOptionsWithArgs() {
	$withArgs = self::$optionsWithArgs;
	return $withArgs;
}
00051
/**
 * Build a job object from parsed command-line input.
 *
 * @param $args array Positional arguments; used as the destination cluster list
 * @param $options array Parsed options, keyed by command-line option name
 * @return RecompressTracked
 */
static function newFromCommandLine( $args, $options ) {
	$settings = array( 'destClusters' => $args );
	foreach ( self::$cmdLineOptionMap as $optName => $propName ) {
		// Options that were not given keep the class default
		if ( !isset( $options[$optName] ) ) {
			continue;
		}
		$settings[$propName] = $options[$optName];
	}
	return new self( $settings );
}
00061
/**
 * Copy the given settings onto the object, create the external store
 * handle, set the debug log prefix and pick the blob classes.
 *
 * @param $options array Map of property name => value
 */
function __construct( $options ) {
	foreach ( $options as $property => $setting ) {
		$this->$property = $setting;
	}
	$this->store = new ExternalStoreDB;

	// The parent logs as "M"; a child logs under its slave ID when it has one
	if ( $this->isChild ) {
		if ( $this->slaveId !== false ) {
			$GLOBALS['wgDebugLogPrefix'] = "RCT {$this->slaveId}: ";
		}
	} else {
		$GLOBALS['wgDebugLogPrefix'] = "RCT M: ";
	}

	// Diff-based blobs are only used for pages, and only when xdiff is available
	$this->useDiff = function_exists( 'xdiff_string_bdiff' );
	if ( $this->useDiff ) {
		$this->pageBlobClass = 'DiffHistoryBlob';
	} else {
		$this->pageBlobClass = 'ConcatenatedGzipHistoryBlob';
	}
	$this->orphanBlobClass = 'ConcatenatedGzipHistoryBlob';
}
00076
/**
 * Emit a debug message through wfDebug() and, if configured, the debug log file.
 *
 * @param $msg string Message without trailing newline
 */
function debug( $msg ) {
	wfDebug( "$msg\n" );
	$logFile = $this->debugLog;
	if ( !$logFile ) {
		return;
	}
	$this->logToFile( $msg, $logFile );
}
00084
/**
 * Print a progress message and, if configured, append it to the info log file.
 *
 * @param $msg string Message without trailing newline
 */
function info( $msg ) {
	echo $msg . "\n";
	$logFile = $this->infoLog;
	if ( !$logFile ) {
		return;
	}
	$this->logToFile( $msg, $logFile );
}
00091
/**
 * Print an error message and, if configured, append it to the critical log file.
 *
 * @param $msg string Message without trailing newline
 */
function critical( $msg ) {
	echo $msg . "\n";
	$logFile = $this->criticalLog;
	if ( !$logFile ) {
		return;
	}
	$this->logToFile( $msg, $logFile );
}
00098
/**
 * Append one line to a log file. The line starts with a header made of
 * the day and time, host name, process ID, slave ID (if any) and wiki ID,
 * padded to 50 columns, followed by the message.
 *
 * @param $msg string Message without trailing newline
 * @param $file string Log destination, passed to wfErrorLog()
 */
function logToFile( $msg, $file ) {
	$stamp = date('d\TH:i:s');
	$header = "[$stamp] " . wfHostname() . ' ' . posix_getpid();
	// Children tag their lines with their slave ID
	$slaveTag = ( $this->slaveId !== false ) ? "({$this->slaveId})" : '';
	$header .= $slaveTag . ' ' . wfWikiID();
	$line = sprintf( "%-50s %s\n", $header, $msg );
	wfErrorLog( $line, $file );
}
00107
/**
 * Wait until the DB_SLAVE connection has reached the current position
 * of the DB_MASTER connection.
 */
function syncDBs() {
	$master = wfGetDB( DB_MASTER );
	$slave = wfGetDB( DB_SLAVE );
	$masterPos = $master->getMasterPos();
	$slave->masterPosWait( $masterPos, 100000 );
}
00119
/**
 * Run the job, as either the parent or a child depending on $isChild.
 */
function execute() {
	if ( !$this->isChild ) {
		$this->executeParent();
		return;
	}
	$this->executeChild();
}
00130
/**
 * Parent-process main routine: verify the tracking table, start the
 * children, queue all pages and orphans, then shut the children down.
 */
function executeParent() {
	if ( $this->checkTrackingTable() ) {
		$this->syncDBs();
		$this->startSlaveProcs();
		$this->doAllPages();
		$this->doAllOrphans();
		$this->killSlaveProcs();
	}
}
00145
/**
 * Check that the blob_tracking table exists and has at least one row.
 * A message is emitted when either check fails.
 *
 * @return bool True if the job can proceed
 */
function checkTrackingTable() {
	$db = wfGetDB( DB_SLAVE );
	if ( $db->tableExists( 'blob_tracking' ) ) {
		$firstRow = $db->selectRow( 'blob_tracking', '*', false, __METHOD__ );
		if ( $firstRow ) {
			return true;
		}
		$this->info( "Warning: blob_tracking table contains no rows, skipping this wiki." );
		return false;
	}
	$this->critical( "Error: blob_tracking table does not exist" );
	return false;
}
00162
/**
 * Start the child processes.
 *
 * Each child runs this same script with --child, the current wiki, the
 * parent's options and a distinct --slave-id. The child's stdin is a pipe
 * held by the parent; its stdout and stderr go to the parent's.
 * Exits the script if a child cannot be started.
 */
function startSlaveProcs() {
	// Assemble the command as a list of words, joined with single spaces below
	$words = array( 'php', wfEscapeShellArg( __FILE__ ) );
	foreach ( self::$cmdLineOptionMap as $cmdOption => $classOption ) {
		if ( $cmdOption == 'slave-id' ) {
			// Appended per child further down
			continue;
		}
		if ( in_array( $cmdOption, self::$optionsWithArgs ) && isset( $this->$classOption ) ) {
			$words[] = "--$cmdOption";
			$words[] = wfEscapeShellArg( $this->$classOption );
		} elseif ( $this->$classOption ) {
			$words[] = "--$cmdOption";
		}
	}
	$words[] = '--child';
	$words[] = '--wiki';
	$words[] = wfEscapeShellArg( wfWikiID() );
	$words[] = call_user_func_array( 'wfEscapeShellArg', $this->destClusters );
	$cmd = implode( ' ', $words );

	// stdin is a pipe the parent writes to; stdout/stderr are shared with the parent
	$spec = array(
		array( 'pipe', 'r' ),
		array( 'file', 'php://stdout', 'w' ),
		array( 'file', 'php://stderr', 'w' )
	);

	$this->slavePipes = $this->slaveProcs = array();
	for ( $id = 0; $id < $this->numProcs; $id++ ) {
		$pipes = false;
		wfSuppressWarnings();
		$proc = proc_open( "$cmd --slave-id $id", $spec, $pipes );
		wfRestoreWarnings();
		if ( !$proc ) {
			$this->critical( "Error opening slave process: $cmd" );
			exit( 1 );
		}
		$this->slaveProcs[$id] = $proc;
		$this->slavePipes[$id] = $pipes[0];
	}
	// dispatch() starts its round-robin search at prevSlaveId + 1
	$this->prevSlaveId = -1;
}
00204
/**
 * Send "quit" to every child, then wait for each to exit and report
 * any non-zero exit status.
 */
function killSlaveProcs() {
	$this->info( "Waiting for slave processes to finish..." );

	// Tell all children to quit before waiting on any of them
	$id = 0;
	while ( $id < $this->numProcs ) {
		$this->dispatchToSlave( $id, 'quit' );
		$id++;
	}

	$id = 0;
	while ( $id < $this->numProcs ) {
		$exitStatus = proc_close( $this->slaveProcs[$id] );
		if ( $exitStatus ) {
			$this->critical( "Warning: child #$id exited with status $exitStatus" );
		}
		$id++;
	}
	$this->info( "Done." );
}
00221
/**
 * Send a command to the next child whose pipe is ready for writing.
 *
 * All arguments are joined into one command line (see dispatchToSlave()).
 * Waits up to an hour for a pipe to become writable, then picks the first
 * ready child after the one used last time. Exits the script if no pipe
 * becomes writable.
 */
function dispatch( ) {
	$args = func_get_args();
	$pipes = $this->slavePipes;
	// stream_select() takes its arrays by reference, so real variables are
	// required here; passing "$x=array()" inline raises
	// "Only variables should be passed by reference".
	$read = array();
	$except = array();
	$numPipes = stream_select( $read, $pipes, $except, 3600 );
	if ( !$numPipes ) {
		$this->critical( "Error waiting to write to slaves. Aborting" );
		exit( 1 );
	}
	// $pipes now holds only the writable streams. The lookup below relies on
	// stream_select() keeping the original array keys (the slave IDs).
	for ( $i = 0; $i < $this->numProcs; $i++ ) {
		$slaveId = ( $i + $this->prevSlaveId + 1 ) % $this->numProcs;
		if ( isset( $pipes[$slaveId] ) ) {
			$this->prevSlaveId = $slaveId;
			$this->dispatchToSlave( $slaveId, $args );
			return;
		}
	}
	$this->critical( "Unreachable" );
	exit( 1 );
}
00245
/**
 * Write one command line to the given child's stdin pipe.
 *
 * @param $slaveId int Index into $slavePipes
 * @param $args array|string Command words; joined with single spaces
 */
function dispatchToSlave( $slaveId, $args ) {
	$words = (array)$args;
	$line = implode( ' ', $words ) . "\n";
	fwrite( $this->slavePipes[$slaveId], $line );
}
00254
/**
 * Queue a doPage command for every distinct bt_page that still has
 * unmoved rows in blob_tracking, reading the page IDs in batches.
 */
function doAllPages() {
	$dbr = wfGetDB( DB_SLAVE );
	$numPages = $dbr->selectField( 'blob_tracking',
		'COUNT(DISTINCT bt_page)',
		# A condition is required so that this query uses the index
		array( 'bt_moved' => 0 ),
		__METHOD__
	);
	$this->info( $this->copyOnly ? "Copying pages..." : "Moving pages..." );

	$queued = 0;
	$lastPage = 0;
	for ( ;; ) {
		// Next batch of page IDs above the last one already queued
		$conds = array(
			'bt_moved' => 0,
			'bt_page > ' . $dbr->addQuotes( $lastPage )
		);
		$queryOptions = array(
			'DISTINCT',
			'ORDER BY' => 'bt_page',
			'LIMIT' => $this->batchSize,
		);
		$res = $dbr->select( 'blob_tracking', array( 'bt_page' ), $conds, __METHOD__, $queryOptions );
		if ( !$res->numRows() ) {
			break;
		}
		foreach ( $res as $row ) {
			$this->dispatch( 'doPage', $row->bt_page );
			$queued++;
			$lastPage = $row->bt_page;
		}
		$this->report( 'pages', $queued, $numPages );
	}
	$this->report( 'pages', $queued, $numPages );

	$this->info( $this->copyOnly ? "All page copies queued." : "All page moves queued." );
}
00304
/**
 * Print progress and wait for slaves, but only on every
 * $reportingInterval-th call or when the end has been reached.
 *
 * @param $label string Name of the thing being counted
 * @param $current int Number of items processed so far
 * @param $end int Total number of items
 */
function report( $label, $current, $end ) {
	$this->numBatches++;
	$finished = ( $current == $end );
	if ( !$finished && $this->numBatches < $this->reportingInterval ) {
		return;
	}
	$this->numBatches = 0;
	$this->info( "$label: $current / $end" );
	wfWaitForSlaves( 5 );
}
00316
/**
 * Queue doOrphanList commands for every unmoved blob_tracking row with
 * bt_page = 0, reading text IDs in batches and sending at most
 * $orphanBatchSize IDs per command.
 */
function doAllOrphans() {
	$dbr = wfGetDB( DB_SLAVE );
	$numOrphans = $dbr->selectField( 'blob_tracking',
		'COUNT(DISTINCT bt_text_id)',
		array( 'bt_moved' => 0, 'bt_page' => 0 ),
		__METHOD__ );
	if ( !$numOrphans ) {
		return;
	}
	$this->info( $this->copyOnly ? "Copying orphans..." : "Moving orphans..." );

	$queued = 0;
	$lastTextId = 0;
	for ( ;; ) {
		$conds = array(
			'bt_moved' => 0,
			'bt_page' => 0,
			'bt_text_id > ' . $dbr->addQuotes( $lastTextId )
		);
		$queryOptions = array(
			'DISTINCT',
			'ORDER BY' => 'bt_text_id',
			'LIMIT' => $this->batchSize
		);
		$res = $dbr->select( 'blob_tracking', array( 'bt_text_id' ), $conds, __METHOD__, $queryOptions );
		if ( !$res->numRows() ) {
			break;
		}

		$ids = array();
		foreach ( $res as $row ) {
			$ids[] = $row->bt_text_id;
			$lastTextId = $row->bt_text_id;
		}
		$queued += count( $ids );

		// One doOrphanList command per chunk of at most orphanBatchSize IDs
		foreach ( array_chunk( $ids, $this->orphanBatchSize ) as $chunk ) {
			$command = array_merge( array( 'doOrphanList' ), $chunk );
			call_user_func_array( array( $this, 'dispatch' ), $command );
		}

		$this->report( 'orphans', $queued, $numOrphans );
	}
	$this->report( 'orphans', $queued, $numOrphans );
	$this->info( "All orphans queued." );
}
00381
/**
 * Child-process main loop: read space-separated commands from stdin,
 * one per line, and run them until "quit" or end of input.
 */
function executeChild() {
	$this->debug( 'starting' );
	$this->syncDBs();

	while ( !feof( STDIN ) ) {
		$line = rtrim( fgets( STDIN ) );
		if ( $line === '' ) {
			// Blank line or nothing read
			continue;
		}
		$this->debug( $line );

		$words = explode( ' ', $line );
		$command = array_shift( $words );
		if ( $command === 'quit' ) {
			return;
		}
		if ( $command === 'doPage' ) {
			$this->doPage( intval( $words[0] ) );
		} elseif ( $command === 'doOrphanList' ) {
			$this->doOrphanList( array_map( 'intval', $words ) );
		}
		// Runs after every non-quit command, including unrecognised ones
		wfWaitForSlaves( 5 );
	}
}
00410
/**
 * Recompress and copy (or move) all unmoved tracked text rows of one page.
 *
 * Texts are added to a CgzCopyTransaction; when addItem() returns false
 * the transaction is committed and a new one is started.
 *
 * @param $pageId int Page ID (bt_page)
 */
function doPage( $pageId ) {
	$title = Title::newFromId( $pageId );
	$titleText = $title ? $title->getPrefixedText() : '[deleted]';
	$dbr = wfGetDB( DB_SLAVE );

	// Outside copy-only mode, first finish moves that already have a new URL
	if ( !$this->copyOnly ) {
		$this->finishIncompleteMoves( array( 'bt_page' => $pageId ) );
		$this->syncDBs();
	}

	$trx = new CgzCopyTransaction( $this, $this->pageBlobClass );
	$startId = 0;

	for ( ;; ) {
		$conds = array(
			'bt_page' => $pageId,
			'bt_text_id > ' . $dbr->addQuotes( $startId ),
			'bt_moved' => 0,
			'bt_new_url IS NULL',
			'bt_text_id=old_id',
		);
		$queryOptions = array(
			'ORDER BY' => 'bt_text_id',
			'LIMIT' => $this->batchSize
		);
		$res = $dbr->select( array( 'blob_tracking', 'text' ), '*', $conds, __METHOD__, $queryOptions );
		if ( !$res->numRows() ) {
			break;
		}

		$prevTextId = 0;
		foreach ( $res as $row ) {
			// The next batch resumes after the last row seen, whether or not it was added
			$startId = $row->bt_text_id;

			// Rows arrive ordered by text ID; skip a repeat of the previous one
			if ( $prevTextId == $row->bt_text_id ) {
				continue;
			}
			$prevTextId = $row->bt_text_id;

			$text = Revision::getRevisionText( $row );
			if ( $text === false ) {
				$this->critical( "Error loading {$row->bt_rev_id}/{$row->bt_text_id}" );
				continue;
			}

			// A false return means this blob should be committed and a fresh one started
			$accepted = $trx->addItem( $text, $row->bt_text_id );
			if ( !$accepted ) {
				$this->debug( "$titleText: committing blob with " . $trx->getSize() . " items" );
				$trx->commit();
				$trx = new CgzCopyTransaction( $this, $this->pageBlobClass );
			}
		}
	}

	// Commit whatever is left in the last transaction
	$this->debug( "$titleText: committing blob with " . $trx->getSize() . " items" );
	$trx->commit();
}
00480
/**
 * Point a text row at its new external URL and mark the matching
 * blob_tracking rows as moved, in one master transaction.
 * Exits the script if called in copy-only mode.
 *
 * @param $textId int text.old_id / blob_tracking.bt_text_id
 * @param $url string New external store URL
 */
function moveTextRow( $textId, $url ) {
	if ( $this->copyOnly ) {
		$this->critical( "Internal error: can't call moveTextRow() in --copy-only mode" );
		exit( 1 );
	}

	$newTextValues = array(
		'old_text' => $url,
		'old_flags' => 'external,utf-8',
	);
	$textConds = array( 'old_id' => $textId );
	$trackingValues = array( 'bt_moved' => 1 );
	$trackingConds = array( 'bt_text_id' => $textId );

	$dbw = wfGetDB( DB_MASTER );
	$dbw->begin();
	$dbw->update( 'text', $newTextValues, $textConds, __METHOD__ );
	$dbw->update( 'blob_tracking', $trackingValues, $trackingConds, __METHOD__ );
	$dbw->commit();
}
00515
/**
 * Complete moves that were started but not finished: for every matching
 * blob_tracking row that has a new URL but is not yet marked as moved,
 * call moveTextRow().
 *
 * @param $conds array Extra conditions restricting the blob_tracking rows
 */
function finishIncompleteMoves( $conds ) {
	$dbr = wfGetDB( DB_SLAVE );

	$baseConds = array_merge( $conds, array(
		'bt_moved' => 0,
		'bt_new_url IS NOT NULL'
	));
	$queryOptions = array(
		'ORDER BY' => 'bt_text_id',
		'LIMIT' => $this->batchSize,
	);

	$lastTextId = 0;
	for ( ;; ) {
		$batchConds = array_merge(
			$baseConds,
			array( 'bt_text_id > ' . $dbr->addQuotes( $lastTextId ) )
		);
		$res = $dbr->select( 'blob_tracking', '*', $batchConds, __METHOD__, $queryOptions );
		$found = $res->numRows();
		if ( !$found ) {
			break;
		}
		$this->debug( 'Incomplete: ' . $res->numRows() . ' rows' );
		foreach ( $res as $row ) {
			$this->moveTextRow( $row->bt_text_id, $row->bt_new_url );
			$lastTextId = $row->bt_text_id;
		}
	}
}
00552
/**
 * Advance the internal pointer of $destClusters and return the cluster
 * it lands on, wrapping around to the first entry at the end of the list.
 *
 * @return string Cluster name
 */
function getTargetCluster() {
	$candidate = next( $this->destClusters );
	if ( $candidate !== false ) {
		return $candidate;
	}
	// Ran off the end of the list; start again from the first entry
	return reset( $this->destClusters );
}
00563
/**
 * Get a master connection from the external load balancer of a cluster.
 *
 * @param $cluster string Cluster name
 * @return object Result of getConnection( DB_MASTER ) on that load balancer
 */
function getExtDB( $cluster ) {
	$factory = wfGetLBFactory();
	return $factory->getExternalLB( $cluster )->getConnection( DB_MASTER );
}
00571
/**
 * Recompress and copy (or move) a list of orphan text rows.
 *
 * Texts are added to a CgzCopyTransaction; when addItem() returns false
 * the transaction is committed and a new one is started.
 *
 * @param $textIds array List of text IDs (old_id / bt_text_id)
 */
function doOrphanList( $textIds ) {
	// Nothing to do for an empty list; this also avoids querying with an empty ID set
	if ( !$textIds ) {
		return;
	}

	// Outside copy-only mode, first finish moves that already have a new URL
	if ( !$this->copyOnly ) {
		$this->finishIncompleteMoves( array( 'bt_text_id' => $textIds ) );
		$this->syncDBs();
	}

	$trx = new CgzCopyTransaction( $this, $this->orphanBlobClass );

	$res = wfGetDB( DB_SLAVE )->select(
		array( 'text', 'blob_tracking' ),
		array( 'old_id', 'old_text', 'old_flags' ),
		array(
			'old_id' => $textIds,
			'bt_text_id=old_id',
			'bt_moved' => 0,
		),
		__METHOD__,
		array( 'DISTINCT' )
	);

	foreach ( $res as $row ) {
		$text = Revision::getRevisionText( $row );
		if ( $text === false ) {
			// Report the ID of the row that failed ($textId was never defined here)
			$this->critical( "Error: cannot load revision text for old_id={$row->old_id}" );
			continue;
		}

		// A false return means this blob should be committed and a fresh one started
		if ( !$trx->addItem( $text, $row->old_id ) ) {
			$this->debug( "[orphan]: committing blob with " . $trx->getSize() . " rows" );
			$trx->commit();
			$trx = new CgzCopyTransaction( $this, $this->orphanBlobClass );
		}
	}
	// Commit whatever is left in the last transaction
	$this->debug( "[orphan]: committing blob with " . $trx->getSize() . " rows" );
	$trx->commit();
}
00612 }
00613
/**
 * Collects texts into a single history blob object and, on commit, stores
 * that blob on a destination cluster and records the new URLs.
 */
class CgzCopyTransaction {
	/** The RecompressTracked job that owns this transaction */
	var $parent;
	/** Name of the blob class to instantiate */
	var $blobClass;
	/** The blob object, or false until the first item is added */
	var $cgz;
	/** Map of text ID => hash returned by the blob's addItem() */
	var $referrers;
	/** Map of text ID => text, kept so the blob can be rebuilt by recompress() */
	var $texts;

	/**
	 * @param $parent RecompressTracked Owning job; used for logging, cluster
	 *   selection, the store handle and moveTextRow()
	 * @param $blobClass string Name of the blob class to instantiate
	 */
	function __construct( $parent, $blobClass ) {
		$this->blobClass = $blobClass;
		$this->cgz = false;
		$this->texts = array();
		// Start as an empty array so array functions never see null
		$this->referrers = array();
		$this->parent = $parent;
	}

	/**
	 * Add a text to the blob. The item is always added.
	 *
	 * @param $text string The text
	 * @param $textId int Its text ID
	 * @return bool The blob's isHappy() result after adding; callers commit
	 *   and start a new transaction when this is false
	 */
	function addItem( $text, $textId ) {
		if ( !$this->cgz ) {
			$class = $this->blobClass;
			$this->cgz = new $class;
		}
		$hash = $this->cgz->addItem( $text );
		$this->referrers[$textId] = $hash;
		$this->texts[$textId] = $text;
		return $this->cgz->isHappy();
	}

	/**
	 * @return int Number of texts currently held
	 */
	function getSize() {
		return count( $this->texts );
	}

	/**
	 * Rebuild the blob and the referrer map from the texts currently held.
	 * Used after some texts have been removed.
	 */
	function recompress() {
		$class = $this->blobClass;
		$this->cgz = new $class;
		$this->referrers = array();
		foreach ( $this->texts as $textId => $text ) {
			$hash = $this->cgz->addItem( $text );
			$this->referrers[$textId] = $hash;
		}
	}

	/**
	 * Store the blob on a destination cluster, write the new URLs to
	 * blob_tracking and, unless the job is in copy-only mode, update the
	 * text rows. Does nothing if no texts were added.
	 */
	function commit() {
		$originalCount = count( $this->texts );
		if ( !$originalCount ) {
			return;
		}

		// Re-read the tracking rows on the master, FOR UPDATE, and drop any
		// text that has been marked as moved in the meantime.
		$dbw = wfGetDB( DB_MASTER );
		$dbw->begin();
		$res = $dbw->select( 'blob_tracking',
			array( 'bt_text_id', 'bt_moved' ),
			array( 'bt_text_id' => array_keys( $this->referrers ) ),
			__METHOD__, array( 'FOR UPDATE' ) );
		$dirty = false;
		foreach ( $res as $row ) {
			if ( $row->bt_moved ) {
				# This row has already been moved, remove it
				$this->parent->debug( "TRX: conflict detected in old_id={$row->bt_text_id}" );
				unset( $this->texts[$row->bt_text_id] );
				$dirty = true;
			}
		}

		// If anything was dropped, the blob has to be rebuilt without it
		if ( $dirty ) {
			if ( !count( $this->texts ) ) {
				// Everything has been moved already
				if ( $originalCount > 1 ) {
					// Losing a whole multi-item blob is suspicious. This class has
					// no critical() of its own, so log through the parent job.
					$this->parent->critical( "Warning: concurrent operation detected, are there two conflicting " .
						"processes running, doing the same job?" );
				}
				// Nothing was written; close the transaction opened above so it
				// does not stay open after this early return.
				$dbw->commit();
				return;
			}
			$this->recompress();
		}

		// Store the serialized blob on the target cluster inside an explicit transaction
		$targetCluster = $this->parent->getTargetCluster();
		$store = $this->parent->store;
		$targetDB = $store->getMaster( $targetCluster );
		$targetDB->clearFlag( DBO_TRX );
		$targetDB->begin();
		$baseUrl = $store->store( $targetCluster, serialize( $this->cgz ) );

		// Record the new URL of every item in the tracking table
		foreach ( $this->referrers as $textId => $hash ) {
			$url = $baseUrl . '/' . $hash;
			$dbw->update( 'blob_tracking',
				array( 'bt_new_url' => $url ),
				array(
					'bt_text_id' => $textId,
					'bt_moved' => 0, # Check for concurrent conflicting update
				),
				__METHOD__
			);
		}

		// Commit the blob first, then the tracking rows that point at it
		$targetDB->commit();
		$dbw->commit();

		// Outside copy-only mode, repoint the text rows at the new URLs
		if ( !$this->parent->copyOnly ) {
			foreach ( $this->referrers as $textId => $hash ) {
				$url = $baseUrl . '/' . $hash;
				$this->parent->moveTextRow( $textId, $url );
			}
		}
	}
}
00748