00001 <?php
00002
00003 require( dirname( __FILE__ ) .'/../commandLine.inc' );
00004
00005
# Script entry point. $args comes from commandLine.inc; at least one cluster
# name is required, otherwise print the usage text and exit with status 1.
if ( !count( $args ) ) {
	$usageLines = array(
		"Usage: php trackBlobs.php <cluster> [... <cluster>]\n",
		"Adds blobs from a given ES cluster to the blob_tracking table\n",
		"Automatically deletes the tracking table and starts from the start again when restarted.\n",
	);
	echo implode( '', $usageLines );

	exit( 1 );
}

# Every command-line argument is treated as a cluster name.
$tracker = new TrackBlobs( $args );
$tracker->run();
00015
/**
 * Rebuilds the blob_tracking and blob_orphans tables for a set of external
 * storage (ES) clusters.
 *
 * The work is done in three passes:
 *  1. trackRevisions()  - text rows referenced from the revision table,
 *  2. trackOrphanText() - text rows pointing at a cluster that pass 1 did not record,
 *  3. findOrphanBlobs() - blobs present on a cluster that no text row points at
 *                         (only when the gmp extension is available).
 */
class TrackBlobs {
	/** List of cluster names, and the cached SQL condition built from them */
	var $clusters, $textClause;
	/** True when the gmp extension is loaded and orphan blobs can be searched for */
	var $doBlobOrphans = false;
	/** Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
	var $trackedBlobs = array();

	/** Number of rows fetched / inserted per query */
	var $batchSize = 1000;
	/** Number of batches between progress reports */
	var $reportingInterval = 10;

	/**
	 * @param $clusters Array of cluster names to track
	 */
	function __construct( $clusters ) {
		$this->clusters = $clusters;
		$this->doBlobOrphans = extension_loaded( 'gmp' );
		if ( $this->doBlobOrphans ) {
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all passes in order. The orphan blob pass is skipped without gmp.
	 */
	function run() {
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Drop any existing tracking tables and source blob_tracking.sql again.
	 *
	 * Each table is checked separately so that a previous run which left only
	 * one of the two tables behind does not make the DROP fail.
	 */
	function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		# Presumably this file creates both tables -- it is not visible from here
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition matching old_text pointers ("DB://<cluster>/...")
	 * for any of the configured clusters. The result is built once and cached.
	 *
	 * @return String: LIKE conditions joined with OR
	 */
	function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$conds = array();
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text LIKE ' .
					$dbr->addQuotes( $dbr->escapeLike( "DB://$cluster/" ) . '%' );
			}
			$this->textClause = implode( ' OR ', $conds );
		}
		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form
	 * DB://<cluster>/<id> or DB://<cluster>/<id>/<hex hash>.
	 *
	 * @param $text String: the pointer text
	 * @return Array with keys 'cluster' (string), 'id' (int) and 'hash'
	 *         (hex string, or null when the pointer has no hash part);
	 *         false if the text is not a valid pointer.
	 */
	function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			# Group 3 is the hash; preg_match() leaves it unset when absent
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table, in batches ordered by rev_id, for text rows
	 * flagged as external that point at one of the configured clusters, and
	 * record each of them in blob_tracking.
	 */
	function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					"old_flags LIKE '%external%'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				# Always advance the cursor, even for rows that get skipped
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table, in batches ordered by old_id, for external text
	 * rows pointing at one of the configured clusters that have no row in
	 * blob_tracking yet, and record them with bt_page and bt_rev_id set to 0.
	 */
	function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			# LEFT JOIN + "bt_text_id IS NULL" keeps only text rows not tracked yet
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					"old_flags LIKE '%external%'",
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				# Always advance the cursor, even for rows that get skipped
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each configured cluster, read every blob_id from the cluster's blobs
	 * table into a bitfield, subtract the blobs recorded by the earlier passes,
	 * and insert what is left into blob_orphans.
	 *
	 * Requires the gmp extension; clusters that cannot be connected to are
	 * reported and skipped.
	 */
	function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			# The table name may be overridden per connection; default is "blobs"
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			# Build a bitmap of the blob IDs that actually exist on the cluster
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			# Orphans = blobs that exist AND are NOT tracked
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			# Walk the set bits and insert them in batchSize chunks, so that a
			# cluster with many orphans does not produce one enormous INSERT
			$insertBatch = array();
			$numOrphans = 0;
			$id = 0;
			while ( true ) {
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					# gmp_scan1() returns -1 when no further set bit exists
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				++$numOrphans;
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}
				# Resume the scan just past the bit that was found
				++$id;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}

			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}