00001 <?php
/**
 * Pseudo-namespace keys used in GenerateSitemap::$priorities.
 *
 * They hold the fallback priority for namespaces that have no explicit
 * entry of their own: GS_MAIN is used when MWNamespace::isMain() reports
 * the namespace as a main (subject) namespace, GS_TALK otherwise.
 */
define( 'GS_MAIN', -2 );
define( 'GS_TALK', -1 );
/**
 * Writes sitemap files for the pages of a wiki, one series of files per
 * namespace, together with a sitemap index file listing every file written.
 */
class GenerateSitemap {
	/**
	 * Maximum number of URLs written to a single sitemap file.
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Maximum (uncompressed) length of a single sitemap file, in bytes.
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * Directory the files are written to, including a trailing directory
	 * separator, or null when no path was given.
	 *
	 * @var string|null
	 */
	public $fspath;

	/**
	 * Not read or written by any method of this class; kept so existing
	 * code that touches the property keeps working.
	 *
	 * @var mixed
	 */
	public $path;

	/**
	 * Whether sitemap files are written through the gz* functions.
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Lengths computed by generateLimit(): [0] file header, [1] one entry
	 * for a 255-byte title, [2] file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Sitemap priority per namespace.
	 *
	 * @var array
	 */
	public $priorities = array(
		// Fallback for unlisted namespaces that MWNamespace::isMain() accepts
		GS_MAIN => '0.5',
		// Fallback for every other unlisted namespace
		GS_TALK => '0.1',
		// Explicitly listed namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_FILE => '0.5',
		NS_FILE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * Namespaces to generate sitemaps for, filled by generateNamespaces().
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp taken at construction time; written as the
	 * <lastmod> of every index entry.
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database handle returned by wfGetDB( DB_SLAVE ).
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file.
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false when
	 * none is open.
	 *
	 * @var resource|false
	 */
	public $file;

	/**
	 * Handle of php://stderr, used by debug().
	 *
	 * @var resource
	 */
	public $stderr;

	/**
	 * Set up limits, the database handle, the namespace list and the
	 * sitemap index file.
	 *
	 * Declared as __construct(): a method named after the class is no
	 * longer treated as a constructor by PHP 8.
	 *
	 * @param $fspath string|null Directory to write to; created if missing
	 * @param $compress bool Whether to gzip the sitemap files
	 */
	public function __construct( $fspath, $compress ) {
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $fspath );

		$this->compress = $compress;

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml";
		$this->findex = fopen( $indexPath, 'wb' );
		if ( $this->findex === false ) {
			// Nothing useful can be produced without the index file.
			die( "Can not open $indexPath for writing.\n" );
		}
	}

	/**
	 * Create the target directory if needed and return its resolved path
	 * with a trailing directory separator.
	 *
	 * @param $fspath string|null
	 * @return string|null null when $fspath is not set
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}
		# Create directory if needed (recursively, so nested paths work)
		if ( $fspath && !is_dir( $fspath ) ) {
			mkdir( $fspath, 0755, true ) or die( "Can not create directory $fspath.\n" );
		}

		$resolved = realpath( $fspath );
		if ( $resolved === false ) {
			// false . DIRECTORY_SEPARATOR would silently point at the
			// filesystem root, so stop instead.
			die( "Can not resolve directory $fspath.\n" );
		}

		return $resolved . DIRECTORY_SEPARATOR;
	}

	/**
	 * Fill $this->namespaces, either from $wgSitemapNamespaces when that
	 * global is an array, or with every namespace present in the page table.
	 */
	public function generateNamespaces() {
		global $wgSitemapNamespaces;

		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a namespace: its entry in $this->priorities if
	 * there is one, otherwise the result of guessPriority().
	 *
	 * @param $namespace int
	 * @return string
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * Pick the GS_MAIN or GS_TALK fallback priority for a namespace,
	 * depending on what MWNamespace::isMain() returns for it.
	 *
	 * @param $namespace int
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isMain( $namespace )
			? $this->priorities[GS_MAIN]
			: $this->priorities[GS_TALK];
	}

	/**
	 * Select page_namespace, page_title and page_touched of every page in
	 * the given namespace.
	 *
	 * @param $namespace int
	 * @return mixed Result of the database select() call
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Write the sitemap index and all sitemap files.
	 *
	 * For each namespace the pages are written to a numbered series of
	 * files. A new file is started whenever adding the entries of the next
	 * page would push the current file over $url_limit URLs or $size_limit
	 * bytes. All entries of one page (the page itself plus one per extra
	 * language variant) are built first, so both limits are checked against
	 * the real number and length of the entries about to be written.
	 */
	public function main() {
		global $wgContLang;

		// Variants other than the content language itself; the same list
		// applies to every page, so it is collected once.
		$extraVariants = array();
		if ( $wgContLang->hasVariants() ) {
			$defaultCode = $wgContLang->getCode();
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode !== $defaultCode ) {
					$extraVariants[] = $vCode;
				}
			}
		}

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$headerLength = $this->limit[0];
			$footerLength = $this->limit[2];
			$priority = $this->priority( $namespace );
			$length = $headerLength;
			$urlCount = 0;
			$smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->debug( "$namespace ($fns)" );

			while ( $row = $this->dbr->fetchObject( $res ) ) {
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );

				$entries = $this->fileEntry( $title->getFullURL(), $date, $priority );
				$entryCount = 1;
				foreach ( $extraVariants as $vCode ) {
					$entries .= $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $priority );
					$entryCount++;
				}
				$entriesLength = strlen( $entries );

				$needNewFile = $this->file === false
					|| $urlCount + $entryCount > $this->url_limit
					|| $length + $entriesLength + $footerLength > $this->size_limit;

				if ( $needNewFile ) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					if ( $this->file === false ) {
						die( "Can not open {$this->fspath}{$filename} for writing.\n" );
					}
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t{$this->fspath}{$filename}" );
					$length = $headerLength;
					$urlCount = 0;
				}

				$this->write( $this->file, $entries );
				$length += $entriesLength;
				$urlCount += $entryCount;
			}

			if ( $this->file !== false ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}

		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Open a file with gzopen() or fopen(), depending on $this->compress.
	 *
	 * @param $file string
	 * @param $flags string
	 * @return resource|false
	 */
	public function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * Write to a handle with gzwrite() or fwrite(), depending on
	 * $this->compress.
	 *
	 * @param $handle resource
	 * @param $str string
	 */
	public function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Close a handle with gzclose() or fclose(), depending on
	 * $this->compress.
	 *
	 * @param $handle resource
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Build the name of a sitemap file from the wiki ID, the namespace and
	 * a running number; ".gz" is appended when compressing.
	 *
	 * @param $namespace int
	 * @param $count int
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-" . wfWikiID() . "-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML declaration line.
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML namespace URI used for both index and sitemap files.
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Escape text for use as XML character data.
	 *
	 * @param $text string
	 * @return string
	 */
	private function xmlEscape( $text ) {
		return htmlspecialchars( $text, ENT_QUOTES, 'UTF-8' );
	}

	/**
	 * Return the opening of the sitemap index file.
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the index entry for one sitemap file.
	 *
	 * NOTE(review): only the bare file name is written to <loc>; the
	 * sitemap protocol appears to expect a full URL there - confirm.
	 *
	 * @param $filename string
	 * @return string
	 */
	public function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			"\t\t<loc>" . $this->xmlEscape( $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the closing tag of the sitemap index file.
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the opening of a sitemap file.
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the <url> entry for one page.
	 *
	 * The URL is XML-escaped: URLs carrying a query string contain "&",
	 * which must not appear raw in the document.
	 *
	 * @param $url string
	 * @param $date string
	 * @param $priority string
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			"\t\t<loc>" . $this->xmlEscape( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the closing tag of a sitemap file.
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a line to the stderr handle opened by the constructor.
	 *
	 * @param $str string
	 */
	public function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Populate $this->limit with the length of the file header, of one
	 * entry and of the file footer.
	 *
	 * The entry is built for a 255-byte title (63 four-byte sequences plus
	 * one three-byte sequence) - presumably the longest title the wiki
	 * allows; confirm before relying on [1] as an upper bound.
	 *
	 * @param $namespace int
	 */
	public function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );
		$now = wfTimestamp( TS_ISO_8601, wfTimestamp() );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getFullURL(), $now, $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
00472
// Print the usage text and stop when --help is given on the command line.
// die() with an integer prints nothing and uses it as the exit status.
if ( in_array( '--help', $argv, true ) ) {
	echo <<<EOT
Usage: php generateSitemap.php [options]
	--help			show this message

	--fspath=<path>		The file system path to save to, e.g /tmp/sitemap
				Saves to current directory if not given.

	--server=<server>	The protocol and server name to use in URLs, e.g.
				http://en.wikipedia.org. This is sometimes necessary because
				server name detection may fail in command line scripts.

	--compress=[yes|no]	compress the sitemap files, default yes

EOT;
	die( -1 );
}
00490
// Options that take a value. NOTE(review): presumably read by
// commandLine.inc, which is expected to populate $options - confirm there.
$optionsWithArgs = array( 'fspath', 'server', 'compress' );
require_once( dirname( __FILE__ ) . '/commandLine.inc' );

// Let --server override the server name used when building URLs.
if ( isset( $options['server'] ) ) {
	$wgServer = $options['server'];
}

// A missing --fspath is passed on as null; compression stays on unless
// --compress=no was given explicitly.
$gs = new GenerateSitemap(
	isset( $options['fspath'] ) ? $options['fspath'] : null,
	!( isset( $options['compress'] ) && $options['compress'] === 'no' )
);
$gs->main();
00500