#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;
use warnings;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

use URI::Escape;

# Mediawiki filenames can contain forward slashes. This variable decides
# by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => '%2F';

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => '0000000000000000000000000000000000000000';

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

use constant EMPTY => q{};

# Number of pages taken into account at once in submodule get_mw_page_list
use constant SLICE_SIZE => 50;

# Number of linked mediafiles to get at once in get_linked_mediafiles
# The query is split in small batches because of the MW API limit of
# the number of links to be returned (500 links max).
use constant BATCH_SIZE => 10;

use constant HTTP_CODE_OK => 200;

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote.${remotename}.pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.${remotename}.categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport");
chomp($import_media);
$import_media = ($import_media eq 'true');

# Export media files on push
my $export_media = run_git("config --get --bool remote.${remotename}.mediaexport");
chomp($export_media);
$export_media = !($export_media eq 'false');

my $wiki_login = run_git("config --get remote.${remotename}.mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote.${remotename}.mwPassword");
my $wiki_domain = run_git("config --get remote.${remotename}.mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote.${remotename}.shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq 'true');
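
# Illustrative configuration only: the remote name "origin" and the
# values below are examples, not defaults. These are the settings read
# back by the run_git() calls above:
#     git config remote.origin.pages "Main_Page Sandbox"
#     git config remote.origin.categories "Foo"
#     git config remote.origin.mediaimport true
#     git config remote.origin.shallow true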

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when we have a wiki with lots of pages and we fetch
# the revisions quite often so that they concern only a few pages.
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revisions
my $fetch_strategy = run_git("config --get remote.${remotename}.fetchStrategy");
if (!$fetch_strategy) {
	$fetch_strategy = run_git('config --get mediawiki.fetchStrategy');
}
chomp($fetch_strategy);
if (!$fetch_strategy) {
	$fetch_strategy = 'by_page';
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.${remotename}.dumbPush");
if (!$dumb_push) {
	$dumb_push = run_git('config --get --bool mediawiki.dumbPush');
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq 'true');

my $wiki_name = $url;
$wiki_name =~ s{[^/]*://}{};
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
while (<STDIN>) {
	chomp;

	if (!parse_command($_)) {
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

sub parse_command {
	my ($line) = @_;
	my @cmd = split(/ /, $line);
	if (!defined $cmd[0]) {
		return 0;
	}
	if ($cmd[0] eq 'capabilities') {
		die("Too many arguments for capabilities\n")
		    if (defined($cmd[1]));
		mw_capabilities();
	} elsif ($cmd[0] eq 'list') {
		die("Too many arguments for list\n") if (defined($cmd[2]));
		mw_list($cmd[1]);
	} elsif ($cmd[0] eq 'import') {
		die("Invalid argument for import\n")
		    if ($cmd[1] eq EMPTY);
		die("Too many arguments for import\n")
		    if (defined($cmd[2]));
		mw_import($cmd[1]);
	} elsif ($cmd[0] eq 'option') {
		die("Invalid arguments for option\n")
		    if ($cmd[1] eq EMPTY || $cmd[2] eq EMPTY);
		die("Too many arguments for option\n")
		    if (defined($cmd[3]));
		mw_option($cmd[1], $cmd[2]);
	} elsif ($cmd[0] eq 'push') {
		mw_push($cmd[1]);
	} else {
		print {*STDERR} "Unknown command. Aborting...\n";
		return 0;
	}
	return 1;
}
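
# For reference, a rough sketch of how Git typically drives this helper
# through the remote-helper protocol parsed above (commands arrive on
# STDIN, answers go to STDOUT); the exact sequence depends on the Git
# command being run:
#     capabilities             -> "refspec ...", "import", "list", "push", blank line
#     list                     -> "? refs/heads/master", "@refs/heads/master HEAD", blank line
#     import refs/heads/master -> a git fast-import stream, terminated by "done"
#     push refs/heads/master:refs/heads/master
#                              -> "ok refs/heads/master" or "error ...", then a blank line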

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "${url}/api.php";
	if ($wiki_login) {
		my %credential = (
			'url' => $url,
			'username' => $wiki_login,
			'password' => $wiki_passwd
		);
		Git::credential(\%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			Git::credential(\%credential, 'approve');
			print {*STDERR} qq(Logged in mediawiki user "$credential{username}".\n);
		} else {
			print {*STDERR} qq(Failed to log in mediawiki user "$credential{username}" on ${url}\n);
			print {*STDERR} ' (error ' .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			Git::credential(\%credential, 'reject');
			exit 1;
		}
	}
	return;
}

sub fatal_mw_error {
	my $action = shift;
	print STDERR "fatal: could not $action.\n";
	print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
	if ($url =~ /^https/) {
		print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
		print STDERR "fatal: and the SSL certificate is correct.\n";
	} else {
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
	}
	print STDERR "fatal: (error " .
		$mediawiki->{error}->{code} . ': ' .
		$mediawiki->{error}->{details} . ")\n";
	exit 1;
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
	return;
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @{$page_list};
	while (@some_pages) {
		my $last_page = SLICE_SIZE;
		if ($#some_pages < $last_page) {
			$last_page = $#some_pages;
		}
		my @slice = @some_pages[0..$last_page];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[(SLICE_SIZE + 1)..$#some_pages];
	}
	return;
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:${category}";
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details} . "\n";
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("get the list of wiki pages");
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
	return;
}

# queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		fatal_mw_error("query the list of wiki pages");
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print {*STDERR} "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
	return;
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print {*STDERR} "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print {*STDERR} "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print {*STDERR} (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || 'encoding(UTF-8)');
	open(my $git, "-|:${encoding}", "git ${args}")
	    or die "Unable to fork: $!\n";
	my $res = do {
		local $/ = undef;
		<$git>
	};
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Get the list of all pages for media files from the API;
	# they are in a different namespace, and only one namespace
	# can be queried at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id('File'),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print {*STDERR} "fatal: could not get the list of pages for media files.\n";
		print {*STDERR} "fatal: '$url' does not appear to be a mediawiki\n";
		print {*STDERR} "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
	return;
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map { $_->{title} } values(%{$pages});

	my $batch = BATCH_SIZE;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files could be included or linked from
		# a page, get all related
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id('File'),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles
				    = map { $_->{title} } @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles
				    = map { $_->{title} } @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch + 1)..$#titles];
	}
	return;
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Search if a media file with the given timestamp exists on
	# MediaWiki. If so, download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:${filename}",
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined, it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# Mediawiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
		print {*STDERR} "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $download_url = shift;

	my $response = $mediawiki->{ua}->get($download_url);
	if ($response->code == HTTP_CODE_OK) {
		return $response->decoded_content;
	} else {
		print {*STDERR} "Error downloading mediafile from:\n";
		print {*STDERR} "URL: ${download_url}\n";
		print {*STDERR} 'Server response: ' . $response->code . q{ } . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=${remotename}/mediawiki show refs/mediawiki/${remotename}/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq 'mediawiki_revision:')) {
		print {*STDERR} 'No previous mediawiki revision found';
		$lastrevision_number = 0;
	} else {
		# Notes are formatted : mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print {*STDERR} "Last local mediawiki revision found is ${lastrevision_number}";
	}
	return $lastrevision_number;
}

# Get the last remote revision without taking into account which pages are
# tracked or not. This function makes a single request to the wiki, thus
# avoiding a loop over all tracked pages. This is useful for the fetch-by-rev
# option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print {*STDERR} "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print {*STDERR} "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page; a page ends with a single \n.
	# This function right-trims the string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq EMPTY && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = EMPTY;
	}
	# This \n is important. This is due to MediaWiki's way of handling the end of files.
	return "${string}\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s{@{[SLASH_REPLACEMENT]}}{/}g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s{/}{@{[SLASH_REPLACEMENT]}}g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf('%c', hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
	return;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode {*STDOUT}, ':raw';
	print {*STDOUT} 'data ', bytes::length($content), "\n", $content;
	binmode {*STDOUT}, ':encoding(UTF-8)';
	return;
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print {*STDOUT} "refspec refs/heads/*:refs/mediawiki/${remotename}/*\n";
	print {*STDOUT} "import\n";
	print {*STDOUT} "list\n";
	print {*STDOUT} "push\n";
	print {*STDOUT} "\n";
	return;
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, and HEAD pointing to it.
	print {*STDOUT} "? refs/heads/master\n";
	print {*STDOUT} "\@refs/heads/master HEAD\n";
	print {*STDOUT} "\n";
	return;
}

sub mw_option {
	print {*STDERR} "remote-helper command 'option $_[0]' not yet implemented\n";
	print {*STDOUT} "unsupported\n";
	return;
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last if (!$result->{'query-continue'});
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print {*STDERR} " Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print {*STDERR} " Found ${revnum} revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};
		print {*STDERR} "page ${n}/", scalar(@pages), ': ', $page->{title}, "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub fe_escape_path {
	my $path = shift;
	$path =~ s/\\/\\\\/g;
	$path =~ s/"/\\"/g;
	$path =~ s/\n/\\n/g;
	return qq("${path}");
}

sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print {*STDOUT} "commit refs/mediawiki/${remotename}/master\n";
	print {*STDOUT} "mark :${n}\n";
	print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print {*STDOUT} "from refs/mediawiki/${remotename}/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print {*STDOUT} 'M 644 inline ' .
		    fe_escape_path("${title}.mw") . "\n";
		literal_data($content);
		if (%mediafile) {
			print {*STDOUT} 'M 644 inline '
			    . fe_escape_path($mediafile{title}) . "\n";
			literal_data_raw($mediafile{content});
		}
		print {*STDOUT} "\n\n";
	} else {
		print {*STDOUT} 'D ' .
		    fe_escape_path("${title}.mw") . "\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print {*STDOUT} "reset refs/notes/${remotename}/mediawiki\n";
	}
	print {*STDOUT} "commit refs/notes/${remotename}/mediawiki\n";
	print {*STDOUT} "committer ${author} <${author}\@${wiki_name}> " . $date->epoch . " +0000\n";
	literal_data('Note added by git-mediawiki during import');
	if (!$full_import && $n == 1) {
		print {*STDOUT} "from refs/notes/${remotename}/mediawiki^0\n";
	}
	print {*STDOUT} "N inline :${n}\n";
	literal_data("mediawiki_revision: $commit{mw_revision}");
	print {*STDOUT} "\n\n";
	return;
}

# parse a sequence of
# <cmd> <arg1>
# <cmd> <arg2>
# \n
# (like batch sequence of import and sequence of push statements)
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ /^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: $_\n");
		}
	}
	return;
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs('import'));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print {*STDOUT} "done\n";
	return;
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
	if ($ref eq 'HEAD') {
		return;
	}

	mw_connect_maybe();

	print {*STDERR} "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print {*STDERR} ", fetching from beginning.\n";
	} else {
		print {*STDERR} ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq 'by_rev') {
		print {*STDERR} "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq 'by_page') {
		print {*STDERR} "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print {*STDERR} qq(fatal: invalid fetch strategy "${fetch_strategy}".\n);
		print {*STDERR} "Check your configuration variables remote.${remotename}.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print {*STDERR} "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
	return;
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map { $_->{revid} } @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@{$revision_ids}) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid\n";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision ${pagerevid}.\n";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print {*STDERR} "${n}/", scalar(@{$revision_ids}),
				": Skipping revision #$rev->{revid} of ${page_title}\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiates classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id('File')) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Else do a commit only for that page.
		print {*STDERR} "${n}/", scalar(@{$revision_ids}), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git('config --bool advice.pushNonFastForward');
	chomp($advice);
	if ($advice ne 'false') {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print {*STDERR} "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print {*STDERR} "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print {*STDERR} "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print {*STDOUT} qq(error $_[0] "non-fast-forward"\n);
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:${complete_file_name}";
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print {*STDERR} "${complete_file_name} is not a permitted file on this wiki.\n";
		print {*STDERR} "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print {*STDERR} "Failed to delete file on remote wiki\n";
			print {*STDERR} "Check your permissions on the remote site. Error code:\n";
			print {*STDERR} $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob ${new_sha1}", 'raw');
		if ($content ne EMPTY) {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"${url}/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details} . "\n";
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print {*STDERR} "Pushed file: ${new_sha1} - ${complete_file_name}.\n";
		} else {
			print {*STDERR} "Empty file ${complete_file_name} not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = EMPTY;
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = EMPTY;
	}
	if ($extension eq 'mw') {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id('File') && (!$export_media)) {
			print {*STDERR} "Ignoring media file related page: ${complete_file_name}\n";
			return ($oldrevid, 'ok');
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob ${new_sha1}");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print {*STDERR} 'Warning: Error ' .
				    $mediawiki->{error}->{code} .
				    ' from mediawiki: ' . $mediawiki->{error}->{details} .
				    ".\n";
				return ($oldrevid, 'non-fast-forward');
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
				    $mediawiki->{error}->{code} .
				    ' from mediawiki: ' . $mediawiki->{error}->{details} . "\n";
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print {*STDERR} "Pushed file: ${new_sha1} - ${title}\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print {*STDERR} "Ignoring media file ${title}\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, 'ok');
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs('push'));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
		    or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>\n");
		if ($force) {
			print {*STDERR} "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq EMPTY) {
			print {*STDERR} "Cannot delete remote branch on a MediaWiki\n";
			print {*STDOUT} "error ${remote} cannot delete\n";
			next;
		}
		if ($remote ne 'refs/heads/master') {
			print {*STDERR} "Only push to the branch 'master' is supported on a MediaWiki\n";
			print {*STDOUT} "error ${remote} only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print {*STDOUT} "\n";

	if ($pushed && $dumb_push) {
		print {*STDERR} "Just pushed some revisions to MediaWiki.\n";
		print {*STDERR} "The pushed revisions now have to be re-imported, and your current branch\n";
		print {*STDERR} "needs to be updated with these re-imported commits. You can do this with\n";
		print {*STDERR} "\n";
		print {*STDERR} "  git pull --rebase\n";
		print {*STDERR} "\n";
	}
	return;
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print {*STDERR} ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse ${local} 2>/dev/null");
	chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/${remotename}/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print {*STDERR} "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents ${local} ^${parsed_sha1}"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ /^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(/ /, $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif (!$line =~ /^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: ${line}\n";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				print {*STDERR} "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print {*STDERR} "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children ${local}");
		my @history = split(/\n/, $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/[ \n]/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z ${sha1_child} ${sha1_commit}");
		# TODO: we could detect rename, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git(qq(log --no-walk --format="%s" ${sha1_commit}));
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq 'non-fast-forward') {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne 'ok') {
				die("Unknown error from mw_push_file()\n");
			}
		}
		if (!$dumb_push) {
			run_git(qq(notes --ref=${remotename}/mediawiki add -f -m "mediawiki_revision: ${mw_revision}" ${sha1_commit}));
			run_git(qq(update-ref -m "Git-MediaWiki push" refs/mediawiki/${remotename}/master ${sha1_commit} ${sha1_child}));
		}
	}

	print {*STDOUT} "ok ${remote}\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map { $_->{ext} } @{$result->{query}->{fileextensions}};
	my %hashFile = map { $_ => 1 } @file_extensions;

	return %hashFile;
}

# In memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in form:
		# "Name_of_namespace:Id_namespace", ex.: "File:6".
		my @temp = split(/\n/,
				 run_git("config --get-all remote.${remotename}.namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print {*STDERR} "Namespace ${name} not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. French Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	if (!defined $ns) {
		print {*STDERR} "No such namespace ${name} on MediaWiki.\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as special value for non-existing namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git(qq(config --add remote.${remotename}.namespaceCache "${name}:${store_id}"));
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	my $namespace = shift;
	if ($namespace =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}
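
# Usage sketch (illustrative, not part of the protocol handling above):
# once this script is installed in Git's exec path as
# "git-remote-mediawiki", Git invokes it for "mediawiki::" URLs, e.g.
# the hypothetical
#     git clone mediawiki::http://example.com/wiki
# after which fetch and push on that remote go through the functions
# defined in this file.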