1#! /usr/bin/perl 2 3# Copyright (C) 2011 4# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr> 5# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr> 6# Claire Fousse <claire.fousse@ensimag.imag.fr> 7# David Amouyal <david.amouyal@ensimag.imag.fr> 8# Matthieu Moy <matthieu.moy@grenoble-inp.fr> 9# License: GPL v2 or later 10 11# Gateway between Git and MediaWiki. 12# https://github.com/Bibzball/Git-Mediawiki/wiki 13# 14# Known limitations: 15# 16# - Several strategies are provided to fetch modifications from the 17# wiki, but no automatic heuristics is provided, the user has 18# to understand and chose which strategy is appropriate for him. 19# 20# - Git renames could be turned into MediaWiki renames (see TODO 21# below) 22# 23# - login/password support requires the user to write the password 24# cleartext in a file (see TODO below). 25# 26# - No way to import "one page, and all pages included in it" 27# 28# - Multiple remote MediaWikis have not been very well tested. 29 30use strict; 31use MediaWiki::API; 32use DateTime::Format::ISO8601; 33 34# By default, use UTF-8 to communicate with Git and the user 35binmode STDERR,":utf8"; 36binmode STDOUT,":utf8"; 37 38use URI::Escape; 39use IPC::Open2; 40 41use warnings; 42 43# Mediawiki filenames can contain forward slashes. This variable decides by which pattern they should be replaced 44useconstant SLASH_REPLACEMENT =>"%2F"; 45 46# It's not always possible to delete pages (may require some 47# priviledges). Deleted pages are replaced with this content. 48useconstant DELETED_CONTENT =>"[[Category:Deleted]]\n"; 49 50# It's not possible to create empty pages. New empty files in Git are 51# sent with this content instead. 52useconstant EMPTY_CONTENT =>"<!-- empty page -->\n"; 53 54# used to reflect file creation or deletion in diff. 55useconstant NULL_SHA1 =>"0000000000000000000000000000000000000000"; 56 57my$remotename=$ARGV[0]; 58my$url=$ARGV[1]; 59 60# Accept both space-separated and multiple keys in config file. 61# Spaces should be written as _ anyway because we'll use chomp. 62my@tracked_pages=split(/[ \n]/, run_git("config --get-all remote.".$remotename.".pages")); 63chomp(@tracked_pages); 64 65# Just like @tracked_pages, but for MediaWiki categories. 66my@tracked_categories=split(/[ \n]/, run_git("config --get-all remote.".$remotename.".categories")); 67chomp(@tracked_categories); 68 69# Import media files too. 70my$import_media= run_git("config --get --bool remote.".$remotename.".mediaimport"); 71chomp($import_media); 72$import_media= ($import_mediaeq"true"); 73 74my$wiki_login= run_git("config --get remote.".$remotename.".mwLogin"); 75# TODO: ideally, this should be able to read from keyboard, but we're 76# inside a remote helper, so our stdin is connect to git, not to a 77# terminal. 78my$wiki_passwd= run_git("config --get remote.".$remotename.".mwPassword"); 79my$wiki_domain= run_git("config --get remote.".$remotename.".mwDomain"); 80chomp($wiki_login); 81chomp($wiki_passwd); 82chomp($wiki_domain); 83 84# Import only last revisions (both for clone and fetch) 85my$shallow_import= run_git("config --get --bool remote.".$remotename.".shallow"); 86chomp($shallow_import); 87$shallow_import= ($shallow_importeq"true"); 88 89# Fetch (clone and pull) by revisions instead of by pages. This behavior 90# is more efficient when we have a wiki with lots of pages and we fetch 91# the revisions quite often so that they concern only few pages. 92# Possible values: 93# - by_rev: perform one query per new revision on the remote wiki 94# - by_page: query each tracked page for new revision 95my$fetch_strategy= run_git("config --get remote.$remotename.fetchStrategy"); 96unless($fetch_strategy) { 97$fetch_strategy= run_git("config --get mediawiki.fetchStrategy"); 98} 99chomp($fetch_strategy); 100unless($fetch_strategy) { 101$fetch_strategy="by_page"; 102} 103 104# Dumb push: don't update notes and mediawiki ref to reflect the last push. 105# 106# Configurable with mediawiki.dumbPush, or per-remote with 107# remote.<remotename>.dumbPush. 108# 109# This means the user will have to re-import the just-pushed 110# revisions. On the other hand, this means that the Git revisions 111# corresponding to MediaWiki revisions are all imported from the wiki, 112# regardless of whether they were initially created in Git or from the 113# web interface, hence all users will get the same history (i.e. if 114# the push from Git to MediaWiki loses some information, everybody 115# will get the history with information lost). If the import is 116# deterministic, this means everybody gets the same sha1 for each 117# MediaWiki revision. 118my$dumb_push= run_git("config --get --bool remote.$remotename.dumbPush"); 119unless($dumb_push) { 120$dumb_push= run_git("config --get --bool mediawiki.dumbPush"); 121} 122chomp($dumb_push); 123$dumb_push= ($dumb_pusheq"true"); 124 125my$wiki_name=$url; 126$wiki_name=~s/[^\/]*:\/\///; 127# If URL is like http://user:password@example.com/, we clearly don't 128# want the password in $wiki_name. While we're there, also remove user 129# and '@' sign, to avoid author like MWUser@HTTPUser@host.com 130$wiki_name=~s/^.*@//; 131 132# Commands parser 133my$entry; 134my@cmd; 135while(<STDIN>) { 136chomp; 137@cmd=split(/ /); 138if(defined($cmd[0])) { 139# Line not blank 140if($cmd[0]eq"capabilities") { 141die("Too many arguments for capabilities")unless(!defined($cmd[1])); 142 mw_capabilities(); 143}elsif($cmd[0]eq"list") { 144die("Too many arguments for list")unless(!defined($cmd[2])); 145 mw_list($cmd[1]); 146}elsif($cmd[0]eq"import") { 147die("Invalid arguments for import")unless($cmd[1]ne""&& !defined($cmd[2])); 148 mw_import($cmd[1]); 149}elsif($cmd[0]eq"option") { 150die("Too many arguments for option")unless($cmd[1]ne""&&$cmd[2]ne""&& !defined($cmd[3])); 151 mw_option($cmd[1],$cmd[2]); 152}elsif($cmd[0]eq"push") { 153 mw_push($cmd[1]); 154}else{ 155print STDERR "Unknown command. Aborting...\n"; 156last; 157} 158}else{ 159# blank line: we should terminate 160last; 161} 162 163BEGIN{ $| =1}# flush STDOUT, to make sure the previous 164# command is fully processed. 165} 166 167########################## Functions ############################## 168 169## credential API management (generic functions) 170 171sub credential_from_url { 172my$url=shift; 173my$parsed= URI->new($url); 174my%credential; 175 176if($parsed->scheme) { 177$credential{protocol} =$parsed->scheme; 178} 179if($parsed->host) { 180$credential{host} =$parsed->host; 181} 182if($parsed->path) { 183$credential{path} =$parsed->path; 184} 185if($parsed->userinfo) { 186if($parsed->userinfo=~/([^:]*):(.*)/) { 187$credential{username} =$1; 188$credential{password} =$2; 189}else{ 190$credential{username} =$parsed->userinfo; 191} 192} 193 194return%credential; 195} 196 197sub credential_read { 198my%credential; 199my$reader=shift; 200my$op=shift; 201while(<$reader>) { 202my($key,$value) =/([^=]*)=(.*)/; 203if(not defined$key) { 204die"ERROR receiving response from git credential$op:\n$_\n"; 205} 206$credential{$key} =$value; 207} 208return%credential; 209} 210 211sub credential_write { 212my$credential=shift; 213my$writer=shift; 214while(my($key,$value) =each(%$credential) ) { 215if($value) { 216print$writer"$key=$value\n"; 217} 218} 219} 220 221sub credential_run { 222my$op=shift; 223my$credential=shift; 224my$pid= open2(my$reader,my$writer,"git credential$op"); 225 credential_write($credential,$writer); 226print$writer"\n"; 227close($writer); 228 229if($opeq"fill") { 230%$credential= credential_read($reader,$op); 231}else{ 232if(<$reader>) { 233die"ERROR while running git credential$op:\n$_"; 234} 235} 236close($reader); 237waitpid($pid,0); 238my$child_exit_status=$?>>8; 239if($child_exit_status!=0) { 240die"'git credential$op' failed with code$child_exit_status."; 241} 242} 243 244# MediaWiki API instance, created lazily. 245my$mediawiki; 246 247sub mw_connect_maybe { 248if($mediawiki) { 249return; 250} 251$mediawiki= MediaWiki::API->new; 252$mediawiki->{config}->{api_url} ="$url/api.php"; 253if($wiki_login) { 254my%credential= credential_from_url($url); 255$credential{username} =$wiki_login; 256$credential{password} =$wiki_passwd; 257 credential_run("fill", \%credential); 258my$request= {lgname =>$credential{username}, 259 lgpassword =>$credential{password}, 260 lgdomain =>$wiki_domain}; 261if($mediawiki->login($request)) { 262 credential_run("approve", \%credential); 263print STDERR "Logged in mediawiki user\"$credential{username}\".\n"; 264}else{ 265print STDERR "Failed to log in mediawiki user\"$credential{username}\"on$url\n"; 266print STDERR " (error ". 267$mediawiki->{error}->{code} .': '. 268$mediawiki->{error}->{details} .")\n"; 269 credential_run("reject", \%credential); 270exit1; 271} 272} 273} 274 275## Functions for listing pages on the remote wiki 276sub get_mw_tracked_pages { 277my$pages=shift; 278 get_mw_page_list(\@tracked_pages,$pages); 279} 280 281sub get_mw_page_list { 282my$page_list=shift; 283my$pages=shift; 284my@some_pages=@$page_list; 285while(@some_pages) { 286my$last=50; 287if($#some_pages<$last) { 288$last=$#some_pages; 289} 290my@slice=@some_pages[0..$last]; 291 get_mw_first_pages(\@slice,$pages); 292@some_pages=@some_pages[51..$#some_pages]; 293} 294} 295 296sub get_mw_tracked_categories { 297my$pages=shift; 298foreachmy$category(@tracked_categories) { 299if(index($category,':') <0) { 300# Mediawiki requires the Category 301# prefix, but let's not force the user 302# to specify it. 303$category="Category:".$category; 304} 305my$mw_pages=$mediawiki->list( { 306 action =>'query', 307 list =>'categorymembers', 308 cmtitle =>$category, 309 cmlimit =>'max'} ) 310||die$mediawiki->{error}->{code} .': ' 311.$mediawiki->{error}->{details}; 312foreachmy$page(@{$mw_pages}) { 313$pages->{$page->{title}} =$page; 314} 315} 316} 317 318sub get_mw_all_pages { 319my$pages=shift; 320# No user-provided list, get the list of pages from the API. 321my$mw_pages=$mediawiki->list({ 322 action =>'query', 323 list =>'allpages', 324 aplimit =>'max' 325}); 326if(!defined($mw_pages)) { 327print STDERR "fatal: could not get the list of wiki pages.\n"; 328print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 329print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 330exit1; 331} 332foreachmy$page(@{$mw_pages}) { 333$pages->{$page->{title}} =$page; 334} 335} 336 337# queries the wiki for a set of pages. Meant to be used within a loop 338# querying the wiki for slices of page list. 339sub get_mw_first_pages { 340my$some_pages=shift; 341my@some_pages= @{$some_pages}; 342 343my$pages=shift; 344 345# pattern 'page1|page2|...' required by the API 346my$titles=join('|',@some_pages); 347 348my$mw_pages=$mediawiki->api({ 349 action =>'query', 350 titles =>$titles, 351}); 352if(!defined($mw_pages)) { 353print STDERR "fatal: could not query the list of wiki pages.\n"; 354print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 355print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 356exit1; 357} 358while(my($id,$page) =each(%{$mw_pages->{query}->{pages}})) { 359if($id<0) { 360print STDERR "Warning: page$page->{title} not found on wiki\n"; 361}else{ 362$pages->{$page->{title}} =$page; 363} 364} 365} 366 367# Get the list of pages to be fetched according to configuration. 368sub get_mw_pages { 369 mw_connect_maybe(); 370 371my%pages;# hash on page titles to avoid duplicates 372my$user_defined; 373if(@tracked_pages) { 374$user_defined=1; 375# The user provided a list of pages titles, but we 376# still need to query the API to get the page IDs. 377 get_mw_tracked_pages(\%pages); 378} 379if(@tracked_categories) { 380$user_defined=1; 381 get_mw_tracked_categories(\%pages); 382} 383if(!$user_defined) { 384 get_mw_all_pages(\%pages); 385} 386if($import_media) { 387print STDERR "Getting media files for selected pages...\n"; 388if($user_defined) { 389 get_linked_mediafiles(\%pages); 390}else{ 391 get_all_mediafiles(\%pages); 392} 393} 394return%pages; 395} 396 397# usage: $out = run_git("command args"); 398# $out = run_git("command args", "raw"); # don't interpret output as UTF-8. 399sub run_git { 400my$args=shift; 401my$encoding= (shift||"encoding(UTF-8)"); 402open(my$git,"-|:$encoding","git ".$args); 403my$res=do{local$/; <$git> }; 404close($git); 405 406return$res; 407} 408 409 410sub get_all_mediafiles { 411my$pages=shift; 412# Attach list of all pages for media files from the API, 413# they are in a different namespace, only one namespace 414# can be queried at the same moment 415my$mw_pages=$mediawiki->list({ 416 action =>'query', 417 list =>'allpages', 418 apnamespace => get_mw_namespace_id("File"), 419 aplimit =>'max' 420}); 421if(!defined($mw_pages)) { 422print STDERR "fatal: could not get the list of pages for media files.\n"; 423print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; 424print STDERR "fatal: make sure '$url/api.php' is a valid page.\n"; 425exit1; 426} 427foreachmy$page(@{$mw_pages}) { 428$pages->{$page->{title}} =$page; 429} 430} 431 432sub get_linked_mediafiles { 433my$pages=shift; 434my@titles=map$_->{title},values(%{$pages}); 435 436# The query is split in small batches because of the MW API limit of 437# the number of links to be returned (500 links max). 438my$batch=10; 439while(@titles) { 440if($#titles<$batch) { 441$batch=$#titles; 442} 443my@slice=@titles[0..$batch]; 444 445# pattern 'page1|page2|...' required by the API 446my$mw_titles=join('|',@slice); 447 448# Media files could be included or linked from 449# a page, get all related 450my$query= { 451 action =>'query', 452 prop =>'links|images', 453 titles =>$mw_titles, 454 plnamespace => get_mw_namespace_id("File"), 455 pllimit =>'max' 456}; 457my$result=$mediawiki->api($query); 458 459while(my($id,$page) =each(%{$result->{query}->{pages}})) { 460my@media_titles; 461if(defined($page->{links})) { 462my@link_titles=map$_->{title}, @{$page->{links}}; 463push(@media_titles,@link_titles); 464} 465if(defined($page->{images})) { 466my@image_titles=map$_->{title}, @{$page->{images}}; 467push(@media_titles,@image_titles); 468} 469if(@media_titles) { 470 get_mw_page_list(\@media_titles,$pages); 471} 472} 473 474@titles=@titles[($batch+1)..$#titles]; 475} 476} 477 478sub get_mw_mediafile_for_page_revision { 479# Name of the file on Wiki, with the prefix. 480my$filename=shift; 481my$timestamp=shift; 482my%mediafile; 483 484# Search if on a media file with given timestamp exists on 485# MediaWiki. In that case download the file. 486my$query= { 487 action =>'query', 488 prop =>'imageinfo', 489 titles =>"File:".$filename, 490 iistart =>$timestamp, 491 iiend =>$timestamp, 492 iiprop =>'timestamp|archivename|url', 493 iilimit =>1 494}; 495my$result=$mediawiki->api($query); 496 497my($fileid,$file) =each( %{$result->{query}->{pages}} ); 498# If not defined it means there is no revision of the file for 499# given timestamp. 500if(defined($file->{imageinfo})) { 501$mediafile{title} =$filename; 502 503my$fileinfo=pop(@{$file->{imageinfo}}); 504$mediafile{timestamp} =$fileinfo->{timestamp}; 505# Mediawiki::API's download function doesn't support https URLs 506# and can't download old versions of files. 507print STDERR "\tDownloading file$mediafile{title}, version$mediafile{timestamp}\n"; 508$mediafile{content} = download_mw_mediafile($fileinfo->{url}); 509} 510return%mediafile; 511} 512 513sub download_mw_mediafile { 514my$url=shift; 515 516my$response=$mediawiki->{ua}->get($url); 517if($response->code==200) { 518return$response->decoded_content; 519}else{ 520print STDERR "Error downloading mediafile from :\n"; 521print STDERR "URL:$url\n"; 522print STDERR "Server response: ".$response->code." ".$response->message."\n"; 523exit1; 524} 525} 526 527sub get_last_local_revision { 528# Get note regarding last mediawiki revision 529my$note= run_git("notes --ref=$remotename/mediawikishow refs/mediawiki/$remotename/master2>/dev/null"); 530my@note_info=split(/ /,$note); 531 532my$lastrevision_number; 533if(!(defined($note_info[0]) &&$note_info[0]eq"mediawiki_revision:")) { 534print STDERR "No previous mediawiki revision found"; 535$lastrevision_number=0; 536}else{ 537# Notes are formatted : mediawiki_revision: #number 538$lastrevision_number=$note_info[1]; 539chomp($lastrevision_number); 540print STDERR "Last local mediawiki revision found is$lastrevision_number"; 541} 542return$lastrevision_number; 543} 544 545# Remember the timestamp corresponding to a revision id. 546my%basetimestamps; 547 548# Get the last remote revision without taking in account which pages are 549# tracked or not. This function makes a single request to the wiki thus 550# avoid a loop onto all tracked pages. This is useful for the fetch-by-rev 551# option. 552sub get_last_global_remote_rev { 553 mw_connect_maybe(); 554 555my$query= { 556 action =>'query', 557 list =>'recentchanges', 558 prop =>'revisions', 559 rclimit =>'1', 560 rcdir =>'older', 561}; 562my$result=$mediawiki->api($query); 563return$result->{query}->{recentchanges}[0]->{revid}; 564} 565 566# Get the last remote revision concerning the tracked pages and the tracked 567# categories. 568sub get_last_remote_revision { 569 mw_connect_maybe(); 570 571my%pages_hash= get_mw_pages(); 572my@pages=values(%pages_hash); 573 574my$max_rev_num=0; 575 576foreachmy$page(@pages) { 577my$id=$page->{pageid}; 578 579my$query= { 580 action =>'query', 581 prop =>'revisions', 582 rvprop =>'ids|timestamp', 583 pageids =>$id, 584}; 585 586my$result=$mediawiki->api($query); 587 588my$lastrev=pop(@{$result->{query}->{pages}->{$id}->{revisions}}); 589 590$basetimestamps{$lastrev->{revid}} =$lastrev->{timestamp}; 591 592$max_rev_num= ($lastrev->{revid} >$max_rev_num?$lastrev->{revid} :$max_rev_num); 593} 594 595print STDERR "Last remote revision found is$max_rev_num.\n"; 596return$max_rev_num; 597} 598 599# Clean content before sending it to MediaWiki 600sub mediawiki_clean { 601my$string=shift; 602my$page_created=shift; 603# Mediawiki does not allow blank space at the end of a page and ends with a single \n. 604# This function right trims a string and adds a \n at the end to follow this rule 605$string=~s/\s+$//; 606if($stringeq""&&$page_created) { 607# Creating empty pages is forbidden. 608$string= EMPTY_CONTENT; 609} 610return$string."\n"; 611} 612 613# Filter applied on MediaWiki data before adding them to Git 614sub mediawiki_smudge { 615my$string=shift; 616if($stringeq EMPTY_CONTENT) { 617$string=""; 618} 619# This \n is important. This is due to mediawiki's way to handle end of files. 620return$string."\n"; 621} 622 623sub mediawiki_clean_filename { 624my$filename=shift; 625$filename=~s/@{[SLASH_REPLACEMENT]}/\//g; 626# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded. 627# Do a variant of URL-encoding, i.e. looks like URL-encoding, 628# but with _ added to prevent MediaWiki from thinking this is 629# an actual special character. 630$filename=~s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge; 631# If we use the uri escape before 632# we should unescape here, before anything 633 634return$filename; 635} 636 637sub mediawiki_smudge_filename { 638my$filename=shift; 639$filename=~s/\//@{[SLASH_REPLACEMENT]}/g; 640$filename=~s/ /_/g; 641# Decode forbidden characters encoded in mediawiki_clean_filename 642$filename=~s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge; 643return$filename; 644} 645 646sub literal_data { 647my($content) =@_; 648print STDOUT "data ", bytes::length($content),"\n",$content; 649} 650 651sub literal_data_raw { 652# Output possibly binary content. 653my($content) =@_; 654# Avoid confusion between size in bytes and in characters 655 utf8::downgrade($content); 656binmode STDOUT,":raw"; 657print STDOUT "data ", bytes::length($content),"\n",$content; 658binmode STDOUT,":utf8"; 659} 660 661sub mw_capabilities { 662# Revisions are imported to the private namespace 663# refs/mediawiki/$remotename/ by the helper and fetched into 664# refs/remotes/$remotename later by fetch. 665print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n"; 666print STDOUT "import\n"; 667print STDOUT "list\n"; 668print STDOUT "push\n"; 669print STDOUT "\n"; 670} 671 672sub mw_list { 673# MediaWiki do not have branches, we consider one branch arbitrarily 674# called master, and HEAD pointing to it. 675print STDOUT "? refs/heads/master\n"; 676print STDOUT "\@refs/heads/masterHEAD\n"; 677print STDOUT "\n"; 678} 679 680sub mw_option { 681print STDERR "remote-helper command 'option$_[0]' not yet implemented\n"; 682print STDOUT "unsupported\n"; 683} 684 685sub fetch_mw_revisions_for_page { 686my$page=shift; 687my$id=shift; 688my$fetch_from=shift; 689my@page_revs= (); 690my$query= { 691 action =>'query', 692 prop =>'revisions', 693 rvprop =>'ids', 694 rvdir =>'newer', 695 rvstartid =>$fetch_from, 696 rvlimit =>500, 697 pageids =>$id, 698}; 699 700my$revnum=0; 701# Get 500 revisions at a time due to the mediawiki api limit 702while(1) { 703my$result=$mediawiki->api($query); 704 705# Parse each of those 500 revisions 706foreachmy$revision(@{$result->{query}->{pages}->{$id}->{revisions}}) { 707my$page_rev_ids; 708$page_rev_ids->{pageid} =$page->{pageid}; 709$page_rev_ids->{revid} =$revision->{revid}; 710push(@page_revs,$page_rev_ids); 711$revnum++; 712} 713last unless$result->{'query-continue'}; 714$query->{rvstartid} =$result->{'query-continue'}->{revisions}->{rvstartid}; 715} 716if($shallow_import&&@page_revs) { 717print STDERR " Found 1 revision (shallow import).\n"; 718@page_revs=sort{$b->{revid} <=>$a->{revid}} (@page_revs); 719return$page_revs[0]; 720} 721print STDERR " Found ",$revnum," revision(s).\n"; 722return@page_revs; 723} 724 725sub fetch_mw_revisions { 726my$pages=shift;my@pages= @{$pages}; 727my$fetch_from=shift; 728 729my@revisions= (); 730my$n=1; 731foreachmy$page(@pages) { 732my$id=$page->{pageid}; 733 734print STDERR "page$n/",scalar(@pages),": ".$page->{title} ."\n"; 735$n++; 736my@page_revs= fetch_mw_revisions_for_page($page,$id,$fetch_from); 737@revisions= (@page_revs,@revisions); 738} 739 740return($n,@revisions); 741} 742 743sub import_file_revision { 744my$commit=shift; 745my%commit= %{$commit}; 746my$full_import=shift; 747my$n=shift; 748my$mediafile=shift; 749my%mediafile; 750if($mediafile) { 751%mediafile= %{$mediafile}; 752} 753 754my$title=$commit{title}; 755my$comment=$commit{comment}; 756my$content=$commit{content}; 757my$author=$commit{author}; 758my$date=$commit{date}; 759 760print STDOUT "commit refs/mediawiki/$remotename/master\n"; 761print STDOUT "mark :$n\n"; 762print STDOUT "committer$author<$author\@$wiki_name> ",$date->epoch," +0000\n"; 763 literal_data($comment); 764 765# If it's not a clone, we need to know where to start from 766if(!$full_import&&$n==1) { 767print STDOUT "from refs/mediawiki/$remotename/master^0\n"; 768} 769if($contentne DELETED_CONTENT) { 770print STDOUT "M 644 inline$title.mw\n"; 771 literal_data($content); 772if(%mediafile) { 773print STDOUT "M 644 inline$mediafile{title}\n"; 774 literal_data_raw($mediafile{content}); 775} 776print STDOUT "\n\n"; 777}else{ 778print STDOUT "D$title.mw\n"; 779} 780 781# mediawiki revision number in the git note 782if($full_import&&$n==1) { 783print STDOUT "reset refs/notes/$remotename/mediawiki\n"; 784} 785print STDOUT "commit refs/notes/$remotename/mediawiki\n"; 786print STDOUT "committer$author<$author\@$wiki_name> ",$date->epoch," +0000\n"; 787 literal_data("Note added by git-mediawiki during import"); 788if(!$full_import&&$n==1) { 789print STDOUT "from refs/notes/$remotename/mediawiki^0\n"; 790} 791print STDOUT "N inline :$n\n"; 792 literal_data("mediawiki_revision: ".$commit{mw_revision}); 793print STDOUT "\n\n"; 794} 795 796# parse a sequence of 797# <cmd> <arg1> 798# <cmd> <arg2> 799# \n 800# (like batch sequence of import and sequence of push statements) 801sub get_more_refs { 802my$cmd=shift; 803my@refs; 804while(1) { 805my$line= <STDIN>; 806if($line=~m/^$cmd (.*)$/) { 807push(@refs,$1); 808}elsif($lineeq"\n") { 809return@refs; 810}else{ 811die("Invalid command in a '$cmd' batch: ".$_); 812} 813} 814} 815 816sub mw_import { 817# multiple import commands can follow each other. 818my@refs= (shift, get_more_refs("import")); 819foreachmy$ref(@refs) { 820 mw_import_ref($ref); 821} 822print STDOUT "done\n"; 823} 824 825sub mw_import_ref { 826my$ref=shift; 827# The remote helper will call "import HEAD" and 828# "import refs/heads/master". 829# Since HEAD is a symbolic ref to master (by convention, 830# followed by the output of the command "list" that we gave), 831# we don't need to do anything in this case. 832if($refeq"HEAD") { 833return; 834} 835 836 mw_connect_maybe(); 837 838print STDERR "Searching revisions...\n"; 839my$last_local= get_last_local_revision(); 840my$fetch_from=$last_local+1; 841if($fetch_from==1) { 842print STDERR ", fetching from beginning.\n"; 843}else{ 844print STDERR ", fetching from here.\n"; 845} 846 847my$n=0; 848if($fetch_strategyeq"by_rev") { 849print STDERR "Fetching & writing export data by revs...\n"; 850$n= mw_import_ref_by_revs($fetch_from); 851}elsif($fetch_strategyeq"by_page") { 852print STDERR "Fetching & writing export data by pages...\n"; 853$n= mw_import_ref_by_pages($fetch_from); 854}else{ 855print STDERR "fatal: invalid fetch strategy\"$fetch_strategy\".\n"; 856print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n"; 857exit1; 858} 859 860if($fetch_from==1&&$n==0) { 861print STDERR "You appear to have cloned an empty MediaWiki.\n"; 862# Something has to be done remote-helper side. If nothing is done, an error is 863# thrown saying that HEAD is refering to unknown object 0000000000000000000 864# and the clone fails. 865} 866} 867 868sub mw_import_ref_by_pages { 869 870my$fetch_from=shift; 871my%pages_hash= get_mw_pages(); 872my@pages=values(%pages_hash); 873 874my($n,@revisions) = fetch_mw_revisions(\@pages,$fetch_from); 875 876@revisions=sort{$a->{revid} <=>$b->{revid}}@revisions; 877my@revision_ids=map$_->{revid},@revisions; 878 879return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash); 880} 881 882sub mw_import_ref_by_revs { 883 884my$fetch_from=shift; 885my%pages_hash= get_mw_pages(); 886 887my$last_remote= get_last_global_remote_rev(); 888my@revision_ids=$fetch_from..$last_remote; 889return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash); 890} 891 892# Import revisions given in second argument (array of integers). 893# Only pages appearing in the third argument (hash indexed by page titles) 894# will be imported. 895sub mw_import_revids { 896my$fetch_from=shift; 897my$revision_ids=shift; 898my$pages=shift; 899 900my$n=0; 901my$n_actual=0; 902my$last_timestamp=0;# Placeholer in case $rev->timestamp is undefined 903 904foreachmy$pagerevid(@$revision_ids) { 905# fetch the content of the pages 906my$query= { 907 action =>'query', 908 prop =>'revisions', 909 rvprop =>'content|timestamp|comment|user|ids', 910 revids =>$pagerevid, 911}; 912 913my$result=$mediawiki->api($query); 914 915if(!$result) { 916die"Failed to retrieve modified page for revision$pagerevid"; 917} 918 919if(!defined($result->{query}->{pages})) { 920die"Invalid revision$pagerevid."; 921} 922 923my@result_pages=values(%{$result->{query}->{pages}}); 924my$result_page=$result_pages[0]; 925my$rev=$result_pages[0]->{revisions}->[0]; 926 927# Count page even if we skip it, since we display 928# $n/$total and $total includes skipped pages. 929$n++; 930 931my$page_title=$result_page->{title}; 932 933if(!exists($pages->{$page_title})) { 934print STDERR "$n/",scalar(@$revision_ids), 935": Skipping revision #$rev->{revid} of$page_title\n"; 936next; 937} 938 939$n_actual++; 940 941my%commit; 942$commit{author} =$rev->{user} ||'Anonymous'; 943$commit{comment} =$rev->{comment} ||'*Empty MediaWiki Message*'; 944$commit{title} = mediawiki_smudge_filename($page_title); 945$commit{mw_revision} =$rev->{revid}; 946$commit{content} = mediawiki_smudge($rev->{'*'}); 947 948if(!defined($rev->{timestamp})) { 949$last_timestamp++; 950}else{ 951$last_timestamp=$rev->{timestamp}; 952} 953$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp); 954 955# Differentiates classic pages and media files. 956my($namespace,$filename) =$page_title=~/^([^:]*):(.*)$/; 957my%mediafile; 958if($namespace&& get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) { 959%mediafile= get_mw_mediafile_for_page_revision($filename,$rev->{timestamp}); 960} 961# If this is a revision of the media page for new version 962# of a file do one common commit for both file and media page. 963# Else do commit only for that page. 964print STDERR "$n/",scalar(@$revision_ids),": Revision #$rev->{revid} of$commit{title}\n"; 965 import_file_revision(\%commit, ($fetch_from==1),$n_actual, \%mediafile); 966} 967 968return$n_actual; 969} 970 971sub error_non_fast_forward { 972my$advice= run_git("config --bool advice.pushNonFastForward"); 973chomp($advice); 974if($advicene"false") { 975# Native git-push would show this after the summary. 976# We can't ask it to display it cleanly, so print it 977# ourselves before. 978print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n"; 979print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n"; 980print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n"; 981} 982print STDOUT "error$_[0]\"non-fast-forward\"\n"; 983return0; 984} 985 986sub mw_upload_file { 987my$complete_file_name=shift; 988my$new_sha1=shift; 989my$extension=shift; 990my$file_deleted=shift; 991my$summary=shift; 992my$newrevid; 993my$path="File:".$complete_file_name; 994my%hashFiles= get_allowed_file_extensions(); 995if(!exists($hashFiles{$extension})) { 996print STDERR "$complete_file_nameis not a permitted file on this wiki.\n"; 997print STDERR "Check the configuration of file uploads in your mediawiki.\n"; 998return$newrevid; 999}1000# Deleting and uploading a file requires a priviledged user1001if($file_deleted) {1002 mw_connect_maybe();1003my$query= {1004 action =>'delete',1005 title =>$path,1006 reason =>$summary1007};1008if(!$mediawiki->edit($query)) {1009print STDERR "Failed to delete file on remote wiki\n";1010print STDERR "Check your permissions on the remote site. Error code:\n";1011print STDERR $mediawiki->{error}->{code} .':'.$mediawiki->{error}->{details};1012exit1;1013}1014}else{1015# Don't let perl try to interpret file content as UTF-8 => use "raw"1016my$content= run_git("cat-file blob$new_sha1","raw");1017if($contentne"") {1018 mw_connect_maybe();1019$mediawiki->{config}->{upload_url} =1020"$url/index.php/Special:Upload";1021$mediawiki->edit({1022 action =>'upload',1023 filename =>$complete_file_name,1024 comment =>$summary,1025 file => [undef,1026$complete_file_name,1027 Content =>$content],1028 ignorewarnings =>1,1029}, {1030 skip_encoding =>11031} ) ||die$mediawiki->{error}->{code} .':'1032.$mediawiki->{error}->{details};1033my$last_file_page=$mediawiki->get_page({title =>$path});1034$newrevid=$last_file_page->{revid};1035print STDERR "Pushed file:$new_sha1-$complete_file_name.\n";1036}else{1037print STDERR "Empty file$complete_file_namenot pushed.\n";1038}1039}1040return$newrevid;1041}10421043sub mw_push_file {1044my$diff_info=shift;1045# $diff_info contains a string in this format:1046# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>1047my@diff_info_split=split(/[ \t]/,$diff_info);10481049# Filename, including .mw extension1050my$complete_file_name=shift;1051# Commit message1052my$summary=shift;1053# MediaWiki revision number. Keep the previous one by default,1054# in case there's no edit to perform.1055my$oldrevid=shift;1056my$newrevid;10571058my$new_sha1=$diff_info_split[3];1059my$old_sha1=$diff_info_split[2];1060my$page_created= ($old_sha1eq NULL_SHA1);1061my$page_deleted= ($new_sha1eq NULL_SHA1);1062$complete_file_name= mediawiki_clean_filename($complete_file_name);10631064my($title,$extension) =$complete_file_name=~/^(.*)\.([^\.]*)$/;1065if(!defined($extension)) {1066$extension="";1067}1068if($extensioneq"mw") {1069my$file_content;1070if($page_deleted) {1071# Deleting a page usually requires1072# special priviledges. A common1073# convention is to replace the page1074# with this content instead:1075$file_content= DELETED_CONTENT;1076}else{1077$file_content= run_git("cat-file blob$new_sha1");1078}10791080 mw_connect_maybe();10811082my$result=$mediawiki->edit( {1083 action =>'edit',1084 summary =>$summary,1085 title =>$title,1086 basetimestamp =>$basetimestamps{$oldrevid},1087 text => mediawiki_clean($file_content,$page_created),1088}, {1089 skip_encoding =>1# Helps with names with accentuated characters1090});1091if(!$result) {1092if($mediawiki->{error}->{code} ==3) {1093# edit conflicts, considered as non-fast-forward1094print STDERR 'Warning: Error '.1095$mediawiki->{error}->{code} .1096' from mediwiki: '.$mediawiki->{error}->{details} .1097".\n";1098return($oldrevid,"non-fast-forward");1099}else{1100# Other errors. Shouldn't happen => just die()1101die'Fatal: Error '.1102$mediawiki->{error}->{code} .1103' from mediwiki: '.$mediawiki->{error}->{details};1104}1105}1106$newrevid=$result->{edit}->{newrevid};1107print STDERR "Pushed file:$new_sha1-$title\n";1108}else{1109$newrevid= mw_upload_file($complete_file_name,$new_sha1,1110$extension,$page_deleted,1111$summary);1112}1113$newrevid= ($newrevidor$oldrevid);1114return($newrevid,"ok");1115}11161117sub mw_push {1118# multiple push statements can follow each other1119my@refsspecs= (shift, get_more_refs("push"));1120my$pushed;1121formy$refspec(@refsspecs) {1122my($force,$local,$remote) =$refspec=~/^(\+)?([^:]*):([^:]*)$/1123or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");1124if($force) {1125print STDERR "Warning: forced push not allowed on a MediaWiki.\n";1126}1127if($localeq"") {1128print STDERR "Cannot delete remote branch on a MediaWiki\n";1129print STDOUT "error$remotecannot delete\n";1130next;1131}1132if($remotene"refs/heads/master") {1133print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";1134print STDOUT "error$remoteonly master allowed\n";1135next;1136}1137if(mw_push_revision($local,$remote)) {1138$pushed=1;1139}1140}11411142# Notify Git that the push is done1143print STDOUT "\n";11441145if($pushed&&$dumb_push) {1146print STDERR "Just pushed some revisions to MediaWiki.\n";1147print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";1148print STDERR "needs to be updated with these re-imported commits. You can do this with\n";1149print STDERR "\n";1150print STDERR " git pull --rebase\n";1151print STDERR "\n";1152}1153}11541155sub mw_push_revision {1156my$local=shift;1157my$remote=shift;# actually, this has to be "refs/heads/master" at this point.1158my$last_local_revid= get_last_local_revision();1159print STDERR ".\n";# Finish sentence started by get_last_local_revision()1160my$last_remote_revid= get_last_remote_revision();1161my$mw_revision=$last_remote_revid;11621163# Get sha1 of commit pointed by local HEAD1164my$HEAD_sha1= run_git("rev-parse$local2>/dev/null");chomp($HEAD_sha1);1165# Get sha1 of commit pointed by remotes/$remotename/master1166my$remoteorigin_sha1= run_git("rev-parse refs/remotes/$remotename/master2>/dev/null");1167chomp($remoteorigin_sha1);11681169if($last_local_revid>0&&1170$last_local_revid<$last_remote_revid) {1171return error_non_fast_forward($remote);1172}11731174if($HEAD_sha1eq$remoteorigin_sha1) {1175# nothing to push1176return0;1177}11781179# Get every commit in between HEAD and refs/remotes/origin/master,1180# including HEAD and refs/remotes/origin/master1181my@commit_pairs= ();1182if($last_local_revid>0) {1183my$parsed_sha1=$remoteorigin_sha1;1184# Find a path from last MediaWiki commit to pushed commit1185while($parsed_sha1ne$HEAD_sha1) {1186my@commit_info=grep(/^$parsed_sha1/,split(/\n/, run_git("rev-list --children$local")));1187if(!@commit_info) {1188return error_non_fast_forward($remote);1189}1190my@commit_info_split=split(/ |\n/,$commit_info[0]);1191# $commit_info_split[1] is the sha1 of the commit to export1192# $commit_info_split[0] is the sha1 of its direct child1193push(@commit_pairs, \@commit_info_split);1194$parsed_sha1=$commit_info_split[1];1195}1196}else{1197# No remote mediawiki revision. Export the whole1198# history (linearized with --first-parent)1199print STDERR "Warning: no common ancestor, pushing complete history\n";1200my$history= run_git("rev-list --first-parent --children$local");1201my@history=split('\n',$history);1202@history=@history[1..$#history];1203foreachmy$line(reverse@history) {1204my@commit_info_split=split(/ |\n/,$line);1205push(@commit_pairs, \@commit_info_split);1206}1207}12081209foreachmy$commit_info_split(@commit_pairs) {1210my$sha1_child= @{$commit_info_split}[0];1211my$sha1_commit= @{$commit_info_split}[1];1212my$diff_infos= run_git("diff-tree -r --raw -z$sha1_child$sha1_commit");1213# TODO: we could detect rename, and encode them with a #redirect on the wiki.1214# TODO: for now, it's just a delete+add1215my@diff_info_list=split(/\0/,$diff_infos);1216# Keep the subject line of the commit message as mediawiki comment for the revision1217my$commit_msg= run_git("log --no-walk --format=\"%s\"$sha1_commit");1218chomp($commit_msg);1219# Push every blob1220while(@diff_info_list) {1221my$status;1222# git diff-tree -z gives an output like1223# <metadata>\0<filename1>\01224# <metadata>\0<filename2>\01225# and we've split on \0.1226my$info=shift(@diff_info_list);1227my$file=shift(@diff_info_list);1228($mw_revision,$status) = mw_push_file($info,$file,$commit_msg,$mw_revision);1229if($statuseq"non-fast-forward") {1230# we may already have sent part of the1231# commit to MediaWiki, but it's too1232# late to cancel it. Stop the push in1233# the middle, but still give an1234# accurate error message.1235return error_non_fast_forward($remote);1236}1237if($statusne"ok") {1238die("Unknown error from mw_push_file()");1239}1240}1241unless($dumb_push) {1242 run_git("notes --ref=$remotename/mediawikiadd -m\"mediawiki_revision:$mw_revision\"$sha1_commit");1243 run_git("update-ref -m\"Git-MediaWiki push\"refs/mediawiki/$remotename/master$sha1_commit$sha1_child");1244}1245}12461247print STDOUT "ok$remote\n";1248return1;1249}12501251sub get_allowed_file_extensions {1252 mw_connect_maybe();12531254my$query= {1255 action =>'query',1256 meta =>'siteinfo',1257 siprop =>'fileextensions'1258};1259my$result=$mediawiki->api($query);1260my@file_extensions=map$_->{ext},@{$result->{query}->{fileextensions}};1261my%hashFile=map{$_=>1}@file_extensions;12621263return%hashFile;1264}12651266# In memory cache for MediaWiki namespace ids.1267my%namespace_id;12681269# Namespaces whose id is cached in the configuration file1270# (to avoid duplicates)1271my%cached_mw_namespace_id;12721273# Return MediaWiki id for a canonical namespace name.1274# Ex.: "File", "Project".1275sub get_mw_namespace_id {1276 mw_connect_maybe();1277my$name=shift;12781279if(!exists$namespace_id{$name}) {1280# Look at configuration file, if the record for that namespace is1281# already cached. Namespaces are stored in form:1282# "Name_of_namespace:Id_namespace", ex.: "File:6".1283my@temp=split(/[ \n]/, run_git("config --get-all remote."1284.$remotename.".namespaceCache"));1285chomp(@temp);1286foreachmy$ns(@temp) {1287my($n,$id) =split(/:/,$ns);1288$namespace_id{$n} =$id;1289$cached_mw_namespace_id{$n} =1;1290}1291}12921293if(!exists$namespace_id{$name}) {1294print STDERR "Namespace$namenot found in cache, querying the wiki ...\n";1295# NS not found => get namespace id from MW and store it in1296# configuration file.1297my$query= {1298 action =>'query',1299 meta =>'siteinfo',1300 siprop =>'namespaces'1301};1302my$result=$mediawiki->api($query);13031304while(my($id,$ns) =each(%{$result->{query}->{namespaces}})) {1305if(defined($ns->{id}) &&defined($ns->{canonical})) {1306$namespace_id{$ns->{canonical}} =$ns->{id};1307if($ns->{'*'}) {1308# alias (e.g. french Fichier: as alias for canonical File:)1309$namespace_id{$ns->{'*'}} =$ns->{id};1310}1311}1312}1313}13141315my$id=$namespace_id{$name};13161317if(defined$id) {1318# Store explicitely requested namespaces on disk1319if(!exists$cached_mw_namespace_id{$name}) {1320 run_git("config --add remote.".$remotename1321.".namespaceCache\"".$name.":".$id."\"");1322$cached_mw_namespace_id{$name} =1;1323}1324return$id;1325}else{1326die"No such namespace$nameon MediaWiki.";1327}1328}