#! /usr/bin/perl

# Copyright (C) 2011
# Jérémie Nikaes <jeremie.nikaes@ensimag.imag.fr>
# Arnaud Lacurie <arnaud.lacurie@ensimag.imag.fr>
# Claire Fousse <claire.fousse@ensimag.imag.fr>
# David Amouyal <david.amouyal@ensimag.imag.fr>
# Matthieu Moy <matthieu.moy@grenoble-inp.fr>
# License: GPL v2 or later

# Gateway between Git and MediaWiki.
# Documentation & bugtracker: https://github.com/moy/Git-Mediawiki/

use strict;
use MediaWiki::API;
use Git;
use DateTime::Format::ISO8601;

# By default, use UTF-8 to communicate with Git and the user
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";

use URI::Escape;
use IPC::Open2;

use warnings;

# Mediawiki filenames can contain forward slashes. This variable decides
# by which pattern they should be replaced.
use constant SLASH_REPLACEMENT => "%2F";

# It's not always possible to delete pages (may require some
# privileges). Deleted pages are replaced with this content.
use constant DELETED_CONTENT => "[[Category:Deleted]]\n";

# It's not possible to create empty pages. New empty files in Git are
# sent with this content instead.
use constant EMPTY_CONTENT => "<!-- empty page -->\n";

# Used to reflect file creation or deletion in diff.
use constant NULL_SHA1 => "0000000000000000000000000000000000000000";

# Used on Git's side to reflect empty edit messages on the wiki
use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*';

my $remotename = $ARGV[0];
my $url = $ARGV[1];

# Accept both space-separated and multiple keys in config file.
# Spaces should be written as _ anyway because we'll use chomp.
my @tracked_pages = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".pages"));
chomp(@tracked_pages);

# Just like @tracked_pages, but for MediaWiki categories.
my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote." . $remotename . ".categories"));
chomp(@tracked_categories);

# Import media files on pull
my $import_media = run_git("config --get --bool remote." . $remotename . ".mediaimport");
chomp($import_media);
$import_media = ($import_media eq "true");

# Export media files on push
my $export_media = run_git("config --get --bool remote." . $remotename . ".mediaexport");
chomp($export_media);
$export_media = !($export_media eq "false");

my $wiki_login = run_git("config --get remote." . $remotename . ".mwLogin");
# Note: mwPassword is discouraged. Use the credential system instead.
my $wiki_passwd = run_git("config --get remote." . $remotename . ".mwPassword");
my $wiki_domain = run_git("config --get remote." . $remotename . ".mwDomain");
chomp($wiki_login);
chomp($wiki_passwd);
chomp($wiki_domain);

# Import only last revisions (both for clone and fetch)
my $shallow_import = run_git("config --get --bool remote." . $remotename . ".shallow");
chomp($shallow_import);
$shallow_import = ($shallow_import eq "true");

# Fetch (clone and pull) by revisions instead of by pages. This behavior
# is more efficient when the wiki has many pages and revisions are
# fetched often enough that each fetch concerns only a few pages.
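# The strategy is configured per remote or globally, for example
# (assuming a remote named "origin"):
#   git config remote.origin.fetchStrategy by_rev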
# Possible values:
# - by_rev: perform one query per new revision on the remote wiki
# - by_page: query each tracked page for new revision
my $fetch_strategy = run_git("config --get remote.$remotename.fetchStrategy");
unless ($fetch_strategy) {
	$fetch_strategy = run_git("config --get mediawiki.fetchStrategy");
}
chomp($fetch_strategy);
unless ($fetch_strategy) {
	$fetch_strategy = "by_page";
}

# Dumb push: don't update notes and mediawiki ref to reflect the last push.
#
# Configurable with mediawiki.dumbPush, or per-remote with
# remote.<remotename>.dumbPush.
#
# This means the user will have to re-import the just-pushed
# revisions. On the other hand, this means that the Git revisions
# corresponding to MediaWiki revisions are all imported from the wiki,
# regardless of whether they were initially created in Git or from the
# web interface, hence all users will get the same history (i.e. if
# the push from Git to MediaWiki loses some information, everybody
# will get the history with information lost). If the import is
# deterministic, this means everybody gets the same sha1 for each
# MediaWiki revision.
my $dumb_push = run_git("config --get --bool remote.$remotename.dumbPush");
unless ($dumb_push) {
	$dumb_push = run_git("config --get --bool mediawiki.dumbPush");
}
chomp($dumb_push);
$dumb_push = ($dumb_push eq "true");

my $wiki_name = $url;
$wiki_name =~ s/[^\/]*:\/\///;
# If URL is like http://user:password@example.com/, we clearly don't
# want the password in $wiki_name. While we're there, also remove user
# and '@' sign, to avoid author like MWUser@HTTPUser@host.com
$wiki_name =~ s/^.*@//;

# Commands parser
my $entry;
my @cmd;
while (<STDIN>) {
	chomp;
	@cmd = split(/ /);
	if (defined($cmd[0])) {
		# Line not blank
		if ($cmd[0] eq "capabilities") {
			die("Too many arguments for capabilities") unless (!defined($cmd[1]));
			mw_capabilities();
		} elsif ($cmd[0] eq "list") {
			die("Too many arguments for list") unless (!defined($cmd[2]));
			mw_list($cmd[1]);
		} elsif ($cmd[0] eq "import") {
			die("Invalid arguments for import") unless ($cmd[1] ne "" && !defined($cmd[2]));
			mw_import($cmd[1]);
		} elsif ($cmd[0] eq "option") {
			die("Too many arguments for option") unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3]));
			mw_option($cmd[1], $cmd[2]);
		} elsif ($cmd[0] eq "push") {
			mw_push($cmd[1]);
		} else {
			print STDERR "Unknown command. Aborting...\n";
			last;
		}
	} else {
		# blank line: we should terminate
		last;
	}

	BEGIN { $| = 1 } # flush STDOUT, to make sure the previous
			 # command is fully processed.
}

########################## Functions ##############################

# MediaWiki API instance, created lazily.
my $mediawiki;

sub mw_connect_maybe {
	if ($mediawiki) {
		return;
	}
	$mediawiki = MediaWiki::API->new;
	$mediawiki->{config}->{api_url} = "$url/api.php";
	if ($wiki_login) {
		my %credential = (
			'url' => $url,
			'username' => $wiki_login,
			'password' => $wiki_passwd
		);
		Git::credential(\%credential);
		my $request = {lgname => $credential{username},
			       lgpassword => $credential{password},
			       lgdomain => $wiki_domain};
		if ($mediawiki->login($request)) {
			Git::credential(\%credential, 'approve');
			print STDERR "Logged in mediawiki user \"$credential{username}\".\n";
		} else {
			print STDERR "Failed to log in mediawiki user \"$credential{username}\" on $url\n";
			print STDERR " (error " .
				$mediawiki->{error}->{code} . ': ' .
				$mediawiki->{error}->{details} . ")\n";
			Git::credential(\%credential, 'reject');
			exit 1;
		}
	}
}

## Functions for listing pages on the remote wiki
sub get_mw_tracked_pages {
	my $pages = shift;
	get_mw_page_list(\@tracked_pages, $pages);
}

sub get_mw_page_list {
	my $page_list = shift;
	my $pages = shift;
	my @some_pages = @$page_list;
	while (@some_pages) {
		my $last = 50;
		if ($#some_pages < $last) {
			$last = $#some_pages;
		}
		my @slice = @some_pages[0..$last];
		get_mw_first_pages(\@slice, $pages);
		@some_pages = @some_pages[51..$#some_pages];
	}
}

sub get_mw_tracked_categories {
	my $pages = shift;
	foreach my $category (@tracked_categories) {
		if (index($category, ':') < 0) {
			# Mediawiki requires the Category
			# prefix, but let's not force the user
			# to specify it.
			$category = "Category:" . $category;
		}
		my $mw_pages = $mediawiki->list( {
			action => 'query',
			list => 'categorymembers',
			cmtitle => $category,
			cmlimit => 'max' } )
			|| die $mediawiki->{error}->{code} . ': '
				. $mediawiki->{error}->{details};
		foreach my $page (@{$mw_pages}) {
			$pages->{$page->{title}} = $page;
		}
	}
}

sub get_mw_all_pages {
	my $pages = shift;
	# No user-provided list, get the list of pages from the API.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

# Queries the wiki for a set of pages. Meant to be used within a loop
# querying the wiki for slices of a page list.
sub get_mw_first_pages {
	my $some_pages = shift;
	my @some_pages = @{$some_pages};

	my $pages = shift;

	# pattern 'page1|page2|...' required by the API
	my $titles = join('|', @some_pages);

	my $mw_pages = $mediawiki->api({
		action => 'query',
		titles => $titles,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not query the list of wiki pages.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	while (my ($id, $page) = each(%{$mw_pages->{query}->{pages}})) {
		if ($id < 0) {
			print STDERR "Warning: page $page->{title} not found on wiki\n";
		} else {
			$pages->{$page->{title}} = $page;
		}
	}
}

# Get the list of pages to be fetched according to configuration.
sub get_mw_pages {
	mw_connect_maybe();

	print STDERR "Listing pages on remote wiki...\n";

	my %pages; # hash on page titles to avoid duplicates
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		# The user provided a list of page titles, but we
		# still need to query the API to get the page IDs.
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	if ($import_media) {
		print STDERR "Getting media files for selected pages...\n";
		if ($user_defined) {
			get_linked_mediafiles(\%pages);
		} else {
			get_all_mediafiles(\%pages);
		}
	}
	print STDERR (scalar keys %pages) . " pages found.\n";
	return %pages;
}

# usage: $out = run_git("command args");
#        $out = run_git("command args", "raw"); # don't interpret output as UTF-8.
sub run_git {
	my $args = shift;
	my $encoding = (shift || "encoding(UTF-8)");
	open(my $git, "-|:$encoding", "git " . $args);
	my $res = do { local $/; <$git> };
	close($git);

	return $res;
}


sub get_all_mediafiles {
	my $pages = shift;
	# Attach the list of all media file pages from the API. They live
	# in a different namespace, and only one namespace can be queried
	# at a time.
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		apnamespace => get_mw_namespace_id("File"),
		aplimit => 'max'
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of pages for media files.\n";
		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}

sub get_linked_mediafiles {
	my $pages = shift;
	my @titles = map $_->{title}, values(%{$pages});

	# The query is split in small batches because of the MW API limit on
	# the number of links to be returned (500 links max).
	my $batch = 10;
	while (@titles) {
		if ($#titles < $batch) {
			$batch = $#titles;
		}
		my @slice = @titles[0..$batch];

		# pattern 'page1|page2|...' required by the API
		my $mw_titles = join('|', @slice);

		# Media files can be included in or linked from a page,
		# so get both kinds.
		my $query = {
			action => 'query',
			prop => 'links|images',
			titles => $mw_titles,
			plnamespace => get_mw_namespace_id("File"),
			pllimit => 'max'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $page) = each(%{$result->{query}->{pages}})) {
			my @media_titles;
			if (defined($page->{links})) {
				my @link_titles = map $_->{title}, @{$page->{links}};
				push(@media_titles, @link_titles);
			}
			if (defined($page->{images})) {
				my @image_titles = map $_->{title}, @{$page->{images}};
				push(@media_titles, @image_titles);
			}
			if (@media_titles) {
				get_mw_page_list(\@media_titles, $pages);
			}
		}

		@titles = @titles[($batch+1)..$#titles];
	}
}

sub get_mw_mediafile_for_page_revision {
	# Name of the file on Wiki, with the prefix.
	my $filename = shift;
	my $timestamp = shift;
	my %mediafile;

	# Check whether a media file with the given timestamp exists on
	# MediaWiki. If so, download the file.
	my $query = {
		action => 'query',
		prop => 'imageinfo',
		titles => "File:" . $filename,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename|url',
		iilimit => 1
	};
	my $result = $mediawiki->api($query);

	my ($fileid, $file) = each( %{$result->{query}->{pages}} );
	# If not defined it means there is no revision of the file for
	# the given timestamp.
	if (defined($file->{imageinfo})) {
		$mediafile{title} = $filename;

		my $fileinfo = pop(@{$file->{imageinfo}});
		$mediafile{timestamp} = $fileinfo->{timestamp};
		# Mediawiki::API's download function doesn't support https URLs
		# and can't download old versions of files.
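		# (Hence the content is fetched below through the module's
		# underlying LWP user agent, in download_mw_mediafile, rather
		# than through Mediawiki::API's own download helper.)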
		print STDERR "\tDownloading file $mediafile{title}, version $mediafile{timestamp}\n";
		$mediafile{content} = download_mw_mediafile($fileinfo->{url});
	}
	return %mediafile;
}

sub download_mw_mediafile {
	my $url = shift;

	my $response = $mediawiki->{ua}->get($url);
	if ($response->code == 200) {
		return $response->decoded_content;
	} else {
		print STDERR "Error downloading mediafile from:\n";
		print STDERR "URL: $url\n";
		print STDERR "Server response: " . $response->code . " " . $response->message . "\n";
		exit 1;
	}
}

sub get_last_local_revision {
	# Get note regarding last mediawiki revision
	my $note = run_git("notes --ref=$remotename/mediawiki show refs/mediawiki/$remotename/master 2>/dev/null");
	my @note_info = split(/ /, $note);

	my $lastrevision_number;
	if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) {
		print STDERR "No previous mediawiki revision found";
		$lastrevision_number = 0;
	} else {
		# Notes are formatted: mediawiki_revision: #number
		$lastrevision_number = $note_info[1];
		chomp($lastrevision_number);
		print STDERR "Last local mediawiki revision found is $lastrevision_number";
	}
	return $lastrevision_number;
}

# Remember the timestamp corresponding to a revision id.
my %basetimestamps;

# Get the last remote revision without taking into account which pages
# are tracked or not. This function makes a single request to the wiki,
# thus avoiding a loop over all tracked pages. This is useful for the
# fetch-by-rev option.
sub get_last_global_remote_rev {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		list => 'recentchanges',
		prop => 'revisions',
		rclimit => '1',
		rcdir => 'older',
	};
	my $result = $mediawiki->api($query);
	return $result->{query}->{recentchanges}[0]->{revid};
}

# Get the last remote revision concerning the tracked pages and the tracked
# categories.
sub get_last_remote_revision {
	mw_connect_maybe();

	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my $max_rev_num = 0;

	print STDERR "Getting last revision id on tracked pages...\n";

	foreach my $page (@pages) {
		my $id = $page->{pageid};

		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'ids|timestamp',
			pageids => $id,
		};

		my $result = $mediawiki->api($query);

		my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}});

		$basetimestamps{$lastrev->{revid}} = $lastrev->{timestamp};

		$max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num);
	}

	print STDERR "Last remote revision found is $max_rev_num.\n";
	return $max_rev_num;
}

# Clean content before sending it to MediaWiki
sub mediawiki_clean {
	my $string = shift;
	my $page_created = shift;
	# MediaWiki does not allow blank space at the end of a page and
	# ends each page with a single \n. This function right-trims the
	# string and adds a \n at the end to follow this rule.
	$string =~ s/\s+$//;
	if ($string eq "" && $page_created) {
		# Creating empty pages is forbidden.
		$string = EMPTY_CONTENT;
	}
	return $string . "\n";
}

# Filter applied on MediaWiki data before adding them to Git
sub mediawiki_smudge {
	my $string = shift;
	if ($string eq EMPTY_CONTENT) {
		$string = "";
	}
	# This \n is important. This is due to the way MediaWiki handles
	# the end of files.
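	# (MediaWiki does not keep trailing blank space at the end of a
	# page -- see mediawiki_clean above -- so a final newline is
	# re-added here to keep the blobs stored in Git newline-terminated.)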
	return $string . "\n";
}

sub mediawiki_clean_filename {
	my $filename = shift;
	$filename =~ s/@{[SLASH_REPLACEMENT]}/\//g;
	# [, ], |, {, and } are forbidden by MediaWiki, even URL-encoded.
	# Do a variant of URL-encoding, i.e. looks like URL-encoding,
	# but with _ added to prevent MediaWiki from thinking this is
	# an actual special character.
	$filename =~ s/[\[\]\{\}\|]/sprintf("_%%_%x", ord($&))/ge;
	# If we use the uri escape before
	# we should unescape here, before anything

	return $filename;
}

sub mediawiki_smudge_filename {
	my $filename = shift;
	$filename =~ s/\//@{[SLASH_REPLACEMENT]}/g;
	$filename =~ s/ /_/g;
	# Decode forbidden characters encoded in mediawiki_clean_filename
	$filename =~ s/_%_([0-9a-fA-F][0-9a-fA-F])/sprintf("%c", hex($1))/ge;
	return $filename;
}

sub literal_data {
	my ($content) = @_;
	print STDOUT "data ", bytes::length($content), "\n", $content;
}

sub literal_data_raw {
	# Output possibly binary content.
	my ($content) = @_;
	# Avoid confusion between size in bytes and in characters
	utf8::downgrade($content);
	binmode STDOUT, ":raw";
	print STDOUT "data ", bytes::length($content), "\n", $content;
	binmode STDOUT, ":utf8";
}

sub mw_capabilities {
	# Revisions are imported to the private namespace
	# refs/mediawiki/$remotename/ by the helper and fetched into
	# refs/remotes/$remotename later by fetch.
	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
	print STDOUT "import\n";
	print STDOUT "list\n";
	print STDOUT "push\n";
	print STDOUT "\n";
}

sub mw_list {
	# MediaWiki does not have branches; we consider one branch arbitrarily
	# called master, with HEAD pointing to it.
	print STDOUT "? refs/heads/master\n";
	print STDOUT "\@refs/heads/master HEAD\n";
	print STDOUT "\n";
}

sub mw_option {
	print STDERR "remote-helper command 'option $_[0]' not yet implemented\n";
	print STDOUT "unsupported\n";
}

sub fetch_mw_revisions_for_page {
	my $page = shift;
	my $id = shift;
	my $fetch_from = shift;
	my @page_revs = ();
	my $query = {
		action => 'query',
		prop => 'revisions',
		rvprop => 'ids',
		rvdir => 'newer',
		rvstartid => $fetch_from,
		rvlimit => 500,
		pageids => $id,
	};

	my $revnum = 0;
	# Get 500 revisions at a time due to the mediawiki api limit
	while (1) {
		my $result = $mediawiki->api($query);

		# Parse each of those 500 revisions
		foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
			my $page_rev_ids;
			$page_rev_ids->{pageid} = $page->{pageid};
			$page_rev_ids->{revid} = $revision->{revid};
			push(@page_revs, $page_rev_ids);
			$revnum++;
		}
		last unless $result->{'query-continue'};
		$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
	}
	if ($shallow_import && @page_revs) {
		print STDERR " Found 1 revision (shallow import).\n";
		@page_revs = sort {$b->{revid} <=> $a->{revid}} (@page_revs);
		return $page_revs[0];
	}
	print STDERR " Found ", $revnum, " revision(s).\n";
	return @page_revs;
}

sub fetch_mw_revisions {
	my $pages = shift; my @pages = @{$pages};
	my $fetch_from = shift;

	my @revisions = ();
	my $n = 1;
	foreach my $page (@pages) {
		my $id = $page->{pageid};

		print STDERR "page $n/", scalar(@pages), ": " . $page->{title} . "\n";
		$n++;
		my @page_revs = fetch_mw_revisions_for_page($page, $id, $fetch_from);
		@revisions = (@page_revs, @revisions);
	}

	return ($n, @revisions);
}

sub fe_escape_path {
	my $path = shift;
	$path =~ s/\\/\\\\/g;
	$path =~ s/"/\\"/g;
	$path =~ s/\n/\\n/g;
	return '"' . $path . '"';
}

sub import_file_revision {
	my $commit = shift;
	my %commit = %{$commit};
	my $full_import = shift;
	my $n = shift;
	my $mediafile = shift;
	my %mediafile;
	if ($mediafile) {
		%mediafile = %{$mediafile};
	}

	my $title = $commit{title};
	my $comment = $commit{comment};
	my $content = $commit{content};
	my $author = $commit{author};
	my $date = $commit{date};

	print STDOUT "commit refs/mediawiki/$remotename/master\n";
	print STDOUT "mark :$n\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data($comment);

	# If it's not a clone, we need to know where to start from
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/mediawiki/$remotename/master^0\n";
	}
	if ($content ne DELETED_CONTENT) {
		print STDOUT "M 644 inline " .
			fe_escape_path($title . ".mw") . "\n";
		literal_data($content);
		if (%mediafile) {
			print STDOUT "M 644 inline "
				. fe_escape_path($mediafile{title}) . "\n";
			literal_data_raw($mediafile{content});
		}
		print STDOUT "\n\n";
	} else {
		print STDOUT "D " . fe_escape_path($title . ".mw") . "\n";
	}

	# mediawiki revision number in the git note
	if ($full_import && $n == 1) {
		print STDOUT "reset refs/notes/$remotename/mediawiki\n";
	}
	print STDOUT "commit refs/notes/$remotename/mediawiki\n";
	print STDOUT "committer $author <$author\@$wiki_name> ", $date->epoch, " +0000\n";
	literal_data("Note added by git-mediawiki during import");
	if (!$full_import && $n == 1) {
		print STDOUT "from refs/notes/$remotename/mediawiki^0\n";
	}
	print STDOUT "N inline :$n\n";
	literal_data("mediawiki_revision: " . $commit{mw_revision});
	print STDOUT "\n\n";
}

# Parse a sequence of
#     <cmd> <arg1>
#     <cmd> <arg2>
#     \n
# (like a batch sequence of import commands or a sequence of push
# statements).
sub get_more_refs {
	my $cmd = shift;
	my @refs;
	while (1) {
		my $line = <STDIN>;
		if ($line =~ m/^$cmd (.*)$/) {
			push(@refs, $1);
		} elsif ($line eq "\n") {
			return @refs;
		} else {
			die("Invalid command in a '$cmd' batch: " . $line);
		}
	}
}

sub mw_import {
	# multiple import commands can follow each other.
	my @refs = (shift, get_more_refs("import"));
	foreach my $ref (@refs) {
		mw_import_ref($ref);
	}
	print STDOUT "done\n";
}

sub mw_import_ref {
	my $ref = shift;
	# The remote helper will call "import HEAD" and
	# "import refs/heads/master".
	# Since HEAD is a symbolic ref to master (by convention,
	# followed by the output of the command "list" that we gave),
	# we don't need to do anything in this case.
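	# (For reference, the "list" command implemented in mw_list above
	# advertises:
	#     ? refs/heads/master
	#     @refs/heads/master HEAD
	# which is why an "import HEAD" can safely be skipped here.)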
	if ($ref eq "HEAD") {
		return;
	}

	mw_connect_maybe();

	print STDERR "Searching revisions...\n";
	my $last_local = get_last_local_revision();
	my $fetch_from = $last_local + 1;
	if ($fetch_from == 1) {
		print STDERR ", fetching from beginning.\n";
	} else {
		print STDERR ", fetching from here.\n";
	}

	my $n = 0;
	if ($fetch_strategy eq "by_rev") {
		print STDERR "Fetching & writing export data by revs...\n";
		$n = mw_import_ref_by_revs($fetch_from);
	} elsif ($fetch_strategy eq "by_page") {
		print STDERR "Fetching & writing export data by pages...\n";
		$n = mw_import_ref_by_pages($fetch_from);
	} else {
		print STDERR "fatal: invalid fetch strategy \"$fetch_strategy\".\n";
		print STDERR "Check your configuration variables remote.$remotename.fetchStrategy and mediawiki.fetchStrategy\n";
		exit 1;
	}

	if ($fetch_from == 1 && $n == 0) {
		print STDERR "You appear to have cloned an empty MediaWiki.\n";
		# Something has to be done remote-helper side. If nothing is done, an error is
		# thrown saying that HEAD is referring to unknown object 0000000000000000000
		# and the clone fails.
	}
}

sub mw_import_ref_by_pages {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();
	my @pages = values(%pages_hash);

	my ($n, @revisions) = fetch_mw_revisions(\@pages, $fetch_from);

	@revisions = sort {$a->{revid} <=> $b->{revid}} @revisions;
	my @revision_ids = map $_->{revid}, @revisions;

	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

sub mw_import_ref_by_revs {

	my $fetch_from = shift;
	my %pages_hash = get_mw_pages();

	my $last_remote = get_last_global_remote_rev();
	my @revision_ids = $fetch_from..$last_remote;
	return mw_import_revids($fetch_from, \@revision_ids, \%pages_hash);
}

# Import revisions given in second argument (array of integers).
# Only pages appearing in the third argument (hash indexed by page titles)
# will be imported.
sub mw_import_revids {
	my $fetch_from = shift;
	my $revision_ids = shift;
	my $pages = shift;

	my $n = 0;
	my $n_actual = 0;
	my $last_timestamp = 0; # Placeholder in case $rev->timestamp is undefined

	foreach my $pagerevid (@$revision_ids) {
		# Count page even if we skip it, since we display
		# $n/$total and $total includes skipped pages.
		$n++;

		# fetch the content of the pages
		my $query = {
			action => 'query',
			prop => 'revisions',
			rvprop => 'content|timestamp|comment|user|ids',
			revids => $pagerevid,
		};

		my $result = $mediawiki->api($query);

		if (!$result) {
			die "Failed to retrieve modified page for revision $pagerevid";
		}

		if (defined($result->{query}->{badrevids}->{$pagerevid})) {
			# The revision id does not exist on the remote wiki.
			next;
		}

		if (!defined($result->{query}->{pages})) {
			die "Invalid revision $pagerevid.";
		}

		my @result_pages = values(%{$result->{query}->{pages}});
		my $result_page = $result_pages[0];
		my $rev = $result_pages[0]->{revisions}->[0];

		my $page_title = $result_page->{title};

		if (!exists($pages->{$page_title})) {
			print STDERR "$n/", scalar(@$revision_ids),
				": Skipping revision #$rev->{revid} of $page_title\n";
			next;
		}

		$n_actual++;

		my %commit;
		$commit{author} = $rev->{user} || 'Anonymous';
		$commit{comment} = $rev->{comment} || EMPTY_MESSAGE;
		$commit{title} = mediawiki_smudge_filename($page_title);
		$commit{mw_revision} = $rev->{revid};
		$commit{content} = mediawiki_smudge($rev->{'*'});

		if (!defined($rev->{timestamp})) {
			$last_timestamp++;
		} else {
			$last_timestamp = $rev->{timestamp};
		}
		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);

		# Differentiate classic pages and media files.
		my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/;
		my %mediafile;
		if ($namespace) {
			my $id = get_mw_namespace_id($namespace);
			if ($id && $id == get_mw_namespace_id("File")) {
				%mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp});
			}
		}
		# If this is a revision of the media page for a new version
		# of a file, do one common commit for both the file and the
		# media page. Otherwise, commit only that page.
		print STDERR "$n/", scalar(@$revision_ids), ": Revision #$rev->{revid} of $commit{title}\n";
		import_file_revision(\%commit, ($fetch_from == 1), $n_actual, \%mediafile);
	}

	return $n_actual;
}

sub error_non_fast_forward {
	my $advice = run_git("config --bool advice.pushNonFastForward");
	chomp($advice);
	if ($advice ne "false") {
		# Native git-push would show this after the summary.
		# We can't ask it to display it cleanly, so print it
		# ourselves before.
		print STDERR "To prevent you from losing history, non-fast-forward updates were rejected\n";
		print STDERR "Merge the remote changes (e.g. 'git pull') before pushing again. See the\n";
		print STDERR "'Note about fast-forwards' section of 'git push --help' for details.\n";
	}
	print STDOUT "error $_[0] \"non-fast-forward\"\n";
	return 0;
}

sub mw_upload_file {
	my $complete_file_name = shift;
	my $new_sha1 = shift;
	my $extension = shift;
	my $file_deleted = shift;
	my $summary = shift;
	my $newrevid;
	my $path = "File:" . $complete_file_name;
	my %hashFiles = get_allowed_file_extensions();
	if (!exists($hashFiles{$extension})) {
		print STDERR "$complete_file_name is not a permitted file on this wiki.\n";
		print STDERR "Check the configuration of file uploads in your mediawiki.\n";
		return $newrevid;
	}
	# Deleting and uploading a file requires a privileged user
	if ($file_deleted) {
		mw_connect_maybe();
		my $query = {
			action => 'delete',
			title => $path,
			reason => $summary
		};
		if (!$mediawiki->edit($query)) {
			print STDERR "Failed to delete file on remote wiki\n";
			print STDERR "Check your permissions on the remote site. Error code:\n";
			print STDERR $mediawiki->{error}->{code} . ':' . $mediawiki->{error}->{details};
			exit 1;
		}
	} else {
		# Don't let perl try to interpret file content as UTF-8 => use "raw"
		my $content = run_git("cat-file blob $new_sha1", "raw");
		if ($content ne "") {
			mw_connect_maybe();
			$mediawiki->{config}->{upload_url} =
				"$url/index.php/Special:Upload";
			$mediawiki->edit({
				action => 'upload',
				filename => $complete_file_name,
				comment => $summary,
				file => [undef,
					 $complete_file_name,
					 Content => $content],
				ignorewarnings => 1,
			}, {
				skip_encoding => 1
			} ) || die $mediawiki->{error}->{code} . ':'
				 . $mediawiki->{error}->{details};
			my $last_file_page = $mediawiki->get_page({title => $path});
			$newrevid = $last_file_page->{revid};
			print STDERR "Pushed file: $new_sha1 - $complete_file_name.\n";
		} else {
			print STDERR "Empty file $complete_file_name not pushed.\n";
		}
	}
	return $newrevid;
}

sub mw_push_file {
	my $diff_info = shift;
	# $diff_info contains a string in this format:
	# 100644 100644 <sha1_of_blob_before_commit> <sha1_of_blob_now> <status>
	my @diff_info_split = split(/[ \t]/, $diff_info);

	# Filename, including .mw extension
	my $complete_file_name = shift;
	# Commit message
	my $summary = shift;
	# MediaWiki revision number. Keep the previous one by default,
	# in case there's no edit to perform.
	my $oldrevid = shift;
	my $newrevid;

	if ($summary eq EMPTY_MESSAGE) {
		$summary = '';
	}

	my $new_sha1 = $diff_info_split[3];
	my $old_sha1 = $diff_info_split[2];
	my $page_created = ($old_sha1 eq NULL_SHA1);
	my $page_deleted = ($new_sha1 eq NULL_SHA1);
	$complete_file_name = mediawiki_clean_filename($complete_file_name);

	my ($title, $extension) = $complete_file_name =~ /^(.*)\.([^\.]*)$/;
	if (!defined($extension)) {
		$extension = "";
	}
	if ($extension eq "mw") {
		my $ns = get_mw_namespace_id_for_page($complete_file_name);
		if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) {
			print STDERR "Ignoring media file related page: $complete_file_name\n";
			return ($oldrevid, "ok");
		}
		my $file_content;
		if ($page_deleted) {
			# Deleting a page usually requires
			# special privileges. A common
			# convention is to replace the page
			# with this content instead:
			$file_content = DELETED_CONTENT;
		} else {
			$file_content = run_git("cat-file blob $new_sha1");
		}

		mw_connect_maybe();

		my $result = $mediawiki->edit( {
			action => 'edit',
			summary => $summary,
			title => $title,
			basetimestamp => $basetimestamps{$oldrevid},
			text => mediawiki_clean($file_content, $page_created),
		}, {
			skip_encoding => 1 # Helps with names with accented characters
		});
		if (!$result) {
			if ($mediawiki->{error}->{code} == 3) {
				# edit conflicts, considered as non-fast-forward
				print STDERR 'Warning: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details} .
					".\n";
				return ($oldrevid, "non-fast-forward");
			} else {
				# Other errors. Shouldn't happen => just die()
				die 'Fatal: Error ' .
					$mediawiki->{error}->{code} .
					' from mediawiki: ' . $mediawiki->{error}->{details};
			}
		}
		$newrevid = $result->{edit}->{newrevid};
		print STDERR "Pushed file: $new_sha1 - $title\n";
	} elsif ($export_media) {
		$newrevid = mw_upload_file($complete_file_name, $new_sha1,
					   $extension, $page_deleted,
					   $summary);
	} else {
		print STDERR "Ignoring media file $title\n";
	}
	$newrevid = ($newrevid or $oldrevid);
	return ($newrevid, "ok");
}

sub mw_push {
	# multiple push statements can follow each other
	my @refsspecs = (shift, get_more_refs("push"));
	my $pushed;
	for my $refspec (@refsspecs) {
		my ($force, $local, $remote) = $refspec =~ /^(\+)?([^:]*):([^:]*)$/
			or die("Invalid refspec for push. Expected <src>:<dst> or +<src>:<dst>");
		if ($force) {
			print STDERR "Warning: forced push not allowed on a MediaWiki.\n";
		}
		if ($local eq "") {
			print STDERR "Cannot delete remote branch on a MediaWiki\n";
			print STDOUT "error $remote cannot delete\n";
			next;
		}
		if ($remote ne "refs/heads/master") {
			print STDERR "Only push to the branch 'master' is supported on a MediaWiki\n";
			print STDOUT "error $remote only master allowed\n";
			next;
		}
		if (mw_push_revision($local, $remote)) {
			$pushed = 1;
		}
	}

	# Notify Git that the push is done
	print STDOUT "\n";

	if ($pushed && $dumb_push) {
		print STDERR "Just pushed some revisions to MediaWiki.\n";
		print STDERR "The pushed revisions now have to be re-imported, and your current branch\n";
		print STDERR "needs to be updated with these re-imported commits. You can do this with\n";
		print STDERR "\n";
		print STDERR " git pull --rebase\n";
		print STDERR "\n";
	}
}

sub mw_push_revision {
	my $local = shift;
	my $remote = shift; # actually, this has to be "refs/heads/master" at this point.
	my $last_local_revid = get_last_local_revision();
	print STDERR ".\n"; # Finish sentence started by get_last_local_revision()
	my $last_remote_revid = get_last_remote_revision();
	my $mw_revision = $last_remote_revid;

	# Get sha1 of commit pointed by local HEAD
	my $HEAD_sha1 = run_git("rev-parse $local 2>/dev/null"); chomp($HEAD_sha1);
	# Get sha1 of commit pointed by remotes/$remotename/master
	my $remoteorigin_sha1 = run_git("rev-parse refs/remotes/$remotename/master 2>/dev/null");
	chomp($remoteorigin_sha1);

	if ($last_local_revid > 0 &&
	    $last_local_revid < $last_remote_revid) {
		return error_non_fast_forward($remote);
	}

	if ($HEAD_sha1 eq $remoteorigin_sha1) {
		# nothing to push
		return 0;
	}

	# Get every commit in between HEAD and refs/remotes/origin/master,
	# including HEAD and refs/remotes/origin/master
	my @commit_pairs = ();
	if ($last_local_revid > 0) {
		my $parsed_sha1 = $remoteorigin_sha1;
		# Find a path from last MediaWiki commit to pushed commit
		print STDERR "Computing path from local to remote ...\n";
		my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1"));
		my %local_ancestry;
		foreach my $line (@local_ancestry) {
			if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) {
				foreach my $parent (split(' ', $parents)) {
					$local_ancestry{$parent} = $child;
				}
			} elsif (!$line =~ m/^([a-f0-9]+)/) {
				die "Unexpected output from git rev-list: $line";
			}
		}
		while ($parsed_sha1 ne $HEAD_sha1) {
			my $child = $local_ancestry{$parsed_sha1};
			if (!$child) {
				printf STDERR "Cannot find a path in history from remote commit to last commit\n";
				return error_non_fast_forward($remote);
			}
			push(@commit_pairs, [$parsed_sha1, $child]);
			$parsed_sha1 = $child;
		}
	} else {
		# No remote mediawiki revision. Export the whole
		# history (linearized with --first-parent)
		print STDERR "Warning: no common ancestor, pushing complete history\n";
		my $history = run_git("rev-list --first-parent --children $local");
		my @history = split('\n', $history);
		@history = @history[1..$#history];
		foreach my $line (reverse @history) {
			my @commit_info_split = split(/ |\n/, $line);
			push(@commit_pairs, \@commit_info_split);
		}
	}

	foreach my $commit_info_split (@commit_pairs) {
		my $sha1_child = @{$commit_info_split}[0];
		my $sha1_commit = @{$commit_info_split}[1];
		my $diff_infos = run_git("diff-tree -r --raw -z $sha1_child $sha1_commit");
		# TODO: we could detect renames, and encode them with a #redirect on the wiki.
		# TODO: for now, it's just a delete+add
		my @diff_info_list = split(/\0/, $diff_infos);
		# Keep the subject line of the commit message as mediawiki comment for the revision
		my $commit_msg = run_git("log --no-walk --format=\"%s\" $sha1_commit");
		chomp($commit_msg);
		# Push every blob
		while (@diff_info_list) {
			my $status;
			# git diff-tree -z gives an output like
			# <metadata>\0<filename1>\0
			# <metadata>\0<filename2>\0
			# and we've split on \0.
			my $info = shift(@diff_info_list);
			my $file = shift(@diff_info_list);
			($mw_revision, $status) = mw_push_file($info, $file, $commit_msg, $mw_revision);
			if ($status eq "non-fast-forward") {
				# we may already have sent part of the
				# commit to MediaWiki, but it's too
				# late to cancel it. Stop the push in
				# the middle, but still give an
				# accurate error message.
				return error_non_fast_forward($remote);
			}
			if ($status ne "ok") {
				die("Unknown error from mw_push_file()");
			}
		}
		unless ($dumb_push) {
			run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit");
			run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child");
		}
	}

	print STDOUT "ok $remote\n";
	return 1;
}

sub get_allowed_file_extensions {
	mw_connect_maybe();

	my $query = {
		action => 'query',
		meta => 'siteinfo',
		siprop => 'fileextensions'
	};
	my $result = $mediawiki->api($query);
	my @file_extensions = map $_->{ext}, @{$result->{query}->{fileextensions}};
	my %hashFile = map {$_ => 1} @file_extensions;

	return %hashFile;
}

# In-memory cache for MediaWiki namespace ids.
my %namespace_id;

# Namespaces whose id is cached in the configuration file
# (to avoid duplicates)
my %cached_mw_namespace_id;

# Return MediaWiki id for a canonical namespace name.
# Ex.: "File", "Project".
sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;

	if (!exists $namespace_id{$name}) {
		# Look at configuration file, if the record for that namespace is
		# already cached. Namespaces are stored in the form
		# "Name_of_namespace:Id_namespace", e.g. "File:6".
		my @temp = split(/[\n]/, run_git("config --get-all remote."
			. $remotename . ".namespaceCache"));
		chomp(@temp);
		foreach my $ns (@temp) {
			my ($n, $id) = split(/:/, $ns);
			if ($id eq 'notANameSpace') {
				$namespace_id{$n} = {is_namespace => 0};
			} else {
				$namespace_id{$n} = {is_namespace => 1, id => $id};
			}
			$cached_mw_namespace_id{$n} = 1;
		}
	}

	if (!exists $namespace_id{$name}) {
		print STDERR "Namespace $name not found in cache, querying the wiki ...\n";
		# NS not found => get namespace id from MW and store it in
		# configuration file.
		my $query = {
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces'
		};
		my $result = $mediawiki->api($query);

		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{id}) && defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}};
				if ($ns->{'*'}) {
					# alias (e.g. french Fichier: as alias for canonical File:)
					$namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}};
				}
			}
		}
	}

	my $ns = $namespace_id{$name};
	my $id;

	unless (defined $ns) {
		print STDERR "No such namespace $name on MediaWiki.\n";
		$ns = {is_namespace => 0};
		$namespace_id{$name} = $ns;
	}

	if ($ns->{is_namespace}) {
		$id = $ns->{id};
	}

	# Store "notANameSpace" as a special value for nonexistent namespaces
	my $store_id = ($id || 'notANameSpace');

	# Store explicitly requested namespaces on disk
	if (!exists $cached_mw_namespace_id{$name}) {
		run_git("config --add remote." . $remotename
			. ".namespaceCache \"" . $name . ":" . $store_id . "\"");
		$cached_mw_namespace_id{$name} = 1;
	}
	return $id;
}

sub get_mw_namespace_id_for_page {
	if (my ($namespace) = $_[0] =~ /^([^:]*):/) {
		return get_mw_namespace_id($namespace);
	} else {
		return;
	}
}
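
# Example setup (illustrative only; the remote name "origin", the URL and
# the page/category names below are placeholders):
#
#   git clone mediawiki::http://example.com/wiki
#   cd wiki
#   git config remote.origin.pages "Main_Page Help:Contents"
#   git config remote.origin.categories "Documentation"
#   git pull    # import new wiki revisions
#   git push    # export local commits as wiki edits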