From 5d9798ae622bc51a877a93cae25c53c062854401 Mon Sep 17 00:00:00 2001 From: Kevin Date: Mon, 6 Nov 2017 16:19:47 -0500 Subject: remote-mediawiki: add namespace support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a new remote.origin.namespaces argument that is a space-separated list of namespaces. The list of pages to extract is then taken from all the specified namespaces. Reviewed-by: Antoine Beaupré Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index e7f857c..5ffb575 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -63,6 +63,10 @@ chomp(@tracked_pages); my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.${remotename}.categories")); chomp(@tracked_categories); +# Just like @tracked_categories, but for MediaWiki namespaces. +my @tracked_namespaces = split(/[ \n]/, run_git("config --get-all remote.${remotename}.namespaces")); +chomp(@tracked_namespaces); + # Import media files on pull my $import_media = run_git("config --get --bool remote.${remotename}.mediaimport"); chomp($import_media); @@ -256,6 +260,23 @@ sub get_mw_tracked_categories { return; } +sub get_mw_tracked_namespaces { + my $pages = shift; + foreach my $local_namespace (@tracked_namespaces) { + my $mw_pages = $mediawiki->list( { + action => 'query', + list => 'allpages', + apnamespace => get_mw_namespace_id($local_namespace), + aplimit => 'max' } ) + || die $mediawiki->{error}->{code} . ': ' + . $mediawiki->{error}->{details} . "\n"; + foreach my $page (@{$mw_pages}) { + $pages->{$page->{title}} = $page; + } + } + return; +} + sub get_mw_all_pages { my $pages = shift; # No user-provided list, get the list of pages from the API. 
@@ -319,6 +340,10 @@ sub get_mw_pages { $user_defined = 1; get_mw_tracked_categories(\%pages); } + if (@tracked_namespaces) { + $user_defined = 1; + get_mw_tracked_namespaces(\%pages); + } if (!$user_defined) { get_mw_all_pages(\%pages); } -- cgit v0.10.2-6-g49f6 From cc92338004d6f3e613c9446ccdcddf11433be85f Mon Sep 17 00:00:00 2001 From: Ingo Ruhnke Date: Mon, 6 Nov 2017 16:19:48 -0500 Subject: remote-mediawiki: allow fetching namespaces with spaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We still want to use spaces as separators in the config, but we should allow the user to specify namespaces with spaces, so we use underscore for this. Reviewed-by: Antoine Beaupré Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index 5ffb575..a1d7837 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -65,6 +65,7 @@ chomp(@tracked_categories); # Just like @tracked_categories, but for MediaWiki namespaces. my @tracked_namespaces = split(/[ \n]/, run_git("config --get-all remote.${remotename}.namespaces")); +for (@tracked_namespaces) { s/_/ /g; } chomp(@tracked_namespaces); # Import media files on pull -- cgit v0.10.2-6-g49f6 From 09eebbadca58dbf8e415678eeef8de54c0be6534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Tue, 7 Nov 2017 11:06:57 -0500 Subject: remote-mediawiki: show known namespace choices on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we fail to find a requested namespace, we should tell the user which ones we know about, since those were already fetched. This allows users to fetch all namespaces by specifying a dummy namespace, failing, then copying the list of namespaces in the config. 
Eventually, we should have a flag that allows fetching all namespaces automatically. Reviewed-by: Antoine Beaupré Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index a1d7837..5e88458 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -1334,7 +1334,8 @@ sub get_mw_namespace_id { my $id; if (!defined $ns) { - print {*STDERR} "No such namespace ${name} on MediaWiki.\n"; + my @namespaces = map { s/ /_/g; $_; } sort keys %namespace_id; + print {*STDERR} "No such namespace ${name} on MediaWiki, known namespaces: @namespaces\n"; $ns = {is_namespace => 0}; $namespace_id{$name} = $ns; } -- cgit v0.10.2-6-g49f6 From db3364352da98f20915e1b838616688f388fad8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Tue, 7 Nov 2017 11:06:58 -0500 Subject: remote-mediawiki: skip virtual namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Virtual namespaces do not correspond to pages in the database and are automatically generated by MediaWiki. It makes little sense, therefore, to fetch pages from those namespaces and the MW API doesn't support listing those pages. According to the documentation, those virtual namespaces are currently "Special" (-1) and "Media" (-2) but we treat all negative namespaces as "virtual" as a future-proofing mechanism. 
Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index 5e88458..611a04c 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -264,10 +264,13 @@ sub get_mw_tracked_categories { sub get_mw_tracked_namespaces { my $pages = shift; foreach my $local_namespace (@tracked_namespaces) { + my $namespace_id = get_mw_namespace_id($local_namespace); + # virtual namespaces don't support allpages + next if !defined($namespace_id) || $namespace_id < 0; my $mw_pages = $mediawiki->list( { action => 'query', list => 'allpages', - apnamespace => get_mw_namespace_id($local_namespace), + apnamespace => $namespace_id, aplimit => 'max' } ) || die $mediawiki->{error}->{code} . ': ' . $mediawiki->{error}->{details} . "\n"; -- cgit v0.10.2-6-g49f6 From da2a180977166aedafb34ed7eb68edf9e488b377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Tue, 7 Nov 2017 11:06:59 -0500 Subject: remote-mediawiki: support fetching from (Main) namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we specify a list of namespaces to fetch from, by default the MW API will not fetch from the default namespace, referred to as "(Main)" in the documentation: https://www.mediawiki.org/wiki/Manual:Namespace#Built-in_namespaces I haven't found a way to address that "(Main)" namespace when getting the namespace ids: indeed, when listing namespaces, there is no "canonical" field for the main namespace, although there is a "*" field that is set to "" (empty). So in theory, we could specify the empty namespace to get the main namespace, but that would make specifying namespaces harder for the user: we would need to teach users about the "empty" default namespace. It would also make the code more complicated: we'd need to parse quotes in the configuration. 
So we simply override the query here and allow the user to specify "(Main)" since that is the publicly documented name. Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index 611a04c..0e60b85 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -264,7 +264,12 @@ sub get_mw_tracked_categories { sub get_mw_tracked_namespaces { my $pages = shift; foreach my $local_namespace (@tracked_namespaces) { - my $namespace_id = get_mw_namespace_id($local_namespace); + my $namespace_id; + if ($local_namespace eq "(Main)") { + $namespace_id = 0; + } else { + $namespace_id = get_mw_namespace_id($local_namespace); + } # virtual namespaces don't support allpages next if !defined($namespace_id) || $namespace_id < 0; my $mw_pages = $mediawiki->list( { -- cgit v0.10.2-6-g49f6 From 55fefa9e94e0633cdd85f6cded42384cbeaf8a61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Tue, 7 Nov 2017 11:07:00 -0500 Subject: remote-mediawiki: process namespaces in order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ideally, we'd process them in numeric order since that is more logical, but we can't do that yet since this is where we find the numeric identifiers in the first place. Lexicographic order is a good compromise. 
Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index 0e60b85..c9f4635 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -263,7 +263,7 @@ sub get_mw_tracked_categories { sub get_mw_tracked_namespaces { my $pages = shift; - foreach my $local_namespace (@tracked_namespaces) { + foreach my $local_namespace (sort @tracked_namespaces) { my $namespace_id; if ($local_namespace eq "(Main)") { $namespace_id = 0; -- cgit v0.10.2-6-g49f6 From 94c9acbf0025d5214c8efcf11389536759410dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Tue, 7 Nov 2017 11:07:01 -0500 Subject: remote-mediawiki: show progress while fetching namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, the fetch process appears to hang while we fetch page listings across the namespaces. Obviously, it should be possible to silence this with -q, but that's an issue already present everywhere in the code and should be fixed separately: https://github.com/Git-Mediawiki/Git-Mediawiki/issues/30 Signed-off-by: Antoine Beaupré Signed-off-by: Junio C Hamano diff --git a/contrib/mw-to-git/git-remote-mediawiki.perl b/contrib/mw-to-git/git-remote-mediawiki.perl index c9f4635..af9cbc9 100755 --- a/contrib/mw-to-git/git-remote-mediawiki.perl +++ b/contrib/mw-to-git/git-remote-mediawiki.perl @@ -279,6 +279,7 @@ sub get_mw_tracked_namespaces { aplimit => 'max' } ) || die $mediawiki->{error}->{code} . ': ' . $mediawiki->{error}->{details} . "\n"; + print {*STDERR} "$#{$mw_pages} found in namespace $local_namespace ($namespace_id)\n"; foreach my $page (@{$mw_pages}) { $pages->{$page->{title}} = $page; } -- cgit v0.10.2-6-g49f6