User:Bp/How to get a database dump

Wikia provides database dumps of MA at pages_current.xml.gz and pages_full.xml.gz, but they're usually way out of date. Here is a Perl script to get one using the MediaWiki API and Special:Export. Yes, I know that I'm parsing XML with a regex, but I don't care.
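
The core of it is just a form POST to Special:Export with a newline-separated list of page titles. A stripped-down sketch of a single request, using the same endpoint and parameters as the full script (the two page titles are only examples):

#!/usr/bin/perl
# Minimal sketch: export a couple of pages by hand via Special:Export.
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request::Common qw[POST];

binmode STDOUT, ':encoding(UTF-8)';

my $ua  = LWP::UserAgent->new;
my $req = POST 'http://memory-alpha.org/wiki/Special:Export', [
	action  => 'submit',
	curonly => 1,	# current revisions only; drop this for full histories
	pages   => join("\n", 'Jean-Luc Picard', 'USS Enterprise (NCC-1701-D)'),	# example titles
];
my $res = $ua->request($req);
die $res->status_line unless $res->is_success;
print $res->decoded_content;	# the <mediawiki> XML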

If any of the XML files ends with "</html>" (check using tail), or the script crashes with a 500 error, you'll need to lower the value of $pages_per_xml; a quick checker is sketched below.

See also: Memory Alpha:Bots#Taking care of broken XML export.
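
Rather than tailing every part by hand, a short checker can flag truncated files. A minimal sketch, assuming a complete export ends with </mediawiki> and that the parts use the naming of the script below:

#!/usr/bin/perl
# Flag any export part that doesn't end with "</mediawiki>" (e.g. ends with "</html>").
use strict;
use warnings;

foreach my $xml (glob '*_part*.xml') {
	open my $fh, '<', $xml or die "$xml: $!";
	seek $fh, -64, 2;	# last 64 bytes (whence 2 = SEEK_END)
	read $fh, my $tail, 64;
	close $fh;
	print "$xml looks truncated, lower \$pages_per_xml\n"
		unless defined $tail && $tail =~ m[</mediawiki>\s*$];
}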

#!/usr/bin/perl

use utf8;
use warnings;
use strict;

my $wiki = 'memory-alpha.org';
#my $wiki = 'farscape.wikia.com'; # other wiki
#my $wiki = $ARGV[0]; # use the first command line parameter as the wiki, `./get_dump.pl memory-alpha.org` for example
my $api_url = 'http://'.$wiki.'/api.php';
my $export_url = 'http://'.$wiki.'/wiki/Special:Export';
my $aplimit = 500; # number of page names in one API request; passed to the API; 500 for anon, 1000 for logged in bot
my $pages_per_xml = 10000; # number of pages in one Special:Export request
my $current_only = 1;	# 1 = pages_current, 0 = pages_full
my @namespaces = (); # leave empty for all
#my @namespaces = (0); # just the main namespace
#my @namespaces = (0,4,6,10,14); # main, ma, file, template, category

use Time::HiRes qw[time];
use LWP::UserAgent;
use LWP::ConnCache;
use HTTP::Request::Common;
use HTTP::Cookies;
use URI::Escape qw[uri_escape_utf8];
use HTML::Entities qw[decode_entities];
use Digest::MD5 qw[md5_hex];
use Encode qw[encode_utf8];
# takes a page title and returns the location of the image
sub wikia_image {
        my $page_title = shift;
        my ($fn) = $page_title =~ /^(?:Image|File):(.*)/;
        $fn = ucfirst $fn;
        $fn =~ s/ /_/g;
        # MediaWiki stores uploads under <x>/<xy>/ subdirectories taken from the md5 hex of the filename
        my ($h1,$h2) = ( lc md5_hex encode_utf8($fn) ) =~ /^([a-z0-9])([a-z0-9])/i;
        return "$h1/$h1$h2/" . uri_escape_utf8($fn);
}

my $stm = time;

my $wiki_fn = $wiki;
$wiki_fn =~ s~[^\w\.\-]~_~g;

my $br = LWP::UserAgent->new;
$br->conn_cache(LWP::ConnCache->new());
$br->agent("ma_dump/1.2");
$br->cookie_jar(HTTP::Cookies->new(file => $wiki_fn.'_cookies.txt', autosave => 1, ignore_discard => 1));

my %namespace_info = (0 => '<Main>');
print "Getting namespace list...\n";
my $res = $br->get ( $api_url . '?action=query&meta=siteinfo&siprop=namespaces&format=xml' );
if ($res->is_success) {
        my @nsi = $res->decoded_content =~ m#<ns id="-?\d+"[^/]*?>.*?</ns>#g;
        foreach(@nsi) {
                my ($nsid, $nst) = m#<ns id="(-?\d+)".*?>(.*?)</ns>#g;
                $namespace_info{$nsid} = $nst;
        }
} else {
        die $res->status_line." on fetch namespaces";
}
@namespaces = sort {$a <=> $b} keys %namespace_info unless @namespaces;

my @pages;
print "Getting page list...\n";
foreach my $ns (@namespaces) {
        next if $ns < 0;
	my $apfrom;
	do {
		my $url = $api_url . "?action=query&list=allpages&apnamespace=$ns&aplimit=$aplimit&format=xml" .
			( defined $apfrom ? '&apfrom=' . uri_escape_utf8(decode_entities($apfrom)) : '' );

		undef $apfrom;
		my $res = $br->get($url);
		if ($res->is_success) {
			push @pages, $res->decoded_content =~ m#<p pageid="\d+" ns="\d+" title="(.*?)" />#g;
			($apfrom) = $res->decoded_content =~ m#<allpages apfrom="(.*?)" />#;
		} else {
			die $res->status_line." on $url";
		}
		
	} while defined $apfrom;
	print "Done with ns-$ns ($namespace_info{$ns}), now have ",scalar @pages," page(s).\n";
}

my @images = grep /^(File|Image):/, @pages; # cheap way to do it
if ( @images && open ILST, '>', sprintf('%s_image_list.txt', $wiki_fn) ) {
        print ILST wikia_image(decode_entities $_)."\n" foreach @images;
        close ILST;
}

printf "%d page(s) to fetch, %d at a time, %d part(s) expected...\n", scalar @pages, $pages_per_xml, map( int( /^\d+$/ ? $_ : $_+1 ), @pages / $pages_per_xml );

my %export_parms = (
	action => 'submit',
	curonly => 1,
);
delete $export_parms{curonly} unless $current_only;

my $part = 0;
while (@pages) {
	$part++;
	
	$export_parms{pages} = join("\n", map(decode_entities($_), splice(@pages, 0, $pages_per_xml) ) );
	
	my $req = new HTTP::Request POST => $export_url;
	$req->content_type('application/x-www-form-urlencoded');
	$req->content( join('&', map(sprintf('%s=%s', $_, uri_escape_utf8($export_parms{$_}) ), keys %export_parms) ) );
	
	my $xml_file = sprintf '%s_pages_%s_hard_part%03d.xml', $wiki_fn, $export_parms{curonly} ? 'current' : 'full', $part;
	my $res = $br->request($req, $xml_file); # giving request() a filename streams the response body straight to that file
	
	if ($res->is_success) {
		print "OK. $xml_file ",-s $xml_file," bytes.\n";
	} else {
		die $res->decoded_content, "\n*** ", $res->status_line, "\n";
	}
	
}

print time-$stm," second(s). $part part(s).\n";

Downloading Images

The script above will create an image list file that can be used with something like puf to fetch all the images. For example:

puf -P ./ma-images -B http://images1.wikia.nocookie.net/memoryalpha/en/images/ -i memory-alpha.org_image_list.txt
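
If puf isn't available, a rough LWP stand-in can walk the same list; a sketch, assuming the base URL above and the memory-alpha.org list file the script writes:

#!/usr/bin/perl
# Fetch every path in the image list into ./ma-images/, mirroring the hashed directories.
use strict;
use warnings;
use LWP::UserAgent;
use File::Path qw[make_path];
use File::Basename qw[dirname];

my $base = 'http://images1.wikia.nocookie.net/memoryalpha/en/images/';
my $ua   = LWP::UserAgent->new;

open my $lst, '<', 'memory-alpha.org_image_list.txt' or die $!;
while (my $path = <$lst>) {
	chomp $path;
	next unless length $path;
	my $out = "ma-images/$path";	# local names keep the %XX escaping from the list
	make_path(dirname($out));
	my $res = $ua->get($base.$path, ':content_file' => $out);
	warn "failed $path: ".$res->status_line."\n" unless $res->is_success;
}
close $lst;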
