Last active
August 29, 2015 13:57
-
-
Save toniher/9568414 to your computer and use it in GitHub Desktop.
Catalan culture challenge Wikipedia 2014 http://en.wikipedia.org/wiki/Wikipedia:Catalan_culture_challenge/list script for checking page length in English Wikipedia and coverage in other Wikimedia sister projects.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl -w | |
use MediaWiki::Bot; | |
use LWP::Simple qw(get); | |
use JSON qw(from_json); | |
use URI::Escape; | |
use Data::Dumper; | |
use utf8; | |
# Main script: read the challenge list from English Wikipedia and print one
# tab-separated row per listed article (title, sitelink count, page lengths).
binmode STDOUT, ":utf8";

# Placeholder credentials; replace them before running.
my $user = 'xxx';
my $pass = 'xxx';

#Create a MediaWiki::Bot object
my $enreader = MediaWiki::Bot->new({
    host       => 'en.wikipedia.org',
    login_data => { username => $user, password => $pass },
});

# Read wikisites correspondences (database name => base URL).
# %sites stays a file-level lexical because the subs below read it.
my $urlsites = "http://www.wikidata.org/w/api.php?action=sitematrix&format=json";
my $sitematrix_json = get($urlsites);
die "Could not fetch the site matrix from $urlsites\n"
    unless defined $sitematrix_json;
my %sites = get_sites( from_json($sitematrix_json) );
#print Dumper(%sites);

# Gotta get the list
my $page = $enreader->get_text("Wikipedia:Catalan_culture_challenge/list");
die "Could not read Wikipedia:Catalan_culture_challenge/list\n"
    unless defined $page;

print "Article\tNumber iw\tList iw\n";

foreach my $line ( split /\n/, $page ) {

    # Only numbered-list items ("# [[Title]] ...") name an article.
    next unless $line =~ /^\#/;

    # Take the target of the first wikilink only: stop at "|" (display label)
    # or "#" (section anchor) and trim surrounding blanks, so that
    # "[[ Joan Alcover | Alcover ]]" yields "Joan Alcover".
    my ($entry) = $line =~ /\[\[\s*([^\]\|\#]+?)\s*(?:[\|\#][^\]]*)?\]\]/;

    # A list item without any wikilink has nothing to look up.
    next unless defined $entry;

    #my $entry = "Joan Alcover";
    process_page($entry);
}
# Build a lookup table from wiki database name to base URL out of a decoded
# "action=sitematrix" API response.
#
# Parameters:
#   $object - hash reference with the decoded JSON; the data is read from its
#             "sitematrix" member.
# Returns:
#   A flat hash (list of pairs) mapping every site's "dbname" to its "url".
#   Empty when there is no usable "sitematrix" member.
sub get_sites {
    my $object = shift;
    my %sites;

    my $matrix = ref($object) eq 'HASH' ? $object->{"sitematrix"} : undef;
    return %sites unless ref($matrix) eq 'HASH';

    foreach my $entry ( keys %{$matrix} ) {
        my $group = $matrix->{$entry};
        my $list;

        if ( ref($group) eq 'HASH' && ref( $group->{"site"} ) eq 'ARRAY' ) {
            # A language group: its sites are listed under "site".
            $list = $group->{"site"};
        }
        elsif ( $entry eq 'specials' && ref($group) eq 'ARRAY' ) {
            # The "specials" member is itself a plain list of site records;
            # without it, sitelinks to those wikis can never be resolved.
            $list = $group;
        }
        else {
            # Anything else (for instance the scalar "count") is not a site.
            next;
        }

        foreach my $site ( @{$list} ) {
            next unless ref($site) eq 'HASH' && defined $site->{"dbname"};
            $sites{ $site->{"dbname"} } = $site->{"url"};
        }
    }

    return %sites;
}
# Download a URL and decode its body as JSON.
# Returns the decoded structure, or undef (after a warning) when either the
# download or the decoding fails, so one bad request does not end the run.
sub _fetch_json {
    my $url = shift;

    my $content = get($url);
    unless ( defined $content ) {
        warn "Could not fetch $url\n";
        return;
    }

    my $decoded = do { local $@; eval { from_json($content) } };
    warn "Could not decode JSON from $url\n" unless defined $decoded;
    return $decoded;
}

# Print one report row for an article of the list:
#   <resolved title> TAB <sitelink count minus one> TAB <lengths per site>
#
# Parameters:
#   $entry - article title as written in the list.
# Side effects:
#   Queries en.wikipedia.org and www.wikidata.org, prints to STDOUT and then
#   sleeps three seconds before returning.
sub process_page {
    my $entry = shift;

    # Nothing to look up when the caller could not extract a title.
    unless ( defined $entry && length $entry ) {
        warn "process_page: no title given, skipping\n";
        return;
    }

    # Ask English Wikipedia about the title; the answer is only used to
    # learn whether the title is a redirect.
    my $url = "http://en.wikipedia.org/w/api.php?action=query&titles=".uri_escape_utf8($entry)."&prop=info&format=json&redirects";
    my $jsonobj = _fetch_json($url);

    # Fall back to the title as written when the lookup failed.
    my $name = defined $jsonobj ? get_redirect( $jsonobj, $entry ) : $entry;
    $name = $entry unless defined $name;
    print $name, "\t";

    # Sitelinks of the Wikidata item attached to the English article.
    my $wikidata_url = "http://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=".uri_escape_utf8($name)."&languages=en&format=json";
    my $wikidata = _fetch_json($wikidata_url);
    my %listiw = defined $wikidata ? get_iw($wikidata) : ();

    # NOTE(review): $#listiw is the last index, i.e. the number of sitelinks
    # minus one (-1 when there are none). Presumably this discounts the
    # enwiki link itself - confirm before changing.
    my ( @listiw ) = keys %listiw;
    print $#listiw, "\t";

    #print process_iw(@listiw), "\t";
    my $lengths = process_iw_len( \%listiw );
    print $lengths, "\n";

    # Be gentle with the APIs between articles.
    sleep(3);
}
# Return the page a title finally redirects to, or the title itself.
#
# Parameters:
#   $object - decoded "action=query&redirects" response; the redirect steps
#             are read from $object->{query}{redirects}, a list of records
#             with "from" and "to" members.
#   $entry  - the title that was queried.
# Returns:
#   The "to" of the last step of the redirect chain, or $entry when the
#   response lists no redirects.
sub get_redirect {
    my $object = shift;
    my $entry  = shift;

    my $query = ref($object) eq 'HASH' ? $object->{"query"} : undef;
    my $redirects = ref($query) eq 'HASH' ? $query->{"redirects"} : undef;
    return $entry unless ref($redirects) eq 'ARRAY' && @{$redirects};

    # Every title that is itself redirected further.
    my %is_source;
    foreach my $hop ( @{$redirects} ) {
        $is_source{ $hop->{"from"} } = 1 if defined $hop->{"from"};
    }

    # The final target is the one "to" that is not the start of another
    # step; this does not depend on the order of the steps in the list.
    foreach my $hop ( @{$redirects} ) {
        my $target = $hop->{"to"};
        next unless defined $target;
        return $target unless $is_source{$target};
    }

    # Every target redirects again (a loop): keep the first step's target.
    my $first = $redirects->[0]->{"to"};
    return defined $first ? $first : $entry;
}
# Return interwiki list
#
# Parameters:
#   $object - decoded "action=wbgetentities" response; entities are read
#             from its "entities" member.
# Returns:
#   A flat hash (list of pairs) with the "sitelinks" of the first entity, in
#   sorted id order, that carries a sitelinks hash; an empty list when no
#   entity has one (for example when the title has no item).
sub get_iw {
    my $object = shift;

    my $entities = ref($object) eq 'HASH' ? $object->{"entities"} : undef;
    return () unless ref($entities) eq 'HASH';

    foreach my $id ( sort keys %{$entities} ) {
        my $entity = $entities->{$id};
        next unless ref($entity) eq 'HASH';

        # Only dereference sitelinks when it really is a hash; an entity
        # record without one would otherwise be dereferenced as undef.
        my $sitelinks = $entity->{"sitelinks"};
        return %{$sitelinks} if ref($sitelinks) eq 'HASH';
    }

    return ();
}
# Return list locales
#
# Takes a list of site database names, keeps those present in the file-level
# %sites table, removes every "wiki" substring from each kept name and
# returns them sorted and joined with ", ".
sub process_iw {
    my @codes;

    for my $dbname ( sort @_ ) {
        next unless defined( $sites{$dbname} );

        # Work on a copy so the caller's values are never touched.
        my $code = $dbname;
        $code =~ s/wiki//g;
        push @codes, $code;
    }

    return join( ", ", @codes );
}
#Return list lengths
#
# Parameters:
#   $iwhash - hash reference of sitelink records, each with "site" and
#             "title" members.
# Returns:
#   The get_length() result for every record, visited in sorted key order
#   and joined with ", ".
sub process_iw_len {
    my ($iwhash) = @_;

    my @lengths = map {
        my $title = $iwhash->{$_}->{"title"};
        my $site  = $iwhash->{$_}->{"site"};

        # get_length is called in scalar context, one value per sitelink.
        scalar get_length( $site, $title );
    } sort keys %{$iwhash};

    return join( ", ", @lengths );
}
# Length of page
#
# Parameters:
#   $site  - site database name, looked up in the file-level %sites table to
#            find the base URL of that wiki.
#   $entry - page title on that wiki.
# Returns:
#   "<site>:<length>" with the "length" reported by the wiki's query API,
#   "<site>:0"  when the API lists the page under the "-1" key,
#   "<site>:-1" when the site is unknown, the request or the JSON decoding
#               fails, or no length is reported.
sub get_length {
    my $site  = shift;
    my $entry = shift;

    my $label  = defined $site ? $site : "";
    my $failed = $label . ":-1";

    return $failed unless defined $site && defined( $sites{$site} );

    my $url = $sites{$site}."/w/api.php?action=query&titles=".uri_escape_utf8($entry)."&prop=info&format=json&redirects";

    # get() returns undef on failure and from_json() dies on bad input, so
    # check the first and trap the second; otherwise one unreachable wiki
    # would abort the whole report.
    my $content = get($url);
    return $failed unless defined $content;

    my $object = do { local $@; eval { from_json($content) } };
    return $failed unless ref($object) eq 'HASH';

    my $query = $object->{"query"};
    my $pages = ref($query) eq 'HASH' ? $query->{"pages"} : undef;
    return $failed unless ref($pages) eq 'HASH';

    # A "-1" page record is how the API answers for a title it does not have.
    return $label . ":0" if $pages->{"-1"};

    # One title was requested, so the first page record is the answer.
    foreach my $id ( sort keys %{$pages} ) {
        my $length = ref( $pages->{$id} ) eq 'HASH' ? $pages->{$id}->{"length"} : undef;
        return defined $length ? $label . ":" . $length : $failed;
    }

    # No page records at all.
    return $failed;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment