Skip to content

Instantly share code, notes, and snippets.

@njh
Created June 1, 2011 07:24
Show Gist options
  • Select an option

  • Save njh/1001922 to your computer and use it in GitHub Desktop.

Select an option

Save njh/1001922 to your computer and use it in GitHub Desktop.
Perl script to fetch abstracts from the Wikipedia API
#!/usr/bin/env plackup
use LWP::UserAgent;
use XML::XPath;
use URI::Escape;
use Dancer;
use HTML::TreeBuilder;
use HTML::Entities;
use Time::Piece;
use strict;
use warnings;
use utf8;
# extract_abstract() stops collecting paragraphs once the abstract has grown
# past this many characters.
my $ABSTRACT_MAX_LENGTH = 1000;

# Hard upper bound: an abstract longer than this is cut down and finished
# with an ellipsis.
my $ABSTRACT_TRUNCATE_LENGTH = 2000;

# HTTP client shared by every request to the Wikipedia API; give up on a
# request after 5 seconds.
my $ua = LWP::UserAgent->new(timeout => 5);
# Fetch the rendered HTML of a Wikipedia article via the MediaWiki "parse"
# API (XML format, following redirects).
#
# Parameters:
#   $lang  - language code used as the host prefix ("en", "de", ...); every
#            non-word character is stripped before it is put in the URL.
#   $title - article title; spaces end up as underscores in the page key.
#            Assumed to be a decoded character string (which is what Dancer
#            hands over when a charset is configured) - verify for any other
#            caller.
#
# Returns a hashref with the keys 'url', 'title', 'revid' and 'text' (the
# rendered HTML), or undef when an argument is missing/unusable, the HTTP
# request fails, the API reports an error, or the reply cannot be parsed.
# An explicit undef is returned (rather than a bare return) so the result is
# always exactly one scalar, as before.
sub get_wikipedia_page {
    my ($lang, $title) = @_;
    return undef unless defined $lang && defined $title;

    # uri_escape() croaks on characters above U+00FF and emits Latin-1
    # escapes for U+0080..U+00FF; the UTF-8 variant handles both correctly.
    my $key = uri_escape_utf8($title);
    $key =~ s/%20/_/g;

    # Only word characters may reach the host name.
    $lang =~ s/\W//g;
    return undef unless length $lang;

    my $url = "http://$lang.wikipedia.org/w/api.php?action=parse&prop=text|displaytitle|revid&redirects=1&format=xml&page=$key";
    warn "$url\n";

    my $response = $ua->get($url);
    return undef unless $response->is_success();

    # XML::XPath parses lazily and dies on malformed input, so keep every
    # XPath access inside the eval.
    my $page = eval {
        my $xp = XML::XPath->new( xml => $response->content() );

        # API-level failures (e.g. a missing page) arrive as a successful
        # HTTP response carrying an <error> element instead of <parse>.
        my @errors = $xp->findnodes('/api/error');

        @errors
            ? undef
            : +{
                'url'   => "http://$lang.wikipedia.org/wiki/$key",
                'title' => $xp->getNodeText('/api/parse/@displaytitle'),
                'revid' => $xp->getNodeText('/api/parse/@revid'),
                'text'  => $xp->getNodeText('/api/parse/text'),
            };
    };
    warn "Could not parse Wikipedia API response: $@" if $@;

    return $page;
}
# Remove parenthesised pronunciation guides from a paragraph of article text.
#
# Takes the paragraph text and returns a copy with every matching guide
# (together with the single space that follows it) removed. The patterns are
# applied one after another, in the order listed.
sub strip_pronunciation {
    my ($text) = @_;

    my @guide_patterns = (
        # "(... pronunciation: ...) "
        qr{\(.*?pronunciation:.*?\) },
        # "(IPA: /.../) " - delimited by quotes, brackets or slashes
        qr{\(IPA: ["\[/].*?["\]/]\) },
        # "(pronounced /.../) "
        qr{\(pronounced ["\[/].*?["\]/]\) },
        # Pronunciation mixed in with a birth date, e.g.
        # "(pronounced /.../; born December 7, 1979)": drop only the guide.
        qr{pronounced ["\[/].*?["\]/]\; },
    );

    for my $pattern (@guide_patterns) {
        $text =~ s/$pattern//g;
    }

    return $text;
}
# Build a plain-text abstract from the rendered HTML of an article.
#
# Walks the top-level nodes of the HTML in order, collecting the text of each
# <p> element (with reference markers and pronunciation guides removed) until
# either more than $ABSTRACT_MAX_LENGTH characters have been gathered or a
# node containing the table of contents (id="toc") is reached. Paragraphs are
# separated by a blank line. A result longer than $ABSTRACT_TRUNCATE_LENGTH is
# cut and finished with "...".
#
# Parameters:
#   $html - HTML fragment to summarise; undef or empty yields ''.
#
# Returns the abstract as a string (possibly empty).
sub extract_abstract {
    my ($html) = @_;
    return '' unless defined $html && length $html;

    my $tree = HTML::TreeBuilder->new();
    $tree->parse_content("<html><body>".$html."</body></html>");

    my $abstract = '';
    my $body = $tree->find_by_tag_name('body');

    NODE:
    foreach my $node ($body ? $body->content_list : ()) {
        # content_list() returns text segments as plain strings; calling
        # ->tag on one of those would die, and they are never paragraphs.
        next NODE unless ref $node;

        if ($node->tag eq 'p') {
            # Remove references
            foreach my $ref ($node->look_down('class', 'reference')) {
                $ref->delete;
            }

            my $paragraph = strip_pronunciation($node->as_trimmed_text());
            $abstract .= $paragraph . "\n\n";

            # Stop if we have enough text
            last NODE if length($abstract) > $ABSTRACT_MAX_LENGTH;
        }
        elsif ($node->look_down("id", "toc")) {
            # Stop when we get to the table of contents
            last NODE;
        }
    }

    # Release the parse tree explicitly so that a long-running server does
    # not accumulate one tree per request.
    $tree->delete;

    # Remove trailing whitespace
    $abstract =~ s/\s+$//;

    # Truncate if the abstract is too long
    if (length($abstract) > $ABSTRACT_TRUNCATE_LENGTH) {
        # Leave room for the three characters of the ellipsis.
        $abstract = substr($abstract, 0, $ABSTRACT_TRUNCATE_LENGTH-3);

        # Remove trailing partial word and replace with an ellipsis
        $abstract =~ s/[^\w\s]?\s*\w*$/.../;
    }

    return $abstract;
}
# Have Dancer handle request parameters and response bodies as UTF-8.
set charset => 'utf8';

# GET /wikipedia/abstracts/:lang/:title
#
# Looks up the article :title on the :lang Wikipedia and responds with its
# abstract. The optional "output" parameter selects the representation:
#   output=html - a minimal HTML page, one <p> per paragraph
#   output=atom - an Atom feed containing a single entry
#   (anything else, or absent) - plain text
# Responds with status 502 when the page could not be fetched.
get '/wikipedia/abstracts/:lang/:title' => sub {
    my $data = get_wikipedia_page(params->{'lang'}, params->{'title'});

    # Without this check a failed fetch would be autovivified into an empty
    # hash and answered with an empty "200 OK" body.
    unless ($data) {
        status 502;
        header('Content-Type' => 'text/plain');
        return "Could not fetch the page from Wikipedia\n";
    }

    my $abstract = extract_abstract($data->{text});

    # "output" is optional; normalise it so the comparisons below do not
    # warn about an uninitialized value.
    my $output = params->{output};
    $output = '' unless defined $output;

    if ($output eq 'html') {
        my @paragraphs = split(/[\r\n]+/, $abstract);
        @paragraphs = map(encode_entities($_), @paragraphs);
        my $title = encode_entities($data->{title});
        header('Content-Type' => 'text/html');
        return "<html><head><title>$title</title></head>".
               "<body><h1>$title</h1>".
               "<p>".join('</p><p>', @paragraphs)."</p>".
               "</body></html>";
    }
    elsif ($output eq 'atom') {
        header('Content-Type' => 'application/atom+xml');
        my $t = gmtime();

        # ARGH! - can't work out how to get utf8 to behave :-(
        $abstract =~ s/[^[:ascii:]]//g;

        # Escape only the five XML special characters: the named HTML
        # entities that encode_entities() emits by default are not defined
        # in XML.
        my $xml_escape = sub {
            my ($text) = @_;
            $text = '' unless defined $text;
            return encode_entities($text, '<>&"\'');
        };
        my $title   = $xml_escape->($data->{title});
        my $url     = $xml_escape->($data->{url});
        my $content = $xml_escape->($abstract);

        return "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n".
               "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n".
               " <title>$title</title>\n".
               " <id>$url</id>\n".
               " <updated>".$t->datetime."Z</updated>\n".
               " <author><name>Wikipedia</name></author>\n".
               " <entry>\n".
               " <title>$title</title>\n".
               " <id>$url</id>\n".
               " <updated>".$t->datetime."Z</updated>\n".
               " <author><name>Wikipedia</name></author>\n".
               " <content type=\"text\">$content</content>\n".
               " <link href=\"$url\" type=\"text/html\" rel=\"alternate\" title=\"Wikipedia - $title\"/>\n".
               " </entry>\n".
               "</feed>\n";
    }
    else {
        header('Content-Type' => 'text/plain');
        return $abstract;
    }
};

# Hand control to Dancer.
start;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment