Created June 1, 2011 at 07:24
-
-
Save njh/1001922 to your computer and use it in GitHub Desktop.
Perl script to fetch abstracts from the Wikipedia API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env plackup | |
| use LWP::UserAgent; | |
| use XML::XPath; | |
| use URI::Escape; | |
| use Dancer; | |
| use HTML::TreeBuilder; | |
| use HTML::Entities; | |
| use Time::Piece; | |
| use strict; | |
| use warnings; | |
| use utf8; | |
# Soft limit: stop collecting further paragraphs once the abstract has
# grown past this many characters (the paragraph that crosses the limit
# is still kept in full).
my $ABSTRACT_MAX_LENGTH = 1000;
# Hard limit: abstracts longer than this are truncated and given a
# trailing ellipsis. Deliberately larger than the soft limit so a long
# final paragraph is usually kept intact.
my $ABSTRACT_TRUNCATE_LENGTH = 2000;
# Shared HTTP client for all Wikipedia API requests; short timeout so a
# slow upstream doesn't hang the request handler.
my $ua = LWP::UserAgent->new;
$ua->timeout(5);
# get_wikipedia_page($lang, $title)
#
# Fetch the rendered HTML of a Wikipedia article via the MediaWiki
# action=parse API. Returns a hashref with keys url/title/revid/text on
# success; returns nothing (empty list / undef) on HTTP failure or when
# the API reports an error (e.g. a missing page).
sub get_wikipedia_page {
    my ($lang, $title) = @_;

    # Canonical page key: percent-encode the title, then use underscores
    # for spaces as Wikipedia URLs do.
    my $key = uri_escape($title);
    $key =~ s/%20/_/g;

    # Sanitize the language code so it is safe to interpolate into a hostname.
    $lang =~ s/\W//g;

    my $url = "http://$lang.wikipedia.org/w/api.php?action=parse&prop=text|displaytitle|revid&redirects=1&format=xml&page=$key";
    warn "$url\n";

    my $response = $ua->get($url);
    # Bare "return" (not "return undef") so list-context callers get an
    # empty list rather than a one-element (undef) list.
    return unless $response->is_success();

    my $xp = XML::XPath->new( xml => $response->content() );

    # The API signals failures (missing page, bad title) with an <error>
    # element and HTTP 200, so check for it explicitly.
    return if $xp->findnodes('/api/error')->size;

    return {
        'url'   => "http://$lang.wikipedia.org/wiki/$key",
        'title' => $xp->getNodeText('/api/parse/@displaytitle'),
        'revid' => $xp->getNodeText('/api/parse/@revid'),
        'text'  => $xp->getNodeText('/api/parse/text'),
    };
}
# strip_pronunciation($text)
#
# Return a copy of $text with parenthesised pronunciation guides removed,
# e.g. "(IPA: [...]) ", "(pronounced /.../) ", and the form where the
# pronunciation shares parentheses with a birthdate:
# "(pronounced /.../; born December 7, 1979)".
sub strip_pronunciation {
    my ($copy) = @_;

    # Applied in order; each pattern deletes the guide and its trailing
    # space/semicolon separator.
    for my $guide (
        qr|\(.*?pronunciation:.*?\) |,
        qr|\(IPA: ["\[/].*?["\]/]\) |,
        qr|\(pronounced ["\[/].*?["\]/]\) |,
        qr|pronounced ["\[/].*?["\]/]\; |,
    ) {
        $copy =~ s/$guide//g;
    }

    return $copy;
}
# extract_abstract($html)
#
# Build a plain-text abstract from the article body HTML returned by the
# parse API: concatenate the text of top-level <p> elements (with
# citation markers and pronunciation guides stripped) until either the
# soft length limit is passed or the table of contents is reached, then
# trim and hard-truncate the result.
sub extract_abstract {
    my ($html) = @_;
    my $abstract = '';
    my $tree = HTML::TreeBuilder->new();
    # Wrap the fragment so the parser sees a well-formed document.
    $tree->parse_content("<html><body>".$html."</body></html>");
    my $body = $tree->find_by_tag_name('body');
    foreach my $node ($body->content_list) {
        if ($node->tag eq 'p') {
            # Remove references (the superscript [1]-style citation links,
            # which carry class="reference" in Wikipedia markup).
            # FIXME: better way to do this?
            foreach my $ref ($node->look_down('class', 'reference')) {
                $ref->delete;
            }
            my $paragraph = strip_pronunciation($node->as_trimmed_text());
            $abstract .= $paragraph . "\n\n";
            # Stop if we have enough text; the paragraph that crosses the
            # soft limit is kept in full.
            last if (length($abstract) > $ABSTRACT_MAX_LENGTH);
        } else {
            # Stop when we get to the table of contents -- everything
            # before the TOC is the article's lead section.
            last if ($node->look_down("id", "toc"));
        }
    }
    # Remove trailing whitespace (the final paragraph separator).
    $abstract =~ s/\s+$//;
    # Truncate if the abstract is too long
    if (length($abstract) > $ABSTRACT_TRUNCATE_LENGTH) {
        # FIXME: multibyte characters? (substr counts chars only if the
        # string carries the utf8 flag -- NOTE(review): verify upstream
        # decoding, otherwise this can split a multibyte sequence)
        $abstract = substr($abstract, 0, $ABSTRACT_TRUNCATE_LENGTH-3);
        # Remove trailing partial word and replace with an ellipsis
        $abstract =~ s/[^\w\s]?\s*\w*$/.../;
    }
    return $abstract;
}
set charset => 'utf8';

# GET /wikipedia/abstracts/:lang/:title
#
# Render the article's abstract. The optional "output" query parameter
# selects the representation: "html", "atom", or (default) plain text.
get '/wikipedia/abstracts/:lang/:title' => sub {
    my $data = get_wikipedia_page(params->{'lang'}, params->{'title'});

    # The fetch can fail (network error, missing page); answer 404
    # instead of crashing on an undefined hashref.
    unless (defined $data && defined $data->{text}) {
        status 404;
        header('Content-Type' => 'text/plain');
        return "Not found\n";
    }

    my $abstract = extract_abstract($data->{text});

    # Default to plain text; also avoids an uninitialized-value warning
    # when the "output" parameter is absent.
    my $output = params->{'output'} || '';

    if ($output eq 'html') {
        my @paragraphs = split(/[\r\n]+/, $abstract);
        @paragraphs = map(encode_entities($_), @paragraphs);
        my $title = encode_entities($data->{title});
        header('Content-Type' => 'text/html');
        return "<html><head><title>$title</title></head>".
               "<body><h1>$title</h1>".
               "<p>".join('</p><p>', @paragraphs)."</p>".
               "</body></html>";
    } elsif ($output eq 'atom') {
        header('Content-Type' => 'application/atom+xml');
        my $t = gmtime();
        # ARGH! - can't work out how to get utf8 to behave :-(
        $abstract =~ s/[^[:ascii:]]//g;
        # Escape only the XML metacharacters -- HTML named entities such
        # as &eacute; would be invalid in an Atom document.
        my $xml_title    = encode_entities($data->{title}, '<>&"');
        my $xml_abstract = encode_entities($abstract,      '<>&"');
        return "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n".
               "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n".
               "  <title>$xml_title</title>\n".
               "  <id>$data->{url}</id>\n".
               "  <updated>".$t->datetime."Z</updated>\n".
               "  <author><name>Wikipedia</name></author>\n".
               "  <entry>\n".
               "    <title>$xml_title</title>\n".
               "    <id>$data->{url}</id>\n".
               "    <updated>".$t->datetime."Z</updated>\n".
               "    <author><name>Wikipedia</name></author>\n".
               "    <content type=\"text\">$xml_abstract</content>\n".
               "    <link href=\"$data->{url}\" type=\"text/html\" rel=\"alternate\" title=\"Wikipedia - $xml_title\"/>\n".
               "  </entry>\n".
               "</feed>\n";
    } else {
        header('Content-Type' => 'text/plain');
        return $abstract;
    }
};

start;
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment