Created June 1, 2011 at 07:24
-
-
Save njh/1001922 to your computer and use it in GitHub Desktop.
Perl script to fetch abstracts from the Wikipedia API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env plackup | |
| use LWP::UserAgent; | |
| use XML::XPath; | |
| use URI::Escape; | |
| use Dancer; | |
| use HTML::TreeBuilder; | |
| use HTML::Entities; | |
| use Time::Piece; | |
| use strict; | |
| use warnings; | |
| use utf8; | |
# Soft limit: stop collecting further paragraphs once the abstract has
# grown past this many characters (the paragraph that crosses the limit
# is still kept in full).
my $ABSTRACT_MAX_LENGTH = 1000;
# Hard limit: abstracts longer than this are truncated and given a
# trailing ellipsis. Deliberately larger than the soft limit so a long
# final paragraph is usually kept intact.
my $ABSTRACT_TRUNCATE_LENGTH = 2000;
# Shared HTTP client for all Wikipedia API requests; short timeout so a
# slow upstream doesn't hang the request handler.
my $ua = LWP::UserAgent->new;
$ua->timeout(5);
# get_wikipedia_page($lang, $title)
#
# Fetch the rendered HTML of a Wikipedia article via the MediaWiki
# action=parse API. Returns a hashref with keys url/title/revid/text on
# success; returns nothing (empty list / undef) on HTTP failure or when
# the API reports an error (e.g. a missing page).
sub get_wikipedia_page {
    my ($lang, $title) = @_;

    # Canonical page key: percent-encode the title, then use underscores
    # for spaces as Wikipedia URLs do.
    my $key = uri_escape($title);
    $key =~ s/%20/_/g;

    # Sanitize the language code so it is safe to interpolate into a hostname.
    $lang =~ s/\W//g;

    my $url = "http://$lang.wikipedia.org/w/api.php?action=parse&prop=text|displaytitle|revid&redirects=1&format=xml&page=$key";
    warn "$url\n";

    my $response = $ua->get($url);
    # Bare "return" (not "return undef") so list-context callers get an
    # empty list rather than a one-element (undef) list.
    return unless $response->is_success();

    my $xp = XML::XPath->new( xml => $response->content() );

    # The API signals failures (missing page, bad title) with an <error>
    # element and HTTP 200, so check for it explicitly.
    return if $xp->findnodes('/api/error')->size;

    return {
        'url'   => "http://$lang.wikipedia.org/wiki/$key",
        'title' => $xp->getNodeText('/api/parse/@displaytitle'),
        'revid' => $xp->getNodeText('/api/parse/@revid'),
        'text'  => $xp->getNodeText('/api/parse/text'),
    };
}
# strip_pronunciation($text)
#
# Return a copy of $text with parenthesised pronunciation guides removed,
# e.g. "(IPA: [...]) ", "(pronounced /.../) ", and the form where the
# pronunciation shares parentheses with a birthdate:
# "(pronounced /.../; born December 7, 1979)".
sub strip_pronunciation {
    my ($copy) = @_;

    # Applied in order; each pattern deletes the guide and its trailing
    # space/semicolon separator.
    for my $guide (
        qr|\(.*?pronunciation:.*?\) |,
        qr|\(IPA: ["\[/].*?["\]/]\) |,
        qr|\(pronounced ["\[/].*?["\]/]\) |,
        qr|pronounced ["\[/].*?["\]/]\; |,
    ) {
        $copy =~ s/$guide//g;
    }

    return $copy;
}
# extract_abstract($html)
#
# Build a plain-text abstract from the article body HTML returned by the
# parse API: concatenate the text of top-level <p> elements (with
# citation markers and pronunciation guides stripped) until either the
# soft length limit is passed or the table of contents is reached, then
# trim and hard-truncate the result.
sub extract_abstract {
    my ($html) = @_;
    my $abstract = '';
    my $tree = HTML::TreeBuilder->new();
    # Wrap the fragment so the parser sees a well-formed document.
    $tree->parse_content("<html><body>".$html."</body></html>");
    my $body = $tree->find_by_tag_name('body');
    foreach my $node ($body->content_list) {
        if ($node->tag eq 'p') {
            # Remove references (the superscript [1]-style citation links,
            # which carry class="reference" in Wikipedia markup).
            # FIXME: better way to do this?
            foreach my $ref ($node->look_down('class', 'reference')) {
                $ref->delete;
            }
            my $paragraph = strip_pronunciation($node->as_trimmed_text());
            $abstract .= $paragraph . "\n\n";
            # Stop if we have enough text; the paragraph that crosses the
            # soft limit is kept in full.
            last if (length($abstract) > $ABSTRACT_MAX_LENGTH);
        } else {
            # Stop when we get to the table of contents -- everything
            # before the TOC is the article's lead section.
            last if ($node->look_down("id", "toc"));
        }
    }
    # Remove trailing whitespace (the final paragraph separator).
    $abstract =~ s/\s+$//;
    # Truncate if the abstract is too long
    if (length($abstract) > $ABSTRACT_TRUNCATE_LENGTH) {
        # FIXME: multibyte characters? (substr counts chars only if the
        # string carries the utf8 flag -- NOTE(review): verify upstream
        # decoding, otherwise this can split a multibyte sequence)
        $abstract = substr($abstract, 0, $ABSTRACT_TRUNCATE_LENGTH-3);
        # Remove trailing partial word and replace with an ellipsis
        $abstract =~ s/[^\w\s]?\s*\w*$/.../;
    }
    return $abstract;
}
set charset => 'utf8';

# GET /wikipedia/abstracts/:lang/:title
#
# Render the article's abstract. The optional "output" query parameter
# selects the representation: "html", "atom", or (default) plain text.
get '/wikipedia/abstracts/:lang/:title' => sub {
    my $data = get_wikipedia_page(params->{'lang'}, params->{'title'});

    # The fetch can fail (network error, missing page); answer 404
    # instead of crashing on an undefined hashref.
    unless (defined $data && defined $data->{text}) {
        status 404;
        header('Content-Type' => 'text/plain');
        return "Not found\n";
    }

    my $abstract = extract_abstract($data->{text});

    # Default to plain text; also avoids an uninitialized-value warning
    # when the "output" parameter is absent.
    my $output = params->{'output'} || '';

    if ($output eq 'html') {
        my @paragraphs = split(/[\r\n]+/, $abstract);
        @paragraphs = map(encode_entities($_), @paragraphs);
        my $title = encode_entities($data->{title});
        header('Content-Type' => 'text/html');
        return "<html><head><title>$title</title></head>".
               "<body><h1>$title</h1>".
               "<p>".join('</p><p>', @paragraphs)."</p>".
               "</body></html>";
    } elsif ($output eq 'atom') {
        header('Content-Type' => 'application/atom+xml');
        my $t = gmtime();
        # ARGH! - can't work out how to get utf8 to behave :-(
        $abstract =~ s/[^[:ascii:]]//g;
        # Escape only the XML metacharacters -- HTML named entities such
        # as &eacute; would be invalid in an Atom document.
        my $xml_title    = encode_entities($data->{title}, '<>&"');
        my $xml_abstract = encode_entities($abstract,      '<>&"');
        return "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n".
               "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n".
               "  <title>$xml_title</title>\n".
               "  <id>$data->{url}</id>\n".
               "  <updated>".$t->datetime."Z</updated>\n".
               "  <author><name>Wikipedia</name></author>\n".
               "  <entry>\n".
               "    <title>$xml_title</title>\n".
               "    <id>$data->{url}</id>\n".
               "    <updated>".$t->datetime."Z</updated>\n".
               "    <author><name>Wikipedia</name></author>\n".
               "    <content type=\"text\">$xml_abstract</content>\n".
               "    <link href=\"$data->{url}\" type=\"text/html\" rel=\"alternate\" title=\"Wikipedia - $xml_title\"/>\n".
               "  </entry>\n".
               "</feed>\n";
    } else {
        header('Content-Type' => 'text/plain');
        return $abstract;
    }
};

start;
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment