-
-
Save fawce/4625617 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
################ | |
use strict; | |
use warnings; | |
use CGI ':standard'; | |
use CGI::Carp 'fatalsToBrowser'; | |
use Data::Dumper; | |
# Alas, the server I'm using this on has only perl 5.8. If you have | |
# the newer Date::Manip::Date, you can update the script to use it by | |
# searching for "DMD" and making the appropriate edits. | |
# DMD | |
# use Date::Manip::Date; | |
use Date::Manip; | |
use HTML::TreeBuilder; | |
use LWP::UserAgent; | |
use XML::RSS; | |
my $base = 'https://news.ycombinator.com'; | |
my $title_limit = 80; | |
# Set this to undef if you don't want to cache dates. Caching dates | |
# is necessary because the dates we get back from HN are vague, e.g., | |
# "4 hours ago". | |
my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache'; | |
my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month | |
my $now = time(); | |
my %date_cache; | |
&read_date_cache(); | |
my $id = param('id'); | |
die "No post id specified\n" if (! $id); | |
die "Bad id '$id'\n" if ($id !~ /^\d+$/); | |
my $ua = LWP::UserAgent->new(); | |
my $url = "$base/item?id=$id"; | |
my $response = $ua->get($url); | |
die "Failed to fetch $url\n" if (! $response->is_success); | |
my $tb = HTML::TreeBuilder->new(); | |
$tb->parse_content($response->decoded_content()); | |
my @comments; | |
my $title = $tb->find_by_tag_name('title')->as_text(); | |
die "No title\n" if (! $title); | |
$title = "Comment feed for: $title"; | |
my(@elements) = $tb->descendants(); | |
foreach my $element (@elements) { | |
my $class = $element->attr('class'); | |
if ($class and $class eq 'comhead') { | |
while (@comments and ! $comments[-1]{'comment'}) { | |
pop @comments; | |
} | |
push(@comments, {'comhead' => $element}); | |
} | |
if ($class and $class eq 'comment') { | |
next if (! @comments); | |
$comments[-1]{'comment'} = $element; | |
} | |
} | |
foreach my $comment (@comments) { | |
@elements = $comment->{'comhead'}->content_list(); | |
# Structure of comhead is link to user, post time, link to comment | |
die "Bad comhead\n" if (@elements != 3); | |
my($author_tag) = $elements[0]; | |
my($age) = $elements[1]; | |
my($comment_link) = $elements[2]; | |
my($author_link) = $author_tag->attr('href'); | |
die "Bad author link\n" if (! $author_link); | |
$author_link = "$base/$author_link"; | |
@elements = $author_tag->content_list(); | |
die "Bad author tag\n" if (@elements != 1); | |
my($author_name) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0]; | |
$comment_link = $comment_link->attr('href'); | |
die "Bad comment link\n" if (! $comment_link); | |
$comment_link = "$base/$comment_link"; | |
die "Bad comment age\n" if (ref $age); | |
die "Bad comment age\n" if ($age !~ s/\s*\|\s*$//); | |
# DMD | |
# my $date = Date::Manip::Date->new($age); | |
# die "Bad comment age\n" if ($date->err); | |
# $date->convert('UTC'); | |
# $date = $date->printf('%O+00:00'); | |
my $date = ParseDate($age); | |
die "Bad comment age\n" if (! $date); | |
$date = Date_ConvTZ($date, '', 'UTC'); | |
$date = UnixDate($date, '%O+00:00'); | |
$date = &get_date_cache($comment_link, $date); | |
@elements = $comment->{'comment'}->content_list(); | |
my $content = ''; | |
for (@elements) { | |
if (ref) { | |
$content .= $_->as_HTML(); | |
} | |
else { | |
$content .= $_; | |
} | |
} | |
my($title) = $comment->{'comment'}->as_text(); | |
if (length($title) > $title_limit) { | |
$title = substr($title, 0, $title_limit-3) . '...'; | |
} | |
$comment->{'author_link'} = $author_link; | |
$comment->{'author_name'} = $author_name; | |
$comment->{'age'} = $age; | |
$comment->{'date'} = $date; | |
$comment->{'comment_link'} = $comment_link; | |
$comment->{'content'} = $content; | |
$comment->{'title'} = $title; | |
} | |
&write_date_cache(); | |
@comments = sort { $b->{'date'} cmp $a->{'date'} } @comments; | |
my $rss = XML::RSS->new(); | |
$rss->channel(title => $title, link => $url); | |
for (@comments) { | |
my $creator_blob = "<a href='$_->{author_link}'>$_->{author_name}</a>"; | |
$rss->add_item | |
( | |
title => $_->{'title'}, | |
link => $_->{'comment_link'}, | |
description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>", | |
dc => { | |
date => $_->{'date'}, | |
creator => $creator_blob, | |
}); | |
} | |
print header('application/rss+xml'); | |
print $rss->as_string; | |
$tb->delete(); | |
sub read_date_cache { | |
return if (! $date_cache_file); | |
return if (! -f $date_cache_file); | |
open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n"; | |
while (<CACHE>) { | |
chomp; | |
my($url, $date, $last_used) = split; | |
$date_cache{$url} = { | |
date => $date, | |
last_used => $last_used | |
}; | |
} | |
close(CACHE) or die "Error reading date cache\n"; | |
} | |
sub get_date_cache { | |
my($url, $date) = @_; | |
$date_cache{$url}{'last_used'} = $now; | |
if ($date_cache{$url}{'date'}) { | |
return $date_cache{$url}{'date'}; | |
} | |
else { | |
return $date_cache{$url}{'date'} = $date; | |
} | |
} | |
sub write_date_cache { | |
return if (! $date_cache_file); | |
my $new = "$date_cache_file.new"; | |
open(CACHE, '>', $new) or die "Error writing date cache\n"; | |
foreach my $url (keys %date_cache) { | |
next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout); | |
print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n") | |
or die "Error writing date cache\n"; | |
} | |
close(CACHE) or die "Error writing date cache\n"; | |
rename($new, $date_cache_file) or die "Error writing date cache\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment