Created
October 8, 2011 05:36
-
-
Save willwillis/1271913 to your computer and use it in GitHub Desktop.
before rss
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#
# Link digest: fetches each page listed in @newspages, extracts every <a>
# link (skipping mailto: and javascript: targets) and writes one HTML file
# with a table column of links per source page.
#
# Output: news.file.html in the current working directory.
# Pages that cannot be fetched are reported on STDERR and left as an empty
# column; failure to write the output file is fatal.
use strict;
use warnings;
use LWP::Simple;
use HTML::TokeParser;
use HTML::Entities;

# @newspages are pages I don't really wanna read, but I'd rather just have the links
my @newspages = qw(
    http://www.surfstation.lu/00_news.asp
    http://www.cubadust.com/news.htm
    http://www.caffemocha.com/cgi-bin/index.htm
    http://www.halfproject.com/news.php
    http://www.reinvent.co.nz/v2/skins/news2002.asp
);

my $outfile = 'news.file.html';

# Only the markup-significant characters are escaped, so any non-ASCII bytes
# in the fetched pages pass through to the output untouched.
my $unsafe_chars = q{<>&"'};

# Static page header; single-quoted heredoc because nothing is interpolated.
my $body = <<'END_HTML';
<html>
<head>
<title>Silent11 helps out</title>
<style>
body {margin:0; background-color:#e25805;font-family: arial; color:black;font-size:10px;}
a {font-family: arial; color: yellow; text-decoration: none; font-size: 10px;}
a:hover {text-decoration: underline overline; background-color:orange}
td {font-size:10px; color: darkred;}
</style>
</head>
<body>
<table>
<tr>
END_HTML

for my $page (@newspages) {
    # Column heading: the label after the first dot in the URL
    # (e.g. "surfstation" for http://www.surfstation.lu/...).
    # Fall back to the whole URL when it contains no dot.
    my (undef, $short) = split /\./, $page;
    $short = $page unless defined $short;
    $body .= '<td valign=top>' . encode_entities($short, $unsafe_chars) . '<br>';

    # LWP::Simple::get returns undef on any failure; keep the (empty) column
    # so the table layout stays stable, and move on to the next page.
    my $html = get($page);
    if (!defined $html) {
        warn "Could not fetch $page; skipping\n";
        $body .= "</td>";
        next;
    }

    my $parser = HTML::TokeParser->new(\$html);
    while (my $token = $parser->get_tag('a')) {
        my $url = $token->[1]{href};
        $url = '-' unless defined $url && length $url;
        my $text = $parser->get_trimmed_text('/a');

        # Don't grab javascript or mailto links. Requiring the colon keeps
        # ordinary paths such as "mailtopage.html"; /i catches "MAILTO:".
        next if $url =~ /\A\s*(?:mailto|javascript):/i;

        # Both values come from a remote page and have already been
        # entity-decoded by the parser, so re-escape them before embedding.
        # NOTE: relative hrefs are emitted as-is and are not resolved
        # against the source page's URL.
        my $safe_url  = encode_entities($url,  $unsafe_chars);
        my $safe_text = encode_entities($text, $unsafe_chars);
        $body .= "<a href=\"$safe_url\" target=\"new\">$safe_text</a><br>\n";
    }
    $body .= "</td>";
}

$body .= "</tr></table>\n</body>\n</html>\n";

# Send to an html file. Buffered write errors only surface at close, so
# check that as well.
open my $out, '>', $outfile
    or die "Cannot open $outfile for writing: $!\n";
print {$out} $body
    or die "Cannot write to $outfile: $!\n";
close $out
    or die "Cannot close $outfile: $!\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment