Created
October 10, 2010 16:53
-
-
Save toritori0318/619373 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
use 5.10.0;
use strict;
use warnings;

use Data::Dumper;
#use HTML::ContentExtractor;
use HTML::Parser;
use Lingua::EN::Tagger;
use WWW::Mechanize;
# Crawl configuration and shared state used by getcontent() and summary().

# Start page of the crawl.
my $baseurl = 'http://search.cpan.org/~shay/perl/pod/perl.pod';

# Shared user agent; the start page is loaded immediately so the first
# getcontent() call has a current page to work on.
my $mech = WWW::Mechanize->new();
$mech->get($baseurl);

my $linknum     = 2;    # initial value of the per-branch counter given to getcontent()
my $max_linknum = 2;    # file-wide counter, decremented by getcontent() on every page
my $interval    = 3;    # seconds getcontent() sleeps before handling each page

# URLs that have already been followed (URL => 1).
# Initialised with an empty list: the former `= {}` assigned a hash reference
# to the hash, which stored a single bogus "HASH(0x...)" key.
my %followed_link = ();

# Running totals collected across all visited pages (word/phrase => count).
my %all_word_list = ();
# getcontent($mech, $linknum)
#
# Processes the page currently loaded in $mech: extracts its text, adds the
# entries returned by Lingua::EN::Tagger's get_words() to %all_word_list,
# then follows every matching link on the page and recurses into it.
#
#   $mech    - WWW::Mechanize object whose current page is processed
#   $linknum - per-branch counter; decremented once per call and passed on
#              to the recursive calls
#
# Side effects: sleeps $interval seconds, decrements the file-level
# $max_linknum, updates %all_word_list and %followed_link, and prints each
# URL it fetches. Returns nothing.
sub getcontent {
    my ($mech, $linknum) = @_;

    sleep $interval;
    return unless $mech->success() && $mech->is_html();

    # Gather the decoded text of the page.
    my $text   = '';
    my $parser = HTML::Parser->new(
        api_version => 3,
        # `//` instead of `||` so a text chunk consisting of "0" is kept.
        text_h      => [ sub { $text .= shift // '' }, 'dtext' ],
    );
    $parser->parse($mech->content());
    # Signal end of document so text still buffered by the parser is
    # delivered to the handler before the text is used.
    $parser->eof();

    # Tally this page's entries into the file-wide totals.
    my $tagger    = Lingua::EN::Tagger->new;
    my %word_list = $tagger->get_words($text);
    for my $word (keys %word_list) {
        $all_word_list{$word} += $word_list{$word};
    }

    $linknum--;
    $max_linknum--;
    # NOTE(review): with `||` the crawl only stops once BOTH counters are
    # exhausted; if either one alone is meant to be a hard limit this should
    # probably be `&&` -- confirm the intended semantics before changing.
    return unless ($linknum > 0 || $max_linknum > 0);

    my @links = $mech->find_all_links(
        text_regex    => qr/^perl/,
        url_abs_regex => qr#\.pod#,
    );

    for my $link (@links) {
        my $url = $link->url_abs();

        # Skip links that were already followed. This used to `return`,
        # which abandoned every remaining link on the page as soon as a
        # single duplicate was encountered.
        next if $followed_link{$url};
        $followed_link{$url} = 1;

        say "-- GET " . $url;

        # Test the eval's own result rather than $@, which can be clobbered
        # between the failure and the check.
        my $fetched = eval { $mech->get($url); 1 };
        if (!$fetched) {
            say "url->get error!";
            next;
        }

        getcontent($mech, $linknum);
        $mech->back();    # return to this page before following the next link
    }

    return;
}
# summary([$limit])
#
# Prints the entries of %all_word_list in descending order of count, one
# "label : count" line per entry, after stripping punctuation and surrounding
# whitespace from the label.
#
#   $limit - optional maximum number of lines to print (default 100)
#
# Entries whose label is empty after cleaning are skipped and do not count
# towards the limit. Returns nothing.
sub summary {
    my ($limit) = @_;
    $limit //= 100;

    # `\@` is escaped on purpose: an unescaped `@_` inside the pattern
    # interpolates the argument array, so the class depended on the caller's
    # arguments and never actually excluded `@` or `_`.
    my $reg_exclude = qr/[\@_\$\(\)\{\}\[\]\!\.%]/;

    # Highest count first; ties are broken alphabetically so the output is
    # deterministic.
    my @ranked = sort {
        $all_word_list{$b} <=> $all_word_list{$a} or $a cmp $b
    } keys %all_word_list;

    my $printed = 0;
    for my $key (@ranked) {
        last if $printed >= $limit;

        # Read the count with the ORIGINAL key; looking it up after cleaning
        # yields undef (or another entry's count) whenever cleaning changes
        # the key.
        my $count = $all_word_list{$key};

        (my $label = $key) =~ s/$reg_exclude//g;
        $label =~ s/^\s+//;    # strip all leading whitespace, not just one char
        $label =~ s/\s+$//;    # likewise for trailing whitespace

        # length() rather than truthiness so a label of "0" is still shown.
        next unless length $label;

        say $label . " : " . $count;
        $printed++;
    }

    return;
}
# Entry point: crawl starting from the page already loaded in $mech, print
# the collected totals, then terminate the script.
getcontent($mech, $linknum);
summary();
exit();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment