January 3, 2013 19:51
diff --git a/gistfile1.txt b/gistfile1.txt
 #!/usr/bin/perl

 $|=1;

 use strict;
 use warnings;

 use LWP::UserAgent; 
 use URI::URL;
 use HTML::Strip;
 use HTML::LinkExtor;
 use Data::Dumper;
 use List::Util qw( shuffle );
 use Text::Sentence qw(split_sentences);

 #
 # Some global vars
 #
 my $MAXURLS   = 100;
 my $PAGEITEMS = 250;
 my @nexturls  = qw( 
                  http://nl.yahoo.com/?p=us
                  http://nl.wikipedia.org/wiki/Hoofdpagina
                  http://nl.wikipedia.org/wiki/Amsterdam 
                 );


 #--------------------------------------------------------------------------------
 #
 # Main program
 #
 #--------------------------------------------------------------------------------

 #
 # Main program fetching loop
 #
 #

 my %frequencies;
 my %seen_urls;
 my %sentences;

 my $urlcounter = 1;
 while ( my $url = shift @nexturls ) {

    warn "[DOC $urlcounter/$MAXURLS] $url\n";
    my $doc = LWP::UserAgent->new->get( $url );

    sentence_ref  ( $doc, \%sentences   );
    word_frequency( $doc, \%frequencies );
    get_urls      ( $doc, \@nexturls    );

    last unless $urlcounter++ < $MAXURLS;
    
 }

 #
 #
 # Spew out results!
 #
 #

 my $lines = 1;

 print <<EOH;
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head>
 <link rel="stylesheet" href="dwords.css" type="text/css"/>
 </head>
 <body>
 <P>
 DIY vocabulary builder. This document contains the $MAXURLS highest frequent used words in dutch 
 from a selection of dutch websites.  You probably came here from 
 <a href="http://www.einarsen.no/">my weblog</a> or my twitter: 
 <a href="http://www.twitter.com/matseinarsen/">@matseinarsen</a>.
 </P>
 <DL>

 EOH

 print "<DL>\n";
 for my $key ( sort { $frequencies{ $b } <=> $frequencies{ $a } } keys %frequencies ) {
    my $uckey  = uc      $key;    
    my $ucfkey = ucfirst $key;

    print <<EOH;
 <div class="box" id="$lines">
    <DT><a class="title" target="translation" href="http://translate.google.com/#nl|en|$key\">$uckey - $ucfkey - $key </a>
 <br/>
 $frequencies{ $key } mentions in $MAXURLS documents - $lines most frequent word - <a href="javascript:document.getElementById('$lines').style.opacity = 0.1; ">hide</a> - <a href="javascript:document.getElementById('$lines').style.opacity = 1; ">show</a></DT>
 EOH

    my %seen;
    print map  { my $h =  $_; 
                    $h =~ s/\b($key)\b/<b>$1<\/b>/gi; 
                 "<DD><a target=\"translation\" href=\"http://translate.google.com/#nl|en|$_\">$h</a></DD>" } 
          grep { defined $_ }
          (sort {length $a <=> length $b }
           grep {! $seen{ $_ }++ }
           @{ $sentences{$key} }
          )[0..5]
          ;
          
    print "</div>\n";
    last if $lines++ > $PAGEITEMS;

 }

 print "</DL>\n";



 #--------------------------------------------------------------------------------
 #
 # Subs
 #
 #--------------------------------------------------------------------------------

 #
 # Get sentence context
 #
 sub sentence_ref {
    my $text      = HTML::Strip->new->parse( shift->decoded_content );
    my $sentences = shift; 
       $text      =~ s/(\n|\[\d+\])//g;        
       
    my @sentences = grep {!( /[\.\!\?](.*)[\.\?\!]/ or /\s\s/ or /[a-z][A-Z]/) } split_sentences( $text );
    
 SENTENCE: 
    for my $sentence ( @sentences ) {
        $sentence =~ s/\s+(\.\,\?\!)/$1/g;
        (($sentence =~ tr/ //) > 20 or ($sentence =~ tr/ //) < 4) and next SENTENCE; 
        
 WORD:   for my $word ( split / /, lc $sentence ) {
            $word =~ s/[^\w]//g;
            length $word or next WORD;
            push @{ $sentences{$word} }, $sentence;        
        }
    }
 }


 #
 # Get the frequency of words in a LWP result object
 #
 sub word_frequency  {
    my $text        = HTML::Strip->new->parse( shift->decoded_content );
    my $frequencies = shift; 
    
    $text           =~ s/[\s\.\,\)\(\?]+/ /g;
    
    $frequencies->{ $_ } ++ for grep { /^[\w]+$/ and !/[0-9]/ } 
                                split / /, lc $text;
 }

 #
 # Push the urls in a LWP object into a list
 #
 sub get_urls  {
    my $document = shift;
    my $links    = shift;

    push @$links, shuffle 
                  grep { /(nl\.|\.nl|Nederlands)/i            and
                         !/(mediawiki|wikimedia|creativecom)/ and 
                         !/\#/                                and
                         !/(rss|css|js|gif|jpg|png)$/i        and 
                         !exists $seen_urls{ $_ }                 } 
                  map  { url($_->[2], $document->base)->abs       }
                  HTML::LinkExtor->new->parse( $document->content )->links;

    @$links = @$links;

    @seen_urls{ @$links } = ();

 }
	#!/usr/bin/perl

	$\|=1;

	use strict;
	use warnings;

	use LWP::UserAgent;
	use URI::URL;
	use HTML::Strip;
	use HTML::LinkExtor;
	use Data::Dumper;
	use List::Util qw( shuffle );
	use Text::Sentence qw(split_sentences);

	#
	# Some global vars
	#
	my $MAXURLS = 100;
	my $PAGEITEMS = 250;
	my @nexturls = qw(
	http://nl.yahoo.com/?p=us
	http://nl.wikipedia.org/wiki/Hoofdpagina
	http://nl.wikipedia.org/wiki/Amsterdam
	);


	#--------------------------------------------------------------------------------
	#
	# Main program
	#
	#--------------------------------------------------------------------------------

	#
	# Main program fetching loop
	#
	#

	my %frequencies;
	my %seen_urls;
	my %sentences;

	my $urlcounter = 1;
	while ( my $url = shift @nexturls ) {

	warn "[DOC $urlcounter/$MAXURLS] $url\n";
	my $doc = LWP::UserAgent->new->get( $url );

	sentence_ref ( $doc, \%sentences );
	word_frequency( $doc, \%frequencies );
	get_urls ( $doc, \@nexturls );

	last unless $urlcounter++ < $MAXURLS;

	}

	#
	#
	# Spew out results!
	#
	#

	my $lines = 1;

	print <<EOH;
	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
	<html xmlns="http://www.w3.org/1999/xhtml">
	<head>
	<link rel="stylesheet" href="dwords.css" type="text/css"/>
	</head>
	<body>
	<P>
	DIY vocabulary builder. This document contains the $MAXURLS highest frequent used words in dutch
	from a selection of dutch websites. You probably came here from
	<a href="http://www.einarsen.no/">my weblog</a> or my twitter:
	<a href="http://www.twitter.com/matseinarsen/">@matseinarsen</a>.
	</P>
	<DL>

	EOH

	print "<DL>\n";
	for my $key ( sort { $frequencies{ $b } <=> $frequencies{ $a } } keys %frequencies ) {
	my $uckey = uc $key;
	my $ucfkey = ucfirst $key;

	print <<EOH;
	<div class="box" id="$lines">
	<DT><a class="title" target="translation" href="http://translate.google.com/#nl\|en\|$key\">$uckey - $ucfkey - $key </a>
	<br/>
	$frequencies{ $key } mentions in $MAXURLS documents - $lines most frequent word - <a href="javascript:document.getElementById('$lines').style.opacity = 0.1; ">hide</a> - <a href="javascript:document.getElementById('$lines').style.opacity = 1; ">show</a></DT>
	EOH

	my %seen;
	print map { my $h = $_;
	$h =~ s/\b($key)\b/<b>$1<\/b>/gi;
	"<DD><a target=\"translation\" href=\"http://translate.google.com/#nl\|en\|$_\">$h</a></DD>" }
	grep { defined $_ }
	(sort {length $a <=> length $b }
	grep {! $seen{ $_ }++ }
	@{ $sentences{$key} }
	)[0..5]
	;

	print "</div>\n";
	last if $lines++ > $PAGEITEMS;

	}

	print "</DL>\n";



	#--------------------------------------------------------------------------------
	#
	# Subs
	#
	#--------------------------------------------------------------------------------

	#
	# Get sentence context
	#
	sub sentence_ref {
	my $text = HTML::Strip->new->parse( shift->decoded_content );
	my $sentences = shift;
	$text =~ s/(\n\|\[\d+\])//g;

	my @sentences = grep {!( /[\.\!\?](.*)[\.\?\!]/ or /\s\s/ or /[a-z][A-Z]/) } split_sentences( $text );

	SENTENCE:
	for my $sentence ( @sentences ) {
	$sentence =~ s/\s+(\.\,\?\!)/$1/g;
	(($sentence =~ tr/ //) > 20 or ($sentence =~ tr/ //) < 4) and next SENTENCE;

	WORD: for my $word ( split / /, lc $sentence ) {
	$word =~ s/[^\w]//g;
	length $word or next WORD;
	push @{ $sentences{$word} }, $sentence;
	}
	}
	}


	#
	# Get the frequency of words in a LWP result object
	#
	sub word_frequency {
	my $text = HTML::Strip->new->parse( shift->decoded_content );
	my $frequencies = shift;

	$text =~ s/[\s\.\,\)\(\?]+/ /g;

	$frequencies->{ $_ } ++ for grep { /^[\w]+$/ and !/[0-9]/ }
	split / /, lc $text;
	}

	#
	# Push the urls in a LWP object into a list
	#
	sub get_urls {
	my $document = shift;
	my $links = shift;

	push @$links, shuffle
	grep { /(nl\.\|\.nl\|Nederlands)/i and
	!/(mediawiki\|wikimedia\|creativecom)/ and
	!/\#/ and
	!/(rss\|css\|js\|gif\|jpg\|png)$/i and
	!exists $seen_urls{ $_ } }
	map { url($_->[2], $document->base)->abs }
	HTML::LinkExtor->new->parse( $document->content )->links;

	@$links = @$links;

	@seen_urls{ @$links } = ();

	}