Skip to content

Instantly share code, notes, and snippets.

@kimmel
Created September 9, 2012 23:55
Show Gist options
  • Save kimmel/3688004 to your computer and use it in GitHub Desktop.
Save kimmel/3688004 to your computer and use it in GitHub Desktop.
text normalization and token splitting
#!/usr/bin/perl
use v5.14;
use warnings;
use utf8::all;
use List::Util qw( reduce );
use List::MoreUtils qw( uniq any );
use Path::Class::Rule;
use File::Slurp qw( read_file );
...
# Read the whole input file, normalise it, and break it into unique word
# tokens before looking each token up in $pattern_list.
# NOTE(review): $fname and $pattern_list are defined outside this excerpt;
# $pattern_list is presumably a hash of hashes keyed first by a lowercased
# initial character and then by the exact word -- confirm against its builder.
my $content = read_file($fname);

# Replace every ASCII punctuation character with a space so that punctuation
# acts as a token separator (tr/// repeats the final replacement character).
$content =~ tr/!"#$%&'()*+,\-.\/:;<=>?@\[\\]^_`{|}~/ /;

# Split on runs of whitespace and keep only tokens longer than one character.
# That single filter also discards the empty leading field split() yields
# when the text starts with whitespace, so no separate empty-string pass is
# needed.
my @parts = grep { length($_) > 1 } split /\s+/, $content;
my @words = uniq @parts;

# Both are undef when @words is empty. On equal lengths the later word wins.
my $longest_string  = reduce { length($a) > length($b) ? $a : $b } @words;
my $shortest_string = reduce { length($a) < length($b) ? $a : $b } @words;

foreach my $word (@words) {

    # Patterns are bucketed by the lowercased first character of the word.
    my $key = lc substr $word, 0, 1;

    # Direct hash lookups instead of scanning every key of the bucket.
    # Checking the bucket first keeps the lookup from autovivifying an empty
    # hash in $pattern_list for first characters that have no patterns.
    if (   exists $pattern_list->{$key}
        && exists $pattern_list->{$key}{$word} )
    {
        #do collection here
    }
}
...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment