Created
January 4, 2016 23:07
-
-
Save ag4ve/cef23a2c48c38950dcc2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
#
# Split a CSV file into two files so its content can be scored blind:
#
#   * a lookup file  - one random hash per row plus columns 1..4
#   * a scoring file - an empty "Score" cell, the same hash, column 0 and
#                      columns 5.. with words taken from columns 1..4
#                      replaced by "<replaced>"
#
# The first input row is treated as a header: it gets the literal labels
# "Hash" / "Score" and its content is copied through untouched.
#
# Usage: script <input.csv> [lookup.csv] [out.csv]
#
use strict;
use warnings;
use Data::Dumper;
use Digest::MD5 qw(md5_base64);
use Text::CSV;

# Build a case-insensitive regex matching one or more of the words found in
# the given cells, each followed by a single space.
#
# Cells are split on ' ' and '@' after newlines/tabs are turned into spaces
# and '<' / '>' are stripped; words of two characters or fewer are ignored.
# Returns undef (empty list in list context) when no usable word remains, so
# the caller can skip the substitution instead of matching every space.
sub build_redaction_regex {
    my @fields = @_;

    my %seen;
    my @words;
    foreach my $field (grep { defined } @fields) {
        my $clean = $field =~ s/[\n\r\t]+/ /gr;    # every run, not just the first
        $clean =~ tr/<>//d;                        # drop all angle brackets
        foreach my $word (split /[ @]/, $clean) {
            next if length($word) <= 2;            # base length
            next if $seen{lc $word}++;             # dedupe (match is /i anyway)
            push @words, $word;
        }
    }
    return unless @words;

    # Longest first (then alphabetical) keeps the alternation order stable
    # between runs; quotemeta stops cell data from being read as regex syntax.
    my $alternation = join '|',
        map  { quotemeta }
        sort { length($b) <=> length($a) or $a cmp $b } @words;

    return qr/(?:(?:$alternation) )+/i;
}

my $csv = Text::CSV->new({binary => 1, always_quote => 1})
    or die "Can not use CSV: " . Text::CSV->error_diag();

my $filein = $ARGV[0];
defined $filein
    or die "Usage: $0 <input.csv> [lookup.csv] [out.csv]\n";
my $lookupout = $ARGV[1] // 'lookup.csv';
my $scoreout  = $ARGV[2] // 'out.csv';

open(my $fh, '<:encoding(utf8)', $filein)
    or die "Can not open [$filein] $!";
my $rows = $csv->getline_all($fh);
# Void-context error_diag reports a parse problem on STDERR; processing then
# continues with whatever rows were read.
$csv->eof or $csv->error_diag();
close $fh;

# Start with empty lists so an input without rows still produces two
# (empty) output files instead of dereferencing undef.
my $lookup = [];
my $score  = [];
my %hash_used;

foreach my $i (0 .. $#{$rows}) {
    my $row     = $rows->[$i];
    my @content = @{$row}[5 .. $#{$row}];

    my ($hash, $scr);
    if ($i == 0) {
        # Header row: label the two new columns, keep the content verbatim.
        $hash = "Hash";
        $scr  = "Score";
    }
    else {
        # Random per-row identifier; retry on the (unlikely) collision so
        # the lookup file never maps one hash to two rows.
        do { $hash = md5_base64(rand()) } while $hash_used{$hash}++;
        $scr = "";

        # Columns 1..4 are the name and email columns per the original
        # author's comment.
        # NOTE(review): a word is only replaced when it is followed by a
        # space, and may match inside a longer word - confirm this is wanted.
        my $words_re = build_redaction_regex(@{$row}[1 .. 4]);
        if (defined $words_re) {
            @content = map { s/$words_re/<replaced> /gr } @content;
        }
    }

    push @{$lookup}, [$hash, @{$row}[1 .. 4]];
    push @{$score},  [$scr, $hash, $row->[0], @content];
}

$csv->eol("\r\n");
foreach my $out ([$lookupout, $lookup], [$scoreout, $score]) {
    my ($path, $records) = @{$out};

    open(my $ofh, '>:encoding(utf8)', $path)
        or die "Can not write [$path] $!";
    foreach my $record (@{$records}) {
        $csv->print($ofh, $record)
            or die "Can not write [$path] " . $csv->error_diag();
    }
    # Buffered write errors only surface at close time.
    close $ofh
        or die "Can not write [$path] $!";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment