Created
April 5, 2010 20:07
-
-
Save tvwerkhoven/356799 to your computer and use it in GitHub Desktop.
Find the word distance for each word, and signal words that are close together
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
#
# Find the word distance for each word, and signal words that are close
# together. Can be useful for reviewing written text (papers, essays,
# whatever) to see if you might need a thesaurus somewhere.
#
# Usage: pass the text file to check as the first command-line argument.
# Every hit is printed to STDOUT as
#   FOUND: <word> (d: <distance>), line <line>: <word> <context ... repeat>
# where <distance> is the number of words between the two occurrences and
# <line> is the input line that contains the first occurrence.
#
# Tim van Werkoven, 20090426 <[email protected]>
# This file is licensed under the Creative Commons Attribution-Share Alike
# license versions 3.0 or higher, see
# http://creativecommons.org/licenses/by-sa/3.0/

use strict;
use warnings;

# If the inter-word distance is less than this, signal a warning
my $LIMIT = 10;
# Is the above count in characters (=0) or words (=1)?
my $MODE = 1;
# What is the minimum length of words to check for?
my $MINLEN = 4;
# What is a comment?
my $COMMCHAR = "%";

# Return the comparison form of a word: lower-cased, with every character
# that is not a letter or digit removed (punctuation, underscores, ...).
sub normalize_word {
    my ($word) = @_;
    my $normalized = lc $word;
    $normalized =~ s/\W|_//g;
    return $normalized;
}

# Look for a repeat of one word among the words that follow it.
#
#   $entry   - array ref [word, line number] of the word to check
#   $history - array ref of the [word, line number] entries following it
#
# Only the first $LIMIT entries of $history are examined, and words whose
# normalised form is shorter than $MINLEN are ignored. The first repeat that
# is found is printed together with the words in between.
# Returns 1 if a repeat was reported, 0 otherwise.
sub report_repeat {
    my ($entry, $history) = @_;
    my ($word, $line) = @{$entry};

    my $word_f = normalize_word($word);
    # Short words (articles, prepositions, ...) are never flagged.
    return 0 if length($word_f) < $MINLEN;

    # Bound the scan by the number of entries, not by their truthiness, so
    # that a token such as "0" cannot cut the search short.
    my $max_idx = $#{$history} < $LIMIT - 1 ? $#{$history} : $LIMIT - 1;
    for my $i (0 .. $max_idx) {
        next if normalize_word($history->[$i][0]) ne $word_f;

        print "FOUND: $word (d: $i), line $line: ";
        print join(' ', $word, map { $_->[0] } @{$history}[0 .. $i]);
        print "\n";
        return 1;
    }
    return 0;
}

# Open the file
die "Usage: $0 FILE\n" unless @ARGV;
my $path = $ARGV[0];
open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";

# Sliding window of upcoming words. Each entry is [word, line number] so that
# a hit can be attributed to the line the word really came from, even though
# it is only checked once enough following words have been read.
my @history;

while (my $text = <$fh>) {
    chomp $text;

    # Check for comments
    next if $text =~ /^\Q$COMMCHAR\E/;

    # Split on any run of whitespace (no empty "words" for repeated or
    # leading blanks) and remember the line number ($.) of every word.
    push @history, map { [$_, $.] } split ' ', $text;

    # SEARCH WITH WORDLENGTH
    if ($MODE == 1) {
        # As long as a full window of $LIMIT following words is available,
        # eject the oldest word and compare it against that window.
        while (@history > $LIMIT) {
            my $entry = shift @history;
            report_repeat($entry, \@history);
        }
    }
    # SEARCH WITH CHARACTER LENGTH
    elsif ($MODE == 0) {
        print "This does not work yet :)";
    }
}
close($fh);

# File reached EOF, now parse the remaining history. The window shrinks as
# words are ejected; the very last word has nothing left to compare against.
while (@history > 1) {
    my $entry = shift @history;
    report_repeat($entry, \@history);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment