tvwerkhoven · April 5, 2010 20:07
diff --git a/worddistance.pl b/worddistance.pl
 #!/usr/bin/perl -w
 #
 # Find the word distance for each word, and signal words that are close 
 # together. Can be useful for reviewing written text (papers, essays, 
 # whatever) to see if you might need a thesaurus somewhere.
 # 
 # Tim van Werkoven, 20090426 <[email protected]>
 # This file is licensed under the Creative Commons Attribution-Share Alike
 # license versions 3.0 or higher, see
 # http://creativecommons.org/licenses/by-sa/3.0/

 # ---- Configuration ----
 # If the inter-word distance is less than this, signal a warning
 my $LIMIT = 10;
 # Is the above count in characters (=0) or words (=1)?
 my $MODE = 1;
 # What is the minimum length of words to check for?
 my $MINLEN = 4;
 # What is a comment? Lines that start with this string are skipped.
 my $COMMCHAR = "%";


 # Open the file named by the first command-line argument.
 # The three-argument form keeps a name such as ">foo" or "cmd |" from
 # changing the open mode, and the explicit check stops the script with a
 # clear message instead of silently reading nothing from an unopened handle.
 die "Usage: $0 FILE\n" unless defined $ARGV[0];
 open(FILE, '<', $ARGV[0])
 	or die "Cannot open '$ARGV[0]' for reading: $!\n";

 # keep track of the line number we're looking at
 my $line=0;
 # words that have been read but not yet checked, oldest first
 my @history;
 # the word currently being checked
 my $word="";

 # one [line, word, distance] record per repeated word that was found
 my @output;
 # number of records stored in @output
 my $hits=0;

 # Outline:
 #   while (file) {
 #     get enough data into history
 #     start comparing word by word
 #     eject words if history becomes too long
 #   }
 #   finally, check whatever is still left in the history

 # Compare one word against the words that follow it in @history.
 #
 # Argument: the word that was just shifted off the front of @history.
 # Effect:   on the first match among the next $LIMIT words, stores a
 #           [line, word, distance] record in @output, increments $hits and
 #           prints a "FOUND:" line with the word and every following word
 #           up to and including the repeat.
 #
 # Kept in a lexical variable so it closes over this script's own state.
 my $check_word = sub {
 	my ($head) = @_;

 	# Format it for easy checking: lower case, no punctuation/underscores
 	my $head_f = lc($head);
 	$head_f =~ s/[\W_]//g;

 	# Words shorter than $MINLEN are never reported
 	return if length($head_f) < $MINLEN;

 	# Bound the scan by the size of the array, not by the truthiness of
 	# the element: a token such as "0" or an empty string must not end
 	# the search early.
 	my $last = $#history < $LIMIT - 1 ? $#history : $LIMIT - 1;
 	for my $i (0 .. $last) {
 		my $cand_f = lc($history[$i]);
 		$cand_f =~ s/[\W_]//g;
 		next if $head_f ne $cand_f;

 		$output[$hits] = [($line, $head, $i)];
 		$hits++;
 		# NOTE: $line is the line read most recently, which can be
 		# later than the line that actually contains $head.
 		print "FOUND: $head (d: $i), line $line: ";
 		print join(" ", $head, @history[0 .. $i]);
 		print "\n";
 		last;
 	}
 	return;
 };

 while (<FILE>) {
 	$line++;
 	chomp $_;

 	# Check for comments before doing any work on the line. \Q..\E makes
 	# the marker match literally even if it holds regex metacharacters.
 	next if /^\Q$COMMCHAR\E/;

 	# Append the words to the history. Splitting on ' ' breaks on any run
 	# of whitespace and drops leading whitespace, so double spaces or tabs
 	# do not put empty "words" into the history.
 	push @history, split(' ', $_);

 	# SEARCH WITH WORDLENGTH
 	if ($MODE == 1) {
 		# Only check a word once $LIMIT followers are available
 		while (scalar(@history) > $LIMIT) {
 			$word = shift(@history);
 			$check_word->($word);
 		}
 	}
 	# SEARCH WITH CHARACTER LENGTH
 	elsif ($MODE == 0) {
 		print "This does not work yet :)";
 	}
 }
 close(FILE);

 # File reached EOF, now parse the remaining history. The very last word
 # has nothing after it to compare with, so stop at one remaining word.
 while (scalar(@history) > 1) {
 	$word = shift(@history);
 	$check_word->($word);
 }
	#!/usr/bin/perl -w
	#
	# Find the word distance for each word, and signal words that are close
	# together. Can be useful for reviewing written text (papers, essays,
	# whatever) to see if you might need a thesaurus somewhere.
	#
	# Tim van Werkoven, 20090426 <[email protected]>
	# This file is licensed under the Creative Commons Attribution-Share Alike
	# license versions 3.0 or higher, see
	# http://creativecommons.org/licenses/by-sa/3.0/

	# ---- Configuration ----
	# If the inter-word distance is less than this, signal a warning
	my $LIMIT = 10;
	# Is the above count in characters (=0) or words (=1)?
	my $MODE = 1;
	# What is the minimum length of words to check for?
	my $MINLEN = 4;
	# What is a comment? Lines that start with this string are skipped.
	my $COMMCHAR = "%";


	# Open the file named by the first command-line argument.
	# The three-argument form keeps a name such as ">foo" or "cmd |" from
	# changing the open mode, and the explicit check stops the script with a
	# clear message instead of silently reading nothing from an unopened
	# handle.
	die "Usage: $0 FILE\n" unless defined $ARGV[0];
	open(FILE, '<', $ARGV[0])
		or die "Cannot open '$ARGV[0]' for reading: $!\n";

	# keep track of the line number we're looking at
	my $line=0;
	# words that have been read but not yet checked, oldest first
	my @history;
	# the word currently being checked
	my $word="";

	# one [line, word, distance] record per repeated word that was found
	my @output;
	# number of records stored in @output
	my $hits=0;

	# Outline:
	#   while (file) {
	#     get enough data into history
	#     start comparing word by word
	#     eject words if history becomes too long
	#   }
	#   finally, check whatever is still left in the history

	# Compare one word against the words that follow it in @history.
	#
	# Argument: the word that was just shifted off the front of @history.
	# Effect:   on the first match among the next $LIMIT words, stores a
	#           [line, word, distance] record in @output, increments $hits
	#           and prints a "FOUND:" line with the word and every following
	#           word up to and including the repeat.
	#
	# Kept in a lexical variable so it closes over this script's own state.
	my $check_word = sub {
		my ($head) = @_;

		# Format it for easy checking: lower case, no punctuation or
		# underscores. The character class replaces the former pattern
		# \W\|_ whose escaped pipe only matched a non-word character
		# followed by the literal text "|_", so nothing was stripped.
		my $head_f = lc($head);
		$head_f =~ s/[\W_]//g;

		# Words shorter than $MINLEN are never reported
		return if length($head_f) < $MINLEN;

		# Bound the scan by the size of the array, not by the truthiness
		# of the element: a token such as "0" or an empty string must not
		# end the search early.
		my $last = $#history < $LIMIT - 1 ? $#history : $LIMIT - 1;
		for my $i (0 .. $last) {
			my $cand_f = lc($history[$i]);
			$cand_f =~ s/[\W_]//g;
			next if $head_f ne $cand_f;

			$output[$hits] = [($line, $head, $i)];
			$hits++;
			# NOTE: $line is the line read most recently, which can be
			# later than the line that actually contains $head.
			print "FOUND: $head (d: $i), line $line: ";
			print join(" ", $head, @history[0 .. $i]);
			print "\n";
			last;
		}
		return;
	};

	while (<FILE>) {
		$line++;
		chomp $_;

		# Check for comments before doing any work on the line. \Q..\E
		# makes the marker match literally even if it holds regex
		# metacharacters.
		next if /^\Q$COMMCHAR\E/;

		# Append the words to the history. Splitting on ' ' breaks on any
		# run of whitespace and drops leading whitespace, so double spaces
		# or tabs do not put empty "words" into the history.
		push @history, split(' ', $_);

		# SEARCH WITH WORDLENGTH
		if ($MODE == 1) {
			# Only check a word once $LIMIT followers are available
			while (scalar(@history) > $LIMIT) {
				$word = shift(@history);
				$check_word->($word);
			}
		}
		# SEARCH WITH CHARACTER LENGTH
		elsif ($MODE == 0) {
			print "This does not work yet :)";
		}
	}
	close(FILE);

	# File reached EOF, now parse the remaining history. The very last word
	# has nothing after it to compare with, so stop at one remaining word.
	while (scalar(@history) > 1) {
		$word = shift(@history);
		$check_word->($word);
	}