Created
December 7, 2012 08:47
-
-
Save Ovid/4231878 to your computer and use it in GitHub Desktop.
Find Duplicate Code in Perl (a hack)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use 5.12.0; | |
use autodie; | |
use Carp; | |
use utf8::all; | |
use File::Spec::Functions qw(catfile catdir); | |
use File::Find::Rule; | |
use Getopt::Long; | |
use Capture::Tiny qw(capture); | |
use File::Slurp; | |
use Term::ANSIColor ':constants'; | |
local $Term::ANSIColor::AUTORESET = 1; | |
GetOptions( | |
'window=i' => \( my $window = 5 ), | |
'dir=s' => \( my $dir = 'lib' ), | |
'ignore=s@' => \my @ignore, | |
) or die "Bad options"; | |
my $IGNORE = join '|' => @ignore; | |
my $CACHE_DIR = catdir( $ENV{HOME}, '.find_cnp' ); | |
if ( -d $CACHE_DIR ) { | |
my @cached = File::Find::Rule->file->in($CACHE_DIR); | |
unlink $_ for @cached; | |
} | |
else { | |
mkdir $CACHE_DIR; | |
} | |
unless ( -d $dir ) { | |
croak("Cannot find dir $dir"); | |
} | |
my @files = File::Find::Rule->file->name('*.pm')->in($dir); | |
my $num_files = @files; | |
for my $i ( 0 .. $#files - 1 ) { | |
my $next = $i + 1; | |
print WHITE "Processing $next out of $num_files files "; | |
for my $j ( $next .. $#files ) { | |
print '.'; | |
my ( $first, $second ) = @files[ $i, $j ]; | |
search_for_dups( $first, $second, $window ); | |
} | |
print "\n"; | |
} | |
sub search_for_dups { | |
my ( $first, $second, $window ) = @_; | |
my $code1 = get_text($first); | |
my $code2 = get_text($second); | |
my %in_second = map { $_->{key} => 1 } @$code2; | |
my $matches_found = 0; | |
my $last_found = 0; | |
foreach my $i ( 0 .. $#$code1 ) { | |
if ( $in_second{ $code1->[$i]{key} } ) { | |
if ( $i == $last_found + 1 ) { | |
$matches_found++; | |
} | |
$last_found = $i; | |
} | |
} | |
if ( $matches_found < $window ) { | |
return; | |
} | |
# brute force is bad! | |
LINE: foreach ( my $i = 0; $i < @$code1 - $window; $i++ ) { | |
next LINE unless $in_second{ $code1->[$i]{key} }; | |
my @code1 = @{$code1}[ $i .. $#$code1 ]; | |
foreach my $j ( 0 .. $#$code2 - $window ) { | |
my @code2 = @{$code2}[ $j .. $#$code2 ]; | |
my $matches = 0; | |
my $longest = 0; | |
WINDOW: foreach my $k ( 0 .. $#code1 ) { | |
if ( $code1[$k]{key} eq $code2[$k]{key} ) { | |
$matches++; | |
my $length1 = length( $code1[$k]{code} ); | |
if ( $length1 > $longest ) { | |
$longest = $length1; | |
} | |
my $length2 = length( $code2[$k]{code} ); | |
if ( $length1 > $longest ) { | |
$longest = $length1; | |
} | |
} | |
else { | |
last WINDOW; | |
} | |
} | |
if ( $matches >= $window ) { | |
my $line1 = 0 + $code1[0]{line}; | |
my $line2 = 0 + $code2[0]{line}; | |
my $code_to_print = ''; | |
for ( 0 .. $matches - 1 ) { | |
my ( $line1, $line2 ) = | |
map { chomp; $_ } ( $code1[$_]{code}, $code2[$_]{code} ); | |
$code_to_print | |
.= $line1 . ( ' ' x ( $longest - length($line1) ) ); | |
$code_to_print .= " | $line2\n"; | |
} | |
$i += $window; | |
if ( $IGNORE and $code_to_print =~ /$IGNORE/ ) { | |
next LINE; | |
} | |
say BOLD RED | |
"\nPossible match ($first near line $line1) ($second near line $line2)\n", | |
"Begining at:"; | |
print WHITE $code_to_print; | |
} | |
} | |
} | |
} | |
sub get_text { | |
my $file = shift; | |
my $filename = $file; | |
$filename =~ s/\W/_/g; | |
$filename = catfile( $CACHE_DIR, $filename ); | |
my @contents; | |
if ( -f $filename ) { | |
@contents = split /(\n)/ => read_file($filename); | |
} | |
else { | |
( undef, undef, @contents ) = capture {qx($^X -MO=Deparse,-l $file)}; | |
write_file( $filename, @contents ); | |
} | |
return add_line_numbers( \@contents ); | |
} | |
sub add_line_numbers { | |
my $contents = prefilter(shift); | |
my @contents; | |
my $line = 1; | |
foreach (@$contents) { | |
next if /^\s*\$\^H{/; # skip those damned strict lines | |
if (/^#line\s+([0-9]+)/) { | |
$line = $1; | |
next; | |
} | |
push @contents => { | |
line => $line, | |
key => munge_line($_), | |
code => $_, | |
}; | |
$line++; | |
} | |
return postfilter( \@contents ); | |
} | |
sub postfilter { | |
my $contents = shift; | |
my @contents; | |
INDEX: for ( my $i = 0; $i < @$contents; $i++ ) { | |
if ( $contents->[$i]{code} =~ /^(\s*)BEGIN\s*{/ ) { # BEGIN { | |
my $padding =~ $1; | |
if ( $contents->[ $i + 1 ]{code} =~ /^$padding}/ ) { | |
$DB::single = 1; | |
$i++; | |
next INDEX; | |
} | |
} | |
push @contents => $contents->[$i]; | |
} | |
#my $lines = join '' => map { $_->{code}} @contents; | |
#say BLUE $lines; | |
#<STDIN>; | |
return \@contents; | |
} | |
sub prefilter { | |
my $contents = shift; | |
my @contents; | |
my %skip = ( | |
sub_begin => 0, | |
); | |
my $skip = 0; | |
LINE: for ( my $i = 0; $i < @$contents; $i++ ) { | |
local $_ = $contents->[$i]; | |
next if /^\s*(?:use|require)\b/; # use/require | |
next if /^\s*$/; # blank lines | |
next if /^#(?!line\s+[0-9]+)/; # comments which aren't line directives | |
# Modules which import things create code like this: | |
# | |
# sub BEGIN { | |
# require strict; | |
# do { | |
# 'strict'->import('refs') | |
# }; | |
# } | |
# | |
# $skip{sub_begin} filters this out | |
if (/^sub BEGIN {/) { | |
$skip{sub_begin} = 1; | |
$skip++; | |
} | |
elsif ( $skip{sub_begin} and /^}/ ) { | |
$skip{sub_begin} = 0; | |
$skip--; | |
next; | |
} | |
# Modules which use strict often have blocks like this: | |
# | |
# BEGIN { | |
# $^H{'indirect'} = q(31664984); | |
# $^H{'feature_unicode'} = q(1); | |
# $^H{'autodie'} = q(Fatal :lexical :all); | |
# $^H{'feature_say'} = q(1); | |
# $^H{'guard Fatal'} = q(ARRAY(0x2300d20)); | |
# $^H{'feature_state'} = q(1); | |
# $^H{'autovivification'} = q(52); | |
# $^H{'utf8::all'} = q(1); | |
# $^H{'feature_switch'} = q(1); | |
# } | |
# | |
# $skip{strict_begin} filters this out | |
if (/^(\s*)BEGIN {/) { | |
my $padding = $1; | |
my $next = $contents->[ $i + 1 ]; | |
if ( $next =~ /^\s+\$\^H{/ ) { | |
my $index = $i + 2; | |
STRICT_SEARCH: while ( $index++ ) { | |
next if $contents->[$index] =~ /^\s+\$\^H{/; # $^H{'...'} | |
last STRICT_SEARCH | |
if not /^$padding}/ | |
; # the block did not terminate as we expect | |
$i = $index; | |
next LINE; | |
} | |
} | |
} | |
push @contents => $_ unless $skip; | |
} | |
return \@contents; | |
} | |
sub munge_line { | |
local $_ = shift; | |
chomp; | |
s/\s//g; | |
return $_; | |
} | |
__END__ | |
=head1 NAME | |
find_duplicate_code | |
=head1 SYNOPSIS | |
find_duplicate_code --window 7 --dir lib/ --ignore 'catch {' | |
=head1 DESCRIPTION | |
This program searches for cut-n-paste code. It does not (at the present time) | |
try to account for cases where people may have changed variable names, but it | |
applies a heuristic process for finding duplicate code and works moderately | |
well. It's also slow. | |
For every file in the target directory, C<--dir> (defaults to C<lib/>), it | |
runs the code through L<B::Deparse>, caches it, and then walks through that | |
code comparing C<--window> number of lines (default 5). If that many lines | |
matches, as determined by stripping all whitespace and doing an C<eq>, then we | |
have duplicated code. | |
Blank lines are skipped, as are C<use> and C<require>. This may be | |
configurable in the future. | |
=head1 OPTIONS | |
--window,-w Minimum number of lines to needed for a match (default 5) | |
--dir,-d Directory of .pm files to search through (default 'lib/') | |
--ignore,-i Duplicates to ignore (may be repeated). | |
The C<--ignore> switch may be useful if you are repeatedly getting "duplicate" | |
sections of code that you aren't interested in refactoring right now. The | |
values of C<--ignore> are joined with a pipe and the check looks like this: | |
if ( $ignore and $duplicate_code =~ /$ignore/ ) { | |
# don't report this chunk of code as a duplicate | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for this great idea. Remember that all of the '{' characters in regexes need to be escaped to avoid this error:
Unescaped left brace in regex is illegal here in regex; marked by <-- HERE in m/^\s*$^H{ <-- HERE / at find_duplicate_code.pl line 144.