Created
July 18, 2009 15:20
-
-
Save zed9h/149582 to your computer and use it in GitHub Desktop.
suggests duplicate-file removal and small-file backup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
my @starting_points; | |
my ($recycle); | |
my ($del_check, $checksum, $no_name, $smaller_tree, $keep_path_pattern); | |
my ($zip_check, $zip_sizefactor, $zip_minsize); | |
my ($denied_pattern, $allowed_pattern); | |
my %byPath; | |
my %byKey; | |
my @debug; | |
# TO DO | |
# *make a scan cache and a sum cache, incremental.. | |
# -[no]cache (to replace [log] -recycle) | |
## RELEASE NOTES #################################### 20040628 0911 | |
# | |
# DISCLAMER: | |
# it's not just a legal disclamer.. this program really | |
# have not been tested enough, caution with what u delete. | |
# use it as a tool to find duplicates, not to free disk | |
# space automatically, gotit? | |
# progname: dup.pl (perl 5.x for windows/and probably unix) | |
# programmer: carlo a caputo (aka 9H.zED) <[email protected]> | |
# description: find duplicates, zero length and zippable dirs(&files). | |
# developed on ActivePerl Build 518 in a Win98 running on a P2-350 | |
# 128Mb 12Gb of a replete-of-messy-crap HD. | |
# tested with 3145 dirs and 44168 files on devel comp. | |
# tested on a 1979 dirs and 40587 files NT server IIS4 IS2 SQLSrv | |
# to begin: Search for: main() | |
################################################################# | |
# UTIL | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub escape_path_in_regexp($) | |
{ | |
my ($pattern) = @_; | |
$pattern =~ s/\\/\\\\/g; | |
$pattern =~ s/\//\\\//g; | |
$pattern =~ s/([\^\$\+\:\(\)\[\]\{\}])/\\$1/g; | |
($pattern); | |
} | |
sub escape_name_in_filename($) { | |
$_[0] =~ s/ /_/g; | |
$_[0] =~ s/[\:\.\/\\]/\%/g; | |
($_[0]); | |
} | |
sub extract_name($) | |
{ | |
my ($path) = @_; | |
$path =~ m-/([^/]+)/?$-; | |
($1); | |
} | |
sub extract_parent_path($) | |
{ | |
my ($path) = @_; | |
$path =~ m-^(.*?/)[^/]+?/?$-; | |
($1); | |
} | |
### PROGRESS INDICATOR ################### | |
my ($acc_progress, $actual_progress, $total_progress); | |
my ($scan_deep, $scan_max_deep, $scan_path); # alternative for directories in blind scanning | |
my $progress_display_period = 1; #sec | |
my $progress_last_display; | |
sub reset_progress_indicator_byPath($$) | |
{ | |
my ($src, $type) = @_; #$type = (size,count,item); | |
$total_progress = 0; | |
my $amount; | |
foreach my $path (keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
my (undef,$count,$size) = split /\|/, $key; | |
for($type) { | |
/^size$/ and $amount = $size, last; | |
/^count$/ and $amount = $count, last; | |
$amount = 1; | |
} | |
$total_progress += $amount; | |
} | |
($acc_progress, $actual_progress)=(0,0); | |
$progress_last_display=0; | |
} | |
sub reset_progress_indicator_byKey($$) | |
{ | |
my ($src, $type) = @_; #$type = (size,count,item); | |
$total_progress = 0; | |
my $amount; | |
foreach my $key (keys %{$byKey{$src}}) { | |
my (undef,$count,$size) = split /\|/, $key; | |
for($type) { | |
/^size$/ and $amount = $size, last; | |
/^count$/ and $amount = $count, last; | |
$amount = 1; | |
} | |
$total_progress += $amount * @{$byKey{$src}{$key}}; | |
} | |
($acc_progress, $actual_progress)=(0,0); | |
$progress_last_display=0; | |
} | |
sub reset_deep_indicator() | |
{ | |
my ($rez) = @_; | |
($scan_deep, $scan_max_deep, $scan_path)=(0,0,''); | |
$progress_last_display=0; | |
} | |
sub progress_acc($) | |
{ | |
my ($acc) = @_; | |
$acc_progress += $acc; | |
return if (!$total_progress); | |
if ($progress_last_display+$progress_display_period < time) { | |
$actual_progress += $acc_progress; | |
$acc_progress = 0; | |
printf "%3.2f%%\r", ($actual_progress*100.0/$total_progress); | |
$progress_last_display = time; | |
} | |
} | |
sub deep_update() | |
{ | |
if ($scan_deep>$scan_max_deep) { | |
$scan_max_deep=$scan_deep; | |
$progress_last_display = 0; # force redisplay | |
} | |
if ($progress_last_display+$progress_display_period <= time) { | |
my ($width,$min_width) = (79,10); | |
my ($full, $part); | |
$full .= $part= ("#" x $scan_deep).(":" x ($scan_max_deep-$scan_deep)); | |
$width -= length($part); | |
$part = " (\"".$scan_path."\")"; | |
if ($width > length($part)) { | |
$full .= $part; | |
$width -= length($part); | |
} | |
else { | |
$part = " (\"\"...\"\")"; | |
my $path_width = $width-length($part); | |
if ($path_width > $min_width) { | |
my $path_half = $path_width/2; | |
$part = " (\"".substr($scan_path,0,$path_half)."\"..."; | |
$part .= "\"".substr($scan_path,length($scan_path)-$path_half+1,$path_half)."\")"; | |
$full .= $part; | |
$width -= length($part); | |
} | |
} | |
$full .= " " x $width; | |
print "$full\r"; | |
$progress_last_display = time; | |
} | |
} | |
################################################################# | |
# BASIC OPERATIONS | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub clear_byPath(@) | |
{ | |
foreach my $src (@_) { | |
delete $byPath{$src}; | |
} | |
} | |
sub clear_byKey(@) | |
{ | |
foreach my $src (@_) { | |
delete $byKey{$src}; | |
} | |
} | |
sub copy_byPath($$) | |
{ | |
my ($dest,$src)=@_; | |
foreach my $path (keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
$byPath{$dest}{$path} = $key; | |
} | |
} | |
sub copy_byKey($$) | |
{ | |
my ($dest,$src)=@_; | |
foreach my $key (keys %{$byKey{$src}}) { | |
push @{$byKey{$dest}{$key}}, @{$byKey{$src}{$key}}; | |
} | |
} | |
sub indexing_byKey_to_byPath($) | |
{ | |
my ($type)=@_; | |
delete $byPath{$type}; | |
foreach my $key (keys %{$byKey{$type}}) { | |
foreach my $path ( @{$byKey{$type}{$key}}) { | |
$byPath{$type}{$path} = $key; | |
} | |
} | |
} | |
sub indexing_byPath_to_byKey($) | |
{ | |
my ($type)=@_; | |
clear_byKey($type); | |
foreach my $path (sort keys %{$byPath{$type}}) { | |
my $key = $byPath{$type}{$path}; | |
push @{$byKey{$type}{$key}}, $path; | |
} | |
} | |
#remove_path_pattern_byPath($src, $del, "/^$pattern./"); | |
#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i"); | |
#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i"); | |
# <code below copied to locations above for speed optmization> | |
sub remove_path_pattern_byPath($$$) | |
{ | |
my ($src, $del, $condition)=@_; | |
return if (!exists $byPath{$src}); | |
foreach my $path (sort grep {eval($condition)} keys %{$byPath{$src}}) { | |
$byPath{$del}{$path} = $byPath{$src}{$path}; | |
delete $byPath{$src}{$path}; | |
} | |
} | |
sub remove_path_list_byPath($$$) | |
{ | |
my ($src, $del, $to_remove)=@_; | |
return if (!exists $byPath{$src}); | |
foreach my $path (sort keys %{$byPath{$to_remove}}) { | |
if (exists $byPath{$src}{$path}) { | |
$byPath{$del}{$path} = $byPath{$src}{$path}; | |
delete $byPath{$src}{$path}; | |
} | |
} | |
} | |
################################################################# | |
# SCANNING | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
my ($dir_count, $file_count)=(0,0); | |
my %path_cache; | |
my %sum_cache; | |
my $checksum_width = 4294967295; #32-bit - 1 | |
sub sum_name($) | |
{ | |
my ($name) = @_; | |
my ($sum) = (0); | |
my $i; | |
foreach (unpack("C*",uc($name))) { | |
$sum = ( $sum + $_*(++$i) ) % $checksum_width; | |
} | |
($sum); | |
} | |
sub sum_filename($) | |
{ | |
(sum_name(extract_name(shift))); | |
} | |
sub sum_file($) | |
{ | |
my ($path) = @_; | |
my ($sum) = (0); | |
if (open FILE, "$path") { | |
# or die "could not open file: \"$path\""; | |
binmode FILE; | |
$/ = 65536; | |
my $n=1; | |
while (<FILE>) { | |
$sum = ($sum + unpack("%32C*", $_)*($n++)) % $checksum_width; | |
} | |
# old: too slow | |
# my $n=1; | |
# while($_ = getc FILE) { | |
# $sum = ($sum + ord($_)*($n++)) % $checksum_width; | |
# } | |
close FILE; | |
} | |
($sum); | |
} | |
###################### | |
sub scan($) # get name, count, size | |
{ | |
my ($path) = @_; | |
$path =~ s|\\|/|g; | |
$path =~ s|^(.*?)/?$|$1|; | |
return if (-l $path); # don't follow symlinks | |
$path .= '/' if (-d $path); | |
my ($name,$count,$size,$sum) = (extract_name($path), 1, (-s $path)/1000.0, sum_filename($path)); | |
# cache check | |
if (exists $path_cache{$path}) { | |
return ($name,$count,$size,$sum) = split /\|/, $path_cache{$path}; | |
} | |
$scan_deep++; # go deep | |
$scan_path = $path; | |
deep_update(); | |
if (-f $path) { | |
# all defaults apply | |
$file_count++; # statistics only | |
} | |
if (-d $path) { | |
if (opendir DIR, "$path") { | |
# or die "could not open dir: \"$path\""; | |
my @entry_list = readdir DIR; | |
foreach my $entry (sort @entry_list) { | |
next if($entry =~ /^\.{1,2}$/); | |
my (undef,$dcount,$dsize,$dsum) = scan("$path$entry"); | |
$count += $dcount; | |
$size += $dsize; | |
$sum = ($sum + $dsum*$count) % $checksum_width; | |
} | |
closedir DIR; | |
} | |
$dir_count++; # statistics only | |
} | |
$scan_deep--; # back | |
my $key = "$name|$count|$size|$sum"; | |
$byPath{'scan'}{$path} = $key; | |
$path_cache{$path} = $key; | |
($name,$count,$size,$sum); | |
} | |
sub scanning_to_byPath() | |
{ | |
my $src = 'scan'; | |
print "gathering info...\n"; | |
my $logfile = report_short_filename('scanned'); | |
if($recycle and open FILE, "$logfile") { | |
# get all pre-scanned data from log (wait for synched data) | |
print " recycling old log file (\"$logfile\")...\n"; | |
$/ = "\n"; | |
while(<FILE>) { | |
#[scan] 0.000kb 1 00000000 c:/_/_chaos/_download/1/ | |
if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) { | |
my $path = $4; | |
my ($name,$count,$size,$sum) = (extract_name($path), $2, 0+$1, hex $3); | |
my $key = "$name|$count|$size|$sum"; | |
$byPath{$src}{$path} = $key; | |
$path_cache{$path} = $key; | |
($count==1?$file_count:$dir_count)++; # statistics only | |
} | |
} | |
close FILE; | |
} | |
else { | |
# launch scan | |
foreach my $starting_point (@starting_points) { | |
my $msg = " scanning (\"$starting_point\")..."; | |
print $msg . " " x (79-length($msg)) . "\n"; | |
reset_deep_indicator(); | |
scan($starting_point); | |
} | |
report('scanned',0, $src); | |
} | |
my $msg = " got $dir_count dir".($dir_count!=1?'s':'')." and ". | |
"$file_count file".($file_count!=1?'s':'')."."; | |
print $msg . " " x (79-length($msg)) . "\n"; | |
} | |
######################## | |
sub scan_for_sum($) # requires 'scan' indexing | |
{ | |
my ($path) = @_; | |
my ($sum) = (0); | |
# cache check | |
if (exists $sum_cache{$path}) { | |
return ($sum_cache{$path}); | |
} | |
if (-f $path) { | |
$sum = sum_file("$path") % $checksum_width; | |
} | |
if (-d $path) { | |
my $pattern = escape_path_in_regexp($path); | |
foreach my $path (sort grep /^$pattern./, keys %{$byPath{scan}}) { | |
if (-f $path) { | |
$sum = ($sum + sum_file("$path")) % $checksum_width; | |
} | |
} | |
} | |
$sum_cache{$path} = $sum; | |
($sum); | |
} | |
sub scanning_for_sum_to_byPath($) # fill-in the sum field (in key) | |
{ | |
my ($src) = @_; | |
print "verifying full-checksum ($src)...\n"; | |
my $logfile = report_short_filename('summed'); | |
if($recycle and open FILE, "$logfile") { | |
# just precache cos' cant be sure of the synch between log's and needed data | |
print " recycling old log file (\"$logfile\")...\n"; | |
$/ = "\n"; | |
while(<FILE>) { | |
#[keeping] 0.003kb 1 00000048* c:/_/_chaos/_download/tmp2/.xx | |
if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) { | |
my $path = $4; | |
my ($sum) = (hex $3); | |
$sum_cache{$path} = $sum; | |
} | |
} | |
close FILE; | |
} | |
reset_progress_indicator_byPath($src, 'size'); | |
foreach my $path (sort keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
my ($name,$count,$size,$sum) = split /\|/, $key; | |
$sum = scan_for_sum($path); # new, and improved, checksum | |
my $new_key = "$name|$count|$size|$sum"; | |
$byPath{$src}{$path} = $new_key; | |
progress_acc($size); | |
} | |
report('summed',0, $src); | |
} | |
###################### | |
sub clear_key_fields_byPath($$$$$) # usually used before scanning_for_sum_to_byPath() | |
{ | |
my ($src,$clear_name,$clear_count,$clear_size,$clear_sum) = @_; | |
print "clearing name in key-info ($src)...\n"; | |
reset_progress_indicator_byPath($src, 'item'); | |
foreach my $path (keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
my ($name,$count,$size,$sum) = split /\|/, $key; | |
$name = '' if ($clear_name); | |
$count = '' if ($clear_count); | |
$size = '' if ($clear_size); | |
$sum = '' if ($clear_sum); | |
my $new_key = "$name|$count|$size|$sum"; | |
$byPath{$src}{$path} = $new_key; | |
progress_acc(1); | |
} | |
} | |
############### | |
sub tree_size($$) { | |
my($base, $path) = @_; | |
my $leaf_path = $path; | |
my $level = ($path =~ s/\//\//g)+1; | |
my $leaf_level = $level; | |
my $factor = 1.1; | |
my $lv_wgt = 1/($factor**$level); | |
my $tree_size = 0; | |
while ($level and exists $byPath{$base}{$path}) { | |
$lv_wgt *= $factor; $level--; | |
my ($name,$count,$size) = split /\|/, $byPath{$base}{$path}; | |
$tree_size += ($size + $count + sum_name($name)*0.00001) * $lv_wgt; | |
$path = extract_parent_path($path); | |
} | |
($tree_size); | |
} | |
################################################################# | |
# ANALISYS AUX OPERATIONS | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub nonduplicate_removal_byPath($$) | |
{ | |
my ($src, $del)=@_; | |
print "non-duplicate removal (+$src/-$del)...\n"; | |
indexing_byPath_to_byKey($src); | |
reset_progress_indicator_byKey($src, 'item'); | |
foreach my $key (keys %{$byKey{$src}}) { | |
if (@{$byKey{$src}{$key}} <= 1) { # not a duplicate | |
foreach my $path (@{$byKey{$src}{$key}}) { $byPath{$del}{$path} = $key; } | |
delete $byKey{$src}{$key}; | |
} | |
progress_acc(1); | |
} | |
indexing_byKey_to_byPath($src); | |
clear_byKey($src); | |
} | |
sub dir_children_removal_byPath($$$) | |
{ | |
my ($base, $src, $del)=@_; | |
print " dir cropping ([$base] +$src/-$del)...\n"; | |
reset_progress_indicator_byPath($base, 'item'); | |
foreach my $path (sort keys %{$byPath{$base}}) { | |
my $key = $byPath{$base}{$path}; | |
my (undef,$count) = split /\|/, $key; | |
if ($count>1) { # then it's a dir, may contain children | |
my $pattern = escape_path_in_regexp($path); | |
#remove_path_pattern_byPath($src, $del, "/^$pattern./"); # too slow | |
foreach my $path (grep /^$pattern./, keys %{$byPath{$src}}) { | |
$byPath{$del}{$path} = $byPath{$src}{$path}; | |
delete $byPath{$src}{$path}; | |
} | |
} | |
progress_acc(1); | |
} | |
} | |
################################################################# | |
# ANALISYS OPERATIONS | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub pattern_removal_byPath($$) | |
{ | |
my($src, $del) = @_; | |
return if ( !($allowed_pattern or $denied_pattern) ); | |
print "removing forbiden patterns (+$src/-$del)...\n"; | |
if ($allowed_pattern) { | |
print " allow /$allowed_pattern/i\n"; | |
#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i"); | |
foreach my $path (grep !/$allowed_pattern/i, keys %{$byPath{$src}}) { | |
$byPath{$del}{$path} = $byPath{$src}{$path}; | |
delete $byPath{$src}{$path}; | |
} | |
} | |
elsif ($denied_pattern) { | |
print " deny /$denied_pattern/i\n"; | |
#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i"); | |
foreach my $path (grep /$denied_pattern/i, keys %{$byPath{$src}}) { | |
$byPath{$del}{$path} = $byPath{$src}{$path}; | |
delete $byPath{$src}{$path}; | |
} | |
} | |
# dir_children_removal_byPath($del, $del, 'void'); | |
# dir_children_removal_byPath($del, $src, 'void'); | |
# does not apply here, e.g. allow *.url but does not allow c:/urls/ | |
} | |
sub zero_removal_byPath($$) | |
{ | |
my($src, $del) = @_; | |
print "removing zero length (+$src/-$del)...\n"; | |
reset_progress_indicator_byPath($src, 'item'); | |
foreach my $path (keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
my (undef,undef,$size) = split /\|/, $key; | |
if ($size==0) { | |
$byPath{$del}{$path} = $key; | |
delete $byPath{$src}{$path}; | |
} | |
progress_acc(1); | |
} | |
dir_children_removal_byPath($del, $del, 'void'); | |
dir_children_removal_byPath($del, $src, 'void'); | |
} | |
sub duplicate_removal_byPath($$$) | |
{ | |
my($base, $src, $del) = @_; | |
print "removing duplicates (+$src/-$del)...\n"; | |
indexing_byPath_to_byKey($src); | |
reset_progress_indicator_byKey($src, 'item'); | |
my $max_tree_size = 1e+306; | |
foreach my $key (keys %{$byKey{$src}}) { | |
my ($the_size, $the_path)=(($smaller_tree?$max_tree_size:0), ''); | |
my @candidate = (); | |
if($keep_path_pattern) { | |
@candidate = grep /$keep_path_pattern/, @{$byKey{$src}{$key}}; | |
} | |
@candidate = @{$byKey{$src}{$key}} unless @candidate; | |
foreach my $path (@candidate) { | |
my $tree_size = tree_size($base, $path); | |
if ($smaller_tree xor ($tree_size>$the_size)) { | |
$the_size = $tree_size; | |
$the_path = $path; | |
} | |
} | |
foreach my $path (grep {$_ ne $the_path} @{$byKey{$src}{$key}}) { | |
$byPath{$del}{$path} = $key; | |
} | |
@{$byKey{$src}{$key}} = ($the_path); | |
progress_acc(1); | |
} | |
indexing_byKey_to_byPath($src); | |
clear_byKey($src); | |
dir_children_removal_byPath($del, $del, 'void'); | |
dir_children_removal_byPath($del, $src, 'void'); | |
} | |
sub archiving_removal_byPath($$) # requires zero_removal_byPath | |
{ | |
my($src, $del) = @_; | |
print "removing archiving possibilities (+$src/-$del)...\n"; | |
reset_progress_indicator_byPath($src, 'item'); | |
foreach my $path (keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
my (undef,$count, $size) = split /\|/, $key; | |
if ($size > $zip_minsize and $size/$count < $zip_sizefactor) { | |
$byPath{$del}{$path} = $key; | |
delete $byPath{$src}{$path}; | |
} | |
progress_acc(1); | |
} | |
dir_children_removal_byPath($del, $del, 'void'); | |
dir_children_removal_byPath($del, $src, 'void'); | |
} | |
################################################################# | |
# REPORT FUNCS | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
my $report_totals = 0; | |
sub report_byKey(@) | |
{ | |
my @out; | |
my $max_width; | |
foreach my $src (@_) { | |
next if (!keys %{$byKey{$src}}); | |
$max_width = $max_width>length($src)?$max_width:length($src); | |
} | |
foreach my $src (@_) { | |
next if (!keys %{$byKey{$src}}); | |
foreach my $key (keys %{$byKey{$src}}) { | |
my ($name,$count,$size,$sum) = split /\|/, $key; | |
foreach my $path ( @{$byKey{$src}{$key}}) { | |
push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path); | |
} | |
} | |
} | |
(sort @out) | |
} | |
sub report_byPath(@) | |
{ | |
my @out; | |
my $max_width; | |
foreach my $src (@_) { | |
next if (!keys %{$byPath{$src}}); | |
$max_width = $max_width>length($src)?$max_width:length($src); | |
} | |
foreach my $src (@_) { | |
next if (!keys %{$byPath{$src}}); | |
my ($total_count, $total_size) = (0,0); | |
foreach my $path (keys %{$byPath{$src}}) { | |
my $key = $byPath{$src}{$path}; | |
my ($name,$count,$size,$sum) = split /\|/, $key; | |
push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path); | |
# push @out, sprintf "[%-".$max_width."s] %s (%s|%d|%d|%08X)", ($src, $path, $size,$count,$sum,$name); | |
$total_size += $size, $total_count += $count; | |
} | |
push @out, sprintf "[%-".$max_width."s] %12skb %5d %s", ($src, sprintf("%8.3f", $total_size),$total_count, "TOTAL ". ('=' x 40)) if($report_totals); | |
} | |
(sort @out) | |
} | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub report_options() { | |
my @out; | |
push @out, ' starting points: "'.join('", "', @starting_points).'"'; | |
push @out, sprintf ' localtime: %s', (scalar localtime(time)); | |
push @out, sprintf ' dupcheck: use_checksum=%s, use_names=%s, keep files in the %s branch;', (($checksum?'yes':'no'), (!$no_name?'yes':'no'), ($smaller_tree?'smaller':'bigger')) if($del_check); | |
push @out, sprintf ' zipcheck: min_avg_file_size=%.3f kbytes, min_size_to_zip_dir=%.3f kbytes;', ($zip_sizefactor, $zip_minsize) if($zip_check); | |
if ($allowed_pattern) { push @out, ' allow only paths like /'.$allowed_pattern.'/i'; } | |
elsif ($denied_pattern) { push @out, ' deny all paths like /'.$denied_pattern.'/i'; } | |
push @out, ""; | |
(@out); | |
} | |
########################### | |
sub report_filename_pieces($) { | |
my ($title) = @_; | |
my $prefix = 'dup'; | |
my $path = join(',', @starting_points); | |
my $session = ($title?escape_name_in_filename($title).'_in_':'').'('.escape_name_in_filename($path).')'; | |
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); | |
my $date = sprintf "%04d%02d%02d_%02d%02d%02d", (1900+$year,$mon,$mday, $hour,$min,$sec); | |
my $ext = 'log'; | |
($prefix, $session, $date, $ext); | |
} | |
sub report_short_filename($) { | |
my ($title) = @_; | |
my ($prefix, $session, $date, $ext) = report_filename_pieces($title); | |
($prefix.'_'.$session.'.'.$ext); | |
} | |
sub report_long_filename($) { | |
my ($title) = @_; | |
my ($prefix, $session, $date, $ext) = report_filename_pieces($title); | |
($prefix.'__'.$date.'__'.$session.'.'.$ext); | |
} | |
########################### | |
my @types_to_report; | |
sub report_write_file($) | |
{ | |
my ($filename) = @_; | |
my @out; | |
# header | |
push @out, "$filename"; | |
push @out, report_options(); | |
# analisys | |
push @out, report_byPath(@types_to_report); | |
# debug | |
push @out, report_byKey(@types_to_report); | |
push @out, @debug; | |
open LOG, ">$filename"; | |
print LOG join("\n", @out); | |
close LOG; | |
print " log wrote to file $filename\n"; | |
} | |
sub report($$@) | |
{ | |
my ($title,$totals,@types) = @_; | |
$report_totals = $totals; | |
print "reporting result".($title?" \"$title\"":"")." (".join(",",@types).")...\n"; | |
if (@types) { | |
@types_to_report = @types; | |
} | |
else { | |
my %uniq_types; | |
map { $uniq_types{$_} } keys %byPath; | |
map { $uniq_types{$_} } keys %byKey; | |
@types_to_report = sort keys %uniq_types; | |
} | |
report_write_file(report_short_filename($title)); | |
report_write_file(report_long_filename($title)); | |
} | |
################################################################# | |
# TOP LEVEL FUNCTION | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub analyse() | |
{ | |
my $src = 'scan'; | |
my $work = 'keeping'; | |
print "\nDATA ANALISYS ($src)...\n"; | |
copy_byPath($work, $src); | |
clear_key_fields_byPath($work,1,0,0,1) if($no_name); | |
pattern_removal_byPath($work, 'filtered_out'); | |
zero_removal_byPath($work, 'zero'); | |
archiving_removal_byPath($work, 'zip') if($zip_check); | |
if ($del_check) { | |
if ($checksum) { | |
nonduplicate_removal_byPath($work, 'non_dup'); | |
scanning_for_sum_to_byPath($work); | |
} | |
nonduplicate_removal_byPath($work, 'non_dup'); | |
copy_byPath('dup2', $work); | |
dir_children_removal_byPath('dup2', 'dup2', 'void'); | |
report('duplicate',1, 'dup2'); # all duplicates (for human decisions) | |
duplicate_removal_byPath($src, $work, 'dup'); | |
dir_children_removal_byPath('dup', 'zero', 'void'); | |
dir_children_removal_byPath('dup', 'zip', 'void') if($zip_check); | |
clear_byPath('void'); | |
} | |
dir_children_removal_byPath($work, $work, 'void'); | |
report('to_be_keept',1, $work); # for [human] size cmps | |
###### | |
clear_byPath($work,'non_dup','filtered_out'); | |
report('deletable',1, 'zero','dup') if($del_check); | |
report('zippable',1, 'zip') if($zip_check); | |
clear_byPath('zero','dup','zip'); | |
} | |
################################################################# | |
# STARTING POINT | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
#### SETTIN'UP PARAMS ################################# | |
my $timer_start; | |
sub init() | |
{ | |
print "\ninitializing...\n"; | |
@starting_points = grep !/^-/, @ARGV; | |
if (!@starting_points) { | |
print 'usage:'."\n". | |
' perl dup.pl [-s:]<starting_dir> [[-s:]<starting_dir>] '."\n". | |
' [-[no]recycle] [-(deny|allow):<pattern>]'."\n". | |
' [-[no][check]sum] [-[no][file]name]'."\n". | |
' [-[no]del] [-keep:(big[ger]|small[er]|<path_pattern>)]'."\n". | |
' [-[no]zip] [-zip:<min_avg_filesize>:<min_total_dirsize>]'."\n". | |
"\n". | |
'examples:'."\n". | |
' perl dup.pl c:/ "-deny:\\.(c|h|hpp|cpp)\$"'."\n". | |
' perl dup.pl c: d: -nochecksum -noname'."\n". | |
' perl dup.pl \'c:\\My Documents\' -sum'."\n". | |
' perl dup.pl c:\\Archive\\ -zip:10000:200000 -keep:small'."\n". | |
"\n". | |
'notes:'."\n". | |
' 1) some reports will be generaded, no action will be taken;'."\n". | |
' 2) data scanned and hold on old log files may be reused '."\n". | |
' (use "-recycle"), there\'s no need to wait a long scan all over;'."\n". | |
' 3) better zip-test after delete the duplicates;'."\n". | |
' 4) "-keep:small" means the dups in the less populated branch '."\n". | |
' would be kept, while while the other may be deleted, '."\n". | |
' the default is "-keep:big", or -keep:^/start/dir/1;'."\n". | |
' 5) "-zip:8:50" means that dirs with less than 8kb/file will be '."\n". | |
' recommended to be zipped, but only if the dir sum 50kb in size; '."\n". | |
' 6) "-allow" overrides "-deny", they cant be used together in this version '."\n". | |
' recommended to be zipped, but only if the dir sum 50kb in size; '."\n". | |
' 7) this is a open-src change as much as u want.'."\n". | |
"\n". | |
' 9H.zED ICQ#2405375 [email protected]'."\n". | |
' RELEASED IN 20040628 0911 Brazil'."\n". | |
"\n"; | |
exit; | |
} | |
# option defaults | |
$recycle = 0; # gather info from old logs? | |
$del_check = 1; # perform dup check? | |
$checksum = 0; # perform full binary checksum? | |
$no_name = 0; # use filenames in the hash-key to find dups? | |
$smaller_tree = 0; # keeps dup files on smaller or bigger branchs? | |
$keep_path_pattern = 0; # keeps dup files that match that path pattern | |
$zip_check = 1; # perform zip check? | |
$zip_sizefactor = 8.000; # minimum kbytes per file avg | |
$zip_minsize = 50.000; # minimum size of a dir to allow zipping | |
$allowed_pattern = ''; | |
$denied_pattern = ''; | |
map { | |
/^-(no)?recycle$/i and $recycle = (lc($1) ne 'no') or | |
/^-(no)?del$/i and $del_check = (lc($1) ne 'no') or | |
/^-(no)?zip$/i and $zip_check = (lc($1) ne 'no') or | |
/^-(no)?(check)?sum$/i and $checksum = (lc($1) ne 'no') or | |
/^-(no)?(file)?name$/i and $no_name = (lc($1) eq 'no') or | |
/^-allow[\:=](.*)$/i and $allowed_pattern = $1 or | |
/^-deny[\:=](.*)$/i and $denied_pattern = $1 or | |
/^-(?:keep[\:=])?(big|small).*$/i and $smaller_tree=(lc($1) eq 'small') or | |
/^-keep[\:=](.*)$/i and $keep_path_pattern=$1 or | |
/^-zip[\:=](.*)\:(.*)$/i and ($zip_sizefactor,$zip_minsize) = ($1,$2) or | |
/^-s(?:tart(?:ting(?:_point|_dir)?)?)?[\:=](.*)$/i and push @starting_points, $1 | |
} @ARGV; | |
@starting_points = sort @starting_points; | |
print join("\n", report_options()) . "\n"; | |
$timer_start = time; | |
} | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
sub finish() | |
{ | |
print "\nfinishing...\n"; | |
print " took"; | |
my ($day,$hour,$min,$sec); | |
$sec = (time-$timer_start); | |
if($sec>0) { | |
$min = int($sec/60); $sec-=$min*60; | |
$hour = int($min/60); $min-=$hour*60; | |
$day = int($hour/24); $hour-=$day*24; | |
printf " %d day", ($day) if($day); print "s" if($day>1); | |
printf " %d hour", ($hour) if($hour); print "s" if($hour>1); | |
printf " %02d min", ($min) if($min); print "s" if($min>1); | |
printf " %02d sec", ($sec) if($sec); print "s" if($sec>1); | |
} | |
else { | |
print " nothing"; | |
} | |
print ".\n"; | |
} | |
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# | |
# main() | |
init(); | |
scanning_to_byPath(); # output to 'scan' | |
analyse(); | |
finish(); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment