zed9h · July 18, 2009 15:20
diff --git a/dup.pl b/dup.pl
 #!/usr/bin/perl
 use strict;

 my @starting_points;
 my ($recycle);
 my ($del_check, $checksum, $no_name, $smaller_tree, $keep_path_pattern);
 my ($zip_check, $zip_sizefactor, $zip_minsize);
 my ($denied_pattern, $allowed_pattern);

 my %byPath;
 my %byKey;

 my @debug;

 # TO DO
 # *make a scan cache and a sum cache, incremental.. 
 #   -[no]cache (to replace [log] -recycle)

 ## RELEASE NOTES #################################### 20040628 0911
 #
 # DISCLAMER:
 # it's not just a legal disclamer.. this program really 
 # have not been tested enough, caution with what u delete.
 # use it as a tool to find duplicates, not to free disk 
 # space automatically, gotit?

 # progname: dup.pl (perl 5.x for windows/and probably unix)
 # programmer: carlo a caputo (aka 9H.zED) <[email protected]>
 # description: find duplicates, zero length and zippable dirs(&files).
 # developed on ActivePerl Build 518 in a Win98 running on a P2-350 
 #  128Mb 12Gb of a replete-of-messy-crap HD.
 # tested with 3145 dirs and 44168 files on devel comp.
 # tested on a 1979 dirs and 40587 files NT server IIS4 IS2 SQLSrv

 # to begin: Search for: main()

 #################################################################
 # UTIL
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 sub escape_path_in_regexp($)
 {
 	my ($pattern) = @_;
 	$pattern =~ s/\\/\\\\/g;
 	$pattern =~ s/\//\\\//g;
 	$pattern =~ s/([\^\$\+\:\(\)\[\]\{\}])/\\$1/g;
 	($pattern);
 }

 sub escape_name_in_filename($) {
 	$_[0] =~ s/ /_/g;
 	$_[0] =~ s/[\:\.\/\\]/\%/g;
 	($_[0]);
 }

 sub extract_name($)
 {
 	my ($path) = @_;
 	$path =~ m-/([^/]+)/?$-;
 	($1);
 }

 sub extract_parent_path($)
 {
 	my ($path) = @_;
 	$path =~ m-^(.*?/)[^/]+?/?$-;
 	($1);
 }

 ### PROGRESS INDICATOR ###################
 my ($acc_progress, $actual_progress, $total_progress);
 my ($scan_deep, $scan_max_deep, $scan_path); # alternative for directories in blind scanning
 my $progress_display_period = 1; #sec
 my $progress_last_display;

 sub reset_progress_indicator_byPath($$)
 {
 	my ($src, $type) = @_; #$type = (size,count,item);
 	
 	$total_progress = 0;
 	my $amount;
 	foreach my $path (keys %{$byPath{$src}}) {
 		my $key = $byPath{$src}{$path};
 		my (undef,$count,$size) = split /\|/, $key;
 		for($type) {
 			/^size$/ and $amount = $size, last;
 			/^count$/ and $amount = $count, last;
 			$amount = 1;
 		}
 		$total_progress += $amount;
 	}

 	($acc_progress, $actual_progress)=(0,0);
 	$progress_last_display=0;
 }
 sub reset_progress_indicator_byKey($$)
 {
 	my ($src, $type) = @_; #$type = (size,count,item);
 	
 	$total_progress = 0;
 	my $amount;
 	foreach my $key (keys %{$byKey{$src}}) {
 		my (undef,$count,$size) = split /\|/, $key;
 		for($type) {
 			/^size$/ and $amount = $size, last;
 			/^count$/ and $amount = $count, last;
 			$amount = 1;
 		}
 		$total_progress += $amount * @{$byKey{$src}{$key}};
 	}

 	($acc_progress, $actual_progress)=(0,0);
 	$progress_last_display=0;
 }

 sub reset_deep_indicator()
 {
 	my ($rez) = @_;

 	($scan_deep, $scan_max_deep, $scan_path)=(0,0,'');
 	$progress_last_display=0;
 }


 sub progress_acc($)
 {
 	my ($acc) = @_;
 	$acc_progress += $acc;
 	return if (!$total_progress);
 	if ($progress_last_display+$progress_display_period < time) {
 		$actual_progress += $acc_progress;
 		$acc_progress = 0;
 		printf "%3.2f%%\r", ($actual_progress*100.0/$total_progress);
 		$progress_last_display = time;
 	}
 }

 sub deep_update()
 {
 	if ($scan_deep>$scan_max_deep) {
 		$scan_max_deep=$scan_deep;
 		$progress_last_display = 0; # force redisplay
 	}
 	if ($progress_last_display+$progress_display_period <= time) {
 		my ($width,$min_width) = (79,10);
 		my ($full, $part);

 		$full .= $part= ("#" x $scan_deep).(":" x ($scan_max_deep-$scan_deep)); 
 		$width -= length($part);

 		$part = " (\"".$scan_path."\")";
 		if ($width > length($part)) {
 			$full .= $part;
 			$width -= length($part);
 		}
 		else {
 			$part = " (\"\"...\"\")";
 			my $path_width = $width-length($part);
 			if ($path_width > $min_width) {
 				my $path_half = $path_width/2;
 				$part = " (\"".substr($scan_path,0,$path_half)."\"...";
 				$part .= "\"".substr($scan_path,length($scan_path)-$path_half+1,$path_half)."\")";
 				$full .= $part;
 				$width -= length($part);
 			}
 		}
 		$full .= " " x $width;
 		print "$full\r";

 		$progress_last_display = time;
 	}
 }

 #################################################################
 # BASIC OPERATIONS
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#

 sub clear_byPath(@)
 {
 	foreach my $src (@_) {
 		delete $byPath{$src};
 	}
 }

 sub clear_byKey(@)
 {
 	foreach my $src (@_) {
 		delete $byKey{$src};
 	}
 }

 sub copy_byPath($$)
 {
 	my ($dest,$src)=@_;

 	foreach my $path (keys %{$byPath{$src}}) {
 		my $key = $byPath{$src}{$path};
 		$byPath{$dest}{$path} = $key;
 	}
 }

 sub copy_byKey($$)
 {
 	my ($dest,$src)=@_;

 	foreach my $key (keys %{$byKey{$src}}) {
 		push @{$byKey{$dest}{$key}}, @{$byKey{$src}{$key}};
 	}
 }

 sub indexing_byKey_to_byPath($)
 {
 	my ($type)=@_;

 	delete $byPath{$type};
 	foreach my $key (keys %{$byKey{$type}}) {
 		foreach my $path ( @{$byKey{$type}{$key}}) {
 			$byPath{$type}{$path} = $key;
 		}
 	}
 }

 sub indexing_byPath_to_byKey($)
 {
 	my ($type)=@_;

 	clear_byKey($type);
 	foreach my $path (sort keys %{$byPath{$type}}) {
 		my $key = $byPath{$type}{$path};
 		push @{$byKey{$type}{$key}}, $path;
 	}
 }

 #remove_path_pattern_byPath($src, $del, "/^$pattern./");
 #remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i");
 #remove_path_pattern_byPath($src, $del, "/$denied_pattern/i");
 # <code below copied to locations above for speed optmization>
 sub remove_path_pattern_byPath($$$)
 {
 	my ($src, $del, $condition)=@_;
 	return if (!exists $byPath{$src});

 	foreach my $path (sort grep {eval($condition)} keys %{$byPath{$src}}) {
 		$byPath{$del}{$path} = $byPath{$src}{$path};
 		delete $byPath{$src}{$path};
 	}
 }


 sub remove_path_list_byPath($$$)
 {
 	my ($src, $del, $to_remove)=@_;
 	return if (!exists $byPath{$src});

 	foreach my $path (sort keys %{$byPath{$to_remove}}) {
 		if (exists $byPath{$src}{$path}) {
 			$byPath{$del}{$path} = $byPath{$src}{$path};
 			delete $byPath{$src}{$path};
 		}
 	}
 }


 #################################################################
 # SCANNING
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 my ($dir_count, $file_count)=(0,0);
 my %path_cache;
 my %sum_cache;
 my $checksum_width = 4294967295; #32-bit - 1

 sub sum_name($)
 {
 	my ($name) = @_;
 	my ($sum) = (0);

 	my $i;
 	foreach  (unpack("C*",uc($name))) {
 		$sum = ( $sum + $_*(++$i) ) % $checksum_width;
 	}

 	($sum);
 }

 sub sum_filename($)
 {
 	(sum_name(extract_name(shift)));
 }

 sub sum_file($)
 {
 	my ($path) = @_;
 	my ($sum) = (0);

 		if (open FILE, "$path") {
 #			or die "could not open file: \"$path\"";
 			binmode FILE;

 			$/ = 65536;
 			my $n=1;
 			while (<FILE>) {
 				$sum = ($sum + unpack("%32C*", $_)*($n++)) % $checksum_width;
 			}

 			# old: too slow
 #			my $n=1;
 #			while($_ = getc FILE) {
 #				$sum = ($sum + ord($_)*($n++)) % $checksum_width;
 #			}

 			close FILE;
 		} 

 	($sum);
 }

 ######################
 sub scan($) # get name, count, size
 {
 	my ($path) = @_;
 	$path =~ s|\\|/|g;
 	$path =~ s|^(.*?)/?$|$1|;
 	return if (-l $path); # don't follow symlinks
 	$path .= '/' if (-d $path);

 	my ($name,$count,$size,$sum) = (extract_name($path), 1, (-s $path)/1000.0, sum_filename($path));

 	# cache check
 	if (exists $path_cache{$path}) {
 		return ($name,$count,$size,$sum) = split /\|/, $path_cache{$path};
 	}

 	$scan_deep++; # go deep
 	$scan_path = $path;
 	deep_update();

 		if (-f $path) {
 			# all defaults apply
 			$file_count++; # statistics only
 		}

 		if (-d $path) {
 			if (opendir DIR, "$path") {
 #				or die "could not open dir: \"$path\"";
 				my @entry_list = readdir DIR;
 				foreach my $entry (sort @entry_list) {
 					next if($entry =~ /^\.{1,2}$/);

 					my (undef,$dcount,$dsize,$dsum) = scan("$path$entry");
 					$count += $dcount;
 					$size += $dsize;
 					$sum = ($sum + $dsum*$count) % $checksum_width;
 				}
 				closedir DIR;
 			}
 			$dir_count++; # statistics only
 		}

 	$scan_deep--; # back

 	my $key = "$name|$count|$size|$sum";
 	$byPath{'scan'}{$path} = $key;
 	$path_cache{$path} = $key;
 	($name,$count,$size,$sum);
 }

 sub scanning_to_byPath()
 {
 	my $src = 'scan';

 	print "gathering info...\n";

 	my $logfile = report_short_filename('scanned');
 	if($recycle and open FILE, "$logfile") {
 		# get all pre-scanned data from log (wait for synched data)
 		print " recycling old log file (\"$logfile\")...\n";
 		$/ = "\n";
 		while(<FILE>) {
 			#[scan]        0.000kb     1 00000000  c:/_/_chaos/_download/1/
 			if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) {
 				my $path = $4;
 				my ($name,$count,$size,$sum) = (extract_name($path), $2, 0+$1, hex $3);
 				my $key = "$name|$count|$size|$sum";
 				$byPath{$src}{$path} = $key;
 				$path_cache{$path} = $key;
 				($count==1?$file_count:$dir_count)++; # statistics only
 			}
 		}
 		close FILE;
 	}
 	else {
 		# launch scan
 		foreach my $starting_point (@starting_points) {
 			my $msg = " scanning (\"$starting_point\")...";
 			print $msg . " " x (79-length($msg)) . "\n";
 			reset_deep_indicator();
 			scan($starting_point);
 		}
 		report('scanned',0, $src);
 	}

 	my $msg = " got $dir_count dir".($dir_count!=1?'s':'')." and ".
 			"$file_count file".($file_count!=1?'s':'').".";
 	print $msg . " " x (79-length($msg)) . "\n";
 }

 ########################
 sub scan_for_sum($) # requires 'scan' indexing
 {
 	my ($path) = @_;
 	my ($sum) = (0);

 	# cache check
 	if (exists $sum_cache{$path}) {
 		return ($sum_cache{$path});
 	}

 	if (-f $path) {
 		$sum = sum_file("$path") % $checksum_width;
 	}
 	if (-d $path) {
 		my $pattern = escape_path_in_regexp($path);
 		foreach my $path (sort grep /^$pattern./, keys %{$byPath{scan}}) {
 			if (-f $path) {
 				$sum = ($sum + sum_file("$path")) % $checksum_width;
 			}
 		}
 	}

 	$sum_cache{$path} = $sum;
 	($sum);
 }

 sub scanning_for_sum_to_byPath($) # fill-in the sum field (in key)
 {
 	my ($src) = @_;

 	print "verifying full-checksum ($src)...\n";

 	my $logfile = report_short_filename('summed');
 	if($recycle and open FILE, "$logfile") {
 		# just precache cos' cant be sure of the synch between log's and needed data
 		print " recycling old log file (\"$logfile\")...\n";
 		$/ = "\n";
 		while(<FILE>) {
 			#[keeping]        0.003kb     1 00000048* c:/_/_chaos/_download/tmp2/.xx
 			if (/^\[[^\]]+\]\s+([\d\.]+)kb\s+(\d+)\s+([0-9A-F]+)\*?\s+(.*?)$/) {
 				my $path = $4;
 				my ($sum) = (hex $3);
 				$sum_cache{$path} = $sum;
 			}
 		}
 		close FILE;
 	}

 	reset_progress_indicator_byPath($src, 'size');
 	foreach my $path (sort keys %{$byPath{$src}}) {
 		my $key = $byPath{$src}{$path};
 		my ($name,$count,$size,$sum) = split /\|/, $key;
 		$sum = scan_for_sum($path); # new, and improved, checksum
 		my $new_key = "$name|$count|$size|$sum";
 		$byPath{$src}{$path} = $new_key;
 		progress_acc($size);
 	}

 	report('summed',0, $src);
 }
 ######################
 sub clear_key_fields_byPath($$$$$) # usually used before scanning_for_sum_to_byPath()
 {
 	my ($src,$clear_name,$clear_count,$clear_size,$clear_sum) = @_;

 	print "clearing name in key-info ($src)...\n";

 	reset_progress_indicator_byPath($src, 'item');
 	foreach my $path (keys %{$byPath{$src}}) {
 		my $key = $byPath{$src}{$path};
 		my ($name,$count,$size,$sum) = split /\|/, $key;
 		$name = '' if ($clear_name);
 		$count = '' if ($clear_count);
 		$size = '' if ($clear_size);
 		$sum = '' if ($clear_sum);
 		my $new_key = "$name|$count|$size|$sum";
 		$byPath{$src}{$path} = $new_key;
 		progress_acc(1);
 	}
 }

 ###############
 sub tree_size($$) {
 	my($base, $path) = @_;
 	my $leaf_path = $path;

 	my $level = ($path =~ s/\//\//g)+1;
 	my $leaf_level = $level;
 	my $factor = 1.1;
 	my $lv_wgt = 1/($factor**$level);

 	my $tree_size = 0;
 	while ($level and exists $byPath{$base}{$path}) {
 		$lv_wgt *= $factor; $level--;
 		my ($name,$count,$size) = split /\|/, $byPath{$base}{$path};
 		$tree_size += ($size + $count + sum_name($name)*0.00001) * $lv_wgt;
 		$path = extract_parent_path($path);
 	}

 	($tree_size);
 }

 #################################################################
 # ANALISYS AUX OPERATIONS
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 sub nonduplicate_removal_byPath($$)
 {
 	my ($src, $del)=@_;
 	print "non-duplicate removal (+$src/-$del)...\n";

 	indexing_byPath_to_byKey($src);
 	reset_progress_indicator_byKey($src, 'item');
 	foreach my $key (keys %{$byKey{$src}}) {
 		if (@{$byKey{$src}{$key}} <= 1) { # not a duplicate
 			foreach my $path (@{$byKey{$src}{$key}}) { $byPath{$del}{$path} = $key; }
 			delete $byKey{$src}{$key};
 		}
 		progress_acc(1);
 	}
 	indexing_byKey_to_byPath($src);
 	clear_byKey($src);
 }

 sub dir_children_removal_byPath($$$)
 {
 	my ($base, $src, $del)=@_;
 	print " dir cropping ([$base] +$src/-$del)...\n";

 	reset_progress_indicator_byPath($base, 'item');
 	foreach my $path (sort keys %{$byPath{$base}}) {
 		my $key = $byPath{$base}{$path};
 		my (undef,$count) = split /\|/, $key;
 		if ($count>1) { # then it's a dir, may contain children
 			my $pattern = escape_path_in_regexp($path);
 			#remove_path_pattern_byPath($src, $del, "/^$pattern./"); # too slow
 			foreach my $path (grep /^$pattern./, keys %{$byPath{$src}}) {
 				$byPath{$del}{$path} = $byPath{$src}{$path};
 				delete $byPath{$src}{$path};
 			}
 		}
 		progress_acc(1);
 	}
 }


 #################################################################
 # ANALISYS OPERATIONS
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 sub pattern_removal_byPath($$)
 {
 	my($src, $del) = @_;
 	return if ( !($allowed_pattern or $denied_pattern) );
 	print "removing forbiden patterns (+$src/-$del)...\n";

 	if ($allowed_pattern) {
 		print " allow /$allowed_pattern/i\n";
 		#remove_path_pattern_byPath($src, $del, "!/$allowed_pattern/i");
 		foreach my $path (grep !/$allowed_pattern/i, keys %{$byPath{$src}}) {
 			$byPath{$del}{$path} = $byPath{$src}{$path};
 			delete $byPath{$src}{$path};
 		}
 	}
 	elsif ($denied_pattern) {
 		print " deny /$denied_pattern/i\n";
 		#remove_path_pattern_byPath($src, $del, "/$denied_pattern/i");
 		foreach my $path (grep /$denied_pattern/i, keys %{$byPath{$src}}) {
 			$byPath{$del}{$path} = $byPath{$src}{$path};
 			delete $byPath{$src}{$path};
 		}
 	}

 #	dir_children_removal_byPath($del, $del, 'void');
 #	dir_children_removal_byPath($del, $src, 'void'); 
 # does not apply here, e.g. allow *.url but does not allow c:/urls/
 }

 sub zero_removal_byPath($$)
 {
 	my($src, $del) = @_;
 	print "removing zero length (+$src/-$del)...\n";

 	reset_progress_indicator_byPath($src, 'item');
 	foreach my $path (keys %{$byPath{$src}}) {
 		my $key = $byPath{$src}{$path};
 		my (undef,undef,$size) = split /\|/, $key;
 		if ($size==0) {
 			$byPath{$del}{$path} = $key;
 			delete $byPath{$src}{$path};
 		}
 		progress_acc(1);
 	}

 	dir_children_removal_byPath($del, $del, 'void');
 	dir_children_removal_byPath($del, $src, 'void');
 }

 sub duplicate_removal_byPath($$$)
 {
 	my($base, $src, $del) = @_;
 	print "removing duplicates (+$src/-$del)...\n";

 	indexing_byPath_to_byKey($src);
 	reset_progress_indicator_byKey($src, 'item');
 	my $max_tree_size = 1e+306;
 	foreach my $key (keys %{$byKey{$src}}) {
 		my ($the_size, $the_path)=(($smaller_tree?$max_tree_size:0), '');
 			
 		my @candidate = ();
 		if($keep_path_pattern) {
 			@candidate = grep /$keep_path_pattern/, @{$byKey{$src}{$key}};
 		}
 		@candidate = @{$byKey{$src}{$key}} unless @candidate;

 		foreach my $path (@candidate) { 
 			my $tree_size = tree_size($base, $path);
 			if ($smaller_tree xor ($tree_size>$the_size)) {
 				$the_size = $tree_size;
 				$the_path = $path;
 			}
 		}
 		
 		foreach my $path (grep {$_ ne $the_path} @{$byKey{$src}{$key}}) { 
 			$byPath{$del}{$path} = $key;
 		}

 		@{$byKey{$src}{$key}} = ($the_path);
 		progress_acc(1);
 	}
 	indexing_byKey_to_byPath($src);
 	clear_byKey($src);

 	dir_children_removal_byPath($del, $del, 'void');
 	dir_children_removal_byPath($del, $src, 'void');
 }

 sub archiving_removal_byPath($$) # requires zero_removal_byPath
 {
 	my($src, $del) = @_;
 	print "removing archiving possibilities (+$src/-$del)...\n";

 	reset_progress_indicator_byPath($src, 'item');
 	foreach my $path (keys %{$byPath{$src}}) {
 		my $key = $byPath{$src}{$path};
 		my (undef,$count, $size) = split /\|/, $key;
 		if ($size > $zip_minsize and $size/$count < $zip_sizefactor) {
 			$byPath{$del}{$path} = $key;
 			delete $byPath{$src}{$path};
 		}
 		progress_acc(1);
 	}

 	dir_children_removal_byPath($del, $del, 'void');
 	dir_children_removal_byPath($del, $src, 'void');
 }

 #################################################################
 # REPORT FUNCS
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 my $report_totals = 0;

 sub report_byKey(@)
 {
 	my @out;
 	my $max_width;
 	foreach my $src (@_) {
 		next if (!keys %{$byKey{$src}});
 		$max_width = $max_width>length($src)?$max_width:length($src); 
 	}
 	foreach my $src (@_) {
 		next if (!keys %{$byKey{$src}});
 		foreach my $key (keys %{$byKey{$src}}) {
 			my ($name,$count,$size,$sum) = split /\|/, $key;
 			foreach my $path ( @{$byKey{$src}{$key}}) {
 				push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path);
 			}
 		}
 	}
 	(sort @out)
 }

 sub report_byPath(@)
 {
 	my @out;
 	my $max_width;
 	foreach my $src (@_) {
 		next if (!keys %{$byPath{$src}});
 		$max_width = $max_width>length($src)?$max_width:length($src); 
 	}
 	foreach my $src (@_) {
 		next if (!keys %{$byPath{$src}});
 		my ($total_count, $total_size) = (0,0);
 		foreach my $path (keys %{$byPath{$src}}) {
 			my $key = $byPath{$src}{$path};
 			my ($name,$count,$size,$sum) = split /\|/, $key;
 			push @out, sprintf "[%-".$max_width."s] %12skb %5d %08X%1s %s", ($src, sprintf("%8.3f", $size),$count,$sum,($checksum?'*':''), $path);
 #			push @out, sprintf "[%-".$max_width."s] %s (%s|%d|%d|%08X)", ($src, $path, $size,$count,$sum,$name);
 			$total_size += $size, $total_count += $count;
 		}
 		push @out, sprintf "[%-".$max_width."s] %12skb %5d %s", ($src, sprintf("%8.3f", $total_size),$total_count, "TOTAL ". ('=' x 40)) if($report_totals);
 	}
 	(sort @out)
 }


 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 sub report_options() {
 	my @out;
 	push @out, ' starting points: "'.join('", "', @starting_points).'"';
 	push @out, sprintf ' localtime: %s', (scalar localtime(time));
 	push @out, sprintf ' dupcheck: use_checksum=%s, use_names=%s, keep files in the %s branch;', (($checksum?'yes':'no'), (!$no_name?'yes':'no'), ($smaller_tree?'smaller':'bigger')) if($del_check);
 	push @out, sprintf ' zipcheck: min_avg_file_size=%.3f kbytes, min_size_to_zip_dir=%.3f kbytes;', ($zip_sizefactor, $zip_minsize) if($zip_check);
 	if ($allowed_pattern) {	push @out, ' allow only paths like /'.$allowed_pattern.'/i'; }
 	elsif ($denied_pattern) { push @out, ' deny all paths like /'.$denied_pattern.'/i'; }
 	push @out, "";
 	(@out);
 }

 ###########################
 sub report_filename_pieces($) {
 	my ($title) = @_;

 	my $prefix = 'dup';
 	my $path = join(',', @starting_points);
 	my $session = ($title?escape_name_in_filename($title).'_in_':'').'('.escape_name_in_filename($path).')';
 	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
 	my $date = sprintf "%04d%02d%02d_%02d%02d%02d", (1900+$year,$mon,$mday, $hour,$min,$sec); 
 	my $ext = 'log';

 	($prefix, $session, $date, $ext);
 }

 sub report_short_filename($) {
 	my ($title) = @_;
 	my ($prefix, $session, $date, $ext) = report_filename_pieces($title);
 	($prefix.'_'.$session.'.'.$ext);
 }

 sub report_long_filename($) {
 	my ($title) = @_;
 	my ($prefix, $session, $date, $ext) = report_filename_pieces($title);
 	($prefix.'__'.$date.'__'.$session.'.'.$ext);
 }

 ###########################
 my @types_to_report;
 sub report_write_file($) 
 {
 	my ($filename) = @_;
 	my @out;

 	# header
 	push @out, "$filename";
 	push @out, report_options();
 	# analisys
 	push @out, report_byPath(@types_to_report);
 	# debug
 	push @out, report_byKey(@types_to_report);
 	push @out, @debug;

 	open LOG, ">$filename";
 	print LOG join("\n", @out);
 	close LOG;
 	print " log wrote to file $filename\n";
 }

 sub report($$@)
 {
 	my ($title,$totals,@types) = @_;
 	$report_totals = $totals;
 	print "reporting result".($title?" \"$title\"":"")." (".join(",",@types).")...\n";
 	if (@types) {
 		@types_to_report = @types;
 	}
 	else {
 		my %uniq_types;
 		map { $uniq_types{$_} } keys %byPath;
 		map { $uniq_types{$_} } keys %byKey;
 		@types_to_report = sort keys %uniq_types;
 	}

 	report_write_file(report_short_filename($title));
 	report_write_file(report_long_filename($title));
 }

 #################################################################
 # TOP LEVEL FUNCTION
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#

 sub analyse()
 {
 	my $src = 'scan';
 	my $work = 'keeping';
 	
 	print "\nDATA ANALISYS ($src)...\n";
 	copy_byPath($work, $src);
 	clear_key_fields_byPath($work,1,0,0,1) if($no_name);

 	pattern_removal_byPath($work, 'filtered_out');
 	zero_removal_byPath($work, 'zero');
 	archiving_removal_byPath($work, 'zip') if($zip_check);

 	if ($del_check) {
 		if ($checksum) {
 			nonduplicate_removal_byPath($work, 'non_dup');
 			scanning_for_sum_to_byPath($work);
 		}

 		nonduplicate_removal_byPath($work, 'non_dup');

 		copy_byPath('dup2', $work);
 		dir_children_removal_byPath('dup2', 'dup2', 'void');
 		report('duplicate',1, 'dup2'); # all duplicates (for human decisions)

 		duplicate_removal_byPath($src, $work, 'dup');

 		dir_children_removal_byPath('dup', 'zero', 'void');
 		dir_children_removal_byPath('dup', 'zip', 'void') if($zip_check);
 		clear_byPath('void');
 	}

 	dir_children_removal_byPath($work, $work, 'void');
 	report('to_be_keept',1, $work); # for [human] size cmps

 	######
 	clear_byPath($work,'non_dup','filtered_out');
 	report('deletable',1, 'zero','dup') if($del_check);
 	report('zippable',1, 'zip') if($zip_check);
 	clear_byPath('zero','dup','zip');
 }

 #################################################################
 # STARTING POINT
 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#

 #### SETTIN'UP PARAMS #################################
 my $timer_start;
 sub init()
 {
 	print "\ninitializing...\n";

 	@starting_points = grep !/^-/, @ARGV;
 	if (!@starting_points) {
 		print	'usage:'."\n".
 				' perl dup.pl [-s:]<starting_dir> [[-s:]<starting_dir>] '."\n".
 				'             [-[no]recycle] [-(deny|allow):<pattern>]'."\n".
 				'             [-[no][check]sum] [-[no][file]name]'."\n".
 				'             [-[no]del] [-keep:(big[ger]|small[er]|<path_pattern>)]'."\n".
 				'             [-[no]zip] [-zip:<min_avg_filesize>:<min_total_dirsize>]'."\n".
 				"\n".
 				'examples:'."\n".
 				' perl dup.pl c:/ "-deny:\\.(c|h|hpp|cpp)\$"'."\n".
 				' perl dup.pl c: d: -nochecksum -noname'."\n".
 				' perl dup.pl \'c:\\My Documents\' -sum'."\n".
 				' perl dup.pl c:\\Archive\\ -zip:10000:200000 -keep:small'."\n".
 				"\n".
 				'notes:'."\n".
 				' 1) some reports will be generaded, no action will be taken;'."\n".
 				' 2) data scanned and hold on old log files may be reused '."\n".
 				'    (use "-recycle"), there\'s no need to wait a long scan all over;'."\n".
 				' 3) better zip-test after delete the duplicates;'."\n".
 				' 4) "-keep:small" means the dups in the less populated branch '."\n".
 				'    would be kept, while while the other may be deleted, '."\n".
 				'    the default is "-keep:big", or -keep:^/start/dir/1;'."\n".
 				' 5) "-zip:8:50" means that dirs with less than 8kb/file will be '."\n".
 				'    recommended to be zipped, but only if the dir sum 50kb in size; '."\n".
 				' 6) "-allow" overrides "-deny", they cant be used together in this version '."\n".
 				'    recommended to be zipped, but only if the dir sum 50kb in size; '."\n".
 				' 7) this is a open-src change as much as u want.'."\n".
 				
 				"\n".
  				'                      9H.zED ICQ#2405375 [email protected]'."\n".
  				'                             RELEASED IN 20040628 0911 Brazil'."\n".
 				"\n"; 
 		exit;
 	}

 	# option defaults
 	$recycle = 0; # gather info from old logs?

 	$del_check = 1; # perform dup check?
 	$checksum = 0; # perform full binary checksum?
 	$no_name = 0; # use filenames in the hash-key to find dups?
 	$smaller_tree = 0; # keeps dup files on smaller or bigger branchs?
 	$keep_path_pattern = 0; # keeps dup files that match that path pattern

 	$zip_check = 1; # perform zip check?
 	$zip_sizefactor = 8.000; # minimum kbytes per file avg
 	$zip_minsize = 50.000; # minimum size of a dir to allow zipping

 	$allowed_pattern = '';
 	$denied_pattern = '';

 	map {
 		/^-(no)?recycle$/i and $recycle = (lc($1) ne 'no') or
 		/^-(no)?del$/i and $del_check = (lc($1) ne 'no') or
 		/^-(no)?zip$/i and $zip_check = (lc($1) ne 'no') or
 		/^-(no)?(check)?sum$/i and $checksum = (lc($1) ne 'no') or
 		/^-(no)?(file)?name$/i and $no_name = (lc($1) eq 'no') or
 		/^-allow[\:=](.*)$/i and $allowed_pattern = $1 or
 		/^-deny[\:=](.*)$/i and $denied_pattern = $1 or
 		/^-(?:keep[\:=])?(big|small).*$/i and $smaller_tree=(lc($1) eq 'small') or
 		/^-keep[\:=](.*)$/i and $keep_path_pattern=$1 or
 		/^-zip[\:=](.*)\:(.*)$/i and ($zip_sizefactor,$zip_minsize) = ($1,$2) or 
 		/^-s(?:tart(?:ting(?:_point|_dir)?)?)?[\:=](.*)$/i and push @starting_points, $1
 	} @ARGV;
 	@starting_points = sort @starting_points;
 	print join("\n", report_options()) . "\n";

 	$timer_start = time;
 }

 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
 sub finish()
 {
 	print "\nfinishing...\n";
 	print " took";
 	my ($day,$hour,$min,$sec);
 	$sec = (time-$timer_start);
 	if($sec>0) {
 		$min = int($sec/60); $sec-=$min*60;
 		$hour = int($min/60); $min-=$hour*60;
 		$day = int($hour/24); $hour-=$day*24;

 		printf " %d day", ($day) if($day); print "s" if($day>1);
 		printf " %d hour", ($hour) if($hour); print "s" if($hour>1);
 		printf " %02d min", ($min) if($min); print "s" if($min>1);
 		printf " %02d sec", ($sec) if($sec); print "s" if($sec>1);
 	}
 	else {
 		print " nothing";
 	}
 	print ".\n";
 }

 #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#

 # main()
 init();
 scanning_to_byPath(); # output to 'scan'
 analyse();
 finish();