file based dedupe producer (gist wwalker/480511)
Created July 18, 2010 16:09
#!/bin/sh
#
# Simple file based dedupe producer.
# The idea is to run this on one or more directories; the result is a tuple
# of md5sum and filename for files that are potentially duplicates (you need
# to finish by using cmp to make sure).
# Copyright 2010 Sterling Commerce, Inc.
# Copyright 2010 Christopher Jay Cox
#
# http://endlessnow.com/ten/Source/dedupe-sh.txt
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Filters here: size, inode, md5sum
#
# md5sum, like cmp, is the most expensive operation, so ideally it should be
# the last filter run. The inode check, while fast, is interesting as the
# 1st filter run only in certain cases. In some cases you may decide to skip
# it altogether, but realize that the program might then report duplicate
# files that are really the SAME file (hard links to one inode).
#
# Output is a list of md5sum filename tuples ordered by matching md5sum.
# Ultimately, the md5sum becomes the key for the next step of processing
# in identifying which files are duplicates. So, in the next step, your
# processing loop would run cmp across the potentially duplicate files
# sharing an md5sum (this is to avoid the rare case of an md5sum collision,
# made even rarer by the size filter in particular).
#
# You could also consider a file type filter, if you are interested in
# duplicates of some particular file type.
# xargs -0 file | grep 'image data' | cut -f1 -d: | tr '\012' '\000' |
#
# Changelog
#
# 2010-07-18 - wwalker - changed sed | cut to more complex sed to prevent
#                        possible filename truncation
# 2010-07-18 - wwalker - changed 6s to 7s
find "$@" -type f -print0 |
xargs -0 ls -sd | sort -k1bn | uniq -w 7 -D |
sed 's/^ *[0-9]* *//' | tr '\012' '\000' |
xargs -0 ls -id | sort | uniq -w 7 |
sed 's/^ *[0-9]* *//' | tr '\012' '\000' |
xargs -0 md5sum | sort | uniq -w 32 -D
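The follow-up step described in the header comments (running cmp within each md5sum group) could be sketched like this. It is a minimal illustration, not part of the original script: the temp directory and sample files are hypothetical stand-ins for real input, and it assumes newline-delimited md5sum tuples with no embedded newlines in filenames.

```shell
#!/bin/sh
# Sketch of the "next step" described above: confirm with cmp that files
# sharing an md5sum are byte-for-byte identical.  The temp files here are
# only a stand-in for the files the producer script would scan.
dir=$(mktemp -d)
printf 'same contents\n' > "$dir/a"
printf 'same contents\n' > "$dir/b"
printf 'different\n'     > "$dir/c"

# Emulate the producer's final output: md5sum tuples sorted by checksum.
md5sum "$dir/a" "$dir/b" "$dir/c" | sort |
while read -r sum file; do
    # Adjacent lines with the same checksum are candidate duplicates;
    # cmp -s settles the (vanishingly rare) collision case.
    if [ "$sum" = "$prev_sum" ] && cmp -s "$prev_file" "$file"; then
        echo "duplicate: $prev_file == $file"
    fi
    prev_sum=$sum
    prev_file=$file
done

rm -rf "$dir"
```

With the producer's output saved to a file, the same while loop would read that file instead of the emulated md5sum run.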