notwa · August 10, 2017 10:28
diff --git a/idup.py b/idup.py
 #!/usr/bin/env python3
 # find duplicate images given a hamming distance threshold.
 # employs dhash to do the heavy lifting.
 # doesn't recurse into "./_duplicate/" so you can dump things there if you wish.
 # dependencies: pillow, dhash

 import sys, os, os.path, pickle
 from PIL import Image
 import dhash

 def lament(*args, **kwargs):
     """Print a diagnostic message to stderr.

     Takes the same positional and keyword arguments as print(), except
     ``file``, which is always sys.stderr.
     """
     stream = sys.stderr
     print(*args, file=stream, **kwargs)

 def result(diff, p1, p2): # TODO: rename
     """Write one match to stdout as a tab-separated line: diff, p1, p2."""
     line = "{}\t{}\t{}".format(diff, p1, p2)
     print(line)

 # name of the pickled hash database, relative to the current working directory.
 dbname = "idup.db"
 # file extensions (including the dot) that are treated as images.
 exts = ".jpeg .jpg .png".split()

 rootpath = "."
 ignore_dir = os.path.join(rootpath, "_duplicate")

 # verbosity:
 #   -1: only unrecoverable errors.
 #    0: include failures.
 #    1: include image opening/hashing.
 #    2: the kitchen sink.
 verbosity = 1

 pname = sys.argv[0]
 if len(sys.argv) <= 1:
     print("usage: {} {{threshold}}".format(pname))
     print("    utilizes {} in the current working directory".format(dbname))
     sys.exit(1)
 args = sys.argv[1:]

 # the threshold is the largest hamming distance still reported as a match.
 try:
     threshold = int(args[0])
 except ValueError:
     # a non-numeric threshold is a usage error, not a reason for a traceback.
     lament("error: threshold must be an integer, got {!r}".format(args[0]))
     lament("usage: {} {{threshold}}".format(pname))
     sys.exit(1)

 paths = {} # path to hash mapping.

 if os.path.exists(dbname) and os.path.getsize(dbname) > 0:
     # NOTE: unpickling can execute arbitrary code, so only run this against a
     # database file that this script wrote itself.
     with open(dbname, "rb") as f:
         paths = pickle.load(f)
 else:
     if verbosity >= 0:
         lament("warning: no database found. starting from scratch.")

 # forget database entries whose files no longer exist, reporting each one.
 existing = {}
 for path, h in paths.items():
     if os.path.exists(path):
         existing[path] = h
     elif verbosity >= 0:
         lament("#d", path)

 paths = existing

 def compare_hash(h1, h2):
     """Return the hamming distance between two hashes given as byte strings."""
     # read each byte string as one big-endian integer, so that a single XOR
     # leaves a 1 bit at every position where the two hashes disagree.
     as_ints = [int.from_bytes(h, byteorder="big") for h in (h1, h2)]
     differing = as_ints[0] ^ as_ints[1]
     return format(differing, "b").count("1")

 def run():
     """Hash every not-yet-known image under rootpath and report duplicates.

     New hashes are stored in the module-level ``paths`` mapping. Matches are
     written through result(): exact hash matches with a distance of -1,
     then, if ``threshold`` is positive, every pair of differing hashes whose
     hamming distance is at most ``threshold``.
     """
     for dn, dns, fns in os.walk(rootpath):
         # prune the ignored directory in place so os.walk never descends into
         # it; merely skipping its own files would still visit whatever
         # subdirectories it contains.
         dns[:] = [d for d in dns if os.path.join(dn, d) != ignore_dir]

         for fn in fns:
             # compare extensions case-insensitively so "a.JPG" is picked up.
             ext = os.path.splitext(fn)[1].lower()
             if ext not in exts:
                 continue

             path = os.path.join(dn, fn)
             if path in paths:
                 # already hashed on an earlier run.
                 if verbosity >= 2:
                     lament("#s", path)
                 continue

             try:
                 # the context manager releases the file even if hashing fails.
                 with Image.open(path) as image:
                     row, col = dhash.dhash_row_col(image)
             except OSError:
                 if verbosity >= 0:
                     lament("#f", path)
                 continue

             if verbosity >= 1:
                 lament("#o", path)
             paths[path] = dhash.format_bytes(row, col)

     # first pass: exact hash matching.
     # when several paths share a hash the reverse mapping keeps the last one,
     # so every other path of that group is reported against it.
     hashes = {h: p for p, h in paths.items()}
     for p1, h in paths.items():
         p2 = hashes[h]
         if p1 != p2:
             result(-1, p1, p2)

     # second pass: fuzzy hash matching.
     if threshold <= 0:
         return
     items = list(paths.items())
     for i, (p1, h1) in enumerate(items):
         if verbosity >= 2:
             lament("#c", p1)
         # only look at later entries so that each pair is compared once.
         for p2, h2 in items[i + 1:]:
             if h1 == h2:
                 # identical hashes were already reported by the first pass.
                 continue
             diff = compare_hash(h1, h2)
             if diff <= threshold:
                 result(diff, p1, p2)

 try:
     run()
 except KeyboardInterrupt:
     if verbosity >= 0:
         lament("# interrupted")
 finally:
     # always persist the hashes gathered so far, even after an interrupt or
     # an error, so that finished work is not repeated on the next run.
     backup = dbname + ".bak"
     try:
         # os.replace overwrites an existing backup in one step, so there is
         # never a moment where the old backup is gone and the new one is not
         # yet in place.
         os.replace(dbname, backup)
     except FileNotFoundError:
         # there is no database to back up yet.
         pass
     with open(dbname, "wb") as f:
         pickle.dump(paths, f)
	#!/usr/bin/env python3
	# find duplicate images given a hamming distance threshold.
	# employs dhash to do the heavy lifting.
	# doesn't recurse into "./_duplicate/" so you can dump things there if you wish.
	# dependencies: pillow, dhash

	import sys, os, os.path, pickle
	from PIL import Image
	import dhash

	def lament(*args, **kwargs):
	    """Print a diagnostic message to stderr.

	    Takes the same positional and keyword arguments as print(), except
	    ``file``, which is always sys.stderr.
	    """
	    print(*args, file=sys.stderr, **kwargs)

	def result(diff, p1, p2): # TODO: rename
	    """Write one match to stdout as a tab-separated line: diff, p1, p2."""
	    print("{}\t{}\t{}".format(diff, p1, p2))

	# name of the pickled hash database, relative to the current working directory.
	dbname = "idup.db"
	# file extensions (including the dot) that are treated as images.
	exts = ".jpeg .jpg .png".split()

	rootpath = "."
	ignore_dir = os.path.join(rootpath, "_duplicate")

	# verbosity:
	#   -1: only unrecoverable errors.
	#    0: include failures.
	#    1: include image opening/hashing.
	#    2: the kitchen sink.
	verbosity = 1

	pname = sys.argv[0]
	if len(sys.argv) <= 1:
	    print("usage: {} {{threshold}}".format(pname))
	    print("    utilizes {} in the current working directory".format(dbname))
	    sys.exit(1)
	args = sys.argv[1:]

	# the threshold is the largest hamming distance still reported as a match.
	try:
	    threshold = int(args[0])
	except ValueError:
	    # a non-numeric threshold is a usage error, not a reason for a traceback.
	    lament("error: threshold must be an integer, got {!r}".format(args[0]))
	    lament("usage: {} {{threshold}}".format(pname))
	    sys.exit(1)

	paths = {} # path to hash mapping.

	if os.path.exists(dbname) and os.path.getsize(dbname) > 0:
	    # NOTE: unpickling can execute arbitrary code, so only run this against a
	    # database file that this script wrote itself.
	    with open(dbname, "rb") as f:
	        paths = pickle.load(f)
	else:
	    if verbosity >= 0:
	        lament("warning: no database found. starting from scratch.")

	# forget database entries whose files no longer exist, reporting each one.
	existing = {}
	for path, h in paths.items():
	    if os.path.exists(path):
	        existing[path] = h
	    elif verbosity >= 0:
	        lament("#d", path)

	paths = existing

	def compare_hash(h1, h2):
	    """Return the hamming distance between two hashes given as byte strings."""
	    # hashes are in byte strings, so we have to convert them to integers.
	    i1 = int.from_bytes(h1, byteorder="big")
	    i2 = int.from_bytes(h2, byteorder="big")
	    # the XOR has a 1 bit wherever the hashes disagree; count those bits.
	    return bin(i1 ^ i2).count('1')

	def run():
	    """Hash every not-yet-known image under rootpath and report duplicates.

	    New hashes are stored in the module-level ``paths`` mapping. Matches are
	    written through result(): exact hash matches with a distance of -1,
	    then, if ``threshold`` is positive, every pair of differing hashes whose
	    hamming distance is at most ``threshold``.
	    """
	    for dn, dns, fns in os.walk(rootpath):
	        # prune the ignored directory in place so os.walk never descends into
	        # it; merely skipping its own files would still visit whatever
	        # subdirectories it contains.
	        dns[:] = [d for d in dns if os.path.join(dn, d) != ignore_dir]

	        for fn in fns:
	            # compare extensions case-insensitively so "a.JPG" is picked up.
	            ext = os.path.splitext(fn)[1].lower()
	            if ext not in exts:
	                continue

	            path = os.path.join(dn, fn)
	            if path in paths:
	                # already hashed on an earlier run.
	                if verbosity >= 2:
	                    lament("#s", path)
	                continue

	            try:
	                # the context manager releases the file even if hashing fails.
	                with Image.open(path) as image:
	                    row, col = dhash.dhash_row_col(image)
	            except OSError:
	                if verbosity >= 0:
	                    lament("#f", path)
	                continue

	            if verbosity >= 1:
	                lament("#o", path)
	            paths[path] = dhash.format_bytes(row, col)

	    # first pass: exact hash matching.
	    # when several paths share a hash the reverse mapping keeps the last one,
	    # so every other path of that group is reported against it.
	    hashes = {h: p for p, h in paths.items()}
	    for p1, h in paths.items():
	        p2 = hashes[h]
	        if p1 != p2:
	            result(-1, p1, p2)

	    # second pass: fuzzy hash matching.
	    if threshold <= 0:
	        return
	    items = list(paths.items())
	    for i, (p1, h1) in enumerate(items):
	        if verbosity >= 2:
	            lament("#c", p1)
	        # only look at later entries so that each pair is compared once.
	        for p2, h2 in items[i + 1:]:
	            if h1 == h2:
	                # identical hashes were already reported by the first pass.
	                continue
	            diff = compare_hash(h1, h2)
	            if diff <= threshold:
	                result(diff, p1, p2)

	try:
	    run()
	except KeyboardInterrupt:
	    if verbosity >= 0:
	        lament("# interrupted")
	finally:
	    # always persist the hashes gathered so far, even after an interrupt or
	    # an error, so that finished work is not repeated on the next run.
	    backup = dbname + ".bak"
	    try:
	        # os.replace overwrites an existing backup in one step, so there is
	        # never a moment where the old backup is gone and the new one is not
	        # yet in place.
	        os.replace(dbname, backup)
	    except FileNotFoundError:
	        # there is no database to back up yet.
	        pass
	    with open(dbname, "wb") as f:
	        pickle.dump(paths, f)