Skip to content

Instantly share code, notes, and snippets.

@komly
Created November 1, 2016 10:35
Show Gist options
  • Save komly/efd7a13dae67dd55f9e96e6d8522c5b3 to your computer and use it in GitHub Desktop.
Save komly/efd7a13dae67dd55f9e96e6d8522c5b3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import os
import sys
import binascii
def find_duplicates(path):
    """Walk *path* recursively and report files that look like duplicates.

    Every regular file under *path* is fingerprinted by its byte length
    together with the CRC32 of its contents.  The first file seen with a
    given fingerprint is remembered as the "original"; every later file
    with the same fingerprint is printed to stdout as::

        Duplicate found: <path>, Original: <first path with that fingerprint>

    Files that cannot be opened or read are reported on stderr and skipped,
    so a single bad file does not abort the scan.

    NOTE: CRC32 is a checksum, not a cryptographic hash.  A match on
    (size, CRC32) is a strong hint rather than proof of identical contents.

    Args:
        path: Directory to scan (anything accepted by ``os.walk``).

    Returns:
        None.  Results are written to stdout, read errors to stderr.
    """
    chunk_size = 1 << 20  # read 1 MiB at a time instead of the whole file
    first_seen = {}  # (size, crc32) -> path of the first file with that key

    for root, _dirs, files in os.walk(path):
        for name in files:
            full_path = os.path.join(root, name)
            # Skip anything that is not a regular file (sockets, FIFOs,
            # broken symlinks, ...).  os.path.isfile follows symlinks, so a
            # link pointing at a regular file is still scanned.
            if not os.path.isfile(full_path):
                continue

            try:
                crc = 0
                size = 0
                with open(full_path, 'rb') as f:
                    # binascii.crc32 accepts the running value, so the
                    # result equals the CRC of the whole file.
                    for chunk in iter(lambda: f.read(chunk_size), b''):
                        crc = binascii.crc32(chunk, crc)
                        size += len(chunk)
            except OSError as e:
                # Covers PermissionError as well as files that vanish or
                # fail mid-read; report and move on to the next file.
                print(e, file=sys.stderr)
                continue

            key = (size, crc)
            if key in first_seen:
                print("Duplicate found: %s, Original: %s" % (full_path, first_seen[key]))
            else:
                # Record only the first occurrence so that every later
                # duplicate is reported against the same original.
                first_seen[key] = full_path
# Script entry point: expects the directory to scan as the first argument.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        # The bare exit() helper is injected by the site module and is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit("Usage: %s path" % sys.argv[0])
    find_duplicates(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment