Created
August 12, 2019 15:54
-
-
Save Jwink3101/af77e8bdf509caf6162643549f29a6fe to your computer and use it in GitHub Desktop.
Super, super, super barebones, barely-tested, feature-less duplicate file finder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Find duplicate files | |
Warning: This has not been thoroughly tested and is missing many
features including exclusions, error handling, parallelism, etc.
This is barebones | |
Process: | |
* Walk the directory tree | |
* Compute the Adler32 checksum. This is faster than SHA1 but less reliable | |
* Determine duplicates and confirm via sha256 | |
""" | |
from __future__ import print_function | |
## Specify this path. It is probably better to be an absolute path | |
DATAPATH = '/PATH/TO/DIRECTORY' | |
import os | |
import zlib | |
import hashlib | |
from collections import defaultdict | |
def adler(filepath, BLOCKSIZE=2**15):
    """
    Return the adler32 checksum of the file at *filepath* as an
    unsigned integer.

    The checksum is built additively, one BLOCKSIZE-byte chunk at a
    time, so the whole file never has to fit in memory.  Per the zlib
    docs, masking with 0xffffffff guarantees the same unsigned value
    on every Python version and platform.
    """
    running = 1  # adler32's defined starting value
    with open(filepath, 'rb') as fh:
        # read() returns b'' at EOF, which terminates the iter() sentinel loop
        for chunk in iter(lambda: fh.read(BLOCKSIZE), b''):
            running = zlib.adler32(chunk, running)
    return running & 0xffffffff
def sha256(filepath, BLOCKSIZE=2**15):
    """
    Return the SHA-256 hex digest of the file at *filepath*.

    The file is read in BLOCKSIZE-byte chunks (default 2**15 = 32 KiB)
    so arbitrarily large files can be hashed without loading them into
    memory at once.
    """
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
if __name__ == '__main__':
    # checksum -> list of file paths with that checksum
    dups_adler = defaultdict(list)
    dups_sha256 = defaultdict(list)

    # Pass 1: walk the tree and compute the fast adler32 checksum of every
    # file.  Equal checksums only mark *candidate* duplicates (adler32 can
    # collide); they are confirmed in pass 2.
    for dirpath, dirnames, filenames in os.walk(DATAPATH):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            try:
                dups_adler[adler(filepath)].append(filepath)
            except (IOError, OSError) as err:
                # Unreadable file (permissions, broken symlink, ...):
                # warn and keep scanning instead of aborting the whole run.
                print('WARNING: could not read {}: {}'.format(filepath, err))

    # Pass 2: confirm candidate duplicates with the stronger SHA-256 hash.
    # (Only groups with more than one member need the expensive hash.)
    candidates = [group for group in dups_adler.values() if len(group) > 1]
    for group in candidates:
        for filepath in group:
            dups_sha256[sha256(filepath)].append(filepath)

    # Report every SHA-256 group that still has more than one member.
    for files in dups_sha256.values():
        if len(files) == 1:
            continue  # adler32 collision only — not a real duplicate
        print('\nThe following {} files are duplicates'.format(len(files)))
        for file in files:
            print(' ' + file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment