Last active
November 20, 2020 02:19
-
-
Save Jwink3101/4d0edae5eae762509676913aac050049 to your computer and use it in GitHub Desktop.
Really barebones duplicate file finder. Doesn't even have a CLI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Duplicate file finder. Finds dupe files by comparing the following attributes.
Matching size is necessary but very, very far from sufficient. Still, it is very
fast so we use that to cut out a lot of files.
* size
    * Obviously not at all robust, but necessary
* Adler32 checksum of the first 1kb (full-file CRC32 is also available)
* SHA-256 hash
""" | |
from __future__ import division, print_function, unicode_literals | |
__version__ = '20200120' | |
import os | |
from collections import defaultdict | |
import hashlib | |
import itertools | |
import fnmatch | |
import zlib | |
######################### Settings 1
# Directory tree to scan for duplicates.
root = '/path/to/root'
# fnmatch-style patterns, matched case-insensitively against each file name
# and each directory name individually (NOT against full paths).
excludes = ['.*','*.AAE',] # Note that they are on a per-file basis
# See Settings 2 below but those likely do not need to change
#########################
##### Hashers | |
def sha256(filepath,BLOCKSIZE=2**20):
    """
    Return the SHA-256 hex digest of the file at `filepath`.

    The file is consumed in chunks of `BLOCKSIZE` bytes so arbitrarily
    large files never need to fit in memory.
        2**20: 1 mb
        2**12: 4 kb
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as fh:
        # iter() with a sentinel keeps reading until read() returns b''
        for chunk in iter(lambda: fh.read(BLOCKSIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()
def crc32(filepath,blocksize=2**20):
    """
    Return the CRC-32 of the file as an 8-character lowercase hex string.

    `blocksize` controls how much of the file is read into memory at a
    time, which matters for large files.
        2**20 = 1024 * 1024 = 1 mb
        2**12 = 4    * 1024 = 4 kb
    """
    checksum = 0
    with open(filepath, 'rb') as fh:
        for chunk in iter(lambda: fh.read(blocksize), b''):
            checksum = zlib.crc32(chunk, checksum)
    # Per the zlib docs: mask with 0xffffffff to get the same numeric
    # value across all Python versions and platforms.
    return format(checksum & 0xffffffff, '08x')
def adler32_1kb(filepath):
    """Adler-32 of *just* the first 1kb, as an 8-digit lowercase hex string."""
    with open(filepath, 'rb') as fh:
        head = fh.read(1024)
    # Mask for a version/platform-independent unsigned value.
    return format(zlib.adler32(head) & 0xffffffff, '08x')
######################### Settings 2
# Specify test functions
# The tests MUST be in order of severity!!!
# Each entry is (label, function(filepath) -> comparable value). Files are
# only kept while they still collide with another file on every test so
# far, so the cheapest tests go first to cut the candidate set quickly.
tests = [
    ('size',lambda s:os.stat(s).st_size),
    ('Adler32 1kb',adler32_1kb),
#     ('crc32',crc32),
    ('sha256',sha256)
    ]
#########################
## Get items and size to start
# Walk `root`, applying the exclude patterns to directory and file names
# and skipping symlinks, to build the initial candidate list.
files = []
excludes = [e.lower() for e in excludes]  # patterns match case-insensitively
for dirpath, dirnames, filenames in os.walk(root): # TODO: Settable
    # Prune excluded directories in place so os.walk never descends into
    # them. Iterate over a copy since `dirnames` is mutated in the loop.
    for dirname in dirnames[:]:
        if any(fnmatch.fnmatch(dirname.lower(),e) for e in excludes):
            dirnames.remove(dirname)
    for filename in filenames:
        # NOTE: the original stripped a leading './' here, but os.walk
        # always yields bare names, so that branch was dead code.
        if any(fnmatch.fnmatch(filename.lower(),e) for e in excludes):
            continue
        filename = os.path.join(dirpath,filename)
        # Skip symlinks so a file isn't counted twice via a link.
        if os.path.islink(filename):
            continue
        files.append(filename)
print(f'Initially found {len(files)} files')
class ArgReturn(object):
    """
    Wrap a callable so every call returns ``(args, kwargs, result)``.

    Useful with ``map`` when the input that produced each result is
    needed alongside the result itself.
    """
    def __init__(self, fun):
        self.fun = fun

    def __call__(self, *args, **kwargs):
        result = self.fun(*args, **kwargs)
        return args, kwargs, result
# Winnow `files` through each test in turn: group files by the test's
# value and keep only the groups that still contain 2+ members.
udict = None
for name, func in tests:
    grouped = defaultdict(list)
    for filename in files:
        grouped[func(filename)].append(filename)
    # Drop singleton groups; survivors feed the next (more expensive) test.
    udict = {key: vals for key, vals in grouped.items() if len(vals) >= 2}
    files = [f for vals in udict.values() for f in vals]
    print(f"Test '{name}' found {len(udict)} unique values with {len(files)} files")
# Report the duplicate groups, largest group first.
keys = sorted(udict, key=lambda k: -len(udict[k]))
for key in keys:
    files = udict[key]
    files.sort(key=str.lower)
    print(f'\nFollowing {len(files)} are identical:')
    for ii, file in enumerate(files):
        print(f' {ii+1}: {file}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment