Iterate over all Files within a Directory and Output File Paths and Hashes
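Usage: the script takes a directory to scan and a target SQLite file as positional arguments, plus an optional -r/--recurse flag to descend into subdirectories, e.g. `python hash-files.py /some/directory hashes.sqlite -r` (the script and database file names here are placeholders; the gist itself does not fix them).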
import argparse
import hashlib
import locale
import math
import os
import sqlite3

# use the user's locale so that the '{:n}' number formatting below renders
# thousands separators in the progress output
locale.setlocale(locale.LC_ALL, '')

parser = argparse.ArgumentParser(
    description = 'Iterates over all files within a directory and outputs file path and file hash.')
parser.add_argument('path', help = 'path to compute hashes for')
parser.add_argument('target', help = 'target SQLite database file path to write hashes and file paths to')
parser.add_argument('-r', '--recurse', help = 'iterate directory recursively', action = 'store_true', default = False)
args = parser.parse_args()

path = os.path.abspath(args.path)
assert os.path.isdir(path)

target = os.path.abspath(args.target)
# assert not os.path.isfile(target)

def file_as_chunk_iter(file, bytesPerChunk = 1 << 16):
    # yield the file's contents in 64 KiB chunks so that large files never
    # have to be held in memory at once; closes the file when exhausted
    with file:
        chunk = file.read(bytesPerChunk)
        while len(chunk) > 0:
            yield chunk
            chunk = file.read(bytesPerChunk)

def hash_of_chunk_iter(chunk_iter, hasher):
    # feed every chunk into the given hashlib object and return its hex digest
    for chunk in chunk_iter:
        hasher.update(chunk)
    return hasher.hexdigest()

BLACK_LIST = [ 'node_modules', '.git', '.svn' ]

todo = []
for subdir, dirs, files in os.walk(path):
    # prune blacklisted directories in place so os.walk does not descend into them
    dirs[:] = [d for d in dirs if d not in BLACK_LIST]
    for file in files:
        filepath = os.path.join(subdir, file)
        # skip anything that is not a regular file as well as empty files
        if os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
            todo.append(filepath)
            print ('Counting files. Found', f'{len(todo):n}', '...',
                flush = True, end = '\r')
    # without --recurse, stop after the top-level directory
    if not args.recurse:
        break
print('\r')

total = len(todo)
progress = 0

# width of the formatted total (digits plus thousands separators), used to
# right-align the progress counter
totalDecimals = math.ceil(math.log10(max(total, 1)))
totalDecimals = totalDecimals + math.floor(totalDecimals / 3)

conn = sqlite3.connect(target)
conn.execute("""CREATE TABLE IF NOT EXISTS hashes (
    hash TEXT,
    file TEXT NOT NULL,
    size INTEGER,
    UNIQUE(hash, file)
    )""")
conn.execute("""CREATE INDEX IF NOT EXISTS idx_hash ON hashes (hash)""")
# conn.execute("""DELETE FROM hashes""")
# if not args.quiet: print ('HASH,FILEPATH')

for filepath in todo:
    progress += 1
    print ('Processing file', f'{progress:n}'.rjust(totalDecimals), 'of', f'{total:n}', '...',
        flush = True, end = ('\r' if progress < total else '\n'))
    try:
        # hash the file contents chunk by chunk using SHA-256
        hash = hash_of_chunk_iter(file_as_chunk_iter(open(filepath, 'rb')), hashlib.sha256())
    except Exception:
        # unreadable files (permissions, vanished files, ...) are recorded with a NULL hash
        hash = None
        print('warning: hash issue on', filepath)
    conn.execute("""INSERT OR IGNORE INTO hashes VALUES (?, ?, ?)""", (hash, filepath, os.path.getsize(filepath)))

conn.commit()
conn.close()

print("""use 'SELECT file, count(*) FROM hashes GROUP BY hash HAVING count(*) > 1' to detect duplicates""")