Skip to content

Instantly share code, notes, and snippets.

@cgcostume
Created January 31, 2020 19:52
Show Gist options
  • Save cgcostume/2d2aa6b3a964618594b6c00d1169ad08 to your computer and use it in GitHub Desktop.
Save cgcostume/2d2aa6b3a964618594b6c00d1169ad08 to your computer and use it in GitHub Desktop.
Iterate over all Files within a Directory and Output File Paths and Hashes
import argparse
import hashlib
import math
import os
import sqlite3
import sys
import locale
# Adopt the user's locale so that the ':n' number formatting used for the
# progress output follows the locale's digit grouping.
locale.setlocale(locale.LC_ALL, '')

# The text must be passed as `description`: the first positional parameter of
# ArgumentParser is `prog`, which would print the whole sentence as the
# program name in the usage line.
parser = argparse.ArgumentParser(
    description = 'This iterates over all files within a directory and outputs file path and file hash.')
# NOTE: `default` has no effect here, because a positional argument without
# `nargs` is always required.
parser.add_argument('path', help = 'path to compute hashes for', default = '.')
parser.add_argument('target', help = 'target SQL database file path to write hashes and file pathes to')
parser.add_argument('-r', '--recurse', help = 'iterate directory recursively', action = 'store_true', default = False)
args = parser.parse_args()

# Directory to scan, as an absolute path.
path = os.path.abspath(args.path)
# Report a bad directory as a regular usage error. An `assert` is removed
# when Python runs with -O and otherwise only produces a bare traceback.
if not os.path.isdir(path):
    parser.error(f'not a directory: {path}')

# SQLite database file that receives the results, as an absolute path.
target = os.path.abspath(args.target)
# assert not os.path.isfile(target)
def file_as_chunk_iter(file, bytesPerChunk = 1<<16):
    """Yield the content of an open file as a sequence of chunks.

    The file is used as a context manager, so its ``with`` block is left
    (closing a regular file object) when the generator is exhausted or
    closed.

    Args:
        file: Open file object; it is read via ``file.read(bytesPerChunk)``.
        bytesPerChunk: Size passed to every ``read`` call (default 64 KiB).

    Yields:
        Every non-empty chunk returned by ``read``, in reading order.
    """
    with file:
        while True:
            chunk = file.read(bytesPerChunk)
            if len(chunk) == 0:
                # An empty read marks the end of the file.
                break
            yield chunk
def hash_of_chunk_iter(chunk_iter, hasher):
    """Feed all chunks into ``hasher`` and return the resulting hex digest.

    Args:
        chunk_iter: Iterable of chunks; each one is passed to
            ``hasher.update``.
        hasher: Object providing ``update(chunk)`` and ``hexdigest()``,
            e.g. ``hashlib.sha256()``. It is modified in place.

    Returns:
        The value of ``hasher.hexdigest()`` after all chunks were consumed.
    """
    for chunk in chunk_iter:
        hasher.update(chunk)
    return hasher.hexdigest()
# Directory names that are never descended into.
BLACK_LIST = [ 'node_modules', '.git', '.svn' ]

# Paths of all non-empty regular files that are going to be hashed.
todo = []
for subdir, dirs, files in os.walk(path):
    # Prune in place: os.walk only skips directories removed from this list.
    dirs[:] = [d for d in dirs if d not in BLACK_LIST]
    for file in files:
        filepath = os.path.join(subdir, file)
        try:
            non_empty_file = os.path.isfile(filepath) and os.path.getsize(filepath) > 0
        except OSError:
            # The file vanished or cannot be examined; skip it instead of
            # aborting the whole scan.
            continue
        if non_empty_file:
            todo.append(filepath)
    # One progress update per directory keeps the number of flushes low.
    print ('Counting files. Found', f'{len(todo):n}', '...',
        flush = True, end = '\r')
    if not args.recurse:
        # Without --recurse only the top-level directory is scanned.
        break
print('\r')
total = len(todo)
progress = 0
# Width of the running counter: the length of the locale-formatted total.
# This also works for an empty file list, where log10(0) would raise, and for
# exact powers of ten, where ceil(log10(total)) is one digit short.
totalDecimals = len(f'{total:n}')

conn = sqlite3.connect(target)
try:
    conn.execute("""CREATE TABLE IF NOT EXISTS hashes (
        hash TEXT,
        file TEXT NOT NULL,
        size INTEGER,
        UNIQUE(hash, file)
    )""")
    conn.execute("""CREATE INDEX IF NOT EXISTS idx_hash ON hashes (hash)""")
    # conn.execute("""DELETE FROM hashes""")

    for progress, filepath in enumerate(todo, start = 1):
        # Rewrite the same terminal line until the last file is reached.
        print ('Processing file', f'{progress:n}'.rjust(totalDecimals), 'of', f'{total:n}', '...',
            flush = True, end = ('\r' if progress < total else '\n'))

        try:
            digest = hash_of_chunk_iter(file_as_chunk_iter(open(filepath, 'rb')), hashlib.sha256())
        except OSError as e:
            # Best effort: record the file without a hash and keep going.
            digest = None
            print('warning: hash issue on', filepath, f'({e})')

        try:
            size = os.path.getsize(filepath)
        except OSError:
            # The file may have disappeared since it was listed.
            size = None

        conn.execute("""INSERT OR IGNORE INTO hashes VALUES (?, ?, ?)""", (digest, filepath, size))

    conn.commit()
finally:
    # Release the database file even if hashing or an insert failed.
    conn.close()

print("""use 'SELECT file, count(*) FROM hashes GROUP BY hash HAVING count(*) > 1' to detect duplicates""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment