Created
January 31, 2020 19:53
-
-
Save cgcostume/dc38c8120966ad5d21b6887c67416d86 to your computer and use it in GitHub Desktop.
Create a data base with Directory Hashes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import hashlib | |
import math | |
import numpy | |
import os | |
import sqlite3 | |
import sys | |
import locale | |
locale.setlocale(locale.LC_ALL, '') | |
parser = argparse.ArgumentParser('This creates a new data base with directory hashes.') | |
parser.add_argument('source', help = 'path to SQL database containing hashes, file pathes, and file sizes', default = '.') | |
parser.add_argument('target', help = 'target SQL database file path to write hashes and file pathes to') | |
args = parser.parse_args() | |
source = os.path.abspath(args.source) | |
assert os.path.isfile(source) | |
target = os.path.abspath(args.target) | |
# assert not os.path.isfile(target) | |
# total = len(todo) | |
# progress = 0 | |
# totalDecimals = math.ceil(math.log10(total)) | |
# totalDecimals = totalDecimals + math.floor(totalDecimals / 3) | |
conn_in = sqlite3.connect(source) | |
result = conn_in.execute("""SELECT count(*) FROM hashes""") | |
total = int(result.fetchone()[0]) | |
result = conn_in.execute("""SELECT file, size, hash FROM hashes""") | |
todo = {} | |
for file, size, hash in result.fetchall(): | |
dirname = os.path.dirname(file) | |
basename = os.path.basename(file) | |
if dirname not in todo: | |
todo[dirname] = [] | |
todo[dirname].append(basename) | |
todo[dirname].append(int(size)) | |
todo[dirname].append(hash) | |
conn_out = sqlite3.connect(target) | |
conn_out.execute("""CREATE TABLE IF NOT EXISTS hashes ( | |
hash TEXT, | |
path TEXT NOT NULL, | |
size INTEGER, | |
UNIQUE(hash, path) | |
)""") | |
conn_out.execute("""CREATE INDEX IF NOT EXISTS idx_hash ON hashes (hash)""") | |
# conn_out.execute("""DELETE FROM hashes""") | |
total = len(todo) | |
progress = 0 | |
totalDecimals = math.ceil(math.log10(total)) | |
totalDecimals = totalDecimals + math.floor(totalDecimals / 3) | |
hashes = {} | |
for path in todo: | |
progress += 1 | |
print ('Processing file', f'{progress:n}'.rjust(totalDecimals), 'of', f'{total:n}', '...', | |
flush = True, end = ('\r' if progress < total else '\n')) | |
data = numpy.ravel(todo[path]) | |
try: | |
size = int(data[1::3].astype(int).sum()) | |
except Exception as e: | |
print('exception', e, data[1::3]) | |
hasher = hashlib.sha256() | |
hasher.update(data) | |
hash = hasher.hexdigest() | |
conn_out.execute("""INSERT OR IGNORE INTO hashes VALUES (?, ?, ?)""", (hash, path, size)) | |
conn_in.close() | |
conn_out.commit() | |
conn_out.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Find duplicate files:
Find duplicate directories: