Last active
September 4, 2021 12:37
-
-
Save SapphicCode/a516e29da1133a1c87405bf91c121808 to your computer and use it in GitHub Desktop.
A script to ingest recurring archives from various service exports
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import hashlib | |
import json | |
algorithm = 'sha3_256' | |
file_map = {} | |
try: | |
with open(os.path.join(algorithm, 'digests.json'), 'r') as f: | |
file_map = json.load(f) | |
except FileNotFoundError: | |
pass | |
try: | |
for directory, _, files in os.walk(sys.argv[1]): | |
for file in files: | |
# hash file | |
hash = getattr(hashlib, algorithm)() | |
file_path = os.path.join(directory, file) | |
with open(file_path, 'rb') as f: | |
data = f.read(4 * 1024 * 1024) | |
hash.update(data) | |
# Windows support | |
file_path = file_path.replace("\\", "/") | |
# strip relative path | |
if file_path.startswith("./"): | |
file_path = file_path[2:] | |
# dump into map | |
digest = hash.hexdigest() | |
file_paths = file_map.get(digest, []) | |
if file_path not in file_paths: | |
file_paths.append(file_path) | |
file_map[digest] = file_paths | |
# move to hash directory | |
try: | |
digest_path = os.path.join(algorithm, digest[:2]) | |
digest_file_path = os.path.join(digest_path, f'{digest}.bin') | |
os.makedirs(digest_path, exist_ok=True) | |
os.rename(file_path, digest_file_path) | |
except FileExistsError: | |
print("duplicated file encountered. discarding.") | |
os.remove(file_path) | |
except AttributeError as e: | |
raise e | |
except Exception as e: | |
print(e) | |
with open(os.path.join(algorithm, 'digests.json'), 'w') as f: | |
json.dump(file_map, f, indent=2) |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.