Created
August 24, 2024 00:31
-
-
Save hax4dazy/a214ba8f63dbaa4663e8946b4981fcbc to your computer and use it in GitHub Desktop.
Quick Python script to check for dupes in a few directories. It scans all the directories, stores each file's SHA1 hash in the database, and then checks whether each subsequent file's hash already exists within the DB.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from peewee import * | |
# Shared SQLite handle for every model in this module. The file name is
# relative, so it is resolved against the current working directory.
db = SqliteDatabase(
    "database.db",
    pragmas={"foreign_keys": 1},
)
class MySQLModel(Model):
    """Base model that binds every subclass to the module-level ``db``.

    Despite the name, ``db`` is a ``SqliteDatabase``, so subclasses are
    stored in SQLite.
    """

    class Meta:
        # Subclasses inherit this binding through peewee's Meta inheritance.
        database = db
class Files(MySQLModel):
    """One row per recorded file: where it lives and the hash stored for it."""

    # Path of the file as produced by the directory scan.
    path = TextField()
    # Hash string recorded for the file. The scan script looks rows up by
    # this column once per scanned file, so it is indexed to avoid a full
    # table scan on every lookup.
    sha1hash = TextField(index=True)
# Running this module directly initialises the schema.
if __name__ == "__main__":
    # safe=True means tables that already exist are left alone.
    db.create_tables([Files], safe=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, hashlib, sys, time | |
from db import * | |
# Number of bytes requested per read when hashing a file.
BUF_SIZE = 65536

# Root directories that are walked when looking for duplicates.
directorysToScan = [
    "/Main_Pool/Hax4dayz/Immich",
    "/Main_Pool/Hax4dayz/Nextcloud",
]

# Running totals, updated by the scan loop.
dupeCounter = 0
fileCounter = 0

# Make sure ./out exists; an already existing entry is not an error.
try:
    os.mkdir('out')
except FileExistsError:
    pass

# Absolute path of the output directory, including a trailing separator.
pathToOutputDir = os.sep.join((os.getcwd(), 'out', ''))
def scanDirectory(directory):
    """Return the paths of all files found beneath ``directory``.

    The tree is traversed with ``os.walk``; each returned path is the
    walked directory joined with the file name. Directories themselves are
    not included in the result.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]
def scanFile(file, buf_size=None):
    """Return the SHA-1 hex digest of the full contents of ``file``.

    The file is read in chunks until EOF so that large files are hashed
    completely without being loaded into memory at once. Hashing only the
    first chunk would make any two files sharing a prefix look identical.

    Args:
        file: Path of the file to hash.
        buf_size: Bytes to read per chunk. Defaults to the module-level
            ``BUF_SIZE`` when omitted.

    Returns:
        The hexadecimal SHA-1 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if buf_size is None:
        buf_size = BUF_SIZE
    sha1 = hashlib.sha1()
    # The context manager closes the handle even if a read fails.
    with open(file, "rb") as openFile:
        # read() returns b"" at EOF, which terminates the iterator.
        for chunk in iter(lambda: openFile.read(buf_size), b""):
            sha1.update(chunk)
    return sha1.hexdigest()
# The tables are otherwise only created when db.py is executed directly, so
# make sure they exist before the first query. safe=True makes this a no-op
# when they are already present.
db.create_tables([Files], safe=True)

start = time.perf_counter()
for directory in directorysToScan:
    sys.stdout.write(f'Scanning {directory}...\n')
    for file in scanDirectory(directory):
        try:
            scannedFile = scanFile(file)
        except OSError as error:
            # An unreadable or vanished file should not abort the whole scan.
            sys.stderr.write(f'\nSkipping {file}: {error}\n')
            continue

        checkIfExists = Files.get_or_none(sha1hash=scannedFile)
        if checkIfExists is None:
            # First time this hash is seen: record it together with its path.
            # Inserting only after hashing avoids leaving placeholder rows
            # behind for duplicates or for files that failed to hash.
            Files.create(path=file, sha1hash=scannedFile)
        elif checkIfExists.path != file:
            # Same hash stored under a different path: a duplicate. A match
            # on the same path is just the file's own row from an earlier
            # run and must not be counted.
            # NOTE: moving duplicates is intentionally disabled; enabling it
            # would overwrite files in the output dir that share a basename.
            #os.replace(file, pathToOutputDir + os.path.basename(file))
            dupeCounter += 1

        fileCounter += 1
        # '\r' keeps the progress counter on a single terminal line.
        sys.stdout.write(f'{fileCounter} files scanned.\r')
end = time.perf_counter()

print(f'\n{dupeCounter} duplicates found.')
elapsed = end - start
print(f'Time taken: {elapsed:.6f} seconds')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment