Skip to content

Instantly share code, notes, and snippets.

@hax4dazy
Created August 24, 2024 00:31
Show Gist options
  • Save hax4dazy/a214ba8f63dbaa4663e8946b4981fcbc to your computer and use it in GitHub Desktop.
Save hax4dazy/a214ba8f63dbaa4663e8946b4981fcbc to your computer and use it in GitHub Desktop.
Quick Python script to check for dupes in a few directories. It scans all of the directories, stores each file's SHA1 hash in the database, and checks whether each file's hash already exists within the DB.
from peewee import *
# Shared peewee database handle for the SQLite file "database.db" (a relative
# path), created with the "foreign_keys" pragma set to 1.
db = SqliteDatabase("database.db", pragmas={"foreign_keys": 1})
class MySQLModel(Model):
    """Base model that binds every subclass to the module-level ``db``.

    NOTE(review): despite the name, ``db`` is a ``SqliteDatabase``, not a
    MySQL connection.
    """

    class Meta:
        # Database used by this model and its subclasses.
        database = db
class Files(MySQLModel):
    """One row per scanned file: where it lives and the SHA-1 of its contents.

    The scanner first inserts a row with ``sha1hash='0'`` as a placeholder and
    replaces it with the real digest only when no other row already carries
    that digest.
    """

    # Path of the scanned file as produced by the directory walk.
    path = TextField()
    # Hex SHA-1 digest (or the '0' placeholder). Indexed because the scanner
    # looks rows up by digest once per file; without an index every lookup is
    # a full table scan and the whole scan becomes quadratic.
    sha1hash = TextField(index=True)
# When this module is run directly, create the table for ``Files``.
# safe=True is passed so that re-running against an existing database is
# presumably harmless (peewee documents it as IF NOT EXISTS) - confirm.
if __name__ == "__main__":
    db.create_tables([Files], safe=True)
import os, hashlib, sys, time
from db import *
# Number of bytes requested per read when hashing a file.
BUF_SIZE = 65536

# Root directories that are walked recursively for files.
directorysToScan = [
    "/Main_Pool/Hax4dayz/Immich",
    "/Main_Pool/Hax4dayz/Nextcloud",
]

# Running totals, updated by the scan loop.
fileCounter = 0
dupeCounter = 0

# Absolute path of the output directory, including a trailing separator.
# It is currently referenced only by a commented-out move in the scan loop.
pathToOutputDir = os.sep.join([os.getcwd(), 'out', ''])

# Create the output directory in the current working directory; an entry
# that already exists under that name is left as it is.
try:
    os.mkdir('out')
except FileExistsError:
    pass
def scanDirectory(directory):
    """Return the paths of all files found anywhere beneath ``directory``.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        A list of file paths, each built by joining the containing directory
        reported by ``os.walk`` with the file name, in walk order.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]
def scanFile(file):
    """Return the SHA-1 hex digest of the complete contents of ``file``.

    The file is consumed in ``BUF_SIZE``-byte chunks so that large files are
    hashed without being loaded into memory in one piece. Hashing only the
    first chunk would report files that merely share a common prefix as
    duplicates.

    Args:
        file: Path of the file to hash.

    Returns:
        The hexadecimal SHA-1 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    sha1 = hashlib.sha1()
    # The context manager closes the handle even if a read fails midway,
    # which matters when thousands of files are hashed in one run.
    with open(file, "rb") as openFile:
        while True:
            data = openFile.read(BUF_SIZE)
            if not data:
                # An empty read marks end of file.
                break
            sha1.update(data)
    return sha1.hexdigest()
# Walk every configured root, hash each file, and count those whose digest
# is already present in the database.
start = time.perf_counter()
for directory in directorysToScan:
    sys.stdout.write(f'Scanning {directory}...\n')
    for file in scanDirectory(directory):
        # Insert a placeholder row first. Its '0' digest is replaced only
        # when the file turns out to be new; duplicates keep the '0'.
        Files.create(path=file, sha1hash='0')
        digest = scanFile(file)
        if Files.get_or_none(sha1hash=digest) is None:
            Files.update(sha1hash=digest).where(Files.path == file).execute()
        else:
            # Moving the duplicate into the output directory is currently
            # disabled:
            # os.replace(file, pathToOutputDir + os.path.basename(file))
            dupeCounter += 1
        fileCounter += 1
        # '\r' rewinds to the line start so the counter overwrites itself.
        sys.stdout.write(f'{fileCounter} files scanned.\r')
end = time.perf_counter()
elapsed = end - start
print(f'\n{dupeCounter} duplicates found.')
print(f'Time taken: {elapsed:.6f} seconds')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment