File manager script that uses an old perceptual hashing algorithm (average hash / difference hash via ImageHash).
import argparse | |
import os | |
import hashlib | |
from ImageHash import avhash, dhash, dimensions # , rotavhash | |
import base64 | |
import sqlite3 | |
import glob | |
import time | |
# import shutil | |
import datetime | |
import re | |
import zipfile | |
import sys | |
import scandir | |
# from multiprocessing import Pool | |
# from multiprocessing.dummy import Pool as ThreadPool | |
# TODO use pathlib vs os.path calls? pathlib is 3.4+ only
# http://docs.sqlalchemy.org/en/rel_0_9/orm/tutorial.html ?? | |
# http://docs.python.org/3.4/howto/logging-cookbook.html | |
# a list of valid file extensions to import. anything else will be skipped. make it a set in case people add dupes | |
extensions = {'.jpg', '.avi', '.ram', '.rm', '.wmv', '.pdf', '.mov', '.mp4', '.flv', '.jpe', '.jpeg', '.mpg', '.mpe',
              '.mpeg', '.png', '.3g2', '.3gp', '.asf', '.bmp', '.divx', '.gif', '.m1v', '.vob', '.mod', '.tif', '.mkv',
              '.jp2', '.psd', '.m4v', '.pcx', '.webm', '.m4a', '.mp3', '.org', '.jpgy', '.jpgg',
              '.aspx', '.3ga', '.net'}
# a set of extensions to delete on sight. If any of these is also in 'extensions' the import will be cancelled
auto_delete_extensions = set()  # note: set(), not {} (an empty {} would create a dict)
BUFFER_SIZE = 65536 # 8192 # file reading buffer size 8192 * 64? | |
# logger = logging.getLogger('filemgr') | |
# logger.setLevel(logging.CRITICAL) | |
# fh = logging.FileHandler('filemgr_debug.log') | |
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
# fh.setFormatter(formatter) | |
# logger.addHandler(fh) | |
def safeprint(s): | |
try: | |
print(s) | |
except UnicodeEncodeError: | |
print(s.encode('utf8').decode(sys.stdout.encoding)) | |
class ED2KHash(object): | |
MAGICLEN = 9728000 | |
def __init__(self): | |
self.hashes = [] | |
self.pos = 0 | |
self.md4 = hashlib.new('md4') | |
def update(self, data): | |
data_len = len(data) | |
for d in (data[i:i + ED2KHash.MAGICLEN] for i in range(0, data_len, ED2KHash.MAGICLEN)): | |
self._update(d) | |
def _update(self, data): | |
data_len = len(data) | |
assert data_len <= ED2KHash.MAGICLEN | |
newpos = self.pos + data_len | |
if newpos < ED2KHash.MAGICLEN: | |
self.md4.update(data) | |
self.pos = newpos | |
return | |
else: | |
prev = data[:ED2KHash.MAGICLEN - self.pos] | |
next_val = data[ED2KHash.MAGICLEN - self.pos:] | |
self.md4.update(prev) | |
self.hashes.append(self.md4.digest()) | |
self.md4 = hashlib.new('md4') | |
self.md4.update(next_val) | |
self.pos = len(next_val) | |
return | |
def digest(self): | |
if len(self.hashes) == 0: | |
return self.md4.digest() | |
else: | |
m = hashlib.new('md4') | |
newhashes = self.hashes + [self.md4.digest()] | |
m.update(b''.join(newhashes)) | |
return m.digest() | |
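# ED2K hashing, as implemented above: the file is read in 9,728,000-byte chunks, an MD4
# digest is taken per chunk, and the final ed2k hash is the MD4 of the concatenated chunk
# digests (a file smaller than one chunk just uses that chunk's MD4). Minimal usage sketch
# ('some_path' is hypothetical; hashlib must expose 'md4', which depends on the OpenSSL build):
#     h = ED2KHash()
#     with open(some_path, 'rb') as fh:
#         for chunk in iter(lambda: fh.read(BUFFER_SIZE), b''):
#             h.update(chunk)
#     ed2k_hex = h.digest().hex()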
class ApplicationConfiguration(object): | |
""" | |
Holds configuration values used in various places | |
""" | |
def __init__(self): | |
self.__database_name = 'filemgr.db3' | |
self.__base_directory = '' | |
self.__database_file = '' | |
self.__delete_existing = '' | |
self.__copy_new_destination = '' | |
self.__export_directory = '' | |
self.__rename_exported = False | |
self.__zip_exported = False | |
self.__delete_empty_directories = '' | |
def get_database_name(self): | |
return self.__database_name | |
def set_database_name(self, database_name): | |
self.__database_name = database_name | |
database_name = property(get_database_name, set_database_name) | |
def get_base_directory(self): | |
return self.__base_directory | |
def set_base_directory(self, base_directory): | |
self.__base_directory = base_directory | |
base_directory = property(get_base_directory, set_base_directory) | |
def get_database_file(self): | |
return self.__database_file | |
def set_database_file(self, database_file): | |
self.__database_file = database_file | |
database_file = property(get_database_file, set_database_file) | |
def get_delete_existing(self): | |
return self.__delete_existing | |
def set_delete_existing(self, delete_existing): | |
self.__delete_existing = delete_existing | |
delete_existing = property(get_delete_existing, set_delete_existing) | |
def get_delete_empty_directories(self): | |
return self.__delete_empty_directories | |
def set_delete_empty_directories(self, delete_empty_directories): | |
self.__delete_empty_directories = delete_empty_directories | |
delete_empty_directories = property(get_delete_empty_directories, set_delete_empty_directories) | |
def get_export_directory(self): | |
return self.__export_directory | |
def set_export_directory(self, export_directory): | |
self.__export_directory = export_directory | |
export_directory = property(get_export_directory, set_export_directory) | |
def get_rename_exported(self): | |
return self.__rename_exported | |
def set_rename_exported(self, rename_exported): | |
self.__rename_exported = rename_exported | |
rename_exported = property(get_rename_exported, set_rename_exported) | |
def get_zip_exported(self): | |
return self.__zip_exported | |
def set_zip_exported(self, zip_exported): | |
self.__zip_exported = zip_exported | |
zip_exported = property(get_zip_exported, set_zip_exported) | |
def get_copy_new_destination(self): | |
return self.__copy_new_destination | |
def set_copy_new_destination(self, copy_new_destination): | |
self.__copy_new_destination = copy_new_destination | |
copy_new_destination = property(get_copy_new_destination, set_copy_new_destination) | |
def add_insert_hashtype(appconfig, hashtype): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT hashID FROM hashtypes WHERE hashtypes.hashname = ?;", (hashtype,)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO hashtypes (hashname) VALUES (?);", (hashtype,)) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
def add_file_to_db(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
# check if hashtypes has an entry for each hash in hashes | |
hashtypes = {} | |
for key in fileinfo['hashes'].keys(): | |
hashtypes[key] = add_insert_hashtype(appconfig, key) | |
# print(fileinfo) | |
filename = fileinfo['inputfile'] | |
basefilename = os.path.split(filename)[-1] | |
basefilenameparts = os.path.splitext(basefilename) | |
file_ext = basefilenameparts[1].lower() | |
file_directory = os.path.join('files', fileinfo['hashes']['sha1b32'][0:2], fileinfo['hashes']['sha1b32'] + file_ext) | |
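    # NOTE: file_directory is computed here but the INSERT below stores the placeholder '1' as filepath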
# add file to files table | |
c.execute("INSERT INTO files (inputpath,filepath,filesize,filewidth,fileheight,comment) VALUES (?,?,?,?,?,?);", | |
(fileinfo['inputfile'], '1', fileinfo['filesize'], fileinfo['filewidth'], fileinfo['fileheight'], '')) | |
fileid = c.lastrowid | |
# add each hash to file hashes | |
for hashtype in hashtypes: | |
c.execute("INSERT INTO filehashes (hashID,fileID,filehash) VALUES (?,?,?);", | |
(hashtypes[hashtype], fileid, fileinfo['hashes'][hashtype])) | |
conn.commit() | |
conn.close() | |
def import_files_work(appconfig, dirname): | |
files_with_invalid_extensions = [] # list of files we didn't import. | |
total_files = 0 | |
files_added_to_database = 0 | |
files_deleted = 0 | |
files_with_duplicate_hashes = [] | |
files_copied = 0 | |
# Looking up each hash is sllllllow, so pull em all in as a set and just look there! | |
print("Getting existing file locations from database...", end='') | |
existing_files = get_filelist_from_database(appconfig) | |
print("Got {:,d} file locations from database. Looking for new files.\n".format(len(existing_files))) | |
for dirpath, dirnames, files in scandir.walk(dirname, topdown=False): | |
total_files += len(files) | |
file_counter = 0 | |
if len(files) > 0: | |
safeprint("\n\tFound {:,d} files in {}. Processing...".format(len(files), dirpath)) | |
# logger.info("Found {:,d} files in {}".format(len(files), dirpath)) | |
for name in files: | |
full_path_name = os.path.join(dirpath, name) | |
file_counter += 1 | |
if full_path_name not in existing_files: | |
if os.path.isfile(full_path_name): | |
if os.path.getsize(full_path_name) == 0: | |
safeprint("\t\tDeleting 0 byte file '{}'.".format(full_path_name)) | |
# os.remove(full_path_name) | |
continue | |
parts = os.path.splitext(name.lower()) | |
if len(parts) == 2: | |
ext = parts[1] | |
# some files are always bad, so just make em go away. | |
if ext in auto_delete_extensions: | |
safeprint( | |
'\t\t({} [{:,d}/{:,d}]): File {} has an autonuke extension. Deleting...'.format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), full_path_name)) | |
# os.remove(full_path_name) | |
continue | |
# if ext in extensions: | |
# logger.info( | |
# "{} before fileinfo = get_file_data(full_path_name)".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
fileinfo = get_file_data(full_path_name) | |
# logger.info("{} after fileinfo = get_file_data(full_path_name)".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
if not fileinfo['inputfile'] in existing_files: | |
files_added_to_database += 1 | |
safeprint("\t\t({} [{:,d}/{:,d}]): '{}' does not exist in database! Adding...".format | |
(datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), | |
full_path_name)) | |
# since this is a new file, we add it to our set for future import operations | |
                            existing_files.add(fileinfo['inputfile'])  # the set holds input paths, not hashes
add_file_to_db(appconfig, fileinfo) | |
else: | |
pass # do anything else here? should i check if file exists in file system? who cares tho | |
# as this syncs it up maybe here is where you do extra hashing of what is on file | |
# system to make sure the 2 match, properly named, etc | |
copied = copy_file_to_store(appconfig, fileinfo) | |
if copied: | |
safeprint( | |
'\t\t({} [{:,d}/{:,d}]): Processing {} with {:,d} bytes...'.format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), fileinfo['inputfile'], fileinfo['filesize'])) | |
# logger.info("{} after copied = copy_file_to_store(appconfig, fileinfo)):".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
if not copied: | |
files_with_duplicate_hashes.append(full_path_name) | |
else: | |
files_copied += 1 | |
if len(appconfig.copy_new_destination) > 0 and copied: | |
                            # if not os.path.exists(appconfig.copy_new_destination):
# os.mkdir(appconfig.copy_new_destination) | |
# TODO should this create the 2 char structure too? for now, just copy it | |
copy_name = os.path.join(appconfig.copy_new_destination, name) | |
unique_prefix = 0 | |
while os.path.isfile(copy_name): | |
# file exists, so get a unique name | |
copy_name = os.path.join(appconfig.copy_new_destination, | |
str(unique_prefix) + "_" + name) | |
unique_prefix += 1 | |
# shutil.copyfile(full_path_name, copy_name) | |
outfile = os.path.join(appconfig.copy_new_destination, | |
"!!" + datetime.datetime.now().strftime( | |
"%Y-%m-%d") + " File copy log " + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write( | |
"{}: Copied {} to {}.\n".format(datetime.datetime.now(), full_path_name, copy_name)) | |
if appconfig.delete_existing: | |
safeprint("\t\t({} [{:,d}/{:,d}]): Deleting '{}'...".format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), | |
full_path_name)) | |
# if appconfig.delete_existing == 'yes': | |
# os.remove(full_path_name) | |
files_deleted += 1 | |
else: | |
continue # do anything else here? should i check if file exists in file system? who cares tho | |
# logger.info("{} before copied = copy_file_to_store(appconfig, fileinfo)):".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
if appconfig.delete_empty_directories: | |
if not os.listdir(dirpath): | |
safeprint("\t\t({} [{:,d}/{:,d}]): Deleting empty directory '{}'...".format( | |
datetime.datetime.now().strftime('%x %X'), file_counter, len(files), dirpath)) | |
if appconfig.delete_empty_directories == 'yes': | |
os.rmdir(dirpath) | |
return (files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, | |
files_with_invalid_extensions) | |
def get_filelist_from_database(appconfig): | |
# pull them out and cache on startup or when first pulled? | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT inputpath FROM files;") | |
rows = c.fetchall() | |
conn.close() | |
filenames = [row[0] for row in rows] | |
return set(filenames) | |
def file_exists_in_database(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT filehashID FROM files, filehashes, hashtypes WHERE hashtypes.hashid = filehashes.hashid " | |
"AND files.fileID = filehashes.fileID AND hashtypes.hashname = 'sha1b32' AND filehashes.filehash = ?;", | |
(fileinfo['hashes']['sha1b32'],)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
return False | |
else: | |
return True | |
def get_sha1b32_from_database(appconfig): | |
# pull them out and cache on startup or when first pulled? | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
hash_id = get_hash_id_from_hash_name(appconfig, "sha1b32") | |
c.execute("SELECT filehash FROM filehashes WHERE hashid = ?;", (hash_id,)) | |
rows = c.fetchall() | |
conn.close() | |
hashes = [row[0] for row in rows] | |
return set(hashes) | |
def copy_file_to_store(appconfig, fileinfo): | |
"""Checks datastore for a file with identical sha1b32 hash. | |
if one exists, optionally delete the source file | |
optionally copy new file to separate directory for sharing purposes | |
""" | |
filename = fileinfo['inputfile'] | |
base_filename = os.path.split(filename)[-1] | |
base_filename_parts = os.path.splitext(base_filename) | |
file_ext = base_filename_parts[1].lower() | |
files_directory = os.path.join(appconfig.base_directory, 'files') | |
file_directory = os.path.join(files_directory, fileinfo['hashes']['sha1b32'][0:2]) | |
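    # store layout: <base_directory>/files/<first two chars of sha1b32>/<sha1b32><original extension>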
    # if not os.path.exists(file_directory):
# os.mkdir(file_directory) | |
target_filemask = os.path.join(file_directory, fileinfo['hashes']['sha1b32'] + '*') | |
dest_filename = os.path.join(file_directory, fileinfo['hashes']['sha1b32'] + file_ext) | |
listing = glob.glob(target_filemask) | |
file_copied = False | |
if len(listing) == 0: | |
# shutil.copyfile(filename, dest_filename) | |
file_copied = True | |
return file_copied | |
""" | |
def multiprocess(processes, samples, x, widths): | |
pool = mp.Pool(processes=processes) | |
results = [pool.apply_async(parzen_estimation, args=(samples, x, w)) for w in widths] | |
results = [p.get() for p in results] | |
results.sort() # to sort the results by input window width | |
return results | |
""" | |
def get_file_data(file): | |
""" | |
Generates hashes for file and other file info such as size, etc. | |
""" | |
# TODO can i use some kind of magic to determine mime type and forego extension? | |
fileinfo = {'inputfile': file, 'filesize': os.path.getsize(file), 'hashes': {}} | |
parts = os.path.splitext(file.lower()) | |
ext = '' | |
# imageexts = ('.jpg', '.jpeg', '.JPG', '.JPEG', '.png', '.PNG', '.bmp', '.tiff', '.gif', '.GIF') | |
if len(parts) == 2: | |
ext = parts[1] | |
ed2k = ED2KHash() | |
sha1 = hashlib.sha1() | |
md5 = hashlib.md5() | |
    filewidth = 0
    fileheight = 0
    imghash = 0
    imgdhash = 0  # sentinel defaults; overwritten below when the file is a hashable image
md4 = hashlib.new('md4') | |
f = open(file, 'rb') | |
# print('file extension = ', ext) | |
# if ext in imageexts: | |
try: | |
(filewidth, fileheight) = dimensions(file) | |
imghash = str(avhash(file)) | |
imgdhash = str(dhash(file)) | |
# imgrotavg = str(rotavhash(file)) | |
# vprint('\n[!] image hash (%s)' % imghash) | |
except (RuntimeError, TypeError, NameError, ValueError): | |
        # failed to get image dimensions/hashes; fall back to sentinel values
print('image check failed') | |
imghash = 0 | |
imgdhash = 0 | |
# imgrotavg =0 | |
filewidth = 0 | |
fileheight = 0 | |
pass | |
# print('\n[!]file width = %s, height = %s' % (filewidth, fileheight)) | |
if filewidth > 4000 or fileheight > 4000: | |
print('\n[!] image too large to hash (%dx%d)' % (filewidth, fileheight)) | |
imghash = 999 | |
imgdhash = 999 | |
# imgrotavg = 999 | |
pass | |
if filewidth == 161 and fileheight == 81: | |
# Size of empty imgur image ('not found!') | |
imghash = 503 | |
imgdhash = 503 | |
# imgrotavg = 503 | |
pass | |
# else: | |
# imghash = 9999 | |
# filewidth = 9999 | |
# fileheight = 9999 | |
buf = f.read(BUFFER_SIZE) | |
while buf != b'': | |
md5.update(buf) | |
sha1.update(buf) | |
# md4.update(buf) | |
# ed2k.update(buf) | |
# imghash.update(buf) | |
buf = f.read(BUFFER_SIZE) | |
f.close() | |
sha1b16 = sha1.hexdigest().upper() | |
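    # sha1b32: the SHA-1 digest re-encoded from hex to base32; it doubles as the file's name in the store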
sha1b32 = base64.b32encode(base64.b16decode(sha1b16.upper())).decode().upper() | |
edonkey = 1 # base64.b16encode(ed2k.digest()) | |
md4hash = 1 # md4.hexdigest().upper() | |
md5hash = md5.hexdigest().upper() | |
# fileinfo['hashes']['md4'] = md4hash | |
# fileinfo['hashes']['ed2k'] = 1 # edonkey.decode('utf-8').upper() | |
# fileinfo['hashes']['sha1b16'] = sha1b16 | |
fileinfo['hashes']['sha1b32'] = sha1b32 | |
fileinfo['hashes']['md5'] = md5hash | |
fileinfo['hashes']['imghash'] = imghash | |
fileinfo['hashes']['imgdhash'] = imgdhash | |
# fileinfo['hashes']['imgrotavg'] = imgrotavg | |
fileinfo['extension'] = ext.lower() | |
fileinfo['file_store_name'] = 0 | |
fileinfo['filewidth'] = filewidth | |
fileinfo['fileheight'] = fileheight | |
return fileinfo | |
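# get_file_data returns a dict shaped roughly like this (sketch based on the code above):
#     {'inputfile': <path>, 'filesize': <bytes>, 'extension': '.jpg',
#      'filewidth': <int>, 'fileheight': <int>, 'file_store_name': 0,
#      'hashes': {'sha1b32': ..., 'md5': ..., 'imghash': ..., 'imgdhash': ...}}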
# def generate_missing_hashes(appconfig, file): | |
# """ Given file, look for missing hashes, generate them, and update the | |
# database """ | |
# | |
# return "not done yet" | |
def setup_base_directory(directory): | |
try: | |
        # if not os.path.exists(directory):
# print('{} does not exist! Creating...'.format(directory)) | |
# os.mkdir(directory) | |
subdir = os.path.join(directory, 'files') | |
        # if not os.path.exists(subdir):
# os.mkdir(subdir) | |
except: | |
raise | |
def init_db(appconfig): | |
# create, setup tables | |
# one table is hashname | |
# another is for files that references hashname pk | |
# this allows for easy expanding if hashname is missing without schema changes | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("PRAGMA synchronous = OFF") | |
c.execute("PRAGMA journal_mode = MEMORY") | |
c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='hashtypes';") | |
row = c.fetchone() | |
if row is None: | |
print("!!!Database is missing. Creating...") | |
c.execute('''CREATE TABLE hashtypes | |
(hashID INTEGER PRIMARY KEY AUTOINCREMENT, hashname TEXT)''') | |
c.execute('''CREATE TABLE files | |
(fileID INTEGER PRIMARY KEY AUTOINCREMENT, inputpath TEXT, | |
filepath TEXT, filesize INTEGER, filewidth INTEGER, fileheight INTEGER, comment TEXT)''') | |
c.execute('''CREATE TABLE filehashes | |
(filehashID INTEGER PRIMARY KEY AUTOINCREMENT, hashID INTEGER, fileID INTEGER, filehash TEXT)''') | |
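        # filehashes is a join table keyed by (hashID -> hashtypes.hashID, fileID -> files.fileID);
        # e.g. a file's sha1b32 can be looked up with (sketch):
        #     SELECT filehash FROM filehashes
        #     JOIN hashtypes ON hashtypes.hashID = filehashes.hashID
        #     WHERE filehashes.fileID = ? AND hashtypes.hashname = 'sha1b32'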
conn.commit() | |
c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='importedpaths';") | |
row = c.fetchone() | |
if row is None: | |
print("!!!Table 'importedpaths' is missing!. Creating...") | |
c.execute('''CREATE TABLE importedpaths (pathID INTEGER PRIMARY KEY AUTOINCREMENT, importedpath TEXT, | |
imported_date TEXT, files_added_to_database INTEGER, total_files INTEGER, files_deleted INTEGER, | |
files_copied INTEGER, files_with_duplicate_hashes INTEGER, files_with_invalid_extensions INTEGER);''') | |
conn.commit() | |
# add indexes | |
c.execute("SELECT COUNT(*) FROM sqlite_master WHERE type = 'index';") | |
row = c.fetchone() | |
if row[0] == 0: | |
print("!!!Indexes are missing. Creating...") | |
c.execute('CREATE INDEX "IX_filehashes" ON "filehashes" ("filehash")') | |
print("!File hash index created") | |
c.execute('CREATE INDEX "IX_fileID" ON "filehashes" ("fileID")') | |
print("!FileID index created") | |
c.execute('CREATE INDEX "IU_inputpath" ON "files" ("inputpath", "filesize", "filewidth", "fileheight")') | |
print("!File querypath/file size index created") | |
c.execute('CREATE INDEX "IU_hashID_fileID" ON "filehashes" ("hashID", "filehash")') | |
print("!HashID/file hash index created\n") | |
c.execute('CREATE INDEX "IX_hashID" ON "filehashes" ("hashID")') | |
print("!File hash index created") | |
conn.commit() | |
conn.close() | |
def add_import_path_to_db(appconfig, path_name, files_added_to_database, total_files, files_deleted, files_copied, | |
files_with_duplicate_hashes, files_with_invalid_extensions): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"INSERT INTO importedpaths (importedpath, imported_date, files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, files_with_invalid_extensions) VALUES (?, ?, ?, ?, ?, ?, ?, ?);", | |
(path_name, datetime.datetime.now(), files_added_to_database, total_files, files_deleted, files_copied, | |
len(files_with_duplicate_hashes), len(files_with_invalid_extensions))) | |
conn.commit() | |
conn.close() | |
def check_import_path_in_db(appconfig, path_name): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT imported_date FROM importedpaths WHERE importedpath = ? ORDER BY imported_date DESC;", | |
(path_name,)) | |
rows = c.fetchall() | |
conn.close() | |
# 2014-02-05 10:22:30.214031 | |
dates = [datetime.datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S.%f').strftime('%x %X') for row in rows] | |
return dates | |
def generate_hash_list(appconfig, hash_type, suppress_file_info): | |
outfile = os.path.join(appconfig.base_directory, | |
"Exported hash list_" + datetime.datetime.now().strftime("%H%M%S%f") + '.tsv') | |
file_count = 0 | |
conn = sqlite3.connect(appconfig.database_file) | |
file_cursor = conn.execute("SELECT files.inputpath, files.filesize, files.fileID FROM files ORDER BY fileID") | |
if hash_type == 'all': | |
sql = 'SELECT hashid, hashname FROM hashtypes ORDER BY hashname ASC' | |
else: | |
sql = 'SELECT hashid, hashname FROM hashtypes WHERE hashname = "{}" ORDER BY hashname ASC'.format(hash_type) | |
hash_types_cursor = conn.execute(sql) | |
with open(outfile, 'w+', encoding="utf-16") as logfile: | |
header = ['relative_path', 'file_size'] | |
if suppress_file_info: | |
header.clear() | |
hash_types = {} | |
for hash_type_row in hash_types_cursor: | |
header.append(hash_type_row[1]) | |
hash_types[hash_type_row[0]] = hash_type_row[1] | |
logfile.write('\t'.join(header) + "\n") | |
for file_row in file_cursor: | |
file_count += 1 | |
file_id = file_row[2] | |
# hash_types contains the id and hash name for all known hashes. for each of those, get that hash for | |
# active file. if not present, tell the user | |
row_values = [file_row[0], str(file_row[1])] # this is what will build out each row | |
if suppress_file_info: | |
row_values.clear() | |
for hash_id in sorted(hash_types, | |
key=hash_types.get): # sort it according to the hash names so the order is correct | |
hash_cursor = conn.execute( | |
"SELECT filehashes.filehash, hashtypes.hashname FROM hashtypes INNER JOIN filehashes ON " | |
"filehashes.hashID = hashtypes.hashID WHERE filehashes.fileID = ? AND filehashes.hashID = ? " | |
"ORDER BY hashtypes.hashname ASC;", | |
(file_id, hash_id)) | |
row = hash_cursor.fetchone() | |
                if row is not None:
row_values.append(row[0]) | |
else: | |
row_values.append("Hash '{}' missing in database!".format(hash_types[hash_id])) | |
hash_cursor.close() | |
logfile.write('\t'.join(row_values) + "\n") | |
conn.close() | |
return file_count, outfile | |
def import_files(appconfig, directories): | |
""" | |
Attempts to recursively import files from values in directories and writes log files with actions taken | |
@param appconfig: Configuration data | |
@param directories: a list of directories to import from | |
""" | |
print("Importing from '{}'".format(",".join(directories))) | |
for directory in directories: | |
directory = directory.strip() | |
if os.path.isdir(directory): | |
import_history = check_import_path_in_db(appconfig, directory) | |
if len(import_history) > 0: | |
answer = input( | |
"\n\n**** '{}' has already been imported on:\n\n{}\n\nContinue: [y|N]: ".format(directory, | |
'\n'.join( | |
import_history))) | |
if not answer.lower() == 'y': | |
print("**** Skipping '{}'\n".format(directory)) | |
continue | |
(files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, | |
files_with_invalid_extensions) = import_files_work(appconfig, directory) | |
add_import_path_to_db(appconfig, directory, files_added_to_database, total_files, files_deleted, | |
files_copied, files_with_duplicate_hashes, files_with_invalid_extensions) | |
print( | |
'\n' + '*' * 4 + """ {:,d} total files found. {:,d} copied to file store and {:,d} files were added to the database. {:,d} files had duplicate hashes. {:,d} files had invalid extensions (see log file for details)""".format( | |
total_files, files_copied, files_added_to_database, len(files_with_duplicate_hashes), | |
len(files_with_invalid_extensions))) | |
            directory_clean = re.sub(r'[^\w\-_\. ]', '_', directory)
logfile_name = os.path.join(appconfig.base_directory, | |
"Import log for " + directory_clean + " " + datetime.datetime.now().strftime( | |
"%H%M%S%f") + '.txt') | |
with open(logfile_name, 'w+', encoding="utf-16") as logfile: | |
logfile.write('Directory processed: {}\n\n'.format(directory)) | |
logfile.write('Files found: {:,d}\n'.format(total_files)) | |
logfile.write('Files copied to file store: {:,d}\n'.format(files_copied)) | |
logfile.write('Files added to database: {:,d}\n'.format(files_added_to_database)) | |
logfile.write('Files with duplicate hashes: {:,d}\n\n'.format(len(files_with_duplicate_hashes))) | |
if files_deleted > 0: | |
logfile.write('Number of deleted files: {:,d}\n\n'.format(files_deleted)) | |
logfile.write('*' * 78 + '\n\n') | |
logfile.write('The following files had duplicate hashes and were not imported:\n\n') | |
for item in files_with_duplicate_hashes: | |
logfile.write("{}\n".format(item)) | |
logfile.write('\n\nThe following files had invalid extensions and were not imported:\n\n') | |
for item in files_with_invalid_extensions: | |
logfile.write("{}\n".format(item)) | |
if appconfig.delete_existing and files_deleted > 0: | |
print(' ' * 5 + '{:,d} files were deleted'.format(files_deleted)) | |
else: | |
print("\t'{}' does not exist!".format(directory)) | |
# after import, tell the user to see generated logs (one per directory) in the main directory | |
# but only if we actually attempted to import something | |
if len(directories) > 0 and 'logfile_name' in locals(): | |
print("\n\nSee log files in {} for details.".format(appconfig.base_directory)) | |
def get_hash_id_from_hash_name(appconfig, hash_name): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT hashID FROM hashtypes WHERE hashname = ?;", (hash_name,)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
return -1 | |
else: | |
return int(row[0]) | |
def check_file_exists_in_database(appconfig, hash_id, hash_value): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT files.inputpath, files.filesize FROM filehashes JOIN files ON files.fileID = filehashes.fileID " | |
"WHERE filehashes.hashID = ? AND filehashes.filehash = ?;", | |
(hash_id, hash_value)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
db_info = ('', 0) | |
else: | |
db_info = (row[0], row[1]) | |
return db_info | |
def get_database_delta(appconfig, hash_set, hash_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
sql = "SELECT files.fileID, files.inputpath FROM filehashes INNER JOIN files ON files.fileID = filehashes.fileID WHERE filehashes.hashID = ? AND filehashes.filehash NOT in ({0})".format( | |
', '.join('?' for _ in hash_set)) | |
params = hash_set | |
params.insert(0, str(hash_id)) | |
c.execute(sql, params) | |
rows = c.fetchall() | |
conn.close() | |
return rows | |
def get_hash_from_hash_id_and_file_id(appconfig, hash_id, file_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT filehashes.filehash FROM filehashes WHERE filehashes.hashID = ? AND filehashes.fileID = ?;", | |
(hash_id, file_id)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
return False | |
else: | |
return row[0] | |
def build_new_out_path(export_directory, new_hash, file_name): | |
front = "files\\" + new_hash[0:2] | |
mid = new_hash | |
    ext = os.path.splitext(file_name)[-1]  # file_name is a path string; take its extension
out_path = os.path.join(export_directory, front, mid + ext.lower()) | |
return out_path | |
def copy_file(abs_path, log_file, out_path): | |
if not os.path.exists(os.path.dirname(out_path)): | |
os.makedirs(os.path.dirname(out_path)) | |
log_file.write("Copying '{}' to '{}'\n".format(abs_path, out_path)) | |
# shutil.copyfile(abs_path, out_path) | |
def get_existing_hash_list(appconfig, hash_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT fileID, filehash FROM filehashes WHERE filehashes.hashID = ?;", (hash_id,)) | |
existing_hashes = {} | |
# row_count = 0 | |
record = c.fetchone() | |
while record: | |
# if row_count % 1000000 == 0: | |
# print("{}: Database rows fetched: {:,d}".format(datetime.datetime.now().strftime('%x %X'), row_count)) | |
existing_hashes[record[1]] = record[0] | |
record = c.fetchone() | |
# row_count += 1 | |
conn.close() | |
return existing_hashes | |
def get_file_from_db(appconfig, file_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT inputpath FROM files WHERE fileID = ?;", (file_id,)) | |
record = c.fetchone() | |
conn.close() | |
return record[0] | |
def export_files(appconfig, export_existing, file_name): | |
""" | |
Copies files from file store to a directory | |
@param appconfig: basic config data | |
@param export_existing: if true, export files in input file that are also in file store, else, export the opposite | |
@param file_name: the file to read hash type and hashes from | |
""" | |
hash_file = open(file_name) | |
hash_name = hash_file.readline().strip().lower() | |
hash_id = get_hash_id_from_hash_name(appconfig, hash_name) | |
if hash_id == -1: | |
print("Unknown hash type: '{}'. Export cancelled!".format(hash_name)) | |
return | |
datetime_string = datetime.datetime.now().strftime("%H%M%S%f") | |
export_directory = os.path.join(appconfig.export_directory, | |
"Export run " + datetime_string + " for {}".format(hash_name)) | |
if not os.path.exists(export_directory): | |
os.makedirs(export_directory) | |
log_name = os.path.join(export_directory, | |
"Export log " + datetime_string + '.txt') | |
log_file = open(log_name, 'w', encoding="utf-16") | |
log_file.write("Looking for hashes in '{}'\n\n".format(file_name)) | |
log_file.write("Hash type: {}\n".format(hash_name)) | |
print("\t\tHash type: {}\n".format(hash_name)) | |
log_file.write("Zip exported: {}\n".format(appconfig.zip_exported)) | |
log_file.write("Rename exported: {}\n\n".format(appconfig.rename_exported)) | |
if export_existing: | |
export_type = "Existing" | |
else: | |
export_type = "Delta" | |
log_file.write("Export operation: {}\n\n".format(export_type)) | |
log_file.write("Copy log\n\n") | |
found_files = 0 | |
hash_count = 0 | |
# TODO collect operations in a single list then iterate/copy after so as to remove duplicate code in loops for each | |
if export_existing: | |
for line in hash_file: | |
line = line.strip() | |
hash_count += 1 | |
(file_path, file_size) = check_file_exists_in_database(appconfig, hash_id, line) | |
# TODO This needs cleaned up in regard to the paths. the database should store things in one format | |
# right now its all bunged up | |
if file_path: | |
print( | |
"\t\t({:,d}) File with hash '{}' found! Copying {:,d} bytes...".format(hash_count, line, file_size)) | |
found_files += 1 | |
abs_path = os.path.join(appconfig.base_directory, file_path) | |
if not os.path.isfile(abs_path): | |
front, ext = os.path.splitext(abs_path) | |
abs_path = front + ext.lower() | |
abs_path = abs_path.replace("\\", "/") | |
if appconfig.rename_exported and not hash_name == 'sha1b32': # the default is sha1b32 | |
out_path = build_new_out_path(export_directory, line, file_path.replace("\\", "/")) | |
else: | |
out_path = os.path.join(export_directory, file_path.replace("\\", "/")) | |
print("Copying '{}' to '{}'\n".format(abs_path, out_path)) | |
copy_file(abs_path, log_file, out_path) # TODO Error handling here | |
else: | |
print("Getting hashes from file...") | |
hashes = [line.strip() for line in hash_file] | |
hash_set = set(hashes) # get rid of any dupes | |
hash_count = len(hash_set) | |
file_count = 0 | |
print("Found {:,d} hashes in file!".format(hash_count)) | |
        # sql alone won't work for the delta, so:
# export entire DB for hash_id to file containing: file_id and hash for hash_id | |
# once done, read that into dictionary with hash: fileid | |
# loop thru hash_set and remove similar items from dictionary | |
# when done, export files remaining in dictionary | |
print("Getting existing hashes from database...") | |
existing_hash_list = get_existing_hash_list(appconfig, hash_id) | |
print("Found {:,d} hashes in database!".format(len(existing_hash_list))) | |
for hash in hash_set: | |
if hash in existing_hash_list: | |
del existing_hash_list[hash] | |
print("After pruning there are {:,d} hashes to export.".format(len(existing_hash_list))) | |
for value in existing_hash_list.values(): | |
# value is fileID for the file, so now we can get info on the file and export | |
db_name = get_file_from_db(appconfig, value) | |
if db_name: | |
abs_path = os.path.join(appconfig.base_directory, db_name) | |
if not os.path.isfile(abs_path): | |
front, ext = os.path.splitext(abs_path) | |
abs_path = front + ext.lower() | |
abs_path = abs_path.replace("\\", "/") | |
if appconfig.rename_exported and not hash_name == 'sha1b32': # the default is sha1b32 | |
# sigh. we have to now get the appropriate hash value from the database and do trickery based on that | |
# we know the file id, so we can get the hash for the corresponding hash_type from the database | |
# since we also know the hash_id | |
new_hash = get_hash_from_hash_id_and_file_id(appconfig, hash_id, value) | |
out_path = build_new_out_path(export_directory, new_hash, db_name) | |
else: | |
out_path = os.path.join(export_directory, db_name.replace("\\", "/")) | |
# print("abs_path is {}".format(abs_path)) | |
# print("out_path is {}".format(out_path)) | |
file_count += 1 | |
print("[{:,d}/{:,d}] Copying '{}' to '{}'\n".format(file_count, len(existing_hash_list), abs_path, | |
out_path)) | |
copy_file(abs_path, log_file, out_path) # TODO Error handling here | |
hash_file.close() | |
log_file.close() | |
if appconfig.zip_exported: | |
zip_name = os.path.join(appconfig.export_directory, | |
"Exported " + hash_name + " " + datetime_string + ".zip") | |
print("\t\tZipping files to '{}'\n".format(zip_name)) | |
z_file = zipfile.ZipFile(zip_name, "w") | |
for dirpath, dirnames, filenames in scandir.walk(export_directory): | |
for filename in filenames: | |
full_name = os.path.join(export_directory, dirpath, filename) | |
if full_name.endswith("txt"): | |
archive_name = os.path.basename(full_name) | |
else: | |
parts = full_name.split("\\") | |
archive_name = "\\".join(str(parts[-3:])) | |
z_file.write(full_name, archive_name) | |
z_file.close() | |
print("\t\tRemoving '{} since export was zipped to {}...'\n".format(export_directory, zip_name)) | |
# shutil.rmtree(export_directory) | |
print("\n\t\tSaw {:,d} {} hashes in '{}'. Files found: {:,d}. See '{}' for details.".format(hash_count, hash_name, | |
file_name, found_files, | |
log_name)) | |
def get_stats(appconfig, stats_level): | |
# total files | |
# total size | |
total_store_files = 0 | |
total_store_size = 0 | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT COUNT(fileID) FROM files") | |
row = c.fetchone() | |
total_db_files = row[0] or 0 | |
c.execute("SELECT sum(filesize) FROM files") | |
row = c.fetchone() | |
total_db_size = row[0] or 0 | |
conn.close() | |
if stats_level == 'full': | |
for r, d, files in scandir.walk(os.path.join(appconfig.base_directory, "files")): | |
total_store_files += len(files) | |
for file in files: | |
total_store_size += os.path.getsize(os.path.join(r, file)) | |
return total_db_files, total_db_size, total_store_files, total_store_size | |
def bytes_to_human(byte_value, to, bsize=1024): | |
"""convert byte_value to megabytes, etc. | |
sample code: | |
        print('mb= ' + str(bytes_to_human(314575262000000, 'm')))
sample output: | |
mb= 300002347.946 | |
""" | |
if byte_value is None: | |
return float(0) | |
a = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6} | |
r = float(byte_value) | |
for i in range(a[to]): | |
r /= bsize | |
return r | |
def dump_stats(appconfig, print_stats): | |
print("\n*** Database statistics ***\n") | |
if print_stats == 'full': | |
print("\t *** Please be patient while file store statistics are calculated. This may take a while! ***\n") | |
(total_db_files, total_db_size, total_store_files, total_store_size) = get_stats(appconfig, print_stats) | |
print("Total files in database: {:,d}".format(total_db_files)) | |
print("Total size of files in database: {:,d} bytes ({:,f} MB, {:,f} GB, {:,f} TB)\n".format(total_db_size, | |
bytes_to_human( | |
total_db_size, | |
'm'), | |
bytes_to_human( | |
total_db_size, | |
'g'), | |
bytes_to_human( | |
total_db_size, | |
't'))) | |
if print_stats == 'full': | |
print("Total files in file store: {:,d}".format(total_store_files)) | |
print("Total size of files in file store: {:,d} bytes ({:,f} MB, {:,f} GB, {:,f} TB)\n".format(total_store_size, | |
bytes_to_human( | |
total_store_size, | |
'm'), | |
bytes_to_human( | |
total_store_size, | |
'g'), | |
bytes_to_human( | |
total_store_size, | |
't'))) | |
    # the store totals are only populated when the file store was actually walked ('full'),
    # so only compare database vs store in that mode
    if print_stats == 'full':
        count_discrepancy = total_db_files != total_store_files
        size_discrepancy = total_db_size != total_store_size
        if size_discrepancy or count_discrepancy:
            print("\n*** WARNING ***")
            if size_discrepancy:
                print(
                    "There is a discrepancy between the size of files in the database ({:,d}) and the file store ({:,d})! Delta: {:,d} bytes".format(
                        total_db_size, total_store_size, total_db_size - total_store_size))
            if count_discrepancy:
                print(
                    "There is a discrepancy between the number of files in the database ({:,d}) and the file store ({:,d})! Delta: {:,d}".format(
                        total_db_files, total_store_files, total_db_files - total_store_files))
            print("**It is recommended to use the --verify switch to correct this.")
        else:
            print("Database and file store appear to be in sync!\n\n")
def check_db_to_fs(appconfig): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT fileid, inputpath FROM files ORDER BY inputpath") | |
bad_files = [] | |
for row in c: | |
full_path = os.path.join(appconfig.base_directory, row[1]).lower() | |
if not os.path.isfile(full_path): | |
bad_files.append(row[0]) | |
print("\t{} is in database but does not exist in file store!".format(full_path)) | |
conn.close() | |
return bad_files | |
def get_files_from_db(appconfig): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT inputpath FROM files") | |
file_names = [] | |
for row in c: | |
file_names.append(row[0]) | |
conn.close() | |
return file_names | |
def check_fs_to_db(appconfig): | |
bad_files = [] | |
db_file_names = get_files_from_db(appconfig) | |
for r, d, files in scandir.walk(os.path.join(appconfig.base_directory, "files")): | |
for file in files: | |
full_path = os.path.join(r, file) | |
db_path = full_path.replace(appconfig.base_directory, "") | |
db_path = db_path[1:] | |
if not db_path in db_file_names: | |
bad_files.append(full_path) | |
print("\t{} is in file store but does not exist in database!".format(full_path)) | |
return bad_files | |
def get_fileid_from_fileinfo(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
hashid = get_hash_id_from_hash_name(appconfig, 'sha1b32') | |
c.execute("SELECT fileid FROM FILEHASHES WHERE hashID = ? AND filehash = ?;", | |
(hashid, fileinfo['hashes']['sha1b32'])) | |
row = c.fetchone() | |
conn.close() | |
return row[0] | |
def delete_files_from_db(appconfig, files): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
sql = "DELETE FROM FILEHASHES WHERE fileID in ({})".format( | |
', '.join('?' for _ in list(files))) | |
c.execute(sql, files) | |
sql = "DELETE FROM files WHERE fileID in ({})".format( | |
', '.join('?' for _ in list(files))) | |
c.execute(sql, files) | |
conn.commit() | |
conn.close() | |
def delete_file_from_db(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
fileid = get_fileid_from_fileinfo(appconfig, fileinfo) | |
c.execute("DELETE FROM filehashes WHERE fileid = ?;", (fileid,)) | |
conn.commit() | |
c.execute("DELETE FROM files WHERE fileid = ?;", (fileid,)) | |
conn.commit() | |
conn.close() | |
def verify(appconfig): | |
print("*** File manager verification ***\n") | |
print("Beginning stage 1 (comparing database against file store)...") | |
db_to_fs_bad = check_db_to_fs(appconfig) | |
if len(db_to_fs_bad) == 0: | |
print("Stage 1 complete. No inconsistencies detected between database and file system.") | |
print("\nBeginning stage 2 (comparing file store against database)...") | |
fs_to_db_bad = check_fs_to_db(appconfig) | |
if len(fs_to_db_bad) == 0: | |
print("Stage 2 complete. No inconsistencies detected between file system and database.") | |
if len(fs_to_db_bad) == 0 and len(db_to_fs_bad) == 0: | |
print("\n\nNo inconsistencies detected!") | |
else: | |
# we have to fix things | |
print("\n\nFound {:,d} database and {:,d} file system inconsistencies.".format(len(db_to_fs_bad), | |
len(fs_to_db_bad))) | |
fix_it = input("\nDo you want to fix these issues? [Y|n]: ") | |
if not fix_it.lower() == 'n': | |
print("\nDeleting bad records from database...", end='') | |
delete_files_from_db(appconfig, db_to_fs_bad) | |
print("Deleted {:,d} records from database!".format(len(db_to_fs_bad))) | |
# set up a clean staging area for files to be imported from | |
verify_directory = os.path.join(appconfig.base_directory, "verify") | |
            # if os.path.isdir(verify_directory):
# shutil.rmtree(verify_directory) | |
# os.mkdir(verify_directory) | |
print("Adding files to database...") | |
for file in fs_to_db_bad: | |
fileinfo = get_file_data(file) | |
if file_exists_in_database(appconfig, fileinfo): | |
# nuke it to be clean | |
delete_file_from_db(appconfig, fileinfo) | |
# move each file to a staging directory, then call import work on it. done | |
head, tail = os.path.split(file) | |
to_file = os.path.join(verify_directory, tail) | |
unique_prefix = 0 | |
while os.path.isfile(to_file): | |
# file exists, so get a unique name | |
to_file = os.path.join(verify_directory, str(unique_prefix) + "_" + tail) | |
unique_prefix += 1 | |
# shutil.move(file, to_file) | |
(files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, | |
files_with_invalid_extensions) = import_files_work(appconfig, verify_directory) | |
# shutil.rmtree(verify_directory) | |
print("\nAdded {:,d} files to database!".format(files_added_to_database)) | |
print("\n\n*** Repair complete! ***") | |
def main(): | |
parser = argparse.ArgumentParser( | |
description="""File manager that can import files, | |
export file sets based on a list of hashes, export files NOT in a list, etc.""", epilog=""" | |
This program can be used to manage files of any type. Before use, adjust the value of | |
'extensions' at the top of the file. Only files having an extension in this set will be | |
imported. A list of files that weren't imported will be documented in a log file when | |
the import operation finishes. | |
""") | |
parser.add_argument("base_directory", help="""The root directory where files | |
will live. This is also where the database of file info will | |
be created. Enclose directories with spaces in double quotes. | |
This should be the first argument provided. | |
""") | |
parser.add_argument("--print_stats", choices=['lite', 'full'], help="""'lite' will produce statistics from | |
information in the database only. 'full' will look at both the database and file store. | |
""") | |
parser.add_argument("--verify", action="store_true", help="""Perform consistency check. | |
Stage 1 is verifying what is in the database against what is in the file store. | |
Stage 2 is verifying what is in the file store against the database. | |
When comparison is complete, the results are displayed and, if any issues are found, | |
options presented to correct any inconsistencies. | |
""") | |
import_group = parser.add_argument_group('Import options', 'These options determine how files are imported') | |
import_group.add_argument( | |
"--import_from", help="""List of comma separated directories to import | |
files from. Enclose directories with spaces in double quotes. Directories should | |
        NOT have trailing slashes (i.e. C:\\foo is OK, but C:\\bar\\ is NOT OK).
""", metavar='PATHS_TO_IMPORT_FROM') | |
import_group.add_argument( | |
"--delete_existing", choices=['yes', 'simulate'], help="""When importing, delete source files if | |
they already exist in file store. If set to 'simulate' files | |
will not actually be deleted. This is useful to see what | |
would happen as a result of using this flag without actually | |
deleting files. | |
""") | |
import_group.add_argument( | |
"--delete_empty_directories", choices=['yes', 'simulate'], help="""When importing, delete any empty directories found. | |
If set to 'simulate' directories will not actually be deleted. | |
""") | |
import_group.add_argument("--copy_new_destination", help="""The directory to copy any newly imported files into. | |
No renaming of files (except when conflicts exist) will be done. | |
If directory name has spaces, enclose it in double quotes | |
""", metavar='PATH_TO_DIRECTORY') | |
generate_group = parser.add_argument_group('Generate hash list options', | |
'These options determine how hash lists are generated') | |
generate_group.add_argument("--generate_hash_list", help="""Creates a CSV file of all hashes in the database. Also | |
includes the relative querypath to the file. The file will be saved to | |
the file manager's base directory | |
""", choices=['all', 'ed2k', 'md4', 'md5', 'sha1b16', 'sha1b32']) | |
generate_group.add_argument("--suppress_file_info", help="""When true, prevents relative file querypath and file size | |
from being included in the hash list. This is handy to generate | |
hash lists to import into X-Ways Forensics, etc. | |
""", action="store_true") | |
export_group = parser.add_argument_group('Export options', | |
'These options allow for exporting files in several ways.') | |
# because crazy people may try to do both at once... | |
export_group_exclusive = export_group.add_mutually_exclusive_group() | |
export_group_exclusive.add_argument("--export_existing", help="""Export a copy of files in PATH_TO_TEXT_FILE to | |
--export_directory. The first line of the file should | |
be the hash type to query: md5, sha1b16, sha1b32, ed2k, or md4, | |
followed by one hash per line. Enclose paths with spaces | |
in double quotes. | |
""", metavar='PATH_TO_TEXT_FILE') | |
export_group_exclusive.add_argument("--export_delta", help="""Export a copy of files | |
NOT in PATH_TO_TEXT_FILE to --export_directory. The first line of the file should | |
be the hash type to query: md5, sha1b16, sha1b32, ed2k, or md4, | |
followed by one hash per line. Enclose paths with spaces | |
in double quotes. | |
This is useful to synchronize two different file manager instances | |
by 1) using --generate_hash_list on one instance and then 2) | |
using this option on the file from step 1. The resultant files | |
can then be imported into the instance from step 1. | |
""", metavar='PATH_TO_TEXT_FILE') | |
export_group.add_argument("--export_directory", help="""The target directory when using --export_files_in_list or | |
--export_files_not_in_list options. Enclose directories with spaces | |
in double quotes. | |
""", metavar='PATH_TO_DIRECTORY') | |
export_group.add_argument("--rename", help="""When true, all exported files will be renamed to match | |
the hash type from the provided file listing. | |
""", action="store_true") | |
export_group.add_argument("--zip", help="""When true, all exported files will be added to a zip | |
archive in --export_directory. | |
""", action="store_true") | |
# this stores our application parameters so it can get passed around to functions | |
appconfig = ApplicationConfiguration() | |
args = parser.parse_args() | |
if args.delete_existing: | |
appconfig.delete_existing = args.delete_existing | |
if args.delete_empty_directories: | |
appconfig.delete_empty_directories = args.delete_empty_directories | |
if args.copy_new_destination: | |
appconfig.copy_new_destination = args.copy_new_destination | |
if args.base_directory: | |
appconfig.base_directory = args.base_directory | |
setup_base_directory(appconfig.base_directory) | |
appconfig.database_file = os.path.join(appconfig.base_directory, appconfig.database_name) | |
print('\n\n') | |
init_db(appconfig) | |
# Process things in a sane order so things later down the list of options are as complete as possible | |
if args.verify: | |
verify(appconfig) | |
if args.import_from: # since at least something was passed to this argument, lets try to import | |
if extensions.intersection(auto_delete_extensions): | |
print( | |
"Cannot import files as there is at least one extension in common between 'extensions' and 'auto_delete_extensions: {}".format( | |
", ".join(extensions.intersection(auto_delete_extensions)))) | |
else: | |
directories = args.import_from.split(",") | |
import_files(appconfig, directories) | |
if args.generate_hash_list: | |
(files_processed, hash_path) = generate_hash_list(appconfig, args.generate_hash_list, args.suppress_file_info) | |
if files_processed: | |
print("\n\nHashes for {} files have been exported to '{}'\n".format(files_processed, hash_path)) | |
else: | |
print("\n\nNothing to export! The database is empty!\n") | |
if args.export_existing or args.export_delta: | |
if args.export_directory: | |
appconfig.export_directory = os.path.normpath(args.export_directory) | |
print("\tExport directory set to: {}".format(appconfig.export_directory)) | |
if not os.path.exists(appconfig.export_directory): | |
print("\tExport directory does not exist. Creating...") | |
os.makedirs(appconfig.export_directory) | |
if args.rename: | |
appconfig.rename_exported = True | |
if args.zip: | |
appconfig.zip_exported = True | |
file_name = "" | |
if args.export_existing: | |
file_name = args.export_existing | |
elif args.export_delta: | |
file_name = args.export_delta | |
if os.path.isfile(file_name): | |
export_files(appconfig, bool(args.export_existing), file_name) | |
else: | |
print("\t{} does not exist! Export cancelled!".format(file_name)) | |
else: | |
print("\t--export_directory must be set when exporting files! Export cancelled.") | |
# see whats set in appconfig | |
# attrs = vars(appconfig) | |
# print('\n'.join("%s: %s" % item for item in attrs.items())) | |
# TODO have a built in web mode to allow searching, exporting etc? | |
# TODO Add error handling/try catch, etc | |
# TODO make backup of SQLite DB on startup (if newer than last) | |
# TODO add --purge_files that takes a list of files and cleans file store and DB of those hashes | |
if args.print_stats: | |
dump_stats(appconfig, args.print_stats) | |
if not args.export_delta and not args.export_existing and not args.generate_hash_list and not args.import_from and not args.print_stats and not args.verify: | |
print("You didn't ask me to do anything, so here are some statistics:") | |
dump_stats(appconfig, 'lite') | |
if __name__ == '__main__': | |
main() |
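# Example invocations (sketch; 'filemgr.py' and the paths are hypothetical, the flags are
# the ones defined in main() above):
#     python filemgr.py "D:\filestore" --import_from "D:\incoming,E:\more files" --delete_existing simulate
#     python filemgr.py "D:\filestore" --generate_hash_list md5 --suppress_file_info
#     python filemgr.py "D:\filestore" --export_existing hashes.txt --export_directory "D:\out" --zip
#     python filemgr.py "D:\filestore" --print_stats full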
# ============================================================================
# Second gist file: a pared-down variant of the importer that stores files per
# user/album (users, albums, filehashes, files tables) instead of the generic
# hashtypes schema above.
# ============================================================================
import os | |
import hashlib | |
from ImageHash import avhash, dhash, dimensions # , rotavhash | |
import base64 | |
import sqlite3 | |
import datetime | |
import re | |
import sys | |
import scandir | |
from struct import error as structerror | |
# from multiprocessing import Pool | |
# from multiprocessing.dummy import Pool as ThreadPool | |
# TODO use pathlib vs os.path calls? pathlib is 3.4+ only
# http://docs.sqlalchemy.org/en/rel_0_9/orm/tutorial.html ?? | |
# http://docs.python.org/3.4/howto/logging-cookbook.html | |
BUFFER_SIZE = 65536 # 8192 # file reading buffer size 8192 * 64? | |
excludedextensions = ['.txt', '.ini', '.log'] | |
class ED2KHash(object): | |
MAGICLEN = 9728000 | |
def __init__(self): | |
self.hashes = [] | |
self.pos = 0 | |
self.md4 = hashlib.new('md4') | |
def update(self, data): | |
data_len = len(data) | |
for d in (data[i:i + ED2KHash.MAGICLEN] for i in range(0, data_len, ED2KHash.MAGICLEN)): | |
self._update(d) | |
def _update(self, data): | |
data_len = len(data) | |
assert data_len <= ED2KHash.MAGICLEN | |
newpos = self.pos + data_len | |
if newpos < ED2KHash.MAGICLEN: | |
self.md4.update(data) | |
self.pos = newpos | |
return | |
else: | |
prev = data[:ED2KHash.MAGICLEN - self.pos] | |
next_val = data[ED2KHash.MAGICLEN - self.pos:] | |
self.md4.update(prev) | |
self.hashes.append(self.md4.digest()) | |
self.md4 = hashlib.new('md4') | |
self.md4.update(next_val) | |
self.pos = len(next_val) | |
return | |
def digest(self): | |
if len(self.hashes) == 0: | |
return self.md4.digest() | |
else: | |
m = hashlib.new('md4') | |
newhashes = self.hashes + [self.md4.digest()] | |
m.update(b''.join(newhashes)) | |
return m.digest() | |
class ApplicationConfiguration(object): | |
""" | |
Holds configuration values used in various places | |
""" | |
def __init__(self): | |
self.__database_name = 'filemgr.db' | |
self.__base_directory = '' | |
self.__database_file = '' | |
def get_database_name(self): | |
return self.__database_name | |
def set_database_name(self, database_name): | |
self.__database_name = database_name | |
database_name = property(get_database_name, set_database_name) | |
def get_base_directory(self): | |
return self.__base_directory | |
def set_base_directory(self, base_directory): | |
self.__base_directory = base_directory | |
base_directory = property(get_base_directory, set_base_directory) | |
def get_database_file(self): | |
return self.__database_file | |
def set_database_file(self, database_file): | |
self.__database_file = database_file | |
database_file = property(get_database_file, set_database_file) | |
def safeprint(s): | |
try: | |
print(s) | |
except UnicodeEncodeError: | |
print(s.encode('utf8').decode(sys.stdout.encoding)) | |
def add_insert_username(appconfig, username): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT userID FROM users WHERE users.username = ?;", (username,)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO users (username) VALUES (?);", (username,)) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
def add_insert_albumname(appconfig, albumname): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT albumID FROM albums WHERE albums.albumname = ?;", (albumname,)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO albums (albumname) VALUES (?);", (albumname,)) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
def add_insert_hash(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT filehashID FROM filehashes WHERE filehashes.sha1b32 = ?;", (fileinfo['sha1b32'],)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO filehashes (imgdhash,imghash,md5,sha1b32) VALUES (?,?,?,?);", | |
(fileinfo['imgdhash'], fileinfo['imghash'], fileinfo['md5'], fileinfo['sha1b32'])) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
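# add_insert_username(), add_insert_albumname() and add_insert_hash() all follow the same | |
# get-or-create pattern: SELECT the row's id, INSERT and take lastrowid if it is absent. | |
# The two single-column cases could share one helper; a sketch (not wired in, table and | |
# column names are the ones created in init_db below): | |
# | |
#   def _get_or_create(appconfig, table, id_col, key_col, value): | |
#       conn = sqlite3.connect(appconfig.database_file) | |
#       c = conn.cursor() | |
#       c.execute("SELECT {} FROM {} WHERE {} = ?;".format(id_col, table, key_col), (value,)) | |
#       row = c.fetchone() | |
#       if row is None: | |
#           c.execute("INSERT INTO {} ({}) VALUES (?);".format(table, key_col), (value,)) | |
#           conn.commit() | |
#           rowid = c.lastrowid | |
#       else: | |
#           rowid = row[0] | |
#       conn.close() | |
#       return rowid | |
# | |
#   userid = _get_or_create(appconfig, 'users', 'userID', 'username', fileinfo['username']) | |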
def add_file_to_db(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
# check if hashtypes has an entry for each hash in hashes | |
# print(fileinfo) | |
userid = add_insert_username(appconfig, fileinfo['username']) | |
albumid = add_insert_albumname(appconfig, fileinfo['albumname']) | |
# add file to files table | |
hashid = add_insert_hash(appconfig, fileinfo) | |
c.execute("INSERT INTO files (userid, inputpath, post, comment, filename, extension, albumID, " | |
"albumindex, fileindex, filesize, filewidth, fileheight, filehashID) " | |
"VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?);", | |
(userid, fileinfo['inputfile'], fileinfo['post'], fileinfo['comment'], | |
fileinfo['filename'], fileinfo['extension'], albumid, fileinfo['albumindex'], | |
fileinfo['fileindex'], fileinfo['filesize'], fileinfo['filewidth'], | |
fileinfo['fileheight'], hashid)) | |
conn.commit() | |
conn.close() | |
def add_missing_to_db(appconfig, missingfiles): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
# c.execute("DELETE FROM missingpaths") | |
# conn.commit() | |
c.executemany("INSERT OR IGNORE INTO missingpaths (inputpath) VALUES (?)", missingfiles) | |
conn.commit() | |
c.execute("DELETE FROM files WHERE files.inputpath IN (SELECT inputpath FROM missingpaths)") | |
print("Deleted {:,d} files from DB that no longer existed in file system" | |
.format(len(missingfiles))) | |
print("Deleted entries have been saved to missingpaths table") | |
conn.commit() | |
conn.close() | |
def import_files_work(appconfig, dirname): | |
total_files = 0 | |
files_added_to_database = 0 | |
dbfiles = set() | |
    # Looking up each path in the DB individually is slow, so pull them all into a set and check membership there | |
print("Getting existing file locations from database...", end='') | |
existing_files = get_filelist_from_database(appconfig) | |
print("Got {:,d} file locations from database. Looking for new files.\n".format(len(existing_files))) | |
for dirpath, dirnames, files in scandir.walk(dirname, topdown=False): | |
total_files += len(files) | |
file_counter = 0 | |
if len(files) > 0: | |
safeprint("\n\tFound {:,d} files in {}. Processing...".format(len(files), dirpath)) | |
for name in files: | |
full_path_name = os.path.join(dirpath, name) | |
rel_path_name = os.path.relpath(full_path_name, start=dirname) | |
file_counter += 1 | |
if full_path_name not in existing_files: | |
if os.path.isfile(full_path_name) and len(rel_path_name.split('\\')) > 1: | |
parts = os.path.splitext(name.lower()) | |
if len(parts) == 2: | |
fileinfo = get_file_data(full_path_name, rel_path_name, dirname) | |
if fileinfo['inputfile'] not in existing_files and \ | |
fileinfo['extension'] not in excludedextensions: | |
files_added_to_database += 1 | |
# since this is a new file, we add it to our set for future import operations | |
existing_files.add(fileinfo['inputfile']) | |
dbfiles.add(fileinfo['inputfile'], ) | |
add_file_to_db(appconfig, fileinfo) | |
safeprint( | |
'\t\t({} [{:,d}/{:,d}]): Processing {} with {:,d} bytes...'.format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), fileinfo['inputfile'], fileinfo['filesize'])) | |
else: | |
pass | |
else: | |
dbfiles.add(full_path_name, ) | |
continue | |
missingset = existing_files - dbfiles | |
missingfiles = list() | |
for file in missingset: | |
file = tuple((file,)) | |
missingfiles.append(file) | |
add_missing_to_db(appconfig, missingfiles) | |
return files_added_to_database, total_files | |
def get_filelist_from_database(appconfig): | |
# pull them out and cache on startup or when first pulled? | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT inputpath FROM files;") | |
rows = c.fetchall() | |
conn.close() | |
filenames = [row[0] for row in rows] | |
return set(filenames) | |
def get_file_data(file, relpath, basepath): | |
""" | |
Generates hashes for file and other file info such as size, etc. | |
""" | |
# TODO can i use some kind of magic to determine mime type and forego extension? | |
fileinfo = {'inputfile': file, 'filesize': os.path.getsize(file), 'hashes': {}} | |
ignoredfiles = ['history.log', 'unsupported.txt', '.picasa.ini'] | |
untmp = relpath.split('\\') | |
username = untmp[0] | |
calcdir = os.path.join(untmp[0], untmp[1]) | |
errdir = basepath | |
if len(calcdir) < len(relpath): | |
albumdir = untmp[1] | |
filedir = untmp[2] | |
else: | |
albumdir = None | |
filedir = untmp[1] | |
albumname = '' | |
albumpost = '' | |
albumcomment = '' | |
albumindex = '' | |
if albumdir is not None: | |
albumhyphen = albumdir.split('-') | |
if albumhyphen[0].startswith('t3_') and len(albumhyphen) == 4 and \ | |
albumhyphen[2].isdigit(): # newstyle comment album | |
albumpost = albumhyphen[0] | |
albumcomment = albumhyphen[1] | |
albumindex = albumhyphen[2] | |
albumname = albumhyphen[3] | |
elif len(albumhyphen) == 3 and albumhyphen[1].isdigit(): # newstyle album | |
albumpost = albumhyphen[0] | |
albumcomment = '' | |
albumindex = albumhyphen[1] | |
albumname = albumhyphen[2] | |
elif len(albumhyphen) == 2 and albumhyphen[1].startswith('c') and \ | |
4 < len(albumhyphen[0]) < 7: # oldstyle comment album | |
albumunder = (albumhyphen[1]).split('_') | |
albumpost = 't3_' + albumhyphen[0] | |
albumcomment = albumunder[0] | |
albumindex = '' | |
albumname = albumunder[1] | |
elif len(albumhyphen) == 1 and len((albumhyphen[0]).split('_')) == 2: # oldstyle album | |
albumunder = (albumhyphen[0]).split('_') | |
albumpost = albumunder[0] | |
albumcomment = '' | |
albumindex = '' | |
albumname = albumunder[1] | |
# print("oldstyle album", albumpost, albumcomment, albumindex, albumname) | |
else: | |
print("can't resolve album:", file) | |
outfile = os.path.join(errdir, 'parsing_error ' + | |
datetime.datetime.now().strftime("%Y-%m-%d") + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write("{}: Albumname Parsing Error - {}\n" | |
.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), file)) | |
filehyphen = filedir.split('-') | |
if filehyphen[0].startswith('t3_') and len(filehyphen) >= 5 and len(filehyphen[2]) < 4 and \ | |
filehyphen[2].isdigit() and filehyphen[3].isdigit(): # newstyle comment filename | |
filepost = filehyphen[0] | |
filecomment = filehyphen[1] | |
filealbumindex = filehyphen[2] | |
fileindex = filehyphen[3] | |
filename = '-'.join(filehyphen[4:]) | |
elif len(filehyphen) > 3 and len(filehyphen[1]) < 4 and \ | |
filehyphen[1].isdigit() and filehyphen[2].isdigit(): # newstyle filename | |
filepost = filehyphen[0] | |
filecomment = '' | |
filealbumindex = filehyphen[1] | |
fileindex = filehyphen[2] | |
filename = '-'.join(filehyphen[3:]) | |
elif len(filehyphen) >= 2 and 4 < len(filehyphen[0]) < 7: # oldstyle comment filename | |
filehyphen[1] = '-'.join(filehyphen[1:]) # recombine hyphens in actual filename | |
fileunder = (filehyphen[1]).split('_') | |
filepost = 't3_' + filehyphen[0] | |
filecomment = fileunder[0] | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(fileunder[1:]) | |
elif len(filehyphen) >= 2 and len((filehyphen[0]).split('_')) >= 2: # oldstyle hyphenated fname | |
filehyphen[0] = '-'.join(filehyphen[0:]) # recombine hyphens in actual filename | |
fileunder = (filehyphen[0]).split('_') | |
filepost = fileunder[0] | |
filecomment = '' | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(fileunder[1:]) | |
elif len(filehyphen) == 1 and len((filehyphen[0]).split('_')) >= 2: # oldstyle filename | |
fileunder = filehyphen[0].split('_') | |
filepost = fileunder[0] | |
filecomment = '' | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(fileunder[1:]) | |
else: | |
print("can't resolve filename:", file) | |
if albumpost != '': | |
filepost = albumpost | |
else: | |
filepost = '' | |
if albumcomment != '': | |
filecomment = albumcomment | |
else: | |
filecomment = '' | |
if albumindex != '': | |
filealbumindex = albumindex | |
else: | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(filehyphen[0:]) | |
if filename not in ignoredfiles: | |
outfile = os.path.join(errdir, 'parsing_error ' + | |
datetime.datetime.now().strftime("%Y-%m-%d") + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write("{}: Filename Parsing Error - {}\n" | |
.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), file)) | |
parts = os.path.splitext(file.lower()) | |
ext = '' | |
if len(parts) == 2: | |
ext = parts[1] | |
sha1 = hashlib.sha1() | |
md5 = hashlib.md5() | |
f = open(file, 'rb') | |
# print('file extension = ', ext) | |
# if ext in imageexts: | |
try: | |
(filewidth, fileheight) = dimensions(file) | |
imghash = str(avhash(file)) | |
imgdhash = str(dhash(file)) | |
# imgrotavg = str(rotavhash(file)) | |
# vprint('\n[!] image hash (%s)' % imghash) | |
except (RuntimeError, TypeError, NameError, ValueError, structerror): | |
# Failed to get hash, delete image & raise exception | |
print('image check failed') | |
imghash = 0 | |
imgdhash = 'hashing_failed' | |
# imgrotavg =0 | |
filewidth = 0 | |
fileheight = 0 | |
if filename not in ignoredfiles: | |
outfile = os.path.join(errdir, 'hasherrors ' + datetime.datetime.now().strftime("%Y-%m-%d") + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write( | |
"{}: Hashing Error - {}\n".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), file)) | |
if filewidth > 5000 or fileheight > 5000: | |
print('\n[!] image too large to hash (%dx%d)' % (filewidth, fileheight)) | |
imghash = 999 | |
imgdhash = 'image_too_large' | |
# imgrotavg = 999 | |
pass | |
if filewidth == 161 and fileheight == 81: | |
# Size of empty imgur image ('not found!') | |
imghash = 503 | |
imgdhash = 'imgur_503_removed' | |
# imgrotavg = 503 | |
pass | |
filename = filename[0:(len(filename) - len(ext))] | |
buf = f.read(BUFFER_SIZE) | |
while buf != b'': | |
md5.update(buf) | |
sha1.update(buf) | |
# imghash.update(buf) | |
buf = f.read(BUFFER_SIZE) | |
f.close() | |
sha1b16 = sha1.hexdigest().upper() | |
sha1b32 = base64.b32encode(base64.b16decode(sha1b16.upper())).decode().upper() | |
md5hash = md5.hexdigest().upper() | |
fileinfo['sha1b32'] = sha1b32 | |
fileinfo['md5'] = md5hash | |
fileinfo['post'] = filepost | |
fileinfo['comment'] = filecomment | |
fileinfo['fileindex'] = fileindex | |
fileinfo['filename'] = filename | |
fileinfo['albumindex'] = filealbumindex | |
fileinfo['albumname'] = albumname | |
fileinfo['imghash'] = imghash | |
fileinfo['imgdhash'] = imgdhash | |
fileinfo['extension'] = ext.lower() | |
fileinfo['filewidth'] = filewidth | |
fileinfo['fileheight'] = fileheight | |
fileinfo['username'] = username | |
return fileinfo | |
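# The parsing above appears to assume a relative layout of username\album\file (or | |
# username\file when there is no album directory), with names encoded roughly as: | |
#   new-style album dir:      <post>-<albumindex>-<albumname> | |
#   new-style comment album:  t3_<post>-<comment>-<albumindex>-<albumname> | |
#   new-style filename:       <post>-<albumindex>-<fileindex>-<filename> | |
#   old-style filename:       <post>_<filename> | |
# Anything that does not match one of these shapes is appended to a dated parsing_error | |
# log in the import root directory rather than rejected outright. | |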
def init_db(appconfig): | |
# create, setup tables | |
# one table is hashname | |
# another is for files that references hashname pk | |
# this allows for easy expanding if hashname is missing without schema changes | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("PRAGMA synchronous = OFF") | |
c.execute("PRAGMA journal_mode = MEMORY") | |
c.execute('''CREATE TABLE IF NOT EXISTS users | |
(userID INTEGER PRIMARY KEY, username TEXT)''') | |
c.execute('''CREATE TABLE IF NOT EXISTS albums | |
(albumID INTEGER PRIMARY KEY, albumname TEXT)''') | |
c.execute('''CREATE TABLE IF NOT EXISTS files | |
(fileID INTEGER PRIMARY KEY AUTOINCREMENT, userid INTEGER, inputpath TEXT, post TEXT, | |
comment TEXT, filename TEXT, extension TEXT, albumID INTEGER, albumindex TEXT, | |
fileindex TEXT, filesize INTEGER, filewidth INTEGER, fileheight INTEGER, | |
filehashID INTEGER)''') | |
c.execute('''CREATE TABLE IF NOT EXISTS filehashes (filehashID INTEGER PRIMARY KEY, | |
imgdhash TEXT, imghash TEXT, md5 TEXT, sha1b32 TEXT)''') | |
    c.execute('''CREATE TABLE IF NOT EXISTS missingpaths (inputpath TEXT UNIQUE)''')  # UNIQUE so INSERT OR IGNORE de-duplicates | |
hashes = ['imgdhash', 'imghash', 'md5', 'sha1b32'] | |
viewstate = """CREATE VIEW IF NOT EXISTS {0}_dupes AS | |
SELECT files.inputpath, files.filename, users.username, albums.albumname, files.post, | |
files.filesize, files.filewidth, files.fileheight, filehashes.{0}, files.filehashID | |
FROM files JOIN filehashes ON files.filehashID = filehashes.filehashID | |
JOIN users ON files.userid = users.userID | |
JOIN albums ON files.albumID = albums.albumID | |
WHERE filehashes.{0} IN ( SELECT filehashes.{0} | |
FROM files JOIN filehashes ON filehashes.filehashID = files.filehashID | |
WHERE filehashes.{0} <> 0 AND filehashes.{0} <> '0000000000000000' AND | |
filehashes.{0} <> 'average_hash_error' AND filehashes.{0} <> 'imgur_503_removed' AND | |
filehashes.{0} <> 'image_too_large' AND filehashes.{0} <> 'hashing_failed' AND | |
filehashes.{0} <> 'differential_hash_error' AND filehashes.{0} <> 9999 AND | |
filehashes.{0} <> 12345 AND filehashes.{0} <> 223344 AND filehashes.{0} <> 112233 | |
GROUP BY filehashes.{0} HAVING count() > 1) ORDER BY filehashes.{0}""" | |
for ihash in hashes: | |
curstate = viewstate.format(ihash) | |
c.execute(curstate) | |
conn.commit() | |
inexactview = """CREATE VIEW IF NOT EXISTS inexact_matches AS | |
SELECT files.inputpath, files.filename, files.post, files.filesize, | |
files.filewidth, files.fileheight, filehashes.imgdhash, filehashes.md5 | |
FROM files JOIN filehashes ON files.filehashID = filehashes.filehashID | |
WHERE files.filehashID IN ( SELECT files.filehashID FROM filehashes | |
JOIN files ON files.filehashID = filehashes.filehashID | |
WHERE filehashes.imgdhash IN | |
( SELECT imgdhash FROM filehashes | |
WHERE filehashes.imgdhash <> 0 AND filehashes.imgdhash <> '0000000000000000' | |
AND filehashes.imgdhash <> 'average_hash_error' | |
AND filehashes.imgdhash <> 'imgur_503_removed' | |
AND filehashes.imgdhash <> 'image_too_large' AND filehashes.imgdhash <> 'hashing_failed' | |
AND filehashes.imgdhash <> 'differential_hash_error' AND filehashes.imgdhash <> 9999 | |
AND filehashes.imgdhash <> 12345 AND filehashes.imgdhash <> 223344 | |
AND filehashes.imgdhash <> 112233 | |
GROUP BY filehashes.imgdhash HAVING count() > 1 ) ) | |
AND files.filehashID NOT IN ( SELECT DISTINCT files.filehashID | |
FROM files WHERE files.filehashID IN ( SELECT DISTINCT source.filehashID | |
FROM files AS source LEFT JOIN files AS target ON source.fileID = target.fileID | |
WHERE source.filehashid <> target.filehashid ORDER BY source.fileID ) | |
GROUP BY files.filename ) ORDER BY filehashes.imgdhash""" | |
c.execute(inexactview) | |
conn.commit() | |
filenamemismatchview = """CREATE VIEW IF NOT EXISTS filenamehash_mismatch AS | |
SELECT files.inputpath, users.username, albums.albumname, files.filename, files.filesize, | |
files.filewidth, files.fileheight, files.filehashid | |
FROM files JOIN users ON files.userid = users.userID | |
JOIN albums ON files.albumID = albums.albumID | |
WHERE files.filename IN ( SELECT DISTINCT files.filename FROM files | |
WHERE files.filename IN ( SELECT DISTINCT source.filename | |
FROM files AS source LEFT JOIN files AS target ON source.filename = target.filename | |
WHERE source.filehashid <> target.filehashid | |
ORDER BY source.filename ) GROUP BY files.filename ) AND files.filename <> 'default' AND | |
files.filename <> 'encoded' AND files.filename <> 'giphy' AND | |
files.filename <> 'image' AND files.filename <> 'large' AND files.filename <> '' AND | |
files.filename <> 'media_command' AND length(files.filename) > 4 | |
ORDER BY files.filename""" | |
c.execute(filenamemismatchview) | |
conn.commit() | |
filenamematchview = """CREATE VIEW IF NOT EXISTS filenamehash_match AS | |
SELECT files.inputpath, users.username, albums.albumname, files.filename, files.filesize, | |
files.filewidth, files.fileheight, files.filehashid | |
FROM files JOIN users ON files.userid = users.userID | |
JOIN albums ON files.albumID = albums.albumID | |
WHERE files.filename IN ( SELECT DISTINCT files.filename FROM files | |
WHERE files.filename IN ( SELECT DISTINCT source.filename | |
FROM files AS source LEFT JOIN files AS target ON source.filename = target.filename | |
WHERE source.filehashid = target.filehashid | |
ORDER BY source.filename ) GROUP BY files.filename HAVING count() > 1) | |
AND files.filename <> 'default' AND | |
files.filename <> 'encoded' AND files.filename <> 'giphy' AND | |
files.filename <> 'image' AND files.filename <> 'large' AND files.filename <> '' AND | |
files.filename <> 'media_command' AND length(files.filename) > 4 | |
ORDER BY files.filename""" | |
c.execute(filenamematchview) | |
conn.commit() | |
dirsizeview = """CREATE VIEW IF NOT EXISTS directory_size AS | |
SELECT users.username, SUM(files.filesize) AS dirsum | |
FROM filehashes JOIN files ON filehashes.filehashID = files.filehashID | |
JOIN users ON files.userid = users.userID | |
GROUP BY users.username ORDER BY sum(files.filesize) ASC;""" | |
c.execute(dirsizeview) | |
conn.commit() | |
# add indexes | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_filehashes" ON "filehashes" ' | |
'("imgdhash","imghash","md5","sha1b32")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_filehashID" ON "files" ("filehashID")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_username" ON "users" ("username")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_albumname" ON "albums" ("albumname")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IU_inputpath" ON "files" ' | |
'("inputpath", "filesize", "filewidth", "fileheight")') | |
conn.commit() | |
conn.close() | |
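# Rough shape of the schema created above (one files row per imported file): | |
#   users(userID, username)                                  <- files.userid | |
#   albums(albumID, albumname)                               <- files.albumID | |
#   filehashes(filehashID, imgdhash, imghash, md5, sha1b32)  <- files.filehashID | |
# The *_dupes views list files sharing the same value of a given hash column, | |
# inexact_matches keys on imgdhash only, filenamehash_match/mismatch compare files that | |
# share a filename, and directory_size totals file sizes per username. | |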
def import_files(appconfig, directory): | |
""" | |
    Attempts to recursively import files from the given directory and writes a log file with actions taken | |
    @param appconfig: Configuration data | |
    @param directory: the directory to import from | |
    """ | |
    print("Importing from '{}'".format(directory)) | |
directory = directory.strip() | |
if os.path.isdir(directory): | |
(files_added_to_database, total_files) = import_files_work(appconfig, directory) | |
print( | |
'\n' + '*' * 4 + """ {:,d} total files found. {:,d} files were added to the database.""" | |
.format(total_files, files_added_to_database)) | |
directory_clean = re.sub(r'[^\w\-_. ]', '_', directory) | |
logfile_name = os.path.join(appconfig.base_directory, | |
"Import log for " + directory_clean + " " + | |
datetime.datetime.now().strftime("%H%M%S%f") + '.txt') | |
with open(logfile_name, 'w+', encoding="utf-16") as logfile: | |
logfile.write('Directory processed: {}\n\n'.format(directory)) | |
logfile.write('Files found: {:,d}\n'.format(total_files)) | |
logfile.write('Files added to database: {:,d}\n'.format(files_added_to_database)) | |
logfile.write('*' * 78 + '\n\n') | |
else: | |
print("\t'{}' does not exist!".format(directory)) | |
# after import, tell the user to see generated logs (one per directory) in the main directory | |
# but only if we actually attempted to import something | |
if len(directory) > 0 and 'logfile_name' in locals(): | |
print("\n\nSee log files in {} for details.".format(appconfig.base_directory)) | |
def main(): | |
# this stores our application parameters so it can get passed around to functions | |
appconfig = ApplicationConfiguration() | |
appconfig.base_directory = r"F:\Python\databases" | |
appconfig.database_file = os.path.join(appconfig.base_directory, appconfig.database_name) | |
init_db(appconfig) | |
targetroot = r"G:\GWScanner\\content" | |
import_files(appconfig, targetroot) | |
if __name__ == '__main__': | |
main() |
# https://pypi.org/project/ImageHash/ | |
from os import path, mkdir, sep, remove | |
from sys import exit, argv | |
from PIL import Image | |
import numpy | |
from struct import error as StructError | |
# import scipy.fftpack | |
# import multiprocessing as mp | |
def binary_array_to_hex(arr): | |
h = 0 | |
s = [] | |
for i, v in enumerate(arr.flatten()): | |
if v: h += 2 ** (i % 8) | |
if (i % 8) == 7: | |
s.append(hex(h)[2:].rjust(2, '0')) | |
h = 0 | |
return "".join(s) | |
def dhash(im): | |
hash_size = 8 | |
    try: | |
        if not isinstance(im, Image.Image): | |
            im = Image.open(im) | |
        # ANTIALIAS was removed in newer Pillow; LANCZOS is the same filter | |
        im = im.convert('L').resize((hash_size + 1, hash_size), Image.LANCZOS) | |
        # numpy.float was removed from modern NumPy; the builtin float behaves the same here | |
        pixels = numpy.array(im.getdata(), dtype=float).reshape((hash_size + 1, hash_size)) | |
        # compute differences: True where a value is brighter than the one in the previous row of the grid | |
        diff = pixels[1:, :] > pixels[:-1, :] | |
        diff = binary_array_to_hex(diff) | |
    except (OSError, SyntaxError, IndexError): | |
        diff = "differential_hash_error" | |
    return diff | |
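# Two dhash hex strings can be compared by Hamming distance; a small distance suggests a | |
# near-duplicate image. Sketch with hypothetical paths (the <= 4 threshold is just a | |
# common starting point, not something this module defines): | |
# | |
#   a = dhash('photo_a.jpg') | |
#   b = dhash('photo_b.jpg') | |
#   if 'error' not in a and 'error' not in b: | |
#       distance = bin(int(a, 16) ^ int(b, 16)).count('1') | |
#       print('near-duplicate' if distance <= 4 else 'different') | |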
def avhash(im): | |
""" | |
Shrinks image to 16x16 pixels, | |
Finds average amongst the pixels, | |
Iterates over every pixel, comparing to average. | |
1 if above avg, 0 if below. | |
Returns resulting integer. (hash of the image 'im') | |
Updated to not use ternary operator (not available in python 2.4.x) | |
""" | |
    try: | |
        if not isinstance(im, Image.Image): | |
            im = Image.open(im) | |
        im = im.convert('L').resize((16, 16), Image.LANCZOS) | |
        # average grey level over the 256 pixels | |
        ttl = 0 | |
        for gd in im.getdata(): | |
            ttl += gd | |
        avg = ttl // 256 | |
        # set bit i when pixel i is brighter than the average | |
        result = 0 | |
        for i, gd in enumerate(im.getdata()): | |
            if gd > avg: | |
                result += (1 << i) | |
        del im | |
    except (OSError, SyntaxError, IndexError, StructError): | |
        result = "average_hash_error" | |
    return result | |
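# avhash returns a 256-bit integer (one bit per pixel of the 16x16 thumbnail), so two | |
# hashes compare the same way, just without the hex conversion. Sketch: | |
# | |
#   a = avhash('photo_a.jpg') | |
#   b = avhash('photo_b.jpg') | |
#   if isinstance(a, int) and isinstance(b, int): | |
#       distance = bin(a ^ b).count('1') | |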
def avhash_dict(im): | |
""" | |
Generate hashes for the image, including variations of the image | |
* Regular image | |
* Mirrored (left-right) | |
* Rotated left (90deg) | |
* Rotated right (270deg) | |
""" | |
if not isinstance(im, Image.Image): | |
im = Image.open(im) | |
    im = im.resize((16, 16), Image.LANCZOS).convert('L')  # LANCZOS == old ANTIALIAS filter | |
ttl = 0 | |
for gd in im.getdata(): ttl += gd | |
avg = ttl // 256 | |
result = {} | |
# Regular hash | |
regular_hash = 0 | |
for i, gd in enumerate(im.getdata()): | |
if gd > avg: | |
regular_hash += (1 << i) | |
result['hash'] = regular_hash | |
# Mirror hash | |
mirror_im = im.transpose(Image.FLIP_LEFT_RIGHT) | |
mirror_hash = 0 | |
for i, gd in enumerate(mirror_im.getdata()): | |
if gd > avg: | |
mirror_hash += (1 << i) | |
result['mirror'] = mirror_hash | |
# Rotated 90deg hash | |
left_im = im.transpose(Image.ROTATE_90) | |
left_hash = 0 | |
for i, gd in enumerate(left_im.getdata()): | |
if gd > avg: | |
left_hash += (1 << i) | |
result['left'] = left_hash | |
# Rotated 270deg hash | |
right_im = im.transpose(Image.ROTATE_270) | |
right_hash = 0 | |
for i, gd in enumerate(right_im.getdata()): | |
if gd > avg: | |
right_hash += (1 << i) | |
result['right'] = right_hash | |
    # (the combined rotation average is only produced by rotavhash() below) | |
    del im | |
return result | |
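# avhash_dict() is aimed at catching mirrored or rotated reposts: compare a candidate's | |
# plain avhash against each of the four orientation hashes and keep the smallest Hamming | |
# distance. Sketch with hypothetical paths (assumes both images open cleanly): | |
# | |
#   candidate = avhash('candidate.jpg') | |
#   if isinstance(candidate, int): | |
#       known = avhash_dict('original.jpg') | |
#       best = min(bin(candidate ^ h).count('1') for h in known.values()) | |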
def rotavhash(im): | |
""" | |
Generate hashes for the image, including variations of the image | |
* Regular image | |
* Mirrored (left-right) | |
* Rotated left (90deg) | |
* Rotated right (270deg) | |
""" | |
if not isinstance(im, Image.Image): | |
try: | |
im = Image.open(im) | |
            im = im.resize((16, 16), Image.LANCZOS).convert('L') | |
ttl = 0 | |
for gd in im.getdata(): ttl += gd | |
avg = ttl // 256 | |
result = {} | |
# Regular hash | |
regular_hash = 0 | |
for i, gd in enumerate(im.getdata()): | |
if gd > avg: | |
regular_hash += (1 << i) | |
result['hash'] = regular_hash | |
# Mirror hash | |
mirror_im = im.transpose(Image.FLIP_LEFT_RIGHT) | |
mirror_hash = 0 | |
for i, gd in enumerate(mirror_im.getdata()): | |
if gd > avg: | |
mirror_hash += (1 << i) | |
result['mirror'] = mirror_hash | |
# Rotated 90deg hash | |
left_im = im.transpose(Image.ROTATE_90) | |
left_hash = 0 | |
for i, gd in enumerate(left_im.getdata()): | |
if gd > avg: | |
left_hash += (1 << i) | |
result['left'] = left_hash | |
# Rotated 270deg hash | |
right_im = im.transpose(Image.ROTATE_270) | |
right_hash = 0 | |
for i, gd in enumerate(right_im.getdata()): | |
if gd > avg: | |
right_hash += (1 << i) | |
result['right'] = right_hash | |
rotaverage = (regular_hash + mirror_hash + left_hash + right_hash) / 4 | |
rotaverage = int(rotaverage) | |
del im | |
except (OSError, SyntaxError, IndexError): | |
rotaverage = "rotated_average_error" | |
return rotaverage | |
def dimensions(im): | |
""" Returns tuple (Width, Height) for given image. """ | |
    try: | |
        if not isinstance(im, Image.Image): | |
            im = Image.open(im) | |
        result = im.size | |
    except OSError: | |
        print("Not a recognized Image File") | |
        result = (0, 0) | |
    del im | |
    return result | |
def create_thumb(im, num): | |
""" | |
Creates a thumbnail for a given image file. | |
Saves to 'thumbs' directory, named <num>.jpg | |
""" | |
try: | |
mkdir('thumbs') | |
except OSError: | |
pass | |
if not isinstance(im, Image.Image): | |
im = Image.open(im) | |
# Convert to RGB if not already | |
if im.mode != "RGB": im = im.convert("RGB") | |
    im.thumbnail((100, 100), Image.LANCZOS) | |
im.save('thumbs%s%d.jpg' % (sep, num), 'JPEG') | |
del im | |
if __name__ == '__main__': | |
args = argv[1:] | |
if len(args) == 0: | |
print('argument required: image file location') | |
exit(1) | |
filename = ' '.join(args) | |
remove_file = False | |
if not path.exists(filename): | |
print('file not found: %s' % filename) | |
exit(1) | |
    print('Hash:\t\t%s' % avhash(filename)) | |
print('') | |
d = avhash_dict(filename) | |
for key in d: | |
print('Hash[%s] = \t%d' % (key, d[key])) | |
print('') | |
dim = dimensions(filename) | |
print('Dimensions:\t%dx%d' % (dim[0], dim[1])) | |
# create_thumb(filename, 1) | |
if remove_file: | |
remove(filename) |