File manager script that uses an old perceptual hashing algorithm (average hash / difference hash via ImageHash).
import argparse | |
import os | |
import hashlib | |
from ImageHash import avhash, dhash, dimensions # , rotavhash | |
import base64 | |
import sqlite3 | |
import glob | |
import time | |
# import shutil | |
import datetime | |
import re | |
import zipfile | |
import sys | |
import scandir | |
# from multiprocessing import Pool | |
# from multiprocessing.dummy import Pool as ThreadPool | |
# TODO use pathlib vs os.path calls? pathlib is 3.4+ only
# http://docs.sqlalchemy.org/en/rel_0_9/orm/tutorial.html ?? | |
# http://docs.python.org/3.4/howto/logging-cookbook.html | |
# a list of valid file extensions to import. anything else will be skipped. make it a set in case people add dupes | |
extensions = {'.jpg', '.avi', '.ram', '.rm', '.wmv', '.pdf', '.mov', '.mp4', '.flv', '.jpe', '.jpeg', '.mpg', '.mpe',
              '.mpeg', '.png', '.3g2', '.3gp', '.asf', '.bmp', '.divx', '.gif', '.m1v', '.vob', '.mod', '.tif', '.mkv',
              '.jp2', '.psd', '.m4v', '.pcx', '.webm', '.m4a', '.mp3', '.org', '.jpgy', '.jpgg',
              '.aspx', '.3ga', '.net'}
# a set of extensions to delete on sight. If any of these is also in 'extensions' the import will be cancelled
auto_delete_extensions = set()  # note: set(), not {} (an empty {} would create a dict)
BUFFER_SIZE = 65536 # 8192 # file reading buffer size 8192 * 64? | |
# logger = logging.getLogger('filemgr') | |
# logger.setLevel(logging.CRITICAL) | |
# fh = logging.FileHandler('filemgr_debug.log') | |
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
# fh.setFormatter(formatter) | |
# logger.addHandler(fh) | |
def safeprint(s): | |
try: | |
print(s) | |
except UnicodeEncodeError: | |
print(s.encode('utf8').decode(sys.stdout.encoding)) | |
class ED2KHash(object): | |
MAGICLEN = 9728000 | |
def __init__(self): | |
self.hashes = [] | |
self.pos = 0 | |
self.md4 = hashlib.new('md4') | |
def update(self, data): | |
data_len = len(data) | |
for d in (data[i:i + ED2KHash.MAGICLEN] for i in range(0, data_len, ED2KHash.MAGICLEN)): | |
self._update(d) | |
def _update(self, data): | |
data_len = len(data) | |
assert data_len <= ED2KHash.MAGICLEN | |
newpos = self.pos + data_len | |
if newpos < ED2KHash.MAGICLEN: | |
self.md4.update(data) | |
self.pos = newpos | |
return | |
else: | |
prev = data[:ED2KHash.MAGICLEN - self.pos] | |
next_val = data[ED2KHash.MAGICLEN - self.pos:] | |
self.md4.update(prev) | |
self.hashes.append(self.md4.digest()) | |
self.md4 = hashlib.new('md4') | |
self.md4.update(next_val) | |
self.pos = len(next_val) | |
return | |
def digest(self): | |
if len(self.hashes) == 0: | |
return self.md4.digest() | |
else: | |
m = hashlib.new('md4') | |
newhashes = self.hashes + [self.md4.digest()] | |
m.update(b''.join(newhashes)) | |
return m.digest() | |
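# ED2K hashing, as implemented above: the file is read in 9,728,000-byte chunks, an MD4
# digest is taken per chunk, and the final ed2k hash is the MD4 of the concatenated chunk
# digests (a file smaller than one chunk just uses that chunk's MD4). Minimal usage sketch
# ('some_path' is hypothetical; hashlib must expose 'md4', which depends on the OpenSSL build):
#     h = ED2KHash()
#     with open(some_path, 'rb') as fh:
#         for chunk in iter(lambda: fh.read(BUFFER_SIZE), b''):
#             h.update(chunk)
#     ed2k_hex = h.digest().hex()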
class ApplicationConfiguration(object): | |
""" | |
Holds configuration values used in various places | |
""" | |
def __init__(self): | |
self.__database_name = 'filemgr.db3' | |
self.__base_directory = '' | |
self.__database_file = '' | |
self.__delete_existing = '' | |
self.__copy_new_destination = '' | |
self.__export_directory = '' | |
self.__rename_exported = False | |
self.__zip_exported = False | |
self.__delete_empty_directories = '' | |
def get_database_name(self): | |
return self.__database_name | |
def set_database_name(self, database_name): | |
self.__database_name = database_name | |
database_name = property(get_database_name, set_database_name) | |
def get_base_directory(self): | |
return self.__base_directory | |
def set_base_directory(self, base_directory): | |
self.__base_directory = base_directory | |
base_directory = property(get_base_directory, set_base_directory) | |
def get_database_file(self): | |
return self.__database_file | |
def set_database_file(self, database_file): | |
self.__database_file = database_file | |
database_file = property(get_database_file, set_database_file) | |
def get_delete_existing(self): | |
return self.__delete_existing | |
def set_delete_existing(self, delete_existing): | |
self.__delete_existing = delete_existing | |
delete_existing = property(get_delete_existing, set_delete_existing) | |
def get_delete_empty_directories(self): | |
return self.__delete_empty_directories | |
def set_delete_empty_directories(self, delete_empty_directories): | |
self.__delete_empty_directories = delete_empty_directories | |
delete_empty_directories = property(get_delete_empty_directories, set_delete_empty_directories) | |
def get_export_directory(self): | |
return self.__export_directory | |
def set_export_directory(self, export_directory): | |
self.__export_directory = export_directory | |
export_directory = property(get_export_directory, set_export_directory) | |
def get_rename_exported(self): | |
return self.__rename_exported | |
def set_rename_exported(self, rename_exported): | |
self.__rename_exported = rename_exported | |
rename_exported = property(get_rename_exported, set_rename_exported) | |
def get_zip_exported(self): | |
return self.__zip_exported | |
def set_zip_exported(self, zip_exported): | |
self.__zip_exported = zip_exported | |
zip_exported = property(get_zip_exported, set_zip_exported) | |
def get_copy_new_destination(self): | |
return self.__copy_new_destination | |
def set_copy_new_destination(self, copy_new_destination): | |
self.__copy_new_destination = copy_new_destination | |
copy_new_destination = property(get_copy_new_destination, set_copy_new_destination) | |
def add_insert_hashtype(appconfig, hashtype): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT hashID FROM hashtypes WHERE hashtypes.hashname = ?;", (hashtype,)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO hashtypes (hashname) VALUES (?);", (hashtype,)) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
def add_file_to_db(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
# check if hashtypes has an entry for each hash in hashes | |
hashtypes = {} | |
for key in fileinfo['hashes'].keys(): | |
hashtypes[key] = add_insert_hashtype(appconfig, key) | |
# print(fileinfo) | |
filename = fileinfo['inputfile'] | |
basefilename = os.path.split(filename)[-1] | |
basefilenameparts = os.path.splitext(basefilename) | |
file_ext = basefilenameparts[1].lower() | |
file_directory = os.path.join('files', fileinfo['hashes']['sha1b32'][0:2], fileinfo['hashes']['sha1b32'] + file_ext) | |
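    # NOTE: file_directory is computed here but the INSERT below stores the placeholder '1' as filepath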
# add file to files table | |
c.execute("INSERT INTO files (inputpath,filepath,filesize,filewidth,fileheight,comment) VALUES (?,?,?,?,?,?);", | |
(fileinfo['inputfile'], '1', fileinfo['filesize'], fileinfo['filewidth'], fileinfo['fileheight'], '')) | |
fileid = c.lastrowid | |
# add each hash to file hashes | |
for hashtype in hashtypes: | |
c.execute("INSERT INTO filehashes (hashID,fileID,filehash) VALUES (?,?,?);", | |
(hashtypes[hashtype], fileid, fileinfo['hashes'][hashtype])) | |
conn.commit() | |
conn.close() | |
def import_files_work(appconfig, dirname): | |
files_with_invalid_extensions = [] # list of files we didn't import. | |
total_files = 0 | |
files_added_to_database = 0 | |
files_deleted = 0 | |
files_with_duplicate_hashes = [] | |
files_copied = 0 | |
# Looking up each hash is sllllllow, so pull em all in as a set and just look there! | |
print("Getting existing file locations from database...", end='') | |
existing_files = get_filelist_from_database(appconfig) | |
print("Got {:,d} file locations from database. Looking for new files.\n".format(len(existing_files))) | |
for dirpath, dirnames, files in scandir.walk(dirname, topdown=False): | |
total_files += len(files) | |
file_counter = 0 | |
if len(files) > 0: | |
safeprint("\n\tFound {:,d} files in {}. Processing...".format(len(files), dirpath)) | |
# logger.info("Found {:,d} files in {}".format(len(files), dirpath)) | |
for name in files: | |
full_path_name = os.path.join(dirpath, name) | |
file_counter += 1 | |
if full_path_name not in existing_files: | |
if os.path.isfile(full_path_name): | |
if os.path.getsize(full_path_name) == 0: | |
safeprint("\t\tDeleting 0 byte file '{}'.".format(full_path_name)) | |
# os.remove(full_path_name) | |
continue | |
parts = os.path.splitext(name.lower()) | |
if len(parts) == 2: | |
ext = parts[1] | |
# some files are always bad, so just make em go away. | |
if ext in auto_delete_extensions: | |
safeprint( | |
'\t\t({} [{:,d}/{:,d}]): File {} has an autonuke extension. Deleting...'.format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), full_path_name)) | |
# os.remove(full_path_name) | |
continue | |
# if ext in extensions: | |
# logger.info( | |
# "{} before fileinfo = get_file_data(full_path_name)".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
fileinfo = get_file_data(full_path_name) | |
# logger.info("{} after fileinfo = get_file_data(full_path_name)".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
if not fileinfo['inputfile'] in existing_files: | |
files_added_to_database += 1 | |
safeprint("\t\t({} [{:,d}/{:,d}]): '{}' does not exist in database! Adding...".format | |
(datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), | |
full_path_name)) | |
# since this is a new file, we add it to our set for future import operations | |
                            existing_files.add(fileinfo['inputfile'])  # the set holds input paths, not hashes
add_file_to_db(appconfig, fileinfo) | |
else: | |
pass # do anything else here? should i check if file exists in file system? who cares tho | |
# as this syncs it up maybe here is where you do extra hashing of what is on file | |
# system to make sure the 2 match, properly named, etc | |
copied = copy_file_to_store(appconfig, fileinfo) | |
if copied: | |
safeprint( | |
'\t\t({} [{:,d}/{:,d}]): Processing {} with {:,d} bytes...'.format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), fileinfo['inputfile'], fileinfo['filesize'])) | |
# logger.info("{} after copied = copy_file_to_store(appconfig, fileinfo)):".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
if not copied: | |
files_with_duplicate_hashes.append(full_path_name) | |
else: | |
files_copied += 1 | |
if len(appconfig.copy_new_destination) > 0 and copied: | |
                            # if not os.path.exists(appconfig.copy_new_destination):
# os.mkdir(appconfig.copy_new_destination) | |
# TODO should this create the 2 char structure too? for now, just copy it | |
copy_name = os.path.join(appconfig.copy_new_destination, name) | |
unique_prefix = 0 | |
while os.path.isfile(copy_name): | |
# file exists, so get a unique name | |
copy_name = os.path.join(appconfig.copy_new_destination, | |
str(unique_prefix) + "_" + name) | |
unique_prefix += 1 | |
# shutil.copyfile(full_path_name, copy_name) | |
outfile = os.path.join(appconfig.copy_new_destination, | |
"!!" + datetime.datetime.now().strftime( | |
"%Y-%m-%d") + " File copy log " + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write( | |
"{}: Copied {} to {}.\n".format(datetime.datetime.now(), full_path_name, copy_name)) | |
if appconfig.delete_existing: | |
safeprint("\t\t({} [{:,d}/{:,d}]): Deleting '{}'...".format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), | |
full_path_name)) | |
# if appconfig.delete_existing == 'yes': | |
# os.remove(full_path_name) | |
files_deleted += 1 | |
else: | |
continue # do anything else here? should i check if file exists in file system? who cares tho | |
# logger.info("{} before copied = copy_file_to_store(appconfig, fileinfo)):".format( | |
# datetime.datetime.now().strftime('%x %X'))) | |
if appconfig.delete_empty_directories: | |
if not os.listdir(dirpath): | |
safeprint("\t\t({} [{:,d}/{:,d}]): Deleting empty directory '{}'...".format( | |
datetime.datetime.now().strftime('%x %X'), file_counter, len(files), dirpath)) | |
if appconfig.delete_empty_directories == 'yes': | |
os.rmdir(dirpath) | |
return (files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, | |
files_with_invalid_extensions) | |
def get_filelist_from_database(appconfig): | |
# pull them out and cache on startup or when first pulled? | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT inputpath FROM files;") | |
rows = c.fetchall() | |
conn.close() | |
filenames = [row[0] for row in rows] | |
return set(filenames) | |
def file_exists_in_database(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT filehashID FROM files, filehashes, hashtypes WHERE hashtypes.hashid = filehashes.hashid " | |
"AND files.fileID = filehashes.fileID AND hashtypes.hashname = 'sha1b32' AND filehashes.filehash = ?;", | |
(fileinfo['hashes']['sha1b32'],)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
return False | |
else: | |
return True | |
def get_sha1b32_from_database(appconfig): | |
# pull them out and cache on startup or when first pulled? | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
hash_id = get_hash_id_from_hash_name(appconfig, "sha1b32") | |
c.execute("SELECT filehash FROM filehashes WHERE hashid = ?;", (hash_id,)) | |
rows = c.fetchall() | |
conn.close() | |
hashes = [row[0] for row in rows] | |
return set(hashes) | |
def copy_file_to_store(appconfig, fileinfo): | |
"""Checks datastore for a file with identical sha1b32 hash. | |
if one exists, optionally delete the source file | |
optionally copy new file to separate directory for sharing purposes | |
""" | |
filename = fileinfo['inputfile'] | |
base_filename = os.path.split(filename)[-1] | |
base_filename_parts = os.path.splitext(base_filename) | |
file_ext = base_filename_parts[1].lower() | |
files_directory = os.path.join(appconfig.base_directory, 'files') | |
file_directory = os.path.join(files_directory, fileinfo['hashes']['sha1b32'][0:2]) | |
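    # store layout: <base_directory>/files/<first two chars of sha1b32>/<sha1b32><original extension>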
    # if not os.path.exists(file_directory):
# os.mkdir(file_directory) | |
target_filemask = os.path.join(file_directory, fileinfo['hashes']['sha1b32'] + '*') | |
dest_filename = os.path.join(file_directory, fileinfo['hashes']['sha1b32'] + file_ext) | |
listing = glob.glob(target_filemask) | |
file_copied = False | |
if len(listing) == 0: | |
# shutil.copyfile(filename, dest_filename) | |
file_copied = True | |
return file_copied | |
""" | |
def multiprocess(processes, samples, x, widths): | |
pool = mp.Pool(processes=processes) | |
results = [pool.apply_async(parzen_estimation, args=(samples, x, w)) for w in widths] | |
results = [p.get() for p in results] | |
results.sort() # to sort the results by input window width | |
return results | |
""" | |
def get_file_data(file): | |
""" | |
Generates hashes for file and other file info such as size, etc. | |
""" | |
# TODO can i use some kind of magic to determine mime type and forego extension? | |
fileinfo = {'inputfile': file, 'filesize': os.path.getsize(file), 'hashes': {}} | |
parts = os.path.splitext(file.lower()) | |
ext = '' | |
# imageexts = ('.jpg', '.jpeg', '.JPG', '.JPEG', '.png', '.PNG', '.bmp', '.tiff', '.gif', '.GIF') | |
if len(parts) == 2: | |
ext = parts[1] | |
ed2k = ED2KHash() | |
sha1 = hashlib.sha1() | |
md5 = hashlib.md5() | |
    filewidth = 0
    fileheight = 0
    imghash = 0
    imgdhash = 0  # sentinel defaults; overwritten below when the file is a hashable image
md4 = hashlib.new('md4') | |
f = open(file, 'rb') | |
# print('file extension = ', ext) | |
# if ext in imageexts: | |
try: | |
(filewidth, fileheight) = dimensions(file) | |
imghash = str(avhash(file)) | |
imgdhash = str(dhash(file)) | |
# imgrotavg = str(rotavhash(file)) | |
# vprint('\n[!] image hash (%s)' % imghash) | |
except (RuntimeError, TypeError, NameError, ValueError): | |
        # failed to get image dimensions/hashes; fall back to sentinel values
print('image check failed') | |
imghash = 0 | |
imgdhash = 0 | |
# imgrotavg =0 | |
filewidth = 0 | |
fileheight = 0 | |
pass | |
# print('\n[!]file width = %s, height = %s' % (filewidth, fileheight)) | |
if filewidth > 4000 or fileheight > 4000: | |
print('\n[!] image too large to hash (%dx%d)' % (filewidth, fileheight)) | |
imghash = 999 | |
imgdhash = 999 | |
# imgrotavg = 999 | |
pass | |
if filewidth == 161 and fileheight == 81: | |
# Size of empty imgur image ('not found!') | |
imghash = 503 | |
imgdhash = 503 | |
# imgrotavg = 503 | |
pass | |
# else: | |
# imghash = 9999 | |
# filewidth = 9999 | |
# fileheight = 9999 | |
buf = f.read(BUFFER_SIZE) | |
while buf != b'': | |
md5.update(buf) | |
sha1.update(buf) | |
# md4.update(buf) | |
# ed2k.update(buf) | |
# imghash.update(buf) | |
buf = f.read(BUFFER_SIZE) | |
f.close() | |
sha1b16 = sha1.hexdigest().upper() | |
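    # sha1b32: the SHA-1 digest re-encoded from hex to base32; it doubles as the file's name in the store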
sha1b32 = base64.b32encode(base64.b16decode(sha1b16.upper())).decode().upper() | |
edonkey = 1 # base64.b16encode(ed2k.digest()) | |
md4hash = 1 # md4.hexdigest().upper() | |
md5hash = md5.hexdigest().upper() | |
# fileinfo['hashes']['md4'] = md4hash | |
# fileinfo['hashes']['ed2k'] = 1 # edonkey.decode('utf-8').upper() | |
# fileinfo['hashes']['sha1b16'] = sha1b16 | |
fileinfo['hashes']['sha1b32'] = sha1b32 | |
fileinfo['hashes']['md5'] = md5hash | |
fileinfo['hashes']['imghash'] = imghash | |
fileinfo['hashes']['imgdhash'] = imgdhash | |
# fileinfo['hashes']['imgrotavg'] = imgrotavg | |
fileinfo['extension'] = ext.lower() | |
fileinfo['file_store_name'] = 0 | |
fileinfo['filewidth'] = filewidth | |
fileinfo['fileheight'] = fileheight | |
return fileinfo | |
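# get_file_data returns a dict shaped roughly like this (sketch based on the code above):
#     {'inputfile': <path>, 'filesize': <bytes>, 'extension': '.jpg',
#      'filewidth': <int>, 'fileheight': <int>, 'file_store_name': 0,
#      'hashes': {'sha1b32': ..., 'md5': ..., 'imghash': ..., 'imgdhash': ...}}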
# def generate_missing_hashes(appconfig, file): | |
# """ Given file, look for missing hashes, generate them, and update the | |
# database """ | |
# | |
# return "not done yet" | |
def setup_base_directory(directory): | |
try: | |
        # if not os.path.exists(directory):
# print('{} does not exist! Creating...'.format(directory)) | |
# os.mkdir(directory) | |
subdir = os.path.join(directory, 'files') | |
        # if not os.path.exists(subdir):
# os.mkdir(subdir) | |
except: | |
raise | |
def init_db(appconfig): | |
# create, setup tables | |
# one table is hashname | |
# another is for files that references hashname pk | |
# this allows for easy expanding if hashname is missing without schema changes | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("PRAGMA synchronous = OFF") | |
c.execute("PRAGMA journal_mode = MEMORY") | |
c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='hashtypes';") | |
row = c.fetchone() | |
if row is None: | |
print("!!!Database is missing. Creating...") | |
c.execute('''CREATE TABLE hashtypes | |
(hashID INTEGER PRIMARY KEY AUTOINCREMENT, hashname TEXT)''') | |
c.execute('''CREATE TABLE files | |
(fileID INTEGER PRIMARY KEY AUTOINCREMENT, inputpath TEXT, | |
filepath TEXT, filesize INTEGER, filewidth INTEGER, fileheight INTEGER, comment TEXT)''') | |
c.execute('''CREATE TABLE filehashes | |
(filehashID INTEGER PRIMARY KEY AUTOINCREMENT, hashID INTEGER, fileID INTEGER, filehash TEXT)''') | |
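        # filehashes is a join table keyed by (hashID -> hashtypes.hashID, fileID -> files.fileID);
        # e.g. a file's sha1b32 can be looked up with (sketch):
        #     SELECT filehash FROM filehashes
        #     JOIN hashtypes ON hashtypes.hashID = filehashes.hashID
        #     WHERE filehashes.fileID = ? AND hashtypes.hashname = 'sha1b32'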
conn.commit() | |
c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='importedpaths';") | |
row = c.fetchone() | |
if row is None: | |
print("!!!Table 'importedpaths' is missing!. Creating...") | |
c.execute('''CREATE TABLE importedpaths (pathID INTEGER PRIMARY KEY AUTOINCREMENT, importedpath TEXT, | |
imported_date TEXT, files_added_to_database INTEGER, total_files INTEGER, files_deleted INTEGER, | |
files_copied INTEGER, files_with_duplicate_hashes INTEGER, files_with_invalid_extensions INTEGER);''') | |
conn.commit() | |
# add indexes | |
c.execute("SELECT COUNT(*) FROM sqlite_master WHERE type = 'index';") | |
row = c.fetchone() | |
if row[0] == 0: | |
print("!!!Indexes are missing. Creating...") | |
c.execute('CREATE INDEX "IX_filehashes" ON "filehashes" ("filehash")') | |
print("!File hash index created") | |
c.execute('CREATE INDEX "IX_fileID" ON "filehashes" ("fileID")') | |
print("!FileID index created") | |
c.execute('CREATE INDEX "IU_inputpath" ON "files" ("inputpath", "filesize", "filewidth", "fileheight")') | |
print("!File querypath/file size index created") | |
c.execute('CREATE INDEX "IU_hashID_fileID" ON "filehashes" ("hashID", "filehash")') | |
print("!HashID/file hash index created\n") | |
c.execute('CREATE INDEX "IX_hashID" ON "filehashes" ("hashID")') | |
print("!File hash index created") | |
conn.commit() | |
conn.close() | |
def add_import_path_to_db(appconfig, path_name, files_added_to_database, total_files, files_deleted, files_copied, | |
files_with_duplicate_hashes, files_with_invalid_extensions): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"INSERT INTO importedpaths (importedpath, imported_date, files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, files_with_invalid_extensions) VALUES (?, ?, ?, ?, ?, ?, ?, ?);", | |
(path_name, datetime.datetime.now(), files_added_to_database, total_files, files_deleted, files_copied, | |
len(files_with_duplicate_hashes), len(files_with_invalid_extensions))) | |
conn.commit() | |
conn.close() | |
def check_import_path_in_db(appconfig, path_name): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT imported_date FROM importedpaths WHERE importedpath = ? ORDER BY imported_date DESC;", | |
(path_name,)) | |
rows = c.fetchall() | |
conn.close() | |
# 2014-02-05 10:22:30.214031 | |
dates = [datetime.datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S.%f').strftime('%x %X') for row in rows] | |
return dates | |
def generate_hash_list(appconfig, hash_type, suppress_file_info): | |
outfile = os.path.join(appconfig.base_directory, | |
"Exported hash list_" + datetime.datetime.now().strftime("%H%M%S%f") + '.tsv') | |
file_count = 0 | |
conn = sqlite3.connect(appconfig.database_file) | |
file_cursor = conn.execute("SELECT files.inputpath, files.filesize, files.fileID FROM files ORDER BY fileID") | |
if hash_type == 'all': | |
sql = 'SELECT hashid, hashname FROM hashtypes ORDER BY hashname ASC' | |
else: | |
sql = 'SELECT hashid, hashname FROM hashtypes WHERE hashname = "{}" ORDER BY hashname ASC'.format(hash_type) | |
hash_types_cursor = conn.execute(sql) | |
with open(outfile, 'w+', encoding="utf-16") as logfile: | |
header = ['relative_path', 'file_size'] | |
if suppress_file_info: | |
header.clear() | |
hash_types = {} | |
for hash_type_row in hash_types_cursor: | |
header.append(hash_type_row[1]) | |
hash_types[hash_type_row[0]] = hash_type_row[1] | |
logfile.write('\t'.join(header) + "\n") | |
for file_row in file_cursor: | |
file_count += 1 | |
file_id = file_row[2] | |
# hash_types contains the id and hash name for all known hashes. for each of those, get that hash for | |
# active file. if not present, tell the user | |
row_values = [file_row[0], str(file_row[1])] # this is what will build out each row | |
if suppress_file_info: | |
row_values.clear() | |
for hash_id in sorted(hash_types, | |
key=hash_types.get): # sort it according to the hash names so the order is correct | |
hash_cursor = conn.execute( | |
"SELECT filehashes.filehash, hashtypes.hashname FROM hashtypes INNER JOIN filehashes ON " | |
"filehashes.hashID = hashtypes.hashID WHERE filehashes.fileID = ? AND filehashes.hashID = ? " | |
"ORDER BY hashtypes.hashname ASC;", | |
(file_id, hash_id)) | |
row = hash_cursor.fetchone() | |
                if row is not None:
row_values.append(row[0]) | |
else: | |
row_values.append("Hash '{}' missing in database!".format(hash_types[hash_id])) | |
hash_cursor.close() | |
logfile.write('\t'.join(row_values) + "\n") | |
conn.close() | |
return file_count, outfile | |
def import_files(appconfig, directories): | |
""" | |
Attempts to recursively import files from values in directories and writes log files with actions taken | |
@param appconfig: Configuration data | |
@param directories: a list of directories to import from | |
""" | |
print("Importing from '{}'".format(",".join(directories))) | |
for directory in directories: | |
directory = directory.strip() | |
if os.path.isdir(directory): | |
import_history = check_import_path_in_db(appconfig, directory) | |
if len(import_history) > 0: | |
answer = input( | |
"\n\n**** '{}' has already been imported on:\n\n{}\n\nContinue: [y|N]: ".format(directory, | |
'\n'.join( | |
import_history))) | |
if not answer.lower() == 'y': | |
print("**** Skipping '{}'\n".format(directory)) | |
continue | |
(files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, | |
files_with_invalid_extensions) = import_files_work(appconfig, directory) | |
add_import_path_to_db(appconfig, directory, files_added_to_database, total_files, files_deleted, | |
files_copied, files_with_duplicate_hashes, files_with_invalid_extensions) | |
print( | |
'\n' + '*' * 4 + """ {:,d} total files found. {:,d} copied to file store and {:,d} files were added to the database. {:,d} files had duplicate hashes. {:,d} files had invalid extensions (see log file for details)""".format( | |
total_files, files_copied, files_added_to_database, len(files_with_duplicate_hashes), | |
len(files_with_invalid_extensions))) | |
            directory_clean = re.sub(r'[^\w\-_\. ]', '_', directory)
logfile_name = os.path.join(appconfig.base_directory, | |
"Import log for " + directory_clean + " " + datetime.datetime.now().strftime( | |
"%H%M%S%f") + '.txt') | |
with open(logfile_name, 'w+', encoding="utf-16") as logfile: | |
logfile.write('Directory processed: {}\n\n'.format(directory)) | |
logfile.write('Files found: {:,d}\n'.format(total_files)) | |
logfile.write('Files copied to file store: {:,d}\n'.format(files_copied)) | |
logfile.write('Files added to database: {:,d}\n'.format(files_added_to_database)) | |
logfile.write('Files with duplicate hashes: {:,d}\n\n'.format(len(files_with_duplicate_hashes))) | |
if files_deleted > 0: | |
logfile.write('Number of deleted files: {:,d}\n\n'.format(files_deleted)) | |
logfile.write('*' * 78 + '\n\n') | |
logfile.write('The following files had duplicate hashes and were not imported:\n\n') | |
for item in files_with_duplicate_hashes: | |
logfile.write("{}\n".format(item)) | |
logfile.write('\n\nThe following files had invalid extensions and were not imported:\n\n') | |
for item in files_with_invalid_extensions: | |
logfile.write("{}\n".format(item)) | |
if appconfig.delete_existing and files_deleted > 0: | |
print(' ' * 5 + '{:,d} files were deleted'.format(files_deleted)) | |
else: | |
print("\t'{}' does not exist!".format(directory)) | |
# after import, tell the user to see generated logs (one per directory) in the main directory | |
# but only if we actually attempted to import something | |
if len(directories) > 0 and 'logfile_name' in locals(): | |
print("\n\nSee log files in {} for details.".format(appconfig.base_directory)) | |
def get_hash_id_from_hash_name(appconfig, hash_name): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT hashID FROM hashtypes WHERE hashname = ?;", (hash_name,)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
return -1 | |
else: | |
return int(row[0]) | |
def check_file_exists_in_database(appconfig, hash_id, hash_value): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT files.inputpath, files.filesize FROM filehashes JOIN files ON files.fileID = filehashes.fileID " | |
"WHERE filehashes.hashID = ? AND filehashes.filehash = ?;", | |
(hash_id, hash_value)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
db_info = ('', 0) | |
else: | |
db_info = (row[0], row[1]) | |
return db_info | |
def get_database_delta(appconfig, hash_set, hash_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
sql = "SELECT files.fileID, files.inputpath FROM filehashes INNER JOIN files ON files.fileID = filehashes.fileID WHERE filehashes.hashID = ? AND filehashes.filehash NOT in ({0})".format( | |
', '.join('?' for _ in hash_set)) | |
params = hash_set | |
params.insert(0, str(hash_id)) | |
c.execute(sql, params) | |
rows = c.fetchall() | |
conn.close() | |
return rows | |
def get_hash_from_hash_id_and_file_id(appconfig, hash_id, file_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT filehashes.filehash FROM filehashes WHERE filehashes.hashID = ? AND filehashes.fileID = ?;", | |
(hash_id, file_id)) | |
row = c.fetchone() | |
conn.close() | |
if row is None: | |
return False | |
else: | |
return row[0] | |
def build_new_out_path(export_directory, new_hash, file_name): | |
front = "files\\" + new_hash[0:2] | |
mid = new_hash | |
    ext = os.path.splitext(file_name)[-1]  # file_name is a path string; take its extension
out_path = os.path.join(export_directory, front, mid + ext.lower()) | |
return out_path | |
def copy_file(abs_path, log_file, out_path): | |
if not os.path.exists(os.path.dirname(out_path)): | |
os.makedirs(os.path.dirname(out_path)) | |
log_file.write("Copying '{}' to '{}'\n".format(abs_path, out_path)) | |
# shutil.copyfile(abs_path, out_path) | |
def get_existing_hash_list(appconfig, hash_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT fileID, filehash FROM filehashes WHERE filehashes.hashID = ?;", (hash_id,)) | |
existing_hashes = {} | |
# row_count = 0 | |
record = c.fetchone() | |
while record: | |
# if row_count % 1000000 == 0: | |
# print("{}: Database rows fetched: {:,d}".format(datetime.datetime.now().strftime('%x %X'), row_count)) | |
existing_hashes[record[1]] = record[0] | |
record = c.fetchone() | |
# row_count += 1 | |
conn.close() | |
return existing_hashes | |
def get_file_from_db(appconfig, file_id): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT inputpath FROM files WHERE fileID = ?;", (file_id,)) | |
record = c.fetchone() | |
conn.close() | |
return record[0] | |
def export_files(appconfig, export_existing, file_name): | |
""" | |
Copies files from file store to a directory | |
@param appconfig: basic config data | |
@param export_existing: if true, export files in input file that are also in file store, else, export the opposite | |
@param file_name: the file to read hash type and hashes from | |
""" | |
hash_file = open(file_name) | |
hash_name = hash_file.readline().strip().lower() | |
hash_id = get_hash_id_from_hash_name(appconfig, hash_name) | |
if hash_id == -1: | |
print("Unknown hash type: '{}'. Export cancelled!".format(hash_name)) | |
return | |
datetime_string = datetime.datetime.now().strftime("%H%M%S%f") | |
export_directory = os.path.join(appconfig.export_directory, | |
"Export run " + datetime_string + " for {}".format(hash_name)) | |
if not os.path.exists(export_directory): | |
os.makedirs(export_directory) | |
log_name = os.path.join(export_directory, | |
"Export log " + datetime_string + '.txt') | |
log_file = open(log_name, 'w', encoding="utf-16") | |
log_file.write("Looking for hashes in '{}'\n\n".format(file_name)) | |
log_file.write("Hash type: {}\n".format(hash_name)) | |
print("\t\tHash type: {}\n".format(hash_name)) | |
log_file.write("Zip exported: {}\n".format(appconfig.zip_exported)) | |
log_file.write("Rename exported: {}\n\n".format(appconfig.rename_exported)) | |
if export_existing: | |
export_type = "Existing" | |
else: | |
export_type = "Delta" | |
log_file.write("Export operation: {}\n\n".format(export_type)) | |
log_file.write("Copy log\n\n") | |
found_files = 0 | |
hash_count = 0 | |
# TODO collect operations in a single list then iterate/copy after so as to remove duplicate code in loops for each | |
if export_existing: | |
for line in hash_file: | |
line = line.strip() | |
hash_count += 1 | |
(file_path, file_size) = check_file_exists_in_database(appconfig, hash_id, line) | |
# TODO This needs cleaned up in regard to the paths. the database should store things in one format | |
# right now its all bunged up | |
if file_path: | |
print( | |
"\t\t({:,d}) File with hash '{}' found! Copying {:,d} bytes...".format(hash_count, line, file_size)) | |
found_files += 1 | |
abs_path = os.path.join(appconfig.base_directory, file_path) | |
if not os.path.isfile(abs_path): | |
front, ext = os.path.splitext(abs_path) | |
abs_path = front + ext.lower() | |
abs_path = abs_path.replace("\\", "/") | |
if appconfig.rename_exported and not hash_name == 'sha1b32': # the default is sha1b32 | |
out_path = build_new_out_path(export_directory, line, file_path.replace("\\", "/")) | |
else: | |
out_path = os.path.join(export_directory, file_path.replace("\\", "/")) | |
print("Copying '{}' to '{}'\n".format(abs_path, out_path)) | |
copy_file(abs_path, log_file, out_path) # TODO Error handling here | |
else: | |
print("Getting hashes from file...") | |
hashes = [line.strip() for line in hash_file] | |
hash_set = set(hashes) # get rid of any dupes | |
hash_count = len(hash_set) | |
file_count = 0 | |
print("Found {:,d} hashes in file!".format(hash_count)) | |
        # sql alone won't work for the delta, so:
# export entire DB for hash_id to file containing: file_id and hash for hash_id | |
# once done, read that into dictionary with hash: fileid | |
# loop thru hash_set and remove similar items from dictionary | |
# when done, export files remaining in dictionary | |
print("Getting existing hashes from database...") | |
existing_hash_list = get_existing_hash_list(appconfig, hash_id) | |
print("Found {:,d} hashes in database!".format(len(existing_hash_list))) | |
for hash in hash_set: | |
if hash in existing_hash_list: | |
del existing_hash_list[hash] | |
print("After pruning there are {:,d} hashes to export.".format(len(existing_hash_list))) | |
for value in existing_hash_list.values(): | |
# value is fileID for the file, so now we can get info on the file and export | |
db_name = get_file_from_db(appconfig, value) | |
if db_name: | |
abs_path = os.path.join(appconfig.base_directory, db_name) | |
if not os.path.isfile(abs_path): | |
front, ext = os.path.splitext(abs_path) | |
abs_path = front + ext.lower() | |
abs_path = abs_path.replace("\\", "/") | |
if appconfig.rename_exported and not hash_name == 'sha1b32': # the default is sha1b32 | |
# sigh. we have to now get the appropriate hash value from the database and do trickery based on that | |
# we know the file id, so we can get the hash for the corresponding hash_type from the database | |
# since we also know the hash_id | |
new_hash = get_hash_from_hash_id_and_file_id(appconfig, hash_id, value) | |
out_path = build_new_out_path(export_directory, new_hash, db_name) | |
else: | |
out_path = os.path.join(export_directory, db_name.replace("\\", "/")) | |
# print("abs_path is {}".format(abs_path)) | |
# print("out_path is {}".format(out_path)) | |
file_count += 1 | |
print("[{:,d}/{:,d}] Copying '{}' to '{}'\n".format(file_count, len(existing_hash_list), abs_path, | |
out_path)) | |
copy_file(abs_path, log_file, out_path) # TODO Error handling here | |
hash_file.close() | |
log_file.close() | |
if appconfig.zip_exported: | |
zip_name = os.path.join(appconfig.export_directory, | |
"Exported " + hash_name + " " + datetime_string + ".zip") | |
print("\t\tZipping files to '{}'\n".format(zip_name)) | |
z_file = zipfile.ZipFile(zip_name, "w") | |
for dirpath, dirnames, filenames in scandir.walk(export_directory): | |
for filename in filenames: | |
full_name = os.path.join(export_directory, dirpath, filename) | |
if full_name.endswith("txt"): | |
archive_name = os.path.basename(full_name) | |
else: | |
parts = full_name.split("\\") | |
archive_name = "\\".join(str(parts[-3:])) | |
z_file.write(full_name, archive_name) | |
z_file.close() | |
print("\t\tRemoving '{} since export was zipped to {}...'\n".format(export_directory, zip_name)) | |
# shutil.rmtree(export_directory) | |
print("\n\t\tSaw {:,d} {} hashes in '{}'. Files found: {:,d}. See '{}' for details.".format(hash_count, hash_name, | |
file_name, found_files, | |
log_name)) | |
def get_stats(appconfig, stats_level): | |
# total files | |
# total size | |
total_store_files = 0 | |
total_store_size = 0 | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT COUNT(fileID) FROM files") | |
row = c.fetchone() | |
total_db_files = row[0] or 0 | |
c.execute("SELECT sum(filesize) FROM files") | |
row = c.fetchone() | |
total_db_size = row[0] or 0 | |
conn.close() | |
if stats_level == 'full': | |
for r, d, files in scandir.walk(os.path.join(appconfig.base_directory, "files")): | |
total_store_files += len(files) | |
for file in files: | |
total_store_size += os.path.getsize(os.path.join(r, file)) | |
return total_db_files, total_db_size, total_store_files, total_store_size | |
def bytes_to_human(byte_value, to, bsize=1024): | |
"""convert byte_value to megabytes, etc. | |
sample code: | |
        print('mb= ' + str(bytes_to_human(314575262000000, 'm')))
sample output: | |
mb= 300002347.946 | |
""" | |
if byte_value is None: | |
return float(0) | |
a = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6} | |
r = float(byte_value) | |
for i in range(a[to]): | |
r /= bsize | |
return r | |
def dump_stats(appconfig, print_stats): | |
print("\n*** Database statistics ***\n") | |
if print_stats == 'full': | |
print("\t *** Please be patient while file store statistics are calculated. This may take a while! ***\n") | |
(total_db_files, total_db_size, total_store_files, total_store_size) = get_stats(appconfig, print_stats) | |
print("Total files in database: {:,d}".format(total_db_files)) | |
print("Total size of files in database: {:,d} bytes ({:,f} MB, {:,f} GB, {:,f} TB)\n".format(total_db_size, | |
bytes_to_human( | |
total_db_size, | |
'm'), | |
bytes_to_human( | |
total_db_size, | |
'g'), | |
bytes_to_human( | |
total_db_size, | |
't'))) | |
if print_stats == 'full': | |
print("Total files in file store: {:,d}".format(total_store_files)) | |
print("Total size of files in file store: {:,d} bytes ({:,f} MB, {:,f} GB, {:,f} TB)\n".format(total_store_size, | |
bytes_to_human( | |
total_store_size, | |
'm'), | |
bytes_to_human( | |
total_store_size, | |
'g'), | |
bytes_to_human( | |
total_store_size, | |
't'))) | |
    # the store totals are only populated when the file store was actually walked ('full'),
    # so only compare database vs store in that mode
    if print_stats == 'full':
        count_discrepancy = total_db_files != total_store_files
        size_discrepancy = total_db_size != total_store_size
        if size_discrepancy or count_discrepancy:
            print("\n*** WARNING ***")
            if size_discrepancy:
                print(
                    "There is a discrepancy between the size of files in the database ({:,d}) and the file store ({:,d})! Delta: {:,d} bytes".format(
                        total_db_size, total_store_size, total_db_size - total_store_size))
            if count_discrepancy:
                print(
                    "There is a discrepancy between the number of files in the database ({:,d}) and the file store ({:,d})! Delta: {:,d}".format(
                        total_db_files, total_store_files, total_db_files - total_store_files))
            print("**It is recommended to use the --verify switch to correct this.")
        else:
            print("Database and file store appear to be in sync!\n\n")
def check_db_to_fs(appconfig): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT fileid, inputpath FROM files ORDER BY inputpath") | |
bad_files = [] | |
for row in c: | |
full_path = os.path.join(appconfig.base_directory, row[1]).lower() | |
if not os.path.isfile(full_path): | |
bad_files.append(row[0]) | |
print("\t{} is in database but does not exist in file store!".format(full_path)) | |
conn.close() | |
return bad_files | |
def get_files_from_db(appconfig): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT inputpath FROM files") | |
file_names = [] | |
for row in c: | |
file_names.append(row[0]) | |
conn.close() | |
return file_names | |
def check_fs_to_db(appconfig): | |
bad_files = [] | |
db_file_names = get_files_from_db(appconfig) | |
for r, d, files in scandir.walk(os.path.join(appconfig.base_directory, "files")): | |
for file in files: | |
full_path = os.path.join(r, file) | |
db_path = full_path.replace(appconfig.base_directory, "") | |
db_path = db_path[1:] | |
if not db_path in db_file_names: | |
bad_files.append(full_path) | |
print("\t{} is in file store but does not exist in database!".format(full_path)) | |
return bad_files | |
def get_fileid_from_fileinfo(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
hashid = get_hash_id_from_hash_name(appconfig, 'sha1b32') | |
c.execute("SELECT fileid FROM FILEHASHES WHERE hashID = ? AND filehash = ?;", | |
(hashid, fileinfo['hashes']['sha1b32'])) | |
row = c.fetchone() | |
conn.close() | |
return row[0] | |
def delete_files_from_db(appconfig, files): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
sql = "DELETE FROM FILEHASHES WHERE fileID in ({})".format( | |
', '.join('?' for _ in list(files))) | |
c.execute(sql, files) | |
sql = "DELETE FROM files WHERE fileID in ({})".format( | |
', '.join('?' for _ in list(files))) | |
c.execute(sql, files) | |
conn.commit() | |
conn.close() | |
def delete_file_from_db(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
fileid = get_fileid_from_fileinfo(appconfig, fileinfo) | |
c.execute("DELETE FROM filehashes WHERE fileid = ?;", (fileid,)) | |
conn.commit() | |
c.execute("DELETE FROM files WHERE fileid = ?;", (fileid,)) | |
conn.commit() | |
conn.close() | |
def verify(appconfig): | |
print("*** File manager verification ***\n") | |
print("Beginning stage 1 (comparing database against file store)...") | |
db_to_fs_bad = check_db_to_fs(appconfig) | |
if len(db_to_fs_bad) == 0: | |
print("Stage 1 complete. No inconsistencies detected between database and file system.") | |
print("\nBeginning stage 2 (comparing file store against database)...") | |
fs_to_db_bad = check_fs_to_db(appconfig) | |
if len(fs_to_db_bad) == 0: | |
print("Stage 2 complete. No inconsistencies detected between file system and database.") | |
if len(fs_to_db_bad) == 0 and len(db_to_fs_bad) == 0: | |
print("\n\nNo inconsistencies detected!") | |
else: | |
# we have to fix things | |
print("\n\nFound {:,d} database and {:,d} file system inconsistencies.".format(len(db_to_fs_bad), | |
len(fs_to_db_bad))) | |
fix_it = input("\nDo you want to fix these issues? [Y|n]: ") | |
if not fix_it.lower() == 'n': | |
print("\nDeleting bad records from database...", end='') | |
delete_files_from_db(appconfig, db_to_fs_bad) | |
print("Deleted {:,d} records from database!".format(len(db_to_fs_bad))) | |
# set up a clean staging area for files to be imported from | |
verify_directory = os.path.join(appconfig.base_directory, "verify") | |
            # if os.path.isdir(verify_directory):
# shutil.rmtree(verify_directory) | |
# os.mkdir(verify_directory) | |
print("Adding files to database...") | |
for file in fs_to_db_bad: | |
fileinfo = get_file_data(file) | |
if file_exists_in_database(appconfig, fileinfo): | |
# nuke it to be clean | |
delete_file_from_db(appconfig, fileinfo) | |
# move each file to a staging directory, then call import work on it. done | |
head, tail = os.path.split(file) | |
to_file = os.path.join(verify_directory, tail) | |
unique_prefix = 0 | |
while os.path.isfile(to_file): | |
# file exists, so get a unique name | |
to_file = os.path.join(verify_directory, str(unique_prefix) + "_" + tail) | |
unique_prefix += 1 | |
# shutil.move(file, to_file) | |
(files_added_to_database, total_files, files_deleted, files_copied, files_with_duplicate_hashes, | |
files_with_invalid_extensions) = import_files_work(appconfig, verify_directory) | |
# shutil.rmtree(verify_directory) | |
print("\nAdded {:,d} files to database!".format(files_added_to_database)) | |
print("\n\n*** Repair complete! ***") | |
def main(): | |
parser = argparse.ArgumentParser( | |
description="""File manager that can import files, | |
export file sets based on a list of hashes, export files NOT in a list, etc.""", epilog=""" | |
This program can be used to manage files of any type. Before use, adjust the value of | |
'extensions' at the top of the file. Only files having an extension in this set will be | |
imported. A list of files that weren't imported will be documented in a log file when | |
the import operation finishes. | |
""") | |
parser.add_argument("base_directory", help="""The root directory where files | |
will live. This is also where the database of file info will | |
be created. Enclose directories with spaces in double quotes. | |
This should be the first argument provided. | |
""") | |
parser.add_argument("--print_stats", choices=['lite', 'full'], help="""'lite' will produce statistics from | |
information in the database only. 'full' will look at both the database and file store. | |
""") | |
parser.add_argument("--verify", action="store_true", help="""Perform consistency check. | |
Stage 1 is verifying what is in the database against what is in the file store. | |
Stage 2 is verifying what is in the file store against the database. | |
When comparison is complete, the results are displayed and, if any issues are found, | |
options presented to correct any inconsistencies. | |
""") | |
import_group = parser.add_argument_group('Import options', 'These options determine how files are imported') | |
import_group.add_argument( | |
"--import_from", help="""List of comma separated directories to import | |
files from. Enclose directories with spaces in double quotes. Directories should | |
        NOT have trailing slashes (i.e. C:\\foo is OK, but C:\\bar\\ is NOT OK).
""", metavar='PATHS_TO_IMPORT_FROM') | |
import_group.add_argument( | |
"--delete_existing", choices=['yes', 'simulate'], help="""When importing, delete source files if | |
they already exist in file store. If set to 'simulate' files | |
will not actually be deleted. This is useful to see what | |
would happen as a result of using this flag without actually | |
deleting files. | |
""") | |
import_group.add_argument( | |
"--delete_empty_directories", choices=['yes', 'simulate'], help="""When importing, delete any empty directories found. | |
If set to 'simulate' directories will not actually be deleted. | |
""") | |
import_group.add_argument("--copy_new_destination", help="""The directory to copy any newly imported files into. | |
No renaming of files (except when conflicts exist) will be done. | |
If directory name has spaces, enclose it in double quotes | |
""", metavar='PATH_TO_DIRECTORY') | |
generate_group = parser.add_argument_group('Generate hash list options', | |
'These options determine how hash lists are generated') | |
generate_group.add_argument("--generate_hash_list", help="""Creates a CSV file of all hashes in the database. Also | |
includes the relative querypath to the file. The file will be saved to | |
the file manager's base directory | |
""", choices=['all', 'ed2k', 'md4', 'md5', 'sha1b16', 'sha1b32']) | |
generate_group.add_argument("--suppress_file_info", help="""When true, prevents relative file querypath and file size | |
from being included in the hash list. This is handy to generate | |
hash lists to import into X-Ways Forensics, etc. | |
""", action="store_true") | |
export_group = parser.add_argument_group('Export options', | |
'These options allow for exporting files in several ways.') | |
# because crazy people may try to do both at once... | |
export_group_exclusive = export_group.add_mutually_exclusive_group() | |
export_group_exclusive.add_argument("--export_existing", help="""Export a copy of files in PATH_TO_TEXT_FILE to | |
--export_directory. The first line of the file should | |
be the hash type to query: md5, sha1b16, sha1b32, ed2k, or md4, | |
followed by one hash per line. Enclose paths with spaces | |
in double quotes. | |
""", metavar='PATH_TO_TEXT_FILE') | |
export_group_exclusive.add_argument("--export_delta", help="""Export a copy of files | |
NOT in PATH_TO_TEXT_FILE to --export_directory. The first line of the file should | |
be the hash type to query: md5, sha1b16, sha1b32, ed2k, or md4, | |
followed by one hash per line. Enclose paths with spaces | |
in double quotes. | |
This is useful to synchronize two different file manager instances | |
by 1) using --generate_hash_list on one instance and then 2) | |
using this option on the file from step 1. The resultant files | |
can then be imported into the instance from step 1. | |
""", metavar='PATH_TO_TEXT_FILE') | |
export_group.add_argument("--export_directory", help="""The target directory when using --export_files_in_list or | |
--export_files_not_in_list options. Enclose directories with spaces | |
in double quotes. | |
""", metavar='PATH_TO_DIRECTORY') | |
export_group.add_argument("--rename", help="""When true, all exported files will be renamed to match | |
the hash type from the provided file listing. | |
""", action="store_true") | |
export_group.add_argument("--zip", help="""When true, all exported files will be added to a zip | |
archive in --export_directory. | |
""", action="store_true") | |
# this stores our application parameters so it can get passed around to functions | |
appconfig = ApplicationConfiguration() | |
args = parser.parse_args() | |
if args.delete_existing: | |
appconfig.delete_existing = args.delete_existing | |
if args.delete_empty_directories: | |
appconfig.delete_empty_directories = args.delete_empty_directories | |
if args.copy_new_destination: | |
appconfig.copy_new_destination = args.copy_new_destination | |
if args.base_directory: | |
appconfig.base_directory = args.base_directory | |
setup_base_directory(appconfig.base_directory) | |
appconfig.database_file = os.path.join(appconfig.base_directory, appconfig.database_name) | |
print('\n\n') | |
init_db(appconfig) | |
# Process things in a sane order so things later down the list of options are as complete as possible | |
if args.verify: | |
verify(appconfig) | |
if args.import_from: # since at least something was passed to this argument, lets try to import | |
if extensions.intersection(auto_delete_extensions): | |
print( | |
"Cannot import files as there is at least one extension in common between 'extensions' and 'auto_delete_extensions: {}".format( | |
", ".join(extensions.intersection(auto_delete_extensions)))) | |
else: | |
directories = args.import_from.split(",") | |
import_files(appconfig, directories) | |
if args.generate_hash_list: | |
(files_processed, hash_path) = generate_hash_list(appconfig, args.generate_hash_list, args.suppress_file_info) | |
if files_processed: | |
print("\n\nHashes for {} files have been exported to '{}'\n".format(files_processed, hash_path)) | |
else: | |
print("\n\nNothing to export! The database is empty!\n") | |
if args.export_existing or args.export_delta: | |
if args.export_directory: | |
appconfig.export_directory = os.path.normpath(args.export_directory) | |
print("\tExport directory set to: {}".format(appconfig.export_directory)) | |
if not os.path.exists(appconfig.export_directory): | |
print("\tExport directory does not exist. Creating...") | |
os.makedirs(appconfig.export_directory) | |
if args.rename: | |
appconfig.rename_exported = True | |
if args.zip: | |
appconfig.zip_exported = True | |
file_name = "" | |
if args.export_existing: | |
file_name = args.export_existing | |
elif args.export_delta: | |
file_name = args.export_delta | |
if os.path.isfile(file_name): | |
export_files(appconfig, bool(args.export_existing), file_name) | |
else: | |
print("\t{} does not exist! Export cancelled!".format(file_name)) | |
else: | |
print("\t--export_directory must be set when exporting files! Export cancelled.") | |
# see whats set in appconfig | |
# attrs = vars(appconfig) | |
# print('\n'.join("%s: %s" % item for item in attrs.items())) | |
# TODO have a built in web mode to allow searching, exporting etc? | |
# TODO Add error handling/try catch, etc | |
# TODO make backup of SQLite DB on startup (if newer than last) | |
# TODO add --purge_files that takes a list of files and cleans file store and DB of those hashes | |
if args.print_stats: | |
dump_stats(appconfig, args.print_stats) | |
if not args.export_delta and not args.export_existing and not args.generate_hash_list and not args.import_from and not args.print_stats and not args.verify: | |
print("You didn't ask me to do anything, so here are some statistics:") | |
dump_stats(appconfig, 'lite') | |
if __name__ == '__main__': | |
main() |
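# Example invocations (sketch; 'filemgr.py' and the paths are hypothetical, the flags are
# the ones defined in main() above):
#     python filemgr.py "D:\filestore" --import_from "D:\incoming,E:\more files" --delete_existing simulate
#     python filemgr.py "D:\filestore" --generate_hash_list md5 --suppress_file_info
#     python filemgr.py "D:\filestore" --export_existing hashes.txt --export_directory "D:\out" --zip
#     python filemgr.py "D:\filestore" --print_stats full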
# ============================================================================
# Second gist file: a pared-down variant of the importer that stores files per
# user/album (users, albums, filehashes, files tables) instead of the generic
# hashtypes schema above.
# ============================================================================
import os | |
import hashlib | |
from ImageHash import avhash, dhash, dimensions # , rotavhash | |
import base64 | |
import sqlite3 | |
import datetime | |
import re | |
import sys | |
import scandir | |
from struct import error as structerror | |
# from multiprocessing import Pool | |
# from multiprocessing.dummy import Pool as ThreadPool | |
# TODO use pathlib vs os.path calls? pathlib is 3.4+ only
# http://docs.sqlalchemy.org/en/rel_0_9/orm/tutorial.html ?? | |
# http://docs.python.org/3.4/howto/logging-cookbook.html | |
BUFFER_SIZE = 65536 # 8192 # file reading buffer size 8192 * 64? | |
excludedextensions = ['.txt', '.ini', '.log'] | |
class ED2KHash(object): | |
MAGICLEN = 9728000 | |
def __init__(self): | |
self.hashes = [] | |
self.pos = 0 | |
self.md4 = hashlib.new('md4') | |
def update(self, data): | |
data_len = len(data) | |
for d in (data[i:i + ED2KHash.MAGICLEN] for i in range(0, data_len, ED2KHash.MAGICLEN)): | |
self._update(d) | |
def _update(self, data): | |
data_len = len(data) | |
assert data_len <= ED2KHash.MAGICLEN | |
newpos = self.pos + data_len | |
if newpos < ED2KHash.MAGICLEN: | |
self.md4.update(data) | |
self.pos = newpos | |
return | |
else: | |
prev = data[:ED2KHash.MAGICLEN - self.pos] | |
next_val = data[ED2KHash.MAGICLEN - self.pos:] | |
self.md4.update(prev) | |
self.hashes.append(self.md4.digest()) | |
self.md4 = hashlib.new('md4') | |
self.md4.update(next_val) | |
self.pos = len(next_val) | |
return | |
def digest(self): | |
if len(self.hashes) == 0: | |
return self.md4.digest() | |
else: | |
m = hashlib.new('md4') | |
newhashes = self.hashes + [self.md4.digest()] | |
m.update(b''.join(newhashes)) | |
return m.digest() | |
class ApplicationConfiguration(object): | |
""" | |
Holds configuration values used in various places | |
""" | |
def __init__(self): | |
self.__database_name = 'filemgr.db' | |
self.__base_directory = '' | |
self.__database_file = '' | |
def get_database_name(self): | |
return self.__database_name | |
def set_database_name(self, database_name): | |
self.__database_name = database_name | |
database_name = property(get_database_name, set_database_name) | |
def get_base_directory(self): | |
return self.__base_directory | |
def set_base_directory(self, base_directory): | |
self.__base_directory = base_directory | |
base_directory = property(get_base_directory, set_base_directory) | |
def get_database_file(self): | |
return self.__database_file | |
def set_database_file(self, database_file): | |
self.__database_file = database_file | |
database_file = property(get_database_file, set_database_file) | |
def safeprint(s): | |
try: | |
print(s) | |
except UnicodeEncodeError: | |
print(s.encode('utf8').decode(sys.stdout.encoding)) | |
def add_insert_username(appconfig, username): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT userID FROM users WHERE users.username = ?;", (username,)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO users (username) VALUES (?);", (username,)) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
def add_insert_albumname(appconfig, albumname): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT albumID FROM albums WHERE albums.albumname = ?;", (albumname,)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO albums (albumname) VALUES (?);", (albumname,)) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
def add_insert_hash(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute( | |
"SELECT filehashID FROM filehashes WHERE filehashes.sha1b32 = ?;", (fileinfo['sha1b32'],)) | |
row = c.fetchone() | |
if row is None: | |
# insert last_insert_rowid() | |
c.execute("INSERT INTO filehashes (imgdhash,imghash,md5,sha1b32) VALUES (?,?,?,?);", | |
(fileinfo['imgdhash'], fileinfo['imghash'], fileinfo['md5'], fileinfo['sha1b32'])) | |
conn.commit() | |
rowid = c.lastrowid | |
else: | |
rowid = row[0] | |
conn.close() | |
return rowid | |
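# add_insert_username(), add_insert_albumname() and add_insert_hash() all follow the same | |
# get-or-create pattern: SELECT the row's id, INSERT and take lastrowid if it is absent. | |
# The two single-column cases could share one helper; a sketch (not wired in, table and | |
# column names are the ones created in init_db below): | |
# | |
#   def _get_or_create(appconfig, table, id_col, key_col, value): | |
#       conn = sqlite3.connect(appconfig.database_file) | |
#       c = conn.cursor() | |
#       c.execute("SELECT {} FROM {} WHERE {} = ?;".format(id_col, table, key_col), (value,)) | |
#       row = c.fetchone() | |
#       if row is None: | |
#           c.execute("INSERT INTO {} ({}) VALUES (?);".format(table, key_col), (value,)) | |
#           conn.commit() | |
#           rowid = c.lastrowid | |
#       else: | |
#           rowid = row[0] | |
#       conn.close() | |
#       return rowid | |
# | |
#   userid = _get_or_create(appconfig, 'users', 'userID', 'username', fileinfo['username']) | |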
def add_file_to_db(appconfig, fileinfo): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
# check if hashtypes has an entry for each hash in hashes | |
# print(fileinfo) | |
userid = add_insert_username(appconfig, fileinfo['username']) | |
albumid = add_insert_albumname(appconfig, fileinfo['albumname']) | |
# add file to files table | |
hashid = add_insert_hash(appconfig, fileinfo) | |
c.execute("INSERT INTO files (userid, inputpath, post, comment, filename, extension, albumID, " | |
"albumindex, fileindex, filesize, filewidth, fileheight, filehashID) " | |
"VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?);", | |
(userid, fileinfo['inputfile'], fileinfo['post'], fileinfo['comment'], | |
fileinfo['filename'], fileinfo['extension'], albumid, fileinfo['albumindex'], | |
fileinfo['fileindex'], fileinfo['filesize'], fileinfo['filewidth'], | |
fileinfo['fileheight'], hashid)) | |
conn.commit() | |
conn.close() | |
def add_missing_to_db(appconfig, missingfiles): | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
# c.execute("DELETE FROM missingpaths") | |
# conn.commit() | |
c.executemany("INSERT OR IGNORE INTO missingpaths (inputpath) VALUES (?)", missingfiles) | |
conn.commit() | |
c.execute("DELETE FROM files WHERE files.inputpath IN (SELECT inputpath FROM missingpaths)") | |
print("Deleted {:,d} files from DB that no longer existed in file system" | |
.format(len(missingfiles))) | |
print("Deleted entries have been saved to missingpaths table") | |
conn.commit() | |
conn.close() | |
def import_files_work(appconfig, dirname): | |
total_files = 0 | |
files_added_to_database = 0 | |
dbfiles = set() | |
    # Looking up each path in the DB individually is slow, so pull them all into a set and check membership there | |
print("Getting existing file locations from database...", end='') | |
existing_files = get_filelist_from_database(appconfig) | |
print("Got {:,d} file locations from database. Looking for new files.\n".format(len(existing_files))) | |
for dirpath, dirnames, files in scandir.walk(dirname, topdown=False): | |
total_files += len(files) | |
file_counter = 0 | |
if len(files) > 0: | |
safeprint("\n\tFound {:,d} files in {}. Processing...".format(len(files), dirpath)) | |
for name in files: | |
full_path_name = os.path.join(dirpath, name) | |
rel_path_name = os.path.relpath(full_path_name, start=dirname) | |
file_counter += 1 | |
if full_path_name not in existing_files: | |
if os.path.isfile(full_path_name) and len(rel_path_name.split('\\')) > 1: | |
parts = os.path.splitext(name.lower()) | |
if len(parts) == 2: | |
fileinfo = get_file_data(full_path_name, rel_path_name, dirname) | |
if fileinfo['inputfile'] not in existing_files and \ | |
fileinfo['extension'] not in excludedextensions: | |
files_added_to_database += 1 | |
# since this is a new file, we add it to our set for future import operations | |
existing_files.add(fileinfo['inputfile']) | |
dbfiles.add(fileinfo['inputfile'], ) | |
add_file_to_db(appconfig, fileinfo) | |
safeprint( | |
'\t\t({} [{:,d}/{:,d}]): Processing {} with {:,d} bytes...'.format( | |
datetime.datetime.now().strftime('%x %X'), | |
file_counter, | |
len(files), fileinfo['inputfile'], fileinfo['filesize'])) | |
else: | |
pass | |
else: | |
dbfiles.add(full_path_name, ) | |
continue | |
missingset = existing_files - dbfiles | |
missingfiles = list() | |
for file in missingset: | |
file = tuple((file,)) | |
missingfiles.append(file) | |
add_missing_to_db(appconfig, missingfiles) | |
return files_added_to_database, total_files | |
def get_filelist_from_database(appconfig): | |
# pull them out and cache on startup or when first pulled? | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("SELECT inputpath FROM files;") | |
rows = c.fetchall() | |
conn.close() | |
filenames = [row[0] for row in rows] | |
return set(filenames) | |
def get_file_data(file, relpath, basepath): | |
""" | |
Generates hashes for file and other file info such as size, etc. | |
""" | |
# TODO can i use some kind of magic to determine mime type and forego extension? | |
fileinfo = {'inputfile': file, 'filesize': os.path.getsize(file), 'hashes': {}} | |
ignoredfiles = ['history.log', 'unsupported.txt', '.picasa.ini'] | |
untmp = relpath.split('\\') | |
username = untmp[0] | |
calcdir = os.path.join(untmp[0], untmp[1]) | |
errdir = basepath | |
if len(calcdir) < len(relpath): | |
albumdir = untmp[1] | |
filedir = untmp[2] | |
else: | |
albumdir = None | |
filedir = untmp[1] | |
albumname = '' | |
albumpost = '' | |
albumcomment = '' | |
albumindex = '' | |
if albumdir is not None: | |
albumhyphen = albumdir.split('-') | |
if albumhyphen[0].startswith('t3_') and len(albumhyphen) == 4 and \ | |
albumhyphen[2].isdigit(): # newstyle comment album | |
albumpost = albumhyphen[0] | |
albumcomment = albumhyphen[1] | |
albumindex = albumhyphen[2] | |
albumname = albumhyphen[3] | |
elif len(albumhyphen) == 3 and albumhyphen[1].isdigit(): # newstyle album | |
albumpost = albumhyphen[0] | |
albumcomment = '' | |
albumindex = albumhyphen[1] | |
albumname = albumhyphen[2] | |
elif len(albumhyphen) == 2 and albumhyphen[1].startswith('c') and \ | |
4 < len(albumhyphen[0]) < 7: # oldstyle comment album | |
albumunder = (albumhyphen[1]).split('_') | |
albumpost = 't3_' + albumhyphen[0] | |
albumcomment = albumunder[0] | |
albumindex = '' | |
albumname = albumunder[1] | |
elif len(albumhyphen) == 1 and len((albumhyphen[0]).split('_')) == 2: # oldstyle album | |
albumunder = (albumhyphen[0]).split('_') | |
albumpost = albumunder[0] | |
albumcomment = '' | |
albumindex = '' | |
albumname = albumunder[1] | |
# print("oldstyle album", albumpost, albumcomment, albumindex, albumname) | |
else: | |
print("can't resolve album:", file) | |
outfile = os.path.join(errdir, 'parsing_error ' + | |
datetime.datetime.now().strftime("%Y-%m-%d") + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write("{}: Albumname Parsing Error - {}\n" | |
.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), file)) | |
filehyphen = filedir.split('-') | |
if filehyphen[0].startswith('t3_') and len(filehyphen) >= 5 and len(filehyphen[2]) < 4 and \ | |
filehyphen[2].isdigit() and filehyphen[3].isdigit(): # newstyle comment filename | |
filepost = filehyphen[0] | |
filecomment = filehyphen[1] | |
filealbumindex = filehyphen[2] | |
fileindex = filehyphen[3] | |
filename = '-'.join(filehyphen[4:]) | |
elif len(filehyphen) > 3 and len(filehyphen[1]) < 4 and \ | |
filehyphen[1].isdigit() and filehyphen[2].isdigit(): # newstyle filename | |
filepost = filehyphen[0] | |
filecomment = '' | |
filealbumindex = filehyphen[1] | |
fileindex = filehyphen[2] | |
filename = '-'.join(filehyphen[3:]) | |
elif len(filehyphen) >= 2 and 4 < len(filehyphen[0]) < 7: # oldstyle comment filename | |
filehyphen[1] = '-'.join(filehyphen[1:]) # recombine hyphens in actual filename | |
fileunder = (filehyphen[1]).split('_') | |
filepost = 't3_' + filehyphen[0] | |
filecomment = fileunder[0] | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(fileunder[1:]) | |
elif len(filehyphen) >= 2 and len((filehyphen[0]).split('_')) >= 2: # oldstyle hyphenated fname | |
filehyphen[0] = '-'.join(filehyphen[0:]) # recombine hyphens in actual filename | |
fileunder = (filehyphen[0]).split('_') | |
filepost = fileunder[0] | |
filecomment = '' | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(fileunder[1:]) | |
elif len(filehyphen) == 1 and len((filehyphen[0]).split('_')) >= 2: # oldstyle filename | |
fileunder = filehyphen[0].split('_') | |
filepost = fileunder[0] | |
filecomment = '' | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(fileunder[1:]) | |
else: | |
print("can't resolve filename:", file) | |
if albumpost != '': | |
filepost = albumpost | |
else: | |
filepost = '' | |
if albumcomment != '': | |
filecomment = albumcomment | |
else: | |
filecomment = '' | |
if albumindex != '': | |
filealbumindex = albumindex | |
else: | |
filealbumindex = '' | |
fileindex = '' | |
filename = '_'.join(filehyphen[0:]) | |
if filename not in ignoredfiles: | |
outfile = os.path.join(errdir, 'parsing_error ' + | |
datetime.datetime.now().strftime("%Y-%m-%d") + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write("{}: Filename Parsing Error - {}\n" | |
.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), file)) | |
parts = os.path.splitext(file.lower()) | |
ext = '' | |
if len(parts) == 2: | |
ext = parts[1] | |
sha1 = hashlib.sha1() | |
md5 = hashlib.md5() | |
f = open(file, 'rb') | |
# print('file extension = ', ext) | |
# if ext in imageexts: | |
try: | |
(filewidth, fileheight) = dimensions(file) | |
imghash = str(avhash(file)) | |
imgdhash = str(dhash(file)) | |
# imgrotavg = str(rotavhash(file)) | |
# vprint('\n[!] image hash (%s)' % imghash) | |
except (RuntimeError, TypeError, NameError, ValueError, structerror): | |
# Failed to get hash, delete image & raise exception | |
print('image check failed') | |
imghash = 0 | |
imgdhash = 'hashing_failed' | |
# imgrotavg =0 | |
filewidth = 0 | |
fileheight = 0 | |
if filename not in ignoredfiles: | |
outfile = os.path.join(errdir, 'hasherrors ' + datetime.datetime.now().strftime("%Y-%m-%d") + '.txt') | |
with open(outfile, 'a', encoding="utf-16") as logfile: | |
logfile.write( | |
"{}: Hashing Error - {}\n".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), file)) | |
if filewidth > 5000 or fileheight > 5000: | |
print('\n[!] image too large to hash (%dx%d)' % (filewidth, fileheight)) | |
imghash = 999 | |
imgdhash = 'image_too_large' | |
# imgrotavg = 999 | |
pass | |
if filewidth == 161 and fileheight == 81: | |
# Size of empty imgur image ('not found!') | |
imghash = 503 | |
imgdhash = 'imgur_503_removed' | |
# imgrotavg = 503 | |
pass | |
filename = filename[0:(len(filename) - len(ext))] | |
buf = f.read(BUFFER_SIZE) | |
while buf != b'': | |
md5.update(buf) | |
sha1.update(buf) | |
# imghash.update(buf) | |
buf = f.read(BUFFER_SIZE) | |
f.close() | |
sha1b16 = sha1.hexdigest().upper() | |
sha1b32 = base64.b32encode(base64.b16decode(sha1b16.upper())).decode().upper() | |
md5hash = md5.hexdigest().upper() | |
fileinfo['sha1b32'] = sha1b32 | |
fileinfo['md5'] = md5hash | |
fileinfo['post'] = filepost | |
fileinfo['comment'] = filecomment | |
fileinfo['fileindex'] = fileindex | |
fileinfo['filename'] = filename | |
fileinfo['albumindex'] = filealbumindex | |
fileinfo['albumname'] = albumname | |
fileinfo['imghash'] = imghash | |
fileinfo['imgdhash'] = imgdhash | |
fileinfo['extension'] = ext.lower() | |
fileinfo['filewidth'] = filewidth | |
fileinfo['fileheight'] = fileheight | |
fileinfo['username'] = username | |
return fileinfo | |
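# The parsing above appears to assume a relative layout of username\album\file (or | |
# username\file when there is no album directory), with names encoded roughly as: | |
#   new-style album dir:      <post>-<albumindex>-<albumname> | |
#   new-style comment album:  t3_<post>-<comment>-<albumindex>-<albumname> | |
#   new-style filename:       <post>-<albumindex>-<fileindex>-<filename> | |
#   old-style filename:       <post>_<filename> | |
# Anything that does not match one of these shapes is appended to a dated parsing_error | |
# log in the import root directory rather than rejected outright. | |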
def init_db(appconfig): | |
# create, setup tables | |
# one table is hashname | |
# another is for files that references hashname pk | |
# this allows for easy expanding if hashname is missing without schema changes | |
conn = sqlite3.connect(appconfig.database_file) | |
c = conn.cursor() | |
c.execute("PRAGMA synchronous = OFF") | |
c.execute("PRAGMA journal_mode = MEMORY") | |
c.execute('''CREATE TABLE IF NOT EXISTS users | |
(userID INTEGER PRIMARY KEY, username TEXT)''') | |
c.execute('''CREATE TABLE IF NOT EXISTS albums | |
(albumID INTEGER PRIMARY KEY, albumname TEXT)''') | |
c.execute('''CREATE TABLE IF NOT EXISTS files | |
(fileID INTEGER PRIMARY KEY AUTOINCREMENT, userid INTEGER, inputpath TEXT, post TEXT, | |
comment TEXT, filename TEXT, extension TEXT, albumID INTEGER, albumindex TEXT, | |
fileindex TEXT, filesize INTEGER, filewidth INTEGER, fileheight INTEGER, | |
filehashID INTEGER)''') | |
c.execute('''CREATE TABLE IF NOT EXISTS filehashes (filehashID INTEGER PRIMARY KEY, | |
imgdhash TEXT, imghash TEXT, md5 TEXT, sha1b32 TEXT)''') | |
    c.execute('''CREATE TABLE IF NOT EXISTS missingpaths (inputpath TEXT UNIQUE)''')  # UNIQUE so INSERT OR IGNORE de-duplicates | |
hashes = ['imgdhash', 'imghash', 'md5', 'sha1b32'] | |
viewstate = """CREATE VIEW IF NOT EXISTS {0}_dupes AS | |
SELECT files.inputpath, files.filename, users.username, albums.albumname, files.post, | |
files.filesize, files.filewidth, files.fileheight, filehashes.{0}, files.filehashID | |
FROM files JOIN filehashes ON files.filehashID = filehashes.filehashID | |
JOIN users ON files.userid = users.userID | |
JOIN albums ON files.albumID = albums.albumID | |
WHERE filehashes.{0} IN ( SELECT filehashes.{0} | |
FROM files JOIN filehashes ON filehashes.filehashID = files.filehashID | |
WHERE filehashes.{0} <> 0 AND filehashes.{0} <> '0000000000000000' AND | |
filehashes.{0} <> 'average_hash_error' AND filehashes.{0} <> 'imgur_503_removed' AND | |
filehashes.{0} <> 'image_too_large' AND filehashes.{0} <> 'hashing_failed' AND | |
filehashes.{0} <> 'differential_hash_error' AND filehashes.{0} <> 9999 AND | |
filehashes.{0} <> 12345 AND filehashes.{0} <> 223344 AND filehashes.{0} <> 112233 | |
GROUP BY filehashes.{0} HAVING count() > 1) ORDER BY filehashes.{0}""" | |
for ihash in hashes: | |
curstate = viewstate.format(ihash) | |
c.execute(curstate) | |
conn.commit() | |
inexactview = """CREATE VIEW IF NOT EXISTS inexact_matches AS | |
SELECT files.inputpath, files.filename, files.post, files.filesize, | |
files.filewidth, files.fileheight, filehashes.imgdhash, filehashes.md5 | |
FROM files JOIN filehashes ON files.filehashID = filehashes.filehashID | |
WHERE files.filehashID IN ( SELECT files.filehashID FROM filehashes | |
JOIN files ON files.filehashID = filehashes.filehashID | |
WHERE filehashes.imgdhash IN | |
( SELECT imgdhash FROM filehashes | |
WHERE filehashes.imgdhash <> 0 AND filehashes.imgdhash <> '0000000000000000' | |
AND filehashes.imgdhash <> 'average_hash_error' | |
AND filehashes.imgdhash <> 'imgur_503_removed' | |
AND filehashes.imgdhash <> 'image_too_large' AND filehashes.imgdhash <> 'hashing_failed' | |
AND filehashes.imgdhash <> 'differential_hash_error' AND filehashes.imgdhash <> 9999 | |
AND filehashes.imgdhash <> 12345 AND filehashes.imgdhash <> 223344 | |
AND filehashes.imgdhash <> 112233 | |
GROUP BY filehashes.imgdhash HAVING count() > 1 ) ) | |
AND files.filehashID NOT IN ( SELECT DISTINCT files.filehashID | |
FROM files WHERE files.filehashID IN ( SELECT DISTINCT source.filehashID | |
FROM files AS source LEFT JOIN files AS target ON source.fileID = target.fileID | |
WHERE source.filehashid <> target.filehashid ORDER BY source.fileID ) | |
GROUP BY files.filename ) ORDER BY filehashes.imgdhash""" | |
c.execute(inexactview) | |
conn.commit() | |
filenamemismatchview = """CREATE VIEW IF NOT EXISTS filenamehash_mismatch AS | |
SELECT files.inputpath, users.username, albums.albumname, files.filename, files.filesize, | |
files.filewidth, files.fileheight, files.filehashid | |
FROM files JOIN users ON files.userid = users.userID | |
JOIN albums ON files.albumID = albums.albumID | |
WHERE files.filename IN ( SELECT DISTINCT files.filename FROM files | |
WHERE files.filename IN ( SELECT DISTINCT source.filename | |
FROM files AS source LEFT JOIN files AS target ON source.filename = target.filename | |
WHERE source.filehashid <> target.filehashid | |
ORDER BY source.filename ) GROUP BY files.filename ) AND files.filename <> 'default' AND | |
files.filename <> 'encoded' AND files.filename <> 'giphy' AND | |
files.filename <> 'image' AND files.filename <> 'large' AND files.filename <> '' AND | |
files.filename <> 'media_command' AND length(files.filename) > 4 | |
ORDER BY files.filename""" | |
c.execute(filenamemismatchview) | |
conn.commit() | |
filenamematchview = """CREATE VIEW IF NOT EXISTS filenamehash_match AS | |
SELECT files.inputpath, users.username, albums.albumname, files.filename, files.filesize, | |
files.filewidth, files.fileheight, files.filehashid | |
FROM files JOIN users ON files.userid = users.userID | |
JOIN albums ON files.albumID = albums.albumID | |
WHERE files.filename IN ( SELECT DISTINCT files.filename FROM files | |
WHERE files.filename IN ( SELECT DISTINCT source.filename | |
FROM files AS source LEFT JOIN files AS target ON source.filename = target.filename | |
WHERE source.filehashid = target.filehashid | |
ORDER BY source.filename ) GROUP BY files.filename HAVING count() > 1) | |
AND files.filename <> 'default' AND | |
files.filename <> 'encoded' AND files.filename <> 'giphy' AND | |
files.filename <> 'image' AND files.filename <> 'large' AND files.filename <> '' AND | |
files.filename <> 'media_command' AND length(files.filename) > 4 | |
ORDER BY files.filename""" | |
c.execute(filenamematchview) | |
conn.commit() | |
dirsizeview = """CREATE VIEW IF NOT EXISTS directory_size AS | |
SELECT users.username, SUM(files.filesize) AS dirsum | |
FROM filehashes JOIN files ON filehashes.filehashID = files.filehashID | |
JOIN users ON files.userid = users.userID | |
GROUP BY users.username ORDER BY sum(files.filesize) ASC;""" | |
c.execute(dirsizeview) | |
conn.commit() | |
# add indexes | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_filehashes" ON "filehashes" ' | |
'("imgdhash","imghash","md5","sha1b32")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_filehashID" ON "files" ("filehashID")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_username" ON "users" ("username")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IX_albumname" ON "albums" ("albumname")') | |
c.execute('CREATE INDEX IF NOT EXISTS "IU_inputpath" ON "files" ' | |
'("inputpath", "filesize", "filewidth", "fileheight")') | |
conn.commit() | |
conn.close() | |
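# Rough shape of the schema created above (one files row per imported file): | |
#   users(userID, username)                                  <- files.userid | |
#   albums(albumID, albumname)                               <- files.albumID | |
#   filehashes(filehashID, imgdhash, imghash, md5, sha1b32)  <- files.filehashID | |
# The *_dupes views list files sharing the same value of a given hash column, | |
# inexact_matches keys on imgdhash only, filenamehash_match/mismatch compare files that | |
# share a filename, and directory_size totals file sizes per username. | |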
def import_files(appconfig, directory): | |
""" | |
    Attempts to recursively import files from the given directory and writes a log file with actions taken | |
    @param appconfig: Configuration data | |
    @param directory: the directory to import from | |
    """ | |
    print("Importing from '{}'".format(directory)) | |
directory = directory.strip() | |
if os.path.isdir(directory): | |
(files_added_to_database, total_files) = import_files_work(appconfig, directory) | |
print( | |
'\n' + '*' * 4 + """ {:,d} total files found. {:,d} files were added to the database.""" | |
.format(total_files, files_added_to_database)) | |
directory_clean = re.sub(r'[^\w\-_. ]', '_', directory) | |
logfile_name = os.path.join(appconfig.base_directory, | |
"Import log for " + directory_clean + " " + | |
datetime.datetime.now().strftime("%H%M%S%f") + '.txt') | |
with open(logfile_name, 'w+', encoding="utf-16") as logfile: | |
logfile.write('Directory processed: {}\n\n'.format(directory)) | |
logfile.write('Files found: {:,d}\n'.format(total_files)) | |
logfile.write('Files added to database: {:,d}\n'.format(files_added_to_database)) | |
logfile.write('*' * 78 + '\n\n') | |
else: | |
print("\t'{}' does not exist!".format(directory)) | |
# after import, tell the user to see generated logs (one per directory) in the main directory | |
# but only if we actually attempted to import something | |
if len(directory) > 0 and 'logfile_name' in locals(): | |
print("\n\nSee log files in {} for details.".format(appconfig.base_directory)) | |
def main(): | |
# this stores our application parameters so it can get passed around to functions | |
appconfig = ApplicationConfiguration() | |
appconfig.base_directory = r"F:\Python\databases" | |
appconfig.database_file = os.path.join(appconfig.base_directory, appconfig.database_name) | |
init_db(appconfig) | |
targetroot = r"G:\GWScanner\\content" | |
import_files(appconfig, targetroot) | |
if __name__ == '__main__': | |
main() |
# https://pypi.org/project/ImageHash/ | |
from os import path, mkdir, sep, remove | |
from sys import exit, argv | |
from PIL import Image | |
import numpy | |
from struct import error as StructError | |
# import scipy.fftpack | |
# import multiprocessing as mp | |
def binary_array_to_hex(arr): | |
h = 0 | |
s = [] | |
for i, v in enumerate(arr.flatten()): | |
if v: h += 2 ** (i % 8) | |
if (i % 8) == 7: | |
s.append(hex(h)[2:].rjust(2, '0')) | |
h = 0 | |
return "".join(s) | |
def dhash(im): | |
hash_size = 8 | |
    try: | |
        if not isinstance(im, Image.Image): | |
            im = Image.open(im) | |
        # ANTIALIAS was removed in newer Pillow; LANCZOS is the same filter | |
        im = im.convert('L').resize((hash_size + 1, hash_size), Image.LANCZOS) | |
        # numpy.float was removed from modern NumPy; the builtin float behaves the same here | |
        pixels = numpy.array(im.getdata(), dtype=float).reshape((hash_size + 1, hash_size)) | |
        # compute differences: True where a value is brighter than the one in the previous row of the grid | |
        diff = pixels[1:, :] > pixels[:-1, :] | |
        diff = binary_array_to_hex(diff) | |
    except (OSError, SyntaxError, IndexError): | |
        diff = "differential_hash_error" | |
    return diff | |
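# Two dhash hex strings can be compared by Hamming distance; a small distance suggests a | |
# near-duplicate image. Sketch with hypothetical paths (the <= 4 threshold is just a | |
# common starting point, not something this module defines): | |
# | |
#   a = dhash('photo_a.jpg') | |
#   b = dhash('photo_b.jpg') | |
#   if 'error' not in a and 'error' not in b: | |
#       distance = bin(int(a, 16) ^ int(b, 16)).count('1') | |
#       print('near-duplicate' if distance <= 4 else 'different') | |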
def avhash(im): | |
""" | |
Shrinks image to 16x16 pixels, | |
Finds average amongst the pixels, | |
Iterates over every pixel, comparing to average. | |
1 if above avg, 0 if below. | |
Returns resulting integer. (hash of the image 'im') | |
Updated to not use ternary operator (not available in python 2.4.x) | |
""" | |
    try: | |
        if not isinstance(im, Image.Image): | |
            im = Image.open(im) | |
        im = im.convert('L').resize((16, 16), Image.LANCZOS) | |
        # average grey level over the 256 pixels | |
        ttl = 0 | |
        for gd in im.getdata(): | |
            ttl += gd | |
        avg = ttl // 256 | |
        # set bit i when pixel i is brighter than the average | |
        result = 0 | |
        for i, gd in enumerate(im.getdata()): | |
            if gd > avg: | |
                result += (1 << i) | |
        del im | |
    except (OSError, SyntaxError, IndexError, StructError): | |
        result = "average_hash_error" | |
    return result | |
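# avhash returns a 256-bit integer (one bit per pixel of the 16x16 thumbnail), so two | |
# hashes compare the same way, just without the hex conversion. Sketch: | |
# | |
#   a = avhash('photo_a.jpg') | |
#   b = avhash('photo_b.jpg') | |
#   if isinstance(a, int) and isinstance(b, int): | |
#       distance = bin(a ^ b).count('1') | |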
def avhash_dict(im): | |
""" | |
Generate hashes for the image, including variations of the image | |
* Regular image | |
* Mirrored (left-right) | |
* Rotated left (90deg) | |
* Rotated right (270deg) | |
""" | |
if not isinstance(im, Image.Image): | |
im = Image.open(im) | |
    im = im.resize((16, 16), Image.LANCZOS).convert('L')  # LANCZOS == old ANTIALIAS filter | |
ttl = 0 | |
for gd in im.getdata(): ttl += gd | |
avg = ttl // 256 | |
result = {} | |
# Regular hash | |
regular_hash = 0 | |
for i, gd in enumerate(im.getdata()): | |
if gd > avg: | |
regular_hash += (1 << i) | |
result['hash'] = regular_hash | |
# Mirror hash | |
mirror_im = im.transpose(Image.FLIP_LEFT_RIGHT) | |
mirror_hash = 0 | |
for i, gd in enumerate(mirror_im.getdata()): | |
if gd > avg: | |
mirror_hash += (1 << i) | |
result['mirror'] = mirror_hash | |
# Rotated 90deg hash | |
left_im = im.transpose(Image.ROTATE_90) | |
left_hash = 0 | |
for i, gd in enumerate(left_im.getdata()): | |
if gd > avg: | |
left_hash += (1 << i) | |
result['left'] = left_hash | |
# Rotated 270deg hash | |
right_im = im.transpose(Image.ROTATE_270) | |
right_hash = 0 | |
for i, gd in enumerate(right_im.getdata()): | |
if gd > avg: | |
right_hash += (1 << i) | |
result['right'] = right_hash | |
    # (the combined rotation average is only produced by rotavhash() below) | |
    del im | |
return result | |
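# avhash_dict() is aimed at catching mirrored or rotated reposts: compare a candidate's | |
# plain avhash against each of the four orientation hashes and keep the smallest Hamming | |
# distance. Sketch with hypothetical paths (assumes both images open cleanly): | |
# | |
#   candidate = avhash('candidate.jpg') | |
#   if isinstance(candidate, int): | |
#       known = avhash_dict('original.jpg') | |
#       best = min(bin(candidate ^ h).count('1') for h in known.values()) | |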
def rotavhash(im): | |
""" | |
Generate hashes for the image, including variations of the image | |
* Regular image | |
* Mirrored (left-right) | |
* Rotated left (90deg) | |
* Rotated right (270deg) | |
""" | |
if not isinstance(im, Image.Image): | |
try: | |
im = Image.open(im) | |
            im = im.resize((16, 16), Image.LANCZOS).convert('L') | |
ttl = 0 | |
for gd in im.getdata(): ttl += gd | |
avg = ttl // 256 | |
result = {} | |
# Regular hash | |
regular_hash = 0 | |
for i, gd in enumerate(im.getdata()): | |
if gd > avg: | |
regular_hash += (1 << i) | |
result['hash'] = regular_hash | |
# Mirror hash | |
mirror_im = im.transpose(Image.FLIP_LEFT_RIGHT) | |
mirror_hash = 0 | |
for i, gd in enumerate(mirror_im.getdata()): | |
if gd > avg: | |
mirror_hash += (1 << i) | |
result['mirror'] = mirror_hash | |
# Rotated 90deg hash | |
left_im = im.transpose(Image.ROTATE_90) | |
left_hash = 0 | |
for i, gd in enumerate(left_im.getdata()): | |
if gd > avg: | |
left_hash += (1 << i) | |
result['left'] = left_hash | |
# Rotated 270deg hash | |
right_im = im.transpose(Image.ROTATE_270) | |
right_hash = 0 | |
for i, gd in enumerate(right_im.getdata()): | |
if gd > avg: | |
right_hash += (1 << i) | |
result['right'] = right_hash | |
rotaverage = (regular_hash + mirror_hash + left_hash + right_hash) / 4 | |
rotaverage = int(rotaverage) | |
del im | |
except (OSError, SyntaxError, IndexError): | |
rotaverage = "rotated_average_error" | |
return rotaverage | |
def dimensions(im): | |
""" Returns tuple (Width, Height) for given image. """ | |
    try: | |
        if not isinstance(im, Image.Image): | |
            im = Image.open(im) | |
        result = im.size | |
    except OSError: | |
        print("Not a recognized Image File") | |
        result = (0, 0) | |
    del im | |
    return result | |
def create_thumb(im, num): | |
""" | |
Creates a thumbnail for a given image file. | |
Saves to 'thumbs' directory, named <num>.jpg | |
""" | |
try: | |
mkdir('thumbs') | |
except OSError: | |
pass | |
if not isinstance(im, Image.Image): | |
im = Image.open(im) | |
# Convert to RGB if not already | |
if im.mode != "RGB": im = im.convert("RGB") | |
    im.thumbnail((100, 100), Image.LANCZOS) | |
im.save('thumbs%s%d.jpg' % (sep, num), 'JPEG') | |
del im | |
if __name__ == '__main__': | |
args = argv[1:] | |
if len(args) == 0: | |
print('argument required: image file location') | |
exit(1) | |
filename = ' '.join(args) | |
remove_file = False | |
if not path.exists(filename): | |
print('file not found: %s' % filename) | |
exit(1) | |
    print('Hash:\t\t%s' % avhash(filename)) | |
print('') | |
d = avhash_dict(filename) | |
for key in d: | |
print('Hash[%s] = \t%d' % (key, d[key])) | |
print('') | |
dim = dimensions(filename) | |
print('Dimensions:\t%dx%d' % (dim[0], dim[1])) | |
# create_thumb(filename, 1) | |
if remove_file: | |
remove(filename) |