Backup Duplicate Files
from __future__ import print_function

# tutorial
# https://www.pythoncentral.io/introductory-tutorial-python-sqlalchemy/
# https://chartio.com/resources/tutorials/how-to-execute-raw-sql-in-sqlalchemy/
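
# Overview: the script walks the configured directories, records file metadata
# in a throwaway SQLite database, then narrows duplicate candidates in stages -
# first by size or name, then by a hash of the first 1 KB, and only then by a
# full-content hash - before printing the reports: duplicates inside each
# directory, duplicates across directories, files missing from the backup,
# same-name files with different contents, renamed copies, and empty directories.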

import os
import sys
from sqlalchemy import Column, ForeignKey, Integer, String, Numeric
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import hashlib
import base64
import time

Base = declarative_base()

class Directory(Base):
    __tablename__ = 'directory'
    id = Column(Integer, primary_key=True)
    path = Column(String(250), nullable=False)


class File(Base):
    __tablename__ = 'file'
    id = Column(Integer, primary_key=True)
    filename = Column(String(250))
    full_path = Column(String(250))
    dirname = Column(String(250))
    file_size = Column(Integer())
    last_access_time = Column(Integer())
    last_modification_time = Column(Integer())
    last_creation_time = Column(Integer())
    hash_full = Column(String(250))
    hash_1k = Column(String(250))
    directory_id = Column(Integer(), ForeignKey('directory.id'))
    directory = relationship(Directory)
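
# hash_1k and hash_full are filled lazily by the passes further down: they hold
# base64-encoded SHA-1 digests of the first 1 KB and of the whole file,
# respectively, and stay NULL for files that never become duplicate candidates.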

# start from a fresh database on every run
try:
    os.remove("duplicate_files.db")
except OSError:
    # the database may not exist yet on the first run
    pass

engine = create_engine('sqlite:///duplicate_files.db')

# Create all tables in the engine. This is equivalent to "Create Table"
# statements in raw SQL.
Base.metadata.create_all(engine)

DBSession = sessionmaker(bind=engine)
session = DBSession()

def get_file_details(full_path):
    result = {}
    #print(full_path)
    try:
        # if the target is a symlink (soft one), this will
        # dereference it - change the value to the actual target file
        full_path = os.path.realpath(full_path)
        result['full_path'] = full_path
        dirname = os.path.dirname(full_path)
        result['dirname'] = dirname
        file_size = os.path.getsize(full_path)
        result['file_size'] = file_size
        last_access_time = os.path.getatime(full_path)
        result['last_access_time'] = last_access_time
        last_modification_time = os.path.getmtime(full_path)
        # most important
        result['last_modification_time'] = last_modification_time
        last_creation_time = os.path.getctime(full_path)
        result['last_creation_time'] = last_creation_time
    except (OSError,):
        # not accessible (permissions, etc) - pass on
        pass
    return result
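
# Note: when a file cannot be stat'ed, get_file_details returns an empty dict,
# which the scan loop below uses (via "if file_details:") to skip that file.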

extensions = [
    'jpg',
    'jpeg',
    'mpeg',
    'mpg',
    'avi',
    'mp4'
]

# 'Z:\\_BACKUP_\\_fotos_\\2006',
paths = [
    'Z:\\_BACKUP_\\_fotos_\\2005',
    'Z:\\_BACKUP_\\_fotos_\\2006',
]
# the assignment below overrides the backup paths above with local test folders
paths = [
    'D:\\TMP\\duplicate_center\\test1',
    'D:\\TMP\\duplicate_center\\test2',
]

path_analysis = {}

session.commit()

print('\nsearching for files in the directories...')
for path in paths:
    #path_analysis[path] = []
    new_directory = Directory(path=path)
    session.add(new_directory)
    print(path)
    for dirpath, dirnames, filenames in os.walk(path):
        #print(filenames)
        for filename in filenames:
            extension = filename.split('.')[-1].lower()
            if extension not in extensions:
                continue
            full_path = os.path.join(dirpath, filename)
            file_details = get_file_details(full_path)
            print('.', end='', flush=True)
            if file_details:
                new_file = File()
                new_file.filename = filename
                new_file.full_path = file_details.get('full_path')
                new_file.dirname = file_details.get('dirname')
                new_file.file_size = file_details.get('file_size')
                new_file.last_access_time = file_details.get('last_access_time')
                new_file.last_modification_time = file_details.get('last_modification_time')
                new_file.last_creation_time = file_details.get('last_creation_time')
                new_file.directory = new_directory
                session.add(new_file)
session.commit()
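
# Duplicate detection happens in stages so that most files never need a full
# read: candidates are grouped by size (or by name), only those groups get a
# hash of the first 1 KB, and only files that still collide on that cheap hash
# get a full-content hash.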

def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')
    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()
    file_object.close()
    return hashed


def localtime(unix_timestamp):
    local_time = time.localtime(unix_timestamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_time)

# get all files with the same size and compute their 1 KB hash
print('\nComputing 1K hash for files with the same size..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            file_size,
            count(*) as file_counter
        from
            file
        where
            file_size > 0 and
            hash_1k is null
        group by
            file_size
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        file_size = dict(row)['file_size']
        #print('file_size={}'.format(file_size))
        files = session.query(File).filter_by(file_size=file_size).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_1k_bin = get_hash(file.full_path, first_chunk_only=True)
                file.hash_1k = base64.b64encode(hash_1k_bin)
                session.add(file)
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue
session.commit()

# get all files with the same size and 1 KB hash and compute the full hash
print('\nComputing full hash for files with the same size and 1K hash..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            hash_1k,
            file_size,
            count(*) as hash_counter
        from
            file
        where
            hash_1k is not null and
            hash_full is null
        group by
            hash_1k,
            file_size
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        #print(row)
        hash_1k = dict(row)['hash_1k']
        file_size = dict(row)['file_size']
        #print('hash_1k={}'.format(hash_1k))
        files = session.query(File).filter_by(hash_1k=hash_1k).filter_by(file_size=file_size).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_full_bin = get_hash(file.full_path, first_chunk_only=False)
                file.hash_full = base64.b64encode(hash_full_bin)
                session.add(file)
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue
session.commit()
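
# The two name-based passes below repeat the 1 KB / full hash steps for files
# that share a filename, so that same-named files whose first kilobyte matches
# still get a full hash; this is what the "same name, different hashes" report
# further down compares.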

# get all files with the same name and compute their 1 KB hash
print('\nComputing 1K hash for files with the same name..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            filename,
            count(*) as file_counter
        from
            file
        where
            file_size > 0 and
            hash_1k is null
        group by
            filename
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        filename = dict(row)['filename']
        files = session.query(File).filter_by(filename=filename).all()
        for file in files:
            print('.', end='', flush=True)
            try:
                hash_1k_bin = get_hash(file.full_path, first_chunk_only=True)
                file.hash_1k = base64.b64encode(hash_1k_bin)
                session.add(file)
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue
session.commit()

# get all files with the same name and 1 KB hash and compute the full hash
print('\nComputing full hash for files with the same name and 1K hash..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            hash_1k,
            filename,
            count(*) as hash_counter
        from
            file
        where
            hash_1k is not null and
            hash_full is null
        group by
            hash_1k,
            filename
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        #print(row)
        hash_1k = dict(row)['hash_1k']
        filename = dict(row)['filename']
        #print('hash_1k={}'.format(hash_1k))
        files = session.query(File).filter_by(hash_1k=hash_1k).filter_by(filename=filename).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_full_bin = get_hash(file.full_path, first_chunk_only=False)
                file.hash_full = base64.b64encode(hash_full_bin)
                session.add(file)
            except (OSError,):
                # the file access might've changed till the exec point got here
                continue
session.commit()
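
# From here on no more hashing is done: the remaining blocks only query the
# collected data and print the reports.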

# duplicate files within each of the directories
print('\nDuplicate files within each of the directories..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                not exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        hash_full = f.hash_full
                )
            group by
                hash_full
            having count(*) > 1
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- duplicates ---------')
            print(directory.path)
            print('--------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))

# duplicate files present in both directories
print('\nDuplicate files present in both directories..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        hash_full = f.hash_full
                )
            group by
                hash_full
            having count(*) > 0
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- duplicates ---------')
            print('--------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))

# get the files that are in the current folder but not in the backup
print('\nFiles that are in the current folder but not in the backup..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                distinct filename
            from
                file f
            where
                directory_id = {} and
                not exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        filename = f.filename
                )
            order by filename
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- not in the backup ---------')
            print('-----------------------------------')
        for row in rows:
            filename = dict(row)['filename']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                filename=filename
            ).order_by(
                File.dirname.asc()
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))

# files with the same name but different hashes
print('\nFiles with the same name but different hashes..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select distinct
                file1.filename,
                file1.hash_full
            from
                file as file1,
                file as file2
            where
                file1.directory_id = {} and
                file2.directory_id = file1.directory_id and
                file1.filename = file2.filename and
                file1.hash_full <> file2.hash_full
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- same name, different hashes ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            filename = dict(row)['filename']
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                filename=filename
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.dirname.asc()
            ).all()
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))

# files with the same hash but different names
print('\nFiles with the same hash but different names..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id = f.directory_id and
                        hash_full = f.hash_full and
                        filename <> f.filename
                )
            group by
                hash_full
            having count(*) > 1
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- same hash, different names ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))

# files with the same name that are newer in the folder (the backup holds an older copy)
print('\nFiles with the same name that are newer in the folder (the backup holds an older copy)..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select distinct
                file1.full_path,
                file1.hash_full
            from
                file as file1,
                file as file2
            where
                file1.directory_id = {} and
                file2.directory_id <> file1.directory_id and
                file1.filename = file2.filename and
                (
                    file1.hash_1k <> file2.hash_1k or
                    file1.hash_full <> file2.hash_full
                ) and
                file1.last_modification_time > file2.last_modification_time
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- newer files in the folder ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            full_path = dict(row)['full_path']
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                full_path=full_path
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.dirname.asc()
            ).all()
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))

# empty directories
print('\nEmpty directories..')
directories = session.query(Directory).all()
for directory in directories:
    print('----- empty directories ---------')
    print(directory.path)
    print('---------------------------------')
    path = directory.path
    for dirpath, dirnames, filenames in os.walk(path):
        if len(filenames) == 0:
            print(dirpath)

# TODO: delete duplicates by copying them to a delete-me folder
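
# A possible follow-up for the TODO above, sketched here only as an illustration:
# instead of deleting a duplicate outright, move it into a "delete-me" folder so
# it can be reviewed first. The function name and the delete_me_root default are
# hypothetical examples - nothing else in the script calls this yet.
import shutil

def move_to_delete_me(full_path, delete_me_root='D:\\TMP\\delete-me'):
    """Move full_path under delete_me_root, preserving its directory structure."""
    # drop the drive letter so the original path can be re-rooted under delete_me_root
    relative = os.path.splitdrive(full_path)[1].lstrip('\\/')
    target = os.path.join(delete_me_root, relative)
    target_dir = os.path.dirname(target)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    shutil.move(full_path, target)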