Remove duplicates in folder
#!/usr/bin/env python3
import sys
import os
import hashlib
import pickle

import pandas as pd

def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def create_hashes(paths):
    """Walk each path and map (md5 digest, file size) -> list of file paths."""
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                mhash = hashlib.md5()
                # Hash in chunks so large files need not fit in memory,
                # and close the handle when done.
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        mhash.update(chunk)
                file_id = (mhash.digest(), os.path.getsize(full_path))
                hashes.setdefault(file_id, []).append(full_path)
    return hashes

def check_for_duplicates(folder_to_keep, display=False):
    """Delete every file outside folder_to_keep that also exists inside it."""
    folders_to_clean = [folder for folder in os.listdir('.')
                        if os.path.isdir(folder) and folder != folder_to_keep]
    print("folders to clean:", folders_to_clean)
    hashes_to_keep = create_hashes([folder_to_keep])
    print('files in', folder_to_keep, ':', len(hashes_to_keep))
    hashes_to_clean = create_hashes(folders_to_clean)
    print('files in', folders_to_clean, ':', len(hashes_to_clean))
    for file_hash, files_dup in hashes_to_clean.items():
        if file_hash in hashes_to_keep:
            if display:
                print("Duplicate found: %s" % files_dup)
            for dup in files_dup:
                os.remove(dup)
    print('remaining files in folders to clean: {}'.format(
        len(create_hashes(folders_to_clean))))
    for folder in folders_to_clean:
        recursive_delete_if_empty(folder)

def recursive_delete_if_empty(path):
    """Recursively delete empty directories; return True
    if everything was deleted."""
    if not os.path.isdir(path):
        # If you also want to delete some files like desktop.ini, check
        # for that here, and return True if you delete them.
        return False
    # The list comprehension here is necessary: all() over a generator
    # expression would short-circuit on the first non-empty entry and
    # skip recursing into the remaining subdirectories.
    if all([recursive_delete_if_empty(os.path.join(path, filename))
            for filename in os.listdir(path)]):
        # Either there was nothing here or it was all deleted
        os.rmdir(path)
        return True
    else:
        return False

def create_dupes_list(root_path=None):
    """Hash every file below root_path and write CSV reports of duplicates."""
    root_path = root_path or '.'
    folders_to_analyze = [os.path.join(root_path, folder)
                          for folder in os.listdir(root_path)
                          if os.path.isdir(os.path.join(root_path, folder))]
    hashes = create_hashes(folders_to_analyze)
    with open("hashes.p", "wb") as f:
        pickle.dump(hashes, f)
    with open("hashes.p", "rb") as f:
        hashes = pickle.load(f)
    df = pd.DataFrame.from_dict(hashes, orient='index')
    # One row per (hash, path) pair.
    df_stacked = pd.DataFrame(df.stack(), columns=['path'])
    df_stacked = df_stacked.reset_index(level=0)
    df_stacked.rename(columns={'level_0': 'hash'}, inplace=True)
    df_stacked = df_stacked.sort_values('hash')
    df_stacked['count'] = df_stacked.groupby('hash')['path'].transform('count')
    df_stacked = df_stacked.sort_values('count', ascending=False)
    df_files_paths = df_stacked['path'].str.split('/', n=4, expand=True)
    df_stacked['file_name'] = df_stacked.path.str.extract(r'([^/]+$)', expand=False)
    df_stacked.drop(columns='path', inplace=True)
    df_final = pd.concat([df_files_paths, df_stacked], axis=1)
    df_final = df_final[df_final['count'] > 1]
    df_final = df_final.sort_values('hash')
    df_final.to_csv('duplicates_list.csv')
    # Count duplicates per pair of leading path components.
    df_summary = (df_final.groupby([0, 1])[['hash']].count()
                  .sort_values('hash', ascending=False))
    df_summary.to_csv('duplicates_list_summary.csv')

if __name__ == '__main__':
    try:
        folder_to_keep = sys.argv[1]
    except IndexError:
        print('no arg given!')
    create_dupes_list()
    # check_for_duplicates(folder_to_keep, True)
Copy the .py file into the folder of interest and pass the name of the folder you want to keep as the argument.
The script deletes every duplicate file found in the other directories and subdirectories, then removes all empty folders.
Note that, as committed, __main__ only runs create_dupes_list(), which reports duplicates to CSV without deleting anything; uncomment the check_for_duplicates call to clean for real. A sketch of both modes follows below.
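
A minimal sketch of both modes, assuming the gist is saved as remove_duplicates.py and that a folder named photos exists (both names are hypothetical):

# Report mode (the default __main__ path): hash everything below the
# current folder and write duplicates_list.csv plus
# duplicates_list_summary.csv, deleting nothing.
#   python remove_duplicates.py

# Cleaning mode, driven from a Python session:
from remove_duplicates import check_for_duplicates

# Keep the files under photos/; delete identical copies found in the
# sibling folders, then prune any directories left empty.
check_for_duplicates('photos', display=True)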