Remove duplicates in folder
#!/usr/bin/env python3
import sys
import os
import hashlib
import pickle

import pandas as pd

def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def create_hashes(paths):
    """Walk each path and map (md5 digest, file size) -> list of file paths."""
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                mhash = hashlib.md5()
                # Hash in chunks so large files need not fit in memory,
                # and close the handle when done.
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        mhash.update(chunk)
                file_id = (mhash.digest(), os.path.getsize(full_path))
                hashes.setdefault(file_id, []).append(full_path)
    return hashes

def check_for_duplicates(folder_to_keep, display=False):
    """Delete every file outside folder_to_keep that also exists inside it."""
    folders_to_clean = [folder for folder in os.listdir('.')
                        if os.path.isdir(folder) and folder != folder_to_keep]
    print("folders to clean:", folders_to_clean)
    hashes_to_keep = create_hashes([folder_to_keep])
    print('files in', folder_to_keep, ':', len(hashes_to_keep))
    hashes_to_clean = create_hashes(folders_to_clean)
    print('files in', folders_to_clean, ':', len(hashes_to_clean))
    for file_hash, files_dup in hashes_to_clean.items():
        if file_hash in hashes_to_keep:
            if display:
                print("Duplicate found: %s" % files_dup)
            for dup in files_dup:
                os.remove(dup)
    print('remaining files in folders to clean: {}'.format(
        len(create_hashes(folders_to_clean))))
    for folder in folders_to_clean:
        recursive_delete_if_empty(folder)

def recursive_delete_if_empty(path):
    """Recursively delete empty directories; return True
    if everything was deleted."""
    if not os.path.isdir(path):
        # If you also want to delete some files like desktop.ini, check
        # for that here, and return True if you delete them.
        return False
    # The list comprehension here is necessary: all() over a generator
    # expression would short-circuit on the first non-empty entry and
    # skip recursing into the remaining subdirectories.
    if all([recursive_delete_if_empty(os.path.join(path, filename))
            for filename in os.listdir(path)]):
        # Either there was nothing here or it was all deleted
        os.rmdir(path)
        return True
    else:
        return False

def create_dupes_list(root_path=None):
    """Hash every file below root_path and write CSV reports of duplicates."""
    root_path = root_path or '.'
    folders_to_analyze = [os.path.join(root_path, folder)
                          for folder in os.listdir(root_path)
                          if os.path.isdir(os.path.join(root_path, folder))]
    hashes = create_hashes(folders_to_analyze)
    with open("hashes.p", "wb") as f:
        pickle.dump(hashes, f)
    with open("hashes.p", "rb") as f:
        hashes = pickle.load(f)
    df = pd.DataFrame.from_dict(hashes, orient='index')
    # One row per (hash, path) pair.
    df_stacked = pd.DataFrame(df.stack(), columns=['path'])
    df_stacked = df_stacked.reset_index(level=0)
    df_stacked.rename(columns={'level_0': 'hash'}, inplace=True)
    df_stacked = df_stacked.sort_values('hash')
    df_stacked['count'] = df_stacked.groupby('hash')['path'].transform('count')
    df_stacked = df_stacked.sort_values('count', ascending=False)
    df_files_paths = df_stacked['path'].str.split('/', n=4, expand=True)
    df_stacked['file_name'] = df_stacked.path.str.extract(r'([^/]+$)', expand=False)
    df_stacked.drop(columns='path', inplace=True)
    df_final = pd.concat([df_files_paths, df_stacked], axis=1)
    df_final = df_final[df_final['count'] > 1]
    df_final = df_final.sort_values('hash')
    df_final.to_csv('duplicates_list.csv')
    # Count duplicates per pair of leading path components.
    df_summary = (df_final.groupby([0, 1])[['hash']].count()
                  .sort_values('hash', ascending=False))
    df_summary.to_csv('duplicates_list_summary.csv')

if __name__ == '__main__':
    try:
        folder_to_keep = sys.argv[1]
    except IndexError:
        print('no arg given!')
    create_dupes_list()
    # check_for_duplicates(folder_to_keep, True)
Copy the .py file into the folder of interest and pass the name of the folder you want to keep as the argument.
The script deletes every duplicate file found in the other directories and subdirectories, then removes all empty folders.
Note that, as committed, __main__ only runs create_dupes_list(), which reports duplicates to CSV without deleting anything; uncomment the check_for_duplicates call to clean for real. A sketch of both modes follows below.
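
A minimal sketch of both modes, assuming the gist is saved as remove_duplicates.py and that a folder named photos exists (both names are hypothetical):

# Report mode (the default __main__ path): hash everything below the
# current folder and write duplicates_list.csv plus
# duplicates_list_summary.csv, deleting nothing.
#   python remove_duplicates.py

# Cleaning mode, driven from a Python session:
from remove_duplicates import check_for_duplicates

# Keep the files under photos/; delete identical copies found in the
# sibling folders, then prune any directories left empty.
check_for_duplicates('photos', display=True)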