Google Photos Takeout: remove duplicates that exist both in album folders and in "Photos from XXXX"/"Archive"/"Trash". The script builds an MD5 index of every file in the album folders, then walks the special folders and deletes any file whose hash already appears in that index.
import datetime
import hashlib
import os


def get_md5(file_path):
    """Compute the MD5 hash of a file."""
    hasher = hashlib.md5()
    with open(file_path, 'rb') as file:
        # Read and update the hash in 4 KiB chunks so large media
        # files never have to fit in memory at once.
        for chunk in iter(lambda: file.read(4096), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def find_files(folder):
    """Yield every file under a folder, recursively."""
    for root, _, files in os.walk(folder):
        for file in files:
            yield os.path.join(root, file)


def get_special_folders():
    """Return the folders whose contents may duplicate album copies."""
    current_year = datetime.datetime.now().year
    year_folders = [f"Photos from {year}" for year in range(1979, current_year + 1)]
    return ['Archive', 'Trash'] + year_folders


def main():
    root_folder = 'Google Photos'
    special_folders = get_special_folders()

    # Map each file hash to its path, populated from the album
    # (non-special) folders first. os.listdir may also return loose
    # files, so skip anything that is not a directory.
    file_hashes = {}
    for folder in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, folder)
        if folder not in special_folders and os.path.isdir(folder_path):
            for file_path in find_files(folder_path):
                file_hashes[get_md5(file_path)] = file_path

    # Now walk the special folders and delete any file whose hash
    # already appeared in an album folder.
    for special_folder in special_folders:
        for file_path in find_files(os.path.join(root_folder, special_folder)):
            if get_md5(file_path) in file_hashes:
                print(f"Deleting duplicate {file_path} ...")
                os.remove(file_path)


if __name__ == "__main__":
    main()
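
Since the script deletes files irreversibly, it can be worth doing a dry run first. The sketch below is not part of the original gist: it assumes the script above is saved as dedupe.py next to the extracted 'Google Photos' folder (both names are assumptions), imports its helpers, and only prints what would be removed.

# dry_run.py -- hypothetical companion script, not part of the original gist.
# Reports which files in the special folders duplicate an album copy,
# without deleting anything.
import os

from dedupe import find_files, get_md5, get_special_folders  # assumed module name

root_folder = 'Google Photos'
special_folders = get_special_folders()

# Index the album (non-special) folders, mirroring main() above.
album_hashes = {}
for folder in os.listdir(root_folder):
    folder_path = os.path.join(root_folder, folder)
    if folder not in special_folders and os.path.isdir(folder_path):
        for file_path in find_files(folder_path):
            album_hashes[get_md5(file_path)] = file_path

# Report, but do not delete, duplicates in the special folders.
duplicates = 0
for special_folder in special_folders:
    for file_path in find_files(os.path.join(root_folder, special_folder)):
        if get_md5(file_path) in album_hashes:
            print(f"Would delete {file_path}")
            duplicates += 1
print(f"{duplicates} duplicate file(s) would be removed.")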