Last active
May 30, 2024 20:27
-
-
Save adrianjagielak/477b251f764a8a3abc37047c3d4b67d7 to your computer and use it in GitHub Desktop.
Google Photos Takeout: remove duplicate files that exist both in albums and in "Photos from XXXX"/"Archive"/"Trash"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import hashlib | |
import datetime | |
def get_md5(file_path, chunk_size=64 * 1024):
    """Compute the MD5 hash of a file.

    The file is read in fixed-size chunks so that large photos and videos
    are never loaded into memory in one piece.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The hexadecimal MD5 digest of the file's contents.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
        OSError: If the file cannot be opened or read.
    """
    # A chunk size of 0 would make read() return b'' immediately and every
    # file would hash as if it were empty, so reject it up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")
    hasher = hashlib.md5()
    with open(file_path, 'rb') as file:
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: file.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()
def find_files(folder):
    """Yield the path of every file located anywhere beneath ``folder``.

    Each yielded path is the containing directory, as reported by
    ``os.walk``, joined with the file name.
    """
    for dirpath, _dirnames, filenames in os.walk(folder):
        yield from (os.path.join(dirpath, name) for name in filenames)
def get_special_folders(first_year=1979):
    """Return the names of the special (non-album) folders.

    The result is ``'Archive'`` and ``'Trash'`` followed by one
    ``'Photos from <year>'`` entry for every year from ``first_year``
    through the current calendar year, inclusive.

    Args:
        first_year: Earliest year to generate a ``'Photos from <year>'``
            name for. Lower it when the library contains photos dated
            before the default of 1979.

    Returns:
        A list of folder names. If ``first_year`` is later than the
        current year, only ``['Archive', 'Trash']`` is returned.
    """
    current_year = datetime.datetime.now().year
    year_folders = [
        f"Photos from {year}" for year in range(first_year, current_year + 1)
    ]
    return ['Archive', 'Trash'] + year_folders
def main(root_folder='Google Photos'):
    """Delete files in special folders that duplicate files kept in albums.

    Every top-level entry of ``root_folder`` whose name is not returned by
    ``get_special_folders()`` is treated as an album, and the MD5 hash of
    each file beneath it is recorded. The special folders are then scanned
    and any file whose hash was recorded is removed, so the album copy is
    the one that remains.

    Args:
        root_folder: Directory containing the album and special folders.
            Defaults to ``'Google Photos'``, resolved relative to the
            current working directory.

    Raises:
        OSError: If ``root_folder`` cannot be listed, or a file cannot be
            read or removed.
    """
    special_folders = get_special_folders()
    # Set copy for O(1) membership tests; the list keeps the scan order.
    special_names = set(special_folders)

    # Hashes of everything stored in album (non-special) folders. Only
    # membership is ever checked, so a set is sufficient.
    album_hashes = set()
    for folder in os.listdir(root_folder):
        if folder in special_names:
            continue
        for file_path in find_files(os.path.join(root_folder, folder)):
            album_hashes.add(get_md5(file_path))

    # Walk the special folders and delete every file that also lives in an
    # album. Special folders absent from the export yield no files.
    for special_folder in special_folders:
        for file_path in find_files(os.path.join(root_folder, special_folder)):
            if get_md5(file_path) in album_hashes:
                print(f"Deleting duplicate {file_path} ...")
                os.remove(file_path)
# Run the de-duplication only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment