Script to clean up duplicates in Immich based on their file size.
#!/usr/bin/env python3
# Script to clean up duplicates in Immich based on their file size.
#
# Rationale: I observed that photos to which I had applied Google Photos' "Optimize" filter
# are imported in both their original and their edited version, and the edited version usually
# has a smaller file size despite the same resolution (lower JPEG quality, maybe?).
# This script iterates over all duplicates detected by Immich itself and, for each pair,
# removes the smaller one if file names and creation dates match.
#
# Prerequisites:
# 1. Run Immich Smart Search
# 2. Run Immich duplicate detection (with a threshold of 0.1)
# 3. Create an API key
# 4. Run this script (see the example invocation below)
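#
# Example invocation (a sketch; the script file name and URL are hypothetical,
# substitute your own values):
#   python3 immich_dedupe.py --url https://immich.example.com --api_key <your-api-key>
# To keep the smaller file of each pair instead of the larger, pass --keep_smaller:
#   python3 immich_dedupe.py --url https://immich.example.com --api_key <your-api-key> --keep_smaller
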
import argparse
import logging
from typing import Any, Dict, List

import httpx  # pip install httpx

logging.basicConfig(level=logging.INFO)

class DuplicatesCleaner:
    def __init__(self, url: str, api_key: str, keep_smaller: bool = False):
        # By default, the smaller file of each duplicate pair is deleted and the
        # larger one kept (see rationale above).
        self.keep_smaller: bool = keep_smaller
        self.client: httpx.Client = httpx.Client(base_url=f'{url}/api', headers={
            'x-api-key': api_key
        })

    def run(self):
        self.batch_delete(self.filter_duplicates(self.fetch_duplicates()))
    def fetch_duplicates(self) -> List[Dict[str, Any]]:
        r = self.client.get('duplicates')
        r.raise_for_status()
        return r.json()
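
    # For reference, each duplicate group returned by GET /api/duplicates looks
    # roughly like the following (abridged to the fields this script actually
    # reads; the real response contains more asset metadata):
    #
    # {
    #     'duplicateId': '...',
    #     'assets': [
    #         {
    #             'id': '...',
    #             'originalFileName': 'IMG_1234.jpg',
    #             'fileCreatedAt': '2024-08-01T20:36:00.000Z',
    #             'exifInfo': {'fileSizeInByte': 123456},
    #         },
    #         ...
    #     ],
    # }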
    def filter_duplicates(self, duplicates: List[Dict[str, Any]]) -> List[str]:
        # Returns the list of asset ids to delete.
        to_delete: List[str] = []
        for d in duplicates:
            if len(d['assets']) < 2:
                continue
            # Sort so that the asset to delete comes first: ascending by file size
            # when keeping the larger one (default), descending when keeping the smaller one.
            assets = sorted(d['assets'], key=lambda p: p['exifInfo']['fileSizeInByte'], reverse=self.keep_smaller)
            names = {p['originalFileName'] for p in assets}
            created_dates = {p['fileCreatedAt'] for p in assets}
            if len(names) > 1 or len(created_dates) > 1:
                logging.info(f'skipping duplicate {d["duplicateId"]} as file names or creation dates differ')
                continue
            to_delete.append(assets[0]['id'])
        logging.info(f'found {len(to_delete)} duplicates to delete')
        return to_delete
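
    # Example: given a pair with file sizes [1.2 MB, 3.4 MB] and identical names
    # and creation dates, the default settings sort ascending, so the 1.2 MB asset
    # ends up at assets[0] and is marked for deletion, keeping the larger original.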
    def batch_delete(self, ids: List[str]):
        # Note: 'force': True deletes the assets permanently instead of moving them to the trash.
        r = self.client.request('DELETE', 'assets', json={
            'force': True,
            'ids': ids,
        })
        r.raise_for_status()
        logging.info(f'deleted {len(ids)} duplicates')
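
    # The request above corresponds to the following raw API call (a sketch derived
    # from how the httpx client is configured; <base-url> and <api-key> are placeholders):
    #
    #   curl -X DELETE '<base-url>/api/assets' \
    #        -H 'x-api-key: <api-key>' \
    #        -H 'Content-Type: application/json' \
    #        -d '{"force": true, "ids": ["<asset-id>", ...]}'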

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', type=str, required=True, help='Your Immich base URL')
    parser.add_argument('--api_key', type=str, required=True, help='Your Immich API key')
    parser.add_argument('--keep_smaller', action='store_true', help='Keep the smaller (by file size) of the duplicates instead of the larger')
    args = parser.parse_args()
    c: DuplicatesCleaner = DuplicatesCleaner(args.url, args.api_key, args.keep_smaller)
    c.run()