@muety
Created August 1, 2024 20:36
Script to clean up duplicates in Immich based on their file size.
#!/usr/bin/python
# Script to clean up duplicates in Immich based on their file size.
#
# Rationale: I observed that photos to which I had applied Google Photos' "Optimize" filter are imported in both their original and edited versions,
# with the edited version usually having a smaller file size despite the same resolution (lower JPEG quality, maybe?).
# This script iterates over all duplicates detected by Immich itself and, for each duplicate group, removes the smallest asset
# if the file names and creation dates of all assets in the group match.
#
# Prerequisites:
# 1. Run Immich Smart Search
# 2. Run Immich duplicate detection (with threshold of 0.1)
# 3. Create an API key
# 4. Run this script

import argparse
import logging
from typing import Any, Dict, List

import httpx  # pip install httpx

logging.basicConfig(level=logging.INFO)


class DuplicatesCleaner:
    def __init__(self, url: str, api_key: str, keep_smaller: bool = False):
        # keep_smaller=False (the default) keeps the larger file of each duplicate
        # group and deletes the smallest one, as described in the rationale above
        self.keep_smaller: bool = keep_smaller
        # httpx joins relative request paths onto this base URL
        self.client: httpx.Client = httpx.Client(base_url=f'{url}/api', headers={
            'x-api-key': api_key
        })

    def run(self):
        self.batch_delete(self.filter_duplicates(self.fetch_duplicates()))

    def fetch_duplicates(self) -> List[Dict[str, Any]]:
        r = self.client.get('duplicates')
        r.raise_for_status()
        return r.json()
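
    # Note: the shape of the /api/duplicates response sketched below is assumed
    # (inferred from the fields this script accesses), not copied from the API docs:
    # [
    #   {
    #     'duplicateId': '...',
    #     'assets': [
    #       {
    #         'id': '...',
    #         'originalFileName': 'IMG_1234.jpg',
    #         'fileCreatedAt': '2024-08-01T20:36:00.000Z',
    #         'exifInfo': {'fileSizeInByte': 123456},
    #       },
    #       ...
    #     ],
    #   },
    #   ...
    # ]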

    def filter_duplicates(self, duplicates: List[Dict[str, Any]]) -> List[str]:
        # Returns the list of asset ids to delete.
        to_delete: List[str] = []
        for d in duplicates:
            if len(d['assets']) < 2:
                continue
            # Sort so that the asset to delete comes first: the smallest file when
            # keeping the larger one, the largest file when keeping the smaller one.
            assets = sorted(d['assets'], key=lambda p: p['exifInfo']['fileSizeInByte'], reverse=self.keep_smaller)
            names = {p['originalFileName'] for p in assets}
            created_dates = {p['fileCreatedAt'] for p in assets}
            if len(names) > 1 or len(created_dates) > 1:
                logging.info(f'skipping duplicate {d["duplicateId"]} as the file names or dates differ')
                continue
            to_delete.append(assets[0]['id'])
        logging.info(f'got {len(to_delete)} assets to delete')
        return to_delete

    def batch_delete(self, ids: List[str]):
        # force=True deletes the assets permanently instead of moving them to the trash
        r = self.client.request('DELETE', 'assets', json={
            'force': True,
            'ids': ids,
        })
        r.raise_for_status()
        logging.info(f'deleted {len(ids)} duplicates')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', type=str, required=True, help='Your Immich base URL')
    parser.add_argument('--api_key', type=str, required=True, help='Your Immich API key')
    parser.add_argument('--keep_smaller', action='store_true', help='Keep the smaller (by file size) of the duplicate assets instead of the larger one')
    args = parser.parse_args()
    c: DuplicatesCleaner = DuplicatesCleaner(args.url, args.api_key, args.keep_smaller)
    c.run()
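
Example invocation (the hostname and API key are placeholders, and the script can be saved under any file name):

    python3 immich_dedupe.py --url https://immich.example.com --api_key YOUR_API_KEY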