Skip to content

Instantly share code, notes, and snippets.

@adrianjagielak
Last active October 16, 2023 15:22
Show Gist options
  • Save adrianjagielak/7d6eb8cb750c93286438459f2a2b19b7 to your computer and use it in GitHub Desktop.
Save adrianjagielak/7d6eb8cb750c93286438459f2a2b19b7 to your computer and use it in GitHub Desktop.
Fix Google Photos Takeout timestamps, GPS, and broken files: populate the file creation time (all files), the file modification time (all files), and the EXIF DateTimeOriginal (images); clear the GPS EXIF data if it is invalid (0,0); and remove invalid broken images (move them to a "corrupted_files" dir).
import os
import json
from PIL import Image, UnidentifiedImageError
import datetime
import re
import shutil
def set_file_timestamps(filename, timestamp):
    """Set the access and modification times of *filename* to *timestamp*.

    Args:
        filename: Path of the file to touch.
        timestamp: Seconds since the Unix epoch, as an int or a string of
            digits (the form it is read in from the metadata JSON).

    Raises:
        ValueError: If *timestamp* cannot be converted with ``int()``.
        OSError: If the file's times cannot be updated.

    ``os.utime`` only sets the access and modification times; it has no
    parameter for a creation time.
    """
    # Pass the epoch value straight to os.utime. Converting it to a naive
    # local datetime and back yields the same number, but raises for values
    # outside the platform's localtime range.
    epoch_seconds = int(timestamp)
    os.utime(filename, (epoch_seconds, epoch_seconds))
def _is_zero_coordinate(value):
    """Return True when an EXIF GPS coordinate is present and all of its parts are zero."""
    if value is None:
        return False
    try:
        parts = [float(part) for part in value]
    except (TypeError, ValueError, ZeroDivisionError):
        return False
    return bool(parts) and all(part == 0 for part in parts)


def set_image_datetime_original(filename, timestamp, corrupted_folder="corrupted_files"):
    """Write EXIF DateTimeOriginal into a JPEG/TIFF and quarantine unreadable images.

    For JPEG and TIFF files the EXIF block is updated and the image is
    re-saved in place (``quality=100``):

    * ``DateTimeOriginal`` is set to *timestamp*, rendered in local time as
      ``YYYY:MM:DD HH:MM:SS``.
    * If both GPSLatitude and GPSLongitude are all-zero, the latitude and
      longitude tags and their reference tags are removed.

    Other image formats are only decoded to check that they are readable;
    they are not rewritten. Files Pillow cannot identify as images are
    skipped silently.

    Args:
        filename: Path of the file to update.
        timestamp: Seconds since the Unix epoch (int or digit string).
        corrupted_folder: Directory that receives files which raise
            ``OSError`` while being read or written. Created on demand.

    Relies on ``Image.Exif.get_ifd`` being available in the installed Pillow.
    """
    exif_ifd_pointer = 0x8769   # pointer tag of the Exif sub-IFD
    gps_ifd_pointer = 0x8825    # pointer tag of the GPS sub-IFD
    datetime_original = 0x9003  # 36867, DateTimeOriginal (lives in the Exif sub-IFD)
    gps_latitude_ref, gps_latitude = 0x0001, 0x0002
    gps_longitude_ref, gps_longitude = 0x0003, 0x0004

    try:
        # The context manager closes the file before the except handlers
        # run, so the quarantine move below is not blocked by an open handle.
        with Image.open(filename) as img:
            if img.format not in ("JPEG", "TIFF"):
                # Force a decode so truncated files still raise OSError, but
                # do not re-encode: saving would drop the extra frames of an
                # animated GIF and recompress WebP.
                img.load()
                return

            exif = img.getexif()
            dt_str = datetime.datetime.fromtimestamp(int(timestamp)).strftime('%Y:%m:%d %H:%M:%S')

            # Mutate the sub-IFD in place; register it on the Exif object
            # only when the image had no Exif sub-IFD, otherwise it would
            # not be serialised.
            exif_ifd = exif.get_ifd(exif_ifd_pointer)
            exif_ifd[datetime_original] = dt_str
            if exif_ifd_pointer not in exif:
                exif[exif_ifd_pointer] = exif_ifd

            # GPS coordinates are stored in the GPS sub-IFD as degree/minute/
            # second rationals, so test every component for zero.
            gps_ifd = exif.get_ifd(gps_ifd_pointer)
            if (_is_zero_coordinate(gps_ifd.get(gps_latitude))
                    and _is_zero_coordinate(gps_ifd.get(gps_longitude))):
                for tag in (gps_latitude_ref, gps_latitude, gps_longitude_ref, gps_longitude):
                    gps_ifd.pop(tag, None)

            # Pass the edited Exif object so the changes are actually written.
            img.save(filename, exif=exif, quality=100)
    except UnidentifiedImageError:
        # If the file isn't a recognized image format, just skip setting the EXIF data.
        pass
    except OSError as e:
        print(f"Error encountered for file {filename}: {e}")
        os.makedirs(corrupted_folder, exist_ok=True)
        # NOTE(review): files with the same base name from different
        # directories share one destination path here.
        dest_path = os.path.join(corrupted_folder, os.path.basename(filename))
        shutil.move(filename, dest_path)
        print(f"Moved corrupted file {filename} to {dest_path}")
def get_json_filename(filename, root):
    """Locate the metadata ``.json`` file that belongs to *filename* inside *root*.

    A fixed sequence of candidate names is probed in order and the first one
    that exists on disk is returned as a full path:

    1. ``photo.jpg``   -> ``photo.jpg.json``
    2. ``photo.jpg``   -> ``photo.json``
    3. ``a(1).jpg``    -> ``a.jpg(1).json``   (numbered names only)
    4. the name cut to 46, then 47, then 45 characters, plus ``.json``
    5. ``a.MP4``       -> ``a.HEIC.json``
    6. ``a.MP4``       -> ``a.jpg.json``
    7. ``a(1).MP4``    -> ``a.HEIC(1).json``  (numbered names only)
    8. ``a(1).MP4``    -> ``a.jpg(1).json``   (numbered names only)
    9. ``a(2).mp4``    -> ``a.mp4.json``      (numbered names only)

    Progress is printed for every lookup. Returns ``None`` when no candidate
    exists.
    """
    print(f"\nProcessing: {filename}")

    stem, _extension = os.path.splitext(filename)
    numbered = re.search(r'(.+)(\(\d+\))(\..+)$', filename)

    # Each entry: (label used in the messages, candidate file name,
    # whether a miss is reported). The first two probes stay silent on a miss.
    probes = [
        ("first", filename + '.json', False),
        ("second", stem + '.json', False),
    ]
    if numbered:
        prefix, counter, suffix = numbered.groups()
        probes.append(("third", prefix + suffix + counter + '.json', True))
    for length in (46, 47, 45):
        probes.append(("fourth", filename[:length] + '.json', True))
    probes.append(("fifth", stem + '.HEIC.json', True))
    probes.append(("sixth", stem + '.jpg.json', True))
    if numbered:
        probes.append(("seventh", prefix + '.HEIC' + counter + '.json', True))
        probes.append(("eigth", prefix + '.jpg' + counter + '.json', True))
        probes.append(("ninth", prefix + suffix + '.json', True))

    for label, candidate_name, report_miss in probes:
        candidate = os.path.join(root, candidate_name)
        if os.path.exists(candidate):
            print(f"Found using {label} approach: {candidate}")
            return candidate
        if report_miss:
            print(f"Expected but not found ({label} approach): {candidate}")

    # None of the naming schemes matched.
    return None
def process_directory(directory):
    """Apply the metadata timestamp to every non-JSON file under *directory*.

    The tree is walked recursively. ``.DS_Store``, ``archive_browser.html``,
    names without a dot, and ``.json`` files are skipped. For every other
    file the matching metadata JSON is located, ``photoTakenTime.timestamp``
    is read from it, and the value is applied to the image EXIF data and to
    the file's timestamps.

    Args:
        directory: Root folder to process.

    Raises:
        Exception: If no metadata file is found for a file, or the metadata
            has no usable ``photoTakenTime.timestamp``. Processing stops at
            the first such file.
    """
    skipped_names = (".DS_Store", "archive_browser.html")

    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if filename in skipped_names:
                continue
            # Names without a dot have no extension to work with.
            if '.' not in filename:
                continue
            # The metadata files themselves are not processed.
            if filename.endswith('.json'):
                continue

            json_filename = get_json_filename(filename, root)
            if not json_filename:
                raise Exception(f"Metadata not found for {filename}")

            # Read as UTF-8 explicitly instead of the platform default encoding.
            with open(json_filename, 'r', encoding='utf-8') as f:
                data = json.load(f)

            timestamp = data.get('photoTakenTime', {}).get('timestamp')
            if not timestamp:
                raise Exception(f"photoTakenTime.timestamp not found or invalid in {json_filename}")

            file_path = os.path.join(root, filename)
            set_image_datetime_original(file_path, timestamp)
            # set_image_datetime_original moves unreadable images out of the
            # tree; touching the old path would raise FileNotFoundError and
            # abort the whole run.
            if os.path.exists(file_path):
                set_file_timestamps(file_path, timestamp)
if __name__ == "__main__":
    # Script entry point: prompt for the export folder and process it.
    process_directory(input("Enter the directory path: "))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment