Last active
October 16, 2023 15:22
-
-
Save adrianjagielak/7d6eb8cb750c93286438459f2a2b19b7 to your computer and use it in GitHub Desktop.
Fix Google Photos Takeout timestamps, GPS data, and broken files: populate the file creation time (all files), the file modification time (all files), and the EXIF DateTimeOriginal tag (images); clear the GPS EXIF data if it is invalid (0,0); and remove invalid or broken images (move them to a "corrupted_files" dir).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
from PIL import Image, UnidentifiedImageError | |
import datetime | |
import re | |
import shutil | |
def set_file_timestamps(filename, timestamp):
    """Set the access and modification times of a file to a Unix timestamp.

    Args:
        filename: Path of the file to update.
        timestamp: Seconds since the Unix epoch, as an int or a numeric
            string (the form stored in the Takeout metadata JSON).

    Raises:
        ValueError: If ``timestamp`` cannot be converted with ``int()``.
        OSError: If the file does not exist or its times cannot be changed.

    Note:
        ``os.utime`` only sets the access and modification times. Whether
        the creation time reported by the operating system changes as well
        is platform and filesystem dependent.
    """
    # Pass the epoch value straight to os.utime. Converting it to a naive
    # local datetime and back adds nothing and can fail for negative or
    # out-of-range values on some platforms.
    epoch_seconds = int(timestamp)
    os.utime(filename, (epoch_seconds, epoch_seconds))
def _is_zero_gps_coordinate(value):
    """Return True when a GPS coordinate is present and all components are zero."""
    try:
        components = tuple(value)
        return bool(components) and all(float(part) == 0 for part in components)
    except (TypeError, ValueError, ZeroDivisionError):
        # Missing tag (None) or a representation we do not understand:
        # report "not zero" so that valid data is never stripped.
        return False


def set_image_datetime_original(filename, timestamp, corrupted_folder="corrupted_files"):
    """Set the EXIF DateTimeOriginal tag of an image and quarantine broken files.

    For JPEG and TIFF images the tag is written into the Exif sub-IFD and the
    image is re-saved with the updated EXIF block. If both GPSLatitude and
    GPSLongitude are all-zero, the four GPS position tags are removed.
    Other image formats are only decoded to check that they are readable;
    they are not rewritten, because re-encoding them cannot add EXIF data and
    may lose information. Files Pillow cannot identify as images are skipped.

    Args:
        filename: Path of the file to process.
        timestamp: Seconds since the Unix epoch (int or numeric string); it is
            formatted in local time as ``YYYY:MM:DD HH:MM:SS``.
        corrupted_folder: Directory that receives files which raise
            ``OSError`` while being read or written. Created if missing.

    Note:
        Uses ``Image.getexif()`` and ``Exif.get_ifd()``, so it needs a Pillow
        release that provides both (TODO confirm the minimum version).
    """
    exif_ifd_tag = 0x8769  # pointer to the Exif sub-IFD
    gps_ifd_tag = 0x8825  # pointer to the GPS sub-IFD
    datetime_original_tag = 36867  # DateTimeOriginal, stored in the Exif sub-IFD
    gps_latitude_ref_tag = 0x0001
    gps_latitude_tag = 0x0002
    gps_longitude_ref_tag = 0x0003
    gps_longitude_tag = 0x0004

    # The new image is written next to the original and swapped in only after
    # a successful save, so a failed save never destroys the source file.
    temp_path = filename + ".exif_tmp"
    rewritten = False

    try:
        with Image.open(filename) as img:
            # Only these formats are treated as EXIF-capable.
            if img.format in ("JPEG", "TIFF"):
                dt_str = datetime.datetime.fromtimestamp(int(timestamp)).strftime('%Y:%m:%d %H:%M:%S')

                exif = img.getexif()
                exif_ifd = exif.get_ifd(exif_ifd_tag)
                exif_ifd[datetime_original_tag] = dt_str
                if exif_ifd_tag not in exif:
                    # No Exif sub-IFD existed yet; attach the new one explicitly.
                    exif[exif_ifd_tag] = exif_ifd

                # A position of exactly (0, 0) is treated as "no location".
                gps_ifd = exif.get_ifd(gps_ifd_tag)
                if (_is_zero_gps_coordinate(gps_ifd.get(gps_latitude_tag))
                        and _is_zero_gps_coordinate(gps_ifd.get(gps_longitude_tag))):
                    for tag in (gps_latitude_ref_tag, gps_latitude_tag,
                                gps_longitude_ref_tag, gps_longitude_tag):
                        gps_ifd.pop(tag, None)
                    exif[gps_ifd_tag] = gps_ifd

                # The temporary name has a foreign extension, so the output
                # format must be given explicitly.
                img.save(temp_path, format=img.format, exif=exif, quality=100)
                rewritten = True
            else:
                # Force a full decode so truncated or broken files still raise
                # OSError and get quarantined, without rewriting the file.
                img.load()

        if rewritten:
            os.replace(temp_path, filename)
    except UnidentifiedImageError:
        # Not a recognized image format (e.g. a video): nothing to do here.
        pass
    except OSError as e:
        print(f"Error encountered for file {filename}: {e}")

        # Drop a partially written temporary file, if any.
        if os.path.exists(temp_path):
            os.remove(temp_path)

        os.makedirs(corrupted_folder, exist_ok=True)

        # Pick a destination that does not overwrite an earlier quarantined
        # file with the same name coming from another directory.
        stem, suffix = os.path.splitext(os.path.basename(filename))
        dest_path = os.path.join(corrupted_folder, stem + suffix)
        counter = 1
        while os.path.exists(dest_path):
            dest_path = os.path.join(corrupted_folder, f"{stem}_{counter}{suffix}")
            counter += 1

        shutil.move(filename, dest_path)
        print(f"Moved corrupted file {filename} to {dest_path}")
def get_json_filename(filename, root):
    """Locate the metadata JSON that belongs to a media file.

    A fixed sequence of candidate names is tried inside ``root``; the first
    one that exists on disk is returned.

    Args:
        filename: Name of the media file (no directory part).
        root: Directory that contains the media file.

    Returns:
        The path of the first existing candidate, or ``None`` if none exists.
    """
    print(f"\nProcessing: {filename}")  # Announce the file being resolved

    stem, _suffix = os.path.splitext(filename)
    # Splits "name(1).ext" into ("name", "(1)", ".ext")
    numbered = re.search(r'(.+)(\(\d+\))(\..+)$', filename)

    def sidecar(name):
        return os.path.join(root, name + '.json')

    # Each attempt: (label used in messages, candidate path, report a miss?)
    attempts = [
        ("first", sidecar(filename), False),   # photo.jpg -> photo.jpg.json
        ("second", sidecar(stem), False),      # photo.jpg -> photo.json
    ]
    if numbered:
        head, counter, tail = numbered.groups()
        # 20200912_122446(1).jpg -> 20200912_122446.jpg(1).json
        attempts.append(("third", sidecar(head + tail + counter), True))
    # Truncated names, tried at 46, then 47, then 45 characters
    attempts.extend(
        ("fourth", sidecar(filename[:length]), True) for length in (46, 47, 45)
    )
    # Apple Live Photo companions: same stem with a HEIC or jpg still image
    attempts.append(("fifth", sidecar(stem + '.HEIC'), True))
    attempts.append(("sixth", sidecar(stem + '.jpg'), True))
    if numbered:
        # file(1).MP4 -> file.HEIC(1).json
        attempts.append(("seventh", sidecar(head + '.HEIC' + counter), True))
        # file(1).MP4 -> file.jpg(1).json
        attempts.append(("eigth", sidecar(head + '.jpg' + counter), True))
        # 20201008_110754(2).mp4 -> 20201008_110754.mp4.json
        attempts.append(("ninth", sidecar(head + tail), True))

    for label, candidate, report_miss in attempts:
        if os.path.exists(candidate):
            print(f"Found using {label} approach: {candidate}")
            return candidate
        if report_miss:
            print(f"Expected but not found ({label} approach): {candidate}")

    # Nothing matched
    return None
def process_directory(directory):
    """Apply the Takeout metadata timestamp to every media file under a directory.

    Walks ``directory`` recursively. For each file that is not a ``.json``
    file (and not one of the skipped housekeeping names) the matching metadata
    JSON is looked up, ``photoTakenTime.timestamp`` is read from it, and that
    value is applied to the EXIF data and to the file times.

    Args:
        directory: Root directory to process.

    Raises:
        Exception: If no metadata JSON is found for a file, or the JSON has no
            usable ``photoTakenTime.timestamp`` value. Processing stops at the
            first such file.
    """
    skipped_names = {".DS_Store", "archive_browser.html"}

    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if filename in skipped_names:
                continue

            # Files without any dot in their name are skipped
            if '.' not in filename:
                continue

            # The metadata files themselves are not processed
            if filename.endswith('.json'):
                continue

            json_filename = get_json_filename(filename, root)
            if not json_filename:
                raise Exception(f"Metadata not found for {filename}")

            # JSON text is UTF-8; do not depend on the platform default encoding
            with open(json_filename, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # "or {}" also covers an explicit null photoTakenTime
            timestamp = (data.get('photoTakenTime') or {}).get('timestamp')
            if not timestamp:
                raise Exception(f"photoTakenTime.timestamp not found or invalid in {json_filename}")

            file_path = os.path.join(root, filename)

            set_image_datetime_original(file_path, timestamp)

            # The EXIF step moves unreadable images out of the tree; only touch
            # the file times if the file is still where it was.
            if os.path.exists(file_path):
                set_file_timestamps(file_path, timestamp)
if __name__ == "__main__":
    # Script entry point: ask for the directory to process.
    # Surrounding whitespace and quotes (common when a path is pasted) are
    # removed before use.
    directory = input("Enter the directory path: ").strip().strip('"\'')
    # os.walk yields nothing for a path that is not a directory, so a typo
    # would otherwise look like a successful run.
    if not os.path.isdir(directory):
        raise SystemExit(f"Not a directory: {directory}")
    process_directory(directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment