Last active
May 21, 2025 15:34
-
-
Save RhetTbull/9035ff260d123413012758252a76d82a to your computer and use it in GitHub Desktop.
Generate video captions with Apple Intelligence
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env -S uv run --script | |
"""Generate captions for videos in Apple Photos using Apple's Media Analysis Service""" | |
# run with uv: `uv run https://gist.githubusercontent.com/RhetTbull/9035ff260d123413012758252a76d82a/raw/a5be13b0bcb1d5c2224ba1bb0828b652601b2482/video_captions.py` | |
# The following inline script metadata lets you run the script with uv via `uv run video_captions.py`
# /// script | |
# dependencies = [ | |
# "pyobjc-core", | |
# "pyobjc-framework-Photos", | |
# "photokit", | |
# ] | |
# /// | |
import json | |
import sys | |
import threading | |
from typing import cast | |
import objc | |
from Foundation import NSArray, NSDictionary | |
from Photos import PHAsset | |
import photokit | |
# Resolve the Objective-C class by name through the runtime; it is looked up
# dynamically rather than imported from the Photos bindings.
PLMediaAnalysisServiceRequestAdapter = objc.lookUpClass(
    "PLMediaAnalysisServiceRequestAdapter"
)

# Signature of the completionHandler block: it returns nothing and receives
# the block itself followed by two object arguments.
_COMPLETION_HANDLER_METADATA = {
    "callable": {
        "retval": {"type": b"v"},
        "arguments": {
            0: {"type": b"^v"},  # block self
            1: {"type": b"@"},  # results dictionary
            2: {"type": b"@"},  # error
        },
    }
}

# Per-argument metadata for the caption request selector. Indices are offset
# by 2 (the original code spelled them `2 + 2` and `2 + 3`), which skips the
# implicit `self` and `_cmd` arguments of an Objective-C method.
_CAPTION_REQUEST_METADATA = {
    "arguments": {
        4: {"type": b"@?"},  # progressHandler (block type)
        5: _COMPLETION_HANDLER_METADATA,  # completionHandler (block type)
    }
}

# Register the metadata so PyObjC knows how to bridge the block callbacks.
objc.registerMetaDataForSelector(
    b"PLMediaAnalysisServiceRequestAdapter",
    b"requestVideoCaptionPreferenceForAssets:withOptions:progressHandler:completionHandler:",
    _CAPTION_REQUEST_METADATA,
)
def ns_to_py(obj):
    """Turn Foundation containers into plain Python containers, recursively.

    An NSDictionary becomes a ``dict`` whose keys are passed through ``str()``,
    an NSArray becomes a ``list``, and any other object is returned unchanged.
    """
    if isinstance(obj, NSDictionary):
        converted = {}
        for key in obj:
            converted[str(key)] = ns_to_py(obj[key])
        return converted

    if isinstance(obj, NSArray):
        return list(map(ns_to_py, obj))

    # Not a container: treat it as a scalar and hand it back as-is.
    return obj
def generate_caption_for_video_assets(assets) -> dict[str, dict]:
    """Request AI-generated captions for the given video assets.

    The request is handed to ``PLMediaAnalysisServiceRequestAdapter`` and this
    function blocks until the completion handler has run.

    Args:
        assets: A list of Photos.PHAsset instances for the videos to caption.

    Returns:
        A dictionary mapping each video's original filename to its caption
        result (a dict of various values). Videos that share an original
        filename overwrite one another. An empty ``assets`` yields ``{}``
        without contacting the service.

    Raises:
        RuntimeError: If the service reports an error, or if processing the
            results inside the completion handler fails.
    """
    if not assets:
        # Nothing to caption; skip the service round trip entirely.
        return {}

    captions = {}
    service_error = None  # error description reported by the service
    handler_exception = None  # exception raised while processing the results
    done = threading.Event()

    def _completion_handler(results, error):
        """Collect the caption results, then wake the waiting caller."""
        nonlocal service_error, handler_exception
        try:
            if error is not None:
                service_error = error.localizedDescription()
                return
            # A nil results object converts to None; treat it as "no captions".
            converted = cast(dict, ns_to_py(results)) or {}
            library = photokit.PhotoLibrary()
            for uuid, result in converted.items():
                captions[library.asset(uuid).original_filename] = result
        except Exception as exc:
            # Record the failure so the caller can re-raise it after waking.
            handler_exception = exc
        finally:
            # Always signal completion; otherwise a failure in this handler
            # would leave the caller blocked on done.wait() forever.
            done.set()

    PLMediaAnalysisServiceRequestAdapter.requestVideoCaptionPreferenceForAssets_withOptions_progressHandler_completionHandler_(
        assets,  # assets array
        None,  # options (nil)
        None,  # progress handler (nil)
        _completion_handler,  # completion handler function
    )
    done.wait()  # block until the completion handler has run

    if handler_exception is not None:
        raise RuntimeError(
            "Failed to process video caption results"
        ) from handler_exception
    if service_error is not None:
        # Compare against None so an empty error description is not ignored.
        raise RuntimeError(service_error)
    return captions
def get_selected_video_assets() -> list[PHAsset]:
    """Collect the video assets currently selected in Photos.app.

    Returns:
        list: The PHAsset behind every selected item that is a movie. The
            list is empty when the selection is empty or contains no movies.
    """
    videos = []
    selected = photokit.PhotoLibrary().selection()
    if selected:
        for item in selected:
            # Keep only movies, unwrapping the underlying PHAsset.
            if item.ismovie:
                videos.append(item.phasset)
    return videos
def main():
    """Caption the videos selected in Photos.app and print the result as JSON.

    Exits with status 1 when no videos are selected.
    """
    print("Getting selected video assets...", flush=True)
    assets = get_selected_video_assets()
    if not assets:
        print("No videos selected in Photos.app.")
        sys.exit(1)

    summary = f"Found {len(assets)} selected video{'s' if len(assets) != 1 else ''} in Photos.app"
    print(summary, flush=True)

    print("Requesting caption (this might take a moment)...", flush=True)
    captions = generate_caption_for_video_assets(assets)
    print(json.dumps(captions, indent=4))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This can be run via uv directly from the gist:
uv run https://gist.githubusercontent.com/RhetTbull/9035ff260d123413012758252a76d82a/raw/a5be13b0bcb1d5c2224ba1bb0828b652601b2482/video_captions.py