RhetTbull · May 21, 2025 15:34 · RhetTbull · May 21, 2025
diff --git a/video_captions.py b/video_captions.py
 #!/usr/bin/env -S uv run --script
 """Generate captions for videos in Apple Photos using Apple's Media Analysis Service"""

 # run with uv: `uv run https://gist.githubusercontent.com/RhetTbull/9035ff260d123413012758252a76d82a/raw/a5be13b0bcb1d5c2224ba1bb0828b652601b2482/video_captions.py`

 # The following inline script metadata allows you to run the script with uv via `uv run video_captions.py`
 # /// script
 # dependencies = [
 #   "pyobjc-core",
 #   "pyobjc-framework-Photos",
 #   "photokit",
 # ]
 # ///

 import json
 import sys
 import threading
 from typing import cast

 import objc
 from Foundation import NSArray, NSDictionary
 from Photos import PHAsset

 import photokit

 # Resolve the adapter class by name through the Objective-C runtime.
 PLMediaAnalysisServiceRequestAdapter = objc.lookUpClass("PLMediaAnalysisServiceRequestAdapter")

 # Signature of the completionHandler block: returns void, receives the block
 # itself followed by two objects (results dictionary, error).
 _COMPLETION_HANDLER_SIGNATURE = {
    "retval": {"type": b"v"},
    "arguments": {
        0: {"type": b"^v"},  # block self
        1: {"type": b"@"},  # results dictionary
        2: {"type": b"@"},  # error
    },
 }

 # Register metadata for the method so PyObjC knows how to handle the callbacks.
 # The argument keys 4 and 5 are 2 + 2 and 2 + 3: the two explicit block
 # arguments offset by 2 (presumably for the implicit self/_cmd slots -- confirm
 # against the PyObjC metadata documentation).
 objc.registerMetaDataForSelector(
    b"PLMediaAnalysisServiceRequestAdapter",
    b"requestVideoCaptionPreferenceForAssets:withOptions:progressHandler:completionHandler:",
    {
        "arguments": {
            4: {"type": b"@?"},  # progressHandler (block type)
            5: {"callable": _COMPLETION_HANDLER_SIGNATURE},  # completionHandler (block type)
        }
    },
 )


 def ns_to_py(obj):
    """Convert Foundation containers to native Python containers, recursively.

    Args:
        obj: An NSDictionary, an NSArray, or any other value.

    Returns:
        A dict (keys passed through ``str``) for an NSDictionary, a list for
        an NSArray, and ``obj`` itself, unchanged, for anything else.
    """
    if isinstance(obj, NSDictionary):
        converted = {}
        for key in obj:
            # Keys are stringified; values are converted depth-first.
            converted[str(key)] = ns_to_py(obj[key])
        return converted

    if isinstance(obj, NSArray):
        return [ns_to_py(element) for element in obj]

    # Not a container this function knows about: assume a scalar and pass it through.
    return obj


 def generate_caption_for_video_assets(assets) -> dict[str, dict]:
    """
    Generate AI-powered captions for the given video assets.

    Submits the assets to PLMediaAnalysisServiceRequestAdapter and blocks the
    calling thread until the completion handler has run.

    Args:
        assets:     A list of Photos.PHAsset instances for videos to caption.

    Returns:
        A dictionary mapping each video's original filename to its caption
        result (as dict of various values). Assets that share an original
        filename overwrite one another. An empty ``assets`` yields an empty
        dictionary without calling the service.

    Raises:
        RuntimeError: If the service reports an error to the completion handler.
        Exception: Any exception raised while processing results inside the
            completion handler is re-raised here, in the calling thread.
    """
    if not assets:
        # Nothing to caption; skip the service round trip entirely.
        return {}

    captions = {}
    service_error = None
    handler_failure = None
    done = threading.Event()

    def _completion_handler(results, error):
        """Completion handler for video caption generation."""
        nonlocal service_error, handler_failure
        try:
            if error is not None:
                # Fall back to the error object itself if it has no description,
                # so a reported error is never mistaken for success.
                service_error = str(error.localizedDescription() or error)
                return

            # A nil results object converts to None; treat it as "no captions".
            converted = cast(dict, ns_to_py(results)) or {}
            pl = photokit.PhotoLibrary()
            for uuid, result in converted.items():
                asset = pl.asset(uuid)
                captions[asset.original_filename] = result
        except Exception as exc:
            # Callback boundary: hand the failure to the waiting thread instead
            # of letting it escape into the code that invoked the block.
            handler_failure = exc
        finally:
            # Always release the waiter, otherwise done.wait() blocks forever.
            done.set()

    PLMediaAnalysisServiceRequestAdapter.requestVideoCaptionPreferenceForAssets_withOptions_progressHandler_completionHandler_(
        assets,  # assets array
        None,  # options (nil)
        None,  # progress handler (nil)
        _completion_handler,  # completion handler function
    )
    done.wait()  # wait for completion, completion handler will be called when done

    if handler_failure is not None:
        raise handler_failure
    if service_error is not None:
        raise RuntimeError(service_error)
    return captions


 def get_selected_video_assets() -> list[PHAsset]:
    """
    Collect the video assets from the current photokit selection.

    Returns:
        list: The ``phasset`` of every selected item whose ``ismovie`` flag is
              true; an empty list when photokit reports no selection.
    """
    library = photokit.PhotoLibrary()
    selected = library.selection()

    videos = []
    if selected:
        for item in selected:
            # Keep only movies; other selected items are ignored.
            if item.ismovie:
                videos.append(item.phasset)
    return videos


 if __name__ == "__main__":
    # Step 1: find out which videos are selected.
    print("Getting selected video assets...", flush=True)
    assets = get_selected_video_assets()
    if assets:
        print(
            f"Found {len(assets)} selected video{'s' if len(assets) != 1 else ''} in Photos.app",
            flush=True,
        )
    else:
        # Nothing to do: report it and exit with a non-zero status.
        print("No videos selected in Photos.app.")
        sys.exit(1)

    # Step 2: request the captions (blocks until done) and dump them as JSON.
    print("Requesting caption (this might take a moment)...", flush=True)
    results = generate_caption_for_video_assets(assets)
    serialized = json.dumps(results, indent=4)
    print(serialized)
	#!/usr/bin/env -S uv run --script
	"""Generate captions for videos in Apple Photos using Apple's Media Analysis Service"""

	# run with uv: `uv run https://gist.githubusercontent.com/RhetTbull/9035ff260d123413012758252a76d82a/raw/a5be13b0bcb1d5c2224ba1bb0828b652601b2482/video_captions.py`

	# The following inline script metadata allows you to run the script with uv via `uv run video_captions.py`
	# /// script
	# dependencies = [
	# "pyobjc-core",
	# "pyobjc-framework-Photos",
	# "photokit",
	# ]
	# ///

	import json
	import sys
	import threading
	from typing import cast

	import objc
	from Foundation import NSArray, NSDictionary
	from Photos import PHAsset

	import photokit

	# Resolve the adapter class by name through the Objective-C runtime.
	PLMediaAnalysisServiceRequestAdapter = objc.lookUpClass("PLMediaAnalysisServiceRequestAdapter")

	# Signature of the completionHandler block: returns void, receives the block
	# itself followed by two objects (results dictionary, error).
	_COMPLETION_HANDLER_SIGNATURE = {
	    "retval": {"type": b"v"},
	    "arguments": {
	        0: {"type": b"^v"},  # block self
	        1: {"type": b"@"},  # results dictionary
	        2: {"type": b"@"},  # error
	    },
	}

	# Register metadata for the method so PyObjC knows how to handle the callbacks.
	# The argument keys 4 and 5 are 2 + 2 and 2 + 3: the two explicit block
	# arguments offset by 2 (presumably for the implicit self/_cmd slots -- confirm
	# against the PyObjC metadata documentation).
	objc.registerMetaDataForSelector(
	    b"PLMediaAnalysisServiceRequestAdapter",
	    b"requestVideoCaptionPreferenceForAssets:withOptions:progressHandler:completionHandler:",
	    {
	        "arguments": {
	            4: {"type": b"@?"},  # progressHandler (block type)
	            5: {"callable": _COMPLETION_HANDLER_SIGNATURE},  # completionHandler (block type)
	        }
	    },
	)


	def ns_to_py(obj):
	    """Recursively convert NSDictionary/NSArray to Python dict/list.

	    Args:
	        obj: An NSDictionary, an NSArray, or any other value.

	    Returns:
	        A dict (keys passed through ``str``) for an NSDictionary, a list for
	        an NSArray, and ``obj`` itself, unchanged, for anything else.
	    """
	    if isinstance(obj, NSDictionary):
	        return {str(k): ns_to_py(obj[k]) for k in obj}
	    elif isinstance(obj, NSArray):
	        return [ns_to_py(item) for item in obj]
	    else:
	        return obj  # Assume it's a scalar (str, int, float, etc.)


	def generate_caption_for_video_assets(assets) -> dict[str, dict]:
	    """
	    Generate AI-powered captions for the given video assets.

	    Submits the assets to PLMediaAnalysisServiceRequestAdapter and blocks the
	    calling thread until the completion handler has run.

	    Args:
	        assets: A list of Photos.PHAsset instances for videos to caption.

	    Returns:
	        A dictionary mapping each video's original filename to its caption
	        result (as dict of various values). Assets that share an original
	        filename overwrite one another. An empty ``assets`` yields an empty
	        dictionary without calling the service.

	    Raises:
	        RuntimeError: If the service reports an error to the completion handler.
	        Exception: Any exception raised while processing results inside the
	            completion handler is re-raised here, in the calling thread.
	    """
	    if not assets:
	        # Nothing to caption; skip the service round trip entirely.
	        return {}

	    captions = {}
	    service_error = None
	    handler_failure = None
	    done = threading.Event()

	    def _completion_handler(results, error):
	        """Completion handler for video caption generation."""
	        nonlocal service_error, handler_failure
	        try:
	            if error is not None:
	                # Fall back to the error object itself if it has no description,
	                # so a reported error is never mistaken for success.
	                service_error = str(error.localizedDescription() or error)
	                return

	            # A nil results object converts to None; treat it as "no captions".
	            converted = cast(dict, ns_to_py(results)) or {}
	            pl = photokit.PhotoLibrary()
	            for uuid, result in converted.items():
	                asset = pl.asset(uuid)
	                captions[asset.original_filename] = result
	        except Exception as exc:
	            # Callback boundary: hand the failure to the waiting thread instead
	            # of letting it escape into the code that invoked the block.
	            handler_failure = exc
	        finally:
	            # Always release the waiter, otherwise done.wait() blocks forever.
	            done.set()

	    PLMediaAnalysisServiceRequestAdapter.requestVideoCaptionPreferenceForAssets_withOptions_progressHandler_completionHandler_(
	        assets,  # assets array
	        None,  # options (nil)
	        None,  # progress handler (nil)
	        _completion_handler,  # completion handler function
	    )
	    done.wait()  # wait for completion, completion handler will be called when done

	    if handler_failure is not None:
	        raise handler_failure
	    if service_error is not None:
	        raise RuntimeError(service_error)
	    return captions


	def get_selected_video_assets() -> list[PHAsset]:
	    """
	    Returns a list of the video assets in the current photokit selection.

	    Returns:
	        list: The ``phasset`` of every selected item whose ``ismovie`` flag is
	              true, or an empty list when photokit reports no selection.
	    """
	    selection = photokit.PhotoLibrary().selection()
	    if not selection:
	        return []
	    return [asset.phasset for asset in selection if asset.ismovie]


	if __name__ == "__main__":
	    # Step 1: find out which videos are selected.
	    print("Getting selected video assets...", flush=True)
	    assets = get_selected_video_assets()
	    if not assets:
	        # Nothing to do: report it and exit with a non-zero status.
	        print("No videos selected in Photos.app.")
	        sys.exit(1)
	    print(
	        f"Found {len(assets)} selected video{'s' if len(assets) != 1 else ''} in Photos.app",
	        flush=True,
	    )

	    # Step 2: request the captions (blocks until done) and dump them as JSON.
	    print("Requesting caption (this might take a moment)...", flush=True)
	    results = generate_caption_for_video_assets(assets)
	    print(json.dumps(results, indent=4))