Last active February 21, 2025 08:09
Use macOS OCR engine from Python
import re
import subprocess
from pathlib import Path
def ocr_macos_live_text_v1(image_file: Path | str) -> str:
    """Recognize text in an image by running a Vision AppleScript via ``osascript``.

    The script creates a ``VNRecognizeTextRequest`` for the file, collects the
    top candidate string of every observation into a list and returns that
    list; whatever ``osascript`` prints for it is returned unchanged.

    Args:
        image_file: Path of the image to run text recognition on.

    Returns:
        The standard output of ``osascript``, decoded as text.

    Raises:
        subprocess.CalledProcessError: If ``osascript`` exits with a non-zero
            status.
        FileNotFoundError: If the ``osascript`` executable cannot be found.
    """
    # The path is embedded in an AppleScript string literal, so backslashes and
    # double quotes must be escaped; otherwise such a path would end the
    # literal early and corrupt (or inject into) the script.
    escaped_path = str(image_file).replace("\\", "\\\\").replace('"', '\\"')

    live_text_script = f"""
    use framework "Vision"

    set url to current application's NSURL's fileURLWithPath:"{escaped_path}"
    set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithURL:url options:(missing value)
    set request to current application's VNRecognizeTextRequest's alloc()'s init()
    requestHandler's performRequests:(current application's NSArray's arrayWithObject:request) |error|:(missing value)

    set fullText to {{}}
    repeat with observation in (request's results())
        copy ((first item in (observation's topCandidates:1))'s |string|() as text) to end of fullText
    end repeat
    return fullText
    """
    return subprocess.check_output(["osascript", "-e", live_text_script], text=True)
# One recognized text region, in the field order produced by
# ocr_macos_live_text_v2_with_bounding_boxes:
#   (text, x, y, width, height, confidence)
# where x/width are the observation's bounding-box values multiplied by the
# image width and y/height are multiplied by the image height.
Textbox = tuple[str, float, float, float, float, float]
def ocr_macos_live_text_v2_with_bounding_boxes(image_file: Path | str) -> list[Textbox]:
    """Recognize text in an image with the Vision framework through PyObjC.

    Unlike the ``osascript`` variant, this returns one entry per recognized
    text observation together with its bounding box and confidence.

    Args:
        image_file: Path of the image to run text recognition on.

    Returns:
        A list of ``(text, x, y, width, height, confidence)`` tuples. The box
        values are the observation's bounding box multiplied by the image
        width (x, width) and height (y, height); no axis flip is applied, so
        they keep whatever origin convention Vision reports.
        NOTE(review): Apple documents the box as normalized with a lower-left
        origin -- confirm before drawing on top-left-origin images.

    Raises:
        RuntimeError: If Vision reports a failure while performing the request.
    """
    # Imported lazily so the module can still be imported where the macOS
    # frameworks (PyObjC) and Pillow are not installed.
    import Cocoa
    import objc
    import Vision
    from PIL import Image

    def read_observation(o: Vision.VNRecognizedTextObservation, image_width: int, image_height: int) -> Textbox:
        # Read the values through the observation's accessors instead of
        # regex-parsing its string representation, which is not a stable format.
        box = o.boundingBox()
        text = o.topCandidates_(1)[0].string()
        return (
            text,
            box.origin.x * image_width,
            box.origin.y * image_height,
            box.size.width * image_width,
            box.size.height * image_height,
            float(o.confidence()),
        )

    # Only the pixel dimensions are needed; close the file handle right away.
    with Image.open(image_file) as image:
        width, height = image.size

    with objc.autorelease_pool():
        recognize_request = Vision.VNRecognizeTextRequest.alloc().init()
        # NSURL expects a string path, so convert pathlib.Path explicitly.
        image_url = Cocoa.NSURL.fileURLWithPath_(str(image_file))
        request_handler = Vision.VNImageRequestHandler.alloc().initWithURL_options_(image_url, None)
        success, error = request_handler.performRequests_error_([recognize_request], None)
        if error or not success:
            raise RuntimeError(f"Error in Live Text {error=}")
        # results() may be None when nothing was produced; treat that as empty.
        return [read_observation(o, width, height) for o in (recognize_request.results() or [])]

macOS Live Text has a very good quality/speed tradeoff.

Compared to Tesseract, it has much higher quality and is up to 3x as fast.

