@jonashaag
Last active February 21, 2025 08:09
Use macOS OCR engine from Python
import re
import subprocess
from pathlib import Path


def ocr_macos_live_text_v1(image_file: Path | str) -> str:
    # Drive the Vision framework through AppleScriptObjC via osascript.
    # No Python Objective-C bridge needed, but the result comes back as one
    # comma-separated string on stdout rather than as structured data.
    live_text_script = f"""
use framework "Vision"
set url to current application's NSURL's fileURLWithPath:"{image_file}"
set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithURL:url options:(missing value)
set request to current application's VNRecognizeTextRequest's alloc()'s init()
requestHandler's performRequests:(current application's NSArray's arrayWithObject:request) |error|:(missing value)
set fullText to {{}}
repeat with observation in (request's results())
    copy ((first item in (observation's topCandidates:1))'s |string|() as text) to end of fullText
end repeat
return fullText
"""
    return subprocess.check_output(["osascript", "-e", live_text_script], text=True)
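

# Minimal usage sketch (not part of the original gist): "example.png" is a
# placeholder path. osascript prints the returned AppleScript list as a
# single ", "-joined line, so the result is one string, not a Python list.
def _demo_v1() -> None:
    text = ocr_macos_live_text_v1("example.png")
    print(text.strip())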
# (text, x, y, width, height, confidence); coordinates in pixels
Textbox = tuple[str, float, float, float, float, float]
def ocr_macos_live_text_v2_with_bounding_boxes(image_file: Path | str) -> list[Textbox]:
    # Call the Vision framework directly through PyObjC; unlike v1 this
    # also returns a per-line bounding box and confidence.
    import Cocoa
    import objc
    import Vision
    from PIL import Image

    read_observation_pattern = re.compile(
        r"<VNRecognizedTextObservation.+confidence=(.+?) boundingBox=\[(.+?)]"
    )

    def read_observation(o: Vision.VNRecognizedTextObservation, image_width: int, image_height: int) -> Textbox:
        # Lazy hack: parse confidence and bounding box out of the repr().
        # The proper APIs would be o.confidence() and o.boundingBox().
        match = read_observation_pattern.fullmatch(str(o))
        assert match
        x, y, w, h = map(float, match.group(2).split(","))
        confidence = float(match.group(1))
        text = o.topCandidates_(1)[0].string()
        # Vision coordinates are normalized to [0, 1] with the origin at the
        # bottom left; scale to pixels here (y stays bottom-left based).
        return (text, x * image_width, y * image_height, w * image_width, h * image_height, confidence)

    width, height = Image.open(image_file).size
    with objc.autorelease_pool():
        recognize_request = Vision.VNRecognizeTextRequest.alloc().init()
        recognize_request.setAutomaticallyDetectsLanguage_(False)
        recognize_request.setRecognitionLanguages_(["de-DE"])
        recognize_request.setUsesLanguageCorrection_(False)
        # str() so that pathlib.Path inputs are bridged to NSString correctly.
        request_handler = Vision.VNImageRequestHandler.alloc().initWithURL_options_(
            Cocoa.NSURL.fileURLWithPath_(str(image_file)), None
        )
        _, error = request_handler.performRequests_error_([recognize_request], None)
        if error:
            raise RuntimeError(f"Error in Live Text {error=}")
        return [read_observation(o, width, height) for o in recognize_request.results()]
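

# Minimal usage sketch (not part of the original gist): "example.png" is a
# placeholder path. Because Vision's y axis points up, flip it as shown here
# if you want top-left-origin pixel coordinates.
def _demo_v2() -> None:
    from PIL import Image

    image_file = "example.png"
    _, image_height = Image.open(image_file).size
    for text, x, y, w, h, confidence in ocr_macos_live_text_v2_with_bounding_boxes(image_file):
        top = image_height - (y + h)  # bottom-left origin -> top-left origin
        print(f"{confidence:.2f} ({x:.0f}, {top:.0f}, {w:.0f}x{h:.0f}): {text}")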

macOS Live Text has a very good quality/speed tradeoff.

Compared to Tesseract, it produces much higher-quality results and is up to 3x as fast.
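
A rough way to check the speed claim on your own images (a sketch, not part of the original gist; it assumes pytesseract plus the German Tesseract traineddata are installed, and "example.png" is a placeholder path):

    import time

    import pytesseract
    from PIL import Image

    image_file = "example.png"

    start = time.perf_counter()
    ocr_macos_live_text_v2_with_bounding_boxes(image_file)
    live_text_seconds = time.perf_counter() - start

    start = time.perf_counter()
    pytesseract.image_to_string(Image.open(image_file), lang="deu")
    tesseract_seconds = time.perf_counter() - start

    print(f"Live Text: {live_text_seconds:.2f}s, Tesseract: {tesseract_seconds:.2f}s")

Timings vary by image size and hardware, so treat the 3x figure as a rule of thumb, not a guarantee.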
