macOS Live Text has a very good quality/speed tradeoff.
Compared to Tesseract, it has much higher quality and is up to 3x as fast.
import re
import subprocess
from pathlib import Path
def ocr_macos_live_text_v1(image_file: Path | str) -> str:
    """OCR an image with macOS Live Text by driving the Vision framework
    through an AppleScript run via ``osascript``.

    Args:
        image_file: Path to the image to recognize.

    Returns:
        ``osascript``'s stdout: the recognized text lines as an AppleScript
        list coerced to text (items joined by ", "), with a trailing newline.

    Raises:
        subprocess.CalledProcessError: if ``osascript`` exits non-zero
            (e.g. the file does not exist or Vision fails).

    Note: macOS only.
    """
    # Escape backslashes and double quotes so the path cannot terminate the
    # AppleScript string literal — an unescaped '"' in the filename would
    # otherwise break the script or inject arbitrary AppleScript.
    safe_path = str(image_file).replace("\\", "\\\\").replace('"', '\\"')
    live_text_script = f"""
    use framework "Vision"
    set url to current application's NSURL's fileURLWithPath:"{safe_path}"
    set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithURL:url options:(missing value)
    set request to current application's VNRecognizeTextRequest's alloc()'s init()
    requestHandler's performRequests:(current application's NSArray's arrayWithObject:request) |error|:(missing value)
    set fullText to {{}}
    repeat with observation in (request's results())
        copy ((first item in (observation's topCandidates:1))'s |string|() as text) to end of fullText
    end repeat
    return fullText
    """
    # Passing the script via -e (argument list, shell=False) avoids any
    # shell quoting issues; text=True decodes stdout as str.
    return subprocess.check_output(["osascript", "-e", live_text_script], text=True)
# One recognized text box: (text, x, y, width, height, confidence).
# Geometry is the Vision bounding box scaled to image pixels; confidence is
# Vision's float score. NOTE(review): Vision's normalized coordinates are
# presumably bottom-left-origin and are not y-flipped downstream — confirm.
Textbox = tuple[str, float, float, float, float, float]
def ocr_macos_live_text_v2_with_bounding_boxes(image_file: Path | str) -> list[Textbox]:
    """OCR an image with macOS Live Text via PyObjC, returning text boxes.

    Runs a Vision ``VNRecognizeTextRequest`` over *image_file* and returns one
    ``Textbox`` tuple ``(text, x, y, w, h, confidence)`` per observation, with
    the normalized bounding box scaled to pixel coordinates using the image
    size read via Pillow.

    Recognition is pinned to German ("de-DE") with automatic language
    detection and language correction disabled.

    Raises:
        RuntimeError: if Vision reports an error performing the request.

    Note: macOS only; requires pyobjc (Cocoa, Vision) and Pillow.
    """
    # Imported inside the function so the module stays importable on
    # platforms where pyobjc/Vision are unavailable.
    import Cocoa
    import objc
    import Vision
    from PIL import Image
    # Vision's observation repr embeds the confidence and the normalized
    # bounding box as "boundingBox=[x, y, w, h]"; this pattern extracts both
    # from str(o) instead of using the accessor APIs.
    read_observation_pattern = re.compile(r"<VNRecognizedTextObservation.+confidence=(.+?) boundingBox=\[(.+?)]")
    def read_observation(o: Vision.VNRecognizedTextObservation, image_width: int, image_height: int) -> Textbox:
        # Lazy hack, we should use proper APIs
        match = read_observation_pattern.fullmatch(str(o))
        assert match  # NOTE(review): repr format is undocumented; this breaks silently under -O if it changes.
        # Normalized [0, 1] box components from the repr.
        x, y, w, h = map(float, match.group(2).split(","))
        confidence = float(match.group(1))
        # Best (top-1) candidate string for this observation.
        text = o.topCandidates_(1)[0].string()
        # Scale normalized coordinates to pixels. NOTE(review): no y-flip is
        # applied, so y keeps Vision's coordinate origin — confirm callers expect that.
        return (text, x * image_width, y * image_height, w * image_width, h * image_height, confidence)
    # Pixel dimensions used to scale the normalized boxes.
    width, height = Image.open(image_file).size
    # Autorelease pool bounds the lifetime of the Objective-C objects below.
    with objc.autorelease_pool():
        recognize_request = Vision.VNRecognizeTextRequest.alloc().init()
        recognize_request.setAutomaticallyDetectsLanguage_(False)
        recognize_request.setRecognitionLanguages_(["de-DE"])
        recognize_request.setUsesLanguageCorrection_(False)
        request_handler = Vision.VNImageRequestHandler.alloc().initWithURL_options_(Cocoa.NSURL.fileURLWithPath_(image_file), None)
        # performRequests:error: returns (success, error); only error is checked.
        _, error = request_handler.performRequests_error_([recognize_request], None)
        if error:
            raise RuntimeError(f"Error in Live Text {error=}")
        return [read_observation(o, width, height) for o in recognize_request.results()]
macOS Live Text has a very good quality/speed tradeoff.
Compared to Tesseract, it has much higher quality and is up to 3x as fast.