macOS Live Text has a very good quality/speed tradeoff.
Compared to Tesseract, it has much higher quality and is up to 3x as fast.
import re
import subprocess
from pathlib import Path
def ocr_macos_live_text_v1(image_file: Path | str) -> str:
    """OCR *image_file* with macOS Live Text by driving the Vision framework
    from an inline AppleScript run through ``osascript``.

    Returns the raw stdout of ``osascript`` (the recognized lines, joined by
    AppleScript's list-to-text coercion, typically ", "-separated with a
    trailing newline).

    Raises subprocess.CalledProcessError if ``osascript`` exits non-zero
    (e.g. not on macOS, or the file cannot be read).
    """
    # Escape backslashes and double quotes so an arbitrary path cannot break
    # out of (or inject into) the AppleScript string literal below.
    escaped_path = str(image_file).replace("\\", "\\\\").replace('"', '\\"')
    live_text_script = f"""
    use framework "Vision"
    set url to current application's NSURL's fileURLWithPath:"{escaped_path}"
    set requestHandler to current application's VNImageRequestHandler's alloc()'s initWithURL:url options:(missing value)
    set request to current application's VNRecognizeTextRequest's alloc()'s init()
    requestHandler's performRequests:(current application's NSArray's arrayWithObject:request) |error|:(missing value)
    set fullText to {{}}
    repeat with observation in (request's results())
        copy ((first item in (observation's topCandidates:1))'s |string|() as text) to end of fullText
    end repeat
    return fullText
    """
    return subprocess.check_output(["osascript", "-e", live_text_script], text=True)
# A recognized text line: (text, x, y, width, height, confidence).
# Coordinates are in image pixels (Vision's normalized [0, 1] box scaled by the
# image size); NOTE(review): Vision boxes are presumably bottom-left-origin —
# confirm before mixing with top-left-origin pixel coordinates.
# Confidence is the observation's recognition confidence.
Textbox = tuple[str, float, float, float, float, float]
def ocr_macos_live_text_v2_with_bounding_boxes(
    image_file: Path | str,
    languages: tuple[str, ...] | list[str] = ("de-DE",),
) -> list[Textbox]:
    """OCR *image_file* with macOS Live Text via pyobjc, returning per-line
    bounding boxes.

    Args:
        image_file: Path to the image to recognize.
        languages: Recognition languages passed to Vision (BCP-47 tags).
            Defaults to German, matching the previous hard-coded behavior.

    Returns:
        One ``Textbox`` per recognized line, with the normalized Vision
        bounding box scaled to pixel coordinates.

    Raises:
        RuntimeError: if Vision reports an error performing the request.
    """
    # Imported lazily: these are macOS-only (pyobjc) / third-party deps.
    import Cocoa
    import objc
    import Vision
    from PIL import Image

    def read_observation(
        o: "Vision.VNRecognizedTextObservation", image_width: int, image_height: int
    ) -> Textbox:
        # Use the Vision API directly (boundingBox/confidence accessors)
        # instead of the previous fragile regex over the object's repr.
        box = o.boundingBox()  # CGRect in normalized [0, 1] coordinates
        text = o.topCandidates_(1)[0].string()
        return (
            text,
            box.origin.x * image_width,
            box.origin.y * image_height,
            box.size.width * image_width,
            box.size.height * image_height,
            o.confidence(),
        )

    width, height = Image.open(image_file).size
    with objc.autorelease_pool():
        recognize_request = Vision.VNRecognizeTextRequest.alloc().init()
        recognize_request.setAutomaticallyDetectsLanguage_(False)
        recognize_request.setRecognitionLanguages_(list(languages))
        recognize_request.setUsesLanguageCorrection_(False)
        # str(): pyobjc bridges str to NSString, but not pathlib.Path.
        request_handler = Vision.VNImageRequestHandler.alloc().initWithURL_options_(
            Cocoa.NSURL.fileURLWithPath_(str(image_file)), None
        )
        _, error = request_handler.performRequests_error_([recognize_request], None)
        if error:
            raise RuntimeError(f"Error in Live Text {error=}")
        return [read_observation(o, width, height) for o in recognize_request.results()]
macOS Live Text has a very good quality/speed tradeoff.
Compared to Tesseract, it has much higher quality and is up to 3x as fast.