Last active February 21, 2025 08:09
Use Apple's Vision framework from Python to detect text in images
""" Use Apple's Vision Framework via PyObjC to detect text in images

To use:

python3 -m pip install pyobjc-core pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer
"""
import pathlib
import Quartz
import Vision
from Cocoa import NSURL
from Foundation import NSDictionary
# needed to capture system-level stderr
from wurlitzer import pipes
def image_to_text(img_path, lang="eng"):
    """Recognize text in an image file using Apple's Vision framework.

    Args:
        img_path: File-system path of the image, as a string.
        lang: Currently unused; kept so existing callers keep working.

    Returns:
        The list populated by the completion handler built by
        ``make_request_handler``: one ``[text, confidence]`` entry per
        text observation.
    """
    input_url = NSURL.fileURLWithPath_(img_path)

    # Capture stdout and stderr from system calls; otherwise
    # Quartz.CIImage.imageWithContentsOfURL_ prints to stderr something like:
    # 2020-09-20 20:55:25.538 python[73042:5650492] Creating client/daemon connection: B8FE995E-3F27-47F4-9FA8-559C615FD774
    # 2020-09-20 20:55:25.652 python[73042:5650492] Got the query meta data reply for:, response: 0
    with pipes() as (out, err):
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)

    vision_options = NSDictionary.dictionaryWithDictionary_({})
    vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        input_image, vision_options
    )

    results = []
    handler = make_request_handler(results)
    vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(handler)

    # PyObjC returns (success, error) for methods with an NSError** out
    # parameter; report a failure instead of silently discarding it.
    success, error = vision_handler.performRequests_error_([vision_request], None)
    if not success:
        print(f"Error! {error}")

    return results
def make_request_handler(results):
    """Build a completion handler that appends recognized text to *results*.

    Args:
        results: List that the returned handler appends
            ``[text, confidence]`` entries to.

    Returns:
        A ``handler(request, error)`` callable. When *error* is truthy it
        prints the error and adds nothing; otherwise it appends the top
        candidate of every observation in ``request.results()``.

    Raises:
        ValueError: If *results* is not a list.
    """
    if not isinstance(results, list):
        raise ValueError("results must be a list")

    def handler(request, error):
        if error:
            print(f"Error! {error}")
            # Do not read observations from a request that reported an error.
            return

        # Treat a missing results value the same as "no observations".
        for text_observation in request.results() or []:
            candidates = text_observation.topCandidates_(1)
            if not candidates:
                # No candidate for this observation: nothing to record.
                continue
            recognized_text = candidates[0]
            results.append([recognized_text.string(), recognized_text.confidence()])

    return handler
def main():
    """Command-line entry point: print the text found in an image.

    Reads the image path from ``sys.argv[1]``, exits with a message when
    the argument is missing or is not an existing file, and otherwise
    prints the list returned by ``image_to_text``.
    """
    import pathlib
    import sys

    if len(sys.argv) < 2:
        sys.exit("Usage: <script> <image_path>")

    img_path = pathlib.Path(sys.argv[1])
    if not img_path.is_file():
        sys.exit("Invalid image path")

    results = image_to_text(str(img_path.resolve()))
    # Show the recognized text; otherwise the result is computed and dropped.
    print(results)


if __name__ == "__main__":
    main()
@bert9946 I'm not very familiar with numpy but this might work:

"""Create a CIImage from a numpy array"""

import io
import sys

import numpy as np
from AppKit import NSBitmapImageRep, NSImage
from Foundation import NSData
from PIL import Image
from Quartz import CIImage

def createNSImageFromNumpyArray(numpy_array):
    """Create an ``NSImage`` from a numpy array.

    The array is converted to a PIL image, encoded as JPEG in memory, and
    the encoded bytes are wrapped in an ``NSBitmapImageRep``.

    Args:
        numpy_array: Array accepted by ``PIL.Image.fromarray``.

    Returns:
        An ``NSImage`` sized to the bitmap's pixel dimensions, with the
        bitmap attached as its representation.
    """
    image = Image.fromarray(numpy_array)
    buffer = io.BytesIO()
    image.save(buffer, "JPEG")

    # Fetch the encoded bytes once instead of calling getvalue() twice.
    jpeg_bytes = buffer.getvalue()
    nsdata = NSData.dataWithBytes_length_(jpeg_bytes, len(jpeg_bytes))
    rep = NSBitmapImageRep.imageRepWithData_(nsdata)

    nsimage = NSImage.alloc().initWithSize_((rep.pixelsWide(), rep.pixelsHigh()))
    # Attach the bitmap; otherwise the NSImage only carries a size and
    # `rep` is never used.
    nsimage.addRepresentation_(rep)
    return nsimage

def convertNSImageToCIImage(nsimage):
    """Convert an ``NSImage`` to a ``CIImage``.

    The image's TIFF representation is loaded into an ``NSBitmapImageRep``,
    which is then used to initialize the ``CIImage``.

    Args:
        nsimage: Object providing ``TIFFRepresentation()``.

    Returns:
        The result of ``CIImage.alloc().initWithBitmapImageRep_`` for the
        bitmap built from the TIFF data.
    """
    tiff_data = nsimage.TIFFRepresentation()
    bitmap_rep = NSBitmapImageRep.alloc().initWithData_(tiff_data)
    return CIImage.alloc().initWithBitmapImageRep_(bitmap_rep)

if __name__ == "__main__":
    # Demo: open the image named on the command line, convert it to a numpy
    # array, then to an NSImage and finally to a CIImage.
    filepath = sys.argv[1]
    pil_img = Image.open(filepath)
    print(pil_img.format, pil_img.size, pil_img.mode)
    np_img = np.asarray(pil_img)
    nsi = createNSImageFromNumpyArray(np_img)
    cii = convertNSImageToCIImage(nsi)

@RhetTbull It works! This is very helpful. Thanks a lot.

Can this be used in Linux systems or is it Mac specific?

@emilanovix This uses Apple macOS APIs, so it is macOS-only. There are plenty of OCR packages that will run on Linux, but this one is specific to macOS.

Here's a cleaned-up version that includes bounding boxes

