Save RhetTbull/1c34fc07c95733642cffcd1ac587fc4c to your computer and use it in GitHub Desktop.
""" Use Apple's Vision Framework via PyObjC to detect text in images | |
To use: | |
python3 -m pip install pyobjc-core pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer | |
""" | |
import pathlib | |
import Quartz | |
import Vision | |
from Cocoa import NSURL | |
from Foundation import NSDictionary | |
# needed to capture system-level stderr | |
from wurlitzer import pipes | |
def image_to_text(img_path, lang="eng"):
    """Run Vision text recognition on the image file at ``img_path``.

    Args:
        img_path: Filesystem path of the image; it is turned into a file URL
            with ``NSURL.fileURLWithPath_``.
        lang: Accepted by the signature but not used by this function.

    Returns:
        The list populated by the completion handler created with
        ``make_request_handler``.
    """
    image_url = NSURL.fileURLWithPath_(img_path)

    # Loading the CIImage makes the system write log lines to stderr, e.g.:
    #   2020-09-20 20:55:25.538 python[73042:5650492] Creating client/daemon connection: B8FE995E-3F27-47F4-9FA8-559C615FD774
    #   2020-09-20 20:55:25.652 python[73042:5650492] Got the query meta data reply for: com.apple.MobileAsset.RawCamera.Camera, response: 0
    # Wrapping the call in pipes() captures that system-level stdout/stderr.
    with pipes() as (_captured_out, _captured_err):
        ci_image = Quartz.CIImage.imageWithContentsOfURL_(image_url)

    handler_options = NSDictionary.dictionaryWithDictionary_({})
    request_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        ci_image, handler_options
    )

    recognized = []
    on_complete = make_request_handler(recognized)
    text_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(
        on_complete
    )
    # The value returned by performRequests_error_ is not inspected; the
    # recognized text arrives through the completion handler.
    request_handler.performRequests_error_([text_request], None)
    return recognized
def make_request_handler(results):
    """Build a completion handler that collects recognized text into ``results``.

    Args:
        results: List that the returned handler appends ``[text, confidence]``
            pairs to, one pair per observation that has a candidate.

    Returns:
        A callable ``handler(request, error)``.

    Raises:
        ValueError: If ``results`` is not a list.
    """
    if not isinstance(results, list):
        raise ValueError("results must be a list")

    def handler(request, error):
        """Report ``error`` if set; otherwise store each observation's top candidate."""
        if error:
            print(f"Error! {error}")
            return
        # Guard against a request that yields no results object at all
        # (None) so the loop below never iterates over None.
        for text_observation in request.results() or []:
            candidates = text_observation.topCandidates_(1)
            if not candidates:
                # No candidate was returned for this observation; skip it
                # instead of failing on candidates[0].
                continue
            best = candidates[0]
            results.append([best.string(), best.confidence()])

    return handler
def main():
    """Command-line entry point: recognize text in the image given as the first argument.

    Exits with a message when no argument is supplied or when the argument
    is not an existing file; otherwise prints the list returned by
    ``image_to_text``.
    """
    import pathlib
    import sys

    # Without this check a missing argument surfaces as a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("Usage: pass the path to an image file as the first argument")

    img_path = pathlib.Path(sys.argv[1])
    if not img_path.is_file():
        sys.exit("Invalid image path")

    # image_to_text receives the resolved path as a plain string.
    results = image_to_text(str(img_path.resolve()))
    print(results)


if __name__ == "__main__":
    main()
@okpatil4u It's possible, but I've not written the Python code. Take a look here to see the sample code on getting the bounding rectangle.
Also, for a more robust implementation of this example, see here
Thanks @RhetTbull !
I will check.
Hi, nice work! I was wondering is there a way to detect other languages plus english?
Hi, nice work! I was wondering is there a way to detect other languages plus english?
Yes. See the implementation of this in my textinator app which shows how to get the list of supported languages and set the language.
Thank you @RhetTbull! I will check it out.
I recently found this and found it quite useful.
I was planning on OCRing about 10000 PDFs with Apple's API. Your code works well. However, I'm a bit stuck on how to multithread/parallelize it; concurrent.futures does not seem to work. Is there any suggestion you would make for this?
@psungho I'm not sure how well the pyobjc stuff works with python's threads. I would try multiprocessing (spawn multiple separate python processes each running the vision framework).
It doesn't really seem to be friendly. I keep getting things like "Object ID x,0 ref repaired", where x is a number.
I guess in theory you could use NSThreads instead? @RhetTbull
Not sure how much of a performance improvement it will bring. Relatively a new obj-c coder (in fact learning it for a project I have). What I want to do is OCR a bunch of pdfs concurrently -- maybe there is some alternate solution?
I had to run pip install pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer
to make it work on m2 mac.
I'm not familiar with Objective-C. How can I load an image from a numpy array instead of from an image file on disk?
@bert9946 I'm not very familiar with numpy but this might work:
"""Create a CIImage from a numpy array"""
import io
import sys
import numpy as np
from AppKit import NSBitmapImageRep, NSImage
from Foundation import NSData
from PIL import Image
from Quartz import CIImage
def createNSImageFromNumpyArray(numpy_array):
    """Convert a numpy image array into an ``NSImage``.

    The array is turned into a Pillow image, encoded in memory, decoded into
    an ``NSBitmapImageRep``, and that representation is attached to a new
    ``NSImage`` sized to the bitmap's pixel dimensions.

    Args:
        numpy_array: Image data accepted by ``PIL.Image.fromarray``.

    Returns:
        An ``NSImage`` holding the bitmap representation of the array.
    """
    buffer = io.BytesIO()
    # PNG is lossless and can store alpha channels; JPEG would alter pixel
    # values and cannot be written for RGBA arrays.
    Image.fromarray(numpy_array).save(buffer, "PNG")
    encoded = buffer.getvalue()
    nsdata = NSData.dataWithBytes_length_(encoded, len(encoded))
    rep = NSBitmapImageRep.imageRepWithData_(nsdata)
    nsimage = NSImage.alloc().initWithSize_((rep.pixelsWide(), rep.pixelsHigh()))
    # Attach the decoded bitmap: initWithSize_ alone only sets the size, so
    # without this the returned image would carry no pixel data.
    nsimage.addRepresentation_(rep)
    return nsimage
def convertNSImageToCIImage(nsimage):
    """Return a ``CIImage`` built from the TIFF representation of ``nsimage``.

    The image's TIFF data is loaded into an ``NSBitmapImageRep``, which is
    then used to initialize the ``CIImage``.
    """
    tiff_data = nsimage.TIFFRepresentation()
    bitmap_rep = NSBitmapImageRep.alloc().initWithData_(tiff_data)
    return CIImage.alloc().initWithBitmapImageRep_(bitmap_rep)
if __name__ == "__main__":
    # Demo: load the image named on the command line, convert it to a numpy
    # array, then to an NSImage and finally to a CIImage.
    if len(sys.argv) < 2:
        sys.exit("Usage: pass the path to an image file as the first argument")
    filepath = sys.argv[1]
    # The context manager closes the underlying file once the pixel data
    # has been copied into the numpy array.
    with Image.open(filepath) as pil_img:
        print(pil_img.format, pil_img.size, pil_img.mode)
        np_img = np.asarray(pil_img)
    nsi = createNSImageFromNumpyArray(np_img)
    cii = convertNSImageToCIImage(nsi)
@RhetTbull It works! This is very helpful. Thanks a lot.
Can this be used in Linux systems or is it Mac specific?
@emilanovix this uses Apple macOS APIs thus it is macOS only. There are plenty of OCR packages that will run on Linux but this is specific to macOS.
Here's a cleaned-up version that includes bounding boxes https://gist.github.com/jonashaag/95e8b75ed44cc5b93cbc5d4599e3803a
Hello, thanks for this code. Is there a way to get the bounding boxes?