vehrka · August 29, 2015 13:59
diff --git a/gistfile1.py b/gistfile1.py
 #!/usr/bin/python

 from PIL import Image
 import sys

 import pyocr
 import pyocr.builders


 def ocrimage(imagename, threshold=127, lang='spa', tlayout=3):
     """Run OCR on an image and append the recognised text to a file.

     The image is converted to greyscale, binarised with ``threshold`` and
     handed to the first OCR tool reported by pyocr.  The resulting text is
     appended, UTF-8 encoded, to ``<imagename>.txt``.

     Assumptions:

     * Tesseract is the OCR tool in use (not tested against Cuneiform).
     * The image is in good shape; no clean-up beyond thresholding is done,
       so a bad image gets a bad result.

     Parameters:

     * imagename --> path and name of the image (mandatory).
     * threshold --> grey level cut-off; pixels above it become 255 and
       all others become 0.
     * lang --> OCR language; must be supported by Tesseract and installed.
     * tlayout --> passed to the builder as ``tesseract_layout``; see the
       Tesseract manual on ``-psm``.

     Returns True when the text was appended to the output file.  Returns
     False when the image cannot be opened or decoded, when pyocr reports
     no available OCR tool, or when the output file cannot be written.
     """
     try:
         # Pillow opens images lazily, so decoding problems only show up
         # once the pixel data is converted; keep both steps guarded.
         imageobj = Image.open(imagename).convert("L")
     except Exception:
         # Best effort: any failure to load the image is reported as False,
         # but KeyboardInterrupt / SystemExit are no longer swallowed.
         return False

     imageobj = imageobj.point(lambda p: 255 if p > threshold else 0)

     tools = pyocr.get_available_tools()
     if not tools:
         # No OCR engine installed: report failure instead of IndexError.
         return False

     builder = pyocr.builders.TextBuilder(tesseract_layout=tlayout)
     txt = tools[0].image_to_string(imageobj, lang=lang, builder=builder)

     try:
         # Append mode creates the file when it does not exist yet.  The
         # file is opened in binary mode because the text is written as
         # UTF-8 bytes; writing bytes to a text-mode file raises TypeError
         # on Python 3.
         with open("{}.txt".format(imagename), "ab") as logfile:
             logfile.write(txt.encode('utf8'))
     except IOError:
         return False
     return True
	#!/usr/bin/python

	from PIL import Image
	import sys

	import pyocr
	import pyocr.builders


	def ocrimage(imagename, threshold=127, lang='spa', tlayout=3):
	    """Run OCR on an image and append the recognised text to a file.

	    The image is converted to greyscale, binarised with ``threshold`` and
	    handed to the first OCR tool reported by pyocr.  The resulting text is
	    appended, UTF-8 encoded, to ``<imagename>.txt``.

	    Assumptions:

	    * Tesseract is the OCR tool in use (not tested against Cuneiform).
	    * The image is in good shape; no clean-up beyond thresholding is done,
	      so a bad image gets a bad result.

	    Parameters:

	    * imagename --> path and name of the image (mandatory).
	    * threshold --> grey level cut-off; pixels above it become 255 and
	      all others become 0.
	    * lang --> OCR language; must be supported by Tesseract and installed.
	    * tlayout --> passed to the builder as ``tesseract_layout``; see the
	      Tesseract manual on ``-psm``.

	    Returns True when the text was appended to the output file.  Returns
	    False when the image cannot be opened or decoded, when pyocr reports
	    no available OCR tool, or when the output file cannot be written.
	    """
	    try:
	        # Pillow opens images lazily, so decoding problems only show up
	        # once the pixel data is converted; keep both steps guarded.
	        imageobj = Image.open(imagename).convert("L")
	    except Exception:
	        # Best effort: any failure to load the image is reported as False,
	        # but KeyboardInterrupt / SystemExit are no longer swallowed.
	        return False

	    imageobj = imageobj.point(lambda p: 255 if p > threshold else 0)

	    tools = pyocr.get_available_tools()
	    if not tools:
	        # No OCR engine installed: report failure instead of IndexError.
	        return False

	    builder = pyocr.builders.TextBuilder(tesseract_layout=tlayout)
	    txt = tools[0].image_to_string(imageobj, lang=lang, builder=builder)

	    try:
	        # Append mode creates the file when it does not exist yet.  The
	        # file is opened in binary mode because the text is written as
	        # UTF-8 bytes; writing bytes to a text-mode file raises TypeError
	        # on Python 3.
	        with open("{}.txt".format(imagename), "ab") as logfile:
	            logfile.write(txt.encode('utf8'))
	    except IOError:
	        return False
	    return True