chris-x86-64 · August 31, 2015 00:51 · chris-x86-64 · May 15, 2015
diff --git a/ocr.py b/ocr.py
 import os
 import sys
 import json
 from optparse import OptionParser
 from PIL import Image
 from pyocr import pyocr
 from pyocr.builders import TextBuilder

 def get_opt():
 	"""Parse the command-line options for the OCR run.

 	Prints the usage text and exits with status 1 when the required
 	-d/--directory option is missing.

 	Returns:
 		The optparse values object carrying the ``directory``, ``lang``,
 		``args`` and ``output_file`` attributes.
 	"""
 	parser = OptionParser()
 	parser.add_option("-d", "--directory", action = "store", dest = "directory", help = "[Required] Target directory")
 	parser.add_option("-l", "--lang", action = "store", dest = "lang", help = "Language", default = "jpn")
 	parser.add_option("-c", "--config", action = "store", dest = "args", help = "Other config args (e.g. digits, only works with Tesseract)")
 	parser.add_option("-o", "--output", action = "store", dest = "output_file", help = "Output file (Not supported at this time.)")
 	# Positional arguments are accepted by optparse but not used by this script.
 	(options, _positional) = parser.parse_args()
 	if not options.directory:
 		parser.print_help()
 		sys.exit(1)
 	# Single-argument print(...) behaves the same on Python 2 and Python 3.
 	print("[DEBUG] Entering directory %s ..." % options.directory)
 	return options

 def select_tool():
 	"""Return the first OCR tool that pyocr reports as available.

 	Exits the process with status 1 when pyocr reports no tools.
 	"""
 	tools = pyocr.get_available_tools()
 	if not tools:
 		print("[EMERG] There are no OCR tools available.")
 		sys.exit(1)
 	chosen = tools[0]
 	# Single-argument print(...) behaves the same on Python 2 and Python 3.
 	print("[DEBUG] Using %s for OCR..." % (chosen.get_name()))
 	return chosen

 def process(matches, options):
 	"""Run OCR over every image path and print one JSON object per image.

 	Args:
 		matches: iterable of image file paths to recognise.
 		options: parsed options; ``lang`` is passed to the OCR tool and
 			``args`` (the -c/--config value, may be None) is used as the
 			builder's tesseract configuration.
 	"""
 	tool = select_tool()
 	builder = TextBuilder()
 	# Only override the builder's configuration when -c was actually given;
 	# assigning [options.args] unconditionally would store [None].
 	if options.args:
 		builder.tesseract_configs = [options.args]
 	for path in matches:
 		# Close each image once it has been recognised so file handles do
 		# not pile up while walking a large directory.
 		with Image.open(path) as image:
 			text = tool.image_to_string(image, lang = options.lang, builder = builder)
 		print(json.dumps({"filename": path, "text": text}, ensure_ascii=False))


 if __name__ == "__main__":
 	# Script entry point: collect every image below the target directory
 	# (recursively) and hand the list to process().
 	options = get_opt()
 	image_suffixes = ('.png', '.jpg', '.jpeg')
 	matches = []
 	for root, _dirnames, filenames in os.walk(options.directory):
 		for name in filenames:
 			# Compare lower-cased names so files such as SCAN.PNG or
 			# photo.JPG are not silently skipped.
 			if name.lower().endswith(image_suffixes):
 				matches.append(os.path.join(root, name))
 	# Single-argument print(...) behaves the same on Python 2 and Python 3.
 	print("[DEBUG] Processing %d images..." % len(matches))
 	process(matches, options)
	import os
	import sys
	import json
	from optparse import OptionParser
	from PIL import Image
	from pyocr import pyocr
	from pyocr.builders import TextBuilder

	def get_opt():
		"""Parse the command-line options for the OCR run.

		Prints the usage text and exits with status 1 when the required
		-d/--directory option is missing.

		Returns:
			The optparse values object carrying the ``directory``, ``lang``,
			``args`` and ``output_file`` attributes.
		"""
		parser = OptionParser()
		parser.add_option("-d", "--directory", action = "store", dest = "directory", help = "[Required] Target directory")
		parser.add_option("-l", "--lang", action = "store", dest = "lang", help = "Language", default = "jpn")
		parser.add_option("-c", "--config", action = "store", dest = "args", help = "Other config args (e.g. digits, only works with Tesseract)")
		parser.add_option("-o", "--output", action = "store", dest = "output_file", help = "Output file (Not supported at this time.)")
		# Positional arguments are accepted by optparse but not used by this script.
		(options, _positional) = parser.parse_args()
		if not options.directory:
			parser.print_help()
			sys.exit(1)
		# Single-argument print(...) behaves the same on Python 2 and Python 3.
		print("[DEBUG] Entering directory %s ..." % options.directory)
		return options

	def select_tool():
		"""Return the first OCR tool that pyocr reports as available.

		Exits the process with status 1 when pyocr reports no tools.
		"""
		tools = pyocr.get_available_tools()
		if not tools:
			print("[EMERG] There are no OCR tools available.")
			sys.exit(1)
		chosen = tools[0]
		# Single-argument print(...) behaves the same on Python 2 and Python 3.
		print("[DEBUG] Using %s for OCR..." % (chosen.get_name()))
		return chosen

	def process(matches, options):
		"""Run OCR over every image path and print one JSON object per image.

		Args:
			matches: iterable of image file paths to recognise.
			options: parsed options; ``lang`` is passed to the OCR tool and
				``args`` (the -c/--config value, may be None) is used as the
				builder's tesseract configuration.
		"""
		tool = select_tool()
		builder = TextBuilder()
		# Only override the builder's configuration when -c was actually given;
		# assigning [options.args] unconditionally would store [None].
		if options.args:
			builder.tesseract_configs = [options.args]
		for path in matches:
			# Close each image once it has been recognised so file handles do
			# not pile up while walking a large directory.
			with Image.open(path) as image:
				text = tool.image_to_string(image, lang = options.lang, builder = builder)
			print(json.dumps({"filename": path, "text": text}, ensure_ascii=False))


	if __name__ == "__main__":
		# Script entry point: collect every image below the target directory
		# (recursively) and hand the list to process().
		options = get_opt()
		image_suffixes = ('.png', '.jpg', '.jpeg')
		matches = []
		for root, _dirnames, filenames in os.walk(options.directory):
			for name in filenames:
				# Compare lower-cased names so files such as SCAN.PNG or
				# photo.JPG are not silently skipped.
				if name.lower().endswith(image_suffixes):
					matches.append(os.path.join(root, name))
		# Single-argument print(...) behaves the same on Python 2 and Python 3.
		print("[DEBUG] Processing %d images..." % len(matches))
		process(matches, options)