achikin · August 30, 2016 22:29
diff --git a/Dockerfile b/Dockerfile
 # Installs doc2text plus the system packages it is used with here, then runs
 # /my/dtt.py as the container's default command.
 FROM ubuntu:16.04

 WORKDIR /my/

 # Refresh the package index and install in the SAME layer: a separately cached
 # "apt-get update" layer can go stale and make later installs fail or pull
 # outdated packages. The index is removed in this layer so it does not persist
 # in the image.
 RUN apt-get -qq -y update \
  && apt-get -qq -y install \
       libopencv-dev \
       python \
       python-opencv \
       python-pip \
       python-pythonmagick \
       tesseract-ocr \
  && rm -rf /var/lib/apt/lists/*

 # --no-cache-dir keeps pip's download cache out of the image layer.
 RUN pip install --no-cache-dir doc2text

 # Plain local files: COPY is the explicit instruction for this; ADD's extra
 # archive-extraction / URL behavior is not wanted here.
 COPY dtt.py image.png /my/

 CMD ["/usr/bin/python","/my/dtt.py"]
diff --git a/dtt.py b/dtt.py
 # Runs doc2text on /my/image.png and prints the extracted text to stdout.
 import doc2text

 # Initialize the class.
 doc = doc2text.Document()

 # Read the file in. Currently accepts pdf, png, jpg, bmp, tiff.
 # If reading a PDF, doc2text will split the PDF into its component pages.
 doc.read('/my/image.png')

 # Crop the pages down to estimated text regions, deskew, and optimize for OCR.
 doc.process()

 # Extract text from the pages.
 doc.extract_text()
 text = doc.get_text()

 # Call-style print: identical output to the Python 2 print statement for a
 # single argument, and also valid syntax on Python 3.
 print(text)
	# Installs doc2text plus the system packages it is used with here, then runs
	# /my/dtt.py as the container's default command.
	FROM ubuntu:16.04

	WORKDIR /my/

	# Refresh the package index and install in the SAME layer: a separately cached
	# "apt-get update" layer can go stale and make later installs fail or pull
	# outdated packages. The index is removed in this layer so it does not persist
	# in the image.
	RUN apt-get -qq -y update \
	 && apt-get -qq -y install \
	      libopencv-dev \
	      python \
	      python-opencv \
	      python-pip \
	      python-pythonmagick \
	      tesseract-ocr \
	 && rm -rf /var/lib/apt/lists/*

	# --no-cache-dir keeps pip's download cache out of the image layer.
	RUN pip install --no-cache-dir doc2text

	# Plain local files: COPY is the explicit instruction for this; ADD's extra
	# archive-extraction / URL behavior is not wanted here.
	COPY dtt.py image.png /my/

	CMD ["/usr/bin/python","/my/dtt.py"]
	# Runs doc2text on /my/image.png and prints the extracted text to stdout.
	import doc2text

	# Initialize the class.
	doc = doc2text.Document()

	# Read the file in. Currently accepts pdf, png, jpg, bmp, tiff.
	# If reading a PDF, doc2text will split the PDF into its component pages.
	doc.read('/my/image.png')

	# Crop the pages down to estimated text regions, deskew, and optimize for OCR.
	doc.process()

	# Extract text from the pages.
	doc.extract_text()
	text = doc.get_text()

	# Call-style print: identical output to the Python 2 print statement for a
	# single argument, and also valid syntax on Python 3.
	print(text)