Created
April 18, 2019 12:36
-
-
Save onyekaa/ce5c97e6eaa391c80b378324a38b5561 to your computer and use it in GitHub Desktop.
OCR Experiment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import the necessary packages | |
from PIL import Image | |
import pytesseract | |
import argparse | |
import cv2 | |
import os | |
import csv | |
# Command-line interface: the image to OCR is mandatory, the preprocessing
# mode is optional and defaults to Otsu thresholding ("thresh").
ap = argparse.ArgumentParser()
ap.add_argument(
    "-i",
    "--image",
    required=True,
    help="path to input image to be OCR'd",
)
ap.add_argument(
    "-p",
    "--preprocess",
    type=str,
    default="thresh",
    help="type of preprocessing to be done",
)
# Expose the parsed options as a plain dict keyed by option name.
args = vars(ap.parse_args())
# Load the input image, normalise its size, cut out the table region and
# convert that region to grayscale for OCR.
crop = cv2.imread(args["image"])
if crop is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    raise SystemExit("could not read image: {}".format(args["image"]))
fname = os.path.splitext(args['image'])[0]
crname = '{}-cropped.png'.format(fname)
# Resize first so the fixed crop window below covers the same region of the
# page regardless of the resolution of the input image.
img = cv2.resize(crop, (2200, 1700))
# Crop the *resized* image (rows 641-1348, columns from 40). The upper column
# bound is larger than the resized width, so slicing simply stops at the
# right edge of the image.
cropimg = img[641:1349, 40:2224]
# Keep a copy of the cropped region on disk next to the input image.
cv2.imwrite(crname, cropimg)
# Continue with the in-memory crop; PNG is lossless, so re-reading the file
# just written would yield the same pixels.
image = cropimg
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Optional clean-up of the grayscale image before OCR, selected with the
# --preprocess option. Any value other than "thresh" or "blur" leaves the
# image unchanged.
preprocess = args["preprocess"]
if preprocess == "thresh":
    # Binary threshold with the level chosen by Otsu's method; the second
    # element of the returned pair is the thresholded image.
    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
elif preprocess == "blur":
    # Median blur with a 3x3 kernel to remove noise.
    gray = cv2.medianBlur(gray, 3)
# Write the preprocessed image to a temporary file (named after the process
# id, in the current directory), run OCR on it, and always delete the file
# afterwards, even when OCR fails.
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
try:
    # The context manager closes the PIL file handle before the file is
    # removed. '--psm 6' asks Tesseract to treat the image as a single
    # uniform block of text.
    with Image.open(filename) as ocr_input:
        text = pytesseract.image_to_string(ocr_input, config='--psm 6')
finally:
    # The file may be missing if the write above failed; do not let the
    # clean-up hide the original error in that case.
    if os.path.exists(filename):
        os.remove(filename)
print(text)
# Studio names that mark the end of the title column. Multi-word names are
# matched against consecutive tokens because rows are split on single spaces.
_STUDIOS = ('INDP', 'BLUE PICS', 'FILMONE', 'SILVERBIRD', 'CRIMSON')
# OCR lines with fewer space-separated tokens than this are considered broken
# and are skipped.
_MIN_COLUMNS = 8


def _known_studio_width(columns, start):
    """Return how many tokens of a known studio begin at *start*, else 0."""
    for studio in _STUDIOS:
        words = studio.split(' ')
        if columns[start:start + len(words)] == words:
            return len(words)
    return 0


def _build_row(count, columns):
    """Turn the tokens of one OCR line into a CSV row.

    The first token (the row number as read by OCR) is replaced by *count*.
    Every following token up to the studio is joined into the title; the
    studio is the first token that is a known studio name or is written in
    upper case. All tokens after the studio are kept as individual columns.
    When no studio is found, the row holds only *count* and the title.
    """
    title_words = []
    for i, c in enumerate(columns[1:], start=1):
        width = _known_studio_width(columns, i)
        if width or c.isupper():
            width = width or 1
            studio = ' '.join(columns[i:i + width])
            return [count, ' '.join(title_words), studio] + columns[i + width:]
        title_words.append(c)
    return [count, ' '.join(title_words)]


# Export the OCR result as '<input name>.csv', one CSV row per usable line.
csvname = '{}.csv'.format(fname)
# newline='' lets the csv module control line endings itself.
with open(csvname, mode='w', newline='') as b_file:
    # No escapechar is configured: every field is a whitespace-stripped token
    # of a single line, so minimal quoting is sufficient.
    bw = csv.writer(b_file, delimiter=',', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
    bw.writerow(['no', 'title', 'studio', 'dis', 'weeks', 'locs', 'weekend revenue', 'rev_change', 'avg_loc', 'admissions', 'adm_change', 'flash_rev', 'flash_adm'])
    count = 0
    # The data is laid out one record per line, so split on '\n'. Splitting
    # (rather than scanning character by character) also handles a last line
    # without a trailing newline and keeps skipped lines from leaking into
    # the following record.
    for line in text.split('\n'):
        columns = [c.strip() for c in line.strip(' ').split(' ')]
        # Some rows are broken; skip them. Only rows that are kept get a
        # number, so the 'no' column stays consecutive.
        if len(columns) < _MIN_COLUMNS:
            continue
        count += 1
        bw.writerow(_build_row(count, columns))
# Debugging aid: uncomment the lines below to show the cropped input image
# and the preprocessed image in OpenCV windows.
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
# cv2.waitKey(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment