-
-
Save akash-ch2812/d42acf86e4d6562819cf4cd37d1195e7 to your computer and use it in GitHub Desktop.
# Install OpenCV (the cv2 module) with:
#   pip install opencv-python
# Install PIL (via the Pillow package) with:
#   pip install Pillow
import cv2 | |
from PIL import Image | |
def mark_region(image_path):
    """Outline candidate text regions on an image and collect their boxes.

    The image is converted to grayscale, blurred, adaptively thresholded and
    dilated so that adjacent text merges into larger blobs; the external
    contours of those blobs are then filtered by position and area.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        A tuple ``(image, line_items_coordinates)``. ``image`` is the loaded
        image with a rectangle drawn for every selected region (the
        unmodified image when nothing is selected). ``line_items_coordinates``
        is a list of ``[(x, y), (2200, y + h)]`` corner pairs, one per
        rectangle drawn.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load ``image_path``.
    """
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # The result is either a 2-tuple or a 3-tuple; pick the contour list.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Bind the result up front so the return below is valid even when no
    # contour passes the filters.
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, w, h = cv2.boundingRect(c)

        # NOTE(review): the position/area thresholds and the fixed right edge
        # (2200) are hard-coded for one page layout -- adjust for other scans.
        if y >= 600 and x <= 1000:
            if area > 10000:
                image = cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
                line_items_coordinates.append([(x, y), (2200, y + h)])

        if y >= 2400 and x <= 2000:
            image = cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
            line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates
Dear @akash-ch2812 or @jlumbroso,
I'm having some trouble using this code on some PDFs I'm working on, and I thought either of you might have some ideas about what's going wrong.
I have a single PDF called, "DNR_WFH.pdf". I was able to use the first section of code below to split the PDF into separate pages and name them accordingly.
from pdf2image import convert_from_path

# PDF to split into one image per page.
pdfs = r"C:\Users\mhiebing\Desktop\DNR_WFH.pdf"
# Second positional argument is the rendering resolution (DPI).
pages = convert_from_path(pdfs, 350)

# Number the output files from 1 so they line up with the PDF page numbers.
for i, page in enumerate(pages, start=1):
    image_name = f"Page_{i}.jpg"
    page.save(image_name, "JPEG")
For the second section, I'm only looking at the first JPEG to keep things simple. When I call the mark_region(image_path)
function nothing happens. Is there supposed to be a window where I outline the boxes we want to extract text from? Here's what I have for the second section:
import cv2
import matplotlib.pyplot as plt
# NOTE(review): as posted, the five driver statements (FILENAME ... plt.savefig)
# come directly after the `def` line, ahead of the function's real body, and the
# final bare `mark_region` only names the function without calling it -- so
# nothing runs. The driver statements belong at module level, after the
# function definition.
def mark_region(image_path):
#define the mark_region method
FILENAME = r"C:\Users\mhiebing\Documents\GitHub_Repos\MonthlyStatsExtract\Page_1.jpg" # <--- change this to be the file you want
image, line_items_coordinates = mark_region(FILENAME)
plt.figure(figsize=(20,20))
plt.imshow(image)
plt.savefig("image-with-regions.png") # <--- added this to output an image
# --- actual function body starts here ---
# Load the image from disk.
image = cv2.imread(image_path)
# define threshold of regions to ignore
# (bounding boxes whose width or height is below this value are skipped)
THRESHOLD_REGION_IGNORE = 40
# Grayscale -> blur -> inverted adaptive threshold.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (9,9), 0)
thresh = cv2.adaptiveThreshold(blur,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,11,30)
# Dilate to combine adjacent text contours
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
dilate = cv2.dilate(thresh, kernel, iterations=4)
# Find contours, highlight text areas, and extract ROIs
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# The result is handled as either a 2-tuple or a 3-tuple; pick the contour list.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
line_items_coordinates = []
for c in cnts:
# `area` is computed but not used in this version.
area = cv2.contourArea(c)
x, y, w, h = cv2.boundingRect(c)
# Skip regions that are too narrow or too short.
if w < THRESHOLD_REGION_IGNORE or h < THRESHOLD_REGION_IGNORE:
continue
# Draw the bounding box on the image and record its two corners.
image = cv2.rectangle(image, (x,y), (x+w, y+h), color=(255,0,255), thickness=3)
line_items_coordinates.append([(x,y), (x+w, y+h)])
return image, line_items_coordinates
# Bare name: this evaluates the function object but does not call it.
mark_region
Thank you for putting up the article and supplying the code!
@Matthew-Hiebing
I'm just trying to adapt the existing fantastic code and I found your question.
This snippet of your code is inside the function mark_region, but it should be outside, because it is the part that calls mark_region(FILENAME)
and passes it a FILENAME:
# Define the mark_region function first; the statements below call it, so they
# must sit at module level after the definition, not inside the function body.
FILENAME = r"C:\Users\mhiebing\Documents\GitHub_Repos\MonthlyStatsExtract\Page_1.jpg" # <--- change this to be the file you want
image, line_items_coordinates = mark_region(FILENAME)
plt.figure(figsize=(20,20))
plt.imshow(image)
plt.savefig("image-with-regions.png") # <--- added this to output an image
Thank you so much for your work.
There were problems with opencv-python 4.5.5.62. Use opencv-python==4.1.2.30 instead.
if w < THRESHOLD_REGION_IGNORE or h < THRESHOLD_REGION_IGNORE: continue image = cv2.rectangle(image, (x,y), (x+w, y+h), color=(255,0,255), thickness=3) line_items_coordinates.append([(x,y), (x+w, y+h)])
Thanks a lot, man. Before, I was only getting a few areas marked, but after revamping and using your code, it's marking every single area.
Thank you once again. Keep up the good work
if we want to optimize this code:
import cv2
from PIL import Image
from pdf2image import convert_from_path
def mark_region(im):
    """Outline candidate text regions on an already-loaded image.

    The image is converted to grayscale, blurred, adaptively thresholded and
    dilated so that adjacent text merges into larger blobs; the external
    contours of those blobs are then filtered by position and area.

    Args:
        im: Image array as returned by ``cv2.imread``. Rectangles are drawn
            onto this array.

    Returns:
        A tuple ``(image, line_items_coordinates)``. ``image`` is ``im`` with
        a rectangle drawn for every selected region (unchanged when nothing
        is selected). ``line_items_coordinates`` is a list of
        ``[(x, y), (2200, y + h)]`` corner pairs, one per rectangle drawn.
    """
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )
    # Dilate so neighbouring text merges into a single contour.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    found = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours yields a 2-tuple or a 3-tuple depending on the OpenCV
    # release; indexing [0] unconditionally picks the wrong element for the
    # 3-tuple form, so select the contour list by tuple length.
    cnts = found[0] if len(found) == 2 else found[1]

    # Bind the result up front so the return below is valid even when no
    # contour passes the filters.
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, _w, h = cv2.boundingRect(c)

        # NOTE(review): the position/area thresholds and the fixed right edge
        # (2200) are hard-coded for one page layout -- adjust for other scans.
        in_upper_band = y >= 600 and x <= 1000 and area > 10000
        in_lower_band = y >= 2400 and x <= 2000
        if in_upper_band or in_lower_band:
            image = cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
            line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates
# Folder containing the poppler binaries used by pdf2image.
# (A raw-string prefix must touch the opening quote: `r '...'` is a SyntaxError.)
poppler_path = r'you should write poppler bin folder path '
# Second positional argument is the rendering resolution (DPI).
pages = convert_from_path("you should write here pdf path", 480, poppler_path=poppler_path)

# Save each page as a JPEG, then mark its regions and write the annotated copy.
for i, page in enumerate(pages):
    image_name = f"Deneme_{i}.JPEG"
    page.save(image_name, "JPEG")
    im = cv2.imread(image_name)
    marked_image, coordinates = mark_region(im)
    cv2.imwrite(f"Marked_{image_name}", marked_image)
@jlumbroso - Thank you for the kind words 👍