Last active
October 12, 2021 06:07
-
-
Save VpkPrasanna/bcc2153130fa45bf5bd82518d0eb0769 to your computer and use it in GitHub Desktop.
This helps us extract each text instance found in an image as a separate image.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Necessary Imports | |
import textseg as ts | |
from PyPDF2 import PdfFileReader | |
from pdf2image import convert_from_path | |
import cv2 | |
import json | |
import pandas as pd | |
import glob | |
import os | |
# Module-level accumulators; each list receives one entry per processed PDF.
data = pd.DataFrame()
final_name_list = []
final_text_opencv = []
final_text_tessaract = []

# NOTE(review): `resumes` (iterable of PDF paths), `path_to_write` (output
# prefix that is string-concatenated with file names, so it presumably ends
# with a path separator) and `text_from_tesseract` are not defined in the
# visible part of this file -- they must be in scope before this loop runs.
for pdf_path in resumes:
    # File name of the PDF without its directory, and without its extension.
    # os.path is used instead of split('/') / split('.') so that dotted names
    # such as "john.doe.pdf" are not truncated to "john".
    fname = os.path.basename(pdf_path)
    stem = os.path.splitext(fname)[0]

    # Check how many pages each PDF contains. The file is opened in a
    # `with` block so the handle is closed once the count has been printed.
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)
        print(pdf.getNumPages())

    # Convert the PDF into one image object per page.
    images = convert_from_path(pdf_path)

    # Save every page as a JPEG and remember (image name, full path) so the
    # second loop does not have to re-parse the path to recover the name.
    resumes_img = []
    for page_no, page in enumerate(images):
        img_name = stem + '_' + str(page_no) + '.jpg'
        img_path = path_to_write + img_name
        page.save(img_path, 'JPEG')
        resumes_img.append((img_name, img_path))

    name_list = stem + '_' + '.jpg'
    text_opencv = []
    text_tessaract = []

    for img, img_path in resumes_img:
        # Read the page image using opencv, then delete the temporary JPEG:
        # only the in-memory frame is used from here on.
        frame = cv2.imread(img_path)
        os.remove(img_path)

        # Pass the image to the model to get the text instances present in
        # the image. Only output_img, c_dict and split_img are used below.
        output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)

        page_stem = os.path.splitext(img)[0]
        cv2.imwrite(path_to_write + page_stem + ".png", output_img)

        # Save each text instance as an individual image.
        # NOTE(review): the region index is appended without a separator, so
        # page 1 / region 0 ("<stem>_10.png") has the same name as the page 10
        # overview image. Kept as-is to preserve the existing output naming.
        for region_no, region in enumerate(split_img):
            cv2.imwrite(path_to_write + page_stem + str(region_no) + ".png", region)

        text_opencv.append(c_dict)
        text_tessaract.extend(text_from_tesseract(output_img))

    # Joined once per PDF, after all pages: this stays defined (as '') for a
    # PDF that produced no pages instead of raising NameError or silently
    # reusing the previous PDF's text.
    tesseract_str = ''.join(text_tessaract)

    final_name_list.append(name_list)
    final_text_opencv.append(text_opencv)
    final_text_tessaract.append(tesseract_str)

# we are selecting the index 0 as we have passed one PDF as a input which contains one Page
# Guarded so an empty `resumes` does not end the script with an IndexError.
if final_text_opencv:
    print(final_text_opencv[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment