VpkPrasanna · October 12, 2021 06:07
diff --git a/East Text Detection.py b/East Text Detection.py
 # Necessary Imports
 import textseg as ts
 from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path
 import cv2
 import json
 import pandas as pd
 import glob
 import os

 # Convert each resume PDF to page images, run text segmentation on every page,
 # and collect the per-PDF results in the accumulators below (one entry per PDF).
 #
 # NOTE(review): `resumes`, `path_to_write` and `text_from_tesseract` are not
 # defined in the code visible here -- presumably an iterable of PDF paths, an
 # output directory prefix ending in a path separator, and an OCR helper.
 # Confirm against the rest of the file.
 # NOTE(review): `data` is not used in the code visible here; it is kept in
 # case later code relies on it.
 data = pd.DataFrame()
 final_name_list = []
 final_text_opencv = []
 final_text_tessaract = []

 for pdf_path in resumes:
     # File name of the PDF and the part before its first dot, which is used
     # as the prefix of every image written for this PDF.
     fname = os.path.basename(pdf_path)
     stem = fname.split('.')[0]

     # Report how many pages the PDF contains. The handle is opened in a
     # `with` block so it is closed even if reading the PDF fails.
     with open(pdf_path, 'rb') as pdf_file:
         pdf = PdfFileReader(pdf_file)
         print(pdf.getNumPages())

     # Render the PDF pages and save each one as a temporary JPEG, remembering
     # the paths so they can be passed through the model.
     images = convert_from_path(pdf_path)
     resumes_img = []
     for page_idx, page in enumerate(images):
         page_jpg = path_to_write + stem + '_' + str(page_idx) + '.jpg'
         page.save(page_jpg, 'JPEG')
         resumes_img.append(page_jpg)

     name_list = stem + '_' + '.jpg'
     text_opencv = []
     text_tessaract = []
     for image_path in resumes_img:
         # Load the page image, then delete the temporary JPEG from disk.
         frame = cv2.imread(image_path)
         os.remove(image_path)

         # Use the real file name rather than a fixed path component index, so
         # this works regardless of how many directories `path_to_write` has.
         img = os.path.basename(image_path)
         img_stem = img.split('.')[0]

         # Pass the image to the model to get the text instances present in it.
         output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)
         cv2.imwrite(path_to_write + img_stem + ".png", output_img)

         # Save every detected text instance as an individual image.
         # NOTE(review): the segment index is appended without a separator, so
         # page 1 / segment 0 and the page-10 output image both map to
         # "<stem>_10.png". Names are left unchanged here in case downstream
         # code depends on them -- confirm and add a separator if not.
         for seg_idx, segment in enumerate(split_img):
             cv2.imwrite(path_to_write + img_stem + str(seg_idx) + ".png", segment)

         text_opencv.append(c_dict)
         text_tessaract += text_from_tesseract(output_img)

     # Join once per PDF, after all pages are processed. This also gives a PDF
     # with no pages an empty string instead of an undefined or stale value.
     tesseract_str = ''.join(text_tessaract)

     final_name_list.append(name_list)
     final_text_opencv.append(text_opencv)
     final_text_tessaract.append(tesseract_str)

 # Show the result for the first PDF only; skipped when nothing was processed.
 if final_text_opencv:
     print(final_text_opencv[0])
	# Necessary Imports
	import textseg as ts
	from PyPDF2 import PdfFileReader
	from pdf2image import convert_from_path
	import cv2
	import json
	import pandas as pd
	import glob
	import os

	# Convert each resume PDF to page images, run text segmentation on every page,
	# and collect the per-PDF results in the accumulators below (one entry per PDF).
	# The nesting of the loops is restored here; every line keeps the leading tab
	# that the surrounding lines of this listing carry.
	#
	# NOTE(review): `resumes`, `path_to_write` and `text_from_tesseract` are not
	# defined in the code visible here -- presumably an iterable of PDF paths, an
	# output directory prefix ending in a path separator, and an OCR helper.
	# Confirm against the rest of the file.
	# NOTE(review): `data` is not used in the code visible here; it is kept in
	# case later code relies on it.
	data = pd.DataFrame()
	final_name_list = []
	final_text_opencv = []
	final_text_tessaract = []

	for pdf_path in resumes:
	    # File name of the PDF and the part before its first dot, which is used
	    # as the prefix of every image written for this PDF.
	    fname = os.path.basename(pdf_path)
	    stem = fname.split('.')[0]

	    # Report how many pages the PDF contains. The handle is opened in a
	    # `with` block so it is closed even if reading the PDF fails.
	    with open(pdf_path, 'rb') as pdf_file:
	        pdf = PdfFileReader(pdf_file)
	        print(pdf.getNumPages())

	    # Render the PDF pages and save each one as a temporary JPEG, remembering
	    # the paths so they can be passed through the model.
	    images = convert_from_path(pdf_path)
	    resumes_img = []
	    for page_idx, page in enumerate(images):
	        page_jpg = path_to_write + stem + '_' + str(page_idx) + '.jpg'
	        page.save(page_jpg, 'JPEG')
	        resumes_img.append(page_jpg)

	    name_list = stem + '_' + '.jpg'
	    text_opencv = []
	    text_tessaract = []
	    for image_path in resumes_img:
	        # Load the page image, then delete the temporary JPEG from disk.
	        frame = cv2.imread(image_path)
	        os.remove(image_path)

	        # Use the real file name rather than a fixed path component index, so
	        # this works regardless of how many directories `path_to_write` has.
	        img = os.path.basename(image_path)
	        img_stem = img.split('.')[0]

	        # Pass the image to the model to get the text instances present in it.
	        output_img, label, dilate, c_dict, df1, split_img = ts.get_text_seg(frame, img)
	        cv2.imwrite(path_to_write + img_stem + ".png", output_img)

	        # Save every detected text instance as an individual image.
	        # NOTE(review): the segment index is appended without a separator, so
	        # page 1 / segment 0 and the page-10 output image both map to
	        # "<stem>_10.png". Names are left unchanged here in case downstream
	        # code depends on them -- confirm and add a separator if not.
	        for seg_idx, segment in enumerate(split_img):
	            cv2.imwrite(path_to_write + img_stem + str(seg_idx) + ".png", segment)

	        text_opencv.append(c_dict)
	        text_tessaract += text_from_tesseract(output_img)

	    # Join once per PDF, after all pages are processed. This also gives a PDF
	    # with no pages an empty string instead of an undefined or stale value.
	    tesseract_str = ''.join(text_tessaract)

	    final_name_list.append(name_list)
	    final_text_opencv.append(text_opencv)
	    final_text_tessaract.append(tesseract_str)

	# Show the result for the first PDF only; skipped when nothing was processed.
	if final_text_opencv:
	    print(final_text_opencv[0])