Skip to content

Instantly share code, notes, and snippets.

@telecran-telecrit
Created August 6, 2024 04:35
Show Gist options
  • Save telecran-telecrit/f8e882d708f03a1cc6c8462dd62f54a0 to your computer and use it in GitHub Desktop.
Save telecran-telecrit/f8e882d708f03a1cc6c8462dd62f54a0 to your computer and use it in GitHub Desktop.
To extract text from a PDF, you can try the code examples below.
import re
import fitz
def validate_resume(filename, required_skills=None, required_experience=None):
    """Check whether a PDF resume mentions every required keyword.

    The text of all pages is extracted with the ``fitz`` module and searched
    case-insensitively for each skill and each experience phrase.

    Args:
        filename: Path of the PDF file to open.
        required_skills: Iterable of keywords that must all appear in the
            text. Defaults to Python, Java, JavaScript and SQL.
        required_experience: Iterable of phrases that must all appear in the
            text. Defaults to 'years of experience' and
            'months of experience'.

    Returns:
        True when every keyword and phrase is found (the extracted text is
        also printed in that case), False as soon as one is missing.
    """
    if required_skills is None:
        required_skills = ['Python', 'Java', 'JavaScript', 'SQL']
    if required_experience is None:
        required_experience = ['years of experience', 'months of experience']

    # Open the PDF file and read its contents; join once instead of
    # growing the string page by page.
    with fitz.open(filename) as pdf:
        resume_text = ''.join(page.get_text() for page in pdf)

    # Keywords are escaped so they are matched literally: a keyword such as
    # "C++" must not be interpreted as a regular expression.
    # NOTE(review): this is a substring match, so 'Java' is also satisfied
    # by 'JavaScript', and BOTH experience phrases are required by default.
    for keyword in (*required_skills, *required_experience):
        if not re.search(re.escape(keyword), resume_text, re.IGNORECASE):
            return False

    print(resume_text)
    return True
# Run the validator on a sample file and report the outcome.
filename = 'trial.pdf'
if not validate_resume(filename):
    print('[ign] invalid resume.')
else:
    print('[OK] valid resume: ' + filename)
import re
import PyPDF2
filename = 'trial.pdf'

# Open the PDF file in read-binary mode; the reader needs the file to stay
# open while pages are being read, so all work happens inside the `with`.
with open(filename, 'rb') as pdf_file:
    # Create a PDF reader object. `PdfReader` replaces `PdfFileReader`,
    # which was removed in PyPDF2 3.0 (as were `numPages`, `getPage` and
    # `extractText`).
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Iterate over the pages directly instead of indexing by page number.
    for page_obj in pdf_reader.pages:
        # Extract the text from the page
        page_text = page_obj.extract_text()
        # Print the extracted text
        print(page_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment