Created
November 13, 2019 10:23
-
-
Save Mehmet-Erkan/f9c7055521036514d18154cdae21df1a to your computer and use it in GitHub Desktop.
Python PyPDF2: match a string (regular expression) in a PDF document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_match(pdfdoc, annotation: str) -> list:
    """Search each page of a PDF for the first regex match of *annotation*.

    Args:
        pdfdoc: A reader object exposing ``getNumPages()`` and
            ``getPage(index)``, whose pages expose ``extractText()``
            (the legacy PyPDF2 ``PdfFileReader`` interface).
        annotation: Pattern handed to ``re``; it is interpreted as a regular
            expression, so escape it with ``re.escape`` for a literal search.

    Returns:
        A list with one dict per page on which the pattern matched, in page
        order. Each dict has the keys ``'Page'`` (zero-based page index as a
        string), ``'Annotation'`` (the matched text), ``'Start'`` and
        ``'End'`` (match offsets within that page's extracted text). Only the
        first match on each page is reported. The list is empty when nothing
        matches or the document has no pages.
    """
    # Function-scope import so the snippet works even when the surrounding
    # file does not import ``re`` at module level.
    import re

    # Compile once instead of handing the raw pattern to re.search per page.
    pattern = re.compile(annotation)

    results = []
    # Take the page count from the document itself rather than relying on an
    # externally defined ``NumPages`` global.
    for page_index in range(pdfdoc.getNumPages()):
        text = pdfdoc.getPage(page_index).extractText()
        found = pattern.search(text)
        if found is None:
            continue
        results.append(
            {
                'Page': str(page_index),
                'Annotation': found.group(0),
                'Start': found.start(),
                'End': found.end(),
            }
        )
    return results
# Example usage: open a PDF with the legacy PyPDF2 reader and search it.
import PyPDF2  # script-level import; the snippet uses PyPDF2 without importing it

doc = PyPDF2.PdfFileReader('./mypdf.pdf')
# The search term is treated as a regular expression by find_match.
annotation = "searchMeString"
# NOTE(review): the returned list of matches is discarded here; bind or
# print it if the results are needed.
find_match(doc, annotation)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment