Skip to content

Instantly share code, notes, and snippets.

@justpeanuts
Forked from Samathy/dumppdfcomments.py
Created April 4, 2021 13:38
Show Gist options
  • Save justpeanuts/6376282928353ca3145f675f2089f830 to your computer and use it in GitHub Desktop.
Save justpeanuts/6376282928353ca3145f675f2089f830 to your computer and use it in GitHub Desktop.
Python script to extract highlighted text from PDFs. Uses python-poppler-qt4. Updated to Python 3 from the original answer [1]. [1] https://stackoverflow.com/questions/21050551/extracting-text-from-higlighted-text-using-poppler-qt4-python-poppler-qt4
import popplerqt4
import sys
import PyQt4
def main():
    """Print the text covered by each highlight annotation in a PDF.

    The PDF path is taken from the first command-line argument. For every
    highlight annotation, the text under each of its quads is extracted and
    printed on one line (each piece followed by a space). A summary line
    with the total number of annotations of any kind is printed at the end.

    Raises:
        SystemExit: with a message on stderr if no path was given or the
            document could not be loaded.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "dumppdfcomments.py"
        sys.exit("usage: {} FILE.pdf".format(prog))

    path = sys.argv[1]
    doc = popplerqt4.Poppler.Document.load(path)
    # A failed load yields no document object; stop here instead of
    # crashing later with an AttributeError on None.
    if doc is None:
        sys.exit("error: could not load PDF {!r}".format(path))

    total_annotations = 0
    for page_index in range(doc.numPages()):
        page = doc.page(page_index)
        page_size = page.pageSize()
        pwidth, pheight = page_size.width(), page_size.height()

        for annotation in page.annotations():
            if not isinstance(annotation, popplerqt4.Poppler.Annotation):
                continue
            # Every annotation counts towards the total, highlight or not.
            total_annotations += 1
            if not isinstance(annotation,
                              popplerqt4.Poppler.HighlightAnnotation):
                continue

            pieces = []
            for quad in annotation.highlightQuads():
                # Quad points are scaled by the page size to get page
                # coordinates (presumably they are normalized to 0..1 --
                # confirm against the Poppler docs). Points 0 and 2 are
                # used as the two corners passed to setCoords.
                bdy = PyQt4.QtCore.QRectF()
                bdy.setCoords(quad.points[0].x() * pwidth,
                              quad.points[0].y() * pheight,
                              quad.points[2].x() * pwidth,
                              quad.points[2].y() * pheight)
                pieces.append(str(page.text(bdy)))

            # Keep the historical output format: every piece is followed
            # by a single space, including the last one.
            print("".join(piece + ' ' for piece in pieces))

    if total_annotations > 0:
        print(str(total_annotations) + " annotation(s) found")
    else:
        print("no annotations found")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment