Skip to content

Instantly share code, notes, and snippets.

@pazz
Created April 24, 2013 20:00
Show Gist options
  • Select an option

  • Save pazz/5455090 to your computer and use it in GitHub Desktop.

Select an option

Save pazz/5455090 to your computer and use it in GitHub Desktop.
pdf2txt using pdfminer
#!/usr/bin/env python2
import sys
from StringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
def pdf2txt(fp, password='', outcodec='utf-8', caching=True):
    """
    Extract the text of a PDF and return it as a single string.

    :param fp: PDF file object; it is handed to `process_pdf` unchanged
        (the script entry point in this file opens it in binary mode).
    :param password: password forwarded to `process_pdf`; empty by default.
    :param outcodec: codec name handed to `TextConverter` for its output.
    :param caching: forwarded to both `PDFResourceManager` and `process_pdf`.
    :returns: everything `TextConverter` wrote to the in-memory buffer.

    possibly raises `pdfminer.pdfinterp.PDFTextExtractionNotAllowed`
    (extraction permission is checked because `check_extractable=True`).
    """
    # Layout-analysis settings for the converter; pdfminer's defaults are
    # used as-is (see the pdfminer documentation for the individual knobs).
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = StringIO()
    try:
        device = TextConverter(rsrcmgr, outfp, codec=outcodec,
                               laparams=laparams)
        try:
            # Empty page-number set and maxpages=0: no page restriction is
            # requested here -- presumably "all pages"; confirm against the
            # pdfminer `process_pdf` documentation.
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                        password=password, caching=caching,
                        check_extractable=True)
        finally:
            # Close the converter even when processing fails.
            device.close()
        # Read the buffer only after the device is closed, as before.
        return outfp.getvalue()
    finally:
        # The buffer is released on both the success and the error path.
        outfp.close()
if __name__ == '__main__':
    # Command-line entry point: convert the PDF named by the first argument
    # and write the extracted text to standard output.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: %s FILE.pdf' % sys.argv[0])
    # `with` closes the file even if extraction raises.
    with open(sys.argv[1], 'rb') as fp:
        txt = pdf2txt(fp)
    # Emit the result; previously it was computed and then discarded.
    sys.stdout.write(txt)
@np
Copy link

np commented Apr 24, 2013

What about print(txt) at the end of the file :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment