Skip to content

Instantly share code, notes, and snippets.

@ricardosiri68
Created September 24, 2016 19:38
Show Gist options
  • Save ricardosiri68/56c5f5043bd8623d2f3b2797f5b33cbd to your computer and use it in GitHub Desktop.
Save ricardosiri68/56c5f5043bd8623d2f3b2797f5b33cbd to your computer and use it in GitHub Desktop.
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import models
class Pages(object):
    """Iterable that extracts the text of a PDF document page by page.

    Iterating yields one ``models.Page`` per processed page, pairing the
    pdfminer page object with the text extracted from that page only.

    Args:
        document: the pdfminer document whose pages are read through
            ``PDFPage.create_pages``.
        first_page: zero-based index of the first page to extract; pages
            with a lower index are skipped and not yielded. Defaults to
            ``DEFAULT_FIRST_PAGE`` so existing ``Pages(document)`` callers
            keep the historical behaviour.
    """

    # The original code hard-coded ``num > 314``, i.e. extraction started
    # at page index 315. Kept as the default for backward compatibility.
    DEFAULT_FIRST_PAGE = 315

    def __init__(self, document, first_page=DEFAULT_FIRST_PAGE):
        self.__document = document
        self.__first_page = first_page

    def __iter__(self):
        """Yield a ``models.Page`` for every page from ``first_page`` on."""
        manager = PDFResourceManager()
        outfp = StringIO()
        device = TextConverter(manager, outfp, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)
        for num, page in enumerate(PDFPage.create_pages(self.__document)):
            if num < self.__first_page:
                continue
            interpreter.process_page(page)
            text = outfp.getvalue()
            # The converter keeps appending to the same buffer; rewind and
            # empty it so the next page's text does not also contain the
            # text of every page processed before it.
            outfp.seek(0)
            outfp.truncate()
            yield models.Page(page, text)
class Document(PDFDocument):
    """A ``PDFDocument`` that remembers its parser and exposes its pages.

    Construction fails with ``PDFTextExtractionNotAllowed`` when the
    document reports that it is not extractable.
    """

    def __init__(self, parser):
        """Build the document from *parser* and verify it is extractable."""
        super(Document, self).__init__(parser)
        self.__parser = parser
        # Created on the first call to pages() and reused afterwards.
        self.__pages = None
        if not self.is_extractable:
            raise PDFTextExtractionNotAllowed()

    def pages(self):
        """Return the ``Pages`` iterable for this document, creating it once."""
        if self.__pages is not None:
            return self.__pages
        self.__pages = Pages(self)
        return self.__pages

    @property
    def parser(self):
        """The parser this document was constructed with (read-only)."""
        return self.__parser
import sys
from pdfminer.pdfparser import PDFParser
from collection import Document
if __name__ == '__main__':
    # Command line: <input.pdf> <output.txt>
    # The input stays open while writing because pages are read from it
    # as the document is iterated.
    with open(sys.argv[1], 'rb') as pdf_file:
        pdf_document = Document(PDFParser(pdf_file))
        # The output file is opened only after the document was accepted.
        with open(sys.argv[2], 'w') as text_file:
            text_file.writelines(
                page.text for page in pdf_document.pages())
class Page(object):
    """Value holder pairing a page object with its extracted text."""

    def __init__(self, page, text):
        """Store *page* and *text* as private attributes."""
        self.__text = text
        self.__page = page

    @property
    def text(self):
        """The text given at construction time (read-only)."""
        return self.__text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment