Skip to content

Instantly share code, notes, and snippets.

@sweemeng
Created October 15, 2011 06:41
Show Gist options
  • Select an option

  • Save sweemeng/1289157 to your computer and use it in GitHub Desktop.

Select an option

Save sweemeng/1289157 to your computer and use it in GitHub Desktop.
My own Python PDF toolkit
from xml.sax.saxutils import escape

from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBox, LTTextLine, LTFigure, LTImage, LTChar,LTTextBoxHorizontal
class Document:
    """Small convenience wrapper around pdfminer for extracting text from a PDF.

    The constructor opens the file and wires up the pdfminer
    parser / document / resource manager / aggregator / interpreter chain.
    The instance then offers the per-page layouts (:meth:`get_layout`), the
    concatenated plain text (:meth:`get_text`) and a simple XML rendering
    of the top-level text containers (:meth:`to_xml`).

    The underlying file must stay open while pages are being processed, so
    it is kept on ``self.fp``. Call :meth:`close` when finished, or use the
    instance as a context manager::

        with Document('report.pdf') as doc:
            text = doc.get_text()
    """

    def __init__(self, filename, password=''):
        """Open *filename* and prepare the pdfminer processing pipeline.

        Args:
            filename: Path of the PDF file; opened in binary mode.
            password: Password passed to ``PDFDocument.initialize``.
                Defaults to the empty string.

        Raises:
            Whatever ``open`` or the pdfminer setup calls raise; the file
            handle is closed before the exception propagates.
        """
        self.extracted_text = ''
        self.xml_text = ''
        self.password = password
        self.fp = open(filename, 'rb')
        try:
            self.parser = PDFParser(self.fp)
            self.doc = PDFDocument()
            # Parser and document need to know about each other.
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(self.password)
            self.rsrcmgr = PDFResourceManager()
            self.laparams = LAParams()
            self.device = PDFPageAggregator(self.rsrcmgr,
                                            laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        except Exception:
            # Do not leak the file handle when the PDF cannot be set up
            # (for example a corrupt file or a wrong password).
            self.fp.close()
            raise

    def close(self):
        """Close the underlying PDF file. Safe to call more than once."""
        self.fp.close()

    def __enter__(self):
        """Return the document itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; never swallows errors."""
        self.close()

    def get_layout(self):
        """Yield the aggregated layout object of each page, in page order.

        Every page is run through the interpreter and the result is fetched
        from the aggregator device before it is yielded.
        """
        for page in self.doc.get_pages():
            self.interpreter.process_page(page)
            yield self.device.get_result()

    @staticmethod
    def _xml_tag(item):
        """Return the XML tag name for a layout *item*, or None if not text.

        The most specific class is tested first: pdfminer's
        ``LTTextBoxHorizontal`` derives from ``LTTextBox``, so testing the
        classes independently would match a horizontal box twice.
        """
        if isinstance(item, LTTextBoxHorizontal):
            return 'LTTextBoxHorizontal'
        if isinstance(item, LTTextBox):
            return 'LTTextBox'
        if isinstance(item, LTTextLine):
            return 'LTTextLine'
        return None

    def get_text(self):
        """Return the text of all top-level text boxes and lines.

        The result is rebuilt from scratch on every call and also stored in
        ``self.extracted_text``, so calling the method repeatedly no longer
        duplicates previously extracted text.
        """
        text_types = (LTTextBox, LTTextBoxHorizontal, LTTextLine)
        chunks = []
        for layout in self.get_layout():
            for item in layout:
                if not isinstance(item, text_types):
                    continue
                text = item.get_text()
                if text:
                    chunks.append(text)
        # A single join avoids the quadratic cost of repeated concatenation.
        self.extracted_text = ''.join(chunks)
        return self.extracted_text

    def to_xml(self):
        """Return the text containers rendered as simple XML markup.

        The output starts with an XML declaration, followed by one element
        per non-empty top-level text container, each on its own line. The
        element is named after the most specific layout class of the item
        (``LTTextBoxHorizontal``, ``LTTextBox`` or ``LTTextLine``), so each
        container appears exactly once. Text is XML-escaped so that ``&``,
        ``<`` and ``>`` in the PDF cannot break the markup.

        The result is also stored in ``self.temp_xml`` (kept for existing
        callers) and ``self.xml_text``.

        NOTE: as before, the elements are not wrapped in a single root
        element; that structure is left unchanged for backward
        compatibility.
        """
        parts = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
        for layout in self.get_layout():
            for item in layout:
                tag = self._xml_tag(item)
                if tag is None:
                    continue
                text = item.get_text()
                if text:
                    parts.append('<%s>%s</%s>\n' % (tag, escape(text), tag))
        self.temp_xml = ''.join(parts)
        self.xml_text = self.temp_xml
        return self.temp_xml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment