Created
July 12, 2017 00:49
-
-
Save nathan-sixnines/69bb7379c33d00917c3f2f5c0cef0612 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
def pdf_to_csv(filename):
    """Extract the text of the PDF at *filename*, one output line per text row.

    Every ``LTChar`` on a page is bucketed by the integer part of its negated
    vertical coordinate, so rows come out top-to-bottom, and the characters
    within a row are ordered by their horizontal coordinate.  Rows are
    written without any separator between characters, so despite the
    function's name the result is plain text rather than delimited CSV.

    Requires Python 2 (``cStringIO``) and the legacy pdfminer API in which
    ``PDFDocument`` is importable from ``pdfminer.pdfparser``.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text, encoded with the converter's codec (utf-8),
        with one newline-terminated line per row of characters.

    Raises:
        IOError: If *filename* cannot be opened.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that emits each page's characters grouped into rows."""

        def end_page(self, i):
            # Maps row key -> {horizontal coordinate: encoded character}.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the last two bbox values are used (presumably the
                    # right/top edges -- confirm against pdfminer's LTChar).
                    (_, _, x, y) = child.bbox
                    # Negating y makes an ascending sort run down the page;
                    # int() merges characters whose y differs by a fraction.
                    # A character sharing both row and x with an earlier one
                    # replaces it.
                    lines[int(-y)][x] = child._text.encode(self.codec)

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write("".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # The following part of the code is a remix of the convert() function
    # in the pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()

    with open(filename, 'rb') as fp:
        # utf-8 because my test documents are utf-8
        # (note: utf-8 is the default codec).
        device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                if page is not None:
                    interpreter.process_page(page)
        finally:
            # Release the device even when parsing fails; the enclosing
            # `with` then closes the file, matching the original close order.
            device.close()

    return outfp.getvalue()
# Command-line driver: walk the directory tree given as the first argument,
# run pdf_to_csv on every file found, and write each result to
# "<name>.txt" inside the directory given as the second argument.
if len(sys.argv) < 3:
    raise SystemExit("usage: %s INPUT_DIR OUTPUT_DIR" % sys.argv[0])

input_root, output_dir = sys.argv[1], sys.argv[2]
for subdir, _dirs, filenames in os.walk(input_root):
    for filename in filenames:
        text = pdf_to_csv(os.path.join(subdir, filename))
        # splitext removes only a real extension, so names that do not end
        # in a three-letter suffix are no longer truncated incorrectly.
        stem = os.path.splitext(filename)[0]
        # NOTE: outputs are flattened into one directory, so files with the
        # same name in different subdirectories overwrite each other.
        with open(os.path.join(output_dir, stem + ".txt"), "w") as text_file:
            text_file.write(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment