Skip to content

Instantly share code, notes, and snippets.

@bufke
Last active September 30, 2024 20:46
Show Gist options
  • Save bufke/8798262 to your computer and use it in GitHub Desktop.
Save bufke/8798262 to your computer and use it in GitHub Desktop.
Convert ODT, DOC, DOCX, and PDF files to text with Python and some Linux programs. Doesn't require LibreOffice.
from subprocess import Popen, PIPE
from docx import opendocx, getdocumenttext
#http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* using pdfminer.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``TextConverter`` that writes into an in-memory buffer.

    :param path: filesystem path of the PDF file to read.
    :returns: the contents of the buffer the converter wrote to (the
        converter is configured with the ``utf-8`` codec).
    :raises: whatever ``open`` or pdfminer raise for an unreadable or
        invalid file; the file, the converter and the buffer are released
        before the exception propagates.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``open`` replaces the Python-2-only ``file`` builtin; the
            # ``with`` block closes the handle even if a page fails.
            with open(path, 'rb') as fp:
                # Same arguments as before: empty page-number set,
                # maxpages=0, empty password, caching enabled.
                # NOTE(review): presumably these select every page of an
                # unencrypted document -- confirm against pdfminer docs.
                pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer, as the original
            # statement order did.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
def _run_text_tool(cmd):
    """Run the external converter *cmd* and return its stdout as text."""
    process = Popen(cmd, stdout=PIPE)
    # Only stdout is piped, so the second element of communicate()'s result
    # carries nothing and the tool's error output is not captured here.
    stdout = process.communicate()[0]
    # UTF-8 with 'ignore' yields the same result as the previous
    # ASCII/'ignore' decoding for pure-ASCII output, but keeps non-ASCII
    # characters when the tool emits UTF-8 instead of silently dropping them.
    return stdout.decode('utf-8', 'ignore')


def document_to_text(filename, file_path):
    """Convert a .doc, .docx, .odt or .pdf document to text.

    The converter is chosen from the extension of *filename*, compared
    case-insensitively, while the content is read from *file_path*.

    :param filename: name whose extension selects the converter.
    :param file_path: path of the document handed to the converter.
    :returns: for ``.doc`` and ``.odt`` the decoded stdout of ``antiword``
        and ``odt2txt`` respectively; for ``.docx`` the paragraphs, each
        encoded as UTF-8, joined by blank lines; for ``.pdf`` the result of
        ``convert_pdf_to_txt``; ``None`` for any other extension.
    """
    # Lower-casing lets names such as "REPORT.DOC" match as well.
    lowered = filename.lower()
    if lowered.endswith(".doc"):
        return _run_text_tool(['antiword', file_path])
    if lowered.endswith(".docx"):
        document = opendocx(file_path)
        paragraphs = [paratext.encode("utf-8")
                      for paratext in getdocumenttext(document)]
        return '\n\n'.join(paragraphs)
    if lowered.endswith(".odt"):
        return _run_text_tool(['odt2txt', file_path])
    if lowered.endswith(".pdf"):
        return convert_pdf_to_txt(file_path)
    # Unsupported extension: keep the original behaviour of returning None.
    return None
@ganeshkharad2
Copy link

ganeshkharad2 commented Aug 5, 2020

The .doc conversion is not working;
it only returns an empty string.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment