Created
June 7, 2013 03:59
-
-
Save markrwilliams/5726983 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from zipfile import ZipFile

import lxml.etree
class DocX(object):
    """Plain-text view of the paragraphs stored in a .docx file.

    The file is opened as a zip archive and the member at ``DOCPATH`` is
    parsed as XML.

    Attributes set by ``__init__``:
        doc: the tree returned by ``lxml.etree.parse`` for ``DOCPATH``.
        nsmap: the namespace map of the document's root element, exactly
            as reported by lxml.
    """

    # Archive member that is parsed as the document body.
    DOCPATH = 'word/document.xml'

    def __init__(self, fn):
        """Parse the document XML out of the archive *fn*.

        Args:
            fn: path (or file-like object) accepted by ``zipfile.ZipFile``.

        Raises:
            Whatever ``ZipFile``/``lxml.etree.parse`` raise for an
            unreadable archive, a missing ``DOCPATH`` member or
            malformed XML.
        """
        # Manage the archive itself, not just the member stream, so the
        # underlying file handle is released once parsing is done.
        with ZipFile(fn) as archive:
            with archive.open(self.DOCPATH) as f:
                self.doc = lxml.etree.parse(f)
        self.nsmap = self.doc.getroot().nsmap

    @property
    def paragraphs(self):
        """Return a list with one string per ``w:p`` element.

        Each string is the concatenation of the text of every ``w:t``
        element below that paragraph; a paragraph without any text
        yields ``''``.
        """
        # XPath has no notion of a default namespace, so a None prefix
        # (present when the root declares xmlns="...") must not be passed on.
        namespaces = dict((prefix, uri)
                          for prefix, uri in self.nsmap.items()
                          if prefix is not None)
        paragraphs = []
        for p in self.doc.xpath('.//w:p', namespaces=namespaces):
            runs = p.xpath('.//w:t', namespaces=namespaces)
            # An empty <w:t/> element has text None; treat it as ''.
            paragraphs.append(''.join(t.text or '' for t in runs))
        return paragraphs
if __name__ == '__main__':
    # Command-line entry point: extract the text of a .docx file and either
    # write it to a UTF-8 file (--output/-o) or print it to stdout.
    import codecs
    import argparse

    parser = argparse.ArgumentParser(description='.docx plain text extractor')
    parser.add_argument('docx',
                        help='the source docx')
    parser.add_argument('--output',
                        '-o',
                        default=None,
                        help='output file')
    args = parser.parse_args()

    # Read the document named on the command line (previously a fixed
    # path was opened and the positional argument was ignored).
    plaintext = '\n'.join(DocX(args.docx).paragraphs)

    if args.output:
        with codecs.open(args.output, 'w', encoding='utf-8') as f:
            f.write(plaintext)
    else:
        # Call form works on both Python 2 and Python 3.
        print(plaintext)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment