Created
August 27, 2018 05:23
-
-
Save jnothman/c04cab657c995591a157e53c453229d6 to your computer and use it in GitHub Desktop.
Load tables from a Word (.docx) document into pandas DataFrames
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import zipfile

from lxml import etree
import pandas as pd
def read_docx(docx_file, **kwargs):
    """Read the tables of a Word document as pandas DataFrames.

    The main document part (``word/document.xml``) is parsed, and its
    WordprocessingML table elements are renamed to their HTML counterparts
    (``w:tbl`` -> ``table``, ``w:tr`` -> ``tr``, ``w:tc`` -> ``td``). The
    serialized tree is then handed to ``pandas.read_html``.

    Parameters
    ----------
    docx_file
        Passed directly to ``zipfile.ZipFile``.
    **kwargs
        Forwarded unchanged to ``pandas.read_html``.

    Returns
    -------
    The result of ``pandas.read_html`` (documented by pandas as a list of
    DataFrames, one per table found).
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    # WordprocessingML local name -> HTML tag understood by read_html.
    html_tags = (('tbl', 'table'), ('tr', 'tr'), ('tc', 'td'))

    # Context-manage the archive itself, not only the member stream, so the
    # underlying file handle is released as soon as parsing is done.
    with zipfile.ZipFile(docx_file) as archive:
        with archive.open('word/document.xml') as f:
            root = etree.parse(f)

    for word_tag, html_tag in html_tags:
        for el in root.xpath('//w:' + word_tag, namespaces=ns):
            # Assigning a bare tag name also drops the element's namespace.
            el.tag = html_tag

    # Wrap the serialized bytes in a file-like object: passing literal markup
    # straight to read_html is deprecated in recent pandas releases.
    return pd.read_html(io.BytesIO(etree.tostring(root)), **kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment