Skip to content

Instantly share code, notes, and snippets.

@simonwo
Created July 20, 2020 20:10
Show Gist options
  • Save simonwo/cca8d2c201f6ced1b6cdd85d3e98c419 to your computer and use it in GitHub Desktop.
Save simonwo/cca8d2c201f6ced1b6cdd85d3e98c419 to your computer and use it in GitHub Desktop.
Pulling all of the tables from a Word document into JSON files
#!/usr/bin/env python3
from sys import stdout, stderr, argv
import os.path
import json
try:
from docx import Document
except ImportError:
print("Maybe you should $ pip install python-docx", file=stderr)
raise
def _write_rows(headers, rows, outputfile):
    """Stream *rows* to *outputfile* as a JSON array of objects.

    Each row becomes one object mapping header text to cell text. Objects
    are dumped one at a time so the whole table is never held in memory as
    a single JSON structure.

    Note that the mapping is built with ``dict(zip(...))``: if two columns
    share the same header text the later column wins, and a row with fewer
    cells than there are headers simply yields fewer keys.

    Args:
        headers: List of header strings, one per column.
        rows: Iterable of row objects exposing ``.cells``, each cell
            exposing ``.text``.
        outputfile: Writable text file object.
    """
    outputfile.write('[')
    for position, row in enumerate(rows):
        # Separator goes *before* every element except the first, so the
        # total number of rows does not need to be known in advance.
        if position:
            outputfile.write(',')
        values = [cell.text for cell in row.cells]
        json.dump(dict(zip(headers, values)), outputfile)
    outputfile.write(']')


def export_tables(document, directory, basename):
    """Write every table of *document* to its own JSON file.

    The first row of each table supplies the keys; every following row
    becomes one JSON object. Files are named
    ``{basename}table-{index}.json`` inside *directory*, and each path is
    printed to standard output as it is written.

    A table with no rows at all produces an empty JSON array rather than
    raising ``IndexError``.

    Args:
        document: Object exposing ``.tables``; each table exposes ``.rows``,
            each row ``.cells`` and each cell ``.text``.
        directory: Directory the JSON files are written to.
        basename: Prefix for the generated file names.

    Returns:
        The list of paths written, in table order.
    """
    written = []
    for index, table in enumerate(document.tables):
        # Materialise the rows once instead of re-reading table.rows for the
        # header, the length and the slice.
        rows = list(table.rows)
        headers = [cell.text for cell in rows[0].cells] if rows else []
        outpath = os.path.join(directory, f"{basename}table-{index}.json")
        print(outpath)
        # json.dump escapes non-ASCII by default, so the explicit encoding
        # only pins down what was previously platform-dependent.
        with open(outpath, 'w', encoding='utf-8') as outputfile:
            _write_rows(headers, rows[1:], outputfile)
        written.append(outpath)
    return written


def main(paths):
    """Convert the tables of each Word document in *paths* to JSON files.

    Output files are placed next to their source document and prefixed with
    the document's file name minus its extension.

    Args:
        paths: Iterable of paths to ``.docx`` files.
    """
    for path in paths:
        directory, filename = os.path.split(path)
        basename, _extension = os.path.splitext(filename)
        with open(path, 'rb') as input_file:
            document = Document(input_file)
            export_tables(document, directory, basename)


if __name__ == "__main__":
    main(argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment