Created
July 20, 2020 20:10
-
-
Save simonwo/cca8d2c201f6ced1b6cdd85d3e98c419 to your computer and use it in GitHub Desktop.
Pulling all of the tables from a Word document into JSON files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from sys import stdout, stderr, argv | |
import os.path | |
import json | |
try: | |
from docx import Document | |
except ImportError: | |
print("Maybe you should $ pip install python-docx", file=stderr) | |
raise | |
def _table_records(table):
    """Yield one ``{header text: cell text}`` dict per content row of *table*.

    The first row supplies the keys and every later row supplies the values.
    A table with no rows at all yields nothing instead of raising
    ``IndexError``. Because the pairs go through ``dict(zip(...))``, header
    cells with identical text collapse into a single key (last value wins),
    and a row is truncated to the shorter of headers/values.
    """
    rows = list(table.rows)
    if not rows:
        return
    headers = [cell.text for cell in rows[0].cells]
    for row in rows[1:]:
        values = [cell.text for cell in row.cells]
        yield dict(zip(headers, values))


def _write_json_array(records, outputfile):
    """Stream *records* to *outputfile* as a single JSON array.

    Records are dumped one at a time, so the whole table never has to be
    assembled into one large JSON structure in memory.
    """
    outputfile.write('[')
    for position, record in enumerate(records):
        # Separator goes *before* every record except the first, so the
        # total number of records does not need to be known up front.
        if position:
            outputfile.write(',')
        json.dump(record, outputfile)
    outputfile.write(']')


# Each command-line argument is the path of a Word document; every table in
# it is written next to the document as "<basename>table-<n>.json".
for path in argv[1:]:
    directory, filename = os.path.split(path)
    basename, _extension = os.path.splitext(filename)
    with open(path, 'rb') as input_file:
        document = Document(input_file)
        # A distinct name for the table counter: the row loop lives in a
        # helper now, so nothing can overwrite it while a table is written.
        for table_index, table in enumerate(document.tables):
            outpath = os.path.join(directory, f"{basename}table-{table_index}.json")
            print(outpath, file=stdout)  # Report each file as it is produced
            with open(outpath, 'w') as outputfile:
                _write_json_array(_table_records(table), outputfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment