Skip to content

Instantly share code, notes, and snippets.

@chulman444
Created June 23, 2018 15:38
Show Gist options
  • Save chulman444/4e6d7c2bfbce9ed63e08f0a42843be06 to your computer and use it in GitHub Desktop.
Save chulman444/4e6d7c2bfbce9ed63e08f0a42843be06 to your computer and use it in GitHub Desktop.
Collect all docx elements from the .docx files in a directory tree. Use together with the script in this gist (https://gist.github.com/chulman444/b9e0ef2b61a241f47d25746a9a26e9f0).
import sys, os, json, time
from examine_docx_elements import run as getDocxElements
def main():
    """Run the collection for the directory named by the first CLI argument.

    Prints a start marker, then the value returned by ``run``, then the
    elapsed wall-clock time in seconds.
    """
    print("Started")
    target_dir = sys.argv[1]

    started_at = time.time()
    result = run(target_dir)
    elapsed = time.time() - started_at

    print(result)
    print("Took {} seconds".format(elapsed))
def run(rootdir):
    """Collect the docx elements of every ``.docx`` file under *rootdir*.

    Walks the directory tree with ``os.walk``, passes each file whose name
    ends in ``.docx`` to ``getDocxElements`` and records in a
    ``FilepathTracker`` which file each returned element came from.  The
    tracker's report is printed once the walk has finished.

    Args:
        rootdir: Directory whose tree is searched.

    Returns:
        A set holding the distinct elements returned by ``getDocxElements``
        across all processed files.
    """
    unique_docx_elements = set()
    ft = FilepathTracker()

    for root, _subdirs, files in os.walk(rootdir):
        for f in files:
            # Test the real suffix.  The previous check compared the text
            # after the FIRST dot, so "report.v2.docx" was skipped while
            # "notes.docx.bak" was accepted.
            if not f.endswith(".docx"):
                continue

            # os.path.join avoids doubled or mixed separators when the walk
            # root already ends with one or uses the platform separator.
            filepath = os.path.join(root, f)
            docx_elements = getDocxElements(filepath)
            unique_docx_elements.update(docx_elements)
            ft.track(filepath, docx_elements)

    ft.print()

    return unique_docx_elements
class FilepathTracker():
    """Remember, for each docx element, the file paths it was reported for."""

    def __init__(self):
        # Maps element -> list of file paths; a path is appended once per
        # time the element appears in a tracked element list.
        self.tracked = {}

    def track(self, filepath, docx_elements):
        """Append *filepath* to the entry of every item in *docx_elements*."""
        for element in docx_elements:
            if element not in self.tracked:
                self.tracked[element] = []
            self.tracked[element].append(filepath)

    def print(self):
        """Print the full mapping as indented JSON, then the count summary."""
        print(json.dumps(self.tracked, indent=4))
        self.printOccurrenceSummary()

    def printOccurrenceSummary(self):
        """Print, as indented JSON, how many paths are stored per element."""
        counts = {
            element: len(paths) for element, paths in self.tracked.items()
        }
        print(json.dumps(counts, indent=4))
# Invoke the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment