Created
December 1, 2022 18:42
-
-
Save devdave/3868ca58d639f7705ae37b391712a057 to your computer and use it in GitHub Desktop.
Simple script to summarize the contents of a directory of .docx files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from collections import Counter | |
from docx import Document # pip install python-docx | |
from nltk.tokenize import sent_tokenize, word_tokenize # pip install nltk==3.5 | |
# Directory whose .docx files are summarized; main() hands it to DocCounter.
# Written as a raw string with single backslashes: a raw string keeps every
# backslash verbatim, so doubling them would put doubled separators in the path
# text. The trailing separator is omitted because a raw string cannot end in a
# single backslash.
ROOT = Path(r"C:\REDACTED\book")
class DocCounter:
    """Accumulate sentence, token and unique-token counts over .docx files.

    ``counts`` holds the running totals under the keys ``"sentences"``,
    ``"tokens"`` and ``"unique_words"``. ``"unique_words"`` is the number of
    distinct tokens seen across *all* documents counted so far.
    """

    def __init__(self, root_path: Path):
        """Start an empty tally for the files directly inside *root_path*."""
        self.root = root_path
        # Counter(["sentences", ...]) would tally the key names themselves and
        # start every total at 1, so seed the totals with explicit zeros.
        self.counts = Counter({"sentences": 0, "unique_words": 0, "tokens": 0})
        # Distinct tokens seen so far, across every body passed to count_body().
        self._vocabulary = set()

    @staticmethod
    def get_body(path: Path) -> str:
        """Return the text of the .docx at *path* as one space-joined string.

        Paragraphs whose text is empty or whitespace-only are dropped.
        """
        doc = Document(path)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        # Joined with spaces rather than newlines: the original author found
        # that newline-joined text breaks sentence tokenizing.
        return " ".join(paragraphs)

    def count_body(self, body: str) -> tuple:
        """Tokenize *body* and fold its counts into the running totals.

        Returns a ``(sentences, tokens, unique_tokens)`` tuple describing this
        body alone; the cumulative figures live in ``self.counts``.
        """
        sentences = sent_tokenize(body)
        tokens = word_tokenize(body)
        unique = set(tokens)

        # Merge into the cross-document vocabulary so the "unique_words" total
        # reflects every document, not just the most recent one.
        self._vocabulary.update(unique)

        self.counts["sentences"] += len(sentences)
        self.counts["tokens"] += len(tokens)
        self.counts["unique_words"] = len(self._vocabulary)

        return len(sentences), len(tokens), len(unique)

    def run(self) -> Counter:
        """Count every .docx file directly under ``self.root`` and return the totals.

        Prints a per-file summary as it goes. Files whose name starts with
        ``~`` are skipped. Sub-directories are not descended into.
        """
        # iterdir() yields entries in arbitrary order; sort for stable output.
        for element in sorted(self.root.iterdir()):
            if not element.is_file():
                continue
            name = element.name
            if not name.endswith(".docx") or name.startswith("~"):
                continue

            # Announce the file before loading it so a failure while reading
            # can be attributed to the right document.
            print(f"Processing: {name}")
            body = self.get_body(element)
            sent_ct, raw_ct, unique = self.count_body(body)
            print(f"\tSentences: {sent_ct:,}, Tokens: {raw_ct:,} Unique: {unique:,}")
            print()

        return self.counts
def main():
    """Count the documents under ROOT and print the grand totals."""
    totals = DocCounter(ROOT).run()
    # Assemble the summary line piecewise; the printed text is one line.
    summary = (
        f"Totals -> Sentences: {totals['sentences']:,} "
        f"Tokens: {totals['tokens']:,} "
        f"Unique words: {totals['unique_words']:,} "
    )
    print()
    print(summary)
# Run the summary only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment