devdave · December 1, 2022 18:42
diff --git a/counter.py b/counter.py
 from pathlib import Path

 from collections import Counter

 from docx import Document  # pip install python-docx
 from nltk.tokenize import sent_tokenize, word_tokenize  # pip install nltk==3.5

 # Folder scanned for .docx files; main() hands it to DocCounter.
 ROOT = Path(r"C:\\REDACTED\\book\\")


 class DocCounter:
    """Tally sentences, tokens and unique tokens over the .docx files in a folder.

    ``counts`` holds the running totals under the keys ``"sentences"``,
    ``"tokens"`` and ``"unique_words"``.  ``"unique_words"`` is the number of
    distinct tokens seen across all documents counted so far; tokens are
    compared exactly as ``word_tokenize`` returns them, so the comparison is
    case-sensitive.
    """

    def __init__(self, root_path: Path):
        """Remember the folder to scan and start every total at zero.

        Args:
            root_path: Directory whose ``.docx`` files ``run()`` will process.
        """
        self.root = root_path
        # Counter(["sentences", ...]) would count the key names themselves and
        # start every total at 1, so the zeros are spelled out explicitly.
        self.counts = Counter(sentences=0, unique_words=0, tokens=0)
        # Every distinct token seen so far, so the "unique_words" total spans
        # all counted documents rather than just the most recent one.
        self._vocabulary = set()

    @staticmethod
    def get_body(path: Path):
        """Return the text of the document at *path* as a single string.

        Paragraphs whose text is empty or whitespace-only are dropped; the
        remaining paragraph texts are joined with two spaces.
        """
        doc = Document(path)
        # Blank paragraphs are layout only and add nothing for the tokenizers.
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        # Joined with spaces, not newlines: newline-joined text broke
        # sentence tokenizing.
        return "  ".join(paragraphs)

    def count_body(self, body: str):
        """Tokenize *body*, add its figures to the running totals, and report them.

        Args:
            body: Plain text of one document.

        Returns:
            A ``(sentences, tokens, unique_tokens)`` tuple for this document
            only; the accumulated totals live in ``self.counts``.
        """
        sentences = sent_tokenize(body)
        tokens = word_tokenize(body)
        unique = set(tokens)

        self.counts["sentences"] += len(sentences)
        self.counts["tokens"] += len(tokens)
        # Merge into the shared vocabulary instead of overwriting the total
        # with this document's own unique count.
        self._vocabulary |= unique
        self.counts["unique_words"] = len(self._vocabulary)

        return len(sentences), len(tokens), len(unique)

    def run(self):
        """Count every Word document directly inside ``self.root``.

        Only regular files whose name ends in ``.docx`` and does not start
        with ``~`` are processed; sub-directories are not descended into.
        Per-document figures are printed as each file is handled.

        Returns:
            The ``counts`` Counter holding the accumulated totals.
        """
        for element in self.root.iterdir():
            name = element.name
            # Names starting with "~" are skipped -- presumably Word's
            # temporary owner/lock files; confirm if other files match.
            if not element.is_file() or not name.endswith(".docx") or name.startswith("~"):
                continue

            body = self.get_body(element)
            print(f"Processing: {name}")
            sent_ct, raw_ct, unique = self.count_body(body)
            print(f"\tSentences: {sent_ct:,}, Tokens: {raw_ct:,} Unique: {unique:,}")
            print()

        return self.counts


 def main():
    """Run the counter over ROOT and print the accumulated totals."""
    totals = DocCounter(ROOT).run()
    sentences = totals["sentences"]
    tokens = totals["tokens"]
    unique_words = totals["unique_words"]

    # Blank line separates the per-file output from the summary.
    print()
    print(f"Totals -> Sentences: {sentences:,} Tokens: {tokens:,} Unique words: {unique_words:,} ")




 # Only run the count when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
	from pathlib import Path

	from collections import Counter

	from docx import Document # pip install python-docx
	from nltk.tokenize import sent_tokenize, word_tokenize # pip install nltk==3.5

	# Folder scanned for .docx files; main() hands it to DocCounter.
	ROOT = Path(r"C:\\REDACTED\\book\\")


	class DocCounter:
	    """Tally sentences, tokens and unique tokens over the .docx files in a folder.

	    ``counts`` holds the running totals under the keys ``"sentences"``,
	    ``"tokens"`` and ``"unique_words"``.  ``"unique_words"`` is the number of
	    distinct tokens seen across all documents counted so far; tokens are
	    compared exactly as ``word_tokenize`` returns them, so the comparison is
	    case-sensitive.
	    """

	    def __init__(self, root_path: Path):
	        """Remember the folder to scan and start every total at zero.

	        Args:
	            root_path: Directory whose ``.docx`` files ``run()`` will process.
	        """
	        self.root = root_path
	        # Counter(["sentences", ...]) would count the key names themselves and
	        # start every total at 1, so the zeros are spelled out explicitly.
	        self.counts = Counter(sentences=0, unique_words=0, tokens=0)
	        # Every distinct token seen so far, so the "unique_words" total spans
	        # all counted documents rather than just the most recent one.
	        self._vocabulary = set()

	    @staticmethod
	    def get_body(path: Path):
	        """Return the text of the document at *path* as a single string.

	        Paragraphs whose text is empty or whitespace-only are dropped; the
	        remaining paragraph texts are joined with a single space.
	        """
	        doc = Document(path)
	        # Blank paragraphs are layout only and add nothing for the tokenizers.
	        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
	        # Joined with spaces, not newlines: newline-joined text broke
	        # sentence tokenizing.
	        return " ".join(paragraphs)

	    def count_body(self, body: str):
	        """Tokenize *body*, add its figures to the running totals, and report them.

	        Args:
	            body: Plain text of one document.

	        Returns:
	            A ``(sentences, tokens, unique_tokens)`` tuple for this document
	            only; the accumulated totals live in ``self.counts``.
	        """
	        sentences = sent_tokenize(body)
	        tokens = word_tokenize(body)
	        unique = set(tokens)

	        self.counts["sentences"] += len(sentences)
	        self.counts["tokens"] += len(tokens)
	        # Merge into the shared vocabulary instead of overwriting the total
	        # with this document's own unique count.
	        self._vocabulary |= unique
	        self.counts["unique_words"] = len(self._vocabulary)

	        return len(sentences), len(tokens), len(unique)

	    def run(self):
	        """Count every Word document directly inside ``self.root``.

	        Only regular files whose name ends in ``.docx`` and does not start
	        with ``~`` are processed; sub-directories are not descended into.
	        Per-document figures are printed as each file is handled.

	        Returns:
	            The ``counts`` Counter holding the accumulated totals.
	        """
	        for element in self.root.iterdir():
	            name = element.name
	            # Names starting with "~" are skipped -- presumably Word's
	            # temporary owner/lock files; confirm if other files match.
	            if not element.is_file() or not name.endswith(".docx") or name.startswith("~"):
	                continue

	            body = self.get_body(element)
	            print(f"Processing: {name}")
	            sent_ct, raw_ct, unique = self.count_body(body)
	            print(f"\tSentences: {sent_ct:,}, Tokens: {raw_ct:,} Unique: {unique:,}")
	            print()

	        return self.counts


	def main():
	    """Run the counter over ROOT and print the accumulated totals."""
	    counts = DocCounter(ROOT).run()

	    # Blank line separates the per-file output from the summary.
	    print()
	    print(f"Totals -> Sentences: {counts['sentences']:,} Tokens: {counts['tokens']:,} Unique words: {counts['unique_words']:,} ")




	# Only run the count when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()