journal analysis: extract text from a OneNote-exported PDF with PyMuPDF, then build a word cloud and word-frequency counts
# Export the OneNote section to PDF first; pdf_f points at that export.
# Setup:
#   py -m venv venv-dir
#   . ./venv-dir/scripts/activate
pdf_f = "Gratitude Journal.pdf"

def process_pdf_to_text(pdf):
    # pip install --upgrade pymupdf
    import pymupdf
    doc = pymupdf.open(pdf)
    output_txt = f"{pdf}_output.txt"
    out = open(output_txt, "wb")
    for page in doc:
        text = page.get_text().encode("utf8")
        out.write(text)
        out.write(bytes((12,)))  # form feed (0x0C) as a page separator
    out.close()
    # read the extracted text back into a string
    with open(output_txt, "r", errors="ignore") as file:
        content = file.read()
    # drop non-ASCII leftovers from the PDF so later steps see clean ASCII
    cleaned_content = content.encode("ascii", errors="ignore").decode("ascii")
    return cleaned_content
#end
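
# A hedged alternative (not used below): PyMuPDF can assemble the text in
# memory, skipping the intermediate _output.txt file. chr(12) is the same
# form-feed page separator written above; the function name is illustrative.
def process_pdf_to_text_in_memory(pdf):
    import pymupdf
    with pymupdf.open(pdf) as doc:
        text = chr(12).join(page.get_text() for page in doc)
    return text.encode("ascii", errors="ignore").decode("ascii")
#end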
# pip install --upgrade wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def make_wordcloud(input_text="hello world"):
    # Generate the word cloud image
    wordcloud = WordCloud(width=1600, height=900, background_color="white").generate(input_text)
    # Display it
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# end
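
# Optional sketch: save the cloud to disk instead of only displaying it.
# WordCloud.to_file() writes a PNG; the helper name and filename here are
# illustrative, not part of the original gist.
def save_wordcloud(input_text, out_png="wordcloud.png"):
    wc = WordCloud(width=1600, height=900, background_color="white").generate(input_text)
    wc.to_file(out_png)
# end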
def get_word_freq():
    # pip install nltk
    # The first run may also need the stop-word list:
    #   python -c "import nltk; nltk.download('stopwords')"
    from collections import Counter
    import re
    content2 = process_pdf_to_text(pdf_f).lower()
    words = re.findall(r"\b\w+\b", content2)
    # remove stop words (words you don't want to count)
    # adding custom stop words: https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    # extras: boilerplate from the export ("journal", "page"), dates/times,
    # and "â", a mojibake token left over from the PDF extraction
    extra_stop_words = ["journal", "page", "â", "25", "like", "2025", "pm", "1"]
    for word in extra_stop_words:
        stop_words.add(word)
    #end
    filtered_words = [word for word in words if word not in stop_words]
    cleaned_text = " ".join(filtered_words)
    make_wordcloud(cleaned_text)
    word_counts = Counter(filtered_words)
    list_size = 200
    """
    print(word_counts)
    with open("journal_word_count.txt", "w") as file:
        for item, count in word_counts.items():
            file.write(f"{item}: {count}\n")
        #end
    #close
    """
    print(word_counts.most_common(list_size))
    make_chart = False
    # Make bar chart
    if make_chart:
        # Extract labels and values from the counter
        labels = list(word_counts.keys())
        values = list(word_counts.values())
        plt.figure(figsize=(8, 6))
        plt.bar(labels, values, color="skyblue")
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.title("Word Frequency in Journal")
        plt.show()
    #fi
# end
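
# Hedged helper based on the commented-out block inside get_word_freq():
# dump every count to a text file, most frequent first. The filename
# mirrors the original example; the function itself is not in the gist.
def write_word_counts(word_counts, out_txt="journal_word_count.txt"):
    with open(out_txt, "w") as file:
        for item, count in word_counts.most_common():
            file.write(f"{item}: {count}\n")
# end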
if __name__ == "__main__":
    get_word_freq()
    make_wordcloud_b = False
    if make_wordcloud_b:
        make_wordcloud()
    #fi
#fi
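
# Hedged follow-up to the make_chart branch above: plotting every word makes
# the x-axis unreadable, so this sketch charts only the top_n most common
# words. top_n, figure size, and label rotation are illustrative choices.
def make_top_n_chart(word_counts, top_n=25):
    top = word_counts.most_common(top_n)
    labels = [word for word, _ in top]
    values = [count for _, count in top]
    plt.figure(figsize=(12, 6))
    plt.bar(labels, values, color="skyblue")
    plt.xticks(rotation=60, ha="right")
    plt.xlabel("Word")
    plt.ylabel("Count")
    plt.title(f"Top {top_n} Words in Journal")
    plt.tight_layout()
    plt.show()
# end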