journal analysis: extract text from a OneNote-exported PDF with PyMuPDF, then build a word cloud and word-frequency counts
# Export the OneNote section to PDF first; pdf_f points at that export.
# Setup:
#   py -m venv venv-dir
#   . ./venv-dir/scripts/activate
pdf_f = "Gratitude Journal.pdf"

def process_pdf_to_text(pdf):
    # pip install --upgrade pymupdf
    import pymupdf
    doc = pymupdf.open(pdf)
    output_txt = f"{pdf}_output.txt"
    out = open(output_txt, "wb")
    for page in doc:
        text = page.get_text().encode("utf8")
        out.write(text)
        out.write(bytes((12,)))  # form feed (0x0C) as a page separator
    out.close()
    # read the extracted text back into a string
    with open(output_txt, "r", errors="ignore") as file:
        content = file.read()
    # drop non-ASCII leftovers from the PDF so later steps see clean ASCII
    cleaned_content = content.encode("ascii", errors="ignore").decode("ascii")
    return cleaned_content
#end
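
# A hedged alternative (not used below): PyMuPDF can assemble the text in
# memory, skipping the intermediate _output.txt file. chr(12) is the same
# form-feed page separator written above; the function name is illustrative.
def process_pdf_to_text_in_memory(pdf):
    import pymupdf
    with pymupdf.open(pdf) as doc:
        text = chr(12).join(page.get_text() for page in doc)
    return text.encode("ascii", errors="ignore").decode("ascii")
#end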
# pip install --upgrade wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def make_wordcloud(input_text="hello world"):
    # Generate the word cloud image
    wordcloud = WordCloud(width=1600, height=900, background_color="white").generate(input_text)
    # Display it
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# end
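
# Optional sketch: save the cloud to disk instead of only displaying it.
# WordCloud.to_file() writes a PNG; the helper name and filename here are
# illustrative, not part of the original gist.
def save_wordcloud(input_text, out_png="wordcloud.png"):
    wc = WordCloud(width=1600, height=900, background_color="white").generate(input_text)
    wc.to_file(out_png)
# end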
def get_word_freq():
    # pip install nltk
    # The first run may also need the stop-word list:
    #   python -c "import nltk; nltk.download('stopwords')"
    from collections import Counter
    import re
    content2 = process_pdf_to_text(pdf_f).lower()
    words = re.findall(r"\b\w+\b", content2)
    # remove stop words (words you don't want to count)
    # adding custom stop words: https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))
    # extras: boilerplate from the export ("journal", "page"), dates/times,
    # and "â", a mojibake token left over from the PDF extraction
    extra_stop_words = ["journal", "page", "â", "25", "like", "2025", "pm", "1"]
    for word in extra_stop_words:
        stop_words.add(word)
    #end
    filtered_words = [word for word in words if word not in stop_words]
    cleaned_text = " ".join(filtered_words)
    make_wordcloud(cleaned_text)
    word_counts = Counter(filtered_words)
    list_size = 200
    """
    print(word_counts)
    with open("journal_word_count.txt", "w") as file:
        for item, count in word_counts.items():
            file.write(f"{item}: {count}\n")
        #end
    #close
    """
    print(word_counts.most_common(list_size))
    make_chart = False
    # Make bar chart
    if make_chart:
        # Extract labels and values from the counter
        labels = list(word_counts.keys())
        values = list(word_counts.values())
        plt.figure(figsize=(8, 6))
        plt.bar(labels, values, color="skyblue")
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.title("Word Frequency in Journal")
        plt.show()
    #fi
# end
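
# Hedged helper based on the commented-out block inside get_word_freq():
# dump every count to a text file, most frequent first. The filename
# mirrors the original example; the function itself is not in the gist.
def write_word_counts(word_counts, out_txt="journal_word_count.txt"):
    with open(out_txt, "w") as file:
        for item, count in word_counts.most_common():
            file.write(f"{item}: {count}\n")
# end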
if __name__ == "__main__":
    get_word_freq()
    make_wordcloud_b = False
    if make_wordcloud_b:
        make_wordcloud()
    #fi
#fi
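
# Hedged follow-up to the make_chart branch above: plotting every word makes
# the x-axis unreadable, so this sketch charts only the top_n most common
# words. top_n, figure size, and label rotation are illustrative choices.
def make_top_n_chart(word_counts, top_n=25):
    top = word_counts.most_common(top_n)
    labels = [word for word, _ in top]
    values = [count for _, count in top]
    plt.figure(figsize=(12, 6))
    plt.bar(labels, values, color="skyblue")
    plt.xticks(rotation=60, ha="right")
    plt.xlabel("Word")
    plt.ylabel("Count")
    plt.title(f"Top {top_n} Words in Journal")
    plt.tight_layout()
    plt.show()
# end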