Last active
September 3, 2025 22:07
-
-
Save tnn4/af3053d3edb06af57fbe3906a87d46cd to your computer and use it in GitHub Desktop.
journal analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Export the OneNote section to PDF first, then point pdf_f at that file.
# Environment setup:
#   py -m venv venv-dir
#   . ./venv-dir/scripts/activate
pdf_f = "Gratitude Journal.pdf"
def process_pdf_to_text(pdf):
    """Extract all page text from *pdf* and return it as an ASCII-only string.

    Side effect: writes the raw extracted text to "<pdf>_output.txt"
    (pages separated by a form-feed byte, 0x0C) and leaves that file on disk.

    Parameters
    ----------
    pdf : str
        Path to the PDF file to read.

    Returns
    -------
    str
        The extracted text with all non-ASCII characters stripped.
    """
    # pip install --upgrade pymupdf
    import pymupdf

    output_txt = f"{pdf}_output.txt"

    # Context managers close both the document and the output file even if
    # extraction raises (the original open()/close() pair leaked on error).
    with pymupdf.open(pdf) as doc, open(output_txt, "wb") as out:
        for page in doc:
            out.write(page.get_text().encode("utf8"))
            out.write(bytes((12,)))  # form feed between pages

    # Re-read as text; errors="ignore" drops bytes that fail to decode.
    with open(output_txt, "r", errors="ignore") as file:
        content = file.read()

    # Round-trip through ASCII to discard remaining non-ASCII characters
    # (workaround for downstream UnicodeDecodeError).
    return content.encode("ascii", errors="ignore").decode("ascii")
| # pip install --upgrade wordcloud | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
def make_wordcloud(input_text="hello world", width=1600, height=900,
                   background_color="white"):
    """Render *input_text* as a word cloud and display it with matplotlib.

    Parameters
    ----------
    input_text : str
        Text to visualise; word frequency drives word size.
    width, height : int
        Pixel dimensions of the generated cloud image (defaults match the
        original hard-coded 1600x900).
    background_color : str
        Background colour name for the canvas.
    """
    # Generate word cloud (pip install --upgrade wordcloud)
    wordcloud = WordCloud(
        width=width,
        height=height,
        background_color=background_color,
    ).generate(input_text)

    # Display word cloud without axes
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
def get_word_freq():
    """Tokenise the journal PDF, drop stop words, print the 200 most common
    words, and render a word cloud of the filtered text.

    Requires nltk's "stopwords" corpus (pip install nltk, then run
    nltk.download("stopwords") once).
    """
    import re
    from collections import Counter
    # add stop words? https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
    from nltk.corpus import stopwords

    content2 = process_pdf_to_text(pdf_f).lower()
    words = re.findall(r"\b\w+\b", content2)

    # English stop words plus journal-specific noise tokens
    # (extraction artefacts like "â", timestamps such as "2025"/"pm").
    stop_words = set(stopwords.words("english"))
    stop_words.update(["journal", "page", "â", "25", "like", "2025", "pm", "1"])

    # content2 is already lower-cased, so plain membership suffices
    # (the original re-lowered every word redundantly).
    filtered_words = [word for word in words if word not in stop_words]

    make_wordcloud(" ".join(filtered_words))

    # Count once, after filtering — the original built a Counter of the
    # unfiltered words first and immediately threw it away.
    word_counts = Counter(filtered_words)
    list_size = 200
    print(word_counts.most_common(list_size))

    make_chart = False  # flip to True for a bar chart of every word
    # Make bar chart
    if make_chart:
        # Extract labels and values from the counter
        labels = list(word_counts.keys())
        values = list(word_counts.values())
        plt.figure(figsize=(8, 6))
        plt.bar(labels, values, color='skyblue')
        plt.xlabel('Item')
        plt.ylabel('Count')
        plt.title("Word Frequency in Journal")
        plt.show()
if __name__ == "__main__":
    # Run the full journal analysis; the standalone word-cloud demo
    # stays disabled unless the flag below is flipped.
    get_word_freq()
    run_wordcloud_demo = False
    if run_wordcloud_demo:
        make_wordcloud()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment