Last active
December 29, 2023 20:04
-
-
Save mattstein/f6cfe3897bdc50a5c3ace568d4d8f0a8 to your computer and use it in GitHub Desktop.
Use TextBlob to extract the most-used adjectives from a bunch of Markdown files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import glob | |
from textblob import TextBlob | |
from collections import Counter | |
from nltk.corpus import stopwords | |
# recursively find all the Markdown in a directory and smush it into a big string
def load_markdown_files(directory):
    """Return the text of every ``*.md`` file under *directory* as one string.

    The directory is searched recursively. Files are read in sorted path
    order so the result is the same from run to run, and their contents are
    joined with a newline so the last word of one file cannot fuse with the
    first word of the next.

    Args:
        directory: Path of the directory to search.

    Returns:
        The concatenated file contents, or ``''`` when no Markdown file is
        found (including when *directory* does not exist).
    """
    pattern = os.path.join(directory, '**/*.md')
    # glob returns paths in arbitrary order; sort for deterministic output
    md_files = sorted(glob.glob(pattern, recursive=True))
    documents = []
    for path in md_files:
        # decode explicitly as UTF-8 instead of the platform default; a stray
        # undecodable byte is replaced rather than aborting the whole run
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            documents.append(f.read())
    return '\n'.join(documents)
# !! update this with your own path
# directory that is searched (recursively) for Markdown files
markdown_directory = '/path/to/directory/with/markdown'
# the whole corpus as a single string, analysed further down
text = load_markdown_files(markdown_directory)
# clean up that big string to scrape out stuff we don’t want
def clean_text(text):
    """Normalize *text* into a space-separated string of lowercase words.

    Steps, in order: lowercase; drop URLs (anything of the form
    ``scheme://...``) and ``@name`` mentions; delete every character that is
    not an ASCII letter, a digit or whitespace; remove English stop words.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining words joined by single spaces (``''`` if none remain).
    """
    # lowercase everything
    text = text.lower()
    # remove URLs and @’s; a space is substituted so the words on either
    # side stay separate
    text = re.sub(r"\w+://\S+", " ", text)
    text = re.sub(r"@[a-z0-9]+", " ", text)
    # delete punctuation/symbols, but keep ALL whitespace (newlines included)
    # so words on adjacent lines are not glued together
    text = re.sub(r"[^0-9a-z\s]", "", text)
    # remove stop words (set gives O(1) membership tests)
    stop = set(stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in stop)
# get the adjectives we’re looking for
def get_top_adjectives(text, limit=30):
    """Return the most frequent adjectives in *text* with their counts.

    The text is first passed through ``clean_text`` and then part-of-speech
    tagged with TextBlob.

    Args:
        text: Raw text to analyse.
        limit: Maximum number of adjectives to return (default 30). ``None``
            returns every adjective found.

    Returns:
        A list of ``(adjective, count)`` tuples, most common first.
    """
    blob = TextBlob(clean_text(text))
    # collect adjectives based on this coding: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    # JJ = adjective, JJR = comparative, JJS = superlative
    adjective_tags = {'JJ', 'JJR', 'JJS'}
    adjective_counts = Counter(
        word for (word, tag) in blob.tags if tag in adjective_tags
    )
    return adjective_counts.most_common(limit)
# analyse the loaded text, then print one "adjective: count" line per result
top_adjectives = get_top_adjectives(text)
for entry in top_adjectives:
    word, count = entry
    print(f"{word}: {count}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment