Skip to content

Instantly share code, notes, and snippets.

@mattstein
Last active December 29, 2023 20:04
Show Gist options
  • Save mattstein/f6cfe3897bdc50a5c3ace568d4d8f0a8 to your computer and use it in GitHub Desktop.
Save mattstein/f6cfe3897bdc50a5c3ace568d4d8f0a8 to your computer and use it in GitHub Desktop.
Use TextBlob to extract the most-used adjectives from a bunch of Markdown files.
import os
import re
import glob
from textblob import TextBlob
from collections import Counter
from nltk.corpus import stopwords
# recursively find all the Markdown in a directory and smush it into a big string
def load_markdown_files(directory):
    """Return the combined text of every ``.md`` file under *directory*.

    The directory is searched recursively. Files are read as UTF-8 and
    concatenated in sorted path order, separated by a newline.

    Args:
        directory: Path of the directory tree to search.

    Returns:
        A single string with the contents of all matching files, or an
        empty string when no Markdown file is found.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    pattern = os.path.join(directory, '**/*.md')
    chunks = []
    # glob returns paths in arbitrary order; sort so the result is reproducible
    for path in sorted(glob.glob(pattern, recursive=True)):
        # be explicit about the encoding instead of relying on the locale default
        with open(path, 'r', encoding='utf-8') as f:
            chunks.append(f.read())
    # a newline between files keeps the last word of one file from fusing
    # with the first word of the next
    return '\n'.join(chunks)
# !! update this with your own path: the directory is searched recursively
# for Markdown files and their combined contents are stored in `text`
text = load_markdown_files('/path/to/directory/with/markdown')
# clean up that big string to scrape out stuff we don’t want
def clean_text(text, stop_words=None):
    """Normalize *text* into lowercase, space-separated words without stop words.

    The text is lowercased; @mentions, URLs, a leading retweet marker ("rt")
    and leftover ``http...`` tokens are dropped; every remaining punctuation
    character is turned into a space; finally stop words are filtered out.

    Args:
        text: The raw text to clean.
        stop_words: Optional iterable of lowercase words to discard.
            Defaults to NLTK's English stop-word list.

    Returns:
        The remaining words joined by single spaces (possibly empty).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # a set gives constant-time membership tests in the filter below
    stop_words = frozenset(stop_words)

    # lowercase everything
    text = text.lower()
    # remove @mentions, URLs, a leading "rt" marker and stray http tokens;
    # substitute a space so the neighbouring words stay separate
    text = re.sub(r"@\w+|\w+://\S+|^rt\b|http\S*", " ", text)
    # replace punctuation with a space rather than deleting it, so that
    # "red-hot" or words split across lines do not get glued together
    text = re.sub(r"[^\w\s]|_", " ", text)
    # remove stop words
    return " ".join(word for word in text.split() if word not in stop_words)
# get the adjectives we’re looking for
def get_top_adjectives(text):
    """Return the 30 most frequent adjectives in *text* with their counts.

    The text is passed through ``clean_text`` and part-of-speech tagged by
    TextBlob; words tagged as adjectives are tallied.

    Returns:
        A list of ``(word, count)`` pairs, most common first.
    """
    # Penn Treebank adjective tags (plain, comparative, superlative):
    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    adjective_tags = ('JJ', 'JJR', 'JJS')
    tagged_words = TextBlob(clean_text(text)).tags
    tally = Counter(word for word, tag in tagged_words if tag in adjective_tags)
    return tally.most_common(30)
# tally the adjectives in the loaded text, then print one "word: count"
# line per entry in the order returned
top_adjectives = get_top_adjectives(text)
for word, count in top_adjectives:
    print(f"{word}: {count}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment