Created
April 21, 2024 15:17
-
-
Save andrewbolster/b9441c3ed551e48dbe4fac10ec325123 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import random | |
from nltk.corpus import brown | |
# Ensure the necessary NLTK resources are downloaded:
#   - 'brown'            : the tagged corpus the word frequencies are computed from
#   - 'universal_tagset' : tag mappings used when requesting tagset='universal'
# NOTE: these calls run at import time, every time the module is loaded.
nltk.download('brown')
nltk.download('universal_tagset')
def get_common_words_by_pos(tag, num_words=100):
    """Return the most common Brown-corpus words for a part-of-speech tag.

    Words are lower-cased before counting, so capitalised and uncapitalised
    occurrences of the same word are tallied together.

    Note that every call iterates over the whole tagged corpus; callers that
    need the same list repeatedly should keep the result around.

    Args:
        tag: POS label to match against the corpus tags obtained with
            ``tagset='universal'`` (e.g. ``'ADJ'`` or ``'NOUN'``).
        num_words: Maximum number of words to return.

    Returns:
        A list of lower-cased words, ordered from most to least frequent.
    """
    # Frequency distribution of the words whose tag equals the requested one
    word_freq = nltk.FreqDist(
        word.lower()
        for word, pos in brown.tagged_words(tagset='universal')
        if pos == tag
    )
    # Only the words are returned; the counts were needed just for ranking
    return [word for word, _ in word_freq.most_common(num_words)]
def generate_common_pairs(num_pairs=1, pool_size=1000):
    """Generate random ``"adjective-noun"`` strings from common words.

    Args:
        num_pairs: How many pairs to produce. Zero or a negative value yields
            an empty list.
        pool_size: How many of the most common adjectives and of the most
            common nouns to draw from. Must be at least 1.

    Returns:
        A list of ``num_pairs`` strings of the form ``"<adjective>-<noun>"``.
        Words are drawn independently with replacement, so duplicates can
        occur.

    Raises:
        ValueError: If ``pool_size`` is smaller than 1.
    """
    if pool_size < 1:
        raise ValueError(f"pool_size must be at least 1, got {pool_size!r}")
    if num_pairs <= 0:
        # Nothing to generate, so skip the two corpus scans below
        return []

    # Candidate pools: the `pool_size` most common adjectives and nouns
    adjectives = get_common_words_by_pos('ADJ', pool_size)
    nouns = get_common_words_by_pos('NOUN', pool_size)

    # The adjective is drawn before the noun for each pair
    return [
        f"{random.choice(adjectives)}-{random.choice(nouns)}"
        for _ in range(num_pairs)
    ]
# Example usage: print a list of five random "adjective-noun" strings
print(generate_common_pairs(5))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment