Skip to content

Instantly share code, notes, and snippets.

@kyoto-cheng
Last active June 25, 2021 15:38
Show Gist options
  • Save kyoto-cheng/97a70405e99bd85e7e3b453c456cc8b5 to your computer and use it in GitHub Desktop.
Save kyoto-cheng/97a70405e99bd85e7e3b453c456cc8b5 to your computer and use it in GitHub Desktop.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
# Analyzing the most frequent bi-grams in the interview questions of type Methodology with CountVectorizer
def counter(Q_A, category, data, n_gram_min, n_gram_max):
    """Count n-gram frequencies in a text column for rows flagged by a category.

    Args:
        Q_A: Name of the column in ``data`` whose text is vectorized.
        category: Name of the indicator column; only rows where this column
            equals 1 are analysed.
        data: Table holding both columns (presumably a pandas DataFrame, given
            the boolean-mask indexing used here).
        n_gram_min: Lower bound of the ``ngram_range`` given to CountVectorizer.
        n_gram_max: Upper bound of the ``ngram_range`` given to CountVectorizer.

    Returns:
        A DataFrame indexed by n-gram with a single ``frequency`` column,
        sorted from most to least frequent.
    """
    # Use a new local name instead of rebinding the ``data`` parameter.
    selected = data[data[category] == 1]

    word_vectorizer = CountVectorizer(
        ngram_range=(n_gram_min, n_gram_max), analyzer='word'
    )
    sparse_matrix = word_vectorizer.fit_transform(selected[Q_A])

    # Sum each vocabulary column in one sparse operation; the builtin sum()
    # would add the document rows together one at a time in Python.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    try:
        feature_names = word_vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = word_vectorizer.get_feature_names()

    table = pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])
    return table.sort_values('frequency', ascending=False)
# Data visualization with bi-gram word clouds
# Draw one word cloud per category column. `category` and `data` are not
# defined in this snippet and must already exist in the session.
for column in category.columns:
    # counter() keeps only the rows where data[column] == 1, so the full
    # table is passed straight through without pre-filtering it here.
    d = counter('Questions', column, data, 2, 2)  # n-gram words within range (2,2) for example

    # Map each n-gram to its count, the input generate_from_frequencies takes.
    frequency = d['frequency'].to_dict()

    wordcloud = WordCloud(
        background_color='black',
        width=3000,
        height=2000,
        max_words=100,
        random_state=1,  # fixed seed so the layout is reproducible
        colormap='Set2',
        collocations=False,
    )
    wordcloud.generate_from_frequencies(frequencies=frequency)

    print('Cluster: {}'.format(column))
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment