Last active
June 25, 2021 15:38
-
-
Save kyoto-cheng/97a70405e99bd85e7e3b453c456cc8b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
from sklearn.feature_extraction.text import CountVectorizer | |
# Analyze the most frequent bi-grams in the interview questions of type Methodology with CountVectorizer
def counter(Q_A: str, category: str, data, n_gram_min: int, n_gram_max: int):
    """Count n-gram frequencies in a text column for rows flagged with a category.

    Args:
        Q_A: Name of the text column to vectorize (e.g. ``'Questions'``).
        category: Name of the indicator column; only rows where it equals 1
            are analysed.
        data: DataFrame containing both the ``Q_A`` and ``category`` columns.
        n_gram_min: Lower bound of the n-gram range passed to CountVectorizer.
        n_gram_max: Upper bound of the n-gram range passed to CountVectorizer.

    Returns:
        A DataFrame indexed by n-gram with a single ``'frequency'`` column,
        sorted from most to least frequent.
    """
    # Imported here because the module's import block does not provide them.
    import numpy as np
    import pandas as pd

    selected = data[data[category] == 1]
    word_vectorizer = CountVectorizer(
        ngram_range=(n_gram_min, n_gram_max), analyzer='word'
    )
    sparse_matrix = word_vectorizer.fit_transform(selected[Q_A])

    # Sum each vocabulary column inside scipy instead of folding the rows one
    # by one with the builtin sum().
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    get_names = getattr(word_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = word_vectorizer.get_feature_names

    table = pd.DataFrame(frequencies, index=get_names(), columns=['frequency'])
    return table.sort_values('frequency', ascending=False)
# Data visualization with bi-gram word clouds | |
# Draw one bi-gram word cloud per category column.
for column in category.columns:
    # Keep only the rows flagged for the current category.
    flagged_rows = data[data[column] == 1]

    # n-gram range (2, 2) restricts the counts to bi-grams.
    bigram_counts = counter('Questions', column, flagged_rows, 2, 2)

    # Transposing gives a single record that maps each n-gram to its count.
    records = bigram_counts.T.to_dict('records')
    ngram_to_count = records[0]

    cloud = WordCloud(
        background_color='black',
        width=3000,
        height=2000,
        max_words=100,
        random_state=1,
        colormap='Set2',
        collocations=False,
    )
    cloud.generate_from_frequencies(frequencies=ngram_to_count)

    print(f'Cluster: {column}')
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment