manmohan24nov · September 5, 2020 20:22
diff --git a/reuters_unigram_code.py b/reuters_unigram_code.py
 import random

 import gensim
 import numpy as np
 from gensim.parsing.preprocessing import remove_stopwords
 from wordcloud import WordCloud

 # Build the stopword list: gensim's built-in STOPWORDS plus manually added
 # terms. Append to stopwords_update to filter out more words.
 gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
 stopwords_list = list(set(gensim_stopwords))
 stopwords_update = ["mln", "vs", "cts", "said", "billion", "pct", "dlrs", "dlr"]
 stopwords = stopwords_list + stopwords_update

 # Split every cleaned article into whitespace-separated tokens. str() coerces
 # each cell to text first, so non-string values do not break .split().
 articles_word_limit['temp_list'] = articles_word_limit['text_clean'].apply(lambda x: str(x).split())

 # Drop stopwords from a tokenized article
 def remove_stopword(x):
     """Return the items of ``x`` that are not in the module-level ``stopwords``.

     The original order of the kept items is preserved and the result is
     always a new list.
     """
     def is_kept(token):
         return token not in stopwords

     return list(filter(is_kept, x))
 # Store the stopword-free token list of every article in a new column.
 articles_word_limit['temp_list_stopw'] = articles_word_limit['temp_list'].apply(remove_stopword)

 # Build n-gram strings from a token sequence
 def generate_ngrams(text, n_gram=1):
     """Return every run of ``n_gram`` consecutive items of ``text`` as one string.

     ``text`` is a sliceable sequence of strings (e.g. a token list); the items
     of each run are joined with a single space. The result is an empty list
     when ``n_gram`` is below 1 or larger than ``len(text)``.
     """
     if n_gram < 1:
         return []
     window_count = len(text) - n_gram + 1
     return [' '.join(text[start:start + n_gram]) for start in range(window_count)]
  
  
 # Tally how often each unigram occurs across all stopword-filtered articles.
 article_unigrams = defaultdict(int)
 for tokens in articles_word_limit['temp_list_stopw']:
     for unigram in generate_ngrams(tokens):
         article_unigrams[unigram] += 1

 # Rank the (unigram, count) pairs from most to least frequent. In the
 # resulting frame, column 0 holds the unigram and column 1 its count.
 article_unigrams_df = pd.DataFrame(
     list(reversed(sorted(article_unigrams.items(), key=lambda pair: pair[1])))
 )
 N = 50

 # Bar chart of the N most frequent unigrams in the Reuters articles:
 # unigrams on the y axis, their counts on the x axis.
 fig, axes = plt.subplots(figsize=(18, 50))
 sns.barplot(
     y=article_unigrams_df[0].values[:N],
     x=article_unigrams_df[1].values[:N],
     color='red',
     ax=axes,  # draw on the axes created above instead of the implicit current one
 )
 axes.spines['right'].set_visible(False)
 axes.set_xlabel('')
 axes.set_ylabel('')
 axes.tick_params(axis='x', labelsize=13)
 axes.tick_params(axis='y', labelsize=13)
 axes.set_title(f'Top {N} most common unigrams in Reuters Articles', fontsize=15)
 # tight_layout() fits the layout to what is on the figure at call time, so it
 # has to run after the bars, tick labels and title have been created.
 plt.tight_layout()
 plt.show()


 # Plot word cloud
 def col_func(word, font_size, position, orientation, font_path, random_state):
     """Return a randomly chosen hex colour string from a fixed eight-colour palette.

     None of the six arguments is used. The function is handed to ``WordCloud``
     as ``color_func``, so the parameter list presumably mirrors what that
     callback is invoked with. The pick comes from the module-level ``random``
     generator, not from ``random_state``.
     """
     palette = (
         '#b58900', '#cb4b16', '#dc322f', '#d33682',
         '#6c71c4', '#268bd2', '#2aa198', '#859900',
     )
     return random.choice(palette)
 # Font settings applied to the word-cloud figure title.
 fd = {
     'fontsize': '32',
     'fontweight': 'normal',
     'verticalalignment': 'baseline',
     'horizontalalignment': 'center',
 }
 # Build the word cloud from the unigram counts; col_func picks each word's
 # colour. The seed is drawn at random, so the result varies between runs.
 wc = WordCloud(
     width=2000,
     height=1000,
     collocations=False,
     background_color="white",
     color_func=col_func,
     max_words=200,
     random_state=np.random.randint(1, 8),
 ).generate_from_frequencies(article_unigrams)
 fig, ax = plt.subplots(figsize=(20, 10))
 ax.imshow(wc, interpolation='bilinear')
 ax.axis("off")
 # The title literal needs plain ASCII quotes; typographic quotes do not parse.
 ax.set_title('Unigram Words of Reuters Articles', pad=24, fontdict=fd)
 plt.show()
	import random

	import gensim
	import numpy as np
	from gensim.parsing.preprocessing import remove_stopwords
	from wordcloud import WordCloud

	# Build the stopword list: gensim's built-in STOPWORDS plus manually added
	# terms. Append to stopwords_update to filter out more words.
	gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
	stopwords_list = list(set(gensim_stopwords))
	stopwords_update = ["mln", "vs", "cts", "said", "billion", "pct", "dlrs", "dlr"]
	stopwords = stopwords_list + stopwords_update

	# Split every cleaned article into whitespace-separated tokens. str() coerces
	# each cell to text first, so non-string values do not break .split().
	articles_word_limit['temp_list'] = articles_word_limit['text_clean'].apply(lambda x: str(x).split())

	# Drop stopwords from a tokenized article
	def remove_stopword(x):
		"""Return the items of ``x`` that are not in the module-level ``stopwords``.

		The original order of the kept items is preserved and the result is
		always a new list.
		"""
		return [word for word in x if word not in stopwords]
	# Store the stopword-free token list of every article in a new column.
	articles_word_limit['temp_list_stopw'] = articles_word_limit['temp_list'].apply(remove_stopword)

	# Build n-gram strings from a token sequence
	def generate_ngrams(text, n_gram=1):
		"""Return every run of ``n_gram`` consecutive items of ``text`` as one string.

		``text`` is a sliceable sequence of strings (e.g. a token list); the items
		of each run are joined with a single space. The result is an empty list
		when ``n_gram`` is below 1 or larger than ``len(text)``.
		"""
		if n_gram < 1:
			return []
		window_count = len(text) - n_gram + 1
		return [' '.join(text[start:start + n_gram]) for start in range(window_count)]


	# Tally how often each unigram occurs across all stopword-filtered articles.
	article_unigrams = defaultdict(int)
	for tweet in articles_word_limit['temp_list_stopw']:
		for word in generate_ngrams(tweet):
			article_unigrams[word] += 1

	# Rank the (unigram, count) pairs from most to least frequent. In the
	# resulting frame, column 0 holds the unigram and column 1 its count.
	article_unigrams_df = pd.DataFrame(sorted(article_unigrams.items(), key=lambda x: x[1])[::-1])
	N = 50

	# Bar chart of the N most frequent unigrams in the Reuters articles:
	# unigrams on the y axis, their counts on the x axis.
	fig, axes = plt.subplots(figsize=(18, 50))
	sns.barplot(
		y=article_unigrams_df[0].values[:N],
		x=article_unigrams_df[1].values[:N],
		color='red',
		ax=axes,  # draw on the axes created above instead of the implicit current one
	)
	axes.spines['right'].set_visible(False)
	axes.set_xlabel('')
	axes.set_ylabel('')
	axes.tick_params(axis='x', labelsize=13)
	axes.tick_params(axis='y', labelsize=13)
	axes.set_title(f'Top {N} most common unigrams in Reuters Articles', fontsize=15)
	# tight_layout() fits the layout to what is on the figure at call time, so it
	# has to run after the bars, tick labels and title have been created.
	plt.tight_layout()
	plt.show()


	# Plot word cloud
	def col_func(word, font_size, position, orientation, font_path, random_state):
		"""Return a randomly chosen hex colour string from a fixed eight-colour palette.

		None of the six arguments is used. The function is handed to ``WordCloud``
		as ``color_func``, so the parameter list presumably mirrors what that
		callback is invoked with. The pick comes from the module-level ``random``
		generator, not from ``random_state``.
		"""
		colors = ['#b58900', '#cb4b16', '#dc322f', '#d33682', '#6c71c4',
				  '#268bd2', '#2aa198', '#859900']
		return random.choice(colors)
	# Font settings applied to the word-cloud figure title.
	fd = {
		'fontsize': '32',
		'fontweight': 'normal',
		'verticalalignment': 'baseline',
		'horizontalalignment': 'center',
	}
	# Build the word cloud from the unigram counts; col_func picks each word's
	# colour. The seed is drawn at random, so the result varies between runs.
	wc = WordCloud(
		width=2000,
		height=1000,
		collocations=False,
		background_color="white",
		color_func=col_func,
		max_words=200,
		random_state=np.random.randint(1, 8),
	).generate_from_frequencies(article_unigrams)
	fig, ax = plt.subplots(figsize=(20, 10))
	ax.imshow(wc, interpolation='bilinear')
	ax.axis("off")
	# The title literal needs plain ASCII quotes; typographic quotes do not parse.
	ax.set_title('Unigram Words of Reuters Articles', pad=24, fontdict=fd)
	plt.show()