Skip to content

Instantly share code, notes, and snippets.

@pcolazurdo
Created December 7, 2019 16:54
Show Gist options
  • Save pcolazurdo/3bab636e0a0281970bd1042bef6c1acc to your computer and use it in GitHub Desktop.
Save pcolazurdo/3bab636e0a0281970bd1042bef6c1acc to your computer and use it in GitHub Desktop.
Wordcloud Generator
import nltk

# Fetch the NLTK data packages this project relies on
# (stopword lists, tokenizer models, POS tagger, WordNet).
for resource in ("stopwords", "punkt", "averaged_perceptron_tagger", "wordnet"):
    nltk.download(resource)
wordcloud
matplotlib
pandas
nltk
##################################################
# import modules #
##################################################
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import os
import random
import nltk

##################################################
# read file #
##################################################
# Set working directory. NOTE: os.chdir("") raises FileNotFoundError,
# so only change directory when a non-empty path is actually configured.
path = ""
if path:
    os.chdir(path)

fileName = 'profile.txt'
# Read the whole file as one string, joining lines with spaces so the
# tokenizer below sees a single continuous text.
with open(fileName, 'r') as file:
    text = file.read().replace('\n', ' ')
print(text)
##################################################
# text processing #
##################################################
# Lowercase once up front; all tokens below inherit this casing,
# so no per-token re-lowering is needed later.
text = text.lower()

# Words to ignore in the word cloud: wordcloud's built-in list,
# NLTK's Spanish and English stopwords, plus custom noise words
# specific to this profile text.
stopwords = set(STOPWORDS)
stopwords.update(nltk.corpus.stopwords.words('spanish'))
stopwords.update(nltk.corpus.stopwords.words('english'))
stopwords.update(["year", 'month', 'page', 'de', 'universidad', 'architect', 'january', 'october'])

# Tokenize into words.
from nltk.tokenize import word_tokenize
words = word_tokenize(text)

# Filter tokens in a single pass:
#  - drop single-character tokens (mostly punctuation)
#  - drop pure numbers
#  - drop stopwords (tokens are already lowercase, matching the lists)
words = [word for word in words
         if len(word) > 1 and not word.isnumeric() and word not in stopwords]

fdist = nltk.FreqDist(words)
# Output the 50 most frequent words as "word;count" lines.
for word, frequency in fdist.most_common(50):
    print(u'{};{}'.format(word, frequency))
# POS tagging (Penn Treebank tags), one tag per token in `words`.
tags = nltk.pos_tag(words)

# Lemmatization: reduce each word to its base form using its POS tag:
#  - verb tags (VB, VBD, VBG, VBN, VBP, VBZ) are lemmatized as verbs
#  - personal pronouns (PRP) are kept unchanged
#  - everything else uses the lemmatizer's default (noun) mode
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
lemWords = []
# zip pairs each word with its (token, tag) tuple — clearer and safer
# than indexing two parallel lists by position.
for word, (_, tag) in zip(words, tags):
    if 'VB' in tag:
        lemWords.append(lem.lemmatize(word, "v"))
    elif tag == "PRP":
        lemWords.append(word)
    else:
        lemWords.append(lem.lemmatize(word))

finalText = ' '.join(lemWords)
##################################################
# create word cloud #
##################################################
# LinkedIn background photo size is 1584x396 pixels.
cloud_options = dict(
    width=1584,
    height=396,
    background_color='black',
    min_font_size=5,
    stopwords=stopwords,
    random_state=42,        # fixed seed -> reproducible layout
    collocations=False,     # single words only, no bigram phrases
)
wordcloud = WordCloud(**cloud_options).generate(finalText)
# Color function for grey-scale rendering: every word gets a random
# light grey (HSL lightness between 60% and 100%), ignoring the word
# itself and its placement.
def grey_color_func(word, font_size, position, orientation, random_state = None, **kwargs):
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness
# Plot, save, and show the word cloud image.
# figsize 8x2 matches the 1584x396 banner aspect ratio.
plt.figure(figsize=(8, 2), facecolor=None)
# Single imshow of the recolored cloud: the original code first drew the
# colored image and then immediately overdrew it with the grey version,
# so the first draw was redundant work.
plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig("wordcloud.png")
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment