Created
December 7, 2019 16:54
-
-
Save pcolazurdo/3bab636e0a0281970bd1042bef6c1acc to your computer and use it in GitHub Desktop.
Wordcloud Generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk

# One-off setup: download each NLTK resource by name
# (stop-word lists, punkt, the averaged perceptron tagger and WordNet).
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
wordcloud | |
matplotlib | |
pandas | |
nltk |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
# import modules # | |
################################################## | |
from wordcloud import WordCloud, STOPWORDS | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import os | |
import random | |
import nltk | |
################################################## | |
# read file # | |
################################################## | |
# set working directory (leave `path` empty to stay in the current directory)
path = ""
if path:
    # os.chdir("") raises FileNotFoundError, so only change directory
    # when a path has actually been configured
    os.chdir(path)
fileName = 'profile.txt'
# read the whole file as a single line of text; the encoding is explicit so
# accented characters decode the same way on every platform
with open(fileName, 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', ' ')
print(text)
################################################## | |
# text processing # | |
################################################## | |
# normalise case before tokenizing
text = text.lower()

# words that are to be ignored in the word cloud: the wordcloud defaults,
# the NLTK Spanish and English lists, plus a few hand-picked extras
stopwords = set(STOPWORDS)
for language in ('spanish', 'english'):
    stopwords.update(set(nltk.corpus.stopwords.words(language)))
stopwords.update(["year", 'month', 'page', 'de', 'universidad', 'architect', 'january', 'october'])

from nltk.tokenize import word_tokenize

# tokenize, drop single-character tokens (mostly punctuation) and purely
# numeric tokens, lowercase what is left, then filter out the stop words
kept_tokens = (
    token.lower()
    for token in word_tokenize(text)
    if len(token) > 1 and not token.isnumeric()
)
words = [token for token in kept_tokens if token not in stopwords]

# print the 50 most frequent words as "word;count" lines
fdist = nltk.FreqDist(words)
for term, count in fdist.most_common(50):
    print(u'{};{}'.format(term, count))

# part-of-speech tags decide how each word is lemmatized below
tags = nltk.pos_tag(words)

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

lemWords = []
for token, (_, pos) in zip(words, tags):
    if 'VB' in pos:
        # any verb tag: lemmatize as a verb
        lemWords.append(lem.lemmatize(token, "v"))
    elif pos == "PRP":
        # tokens tagged PRP are kept unchanged
        lemWords.append(token)
    else:
        # everything else uses the lemmatizer's default part of speech
        lemWords.append(lem.lemmatize(token))

# the word cloud is generated from one space-separated string
finalText = ' '.join(lemWords)
################################################## | |
# create word cloud # | |
################################################## | |
# image size used for the LinkedIn background photo: 1584x396
cloud_builder = WordCloud(
    width=1584,
    height=396,
    background_color='black',
    min_font_size=5,
    stopwords=stopwords,
    random_state=42,
    collocations=False,
)
# build the cloud from the lemmatized text
wordcloud = cloud_builder.generate(finalText)
# for grey scale | |
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a grey HSL colour string with a random lightness of 60-100%.

    The signature follows the colour-function callback used by
    ``WordCloud.recolor``; ``word``, ``font_size``, ``position``,
    ``orientation`` and any extra keyword arguments are accepted but unused.

    Args:
        random_state: optional ``random.Random`` instance. When given, the
            lightness is drawn from it so that a seeded recolor is
            reproducible; otherwise the global ``random`` module is used.

    Returns:
        A string of the form ``"hsl(0, 0%, N%)"`` with ``60 <= N <= 100``.
    """
    # Honour the caller's generator instead of always using global state.
    rng = random_state if isinstance(random_state, random.Random) else random
    return "hsl(0, 0%, {}%)".format(rng.randint(60, 100))
# plot the wordcloud image
plt.figure(figsize = (8, 2), facecolor = None)
# Draw only the grey recolored cloud: showing the original cloud first would
# just be painted over by this image.
plt.imshow(wordcloud.recolor(color_func = grey_color_func, random_state = 3), interpolation = "bilinear")
plt.axis("off")
plt.tight_layout(pad = 0)
# save to disk before show()
plt.savefig("wordcloud.png")
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment