Created
May 25, 2019 11:30
-
-
Save do-me/d07753a2f7449d86bb6f78d780d49481 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tweets to Twitter "birdcloud" (a word cloud shaped like the Twitter bird)
import pandas as pd | |
import re | |
import os | |
# Account whose tweets are processed; the name selects the input JSON file
# here and is reused further down to name the output image.
party = "linksfraktion"

# Work from the tweet folder so the relative JSON path resolves.
os.chdir("C:/Users/Dome/Desktop/nu/Tweets/")
df = pd.read_json(f"{party}.json")
from nltk.tokenize import TweetTokenizer | |
from stop_words import get_stop_words | |
# German base stop-word list, extended with corpus-specific filler words and
# word fragments that should not show up in the cloud.
sw = get_stop_words('de')
tt = TweetTokenizer()

# Every entry must be lowercase: the cleaning step compares `item.lower()`
# against this list, so a capitalised entry (previously "Kapitel") can never
# match and would silently have no effect.
sw.extend((
    "macht", "schen", "schon", "kapitel", "setzt", "weitere", "lehnt", "viel",
    "stellen", "gehen", "chen", "geht", "gilt", "lehnen", "viele", "gibt",
    "darf", "halten", "dürfen", "neben", "gehört", "vielen", "jedoch",
    "braucht", "nehmen", "rung", "seit", "sollten", "deren", "etwa",
    "beim", "außerdem", "stehen", "sitzen", "mehr", "sollen", "müssen",
    "sowie", "deshalb", "daher", "dafür", "dabei", "brauchen", "zudem",
    "setzen", "besser", "neue", "neu", "neuen", "immer", "gute",
))
# Split every tweet into tokens with the tweet-aware tokenizer.
df['tokens'] = df['text'].apply(tt.tokenize)

# Build the lookup helpers once instead of once per token: a set gives
# constant-time stop-word checks and the pattern is compiled a single time.
_stop_set = set(sw)
_mention_or_hashtag = re.compile(r"@|#")


def _keep_token(item):
    """Return True if *item* should stay in the cleaned token list.

    A token is kept when its lowercase form is not a stop word and it is
    either purely alphabetic or starts with ``@`` or ``#`` (``match`` anchors
    at the beginning of the token).
    """
    if item.lower() in _stop_set:
        return False
    return item.isalpha() or _mention_or_hashtag.match(item) is not None


df['clean'] = df['tokens'].apply(
    lambda tokens: [item for item in tokens if _keep_token(item)]
)
###### filter df: keep only tweets mentioning one of the stems below ######
# Lowercase word stems (German and English) around migration and asylum.
search_values = ["migr", "flücht", "auslä", "asyl", "flucht", "immigr", "refugee",
                 "geflüchte", "ausland", "zuwander", "zugewandert"]

# case=False: the stems are lowercase but German nouns are capitalised
# ("Flüchtlinge", "Asyl", "Migration"), so a case-sensitive search would drop
# those tweets. na=False treats a missing text as "no match" so the boolean
# mask never contains NaN.
mig = df[df.text.str.contains('|'.join(search_values), case=False, na=False)]
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator | |
from PIL import Image | |
import numpy as np | |
import matplotlib.pyplot as plt | |
# mig.clean holds one token list per tweet; collapse them into a single
# flat list of tokens.
flattened = []
for token_list in mig.clean:
    flattened.extend(token_list)

# Join the tokens into one space-separated string for the word cloud.
words = ' '.join(flattened)

# Load the Twitter bird mask as an array; the image must have a white
# background. The colour generator is built from the same image.
mask = np.array(Image.open("C:/Users/Dome/Desktop/nu/Tweets/Wordclouds/Twitter_bird.png"))
image_colors = ImageColorGenerator(mask)
# function takes in your text and your mask and generates a wordcloud. | |
def birdcloud(words, mask):
    """Draw *words* as a word cloud shaped by *mask*, save it and show it.

    Parameters:
        words: text passed to ``WordCloud.generate``.
        mask: image array (e.g. ``np.array(Image.open(...))``) that defines
            the shape of the cloud and supplies its colours.

    The figure is written as PNG to the ``Wordclouds`` folder, named after
    the module-level ``party`` variable, and then displayed with
    ``plt.show()``. Nothing is returned.
    """
    # Take the colours from the mask that was actually passed in rather than
    # from the module-level `image_colors`, so shape and colours always
    # belong to the same image.
    color_func = ImageColorGenerator(mask)

    word_cloud = WordCloud(width=512, height=512, max_words=200,
                           background_color='white', stopwords=STOPWORDS,
                           mask=mask).generate(words)

    plt.figure(figsize=(10, 8), facecolor='white', edgecolor='blue')
    plt.imshow(word_cloud.recolor(color_func=color_func),
               interpolation="bilinear")
    plt.axis('off')
    plt.tight_layout(pad=0)
    # Save before show(): the figure is written while it is still current.
    plt.savefig("C:/Users/Dome/Desktop/nu/Tweets/Wordclouds/" +
                party + ".png", format="png")
    plt.show()
# Render, save and display the birdcloud for the prepared text and mask.
birdcloud(words=words, mask=mask)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment