Skip to content

Instantly share code, notes, and snippets.

@schedutron
Created May 9, 2020 15:46
Show Gist options
  • Save schedutron/61d389d8032d859f84266221413d70f7 to your computer and use it in GitHub Desktop.
Save schedutron/61d389d8032d859f84266221413d70f7 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""WhatsApp.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jyDKgWPj7_nwyhRKyvjpeGojLRU0VqqF
"""
!pip install wordcloud
import re
import string
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import numpy as np
import requests
import matplotlib.pyplot as plt
HINDI_STOPWORDS = {'एस', 'तक', 'इसी', 'वे', 'थी', 'और', 'कुल', 'सभी', 'उनके', 'के', 'रहे', 'हुए', 'या', 'एवं', 'दो', 'जैसे', 'करने', 'था', 'उसके', 'को', 'से', 'कोई', 'लिए', 'रखें', 'इसमें', 'थे', 'ना', 'गया', 'दिया', 'तो', 'कि', 'यह', 'बहुत', 'सकते', 'जा', 'होती', 'करें', 'हो', 'सबसे', 'अपने', 'वर्ग', 'किया', 'साथ', 'पे', 'यदि', 'होते', 'इसे', 'अपनी', 'करता', 'आदि', 'ये', 'उनका', 'कुछ', 'वाले', 'वह', 'तरह', 'हैं', 'न', 'मे', 'करते', 'कहते', 'पर', 'की', 'आप', 'होने', 'होता', 'कहा', 'इसकी', 'इसके', 'सकता', 'का', 'बाद', 'जो', 'लिये', 'ही', 'रहा', 'करना', 'इस', 'कई', 'एक', 'उनकी', 'नहीं', 'इसका', 'अभी', 'किसी', 'बनी', 'द्वारा', 'हुआ', 'है', 'उस', 'हुई', 'जब', 'में', 'कर', 'ने', 'व', 'अपना'}
HINGLISH_STOPWORDS = {'rakho', 'poora', 'aa', 'kr', 'teen', 'all', 'nai', 'kyaa', 'thoda', 'mere', 'diyo', 'yahan', 'were', 'two', 'isski', 'i', 'this', 'downwards', 'ab', 'wala', 'next', 'bc', 'hui', 'well', 'seem', "i'm", 'naah', 'tabh', 'kari', 'couldn', 'thereafter', 'went', 'whether', 'later', 'first', 'hone', 'cause', 'humne', 'sabse', 'truly', 'dont', "we've", "wouldn't", 'becomes', 'inke', 'wasnt', 'arre', 'karta', 'hai', 'sath', 'karke', 'unka', 'pe', 'throughout', 'q', 'kull', 'useful', "mustn't", 'jahaan', 'always', 'use', 'looking', 'doesnt', 'dono', 'kaafi', 'rakhe', 'theirs', 'kyuki', 'took', 'these', 'ok', 'avum', 'everywhere', 'kept', 'rakh', 'kinhe', 'asking', 'few', 'must', 'says', 'honi', 'necessary', 'gone', 'as', 'therein', 'abbey', 'lekar', 'aint', 'aur', 'that', 'krne', 'vahan', 'teesre', 'can', 'near', "where's", "she's", 're', 'really', 'kitni', 'maano', 'too', 'kisko', 'per', 'main', 'mean', 'several', 'appear', 'vahi', 'off', 'wo', 'dude', "hadn't", 'nhi', "you're", 'together', 'reasonably', 'thanks', 'four', 'inc', 'itna', 'according', 'itne', 'waisi', 'thing', "who's", 'are', 'ltd', 'rahe', 'having', 'get', 'see', 'karen', 'thinking', 'its', 'here', 'bande', 'an', 'aadi', 'rhaa', 'thereupon', 'what', 'hers', 'achcha', 'ja', 'imo', 'inka', 'along', 'kya', 'did', 'karegi', 'myself', "aren't", 'usi', 'apni', 'let', 'least', 'then', "mightn't", 'both', 'same', 'tjhe', "c'mon", 'sometime', 'dekhi', 'hi', 'karenge', 'when', 'karte', 'taken', 'hua', 'kise', 'allow', 'around', 'toh', 'kaisi', 'kyun', 'banai', 'banda', 'pehle', 'should', 'yourselves', 'isnt', 'saare', 'knows', 'cant', 'o', 'think', 'willing', 'hamari', 'neednt', 'likely', 'yup', 'kis', 'mein', 'at', 'inse', 's', 'jabh', 'wagairah', 'anyway', 'after', 'bana', 'bas', 'karo', 'karun', 'jyaada', 'mera', 'goes', 'waisa', 'isse', 'them', 'aata', 'wali', 'however', 'nd', 'bole', 'huye', 'everything', 'apna', "i'd", 'provides', 'naa', 'whence', 'maybe', 'causes', 'others', 'yahi', 
'inhi', 'may', 'rahi', 'tere', 'mana', 'dusra', 'hasnt', 'kinho', "it's", 'nowhere', 'sang', 'which', 'khud', 'couldnt', 'sa', 'regardless', 'maan', 'aaya', 'ap', 'aaj', 'whereas', 'btw', 'he', 'baar', 'eight', 'will', 'themselves', 'hither', 'konsa', 'used', 'hello', 'keep', 'yeh', 'down', 'please', 'until', 'most', 'good', 'koi', 'yeah', 'your', "haven't", 'll', 'hadd', 'anyone', "he's", 'kab', 'much', 'nahin', 'wahin', 'him', 'hona', 'just', 'kabhi', 'kyu', 'right', "wasn't", 'jahan', 'mustnt', 'sakti', 'nothing', 'know', 'very', 'aye', 'six', 'better', 'kam', 'kin', 'de', 'gayi', 'whereby', 'batao', 'five', 'hue', 'phle', 'my', 'jiska', 'bolo', 'plus', 'kahte', 'somebody', 'given', 'unko', 'secondly', 'shall', 'que', 'unke', 'yaha', 'was', 'rehte', 'ka', 'vaale', 'abe', 'yahaan', 'ho', 'anybody', 'otherwise', 'rakhi', 'd', 'it', 'serious', 'but', 'chaiye', 'example', 'ought', 'jinka', 'twice', 'tak', 'none', 'gets', 'karna', 'k', 'lagte', 'mai', 'saying', 'ourselves', 'kahin', 'seemed', 'whoever', 'ya', 'didn', 'log', 'over', 'exactly', 'almost', 'people', 'saath', "they're", 'ityadi', 't', 'using', 'usne', 'our', 'hota', 'ours', 'seven', 'apne', 'got', 'inki', 'whose', 'each', 'rha', 'haven', 'dusre', 'karne', 'bhitar', 'often', 'old', 'doosra', 'apart', 'okay', 'pehla', 'bheetar', 'kon', 'kaise', 'hamara', 'kaisa', 'banayi', 'kiski', 'jitni', 'already', 'kul', 'someone', 'to', 'unho', 'fifth', 'perhaps', 'hoga', 'inn', 'thence', 'inward', "they've", 'one', 'jin', 'bane', 'in', 'could', 'bol', 'kahaa', 'meri', 'dvaara', 'from', 'iss', 'jiski', 'rahaa', 'vahaan', 'yahin', 'lagta', 'certainly', 'isi', 'gaye', 'hun', 'non', 'take', 'thorough', 'm', 'ye', 'com', "we'd", 'didnt', "that'll", 'wouldnt', 'mano', 'apan', 'liya', 'dekho', 'koyi', 'yes', 'apnaa', 'banaye', 'viz', 'ko', 'else', 'vaisa', 'kinko', 'we', 'kyunki', "we're", 'less', 'weren', 'below', 'greetings', 'y', 'li', 'chhaiye', 'aati', 'aapne', 'third', "they'd", 'iskaa', 'about', 'afterwards', 'doing', 
'kaunsa', 'yadi', 'se', 'become', 'aap', 'kare', 'jise', 'magar', 'won', 'little', 'boli', 'bhi', 'tho', 'need', 'hereupon', 'phli', 'therefore', 'within', "isn't", "weren't", 'accha', 'ri', 'isne', 'whenever', "can't", 'huyi', 'three', 'lagti', 'any', 'regarding', 'wherever', 'quite', 'vala', 'diye', 'dunga', 'is', 'you', 'hamne', 'idk', 'so', 'yah', 'who', 'kahi', 'nearly', "i'll", 'kaun', 'sara', 'eg', 'sent', 'bata', 'said', 'dijiye', 'teesri', 'noone', 'tum', 'maine', 'own', 'like', 'kara', 'shouldn', 'havent', 'besides', 'jaa', 'theek', 'ones', 'wahi', 've', 'into', 'hoyengi', 'kahan', 'kahaan', 'a', 'wale', 'diyaa', 'karega', 'howbeit', 'edu', 'itself', 'thoroughly', 'upar', 'something', 'doesn', 'tera', 'whole', "you'll", 'kinki', 'keh', 'tujhe', 'yehi', 'iske', 'ki', 'somehow', 'han', 'aaye', "what's", 'following', 'ise', 'seen', 'jiss', 'humein', 'liked', 'kuchch', 'dungi', 'raha', 'dvara', 'keeps', "let's", 'jab', 'why', 'might', 'th', 'thank', 'dwaara', 'oh', 'hence', 'brief', 'aren', 'karti', 'thik', 'towards', 'hereafter', 'banae', 'saktaa', 'or', 'being', 'hoyenge', 'unhi', 'across', 'himself', 'lol', 'vale', 'vaise', 'bandi', 'tries', 'lunga', 'far', 'moreover', 'seems', 'somewhere', 'kaha', 'rhe', 'thru', 'hoti', 'outside', 'teesra', 'whatever', 'karu', 'rakha', 'bad', 'aisi', 'seriously', 'woh', 'becoming', 'dega', 'second', 'the', 'umm', 'jayega', 'issi', 'out', 'unhone', 'jinhi', 'jinhone', 'uses', 'unhe', 'without', 'etc', 'alone', 'ke', 'thereby', 'pehli', 'um', 'novel', 'bro', 'amongst', 'hmm', 'jinhe', 'of', 'be', 'fir', 'kinn', 'up', 'un', 'karni', 'many', 'onto', 'kuch', 'logon', 'kinka', 'namely', 'dekhe', 'ghar', 'soch', 'thanx', 'waali', 'through', 'gaya', 'vo', 'also', 'vali', 'rakhen', 'once', 'last', 'sahi', 'mujhe', 'only', 'their', 'every', 'trying', 'honaa', 'whereafter', 'lo', 'thi', 'anyhow', 'herein', 'aise', 'although', 'yourself', 'uske', 'aside', 'unkaa', 'allows', 'aisa', 'hopefully', 'baad', 'help', 'tell', 'hasn', 
'maane', 'not', 'nahi', 'cannot', 'beside', 'kitne', 'she', 'thought', 'yours', 'going', 'tab', 'dhang', 'abbe', 'mostly', 'between', 'jinho', 'nine', "you'd", 'degi', 'tarah', 'hotaa', 'ie', 'selves', 'mustn', 'ever', 'kino', 'waise', "shouldn't", 'sensible', 'follows', 'other', 'various', 'jaisa', 'chal', 'jaisi', 'followed', 'bolte', 'kar', 'meanwhile', 'bani', 'needn', "doesn't", 'puri', "it'd", 'behind', 'bhai', 'chalega', 'had', 'insofar', 'inhe', 'sure', 'banaya', 'by', 'kayi', 'tune', 'unto', "you've", 'saw', 'how', 'yet', 'dekh', 'best', 'di', 'known', 'has', 'itni', 'wagerah', 'alag', 'say', 'some', 'been', 'shan', 'lately', 'needs', 'mat', 'dusri', 'isn', 'pata', 'gives', 'regards', 'ussi', 'acha', 'either', 'denge', 'maana', 'would', 'anything', 'still', 'kal', 'under', 'new', 'bola', 'if', 'hongi', 'beforehand', 'poori', 'come', 'kiye', 'krna', 'jisse', 'overall', 'na', 'never', 'unless', 'wohi', 'whom', 'mane', 'itno', 'more', 'waale', 'sub', 'those', 'hain', 'jinke', 'tha', 'everyone', 'sometimes', 'banao', 'dede', 'jitne', "there's", 'karungi', 'dwara', 'saara', "i've", 'certain', 'jo', 'jata', 'haan', 'clearly', 'there', 'vaisi', 'mil', 'anyways', 'inkaa', 'ne', 'wherein', 'thus', 'arent', 'kisi', 'agar', 'kitno', 'normally', 'where', "couldn't", 'again', "didn't", 'and', 'actually', 'maani', 'shouldnt', 'on', 'aya', 'go', 'iski', 'lekin', 'karunga', 'true', 'dekha', 'le', 'hu', 'hereby', 'kuchh', 'for', 'teri', 'enough', 'kitna', 'teeno', 'try', 'tried', 'wants', 'wagaira', "it'll", 'mainly', 'tumhari', 'beyond', 'happens', 'kch', 'getting', 'nevertheless', 'sup', 'kine', 'seeing', "should've", 'another', 'pura', 'bilkul', 'became', 'somewhat', 'merely', 'do', 'hardly', 'before', 'unki', "shan't", 'tumhara', 'except', 'self', 'unse', 'way', 'they', 'thats', 'neeche', "they'll", 'than', 'inasmuch', 'kisliye', 'us', 'usse', 'am', 'hoge', 'done', 'looks', 'wahaan', 'unn', 'ek', 'hote', 'came', 'abhi', 'everybody', 'furthermore', 'jinn', 'whereupon', 
'ityaadi', 'kiske', 'even', "here's", 'doosre', 'hum', 'uska', 'while', 'ask', 'tumhare', 'obviously', 'jis', 'kiska', 'bht', 'jitna', 'comes', 'now', 'liye', 'able', 'tu', 'unlikely', 'look', 'andar', 'uss', 'jinki', 'hadnt', 'bolta', 'soon', "that's", "don't", 'sabhi', 'mjhe', "needn't", 'hogi', 'jaise', 'jaha', 'want', 'hamare', 'such', 'huh', 'jidhar', 'forth', 'inho', 'mightnt', 'shant', 'waha', 'sab', 'accordingly', 'inner', 'rhi', 'tends', 'though', 'waala', 'kinke', 'wasn', 'mani', 'jisme', 'wouldn', "hasn't", 'vaala', 'hm', 'during', 'bolti', 'her', 'herself', "ain't", 'kiya', 'bohot', 'sakta', 'among', 'jiske', 'rather', 'because', 'further', 'lest', 'diya', 'ex', 'isme', 'mightn', 'nobody', 'par', 'si', 'hadn', 'with', 'phla', 'toward', 'name', 'werent', 'elsewhere', 'me', 'above', 'neither', 'nor', 'sakte', 'har', 'since', 'chahiye', "won't", 'against', 'have', 'no', 'anywhere', 'ain', 'iska', 'does', 'placed', 'usually', 'theres', "we'll", 'seeming', 'bahut', 'thodi', 'jyada', 'via', 'honge', 'gotten', 'nope', 'kafi', 'wont', 'kehte', 'upon', 'vahin', 'kisne', 'his', 'vaali', 'wahan'}
# Labels that appear in "... omitted" placeholder lines of a WhatsApp export.
# '_media' is listed because Android exports write "<Media omitted>" rather than
# naming the specific media type (such as audio) that was left out.
media_omissions = set((
    '_media',
    '\u200eimage',
    '\u200evideo',
    '\u200eGIF',
    'card',
    '\u200eaudio',
    '\u200edocument',
    '\u200esticker',
))
# Clean data
# Extend the stock wordcloud stopwords with two WhatsApp-specific words.
STOPWORDS = STOPWORDS | {'changed', 'subject'}
def parse_iphone(path='exported_chat.txt', stopwords=None, omissions=None):
    """Collect lower-cased words from an iPhone-format WhatsApp chat export.

    Each message line is expected to look like ``[timestamp] Sender: text``.
    Lines without that prefix are treated as continuations of a multi-line
    message and are used in full.

    Args:
        path: Text file to read. Defaults to the original hard-coded name.
        stopwords: Words to drop. Defaults to the module-level ``STOPWORDS``.
        omissions: Labels identifying "... omitted" attachment placeholders.
            Defaults to the module-level ``media_omissions``.

    Returns:
        A list of words in file order. Deleted-message notices, messages
        containing a link, and attachment placeholders contribute nothing.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    if omissions is None:
        omissions = media_omissions

    # Labels may carry a leading '_' or U+200E marker; strip only those so an
    # unprefixed label such as 'card' is not truncated to 'ard'.
    omission_labels = [label.lstrip('_\u200e') for label in omissions]
    deleted_notices = {'This message was deleted.', 'You deleted this message.'}
    # "[timestamp] Sender: " prefix, optionally preceded by a U+200E mark.
    prefix = re.compile(r'\u200e?\[.*?\].*?:\s(.*)')

    words = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding; the chat text includes non-ASCII (Hindi) words.
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # omit first line; the default avoids StopIteration on an empty file
        for line in f:
            header = prefix.match(line)
            # Continuation lines have no prefix: keep the whole line rather
            # than discarding everything up to the first whitespace.
            text = header.group(1) if header else line.strip()
            if text.lstrip('\u200e') in deleted_notices:
                continue
            if 'http' in text:  # also covers 'https'
                continue
            if 'omitted' in text and any(label in text for label in omission_labels):
                continue
            words.extend(
                word for word in text.lower().split()
                if word not in stopwords
            )
    return words
def parse_android(path='WhatsApp_Chat_Wing_since_Aug_2018_merged_large.txt', stopwords=None):
    """Collect lower-cased words from an Android-format WhatsApp chat export.

    Each message line is expected to look like ``timestamp - Sender: text``.
    Lines without that prefix are treated as continuations of a multi-line
    message and are used in full.

    Args:
        path: Text file to read. Defaults to the original hard-coded name.
        stopwords: Words to drop. Defaults to the module-level ``STOPWORDS``.

    Returns:
        A list of words in file order. Deleted-message notices,
        ``<Media omitted>`` placeholders and messages containing a link
        contribute nothing.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    placeholders = {'This message was deleted', 'You deleted this message', '<Media omitted>'}
    # "timestamp - Sender: " prefix.
    prefix = re.compile(r'.*?-.*?:\s(.*)')

    words = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding; the chat text includes non-ASCII (Hindi) words.
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # omit first line; the default avoids StopIteration on an empty file
        for line in f:
            header = prefix.match(line)
            # Continuation lines have no prefix: keep the whole line rather
            # than discarding everything up to the first whitespace.
            text = header.group(1) if header else line.strip()
            if text in placeholders:
                continue
            if 'http' in text:  # also covers 'https'
                continue
            words.extend(
                word for word in text.lower().split()
                if word not in stopwords
            )
    return words
# Translation tables that delete punctuation (ASCII plus the curly apostrophe)
# and ASCII digits respectively.
punc_table = str.maketrans(dict.fromkeys(string.punctuation + '’'))
digit_table = str.maketrans(dict.fromkeys(string.digits))
# Strip punctuation and digits from every parsed word, then keep only words
# longer than one character that are in neither extra stopword list.
cleaned_words = [
    word
    for word in (
        raw.translate(punc_table).translate(digit_table)
        for raw in parse_iphone()
    )
    if len(word) > 1
    and not (word in HINDI_STOPWORDS or word in HINGLISH_STOPWORDS)
]
#print(cleaned_words[:200])
#print(len(cleaned_words))
# Single space-separated string, the form WordCloud.generate() is given below.
cleaned_words_joined = ' '.join(cleaned_words)
# For background image
#mask = np.array(Image.open(requests.get('https://images-na.ssl-images-amazon.com/images/I/819QJh4dX8L._AC_SL1500_.jpg', stream=True).raw))
# Download the mask image. A timeout keeps a stalled connection from hanging
# the script, and raise_for_status() surfaces HTTP errors here instead of as an
# opaque image-decoding failure when PIL is handed an error page.
mask_response = requests.get('https://previews.123rf.com/images/kaarsten/kaarsten1111/kaarsten111100061/11544770-attractive-woman-in-bikini-isolated-on-white-background-.jpg', stream=True, timeout=30)
mask_response.raise_for_status()
# The raw stream is not content-decoded by default; ask urllib3 to undo any
# gzip/deflate transfer encoding so PIL sees the actual image bytes.
mask_response.raw.decode_content = True
mask = np.array(Image.open(mask_response.raw))
def generate_wordcloud(words, mask):
    """Draw a word cloud from *words*, save it to ``wing.png`` and show it.

    Args:
        words: Text passed to ``WordCloud.generate``.
        mask: Image array used both as the ``WordCloud`` mask and as the
            source for ``ImageColorGenerator``.
    """
    cloud_options = {
        'font_path': 'arial-unicode-ms.ttf',
        'width': 1024,
        'height': 1024,
        'background_color': "black",
        'mode': "RGBA",
        'mask': mask,
    }
    word_cloud = WordCloud(**cloud_options).generate(words)
    image_colors = ImageColorGenerator(mask)

    #plt.figure(figsize=(10,8), facecolor = 'black', edgecolor='blue')
    plt.figure(figsize=[90, 160])
    # Recolour the cloud using the colours generated from the mask image.
    recolored_cloud = word_cloud.recolor(color_func=image_colors)
    plt.imshow(recolored_cloud, interpolation="bilinear")
    plt.axis('off')
    #plt.tight_layout(pad=0)
    plt.savefig("wing.png", format="png")
    plt.show()
# Render, save and display the word cloud for the cleaned chat text.
generate_wordcloud(cleaned_words_joined, mask)
# Frequency count of every cleaned word.
raw_histo = Counter(cleaned_words)
# Bare expression: the result (the 100 most common words) is not stored or
# printed here. NOTE(review): the file header says this was generated by
# Colaboratory, so this presumably relied on notebook cell output - confirm.
raw_histo.most_common(100)
# Bare expression applying both translation tables to a sample word containing
# the curly apostrophe; the result is likewise discarded.
"who’s".translate(punc_table).translate(digit_table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment