Created
May 9, 2020 15:46
-
-
Save schedutron/61d389d8032d859f84266221413d70f7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""WhatsApp.ipynb | |
Automatically generated by Colaboratory. | |
Original file is located at | |
https://colab.research.google.com/drive/1jyDKgWPj7_nwyhRKyvjpeGojLRU0VqqF | |
""" | |
!pip install wordcloud | |
import re | |
import string | |
from collections import Counter | |
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator | |
from PIL import Image | |
import numpy as np | |
import requests | |
import matplotlib.pyplot as plt | |
HINDI_STOPWORDS = {'एस', 'तक', 'इसी', 'वे', 'थी', 'और', 'कुल', 'सभी', 'उनके', 'के', 'रहे', 'हुए', 'या', 'एवं', 'दो', 'जैसे', 'करने', 'था', 'उसके', 'को', 'से', 'कोई', 'लिए', 'रखें', 'इसमें', 'थे', 'ना', 'गया', 'दिया', 'तो', 'कि', 'यह', 'बहुत', 'सकते', 'जा', 'होती', 'करें', 'हो', 'सबसे', 'अपने', 'वर्ग', 'किया', 'साथ', 'पे', 'यदि', 'होते', 'इसे', 'अपनी', 'करता', 'आदि', 'ये', 'उनका', 'कुछ', 'वाले', 'वह', 'तरह', 'हैं', 'न', 'मे', 'करते', 'कहते', 'पर', 'की', 'आप', 'होने', 'होता', 'कहा', 'इसकी', 'इसके', 'सकता', 'का', 'बाद', 'जो', 'लिये', 'ही', 'रहा', 'करना', 'इस', 'कई', 'एक', 'उनकी', 'नहीं', 'इसका', 'अभी', 'किसी', 'बनी', 'द्वारा', 'हुआ', 'है', 'उस', 'हुई', 'जब', 'में', 'कर', 'ने', 'व', 'अपना'} | |
HINGLISH_STOPWORDS = {'rakho', 'poora', 'aa', 'kr', 'teen', 'all', 'nai', 'kyaa', 'thoda', 'mere', 'diyo', 'yahan', 'were', 'two', 'isski', 'i', 'this', 'downwards', 'ab', 'wala', 'next', 'bc', 'hui', 'well', 'seem', "i'm", 'naah', 'tabh', 'kari', 'couldn', 'thereafter', 'went', 'whether', 'later', 'first', 'hone', 'cause', 'humne', 'sabse', 'truly', 'dont', "we've", "wouldn't", 'becomes', 'inke', 'wasnt', 'arre', 'karta', 'hai', 'sath', 'karke', 'unka', 'pe', 'throughout', 'q', 'kull', 'useful', "mustn't", 'jahaan', 'always', 'use', 'looking', 'doesnt', 'dono', 'kaafi', 'rakhe', 'theirs', 'kyuki', 'took', 'these', 'ok', 'avum', 'everywhere', 'kept', 'rakh', 'kinhe', 'asking', 'few', 'must', 'says', 'honi', 'necessary', 'gone', 'as', 'therein', 'abbey', 'lekar', 'aint', 'aur', 'that', 'krne', 'vahan', 'teesre', 'can', 'near', "where's", "she's", 're', 'really', 'kitni', 'maano', 'too', 'kisko', 'per', 'main', 'mean', 'several', 'appear', 'vahi', 'off', 'wo', 'dude', "hadn't", 'nhi', "you're", 'together', 'reasonably', 'thanks', 'four', 'inc', 'itna', 'according', 'itne', 'waisi', 'thing', "who's", 'are', 'ltd', 'rahe', 'having', 'get', 'see', 'karen', 'thinking', 'its', 'here', 'bande', 'an', 'aadi', 'rhaa', 'thereupon', 'what', 'hers', 'achcha', 'ja', 'imo', 'inka', 'along', 'kya', 'did', 'karegi', 'myself', "aren't", 'usi', 'apni', 'let', 'least', 'then', "mightn't", 'both', 'same', 'tjhe', "c'mon", 'sometime', 'dekhi', 'hi', 'karenge', 'when', 'karte', 'taken', 'hua', 'kise', 'allow', 'around', 'toh', 'kaisi', 'kyun', 'banai', 'banda', 'pehle', 'should', 'yourselves', 'isnt', 'saare', 'knows', 'cant', 'o', 'think', 'willing', 'hamari', 'neednt', 'likely', 'yup', 'kis', 'mein', 'at', 'inse', 's', 'jabh', 'wagairah', 'anyway', 'after', 'bana', 'bas', 'karo', 'karun', 'jyaada', 'mera', 'goes', 'waisa', 'isse', 'them', 'aata', 'wali', 'however', 'nd', 'bole', 'huye', 'everything', 'apna', "i'd", 'provides', 'naa', 'whence', 'maybe', 'causes', 'others', 'yahi', 
'inhi', 'may', 'rahi', 'tere', 'mana', 'dusra', 'hasnt', 'kinho', "it's", 'nowhere', 'sang', 'which', 'khud', 'couldnt', 'sa', 'regardless', 'maan', 'aaya', 'ap', 'aaj', 'whereas', 'btw', 'he', 'baar', 'eight', 'will', 'themselves', 'hither', 'konsa', 'used', 'hello', 'keep', 'yeh', 'down', 'please', 'until', 'most', 'good', 'koi', 'yeah', 'your', "haven't", 'll', 'hadd', 'anyone', "he's", 'kab', 'much', 'nahin', 'wahin', 'him', 'hona', 'just', 'kabhi', 'kyu', 'right', "wasn't", 'jahan', 'mustnt', 'sakti', 'nothing', 'know', 'very', 'aye', 'six', 'better', 'kam', 'kin', 'de', 'gayi', 'whereby', 'batao', 'five', 'hue', 'phle', 'my', 'jiska', 'bolo', 'plus', 'kahte', 'somebody', 'given', 'unko', 'secondly', 'shall', 'que', 'unke', 'yaha', 'was', 'rehte', 'ka', 'vaale', 'abe', 'yahaan', 'ho', 'anybody', 'otherwise', 'rakhi', 'd', 'it', 'serious', 'but', 'chaiye', 'example', 'ought', 'jinka', 'twice', 'tak', 'none', 'gets', 'karna', 'k', 'lagte', 'mai', 'saying', 'ourselves', 'kahin', 'seemed', 'whoever', 'ya', 'didn', 'log', 'over', 'exactly', 'almost', 'people', 'saath', "they're", 'ityadi', 't', 'using', 'usne', 'our', 'hota', 'ours', 'seven', 'apne', 'got', 'inki', 'whose', 'each', 'rha', 'haven', 'dusre', 'karne', 'bhitar', 'often', 'old', 'doosra', 'apart', 'okay', 'pehla', 'bheetar', 'kon', 'kaise', 'hamara', 'kaisa', 'banayi', 'kiski', 'jitni', 'already', 'kul', 'someone', 'to', 'unho', 'fifth', 'perhaps', 'hoga', 'inn', 'thence', 'inward', "they've", 'one', 'jin', 'bane', 'in', 'could', 'bol', 'kahaa', 'meri', 'dvaara', 'from', 'iss', 'jiski', 'rahaa', 'vahaan', 'yahin', 'lagta', 'certainly', 'isi', 'gaye', 'hun', 'non', 'take', 'thorough', 'm', 'ye', 'com', "we'd", 'didnt', "that'll", 'wouldnt', 'mano', 'apan', 'liya', 'dekho', 'koyi', 'yes', 'apnaa', 'banaye', 'viz', 'ko', 'else', 'vaisa', 'kinko', 'we', 'kyunki', "we're", 'less', 'weren', 'below', 'greetings', 'y', 'li', 'chhaiye', 'aati', 'aapne', 'third', "they'd", 'iskaa', 'about', 'afterwards', 'doing', 
'kaunsa', 'yadi', 'se', 'become', 'aap', 'kare', 'jise', 'magar', 'won', 'little', 'boli', 'bhi', 'tho', 'need', 'hereupon', 'phli', 'therefore', 'within', "isn't", "weren't", 'accha', 'ri', 'isne', 'whenever', "can't", 'huyi', 'three', 'lagti', 'any', 'regarding', 'wherever', 'quite', 'vala', 'diye', 'dunga', 'is', 'you', 'hamne', 'idk', 'so', 'yah', 'who', 'kahi', 'nearly', "i'll", 'kaun', 'sara', 'eg', 'sent', 'bata', 'said', 'dijiye', 'teesri', 'noone', 'tum', 'maine', 'own', 'like', 'kara', 'shouldn', 'havent', 'besides', 'jaa', 'theek', 'ones', 'wahi', 've', 'into', 'hoyengi', 'kahan', 'kahaan', 'a', 'wale', 'diyaa', 'karega', 'howbeit', 'edu', 'itself', 'thoroughly', 'upar', 'something', 'doesn', 'tera', 'whole', "you'll", 'kinki', 'keh', 'tujhe', 'yehi', 'iske', 'ki', 'somehow', 'han', 'aaye', "what's", 'following', 'ise', 'seen', 'jiss', 'humein', 'liked', 'kuchch', 'dungi', 'raha', 'dvara', 'keeps', "let's", 'jab', 'why', 'might', 'th', 'thank', 'dwaara', 'oh', 'hence', 'brief', 'aren', 'karti', 'thik', 'towards', 'hereafter', 'banae', 'saktaa', 'or', 'being', 'hoyenge', 'unhi', 'across', 'himself', 'lol', 'vale', 'vaise', 'bandi', 'tries', 'lunga', 'far', 'moreover', 'seems', 'somewhere', 'kaha', 'rhe', 'thru', 'hoti', 'outside', 'teesra', 'whatever', 'karu', 'rakha', 'bad', 'aisi', 'seriously', 'woh', 'becoming', 'dega', 'second', 'the', 'umm', 'jayega', 'issi', 'out', 'unhone', 'jinhi', 'jinhone', 'uses', 'unhe', 'without', 'etc', 'alone', 'ke', 'thereby', 'pehli', 'um', 'novel', 'bro', 'amongst', 'hmm', 'jinhe', 'of', 'be', 'fir', 'kinn', 'up', 'un', 'karni', 'many', 'onto', 'kuch', 'logon', 'kinka', 'namely', 'dekhe', 'ghar', 'soch', 'thanx', 'waali', 'through', 'gaya', 'vo', 'also', 'vali', 'rakhen', 'once', 'last', 'sahi', 'mujhe', 'only', 'their', 'every', 'trying', 'honaa', 'whereafter', 'lo', 'thi', 'anyhow', 'herein', 'aise', 'although', 'yourself', 'uske', 'aside', 'unkaa', 'allows', 'aisa', 'hopefully', 'baad', 'help', 'tell', 'hasn', 
'maane', 'not', 'nahi', 'cannot', 'beside', 'kitne', 'she', 'thought', 'yours', 'going', 'tab', 'dhang', 'abbe', 'mostly', 'between', 'jinho', 'nine', "you'd", 'degi', 'tarah', 'hotaa', 'ie', 'selves', 'mustn', 'ever', 'kino', 'waise', "shouldn't", 'sensible', 'follows', 'other', 'various', 'jaisa', 'chal', 'jaisi', 'followed', 'bolte', 'kar', 'meanwhile', 'bani', 'needn', "doesn't", 'puri', "it'd", 'behind', 'bhai', 'chalega', 'had', 'insofar', 'inhe', 'sure', 'banaya', 'by', 'kayi', 'tune', 'unto', "you've", 'saw', 'how', 'yet', 'dekh', 'best', 'di', 'known', 'has', 'itni', 'wagerah', 'alag', 'say', 'some', 'been', 'shan', 'lately', 'needs', 'mat', 'dusri', 'isn', 'pata', 'gives', 'regards', 'ussi', 'acha', 'either', 'denge', 'maana', 'would', 'anything', 'still', 'kal', 'under', 'new', 'bola', 'if', 'hongi', 'beforehand', 'poori', 'come', 'kiye', 'krna', 'jisse', 'overall', 'na', 'never', 'unless', 'wohi', 'whom', 'mane', 'itno', 'more', 'waale', 'sub', 'those', 'hain', 'jinke', 'tha', 'everyone', 'sometimes', 'banao', 'dede', 'jitne', "there's", 'karungi', 'dwara', 'saara', "i've", 'certain', 'jo', 'jata', 'haan', 'clearly', 'there', 'vaisi', 'mil', 'anyways', 'inkaa', 'ne', 'wherein', 'thus', 'arent', 'kisi', 'agar', 'kitno', 'normally', 'where', "couldn't", 'again', "didn't", 'and', 'actually', 'maani', 'shouldnt', 'on', 'aya', 'go', 'iski', 'lekin', 'karunga', 'true', 'dekha', 'le', 'hu', 'hereby', 'kuchh', 'for', 'teri', 'enough', 'kitna', 'teeno', 'try', 'tried', 'wants', 'wagaira', "it'll", 'mainly', 'tumhari', 'beyond', 'happens', 'kch', 'getting', 'nevertheless', 'sup', 'kine', 'seeing', "should've", 'another', 'pura', 'bilkul', 'became', 'somewhat', 'merely', 'do', 'hardly', 'before', 'unki', "shan't", 'tumhara', 'except', 'self', 'unse', 'way', 'they', 'thats', 'neeche', "they'll", 'than', 'inasmuch', 'kisliye', 'us', 'usse', 'am', 'hoge', 'done', 'looks', 'wahaan', 'unn', 'ek', 'hote', 'came', 'abhi', 'everybody', 'furthermore', 'jinn', 'whereupon', 
'ityaadi', 'kiske', 'even', "here's", 'doosre', 'hum', 'uska', 'while', 'ask', 'tumhare', 'obviously', 'jis', 'kiska', 'bht', 'jitna', 'comes', 'now', 'liye', 'able', 'tu', 'unlikely', 'look', 'andar', 'uss', 'jinki', 'hadnt', 'bolta', 'soon', "that's", "don't", 'sabhi', 'mjhe', "needn't", 'hogi', 'jaise', 'jaha', 'want', 'hamare', 'such', 'huh', 'jidhar', 'forth', 'inho', 'mightnt', 'shant', 'waha', 'sab', 'accordingly', 'inner', 'rhi', 'tends', 'though', 'waala', 'kinke', 'wasn', 'mani', 'jisme', 'wouldn', "hasn't", 'vaala', 'hm', 'during', 'bolti', 'her', 'herself', "ain't", 'kiya', 'bohot', 'sakta', 'among', 'jiske', 'rather', 'because', 'further', 'lest', 'diya', 'ex', 'isme', 'mightn', 'nobody', 'par', 'si', 'hadn', 'with', 'phla', 'toward', 'name', 'werent', 'elsewhere', 'me', 'above', 'neither', 'nor', 'sakte', 'har', 'since', 'chahiye', "won't", 'against', 'have', 'no', 'anywhere', 'ain', 'iska', 'does', 'placed', 'usually', 'theres', "we'll", 'seeming', 'bahut', 'thodi', 'jyada', 'via', 'honge', 'gotten', 'nope', 'kafi', 'wont', 'kehte', 'upon', 'vahin', 'kisne', 'his', 'vaali', 'wahan'} | |
# Labels that precede "omitted" in WhatsApp's attachment placeholders.
# '_media' is listed because the Android export writes <media omitted>
# instead of naming the specific media type (like audio).
media_omissions = {
    '_media',
    'card',
    '\u200eGIF',
    '\u200eaudio',
    '\u200edocument',
    '\u200eimage',
    '\u200esticker',
    '\u200evideo',
}

# Extra stop words for WhatsApp chats, added to the word-cloud defaults.
STOPWORDS = STOPWORDS | {'changed', 'subject'}
def parse_iphone(path='exported_chat.txt'):
    """Collect the words of an iPhone-style WhatsApp chat export.

    Lines are matched against ``[timestamp] sender: message`` and only the
    message part is tokenised. Deleted-message notices, messages containing
    a link, and "<type> omitted" media placeholders are skipped.

    Args:
        path: Location of the exported chat text file. Defaults to the file
            name this script originally hard-coded.

    Returns:
        list[str]: Lower-cased, whitespace-separated words (punctuation is
        still attached) that are not in ``STOPWORDS``.
    """
    line_pattern = re.compile(r'(\[.*?\].*?:)?\s(.*)')
    deleted_notices = ('This message was deleted.', 'You deleted this message.')
    # Entries start with a left-to-right mark (or "_" for the Android label).
    # Strip exactly those prefixes instead of dropping the first character,
    # which turned 'card' into 'ard' and matched unrelated words.
    media_labels = [label.lstrip('\u200e_') for label in media_omissions]

    words = []
    # The chat contains non-ASCII text (left-to-right marks, Devanagari), so do
    # not rely on the platform default encoding.
    # NOTE(review): UTF-8 is assumed for the export -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # omit first line; not needed (tolerates an empty file)
        for line in f:
            found = line_pattern.search(line)
            if found is None:
                # The pattern needs at least one whitespace character; a final
                # line without a trailing newline may have none.
                continue
            message = found.group(2)
            if message in deleted_notices:
                continue
            if 'http' in message:  # also covers 'https'
                continue
            if 'omitted' in message and any(label in message for label in media_labels):
                continue
            words.extend(
                word for word in message.lower().split()
                if word not in STOPWORDS
            )
    return words
def parse_android(path='WhatsApp_Chat_Wing_since_Aug_2018_merged_large.txt'):
    """Collect the words of an Android-style WhatsApp chat export.

    Lines are matched against ``timestamp - sender: message`` and only the
    message part is tokenised. Deleted-message notices, the generic
    ``<Media omitted>`` placeholder, and messages containing a link are
    skipped.

    Args:
        path: Location of the exported chat text file. Defaults to the file
            name this script originally hard-coded.

    Returns:
        list[str]: Lower-cased, whitespace-separated words (punctuation is
        still attached) that are not in ``STOPWORDS``.
    """
    line_pattern = re.compile(r'(.*?\-.*?:)?\s(.*)')
    skipped_messages = {
        'This message was deleted',
        'You deleted this message',
        '<Media omitted>',
    }

    words = []
    # The chat may contain non-ASCII text, so do not rely on the platform
    # default encoding.
    # NOTE(review): UTF-8 is assumed for the export -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # omit first line; not needed (tolerates an empty file)
        for line in f:
            found = line_pattern.search(line)
            if found is None:
                # The pattern needs at least one whitespace character; a final
                # line without a trailing newline may have none.
                continue
            message = found.group(2)
            if message in skipped_messages:
                continue
            if 'http' in message:  # also covers 'https'
                continue
            words.extend(
                word for word in message.lower().split()
                if word not in STOPWORDS
            )
    return words
# Translation tables that delete punctuation (plus the typographic apostrophe
# ’, which string.punctuation does not contain) and ASCII digits.
punc_table = str.maketrans('', '', string.punctuation + '’')
digit_table = str.maketrans('', '', string.digits)

# Strip punctuation and digits from every parsed word.
cleaned_words = [w.translate(punc_table).translate(digit_table) for w in parse_iphone()]
# STOPWORDS is checked again here: parse_iphone() filters before punctuation is
# removed, so a token such as "subject," only becomes a stop word after cleaning.
cleaned_words = [
    word for word in cleaned_words
    if len(word) > 1
    and word not in STOPWORDS
    and word not in HINDI_STOPWORDS
    and word not in HINGLISH_STOPWORDS
]
cleaned_words_joined = ' '.join(cleaned_words)

# For background image
# Alternative mask: https://images-na.ssl-images-amazon.com/images/I/819QJh4dX8L._AC_SL1500_.jpg
mask_response = requests.get(
    'https://previews.123rf.com/images/kaarsten/kaarsten1111/kaarsten111100061/11544770-attractive-woman-in-bikini-isolated-on-white-background-.jpg',
    stream=True,
    timeout=30,  # without a timeout a stalled download blocks forever
)
# Fail loudly on an HTTP error instead of handing an error page to PIL.
mask_response.raise_for_status()
mask = np.array(Image.open(mask_response.raw))
def generate_wordcloud(words, mask):
    """Draw a word cloud of *words* shaped and coloured by *mask*.

    The figure is written to ``wing.png`` and then shown.

    Args:
        words: Text passed to ``WordCloud.generate``.
        mask: Image array used both as the cloud's mask and as the source of
            the word colours.
    """
    cloud = WordCloud(
        font_path='arial-unicode-ms.ttf',
        width=1024,
        height=1024,
        background_color="black",
        mode="RGBA",
        mask=mask,
    )
    cloud.generate(words)

    # Colour the words from the mask image.
    palette = ImageColorGenerator(mask)
    recoloured = cloud.recolor(color_func=palette)

    plt.figure(figsize=[90, 160])
    plt.imshow(recoloured, interpolation="bilinear")
    plt.axis('off')
    plt.savefig("wing.png", format="png")
    plt.show()
# Render the word cloud for the cleaned chat text (saved as wing.png).
generate_wordcloud(words=cleaned_words_joined, mask=mask)

# Frequency table of the cleaned words; the bare expressions below only
# display a value when run as notebook cells.
raw_histo = Counter(cleaned_words)
raw_histo.most_common(n=100)

# Spot check of the translation tables on a word with a typographic apostrophe.
"who’s".translate(punc_table).translate(digit_table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment