code to analyze whatsapp chat log
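The script expects an exported chat saved as whatsappchat.txt, with message lines of the form M/D/YY, HH:MM - Name: message, and produces per-user word clouds, a messages-per-user histogram, a day/hour violin plot, an all-time chat-rate histogram, a file of shared URLs, and a top-ten emoji count.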
import numpy as np
import matplotlib.pyplot as plt
import re
import emoji  # pip install emoji
import datetime
from wordcloud import WordCloud  # pip install wordcloud
import seaborn as sns  # pip install seaborn
from astropy.time import Time  # pip install astropy

plt.ion()

# I called my WhatsApp chat log whatsappchat.txt
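# Expected input: a plain-text WhatsApp export where each message starts with
# 'M/D/YY, HH:MM - Name: message' (the format parse_line() below assumes).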
def find_newchat(xline):
    # a line starts a new message if it begins with a 'M/D/YY, HH:MM' timestamp
    result = re.search(
        r'^([0-9]{1,2}\/?){3}, [0-9]{2}:[0-9]{2}', xline) is not None
    return result
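# Example (hypothetical chat lines, not from the real log):
#   find_newchat('1/2/19, 14:05 - Alice: hello')    -> True
#   find_newchat('a second line of a long message') -> False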
def parse_line(line):
    meta_regex = r'^([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{1,2}, [0-9]{2}:[0-9]{2}) -([^\n\r:]+): ([^\r]+)'
    res = re.search(meta_regex, line)
    try:
        date, name, text = res.groups()
        date = datetime.datetime.strptime(date, '%m/%d/%y, %H:%M')
        return date, name, text
    except (AttributeError, ValueError):  # no match, or an unparseable date
        print(line)
        return 'alert', None, None
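# Example (hypothetical line): parse_line('1/2/19, 14:05 - Alice Smith: hello')
# returns (datetime.datetime(2019, 1, 2, 14, 5), ' Alice Smith', 'hello');
# the leading space on the name is stripped later with split().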
def makewordcloud(text, size=1500, cmap='tab10'):
    longtext = ' '.join(text)
    longtext = longtext.replace('\n', ' ')
    longtext = longtext.replace('<Media omitted>', '')
    # generate word cloud
    wc = WordCloud().generate(longtext)
    wc.stopwords.add('https')
    wc.stopwords.add('us')
    wc.stopwords.add('will')
    wc.stopwords.add('youtu')
    wc.stopwords.add('youtube')
    wc.width = size
    wc.height = size
    wc.generate_from_text(longtext)
    wc.recolor(colormap=cmap)
    # wc.to_file('wordcloud.png')
    return wc
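# To preview a cloud interactively (standard wordcloud + matplotlib usage):
#   wc = makewordcloud(text)
#   plt.imshow(wc, interpolation='bilinear'); plt.axis('off')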
def makewordcloudname(text, names, name, size=1500, cmap='tab10'):
    keep = names == name
    return makewordcloud(text[keep], size=size, cmap=cmap)


def urlFind(x, regex=None):
    if regex is None:
        regex = r'http[s]?://[^\s\n]+'
    return re.search(regex, x)
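# Example (hypothetical text): urlFind('see https://example.com for details')
# returns a match object whose .group() is 'https://example.com'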
###################
## READ IN THE DATA
f = open('whatsappchat.txt', 'r')  # open whatsappchat.txt read-only
chat = f.readlines()  # read in every line
                      # each line of the file is read
                      # into a separate element
                      # of a list: [line1, line2, line3, ... ]
f.close()  # close the file

# begin data cleaning
newchat = list(map(find_newchat, chat))  # mark each new chat
                                         # this must be done because whatsapp
                                         # splits chats wherever the chat author
                                         # put in a newline

# print total number of unique chats
print('There are {num} messages in chat'.format(num=np.sum(newchat)))
# merge chats from a single user that were
# split because of a newline
merged_lines = []
for i, chit in enumerate(chat):
    if newchat[i]:
        merged_lines.append(chit)
    else:  # if not a "new chat", merge with the previous chat
        merged_lines[-1] += chit
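# Note: this assumes the first line of the export is itself a new chat; if it
# were a continuation line, merged_lines would still be empty and
# merged_lines[-1] would raise an IndexError.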
# separate out date, name and text of each chat
parsed = [parse_line(line)
          for line in merged_lines if parse_line(line)[1] is not None]
date, names, text = np.transpose(parsed)  # get containers for date, name, and text

# clean up names // basically get first names
names = np.array([n.split()[0] for n in names])
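# e.g. ' Alice Smith' -> 'Alice' (the regex leaves a leading space on each name)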
# make a wordcloud for each user with more than 90 messages
for name in np.unique(names):
    if np.sum(names == name) > 90:
        wc = makewordcloudname(text, names, name, size=600)
        wc.to_file('wordcloud_' + name + '.png')

# Get urls and output to a file
urls = [urlFind(i).group()
        for i in text if urlFind(i) is not None]
f = open('wtext.txt', 'w')
f.writelines(u + '\n' for u in urls)  # one URL per line
f.close()
#
#
#
#
#################
## now make some plots
#################
#
#
#
#
# --------Plot histogram of chats by user------------------
plt.figure(figsize=(10, 8))
name, nameid = np.unique(names, return_inverse=True)
hist, bins, _ = plt.hist(nameid, bins=len(name),
                         range=(nameid.min() - .5, nameid.max() + 0.5), rwidth=.75,
                         color='indianred', lw=2)
plt.xticks((bins[1:] + bins[:-1]) / 2., name, rotation='vertical')
plt.tight_layout()
for i in range(len(name)):
    plt.annotate('{:0.0f}'.format(hist[i]), ((bins[1:] + bins[:-1])[i] / 2.,
                                             hist[i] + 15), va='center', ha='center')
plt.savefig('whatappchat_hist.png')
# --------------------------
# --------Plot histogram of chats by day/time------------------
decitime = [d.hour + d.minute / 60 + d.second / 3600 for d in date]
day = [d.weekday() for d in date]
jd = [Time(d).jd for d in date]
plt.figure(figsize=(10, 6))
ax = sns.violinplot(x=day, y=decitime, bw=.25, scale='count', linewidth=1, inner=None)
ax.set_xticks([0, 1, 2, 3, 4, 5, 6])
# datetime.weekday() runs 0 = Monday through 6 = Sunday
ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri',
                    'Sat', 'Sun'], rotation='vertical', fontsize=16)
ax.set_yticks(np.linspace(-1, 25, 13))  # tick manipulation b/c smoothing
ax.set_yticklabels([0, 2, 4, 6, 8, 10, 12, 2, 4, 6, 8, 10, 12], fontsize=16)
ax.minorticks_off()
ax.set_ylabel('H o u r', fontsize=18)
plt.tight_layout()
plt.savefig('whatapp_chat_hours.png')
# ---------All time chat rate-----------------
plt.figure()
jd = np.array([Time(d).jd for d in date])
plt.hist(jd - jd.min(), bins=np.arange(0, jd.max() - jd.min() + 7, 7), density=False)
years = Time(['2017-01-01', '2018-01-01', '2019-01-01'], format='isot')
jdy = years.jd
plt.xticks(jdy - jd.min(), ['2017', '2018', '2019'])
# --------------------------
# get emojis
emojis = []
for t in text:
    if emoji.emoji_count(t) > 0:
        emoj = np.array([i['emoji'] for i in emoji.emoji_lis(t)])
        emojis = np.hstack([emojis, emoj])
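# Note: emoji.emoji_lis() was the API when this was written; it has since been
# removed, and on emoji >= 2.0 the equivalent call (assuming that newer API) is:
#   emoj = np.array([e['emoji'] for e in emoji.emoji_list(t)])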
emoj, emojid = np.unique(emojis, return_inverse=True)
hist, bins = np.histogram(emojid, bins=len(emoj),
                          range=(emojid.min() - .5, emojid.max() + 0.5))
srt = np.argsort(hist)[::-1][:10]
for i in range(10):
    print('\t{emo}\t {num}'.format(emo=emoj[srt][i], num=hist[srt][i]))