Skip to content

Instantly share code, notes, and snippets.

@johnarban
Last active June 6, 2019 01:23
Show Gist options
  • Save johnarban/4f28c606a5d5b8beafcfcaa529054593 to your computer and use it in GitHub Desktop.
Save johnarban/4f28c606a5d5b8beafcfcaa529054593 to your computer and use it in GitHub Desktop.
code to analyze whatsapp chat log
import numpy as np
import matplotlib.pyplot as plt
import re
import emoji # pip install emojie
import datetime
from wordcloud import WordCloud #pip install wordcloud
import seaborn as sns # pip install seaborn
from astropy.time import Time # pip install astropy
plt.ion()
# I called my whatsapp chat log whatsappchat.txt
def find_newchat(xline):
    """Return True when *xline* starts a new WhatsApp message.

    A new message begins with a ``M/D/YY, HH:MM`` timestamp (the same shape
    ``parse_line`` parses). Any other line is treated as the continuation of
    the previous message.
    """
    # The quantifier must be spelled ``{1,2}``: written as ``{1, 2}`` (with a
    # space) the ``re`` module reads the braces as literal characters and the
    # pattern can never match a real timestamp.
    result = re.match(
        r'[0-9]{1,2}/[0-9]{1,2}/[0-9]{1,2}, [0-9]{2}:[0-9]{2}',
        xline) is not None
    return result
def parse_line(line):
    """Split one (merged) chat entry into ``(date, name, text)``.

    Expected shape: ``M/D/YY, HH:MM - Name: message text``. The message text
    may span several lines.

    Returns
    -------
    tuple
        ``(datetime.datetime, name, text)`` on success. When the line does
        not match, or its timestamp is not a valid date, the line is printed
        and ``('alert', None, None)`` is returned so callers can filter on
        ``name is None``.
    """
    # ``{1,2}`` must not contain a space, otherwise ``re`` treats the braces
    # as literal text. The optional space after the dash keeps it out of the
    # captured name.
    meta_regex = (r'^([0-9]{1,2}/[0-9]{1,2}/[0-9]{1,2}, [0-9]{2}:[0-9]{2})'
                  r' - ?([^\n\r:]+): ([^\r]+)')
    res = re.search(meta_regex, line)
    if res is None:
        print(line)
        return 'alert', None, None
    date, name, text = res.groups()
    try:
        date = datetime.datetime.strptime(date, '%m/%d/%y, %H:%M')
    except ValueError:
        # Looks like a timestamp but is not a real date/time.
        print(line)
        return 'alert', None, None
    return date, name, text
def makewordcloud(text, size=1500, cmap='tab10'):
    """Build a square word cloud from an iterable of message strings.

    Parameters
    ----------
    text : iterable of str
        Messages to combine; newlines and WhatsApp's ``<Media omitted>``
        placeholder are removed before counting words.
    size : int
        Width and height of the rendered cloud, in pixels.
    cmap : str
        Matplotlib colormap name used to recolor the words.

    Returns
    -------
    WordCloud
        The generated (and recolored) word cloud object.
    """
    longtext = ' '.join(text)
    longtext = longtext.replace('\n', ' ')
    longtext = longtext.replace('<Media omitted>', '')
    # Configure everything first, then lay the cloud out exactly once.
    wc = WordCloud(width=size, height=size)
    # Work on a copy: adding to ``wc.stopwords`` in place would also modify
    # whatever set the library handed out (presumably its shared default).
    wc.stopwords = set(wc.stopwords) | {
        'https', 'us', 'will', 'youtu', 'youtube'}
    wc.generate_from_text(longtext)
    wc.recolor(colormap=cmap)
    return wc
def makewordcloudname(text, names, name, size=1500, cmap='tab10'):
    """Build a word cloud from the messages of a single participant.

    ``names == name`` is used as a mask into ``text``, so both are assumed to
    be equal-length NumPy arrays (as the script produces) -- TODO confirm for
    other callers. ``size`` and ``cmap`` are forwarded to ``makewordcloud``.
    """
    by_author = text[names == name]
    return makewordcloud(by_author, size=size, cmap=cmap)
def urlFind(x, regex=None):
    """Search *x* for the first URL.

    Parameters
    ----------
    x : str
        Text to search.
    regex : str, optional
        Pattern to search for. Defaults to a simple ``http(s)://`` matcher
        that runs up to the next whitespace character.

    Returns
    -------
    re.Match or None
        The first match, or ``None`` when nothing matches.
    """
    # Only fall back to the default when the caller did not supply a pattern
    # (previously the argument was always overwritten).
    if regex is None:
        regex = r'http[s]?://[^\s\n]+'
    return re.search(regex, x)
###################
## READ IN THE DATA
# Read every line of <whatsappchat.txt> into a list:
# [line1, line2, line3, ... ]
# The context manager closes the file even if reading fails; the encoding is
# stated explicitly because chat logs are full of non-ASCII text (emojis).
with open('whatsappchat.txt', 'r', encoding='utf-8') as f:
    chat = f.readlines()
# begin data cleaning
# Mark each line that starts a new chat. This must be done because whatsapp
# splits chats wherever the chat author put in a newline.
newchat = list(map(find_newchat, chat))
# print total number of unique chats (lines flagged as a new chat)
print('There are {num} messages in chat'.format(num=np.sum(newchat)))
# merge chats from a single user that were
# split because of a new line
merged_lines = []
for is_new, chit in zip(newchat, chat):
    if is_new or not merged_lines:
        # Start a new entry. A leading continuation line (nothing to merge
        # into yet) also starts one; parse_line rejects it below.
        merged_lines.append(chit)
    else:
        # not a "new chat": merge with previous chat
        merged_lines[-1] += chit
# separate out date, name and text of each chat
# (parse each entry once; unparsable entries come back with name None)
parsed = []
for line in merged_lines:
    record = parse_line(line)
    if record[1] is not None:
        parsed.append(record)
date, names, text = np.transpose(parsed)  # get containers for date, name, and text
# clean up names // basically get first names
names = np.array([n.split()[0] for n in names])
# Write one word-cloud image per participant, skipping anyone with
# 90 messages or fewer.
for name in np.unique(names):
    n_messages = np.sum(names == name)
    if n_messages <= 90:
        continue
    wc = makewordcloudname(text, names, name, size=600)
    wc.to_file('wordcloud_'+name+'.png')
# Get urls (the first one found in each message) and output to a file
urls = []
for message in text:
    found = urlFind(message)
    if found is not None:
        urls.append(found.group())
# writelines() adds no separators, so terminate each URL explicitly --
# otherwise they run together into one unreadable string.
with open('wtext.txt', 'w', encoding='utf-8') as f:
    f.writelines(url + '\n' for url in urls)
#
#
#
#
#################
## now make some plots
#################
#
#
#
#
# --------Plot histogram of chats by user------------------
# One bar per participant: each name is mapped to an integer id and the ids
# are histogrammed with one unit-wide bin centred on every id.
plt.figure(figsize=(10, 8))
name, nameid = np.unique(names, return_inverse=True)
id_range = (nameid.min() - .5, nameid.max() + 0.5)
hist, bins, _ = plt.hist(nameid, bins=len(name), range=id_range,
                         rwidth=.75, color='indianred', lw=2)
centers = (bins[1:] + bins[:-1]) / 2.
plt.xticks(centers, name, rotation='vertical')
plt.tight_layout()
# write the message count just above each bar
for center, height in zip(centers, hist):
    plt.annotate('{:0.0f}'.format(height), (center, height + 15),
                 va='center', ha='center')
plt.savefig('whatappchat_hist.png')
# --------------------------
# --------Plot histogram of chats by day/time------------------
# time of day as a decimal hour
decitime = [d.hour + d.minute / 60 + d.second / 3600 for d in date]
# isoweekday() is Mon=1 .. Sun=7, so "% 7" gives Sun=0, Mon=1, ..., Sat=6,
# which matches the Sunday-first tick labels below. (weekday() is Mon=0 and
# would shift every label by one day.)
day = [d.isoweekday() % 7 for d in date]
plt.figure(figsize=(10, 6))
# order= pins all seven categories so the ticks stay aligned even when some
# weekday has no messages.
ax = sns.violinplot(x=day, y=decitime, order=list(range(7)), bw=.25,
                    scale='count', linewidth=1, inner=None)
ax.set_xticks([0, 1, 2, 3, 4, 5, 6])
ax.set_xticklabels(['Sun', 'Mon', 'Tue', 'Wed', 'Thu',
                    'Fri', 'Sat'], rotation='vertical', fontsize=16)
# One tick every two hours so each 12-hour-clock label sits on its real
# hour; the limits leave an hour of margin for the smoothed violin tails.
ax.set_yticks(np.arange(0, 25, 2))
ax.set_yticklabels([0, 2, 4, 6, 8, 10, 12, 2, 4, 6, 8, 10, 12], fontsize=16)
ax.set_ylim(-1, 25)
ax.minorticks_off()
ax.set_ylabel('H o u r', fontsize=18)
plt.tight_layout()
plt.savefig('whatapp_chat_hours.png')
# ---------All time chat rate-----------------
# Histogram of messages per week, measured in days since the first message.
plt.figure()
jd = np.array([Time(d).jd for d in date])
plt.hist(jd - jd.min(), bins=np.arange(0, jd.max() - jd.min() + 7, 7), density=False)
# Put a tick on January 1st of every year the chat covers (instead of a
# fixed 2017-2019 list), so the axis is right for any chat log.
year_labels = [str(y) for y in range(min(date).year, max(date).year + 1)]
years = Time(['{}-01-01'.format(y) for y in year_labels], format='isot')
jdy = years.jd
plt.xticks(jdy - jd.min(), year_labels)
# --------------------------
# get emojis and print the (up to) ten most used ones
# Newer releases of the emoji package call this helper emoji_list; older
# ones only provide emoji_lis. Use whichever is available.
list_emojis = getattr(emoji, 'emoji_list', None) or emoji.emoji_lis
# collect every emoji occurrence in one pass
emojis = np.array([found['emoji'] for t in text for found in list_emojis(t)])
if emojis.size:
    emoj, hist = np.unique(emojis, return_counts=True)
    # indices of the most frequent emojis, highest count first; the slice
    # copes with chats that contain fewer than ten distinct emojis
    srt = np.argsort(hist)[::-1][:10]
    for i in srt:
        print('\t{emo}\t {num}'.format(emo=emoj[i], num=hist[i]))
else:
    print('No emojis found in chat')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment