Skip to content

Instantly share code, notes, and snippets.

@guillermo-carrasco
Last active June 7, 2023 19:40
Show Gist options
  • Save guillermo-carrasco/da3e0896314e03ed3caeba3059337e95 to your computer and use it in GitHub Desktop.
Save guillermo-carrasco/da3e0896314e03ed3caeba3059337e95 to your computer and use it in GitHub Desktop.
WhatsApp chat parser class
import re
from collections import Counter
from datetime import datetime
import emoji
import pandas as pd
class WhatsappChat:
    """Parse an exported WhatsApp chat text file into a pandas DataFrame.

    The export is expected to contain lines of the form
    ``[MM/DD/YY, HH:MM:SS] Contact: message`` (see ``DATE_FORMAT``). Lines
    that do not start with such a timestamp are treated as continuations of
    the previous message.

    Attributes set by ``__init__``:
        emojis: ``Counter`` of every emoji character seen while parsing.
        chat_path: path of the export file that was parsed.
        chat: DataFrame indexed by message date, with the columns ``contact``,
            ``message`` and the per-message statistics computed in
            ``parse_chat``.
    """

    DATE_FORMAT = "[%m/%d/%y, %H:%M:%S]"
    # Raw strings keep the regex escapes (``\[``, ``\/``) from being read as
    # invalid Python string escapes. ``\u200e`` (LEFT-TO-RIGHT MARK) may
    # optionally precede the timestamp.
    REGEX_DATE = r"^(\u200e){0,1}\[[0-9\/]+(, )[0-9:]+\]"
    REGEX_CONTACT = r"^(\u200e){0,1}\[[0-9\/]+(, )[0-9:]+\](.+?)(: )"
    # Non-greedy contact group so a message that itself contains ": " is
    # captured whole (group 5) instead of being cut at the last ": ".
    REGEX_MESSAGE = r"^(\u200e){0,1}\[[0-9\/]+(, )[0-9:]+\](.+?)(: )(.+)"

    # Longer alternatives come first so "fucking" is recorded in full rather
    # than being cut short at "fuck".
    _F_WORD_RE = re.compile(
        r"f(?:ucking|uck|\*+(?:ing|ng|ck|k))+", re.IGNORECASE
    )

    def __init__(self, chat_path):
        """Parse the chat export at *chat_path* and store it in ``self.chat``.

        Parsing happens immediately, so any error raised by ``parse_chat``
        (missing file, unexpected date format) propagates from here.
        """
        self.emojis = Counter()
        self._f_words = set()
        self.chat_path = chat_path
        self.chat = self.parse_chat()

    def emoji_count(self, s: str) -> int:
        """Return how many characters of *s* are emoji and tally them.

        Every character for which ``emoji.is_emoji`` is true is also added to
        ``self.emojis``.
        """
        # NOTE(review): this walks *s* one code point at a time, so emoji made
        # of several code points are presumably counted per component —
        # confirm against the ``emoji`` package before relying on exact counts.
        count = 0
        for c in s:
            if emoji.is_emoji(c):
                count += 1
                self.emojis[c] += 1
        return count

    @staticmethod
    def is_gif(s: str) -> bool:
        """Return True if *s* contains the "GIF omitted" placeholder."""
        return "GIF omitted" in s

    @staticmethod
    def is_image(s: str) -> bool:
        """Return True if *s* contains the "image omitted" placeholder."""
        return "image omitted" in s

    @staticmethod
    def is_document(s: str) -> bool:
        """Return True if *s* contains the "document omitted" placeholder."""
        return "document omitted" in s

    @staticmethod
    def is_sticker(s: str) -> bool:
        """Return True if *s* contains the "sticker omitted" placeholder."""
        return "sticker omitted" in s

    @staticmethod
    def is_audio(s: str) -> bool:
        """Return True if *s* contains the "audio omitted" placeholder."""
        return "audio omitted" in s

    @staticmethod
    def is_media(s: str) -> bool:
        """Return True if *s* contains the word "omitted" anywhere."""
        return "omitted" in s

    def contains_f_word(self, s: str) -> bool:
        """Return True if *s* contains an f-word variant, anywhere in the text.

        The matched variant (plain or censored with asterisks, any letter
        case) is remembered in ``self._f_words``.
        """
        # ``re.search`` scans the whole message; ``re.match`` would only ever
        # look at the very first characters.
        match = self._F_WORD_RE.search(s)
        if match is None:
            return False
        self._f_words.add(match.group(0))
        return True

    def parse_chat(self) -> pd.DataFrame:
        """Read ``self.chat_path`` and return one DataFrame row per message.

        Returns:
            A DataFrame indexed by ``date`` with the columns ``contact``,
            ``message``, ``message_length``, ``exclamation_marks``,
            ``emoji_count``, ``is_gif``, ``is_image``, ``is_document``,
            ``is_sticker``, ``is_audio``, ``drop_emojis``, ``is_media`` and
            ``contains_f_word``.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a timestamp does not follow ``DATE_FORMAT``.

        Notes:
            * Contact names are stripped of surrounding whitespace.
            * Continuation lines are joined to their message with a newline.
            * Timestamped lines without a ``": "`` separator (for example
              system notices) are kept with ``contact`` set to ``None``.
            * Untimestamped lines before the first message are ignored.
        """
        records = {
            "date": [],
            "contact": [],
            "message": [],
        }
        # Compiled from the class attributes so subclasses may override them.
        date_re = re.compile(self.REGEX_DATE)
        contact_re = re.compile(self.REGEX_CONTACT)

        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise stop the first line from matching REGEX_DATE.
        with open(self.chat_path, "r", encoding="utf-8-sig") as f:
            for raw_line in f:
                line = raw_line.rstrip("\r\n")
                date_match = date_re.search(line)

                if date_match is None:
                    # Continuation of a multi-line message. There is nothing
                    # to attach it to before the first stamped line.
                    if records["message"]:
                        records["message"][-1] += "\n" + line
                    continue

                stamp = date_match.group(0).replace("\u200e", "")
                date = datetime.strptime(stamp, self.DATE_FORMAT)

                contact_match = contact_re.search(line)
                if contact_match is None:
                    # No "Contact: " prefix: keep the text, leave the
                    # sender unknown.
                    contact = None
                    message = line[date_match.end():].strip()
                else:
                    contact = contact_match.group(3).strip()
                    # Everything after the first ": " is the message, even
                    # when it is empty or contains further ": " sequences.
                    message = line[contact_match.end():]

                records["date"].append(date)
                records["contact"].append(contact)
                records["message"].append(message)

        df = pd.DataFrame.from_dict(records)
        df = df.set_index("date")

        # Compute some message stats
        df["message_length"] = df.message.apply(len)
        df["exclamation_marks"] = df.message.apply(lambda s: s.count("!"))
        df["emoji_count"] = df.message.apply(self.emoji_count)
        df["is_gif"] = df.message.apply(self.is_gif)
        df["is_image"] = df.message.apply(self.is_image)
        df["is_document"] = df.message.apply(self.is_document)
        df["is_sticker"] = df.message.apply(self.is_sticker)
        df["is_audio"] = df.message.apply(self.is_audio)
        # Assigned on its own: a chained assignment here used to overwrite
        # the "exclamation_marks" column with this count.
        df["drop_emojis"] = df.message.apply(lambda s: s.count("😅"))
        df["is_media"] = df.message.apply(self.is_media)
        df["contains_f_word"] = df.message.apply(self.contains_f_word)
        return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment