-
-
Save Mr-Saxobeat/24ae11228e36f55a3eb18d3320d4532a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def startsWithDateAndTime(s):
    """Return True if *s* begins with an Android WhatsApp export timestamp.

    The recognised prefix looks like ``12/25/20, 9:05 PM -``: a
    slash-separated date with a two-digit year, a comma, a 12-hour clock
    time with an ``AM``/``PM`` marker, and a trailing `` -``.  Lines that
    use another layout (the iOS export format, for example) do not match.

    Args:
        s: One line of the exported chat.

    Returns:
        ``True`` when the line starts with such a timestamp, else ``False``.
    """
    # Raw string: the backslashes must reach the regex engine untouched.
    # In a normal string literal '\/' is an invalid escape sequence, which
    # newer Python versions warn about.
    pattern = r'^([0-9]+)(\/)([0-9]+)(\/)([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'
    return re.match(pattern, s) is not None
# Checks whether a message starts with an author tag in one of the known formats.
def FindAuthor(s):
    """Return True if *s* starts with something that looks like an author tag.

    An author tag is one of the name/phone-number shapes listed in
    ``patterns`` immediately followed by a colon.  The function only reports
    whether such a tag is present; it does not return the author itself.

    Args:
        s: The part of a chat line that follows the timestamp.

    Returns:
        ``True`` when the text begins with a recognised author tag,
        otherwise ``False``.
    """
    # Raw strings keep '\w', '\s' and '\d' as regex escapes instead of
    # invalid string escapes; the regex engine also understands the
    # '\uXXXX' / '\UXXXXXXXX' escapes used in the emoji range.
    patterns = [
        r'([\w]+):',                         # First Name
        r'([\w]+[\s]+[\w]+):',               # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',     # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',          # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',    # Mobile Number (US)
        r'([\w]+)[\u263a-\U0001f999]+:',     # Name and Emoji
    ]
    # Group the alternatives so the '^' anchor visibly applies to all of
    # them (re.match already anchors at the start, so matching is unchanged).
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None
def getDataPoint(line):
    """Split one timestamped chat line into its four fields.

    The line is expected to look like ``<date>, <time> - <author>: <message>``
    (or ``<date>, <time> - <message>`` when there is no author tag).

    Args:
        line: A chat line that starts with a timestamp.

    Returns:
        A ``(date, time, author, message)`` tuple of strings; ``author`` is
        ``None`` when ``FindAuthor`` does not recognise an author tag.

    Raises:
        ValueError: If the text before the first ``' - '`` does not split
            into exactly two parts on ``', '``.
    """
    # Cut at the FIRST ' - ' only, so a ' - ' that occurs inside the message
    # text is preserved instead of being rewritten to a single space.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    if FindAuthor(message):
        # Likewise cut at the first ': ' only, keeping any later ': ' in the
        # message intact.  When no ': ' is present the whole text becomes the
        # author and the message is empty, as before.
        author, _, message = message.partition(': ')
    else:
        author = None
    return date, time, author, message
parsedData = []  # Rows of [date, time, author, message] that feed the pandas DataFrame below
# Upload your file here
conversationPath = '/content/WhatsApp Chat with Blabla (1).txt'  # chat file
with open(conversationPath, encoding="utf-8") as fp:
    fp.readline()  # Skip the first line of the file (end-to-end encryption notice)
    messageBuffer = []  # Text pieces of the message currently being assembled
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new timestamped line closes the previous message, if any.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # Continuation of a multi-line message.
            messageBuffer.append(line)
    # Flush the final message: no later timestamped line follows it, so the
    # loop above never appends it on its own.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a pandas Dataframe.
df["Date"] = pd.to_datetime(df["Date"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment