Skip to content

Instantly share code, notes, and snippets.

@kurasaiteja
Created July 6, 2020 19:24
Show Gist options
  • Save kurasaiteja/5ba1fac6805c23e7e6d700e331acb42c to your computer and use it in GitHub Desktop.
Save kurasaiteja/5ba1fac6805c23e7e6d700e331acb42c to your computer and use it in GitHub Desktop.
# Stamp that opens every message line in an Android WhatsApp export, e.g.
# "7/6/20, 7:24 PM -".  The iOS export format is different and is not handled.
# Written as a raw string so the regex escapes are not treated as (invalid)
# Python string escapes.
_DATE_TIME_PREFIX = r'^([0-9]+)(\/)([0-9]+)(\/)([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'


def startsWithDateAndTime(s):
    """Tell whether *s* begins with a date/time stamp.

    The accepted shape is ``<digits>/<digits>/<2 digits>, <digits>:<2 digits>
    AM|PM -`` at the very start of the string.  Four-digit years and 24-hour
    times do not match.

    Args:
        s: One line of the chat export.

    Returns:
        True if the line starts with the stamp, False otherwise.

    Note:
        Uses the ``re`` module, which must already be imported where this
        function lives.
    """
    return re.match(_DATE_TIME_PREFIX, s) is not None
# Recognised shapes of the sender prefix; each alternative ends at the colon
# that separates the sender from the message text.  Raw strings keep the regex
# escapes from being read as (invalid) Python string escapes.
_AUTHOR_PATTERNS = (
    r'([\w]+):',                        # First Name
    r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
    r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
    r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
    r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
    r'([\w]+)[\u263a-\U0001f999]+:',    # Name and Emoji
)
# The alternatives are wrapped in one group so the '^' anchor applies to all
# of them, not just the first.
_AUTHOR_PREFIX = '^(?:' + '|'.join(_AUTHOR_PATTERNS) + ')'


def FindAuthor(s):
    """Tell whether *s* starts with a recognisable sender prefix.

    Despite its name, this function does not return the author; it only
    reports whether one of the patterns in ``_AUTHOR_PATTERNS`` matches at the
    start of *s*.  Names of one to three words, the two phone-number layouts
    and a single-word name followed by emoji are recognised; anything else
    (for example a four-word name) is not.

    Args:
        s: The part of an export line that follows the date/time stamp.

    Returns:
        True if a sender prefix is found, False otherwise.

    Note:
        Uses the ``re`` module, which must already be imported where this
        function lives.
    """
    # NOTE(review): both phone patterns require a two-digit country code, so
    # a "+1 ..." number is not recognised -- confirm against real exports.
    return re.match(_AUTHOR_PREFIX, s) is not None
def getDataPoint(line):
    """Break one stamped export line into its four fields.

    The line is expected to look like ``<date>, <time> - <author>: <message>``
    or, for lines without a sender, ``<date>, <time> - <message>``.

    Args:
        line: A line for which ``startsWithDateAndTime`` is true.

    Returns:
        A ``(date, time, author, message)`` tuple of strings; ``author`` is
        None when ``FindAuthor`` does not recognise a sender prefix.

    Raises:
        ValueError: If the part before the first ' - ' does not contain
            exactly one ', ' separator.
    """
    # Split on the FIRST ' - ' only: any later ' - ' is part of the message
    # text and must be kept as written.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    author = None
    if FindAuthor(message):
        # FindAuthor guarantees the sender ends at the first colon, so cut
        # there once; further colons belong to the message itself.
        author, _, message = message.partition(':')
        # Drop the single space that follows the colon, if present (it is
        # missing when the message is empty and the line was stripped).
        if message.startswith(' '):
            message = message[1:]
    return date, time, author, message
# NOTE: this script relies on `pd` (pandas) and, through the helper functions,
# on `re` being imported already; no import is visible in this snippet.
parsedData = []  # Rows of [date, time, author, message] used to build the DataFrame
# Upload your file here
conversationPath = '/content/WhatsApp Chat with Blabla (1).txt'  # chat file
with open(conversationPath, encoding="utf-8") as fp:
    # Skip the first line of the file: it carries the end-to-end encryption
    # notice rather than a message.
    fp.readline()
    messageBuffer = []  # Pieces of the message currently being assembled
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new stamped line ends the previous message, so store it first.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # Unstamped line: continuation of a multi-line message.
            messageBuffer.append(line)
    # The loop only stores a message when the NEXT stamped line shows up, so
    # the final message of the file is still in the buffer here; store it too.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a pandas Dataframe.
# NOTE(review): the date format is left to pandas' inference; whether the
# export is month-first or day-first is not visible here -- confirm.
df["Date"] = pd.to_datetime(df["Date"])
@DorHason
Copy link

First, thanks for sharing this code and the great article on Medium!
I think you're forgetting to add the last row of the file to 'df'. To fix it, simply add lines 51 and 52 again just before the break in line 48 (otherwise the last line is never added to parsedData).
Other than that - great job, and thanks again!

@kiran6747
Copy link

It's a great honour that you shared your code with us. I ran into a difficulty while uploading my .txt file; can anyone help solve this problem?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment