Skip to content

Instantly share code, notes, and snippets.

@kurasaiteja
Created July 6, 2020 19:24
Show Gist options
  • Save kurasaiteja/5ba1fac6805c23e7e6d700e331acb42c to your computer and use it in GitHub Desktop.
Save kurasaiteja/5ba1fac6805c23e7e6d700e331acb42c to your computer and use it in GitHub Desktop.
# Prefix of a line that starts a new message in an Android WhatsApp export,
# e.g. "12/25/19, 9:41 PM - ". (The iOS export format is different and is not
# matched by this pattern.)
# Raw string: avoids invalid-escape warnings; "/" needs no escaping in a regex.
_DATE_TIME_PREFIX = re.compile(
    r'^([0-9]+)(/)([0-9]+)(/)([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'
)


def startsWithDateAndTime(s):
    """Return True if ``s`` begins with a ``D/M/YY, H:MM AM|PM -`` stamp.

    Args:
        s: One line of the exported chat.

    Returns:
        True when the line opens with a date/time stamp (i.e. it starts a
        new message), False otherwise.
    """
    return bool(_DATE_TIME_PREFIX.match(s))
# Sender prefixes ("<author>:") recognised at the start of a message body.
# Raw strings keep the regex escapes (\w, \s, \d, \u, \U) away from Python's
# own string-escape handling. No leading "^" is needed: the pattern is only
# ever used with ``match``, which already anchors at the start of the string.
_AUTHOR_PREFIX = re.compile(
    '|'.join([
        r'([\w]+):',  # First Name
        r'([\w]+[\s]+[\w]+):',  # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',  # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',  # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',  # Mobile Number (US)
        r'([\w]+)[\u263a-\U0001f999]+:',  # Name and Emoji
    ])
)


def FindAuthor(s):
    """Report whether ``s`` starts with a recognised author prefix.

    Despite its name, this function does not return the author; it only
    tells the caller whether one of the known ``<author>:`` formats is
    present at the beginning of ``s``.

    Args:
        s: Message text with the date/time stamp already removed.

    Returns:
        True if an author prefix is found at the start of ``s``, else False.
    """
    return bool(_AUTHOR_PREFIX.match(s))
def getDataPoint(line):
    """Break one stamped chat line into its date, time, author and message.

    The line is expected to look like ``<date>, <time> - <author>: <text>``.
    Only the FIRST ``' - '`` and the FIRST ``': '`` are treated as
    separators, so message text that itself contains ``' - '`` or ``': '``
    is returned unchanged instead of having those separators stripped out.

    Args:
        line: A chat line that starts with a date/time stamp.

    Returns:
        A ``(date, time, author, message)`` tuple of strings. ``author`` is
        None when ``FindAuthor`` does not recognise a sender prefix; in that
        case ``message`` is everything after the first ``' - '``.

    Raises:
        ValueError: If the part before the first ``' - '`` does not consist
            of exactly two pieces separated by ``', '``.
    """
    # Everything before the first ' - ' is the stamp; the rest is the body.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    author = None
    if FindAuthor(message):
        # Peel off the sender; the remainder keeps any later ': ' intact.
        author, _, message = message.partition(': ')
    return date, time, author, message
parsedData = []  # Rows of [date, time, author, message] used to build the pandas DataFrame
# Upload your file here
conversationPath = '/content/WhatsApp Chat with Blabla (1).txt'  # chat file
with open(conversationPath, encoding="utf-8") as fp:
    # Skip the first line of the export; per the original author it holds the
    # end-to-end encryption notice rather than a chat message.
    fp.readline()
    messageBuffer = []  # Pieces of the message currently being assembled
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new stamped line means the buffered message is complete.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # No stamp: this line continues the current multi-line message.
            messageBuffer.append(line)
    # Flush the final message. Nothing follows it in the file to trigger the
    # flush inside the loop, so without this the last message would be lost.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a pandas Dataframe.
# Convert the date strings to datetimes; the day/month order is inferred by pandas.
df["Date"] = pd.to_datetime(df["Date"])
@kiran6747
Copy link

It's a great honour that you shared your code with us. I ran into a difficulty while uploading my .txt file; can anyone help solve this problem?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment