Created
July 6, 2020 19:24
-
-
Save kurasaiteja/5ba1fac6805c23e7e6d700e331acb42c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def startsWithDateAndTime(s):
    """Return True if *s* begins with an Android WhatsApp-export timestamp.

    The accepted prefix has the shape ``12/31/20, 9:45 PM -``: two numeric
    fields and a two-digit year separated by slashes, a comma, a 12-hour
    clock time followed by ``AM`` or ``PM``, and a trailing `` -``.

    Works only for the Android export format; the iOS WhatsApp export
    format is different and is not recognised.

    Args:
        s: One line of the exported chat file.

    Returns:
        bool: True when the line starts with such a timestamp, else False.
    """
    # Raw string so the regex escapes reach ``re`` untouched; in a normal
    # string literal ``\/`` is an invalid escape sequence and newer Python
    # versions warn about it.
    pattern = r'^([0-9]+)(\/)([0-9]+)(\/)([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'
    return re.match(pattern, s) is not None
# Finds username of any given format. | |
def FindAuthor(s):
    """Return True if *s* starts with a recognised author prefix.

    An author prefix is one of the shapes listed in ``patterns`` followed
    immediately by a colon: one, two or three whitespace-separated words,
    a phone number written as ``+NN NNNNN NNNNN`` or ``+NN NNN NNN NNNN``,
    or a single word followed directly by one or more characters in the
    range U+263A..U+1F999 (emoji).

    Args:
        s: Message text with the leading timestamp already removed.

    Returns:
        bool: True when the text begins with such a prefix, else False.
    """
    # Raw strings keep ``\w``, ``\s`` and ``\d`` as regex escapes instead
    # of invalid string escapes; ``re`` itself understands ``\uXXXX`` and
    # ``\UXXXXXXXX`` so the emoji range is unchanged.
    patterns = [
        r'([\w]+):',                        # First Name
        r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        r'([\w]+)[\u263a-\U0001f999]+:',    # Name and Emoji
    ]
    # Group the alternatives so the ``^`` anchor applies to all of them,
    # not just the first one.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None
def getDataPoint(line):
    """Split one timestamped chat line into its four fields.

    The line is expected to look like ``<date>, <time> - <author>: <text>``.
    The part before the first `` - `` is the timestamp, which is split on
    ``', '`` into date and time. If ``FindAuthor`` recognises an author
    prefix in the remainder, the text before the first ``': '`` becomes the
    author and the rest the message; otherwise the author is None and the
    whole remainder is the message.

    Args:
        line: A chat line that starts with a date and time.

    Returns:
        tuple: ``(date, time, author, message)``; ``author`` is None when
        no author prefix is recognised.

    Raises:
        ValueError: If the timestamp part does not split into exactly two
            pieces on ``', '``.
    """
    # Split only on the FIRST ' - ' so dashes inside the message survive
    # (splitting on every occurrence and re-joining with ' ' dropped them).
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    if FindAuthor(message):
        # Likewise split only on the first ': ' so that colons inside the
        # message text are preserved.
        author, _, message = message.partition(': ')
    else:
        author = None
    return date, time, author, message
# Parse the exported chat into rows of [date, time, author, message].
# A message may span several physical lines: every line that does not start
# with a timestamp is treated as a continuation of the current message.
parsedData = []  # List to keep track of data so it can be used by a Pandas dataframe
# Upload your file here
conversationPath = '/content/WhatsApp Chat with Blabla (1).txt'  # chat file
with open(conversationPath, encoding="utf-8") as fp:
    # Skipping first line of the file because it contains information
    # related to end-to-end encryption.
    fp.readline()
    messageBuffer = []  # pieces of the message currently being assembled
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new message starts here: store the one collected so far.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # Continuation line of a multi-line message.
            messageBuffer.append(line)
    # Flush the final message; without this the last message of the file
    # would never be added to parsedData.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a pandas Dataframe.
df["Date"] = pd.to_datetime(df["Date"])
It's a great honour that you shared your code with us. I ran into a difficulty while uploading my .txt file; can anyone help me solve this problem?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
First, thanks for sharing this code and the great article on Medium!
I think you're forgetting to add the last row of the file to 'df'. To fix it, you could simply add lines 51 and 52 just before the break on line 48 (otherwise the last line is not added to parsedData).
Other than that - great job, and thanks again!