Created
November 2, 2020 02:45
-
-
Save Rafastoievsky/4055397ecb078abdbc6b7902a76415e2 to your computer and use it in GitHub Desktop.
WhatsApp Group chat analysis: cleaning data functions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def startsWithDateAndTime(s: str) -> bool:
    """Return True when *s* begins with a chat timestamp header.

    The expected prefix has the shape ``N/N/N, H:MM AM -`` where each ``N``
    is one or two digits and the meridiem marker may be upper or lower case
    (for example ``11/2/20, 2:45 AM -``). Anything else, including the
    empty string, yields False.
    """
    # Raw string: in a plain literal ``\d`` and ``\S`` are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    # NOTE(review): the ``\S`` after the minute digits consumes one
    # non-whitespace character (in practice the last minute digit); it is
    # kept exactly as in the original pattern so matching is unchanged.
    pattern = r'^\d{1,2}/\d{1,2}/\d{1,2}, \d{1,2}:\d{1,2}\S [AaPp][Mm] -'
    return re.match(pattern, s) is not None
def FindAuthor(s: str) -> bool:
    """Return True when *s* begins with an author prefix ending in ``:``.

    Recognised prefixes are one, two or three whitespace-separated words
    followed by a colon, or a single word followed by one or more
    characters in the range U+263A..U+1F999 (emoji) and then a colon.
    """
    patterns = [
        r'([\w]+):',                      # first name
        r'([\w]+[\s]+[\w]+):',            # first name + surname
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',  # first + middle name + surname
        r'([\w]+)[\u263a-\U0001f999]+:',  # first name followed by emoji
    ]
    # Group the alternatives so the ``^`` anchor applies to every one of
    # them, not only the first. re.match already anchors at the start, so
    # the set of accepted strings is the same as before.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None
def getDataPoint(line: str):
    """Split one chat line into ``(dateTime, author, message)``.

    The text before the first ``' - '`` is returned as ``dateTime``. The
    remainder is checked with ``FindAuthor``; when it reports an author
    prefix, the text before the first ``': '`` becomes ``author`` and the
    rest becomes ``message``. Otherwise ``author`` is None and the whole
    remainder is the message.
    """
    # partition() splits on the FIRST separator only, so any later ' - '
    # inside the message body is preserved verbatim instead of being
    # collapsed to a single space by a split/join round trip.
    dateTime, _, message = line.partition(' - ')

    author = None
    if FindAuthor(message):
        # Same reasoning for ': ' — only the first one delimits the author.
        # If ': ' is absent, partition leaves the whole text in `author`
        # and an empty message, matching the previous behaviour.
        author, _, message = message.partition(': ')
    return dateTime, author, message
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment