Skip to content

Instantly share code, notes, and snippets.

@mrdaemon
Created December 12, 2014 07:23
Show Gist options
  • Save mrdaemon/db48a85da16000dde12f to your computer and use it in GitHub Desktop.
Save mrdaemon/db48a85da16000dde12f to your computer and use it in GitHub Desktop.
# Patterns are compiled once at import time instead of on every call.

# Recipient block: everything from the start of the string up to the first
# standalone "in", "to" or "about" keyword.
_RECIPIENT_RE = re.compile(r'^(.+?)\s(?:in|to|about)\b', re.IGNORECASE)

# Time block: "in <digits> <words...>", ending at " to", " about" or the
# end of the string.
_TIME_RE = re.compile(r'\bin\s(\d+\s\b.+?\b)(?:\sto\b|\sabout\b|$)',
                      re.IGNORECASE)

# Message block: the text following "to"/"about". It only ends where a time
# block actually starts (" in <digits> ") or at the end of the string, so a
# plain word "in" inside the message ("put the cake in the oven") does not
# cut the message short.
_MESSAGE_RE = re.compile(r'\b(?:to|about)\s(.+?)(?=\sin\s\d+\s|$)',
                         re.IGNORECASE)

# Recipient separators: ", and ", ", " or " and ".
_RECIPIENT_DELIMS_RE = re.compile(r'\b,\sand\b\s|\b,\s|\s\band\b\s',
                                  re.IGNORECASE)


def tokenize(string):
    """Split a reminder request into recipient, time and message tokens.

    The request is expected to look like
    ``<recipients> in <number> <unit> to|about <message>``; the time and
    message parts may appear in either order after the recipients.

    Surrounding whitespace is ignored, so a trailing space no longer makes
    the time block unparseable and a leading space no longer ends up in the
    first recipient name.

    Returns a dict with three keys:
      "recipient" -- list of unvalidated names, split on "," and "and"
      "time"      -- the time block split on whitespace (normally the
                     numeric value followed by the human readable unit)
      "message"   -- the message text, kept as a single string

    Raises ReminderSyntaxError when the recipient, time or message part
    cannot be found.
    """
    string = string.strip()

    recipientmatch = _RECIPIENT_RE.search(string)
    timematch = _TIME_RE.search(string)
    messagematch = _MESSAGE_RE.search(string)

    if recipientmatch is None:
        raise ReminderSyntaxError("Remind who?")
    if timematch is None:
        raise ReminderSyntaxError("Remind when?")
    if messagematch is None:
        raise ReminderSyntaxError("Remind what..?")

    # Construct a rough, unvalidated list of usernames, using "and" and ","
    # as delimiters to split on. The further tokenization simplifies the
    # parser's operation.
    recipientblock = _RECIPIENT_DELIMS_RE.split(recipientmatch.group(1))

    # Time is just split into unvalidated elements. It should contain two:
    # the numerical value and the human readable time unit.
    timeblock = timematch.group(1).split()

    # The message block is kept as-is, as a single string.
    messageblock = messagematch.group(1)

    return {"recipient": recipientblock,
            "time": timeblock,
            "message": messageblock}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment