Created
December 12, 2014 07:23
-
-
Save mrdaemon/db48a85da16000dde12f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize(string):
    """Split a reminder request into rough recipient/time/message tokens.

    The request is expected to start with the recipient(s), followed by a
    time block introduced by "in <number> <unit>" and a message block
    introduced by "to" or "about". The time and message blocks may come
    in either order. Matching is case-insensitive.

    Returns a dict with three keys:
        "recipient": list of unvalidated names, split on "," and "and".
        "time":      whitespace-split pieces of the time block, normally
                     the numeric value followed by the time unit.
        "message":   the message text, kept as a single string.

    Raises ReminderSyntaxError when any of the three blocks is missing.
    """
    # Regular expressions to extract blocks.
    # Recipients: everything up to the first "in", "to" or "about".
    _recipient = re.compile(r'^(.+?)\s(in|to|about)\b', re.IGNORECASE)
    # Time: "in" followed by a number, up to the message keyword or the end.
    _time = re.compile(r'\bin\s(\d+\s\b.+?\b)(?:\sto\b|\sabout\b|$)',
                       re.IGNORECASE)
    # Message: text after "to"/"about". It only ends where the time block
    # starts ("in" followed by a digit), so a plain "in" inside the message
    # ("put the milk in the fridge") no longer truncates it.
    _message = re.compile(r'\b(?:to|about)\s(.+?)(?:\sin\s\d|$)',
                          re.IGNORECASE)

    recipientmatch = _recipient.search(string)
    timematch = _time.search(string)
    messagematch = _message.search(string)

    if recipientmatch is None:
        raise ReminderSyntaxError("Remind who?")

    if timematch is None:
        raise ReminderSyntaxError("Remind when?")

    if messagematch is None:
        raise ReminderSyntaxError("Remind what..?")

    # Fetch groups, process further and pack as tokens.

    # Construct rough, unvalidated list of usernames, using
    # "and" and "," as delimiters to split on. The further
    # tokenization simplifies the parser's operation.
    recipientdelims = re.compile(r'\b,\sand\b\s|\b,\s|\s\band\b\s',
                                 re.IGNORECASE)
    recipientblock = recipientdelims.split(recipientmatch.group(1))

    # Time is just split into unvalidated elements. Should contain two:
    # the numerical value and the human readable time unit.
    timeblock = timematch.group(1).split()

    # Message block is just kept as-is, as a string.
    # It never really gets processed anyway.
    messageblock = messagematch.group(1)

    return {
        "recipient": recipientblock,
        "time": timeblock,
        "message": messageblock,
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment