-
-
Save mrdaemon/b2621820ea243dee986d to your computer and use it in GitHub Desktop.
Shit regex tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Python 2.7.6 (default, Nov 10 2013, 19:24:18) [MSC v.1500 32 bit (Intel)] on win32 | |
Type "copyright", "credits" or "license()" for more information. | |
>>> import re | |
>>> a = "bob in 28 seconds to mow the lawn" | |
>>> b = "jack and jill in 3 days to go fuck themselves" | |
>>> c = "jack, jill and dr_seuss to eat a dick in 3 days" | |
>>> d = "robert, mike and claire about that shovel in my trunk in 28 years" | |
>>> e = "robert, mike, douche, and oxford_comma to write about lawns to mow in 30 minutes" | |
>>> teststrings = [a,b,c,d,e] | |
>>> # Regexes! Each one captures its token in group(1): _recipient = text from the start of the string up to the first " in"/" to"/" about"; _time = "<digits> <unit>" right after "in", followed by " to", " about" or end of string; _message = text after the first "to "/"about " up to the next " in" or end of string. | |
>>> # Disclaimer: could probably be a bit cleaner. Known limitations: the lazy (.+?) in _message stops at the FIRST " in", so sample d yields "that shovel" rather than "that shovel in my trunk"; _time only accepts the plural unit names listed, so e.g. "1 day" does not match. | |
>>> _recipient = re.compile(r'^(.+?)\s(in|to|about)\b') | |
>>> _time = re.compile(r'\bin\s(\d+\s(?:minutes|hours|seconds|days|years))(?:\sto\b|\sabout\b|$)') | |
>>> _message = re.compile(r'\b(?:to|about)\s(.+?)(?:\sin\b|$)') | |
>>> # Super shit test harness, sorry | |
>>> def testparse(strings): | |
# Print a report for every string in `strings`: the input itself, then the | |
# full match (.group()) and the captured token (.group(1)) produced by each | |
# of the module-level patterns _recipient, _time and _message. | |
# Returns nothing; the report goes to stdout via the Python 2 print statement. | |
# NOTE: .group() is called directly on the result of .search(), which is | |
# None when a pattern does not match -- a line that any of the three | |
# patterns fails to match therefore raises AttributeError. | |
for line in strings: | |
print """ | |
---------------------------------------------------------- | |
INPUT: %s | |
RECIPIENT MATCH: %s | |
RECIPIENT TOKEN: %s | |
TIME MATCH: %s | |
TIME TOKEN: %s | |
MESSAGE MATCH: %s | |
MESSAGE TOKEN: %s | |
""" % ( | |
line, | |
# Each pattern is searched twice on the same line: once for the whole | |
# match and once for the first capture group. | |
_recipient.search(line).group(), | |
_recipient.search(line).group(1), | |
_time.search(line).group(), | |
_time.search(line).group(1), | |
_message.search(line).group(), | |
_message.search(line).group(1)) | |
>>> testparse(teststrings) | |
---------------------------------------------------------- | |
INPUT: bob in 28 seconds to mow the lawn | |
RECIPIENT MATCH: bob in | |
RECIPIENT TOKEN: bob | |
TIME MATCH: in 28 seconds to | |
TIME TOKEN: 28 seconds | |
MESSAGE MATCH: to mow the lawn | |
MESSAGE TOKEN: mow the lawn | |
---------------------------------------------------------- | |
INPUT: jack and jill in 3 days to go fuck themselves | |
RECIPIENT MATCH: jack and jill in | |
RECIPIENT TOKEN: jack and jill | |
TIME MATCH: in 3 days to | |
TIME TOKEN: 3 days | |
MESSAGE MATCH: to go fuck themselves | |
MESSAGE TOKEN: go fuck themselves | |
---------------------------------------------------------- | |
INPUT: jack, jill and dr_seuss to eat a dick in 3 days | |
RECIPIENT MATCH: jack, jill and dr_seuss to | |
RECIPIENT TOKEN: jack, jill and dr_seuss | |
TIME MATCH: in 3 days | |
TIME TOKEN: 3 days | |
MESSAGE MATCH: to eat a dick in | |
MESSAGE TOKEN: eat a dick | |
---------------------------------------------------------- | |
INPUT: robert, mike and claire about that shovel in my trunk in 28 years | |
RECIPIENT MATCH: robert, mike and claire about | |
RECIPIENT TOKEN: robert, mike and claire | |
TIME MATCH: in 28 years | |
TIME TOKEN: 28 years | |
MESSAGE MATCH: about that shovel in | |
MESSAGE TOKEN: that shovel | |
---------------------------------------------------------- | |
INPUT: robert, mike, douche, and oxford_comma to write about lawns to mow in 30 minutes | |
RECIPIENT MATCH: robert, mike, douche, and oxford_comma to | |
RECIPIENT TOKEN: robert, mike, douche, and oxford_comma | |
TIME MATCH: in 30 minutes | |
TIME TOKEN: 30 minutes | |
MESSAGE MATCH: to write about lawns to mow in | |
MESSAGE TOKEN: write about lawns to mow | |
>>> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment