Created
June 6, 2012 22:44
-
-
Save ptgolden/2885300 to your computer and use it in GitHub Desktop.
Natural language date parser (for historical dates)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Any four-digit number from 1800 through 2099; used as a cheap "does this
# text mention a year at all?" test.
year_part = re.compile(r'(?:1[89]|20)\d{2}')

# Month and season name patterns. A pattern's position + 1 is the number
# that parse_date_tokens writes into the month slot of its output, so list
# order is significant. Every pattern is anchored with `$` so that words
# which merely start like a month name ("decade", "summary") are rejected.
months = [r'jan(?:uary)?$', r'feb(?:ruary)?$', r'mar(?:ch)?$',
          r'apr(?:il)?$', r'may$', r'jun(?:e)?$',
          r'jul(?:y)?$', r'aug(?:ust)?$', r'sep(?:t|tember)?$',
          r'oct(?:ober)?$', r'nov(?:ember)?$', r'dec(?:ember)?$',
          # Eight padding entries push the seasons to positions 21-24.
          # They duplicate December, which always wins at position 12, so
          # the padding itself never decides a month number.
          # NOTE(review): 21-24 look like EDTF season codes -- confirm.
          r'dec$', r'dec$', r'dec$', r'dec$',
          r'dec$', r'dec$', r'dec$', r'dec$',
          r'spr(?:ing)?$', r'sum(?:mer)?$', r'(?:aut|autumn|fall)$',
          r'win(?:ter)?$']

# Day-of-month (with optional ordinal suffix) and four-digit year tokens.
numbers = [r'[0-9]{1,2}(?:st|nd|rd|th)?$', r'[0-9]{4}$']

# Words and symbols that join two dates or date parts.
joiners = ['-', 'from$', 'to$', ',', 'and', '[/]']

# Markers of an uncertain or approximate date.
unsure = ['[?]', 'c(?:irca|a)?$', 'ab(?:ou)?t']

# Non-month tokens that may still belong to a date expression: numbers,
# whitespace, punctuation and the uncertainty markers '?', '~' and 'ca'.
other = [r'[0-9]{1,2}(?:st|nd|rd|th)?$', r'[0-9]{4}$', ' ', '-', '[?]',
         '[.]', '[/]', '~', ',', '^ca?[.]?$']
def tokenize(string):
    """Break *string* into word tokens and single-character separators.

    The text is split on every non-word character; the separators are
    kept as tokens of their own and empty pieces are discarded.

    Returns None when nothing in *string* matches ``year_part``.
    """
    if re.search(year_part, string) is None:
        return None
    pieces = re.split(r'(\W)', string)
    return list(filter(None, pieces))
def get_date_tokens_from_string(string):
    """Extract the tokens of the date expression embedded in *string*.

    The last token containing a year strictly between 1800 and 2013 is the
    anchor. The result is the contiguous run of tokens around it that look
    like date parts (month or season names, numbers, spaces, punctuation,
    '?', '~', 'ca'), with leading and trailing punctuation stripped.

    Returns the token list produced by ``tokenize`` for the cleaned date
    text, or None when *string* has no usable year or the year token is
    not itself a valid date part.
    """
    tokens = tokenize(string)
    if not tokens:
        return None

    # Anchor on the LAST in-range year. Without one there is nothing to
    # extract; do not fall back to treating token 0 as the year.
    year_index = None
    for counter, token in enumerate(tokens):
        match = re.findall(year_part, token)
        if match and 1800 < int(match[0]) < 2013:
            year_index = counter
    if year_index is None:
        return None

    valid_date_parts = re.compile(r'|'.join(months + other), re.I)

    # A year buried in a larger word (e.g. "c1900") is not a date token.
    if not valid_date_parts.match(tokens[year_index]):
        return None

    # Grow the window outwards while the neighbouring token is a valid
    # date part. date_begin is inclusive and date_end exclusive, so a date
    # that touches the first or last token keeps that token.
    date_begin = year_index
    while date_begin > 0 and valid_date_parts.match(tokens[date_begin - 1]):
        date_begin -= 1
    date_end = year_index + 1
    while (date_end < len(tokens)
           and valid_date_parts.match(tokens[date_end])):
        date_end += 1

    date_str = ''.join(tokens[date_begin:date_end])

    # Remember the uncertainty markers before the strip removes them, then
    # re-append them so they survive as trailing tokens.
    add_q = '?' in date_str
    add_tilde = '~' in date_str
    date_str = re.sub(r'^\W+|\W+$', '', date_str)
    if add_q:
        date_str += '?'
    if add_tilde:
        date_str += '~'

    return tokenize(date_str)
# One- or two-digit day number with an optional English ordinal suffix.
_DAY_TOKEN = re.compile(r'([0-9]{1,2})(?:st|nd|rd|th)?$')


def _find_year(tokens):
    """Return (token, index) of the last token starting with four digits, or None."""
    found = None
    for index, token in enumerate(tokens):
        if re.match(r'\d{4}', token):
            found = (token, index)
    return found


def _find_month(tokens):
    """Return (number, index) for the first ``months`` pattern matching a token, else (0, 0)."""
    for number, pattern in enumerate(months, 1):
        for index, token in enumerate(tokens):
            if re.match(pattern, token, re.I):
                return (number, index)
    return (0, 0)


def _find_day(tokens, month_index, max_distance=4):
    """Return the digits of the day token nearest the month token, or None."""
    for distance in range(1, max_distance + 1):
        # Look after the month first, then before it. Positions outside the
        # list are skipped so they neither raise nor wrap round to the end.
        for index in (month_index + distance, month_index - distance):
            if 0 <= index < len(tokens):
                hit = _DAY_TOKEN.match(tokens[index])
                if hit:
                    return hit.group(1)
    return None


def parse_date_tokens(tokens):
    """Turn date tokens into a compact date string.

    Args:
        tokens: list of string tokens, e.g. the result of
            ``get_date_tokens_from_string``.

    Returns:
        * '' when *tokens* is empty or None, or when a year is needed but
          no token starts with four digits;
        * the token itself for a single token;
        * 'A/B' for exactly three tokens of the form A, '-', B;
        * otherwise 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD', where MM is the
          1-based position of the matching ``months`` pattern. The day is
          searched for within four tokens of the month, and only when there
          are not exactly three tokens.
        A '?' is appended when '?', 'ca' or 'c' is among the tokens, and a
        '~' when '~' is among them.
    """
    if not tokens:
        return ''

    if len(tokens) == 1:
        date_str = tokens[0]
    elif len(tokens) == 3 and tokens[1] == '-':
        date_str = '%s/%s' % (tokens[0], tokens[2])
    else:
        year_hit = _find_year(tokens)
        if year_hit is None:
            # No year token: report "no date" instead of crashing.
            return ''
        date_str = '%s' % year_hit[0]
        month, month_index = _find_month(tokens)
        if month:
            date_str += '-%02d' % month
            # Three-token input is treated as month + year only.
            if len(tokens) != 3:
                day = _find_day(tokens, month_index)
                if day is not None:
                    date_str += '-%02d' % int(day)

    if '?' in tokens or 'ca' in tokens or 'c' in tokens:
        date_str += '?'
    if '~' in tokens:
        date_str += '~'
    return date_str
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment