Created
May 2, 2018 07:26
-
-
Save jss367/0bd6dd32bcc8719312f6e56d4b27ca62 to your computer and use it in GitHub Desktop.
Converting dates written in different formats to a standard YYYYMMDD string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Sample inputs, one row per family of date formats the converter is run on.
test_cases = [
    # Fully numeric dates with a day
    '04/30/2009', '06/20/95', '8/2/69', '1/25/2011', '9/3/2002', '4-13-82',
    # Month name first
    'Mar-02-2009', 'Jan 20, 1974', 'March 20, 1990', 'Dec. 21, 2001', 'May 25 2009',
    # Day first, then month name
    '01 Mar 2002', '2 April 2003', '20 Aug. 2004', '20 November, 1993',
    # Month name followed by an ordinal day
    'Aug 10th, 1994', 'Sept 1st, 2005', 'Feb. 22nd, 1988',
    # Month name and year only
    'Sept 2002', 'Sep 2002', 'December, 1998', 'Oct. 2000',
    # Numeric month and year only
    '6/2008', '12/2001',
    # Year only
    '1998', '2002',
]
# Lookup table from a lowercase three-letter month abbreviation to its
# zero-padded two-digit month number (e.g. 'jan' -> '01').
month_dict = {
    'jan': '01',
    'feb': '02',
    'mar': '03',
    'apr': '04',
    'may': '05',
    'jun': '06',
    'jul': '07',
    'aug': '08',
    'sep': '09',
    'oct': '10',
    'nov': '11',
    'dec': '12',
}
def word_to_num(string):
    """
    Look up the two-digit month number for a month name or abbreviation.

    The input is lowercased and cut down to its first three characters, and
    that prefix is used as the key into month_dict. A prefix that is not a
    key of month_dict raises KeyError.

    Example:
    word_to_num('January') -> '01'
    """
    lowered = string.lower()
    prefix = lowered[:3]
    return month_dict[prefix]
# The patterns below are tried in the order listed; the first one that matches
# decides the result. They are raw strings so that regex escapes such as \d and
# \s are not read as (invalid) string escape sequences, and they are compiled
# once at import time instead of on every call.

# MM/DD/YYYY, M/D/YY, or the same with '-' as the separator.
_NUMERIC_DATE_RE = re.compile(
    r'([0]?\d|[1][0-2])[/-]([0-3]?\d)[/-]([1-2]\d{3}|\d{2})')

# DD Month YYYY / D Mon YYYY. The leading day number is required here; inputs
# without a day are handled by _MONTH_YEAR_RE further down.
_DAY_MONTH_YEAR_RE = re.compile(
    r'([0-3]?\d)\s*(Jan(?:uary)?(?:aury)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug('
    r'?:ust)?|Sept?(?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?(?:emeber)?).?,?\s([1-2]\d{3})')

# Month-DD-YYYY, Mon D YYYY, Month DDth, YYYY and similar (month name first).
_MONTH_DAY_YEAR_RE = re.compile(
    r'(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?(?:ember)?|Oct('
    r'?:ober)?|Nov(?:ember)?|Dec(?:ember)?).?[-\s]([0-3]?\d)(?:st|nd|rd|th)?[-,\s]\s*([1-2]\d{3})')

# Month name followed by a four- or two-digit year, with no day.
_MONTH_YEAR_RE = re.compile(
    r'(Jan(?:uary)?(?:aury)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?('
    r'?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?(?:emeber)?).?,?[\s]([1-2]\d{3}|\d{2})')

# MM/YYYY or M YYYY (numeric month, no day).
_NUMERIC_MONTH_YEAR_RE = re.compile(r'([0]?\d|[1][0-2])[/\s]([1-2]\d{3})')

# A bare four-digit year starting with 1 or 2.
_YEAR_ONLY_RE = re.compile(r'([1-2]\d{3})')


def _extract_date_parts(line):
    """Return (year, month, day) strings from the first pattern matching line.

    Parts a pattern does not capture default to '01'; when nothing matches at
    all the result is ('1900', '01', '01'). The returned strings are not yet
    zero-padded or century-expanded.
    """
    found = _NUMERIC_DATE_RE.search(line)
    if found:
        month, day, year = found.groups()
        return year, month, day

    found = _DAY_MONTH_YEAR_RE.search(line)
    if found:
        day, month_name, year = found.groups()
        return year, word_to_num(month_name), day

    found = _MONTH_DAY_YEAR_RE.search(line)
    if found:
        month_name, day, year = found.groups()
        return year, word_to_num(month_name), day

    found = _MONTH_YEAR_RE.search(line)
    if found:
        month_name, year = found.groups()
        return year, word_to_num(month_name), '01'

    found = _NUMERIC_MONTH_YEAR_RE.search(line)
    if found:
        month, year = found.groups()
        return year, month, '01'

    found = _YEAR_ONLY_RE.search(line)
    if found:
        return found.group(1), '01', '01'

    return '1900', '01', '01'


def date_converter(line):
    """
    Find the first date in a line of text and convert it to a YYYYMMDD string.

    Several numeric and month-name layouts are recognised; they are tried in a
    fixed order and the first one that matches is used. A missing day or month
    becomes '01', a day of zero becomes '01', and a two-digit year is prefixed
    with '19'. If no pattern matches, the date falls back to '19000101'.

    Returns a list containing exactly one YYYYMMDD string.

    Example:
    date_converter("It was the May 1st, 2009") -> ['20090501']
    """
    year, month, day = _extract_date_parts(line)

    # Pad single-digit months and days to two characters.
    month = month.zfill(2)
    day = day.zfill(2)
    if day == '00':
        day = '01'
    # Two-digit years are taken to be in the 1900s.
    if len(year) == 2:
        year = '19' + year

    return [year + month + day]
# Convert every sample string and print the collected results; each entry is
# the list returned by date_converter for the corresponding test case.
test_run = list(map(date_converter, test_cases))
print(test_run)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment