Last active
May 5, 2018 04:54
-
-
Save bluenex/2347ae70f421a1b514831937cb7ff296 to your computer and use it in GitHub Desktop.
Regex patterns for kk's date 😂
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"5/1/1017 : 5/1/2017\n", | |
"a-001 : ---- REALLY NO IDEA, GO WITH 5/1/2017? ----\n", | |
"10/12017 : 10/1/2017\n", | |
"131/2017 : 13/1/2017\n", | |
"19/1//2017 : 19/1/2017\n", | |
"25/1/217 : 25/1/2017\n", | |
"27/12017 : 27/1/2017\n", | |
"30/1/201 : ---- THE 201X ----\n", | |
"29/1/201 : ---- THE 201X ----\n", | |
"30/1/25017 : 30/1/2017\n", | |
"22/2/017 : 22/2/2017\n", | |
"23/2/201 : ---- THE 201X ----\n", | |
"5/2/018 : 5/2/2018\n", | |
"7/2/20108 : 7/2/2018\n", | |
"10/2/20188 : 10/2/2018\n", | |
"12/2/018 : 12/2/2018\n", | |
"14/2/22018 : 14/2/2018\n", | |
"f : ---- REALLY NO IDEA, GO WITH 14/2/2018? ----\n", | |
"22/2/20188 : 22/2/2018\n", | |
"6/3/22018 : 6/3/2018\n", | |
"12/3/20118 : 12/3/2018\n", | |
"12/3/118 : 12/3/2018\n", | |
"25/3/20181 : 25/3/2018\n", | |
"27/3/20018 : 27/3/2018\n", | |
"29/3/20188 : 29/3/2018\n", | |
"3/3/2/2017 : 3/2/2017\n", | |
"9/3//2017 : 9/3/2017\n", | |
"13/22/2017 : ---- OVER 12 -> 13/22/2017 ----\n", | |
"ฟ-001 : ---- REALLY NO IDEA, GO WITH 9/3/2017? ----\n", | |
"36/3/17 : ---- OVER 31 -> 36/3/2017 ----\n", | |
"2/13/17 : ---- OVER 12 -> 2/13/2017 ----\n", | |
"7/4/217 : 7/4/2017\n", | |
"10/4/20107 : 10/4/2017\n", | |
"21/4/20117 : 21/4/2017\n", | |
"23/4/20174 : 23/4/2017\n", | |
"13/4/20107 : 13/4/2017\n", | |
"254/4/17 : ---- OVER 31 -> 254/4/17 ----\n", | |
"27/4/20117 : 27/4/2017\n", | |
"14/15/2017 : ---- OVER 12 -> 14/15/2017 ----\n", | |
"18/05/2017 : 18/05/2017\n", | |
"19/5/2017000 : 19/5/2017\n", | |
"00:00:00 : ---- REALLY NO IDEA, GO WITH 19/5/2017? ----\n", | |
"25/05/2017 : 25/05/2017\n", | |
"29/05/2017 : 29/05/2017\n", | |
"31/05/2017 : 31/05/2017\n", | |
"1/6/22017 : 1/6/2017\n", | |
"12/6/201 : ---- THE 201X ----\n", | |
"7/72017 : 7/7/2017\n", | |
"87/72017 : ---- OVER 31 -> 87/7/2017 ----\n", | |
"11/7//2017 : 11/7/2017\n", | |
"12/72017 : 12/7/2017\n", | |
"12/7/22017 : 12/7/2017\n", | |
"21/7/20147 : 21/7/2017\n", | |
"26/72017 : 26/7/2017\n", | |
"29/7/20147 : 29/7/2017\n", | |
"1/8/20170 : 1/8/2017\n", | |
"12/8/20017 : 12/8/2017\n", | |
"3/9/20170 : 3/9/2017\n", | |
"10/9/25017 : 10/9/2017\n", | |
"19/9/201 : ---- THE 201X ----\n", | |
"22/9/20147 : 22/9/2017\n", | |
"274/9/2017 : ---- OVER 31 -> 274/9/2017 ----\n", | |
"27/9/20107 : 27/9/2017\n", | |
"2/10/201 : ---- THE 201X ----\n", | |
"ภ: ---- REALLY NO IDEA, GO WITH 27/9/2017? ----\n", | |
"8/101/2017 : ---- REALLY NO IDEA, GO WITH 27/9/2017? ----\n", | |
"12/10/017 : 12/10/2017\n", | |
"16/10/20147 : 16/10/2017\n", | |
" 17/10/2017 : 17/10/2017\n", | |
"1/11/117 : 1/11/2017\n", | |
"11/11/22017 : 11/11/2017\n", | |
" 17/11/2017 : 17/11/2017\n", | |
"11/12/201 : ---- THE 201X ----\n", | |
"14/12/201 : ---- THE 201X ----\n", | |
"19/12/201 : ---- THE 201X ----\n", | |
"21/12/20147 : 21/12/2017\n", | |
"22/12/22017 : 22/12/2017\n", | |
"27/12/22017 : 27/12/2017\n", | |
"29/17/2017 : ---- OVER 12 -> 29/17/2017 ----\n", | |
"11/12018 : 11/1/2018\n", | |
"16/1/22018 : 16/12/2018\n", | |
"18/1/018 : 18/10/2018\n", | |
"19/1/22018 : 19/12/2018\n", | |
"26/1/201/8 : 26/1/2018\n" | |
] | |
} | |
], | |
"source": [ | |
"tmpdate = [i for i in df_cleaned['วันที่'].unique() if not isinstance(i, datetime.date)]\n", | |
"\n", | |
"# multiple patterns as a list\n", | |
"# https://stackoverflow.com/a/46328646/4010864 \n", | |
"\n", | |
"last = ''\n", | |
"\n", | |
"dateRegexPatterns = [\n", | |
" # remove starting whitespace\n", | |
" (r'^\\s', r''),\n", | |
" # 30/12017 > 30/1/2017\n", | |
" (r'(^\\d{1,2}/)(\\d{1})([2017]{4})$', r'\\1\\2/\\3'),\n", | |
" # 131/2017 > 13/1/2017\n", | |
" (r'(^[1-3][0-9])([0-9])/([20178]{4})$', r'\\1/\\2/\\3'),\n", | |
" # 30/1/25017 > 30/1/2017, 19/1//2017 > 19/1/2017, 5/1/1017 > 5/1/2017\n", | |
" (r'(^\\d{1,2}/\\d+/)(/?\\d*7\\d*)$', r'\\g<1>2017'),\n", | |
" # 16/1/22018 > 16/12/2018\n", | |
" (r'(^\\d{1,2}/)([1])/([0-2])(\\d*8\\d*)$', r'\\1\\2\\3/\\4'),\n", | |
" # 30/1/25018 > 30/1/2018\n", | |
" (r'(^\\d{1,2}/\\d+/)(/?\\d*8\\d*)$', r'\\g<1>2018'),\n", | |
" # 10/12018 > 10/1/2018, 27/12018 > 27/1/2018\n", | |
" (r'^(\\d+\\/)(\\d+)(\\d{3}8)$', r'\\1\\2/\\3'),\n", | |
" # 3/3/2/2017 > 3/2/2017\n", | |
" (r'^(\\d+)/(\\d+)/(\\d+)/(\\d*7\\d*)$', r'\\2/\\3/\\4'),\n", | |
" # 26/1/201/8 > 26/1/2018\n", | |
" (r'^(\\d+/)(\\d+/)(\\d*)/(\\d*)', r'\\1\\2\\3\\4'), \n", | |
" # remove ending whitespace \n", | |
" (r'$|\\s', r''),\n", | |
"]\n", | |
"\n", | |
"# apply all patterns\n", | |
"def regexDate(dateString):\n", | |
" for old, new in dateRegexPatterns:\n", | |
" dateString = re.sub(old, new, dateString)\n", | |
" return dateString\n", | |
"\n", | |
"# loop through each date\n", | |
"for date in tmpdate:\n", | |
" passRegex = str(date)\n", | |
" passRegex = regexDate(passRegex)\n", | |
" \n", | |
" # filter normies and non-normies\n", | |
" normies = re.match(r'(^[0-9]|[0-2][0-9]|[3][0-1])/([1-9]|0[1-9]|1[0-2])/(201[7-8]$)', passRegex)\n", | |
" the201 = re.match(r'.*/201$', passRegex)\n", | |
" over31 = re.match(r'[^0-2][2-9]|[0-9]{3}/.*', passRegex)\n", | |
" over12 = re.match(r'.*/(1[^0-2]|[^0-1][0-9])/.*', passRegex)\n", | |
" \n", | |
" # debugging -- to see what is not in pattern yet\n", | |
" if passRegex is not '' and not normies:\n", | |
"# passRegex = '---- PASS REGEX BUT STILL NOT PARSEABLE -> {0} ----'.format(passRegex)\n", | |
"\n", | |
" if over12:\n", | |
" passRegex = '---- OVER 12 -> {0} ----'.format(passRegex)\n", | |
" \n", | |
" if over31:\n", | |
" passRegex = '---- OVER 31 -> {0} ----'.format(passRegex)\n", | |
" \n", | |
" if the201:\n", | |
" passRegex = '---- THE 201X ----'\n", | |
"\n", | |
" # if it pass no filter, the value is still the same\n", | |
" # if really not possible to get pattern, use last iter?\n", | |
" if passRegex == str(date) and not normies:\n", | |
" passRegex = '---- REALLY NO IDEA, GO WITH {0}? ----'.format(last)\n", | |
" elif not normies:\n", | |
" pass\n", | |
" else:\n", | |
" last = passRegex\n", | |
"\n", | |
" print(\"{0} : {1}\".format(str(date), passRegex))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment