Created
April 9, 2019 16:40
-
-
Save dipanjanS/d5eb49928016a94c1fbdfe20ed707392 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.functions import udf | |
# Three-letter English month abbreviations, as they appear in Common Log
# Format timestamps, mapped to their 1-based month number.
month_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7,
    'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}


def parse_clf_time(text):
    """Convert a Common Log Format time into a timestamp-castable string.

    The fields are read from fixed character positions, so ``text`` must
    start directly at the two-digit day (no leading bracket).

    Args:
        text (str): date and time in Apache time format
            [dd/mmm/yyyy:hh:mm:ss (+/-)zzzz]. ``None`` is accepted and
            passed through.

    Returns:
        A ``'yyyy-mm-dd hh:mm:ss'`` string suitable for passing to
        CAST('timestamp'), or ``None`` when ``text`` is ``None``.

    Raises:
        KeyError: if the month abbreviation is not a key of ``month_map``.
        ValueError: if a numeric field cannot be parsed as an integer.
    """
    # A missing value has nothing to parse; hand it back unchanged instead of
    # failing on the slice below.
    if text is None:
        return None

    # NOTE: We're ignoring the time zones here, might need to be handled
    # depending on the problem you are solving
    return "{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}".format(
        int(text[7:11]),        # year
        month_map[text[3:6]],   # month abbreviation -> month number
        int(text[0:2]),         # day of month
        int(text[12:14]),       # hour
        int(text[15:17]),       # minute
        int(text[18:20]),       # second
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment