Last active
August 29, 2015 14:01
-
-
Save randyzwitch/ae128196b5773d0118cf to your computer and use it in GitHub Desktop.
Building Hive UDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Want to try this as Hive UDF | |
#1. Compile using Jython (never done before) | |
#2. Re-write using pure Java (don't know Java) | |
#3. Re-write using Scala (barely done first week of Coursera FP class so far | |
#Hoping to return this as a single column first | |
#Eventually want to build a Hive UDTF, to make one row containing dates into multiple rows, one per date | |
from datetime import * | |
def burst_hour_timestamp(start_t, end_t): | |
'''Function takes start_t and end_t as strings, returns starting hours between interval''' | |
start_time_tuple = datetime.strptime(start_t, "%Y-%m-%d %H:%M:%S.%f") | |
end_time_tuple = datetime.strptime(end_t, "%Y-%m-%d %H:%M:%S.%f") | |
#Round start time down to beginning of hour | |
start_time_tuple = datetime.combine(start_time_tuple.date(), time(start_time_tuple.hour, 0)) | |
hour_list = [] | |
while start_time_tuple <= end_time_tuple: | |
hour_list.append(str(start_time_tuple)) | |
start_time_tuple += timedelta(hours=1) | |
return hour_list | |
#Example | |
#burst_hour_timestamp("2014-02-14 17:56:40.17", "2014-02-14 20:00:00.1") | |
#>['2014-02-14 17:00:00', | |
# '2014-02-14 18:00:00', | |
# '2014-02-14 19:00:00', | |
# '2014-02-14 20:00:00'] | |
#In Hive, use explode function on array returned by burst_hour_timestamp function | |
''' | |
--If values column is comma-separated string, then split value will make array | |
select | |
account_number, | |
values | |
from split_column_test | |
lateral view explode(split(value, ',')) a as values; | |
--This shows that the array format to not need split function | |
--["1", "2", "3"] | |
select | |
account_number, | |
split(value, ',') as array_result | |
from split_column_test; | |
--Apparently, exploding only works with one other column | |
--You explode, then match back to table | |
--This means we need a unique record id | |
select | |
account_number, | |
--start_, | |
--end, | |
start_interval_timestamp | |
from string_result | |
lateral view explode(split(array_result, ',')) a as start_interval_timestamp; | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment