randyzwitch · August 29, 2015 14:01
diff --git a/udf.py b/udf.py
 #Want to try this as Hive UDF
 #1. Compile using Jython (never done before)
 #2. Re-write using pure Java (don't know Java)
 #3. Re-write using Scala (barely done first week of Coursera FP class so far

 #Hoping to return this as a single column first
 #Eventually want to build a Hive UDTF, to make one row containing dates into multiple rows, one per date

 from datetime import *

 def burst_hour_timestamp(start_t, end_t):
    '''Function takes start_t and end_t as strings, returns starting hours between interval'''

    start_time_tuple = datetime.strptime(start_t, "%Y-%m-%d %H:%M:%S.%f")
    end_time_tuple = datetime.strptime(end_t, "%Y-%m-%d %H:%M:%S.%f")
    
    #Round start time down to beginning of hour
    start_time_tuple = datetime.combine(start_time_tuple.date(), time(start_time_tuple.hour, 0))

    hour_list = []
    while start_time_tuple <= end_time_tuple:
        hour_list.append(str(start_time_tuple))
        start_time_tuple += timedelta(hours=1)
  
    return hour_list

 #Example

 #burst_hour_timestamp("2014-02-14 17:56:40.17", "2014-02-14 20:00:00.1")
 #>['2014-02-14 17:00:00',
 # '2014-02-14 18:00:00',
 # '2014-02-14 19:00:00',
 # '2014-02-14 20:00:00']

 #In Hive, use explode function on array returned by burst_hour_timestamp function
 '''
 --If values column is comma-separated string, then split value will make array
 select
 account_number,
 values
 from split_column_test
 lateral view explode(split(value, ',')) a as values;

 --This shows that the array format to not need split function
 --["1", "2", "3"]
 select
 account_number,
 split(value, ',') as array_result
 from split_column_test;

 --Apparently, exploding only works with one other column
 --You explode, then match back to table
 --This means we need a unique record id

 select
 account_number,
 --start_,
 --end,
 start_interval_timestamp
 from string_result
 lateral view explode(split(array_result, ',')) a as start_interval_timestamp;
 '''
	#Want to try this as Hive UDF
	#1. Compile using Jython (never done before)
	#2. Re-write using pure Java (don't know Java)
	#3. Re-write using Scala (barely done first week of Coursera FP class so far

	#Hoping to return this as a single column first
	#Eventually want to build a Hive UDTF, to make one row containing dates into multiple rows, one per date

	from datetime import *

	def burst_hour_timestamp(start_t, end_t):
	'''Function takes start_t and end_t as strings, returns starting hours between interval'''

	start_time_tuple = datetime.strptime(start_t, "%Y-%m-%d %H:%M:%S.%f")
	end_time_tuple = datetime.strptime(end_t, "%Y-%m-%d %H:%M:%S.%f")

	#Round start time down to beginning of hour
	start_time_tuple = datetime.combine(start_time_tuple.date(), time(start_time_tuple.hour, 0))

	hour_list = []
	while start_time_tuple <= end_time_tuple:
	hour_list.append(str(start_time_tuple))
	start_time_tuple += timedelta(hours=1)

	return hour_list

	#Example

	#burst_hour_timestamp("2014-02-14 17:56:40.17", "2014-02-14 20:00:00.1")
	#>['2014-02-14 17:00:00',
	# '2014-02-14 18:00:00',
	# '2014-02-14 19:00:00',
	# '2014-02-14 20:00:00']

	#In Hive, use explode function on array returned by burst_hour_timestamp function
	'''
	--If values column is comma-separated string, then split value will make array
	select
	account_number,
	values
	from split_column_test
	lateral view explode(split(value, ',')) a as values;

	--This shows that the array format to not need split function
	--["1", "2", "3"]
	select
	account_number,
	split(value, ',') as array_result
	from split_column_test;

	--Apparently, exploding only works with one other column
	--You explode, then match back to table
	--This means we need a unique record id

	select
	account_number,
	--start_,
	--end,
	start_interval_timestamp
	from string_result
	lateral view explode(split(array_result, ',')) a as start_interval_timestamp;
	'''