hannesdatta · November 21, 2019 11:27
diff --git a/rolling.do b/rolling.do
 /*

 ===================================
 ROLLING WINDOW AGGREGATION IN STATA
 ===================================

 Problem:
 --------

 Assume you have a data set with time/datestamps, noting
 when emails were sent out to customers.

 For example,

 2019-11-20  
 2019-11-21
 2019-11-23
 2019-11-25

 Your goal is to understand how many emails were sent in a rolling window,
 around a specific date.

 Suppose that window is 1 day, then you desire to get a dataset like this:

 2019-11-20  2		(counting emails sent on 20th and 21st)
 2019-11-21  2		(counting emails sent on 20th and 21st)
 2019-11-23  1		(counting email sent on 23th)
 2019-11-25  1		(counting email sent on 25th)


 Solution:
 ---------

 (in pseudo code):

 For each timestamp in unique timestamps in the data:
   - Filter data for observations that fall in the desired range.
   - Count number of observations.
   - Write timestamp and number of observations to temporary file.
   
 Merge original data with temporary file.

 */



 ******************************
 * generate dummy input data  *
 ******************************

 clear all
 set seed 123

 set obs 10000

 * start and end dates in unixtime
 local startdate=1542709934
 local enddate=1574245934

 generate unixtime = floor((`enddate'-`startdate')*runiform())+`startdate'

 generate double statatime = unixtime*1000 + mdyhms(1,1,1970,0,0,0)
 format statatime %tc

 * round statatime to nearest 15-minute interval to reduce the number of unique
 * values to potentially match on
 gen double statatime_match = (unixtime - mod(unixtime, 15*60)) * 1000 + mdyhms(1,1,1970,0,0,0)
 format statatime_match %tc


 preserve

 ************************************************************************************
 * iterate through unique values of *matchvar*, sum up *freq* in rolling window (defined by
 * windowsize in milliseconds), and write to temp file.

 * input: matchvar (put desired variable there)
 * windowsize: in milliseconds
 * filename: fn_out
 *
 ************************************************************************************

 local fn_out "summary.csv"
 local windowsize 1000*60*60*24
 local matchvar statatime_match

 file open myfile using "`fn_out'", write replace
 file write myfile "`matchvar'" _tab "sum" _n

 cap drop freq
 gen freq = 1
 qui cap drop group
 qui cap egen group = group(`matchvar')
 qui cap su group, meanonly

 local nobs `r(max)'
 _dots 0, title(Loop running) reps(`nobs')

 forvalues i = 1/`nobs' {
 	_dots `i' 0
 	quietly su `matchvar' if group == `i'
 	local l `r(mean)'
 	quietly su freq if `matchvar'>=`l'-`windowsize' & `matchvar'<=`l'+`windowsize'
 	file write myfile "`l'" _tab "`r(sum)'" _n
 	}
 file close myfile

 drop _all
 insheet using "`fn_out'"
 local outfile = subinstr("`fn_out'",".csv","",.)
 save "`outfile'", replace


 ***************************************************
 * merge rolling window summary with original data *
 ***************************************************

 restore

 merge 1:1 `matchvar' using "`outfile'"
 drop _merge
	/*

	===================================
	ROLLING WINDOW AGGREGATION IN STATA
	===================================

	Problem:
	--------

	Assume you have a data set with time/datestamps, noting
	when emails were sent out to customers.

	For example,

	2019-11-20
	2019-11-21
	2019-11-23
	2019-11-25

	Your goal is to understand how many emails were sent in a rolling window,
	around a specific date.

	Suppose that window is 1 day, then you desire to get a dataset like this:

	2019-11-20 2 (counting emails sent on 20th and 21st)
	2019-11-21 2 (counting emails sent on 20th and 21st)
	2019-11-23 1 (counting email sent on 23th)
	2019-11-25 1 (counting email sent on 25th)


	Solution:
	---------

	(in pseudo code):

	For each timestamp in unique timestamps in the data:
	- Filter data for observations that fall in the desired range.
	- Count number of observations.
	- Write timestamp and number of observations to temporary file.

	Merge original data with temporary file.

	*/



	******************************
	* generate dummy input data *
	******************************

	clear all
	set seed 123

	set obs 10000

	* start and end dates in unixtime
	local startdate=1542709934
	local enddate=1574245934

	generate unixtime = floor((`enddate'-`startdate')*runiform())+`startdate'

	generate double statatime = unixtime*1000 + mdyhms(1,1,1970,0,0,0)
	format statatime %tc

	* round statatime to nearest 15-minute interval to reduce the number of unique
	* values to potentially match on
	gen double statatime_match = (unixtime - mod(unixtime, 1560)) 1000 + mdyhms(1,1,1970,0,0,0)
	format statatime_match %tc


	preserve

	************************************************************************************
	* iterate through unique values of matchvar, sum up freq in rolling window (defined by
	* windowsize in milliseconds), and write to temp file.

	* input: matchvar (put desired variable there)
	* windowsize: in milliseconds
	* filename: fn_out
	*
	************************************************************************************

	local fn_out "summary.csv"
	local windowsize 10006060*24
	local matchvar statatime_match

	file open myfile using "`fn_out'", write replace
	file write myfile "`matchvar'" _tab "sum" _n

	cap drop freq
	gen freq = 1
	qui cap drop group
	qui cap egen group = group(`matchvar')
	qui cap su group, meanonly

	local nobs `r(max)'
	_dots 0, title(Loop running) reps(`nobs')

	forvalues i = 1/`nobs' {
	_dots `i' 0
	quietly su `matchvar' if group == `i'
	local l `r(mean)'
	quietly su freq if `matchvar'>=`l'-`windowsize' & `matchvar'<=`l'+`windowsize'
	file write myfile "`l'" _tab "`r(sum)'" _n
	}
	file close myfile

	drop _all
	insheet using "`fn_out'"
	local outfile = subinstr("`fn_out'",".csv","",.)
	save "`outfile'", replace


	***************************************************
	* merge rolling window summary with original data *
	***************************************************

	restore

	merge 1:1 `matchvar' using "`outfile'"
	drop _merge