Last active
November 21, 2019 11:27
-
-
Save hannesdatta/e0c0bac8db4c51cb656228559ed645f7 to your computer and use it in GitHub Desktop.
Rolling window merge in Stata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
=================================== | |
ROLLING WINDOW AGGREGATION IN STATA | |
=================================== | |
Problem: | |
-------- | |
Assume you have a data set with time/datestamps, noting | |
when emails were sent out to customers. | |
For example, | |
2019-11-20 | |
2019-11-21 | |
2019-11-23 | |
2019-11-25 | |
Your goal is to understand how many emails were sent in a rolling window, | |
around a specific date. | |
Suppose that window is 1 day, then you desire to get a dataset like this: | |
2019-11-20 2 (counting emails sent on 20th and 21st) | |
2019-11-21 2 (counting emails sent on 20th and 21st) | |
2019-11-23 1 (counting email sent on 23th) | |
2019-11-25 1 (counting email sent on 25th) | |
Solution: | |
--------- | |
(in pseudo code): | |
For each timestamp in unique timestamps in the data: | |
- Filter data for observations that fall in the desired range. | |
- Count number of observations. | |
- Write timestamp and number of observations to temporary file. | |
Merge original data with temporary file. | |
*/ | |
****************************** | |
* generate dummy input data * | |
****************************** | |
clear all | |
set seed 123 | |
set obs 10000 | |
* start and end dates in unixtime | |
local startdate=1542709934 | |
local enddate=1574245934 | |
generate unixtime = floor((`enddate'-`startdate')*runiform())+`startdate' | |
generate double statatime = unixtime*1000 + mdyhms(1,1,1970,0,0,0) | |
format statatime %tc | |
* round statatime to nearest 15-minute interval to reduce the number of unique | |
* values to potentially match on | |
gen double statatime_match = (unixtime - mod(unixtime, 15*60)) * 1000 + mdyhms(1,1,1970,0,0,0) | |
format statatime_match %tc | |
preserve | |
************************************************************************************ | |
* iterate through unique values of *matchvar*, sum up *freq* in rolling window (defined by | |
* windowsize in milliseconds), and write to temp file. | |
* input: matchvar (put desired variable there) | |
* windowsize: in milliseconds | |
* filename: fn_out | |
* | |
************************************************************************************ | |
local fn_out "summary.csv" | |
local windowsize 1000*60*60*24 | |
local matchvar statatime_match | |
file open myfile using "`fn_out'", write replace | |
file write myfile "`matchvar'" _tab "sum" _n | |
cap drop freq | |
gen freq = 1 | |
qui cap drop group | |
qui cap egen group = group(`matchvar') | |
qui cap su group, meanonly | |
local nobs `r(max)' | |
_dots 0, title(Loop running) reps(`nobs') | |
forvalues i = 1/`nobs' { | |
_dots `i' 0 | |
quietly su `matchvar' if group == `i' | |
local l `r(mean)' | |
quietly su freq if `matchvar'>=`l'-`windowsize' & `matchvar'<=`l'+`windowsize' | |
file write myfile "`l'" _tab "`r(sum)'" _n | |
} | |
file close myfile | |
drop _all | |
insheet using "`fn_out'" | |
local outfile = subinstr("`fn_out'",".csv","",.) | |
save "`outfile'", replace | |
*************************************************** | |
* merge rolling window summary with original data * | |
*************************************************** | |
restore | |
merge 1:1 `matchvar' using "`outfile'" | |
drop _merge |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment