Created
January 13, 2015 20:53
-
-
Save randyzwitch/01ac47b2eeda11fd02c8 to your computer and use it in GitHub Desktop.
Sessionizing Log File Data Using dplyr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###Sessionization using dplyr | |
library(dplyr) | |
#Open a localhost connection to Postgres | |
#Use table 'single_col_timestamp' | |
#group by uid and sort by timestamp for window function | |
#Do minutes calculation, working around missing support for extract(epoch from timestamp) | |
#Calculate event boundary and unique id via cumulative sum window function | |
sessions <- | |
src_postgres("logfiles") %>% | |
tbl("single_col_timestamp") %>% | |
group_by(uid) %>% | |
arrange(event_timestamp) %>% | |
mutate(minutes_since_last_event = ( | |
DATE_PART('day', event_timestamp - lag(event_timestamp)) * 24 + | |
DATE_PART('hour', event_timestamp - lag(event_timestamp)) * 60 + | |
DATE_PART('minute', event_timestamp - lag(event_timestamp)) * 60 + | |
DATE_PART('second', event_timestamp - lag(event_timestamp)) | |
) / 60 | |
) %>% | |
mutate(event_boundary = if(minutes_since_last_event > 30) 1 else 0, | |
session_id = order_by(event_timestamp, cumsum(if(minutes_since_last_event > 30) 1 else 0))) | |
#Show query syntax | |
show_query(sessions) | |
#Actually run the query | |
answer <- collect(sessions) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment