Skip to content

Instantly share code, notes, and snippets.

@DustinAlandzes
Last active May 16, 2017 11:04
Show Gist options
  • Save DustinAlandzes/5c84172ae8ec2ed31327f0c9d956103c to your computer and use it in GitHub Desktop.
Save DustinAlandzes/5c84172ae8ec2ed31327f0c9d956103c to your computer and use it in GitHub Desktop.
#from https://www.dataiku.com/learn/guide/code/reshaping_data/sessionization.html
import dataiku
import pandas as pd
from datetime import timedelta
# Sessionization of the "toy_data" dataset: events of one user that are less
# than T apart belong to the same session.
# Expects the columns 'user_id' and 'mytimestamp' (a datetime column, so that
# the difference of two timestamps can be compared with a timedelta).

# define threshold value: a gap of at least T between two consecutive events
# of the same user starts a new session
T = timedelta(seconds=30*60)
# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()
# order events chronologically within each user BEFORE looking at the previous
# row: shift() works on row order, so unsorted input would give wrong gaps.
# This also yields the same row order as the hive/postgresql versions.
# (sort_values replaces DataFrame.sort, which newer pandas no longer provides.)
toy_data = toy_data.sort_values(['user_id', 'mytimestamp'])
# add a column containing previous timestamp of the same user
# (NaT for the first event of each user)
toy_data['prev_mytimestamp'] = (
    toy_data.groupby('user_id')['mytimestamp'].shift(1)
)
# create the new session column: 1 when the gap to the previous event is at
# least T, else 0 (the comparison with NaT is False, so a user's first event
# gets 0)
toy_data['new_session'] = (
    (toy_data['mytimestamp'] - toy_data['prev_mytimestamp']) >= T
).astype(int)
# create the session_id: running count of session starts per user, prefixed
# with the user id, e.g. "<user_id>_0", "<user_id>_1", ...
toy_data['increment'] = toy_data.groupby('user_id')['new_session'].cumsum()
# the whole concatenation is parenthesized so that the increment really is
# appended (a bare "+ ..." on its own line is a separate statement)
toy_data['session_id'] = (
    toy_data['user_id'].astype(str)
    + '_'
    + toy_data['increment'].astype(str)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment