Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Created August 28, 2013 14:13
Show Gist options
  • Save chyikwei/6366467 to your computer and use it in GitHub Desktop.
Save chyikwei/6366467 to your computer and use it in GitHub Desktop.
cohorts analysis script
import random as rand
from string import Template
from itertools import repeat
from datetime import datetime, timedelta
from apps.glue.util.date import start_of_next_month
from apps.dashboard.views import get_bigquery, get_signup_tables, get_interaction_tables
from apps.dashboard.views import datetime_human, date_range2, date_name
bq = get_bigquery()
DATE_OFFSET = timedelta(hours=11)
SAMPLE_SIZE = 1000
def get_in_args(name, ids):
if not ids:
return ""
if isinstance(ids, basestring):
ids = [ids]
elif isinstance(ids, set):
ids = list(ids)
names = ','.join(["'" + i + "'" for i in ids])
return '%s in (%s)' % (name, names)
def print_query(q, length=400):
side = length / 2
if len(q) <= side:
print "\n%s\n" % q
else:
print "\n%s...%s\n" % (q[:side], q[-side:])
class CohortAnalysis(object):
def __init__(self, start, end, last, apps=None, categories=None, actions=None):
#self.start_date = datetime.strptime(start, '%Y-%m-%d') + DATE_OFFSET
#self.end_date = datetime.strptime(end, '%Y-%m-%d') + timedelta(days=1) + DATE_OFFSET
self.start_date = start + DATE_OFFSET
self.end_date = end + timedelta(days=1) + DATE_OFFSET
self.last_date = last
self.apps = apps
self.categories = categories
self.actions = actions
# pull all signups
def get_signup_ids(self):
"""
SELECT user_id as id
FROM getglue.signups_201209
WHERE timestamp >= '2012-09-01 11:00:00' AND timestamp < '2012-10-01 11:00:00'
AND app in ('WebsiteHD', ....)
"""
q = Template("SELECT user_id as id " +
"FROM $signup " +
"WHERE $wheres " +
"GROUP BY id")
signup_table_name = get_signup_tables(self.start_date, self.end_date)
wheres = []
# start_time
wheres.append("timestamp >= '%s'" % datetime_human(self.start_date))
# end_time
wheres.append("timestamp < '%s'" % datetime_human(self.end_date))
if self.apps:
wheres.append(get_in_args('app', self.apps))
condition = " AND ".join(wheres)
q_signup = q.substitute(signup=signup_table_name, wheres=condition)
print_query(q_signup, length=800)
ids = []
for row in bq.query(q_signup):
id = row.get('id')
if id:
ids.append(id)
return ids
# find user_id with specific action (from singups ids)
def get_active_ids(self, user_ids):
"""
SELECT user_id as id
FROM getglue.interactions_201209
WHERE timestamp >= '2012-09-01 11:00:00' AND timestamp < '2012-10-01 11:00:00'
AND app in ('WebsiteHD', ...)
AND category in ('tv_shows', 'movies', ...)
AND action in ('checkin', ...)
AND user_id in (....)
GROUP BY id
"""
q = Template("SELECT user_id as id " +
"FROM $interactions " +
"WHERE $wheres " +
"GROUP BY id")
interaction_tables = get_interaction_tables(self.start_date, self.end_date)
wheres = []
# start_time
wheres.append("timestamp >= '%s'" % datetime_human(self.start_date))
# end_time
wheres.append("timestamp < '%s'" % datetime_human(self.end_date))
# user_ids
wheres.append(get_in_args('user_id', user_ids))
if self.apps:
wheres.append(get_in_args('app', self.apps))
if self.categories:
wheres.append(get_in_args('category', self.categories))
if self.actions:
wheres.append(get_in_args('action', self.actions))
condition = " AND ".join(wheres)
q_active_users = q.substitute(interactions=interaction_tables, wheres=condition)
print_query(q_active_users)
ids = []
for row in bq.query(q_active_users):
id = row.get('id')
if id:
ids.append(id)
return ids
def get_revisit_count(self, since, until, user_ids):
"""
SELECT COUNT(DISTINCT user_id) as id
FROM getglue.interactions_201209
WHERE timestamp >= '2012-10-01 11:00:00' AND timestamp < 'today'
AND user_id in (...)
AND apps in (...)
AND category in (...)
group by id
"""
today = datetime.today()
today = today.replace(hour=11, minute=0, second=0, microsecond=0)
if until > today or since >= until:
#dont want to calculate unfinished week
return 0
q = Template("SELECT COUNT(DISTINCT user_id) as cnt " +
"FROM $interactions " +
"WHERE $wheres ")
interaction_tables = get_interaction_tables(since, until)
wheres = []
# start_time
wheres.append("timestamp >= '%s'" % datetime_human(since))
# end_time
wheres.append("timestamp < '%s'" % datetime_human(until))
# user_ids
wheres.append(get_in_args('user_id', user_ids))
if self.categories:
wheres.append(get_in_args('category', self.categories))
if self.apps:
wheres.append(get_in_args('app', self.apps))
if self.actions:
wheres.append(get_in_args('action', self.actions))
revisit_count = 0
condition = " AND ".join(wheres)
q_retention = q.substitute(interactions=interaction_tables, wheres=condition)
print_query(q_retention, 800)
#print "query length: %d" % len(q_retention)
for row in bq.query(q_retention):
revisit_count += row.get('cnt', 0)
return revisit_count
def get_revisits(self, user_ids, roundType='week'):
revisits = []
for start, end in date_range2(self.start_date, self.last_date, roundType):
# skip signup period
if start == self.start_date:
print "skip"
continue
start = start.replace(hour=11, minute=0, second=0, microsecond=0)
#end = end.replace(hour=11, minute=0, second=0, microsecond=0) + timedelta(days=1)
end = self.last_date.replace(hour=11, minute=0, second=0, microsecond=0)
print start, end
revisit_count = self.get_revisit_count(start, end, user_ids)
revisits.append(revisit_count)
return revisits
def run_cohort_analysis(since, until, app=None, categories=None, action=None, roundType='week'):
# results
dates = []
results = {}
# set last date to today
last_date = datetime.today()
for dt_start, dt_end in date_range2(since, until, roundType):
date = date_name(dt_start, dt_end, roundType)
dates.append(date)
# setting for each time period
ca = CohortAnalysis(dt_start, dt_end, last_date, app, categories, action)
# get all signups
signup_ids = ca.get_signup_ids()
# get signups with specific action
active_ids = ca.get_active_ids(signup_ids)
# random sample users
#rand.seed(0)
final_ids = rand.sample(active_ids, SAMPLE_SIZE)
# get revisit count
counts = ca.get_revisits(final_ids, roundType)
results[date] = {'signups': len(signup_ids),
'active_ids': len(active_ids),
'retention': counts}
return results
def export_csv(results, file_name):
with open(file_name, 'w') as f:
dates = sorted(results.keys())
for date in dates:
print date
ret = results[date]
retention = ret.get('retention')
retention = [str(r) for r in retention]
f.write("%s,%d,%d,%s\n" % (date, ret['signups'], ret['active_ids'], ",".join(retention)))
def run():
#setting
since = datetime(2013, 1, 1)
until = datetime(2013, 7, 31)
app = ['WebsiteHD', 'MobileSiteHD', 'iPhoneHD', 'iPadHD', 'AndroidHD']
categories = ['tv_shows', 'movies', 'sports']
action = 'checkin'
export_file = 'data/checkin_cohort_20130823.csv'
results = run_cohort_analysis(since, until, app, categories, action, 'month')
export_csv(results, export_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment