Created
August 28, 2013 14:13
-
-
Save chyikwei/6366467 to your computer and use it in GitHub Desktop.
cohorts analysis script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random as rand | |
from string import Template | |
from itertools import repeat | |
from datetime import datetime, timedelta | |
from apps.glue.util.date import start_of_next_month | |
from apps.dashboard.views import get_bigquery, get_signup_tables, get_interaction_tables | |
from apps.dashboard.views import datetime_human, date_range2, date_name | |
bq = get_bigquery() | |
DATE_OFFSET = timedelta(hours=11) | |
SAMPLE_SIZE = 1000 | |
def get_in_args(name, ids): | |
if not ids: | |
return "" | |
if isinstance(ids, basestring): | |
ids = [ids] | |
elif isinstance(ids, set): | |
ids = list(ids) | |
names = ','.join(["'" + i + "'" for i in ids]) | |
return '%s in (%s)' % (name, names) | |
def print_query(q, length=400): | |
side = length / 2 | |
if len(q) <= side: | |
print "\n%s\n" % q | |
else: | |
print "\n%s...%s\n" % (q[:side], q[-side:]) | |
class CohortAnalysis(object): | |
def __init__(self, start, end, last, apps=None, categories=None, actions=None): | |
#self.start_date = datetime.strptime(start, '%Y-%m-%d') + DATE_OFFSET | |
#self.end_date = datetime.strptime(end, '%Y-%m-%d') + timedelta(days=1) + DATE_OFFSET | |
self.start_date = start + DATE_OFFSET | |
self.end_date = end + timedelta(days=1) + DATE_OFFSET | |
self.last_date = last | |
self.apps = apps | |
self.categories = categories | |
self.actions = actions | |
# pull all signups | |
def get_signup_ids(self): | |
""" | |
SELECT user_id as id | |
FROM getglue.signups_201209 | |
WHERE timestamp >= '2012-09-01 11:00:00' AND timestamp < '2012-10-01 11:00:00' | |
AND app in ('WebsiteHD', ....) | |
""" | |
q = Template("SELECT user_id as id " + | |
"FROM $signup " + | |
"WHERE $wheres " + | |
"GROUP BY id") | |
signup_table_name = get_signup_tables(self.start_date, self.end_date) | |
wheres = [] | |
# start_time | |
wheres.append("timestamp >= '%s'" % datetime_human(self.start_date)) | |
# end_time | |
wheres.append("timestamp < '%s'" % datetime_human(self.end_date)) | |
if self.apps: | |
wheres.append(get_in_args('app', self.apps)) | |
condition = " AND ".join(wheres) | |
q_signup = q.substitute(signup=signup_table_name, wheres=condition) | |
print_query(q_signup, length=800) | |
ids = [] | |
for row in bq.query(q_signup): | |
id = row.get('id') | |
if id: | |
ids.append(id) | |
return ids | |
# find user_id with specific action (from singups ids) | |
def get_active_ids(self, user_ids): | |
""" | |
SELECT user_id as id | |
FROM getglue.interactions_201209 | |
WHERE timestamp >= '2012-09-01 11:00:00' AND timestamp < '2012-10-01 11:00:00' | |
AND app in ('WebsiteHD', ...) | |
AND category in ('tv_shows', 'movies', ...) | |
AND action in ('checkin', ...) | |
AND user_id in (....) | |
GROUP BY id | |
""" | |
q = Template("SELECT user_id as id " + | |
"FROM $interactions " + | |
"WHERE $wheres " + | |
"GROUP BY id") | |
interaction_tables = get_interaction_tables(self.start_date, self.end_date) | |
wheres = [] | |
# start_time | |
wheres.append("timestamp >= '%s'" % datetime_human(self.start_date)) | |
# end_time | |
wheres.append("timestamp < '%s'" % datetime_human(self.end_date)) | |
# user_ids | |
wheres.append(get_in_args('user_id', user_ids)) | |
if self.apps: | |
wheres.append(get_in_args('app', self.apps)) | |
if self.categories: | |
wheres.append(get_in_args('category', self.categories)) | |
if self.actions: | |
wheres.append(get_in_args('action', self.actions)) | |
condition = " AND ".join(wheres) | |
q_active_users = q.substitute(interactions=interaction_tables, wheres=condition) | |
print_query(q_active_users) | |
ids = [] | |
for row in bq.query(q_active_users): | |
id = row.get('id') | |
if id: | |
ids.append(id) | |
return ids | |
def get_revisit_count(self, since, until, user_ids): | |
""" | |
SELECT COUNT(DISTINCT user_id) as id | |
FROM getglue.interactions_201209 | |
WHERE timestamp >= '2012-10-01 11:00:00' AND timestamp < 'today' | |
AND user_id in (...) | |
AND apps in (...) | |
AND category in (...) | |
group by id | |
""" | |
today = datetime.today() | |
today = today.replace(hour=11, minute=0, second=0, microsecond=0) | |
if until > today or since >= until: | |
#dont want to calculate unfinished week | |
return 0 | |
q = Template("SELECT COUNT(DISTINCT user_id) as cnt " + | |
"FROM $interactions " + | |
"WHERE $wheres ") | |
interaction_tables = get_interaction_tables(since, until) | |
wheres = [] | |
# start_time | |
wheres.append("timestamp >= '%s'" % datetime_human(since)) | |
# end_time | |
wheres.append("timestamp < '%s'" % datetime_human(until)) | |
# user_ids | |
wheres.append(get_in_args('user_id', user_ids)) | |
if self.categories: | |
wheres.append(get_in_args('category', self.categories)) | |
if self.apps: | |
wheres.append(get_in_args('app', self.apps)) | |
if self.actions: | |
wheres.append(get_in_args('action', self.actions)) | |
revisit_count = 0 | |
condition = " AND ".join(wheres) | |
q_retention = q.substitute(interactions=interaction_tables, wheres=condition) | |
print_query(q_retention, 800) | |
#print "query length: %d" % len(q_retention) | |
for row in bq.query(q_retention): | |
revisit_count += row.get('cnt', 0) | |
return revisit_count | |
def get_revisits(self, user_ids, roundType='week'): | |
revisits = [] | |
for start, end in date_range2(self.start_date, self.last_date, roundType): | |
# skip signup period | |
if start == self.start_date: | |
print "skip" | |
continue | |
start = start.replace(hour=11, minute=0, second=0, microsecond=0) | |
#end = end.replace(hour=11, minute=0, second=0, microsecond=0) + timedelta(days=1) | |
end = self.last_date.replace(hour=11, minute=0, second=0, microsecond=0) | |
print start, end | |
revisit_count = self.get_revisit_count(start, end, user_ids) | |
revisits.append(revisit_count) | |
return revisits | |
def run_cohort_analysis(since, until, app=None, categories=None, action=None, roundType='week'): | |
# results | |
dates = [] | |
results = {} | |
# set last date to today | |
last_date = datetime.today() | |
for dt_start, dt_end in date_range2(since, until, roundType): | |
date = date_name(dt_start, dt_end, roundType) | |
dates.append(date) | |
# setting for each time period | |
ca = CohortAnalysis(dt_start, dt_end, last_date, app, categories, action) | |
# get all signups | |
signup_ids = ca.get_signup_ids() | |
# get signups with specific action | |
active_ids = ca.get_active_ids(signup_ids) | |
# random sample users | |
#rand.seed(0) | |
final_ids = rand.sample(active_ids, SAMPLE_SIZE) | |
# get revisit count | |
counts = ca.get_revisits(final_ids, roundType) | |
results[date] = {'signups': len(signup_ids), | |
'active_ids': len(active_ids), | |
'retention': counts} | |
return results | |
def export_csv(results, file_name): | |
with open(file_name, 'w') as f: | |
dates = sorted(results.keys()) | |
for date in dates: | |
print date | |
ret = results[date] | |
retention = ret.get('retention') | |
retention = [str(r) for r in retention] | |
f.write("%s,%d,%d,%s\n" % (date, ret['signups'], ret['active_ids'], ",".join(retention))) | |
def run(): | |
#setting | |
since = datetime(2013, 1, 1) | |
until = datetime(2013, 7, 31) | |
app = ['WebsiteHD', 'MobileSiteHD', 'iPhoneHD', 'iPadHD', 'AndroidHD'] | |
categories = ['tv_shows', 'movies', 'sports'] | |
action = 'checkin' | |
export_file = 'data/checkin_cohort_20130823.csv' | |
results = run_cohort_analysis(since, until, app, categories, action, 'month') | |
export_csv(results, export_file) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment