Skip to content

Instantly share code, notes, and snippets.

@DustinAlandzes
Created June 28, 2017 22:15
Show Gist options
  • Save DustinAlandzes/131ed8fd24b2b3a3aaa0bd577224ccf7 to your computer and use it in GitHub Desktop.
Save DustinAlandzes/131ed8fd24b2b3a3aaa0bd577224ccf7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import requests\n",
"\n",
"url = \"http://dmanalytics.org/wp-content/uploads/2014/10/chapter-12-relay-foods.xlsx\"\n",
"r = requests.get(url)\n",
"print(r.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### 1. Create a period column based on the OrderDate ###\n",
"df['OrderPeriod'] = df.OrderDate.apply(lambda x: x.strftime('%Y-%m'))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### 2. Determine hte user's cohort group (based on their first order) ###\n",
"df.set_index('UserId', inplace=True)\n",
"\n",
"df['CohortGroup'] = df.groupby(level=0)['OrderDate'].min().apply(lambda x: x.strftime('%Y-%m'))\n",
"df.reset_index(inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### 3. Rollup data by CohortGroup & OrderPeriod ###\n",
"grouped = df.groupby(['CohortGroup', 'OrderPeriod'])\n",
"\n",
"# count the unique users, orders, and total revenue per Group + Period\n",
"cohorts = grouped.agg({'UserId': pd.Series.nunique,\n",
" 'OrderId': pd.Series.nunique,\n",
" 'TotalCharges': np.sum})\n",
"\n",
"# make the column names more meaningful\n",
"cohorts.rename(columns={'UserId': 'TotalUsers',\n",
" 'OrderId': 'TotalOrders'}, inplace=True)\n",
"cohorts.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### 4. Label the CohortPeriod for each CohortGroup ####\n",
"def cohort_period(df):\n",
" \"\"\"\n",
" Creates a `CohortPeriod` column, which is the Nth period based on the user's first purchase.\n",
" \n",
" Example\n",
" -------\n",
" Say you want to get the 3rd month for every user:\n",
" df.sort(['UserId', 'OrderTime', inplace=True)\n",
" df = df.groupby('UserId').apply(cohort_period)\n",
" df[df.CohortPeriod == 3]\n",
" \"\"\"\n",
" df['CohortPeriod'] = np.arange(len(df)) + 1\n",
" return df\n",
"\n",
"cohorts = cohorts.groupby(level=0).apply(cohort_period)\n",
"cohorts.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### 5. Make sure we did all that right ###\n",
"x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-01')]\n",
"y = cohorts.ix[('2009-01', '2009-01')]\n",
"\n",
"assert(x['UserId'].nunique() == y['TotalUsers'])\n",
"assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
"assert(x['OrderId'].nunique() == y['TotalOrders'])\n",
"\n",
"x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-09')]\n",
"y = cohorts.ix[('2009-01', '2009-09')]\n",
"\n",
"assert(x['UserId'].nunique() == y['TotalUsers'])\n",
"assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
"assert(x['OrderId'].nunique() == y['TotalOrders'])\n",
"\n",
"x = df[(df.CohortGroup == '2009-05') & (df.OrderPeriod == '2009-09')]\n",
"y = cohorts.ix[('2009-05', '2009-09')]\n",
"\n",
"assert(x['UserId'].nunique() == y['TotalUsers'])\n",
"assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
"assert(x['OrderId'].nunique() == y['TotalOrders'])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### User Retention by Cohort Group ###\n",
"# reindex the DataFrame\n",
"cohorts.reset_index(inplace=True)\n",
"cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)\n",
"\n",
"# create a Series holding the total size of each CohortGroup\n",
"cohort_group_size = cohorts['TotalUsers'].groupby(level=0).first()\n",
"cohort_group_size.head()\n",
"\n",
"# To illustrate what unstack does, recall the first five TotalUsers values:\n",
"cohorts['TotalUsers'].head()\n",
"# And here's what they look like when we unstack the CohortGroup level from the index:\n",
"cohorts['TotalUsers'].unstack(0).head()\n",
"# Now, we can utilize broadcasting to divide each column by the corresponding cohort_group_size.\n",
"user_retention = cohorts['TotalUsers'].unstack(0).divide(cohort_group_size, axis=1)\n",
"user_retention.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Finally, we can plot the cohorts over time in an effort to spot behavioral differences or similarities.\n",
"user_retention[['2009-06', '2009-07', '2009-08']].plot(figsize=(10,5))\n",
"plt.title('Cohorts: User Retention')\n",
"plt.xticks(np.arange(1, 12.1, 1))\n",
"plt.xlim(1, 12)\n",
"plt.ylabel('% of Cohort Purchasing');"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Creating heatmaps in matplotlib is more difficult than it should be.\n",
"# Thankfully, Seaborn makes them easy for us.\n",
"# http://stanford.edu/~mwaskom/software/seaborn/\n",
"\n",
"import seaborn as sns\n",
"sns.set(style='white')\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"plt.title('Cohorts: User Retention')\n",
"sns.heatmap(user_retention.T, mask=user_retention.T.isnull(), annot=True, fmt='.0%');"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment