Created
          June 28, 2017 22:15 
        
      - 
      
 - 
        
Save DustinAlandzes/131ed8fd24b2b3a3aaa0bd577224ccf7 to your computer and use it in GitHub Desktop.  
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import requests\n", | |
| "\n", | |
| "url = \"http://dmanalytics.org/wp-content/uploads/2014/10/chapter-12-relay-foods.xlsx\"\n", | |
| "r = requests.get(url)\n", | |
| "print(r.text)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "### 1. Create a period column based on the OrderDate ###\n", | |
| "df['OrderPeriod'] = df.OrderDate.apply(lambda x: x.strftime('%Y-%m'))\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "### 2. Determine hte user's cohort group (based on their first order) ###\n", | |
| "df.set_index('UserId', inplace=True)\n", | |
| "\n", | |
| "df['CohortGroup'] = df.groupby(level=0)['OrderDate'].min().apply(lambda x: x.strftime('%Y-%m'))\n", | |
| "df.reset_index(inplace=True)\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "### 3. Rollup data by CohortGroup & OrderPeriod ###\n", | |
| "grouped = df.groupby(['CohortGroup', 'OrderPeriod'])\n", | |
| "\n", | |
| "# count the unique users, orders, and total revenue per Group + Period\n", | |
| "cohorts = grouped.agg({'UserId': pd.Series.nunique,\n", | |
| " 'OrderId': pd.Series.nunique,\n", | |
| " 'TotalCharges': np.sum})\n", | |
| "\n", | |
| "# make the column names more meaningful\n", | |
| "cohorts.rename(columns={'UserId': 'TotalUsers',\n", | |
| " 'OrderId': 'TotalOrders'}, inplace=True)\n", | |
| "cohorts.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "### 4. Label the CohortPeriod for each CohortGroup ####\n", | |
| "def cohort_period(df):\n", | |
| " \"\"\"\n", | |
| " Creates a `CohortPeriod` column, which is the Nth period based on the user's first purchase.\n", | |
| " \n", | |
| " Example\n", | |
| " -------\n", | |
| " Say you want to get the 3rd month for every user:\n", | |
| " df.sort(['UserId', 'OrderTime', inplace=True)\n", | |
| " df = df.groupby('UserId').apply(cohort_period)\n", | |
| " df[df.CohortPeriod == 3]\n", | |
| " \"\"\"\n", | |
| " df['CohortPeriod'] = np.arange(len(df)) + 1\n", | |
| " return df\n", | |
| "\n", | |
| "cohorts = cohorts.groupby(level=0).apply(cohort_period)\n", | |
| "cohorts.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "### 5. Make sure we did all that right ###\n", | |
| "x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-01')]\n", | |
| "y = cohorts.ix[('2009-01', '2009-01')]\n", | |
| "\n", | |
| "assert(x['UserId'].nunique() == y['TotalUsers'])\n", | |
| "assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n", | |
| "assert(x['OrderId'].nunique() == y['TotalOrders'])\n", | |
| "\n", | |
| "x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-09')]\n", | |
| "y = cohorts.ix[('2009-01', '2009-09')]\n", | |
| "\n", | |
| "assert(x['UserId'].nunique() == y['TotalUsers'])\n", | |
| "assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n", | |
| "assert(x['OrderId'].nunique() == y['TotalOrders'])\n", | |
| "\n", | |
| "x = df[(df.CohortGroup == '2009-05') & (df.OrderPeriod == '2009-09')]\n", | |
| "y = cohorts.ix[('2009-05', '2009-09')]\n", | |
| "\n", | |
| "assert(x['UserId'].nunique() == y['TotalUsers'])\n", | |
| "assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n", | |
| "assert(x['OrderId'].nunique() == y['TotalOrders'])\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "### User Retention by Cohort Group ###\n", | |
| "# reindex the DataFrame\n", | |
| "cohorts.reset_index(inplace=True)\n", | |
| "cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)\n", | |
| "\n", | |
| "# create a Series holding the total size of each CohortGroup\n", | |
| "cohort_group_size = cohorts['TotalUsers'].groupby(level=0).first()\n", | |
| "cohort_group_size.head()\n", | |
| "\n", | |
| "# To illustrate what unstack does, recall the first five TotalUsers values:\n", | |
| "cohorts['TotalUsers'].head()\n", | |
| "# And here's what they look like when we unstack the CohortGroup level from the index:\n", | |
| "cohorts['TotalUsers'].unstack(0).head()\n", | |
| "# Now, we can utilize broadcasting to divide each column by the corresponding cohort_group_size.\n", | |
| "user_retention = cohorts['TotalUsers'].unstack(0).divide(cohort_group_size, axis=1)\n", | |
| "user_retention.head(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Finally, we can plot the cohorts over time in an effort to spot behavioral differences or similarities.\n", | |
| "user_retention[['2009-06', '2009-07', '2009-08']].plot(figsize=(10,5))\n", | |
| "plt.title('Cohorts: User Retention')\n", | |
| "plt.xticks(np.arange(1, 12.1, 1))\n", | |
| "plt.xlim(1, 12)\n", | |
| "plt.ylabel('% of Cohort Purchasing');" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Creating heatmaps in matplotlib is more difficult than it should be.\n", | |
| "# Thankfully, Seaborn makes them easy for us.\n", | |
| "# http://stanford.edu/~mwaskom/software/seaborn/\n", | |
| "\n", | |
| "import seaborn as sns\n", | |
| "sns.set(style='white')\n", | |
| "\n", | |
| "plt.figure(figsize=(12, 8))\n", | |
| "plt.title('Cohorts: User Retention')\n", | |
| "sns.heatmap(user_retention.T, mask=user_retention.T.isnull(), annot=True, fmt='.0%');" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment