DustinAlandzes · June 28, 2017 22:15
diff --git a/cohort.ipynb b/cohort.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "url = \"http://dmanalytics.org/wp-content/uploads/2014/10/chapter-12-relay-foods.xlsx\"\n",
    "r = requests.get(url)\n",
    "print(r.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### 1. Create a period column based on the OrderDate ###\n",
    "df['OrderPeriod'] = df.OrderDate.apply(lambda x: x.strftime('%Y-%m'))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### 2. Determine hte user's cohort group (based on their first order) ###\n",
    "df.set_index('UserId', inplace=True)\n",
    "\n",
    "df['CohortGroup'] = df.groupby(level=0)['OrderDate'].min().apply(lambda x: x.strftime('%Y-%m'))\n",
    "df.reset_index(inplace=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### 3. Rollup data by CohortGroup & OrderPeriod ###\n",
    "grouped = df.groupby(['CohortGroup', 'OrderPeriod'])\n",
    "\n",
    "# count the unique users, orders, and total revenue per Group + Period\n",
    "cohorts = grouped.agg({'UserId': pd.Series.nunique,\n",
    "                       'OrderId': pd.Series.nunique,\n",
    "                       'TotalCharges': np.sum})\n",
    "\n",
    "# make the column names more meaningful\n",
    "cohorts.rename(columns={'UserId': 'TotalUsers',\n",
    "                        'OrderId': 'TotalOrders'}, inplace=True)\n",
    "cohorts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### 4. Label the CohortPeriod for each CohortGroup ####\n",
    "def cohort_period(df):\n",
    "    \"\"\"\n",
    "    Creates a `CohortPeriod` column, which is the Nth period based on the user's first purchase.\n",
    "    \n",
    "    Example\n",
    "    -------\n",
    "    Say you want to get the 3rd month for every user:\n",
    "        df.sort(['UserId', 'OrderTime', inplace=True)\n",
    "        df = df.groupby('UserId').apply(cohort_period)\n",
    "        df[df.CohortPeriod == 3]\n",
    "    \"\"\"\n",
    "    df['CohortPeriod'] = np.arange(len(df)) + 1\n",
    "    return df\n",
    "\n",
    "cohorts = cohorts.groupby(level=0).apply(cohort_period)\n",
    "cohorts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### 5. Make sure we did all that right ###\n",
    "x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-01')]\n",
    "y = cohorts.ix[('2009-01', '2009-01')]\n",
    "\n",
    "assert(x['UserId'].nunique() == y['TotalUsers'])\n",
    "assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
    "assert(x['OrderId'].nunique() == y['TotalOrders'])\n",
    "\n",
    "x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-09')]\n",
    "y = cohorts.ix[('2009-01', '2009-09')]\n",
    "\n",
    "assert(x['UserId'].nunique() == y['TotalUsers'])\n",
    "assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
    "assert(x['OrderId'].nunique() == y['TotalOrders'])\n",
    "\n",
    "x = df[(df.CohortGroup == '2009-05') & (df.OrderPeriod == '2009-09')]\n",
    "y = cohorts.ix[('2009-05', '2009-09')]\n",
    "\n",
    "assert(x['UserId'].nunique() == y['TotalUsers'])\n",
    "assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
    "assert(x['OrderId'].nunique() == y['TotalOrders'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### User Retention by Cohort Group ###\n",
    "# reindex the DataFrame\n",
    "cohorts.reset_index(inplace=True)\n",
    "cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)\n",
    "\n",
    "# create a Series holding the total size of each CohortGroup\n",
    "cohort_group_size = cohorts['TotalUsers'].groupby(level=0).first()\n",
    "cohort_group_size.head()\n",
    "\n",
    "# To illustrate what unstack does, recall the first five TotalUsers values:\n",
    "cohorts['TotalUsers'].head()\n",
    "# And here's what they look like when we unstack the CohortGroup level from the index:\n",
    "cohorts['TotalUsers'].unstack(0).head()\n",
    "# Now, we can utilize broadcasting to divide each column by the corresponding cohort_group_size.\n",
    "user_retention = cohorts['TotalUsers'].unstack(0).divide(cohort_group_size, axis=1)\n",
    "user_retention.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Finally, we can plot the cohorts over time in an effort to spot behavioral differences or similarities.\n",
    "user_retention[['2009-06', '2009-07', '2009-08']].plot(figsize=(10,5))\n",
    "plt.title('Cohorts: User Retention')\n",
    "plt.xticks(np.arange(1, 12.1, 1))\n",
    "plt.xlim(1, 12)\n",
    "plt.ylabel('% of Cohort Purchasing');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Creating heatmaps in matplotlib is more difficult than it should be.\n",
    "# Thankfully, Seaborn makes them easy for us.\n",
    "# http://stanford.edu/~mwaskom/software/seaborn/\n",
    "\n",
    "import seaborn as sns\n",
    "sns.set(style='white')\n",
    "\n",
    "plt.figure(figsize=(12, 8))\n",
    "plt.title('Cohorts: User Retention')\n",
    "sns.heatmap(user_retention.T, mask=user_retention.T.isnull(), annot=True, fmt='.0%');"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import requests\n",
	"\n",
	"url = \"http://dmanalytics.org/wp-content/uploads/2014/10/chapter-12-relay-foods.xlsx\"\n",
	"r = requests.get(url)\n",
	"print(r.text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"### 1. Create a period column based on the OrderDate ###\n",
	"df['OrderPeriod'] = df.OrderDate.apply(lambda x: x.strftime('%Y-%m'))\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"### 2. Determine hte user's cohort group (based on their first order) ###\n",
	"df.set_index('UserId', inplace=True)\n",
	"\n",
	"df['CohortGroup'] = df.groupby(level=0)['OrderDate'].min().apply(lambda x: x.strftime('%Y-%m'))\n",
	"df.reset_index(inplace=True)\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"### 3. Rollup data by CohortGroup & OrderPeriod ###\n",
	"grouped = df.groupby(['CohortGroup', 'OrderPeriod'])\n",
	"\n",
	"# count the unique users, orders, and total revenue per Group + Period\n",
	"cohorts = grouped.agg({'UserId': pd.Series.nunique,\n",
	" 'OrderId': pd.Series.nunique,\n",
	" 'TotalCharges': np.sum})\n",
	"\n",
	"# make the column names more meaningful\n",
	"cohorts.rename(columns={'UserId': 'TotalUsers',\n",
	" 'OrderId': 'TotalOrders'}, inplace=True)\n",
	"cohorts.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"### 4. Label the CohortPeriod for each CohortGroup ####\n",
	"def cohort_period(df):\n",
	" \"\"\"\n",
	" Creates a `CohortPeriod` column, which is the Nth period based on the user's first purchase.\n",
	" \n",
	" Example\n",
	" -------\n",
	" Say you want to get the 3rd month for every user:\n",
	" df.sort(['UserId', 'OrderTime', inplace=True)\n",
	" df = df.groupby('UserId').apply(cohort_period)\n",
	" df[df.CohortPeriod == 3]\n",
	" \"\"\"\n",
	" df['CohortPeriod'] = np.arange(len(df)) + 1\n",
	" return df\n",
	"\n",
	"cohorts = cohorts.groupby(level=0).apply(cohort_period)\n",
	"cohorts.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"### 5. Make sure we did all that right ###\n",
	"x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-01')]\n",
	"y = cohorts.ix[('2009-01', '2009-01')]\n",
	"\n",
	"assert(x['UserId'].nunique() == y['TotalUsers'])\n",
	"assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
	"assert(x['OrderId'].nunique() == y['TotalOrders'])\n",
	"\n",
	"x = df[(df.CohortGroup == '2009-01') & (df.OrderPeriod == '2009-09')]\n",
	"y = cohorts.ix[('2009-01', '2009-09')]\n",
	"\n",
	"assert(x['UserId'].nunique() == y['TotalUsers'])\n",
	"assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
	"assert(x['OrderId'].nunique() == y['TotalOrders'])\n",
	"\n",
	"x = df[(df.CohortGroup == '2009-05') & (df.OrderPeriod == '2009-09')]\n",
	"y = cohorts.ix[('2009-05', '2009-09')]\n",
	"\n",
	"assert(x['UserId'].nunique() == y['TotalUsers'])\n",
	"assert(x['TotalCharges'].sum().round(2) == y['TotalCharges'].round(2))\n",
	"assert(x['OrderId'].nunique() == y['TotalOrders'])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"### User Retention by Cohort Group ###\n",
	"# reindex the DataFrame\n",
	"cohorts.reset_index(inplace=True)\n",
	"cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)\n",
	"\n",
	"# create a Series holding the total size of each CohortGroup\n",
	"cohort_group_size = cohorts['TotalUsers'].groupby(level=0).first()\n",
	"cohort_group_size.head()\n",
	"\n",
	"# To illustrate what unstack does, recall the first five TotalUsers values:\n",
	"cohorts['TotalUsers'].head()\n",
	"# And here's what they look like when we unstack the CohortGroup level from the index:\n",
	"cohorts['TotalUsers'].unstack(0).head()\n",
	"# Now, we can utilize broadcasting to divide each column by the corresponding cohort_group_size.\n",
	"user_retention = cohorts['TotalUsers'].unstack(0).divide(cohort_group_size, axis=1)\n",
	"user_retention.head(10)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Finally, we can plot the cohorts over time in an effort to spot behavioral differences or similarities.\n",
	"user_retention[['2009-06', '2009-07', '2009-08']].plot(figsize=(10,5))\n",
	"plt.title('Cohorts: User Retention')\n",
	"plt.xticks(np.arange(1, 12.1, 1))\n",
	"plt.xlim(1, 12)\n",
	"plt.ylabel('% of Cohort Purchasing');"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Creating heatmaps in matplotlib is more difficult than it should be.\n",
	"# Thankfully, Seaborn makes them easy for us.\n",
	"# http://stanford.edu/~mwaskom/software/seaborn/\n",
	"\n",
	"import seaborn as sns\n",
	"sns.set(style='white')\n",
	"\n",
	"plt.figure(figsize=(12, 8))\n",
	"plt.title('Cohorts: User Retention')\n",
	"sns.heatmap(user_retention.T, mask=user_retention.T.isnull(), annot=True, fmt='.0%');"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}