Skip to content

Instantly share code, notes, and snippets.

@saptarshiguha
Created November 17, 2016 00:40
Show Gist options
  • Select an option

  • Save saptarshiguha/db5747e321d21adafc692abb25b3a466 to your computer and use it in GitHub Desktop.

Select an option

Save saptarshiguha/db5747e321d21adafc692abb25b3a466 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"mainpingspq = sqlContext.read.load(\"s3://telemetry-parquet/main_summary/v3\", \"parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pyspark.sql.functions as F\n",
"import operator\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create relevant variables"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# filter the data by dates that we want\n",
"import datetime\n",
"DATE_1970_01_01 = datetime.datetime(1970, 1, 1)\n",
"START_profile = \"2016-08-14\"\n",
"END_profile = \"2016-08-28\"\n",
"DAYS_START_profile = (datetime.datetime.strptime(START_profile, \"%Y-%m-%d\") - DATE_1970_01_01).days #in days since Jan 1, 1970\n",
"DAYS_END_profile = (datetime.datetime.strptime(END_profile, \"%Y-%m-%d\") - DATE_1970_01_01).days #in days since Jan 1, 1970\n",
"START_ping = '2016-08-14'\n",
"END_ping = '2016-10-15'\n",
"START_s3 = '20160813'\n",
"END_s3 = '20161025'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Subset data by applying filters and selecting columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"u0 = mainpingspq.selectExpr(\"submission_date_s3\",\n",
" \"app_name\",\n",
" \"vendor\",\n",
" \"normalized_channel\",\n",
" \"profile_creation_date\",\n",
" \"subsession_start_date\",\n",
" \"e10s_cohort\",\n",
" \"client_id\",\n",
" \"e10s_enabled\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"u1 = u0.filter(u0.submission_date_s3 >= START_s3)\\\n",
" .filter(u0.submission_date_s3 <= END_s3)\\\n",
" .filter(u0.app_name == 'Firefox')\\\n",
" .filter(u0.vendor == 'Mozilla')\\\n",
" .filter(u0.normalized_channel == 'release')\\\n",
" .filter(u0.profile_creation_date >= DAYS_START_profile)\\\n",
" .filter(u0.profile_creation_date <= DAYS_END_profile)\\\n",
" .filter(u0.subsession_start_date.substr(1,10) >= START_ping)\\\n",
" .filter(u0.subsession_start_date.substr(1,10) <= END_ping)\\\n",
" .filter(u0.e10s_cohort.isin('test','control'))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"u2 = u1.selectExpr(\"client_id\",\n",
" \"profile_creation_date\",\n",
" \"substr(subsession_start_date, 1, 10) as date\",\n",
" \"e10s_enabled\",\n",
" \"e10s_cohort\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"202206844"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# count total rows (may change slightly)\n",
"u2.count() #202,206,844"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(count(client_id)=8643850)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# count unique user_ids (may change slightly)\n",
"u2.agg(countDistinct(u2.client_id)).collect() #8,643,850"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reshape data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# For each client_id, subsession_start_date (as date), get max(profile_creation_date), max(e10s_cohort), min(e10s_cohort), max(e10s_enabled), min(e10s_enabled)\n",
"u3 = u2.groupBy([u2.client_id, u2.date]).agg(F.max(\"profile_creation_date\").alias(\"profile_creation_date\"),\n",
" F.max(\"e10s_cohort\").alias(\"e10s_cohort_max\"),\n",
" F.min(\"e10s_cohort\").alias(\"e10s_cohort_min\"),\n",
" F.max(\"e10s_enabled\").alias(\"e10s_enabled_max\"),\n",
" F.min(\"e10s_enabled\").alias(\"e10s_enabled_min\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get distribution of data"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Rows per client_id\n",
"u4 = u3.groupBy(u3.client_id).agg(F.count(\"*\").alias(\"num_days_active\"))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Distribution per number of days active (better)\n",
"u5 = u4.groupBy(u4.num_days_active).agg(F.count(\"*\").alias(\"cnt\"))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(num_days_active=1, cnt=4204801),\n",
" Row(num_days_active=2, cnt=890262),\n",
" Row(num_days_active=3, cnt=453709),\n",
" Row(num_days_active=4, cnt=313680),\n",
" Row(num_days_active=5, cnt=241930),\n",
" Row(num_days_active=6, cnt=196693),\n",
" Row(num_days_active=7, cnt=165900),\n",
" Row(num_days_active=8, cnt=143558),\n",
" Row(num_days_active=9, cnt=126036),\n",
" Row(num_days_active=10, cnt=111326),\n",
" Row(num_days_active=11, cnt=100943),\n",
" Row(num_days_active=12, cnt=91051),\n",
" Row(num_days_active=13, cnt=83713),\n",
" Row(num_days_active=14, cnt=78057),\n",
" Row(num_days_active=15, cnt=72417),\n",
" Row(num_days_active=16, cnt=67222),\n",
" Row(num_days_active=17, cnt=63339),\n",
" Row(num_days_active=18, cnt=59691),\n",
" Row(num_days_active=19, cnt=56503),\n",
" Row(num_days_active=20, cnt=53782),\n",
" Row(num_days_active=21, cnt=51441),\n",
" Row(num_days_active=22, cnt=50137),\n",
" Row(num_days_active=23, cnt=47469),\n",
" Row(num_days_active=24, cnt=45686),\n",
" Row(num_days_active=25, cnt=44261),\n",
" Row(num_days_active=26, cnt=43156),\n",
" Row(num_days_active=27, cnt=42095),\n",
" Row(num_days_active=28, cnt=40619),\n",
" Row(num_days_active=29, cnt=40561),\n",
" Row(num_days_active=30, cnt=39174),\n",
" Row(num_days_active=31, cnt=37877),\n",
" Row(num_days_active=32, cnt=37045),\n",
" Row(num_days_active=33, cnt=36022),\n",
" Row(num_days_active=34, cnt=35887),\n",
" Row(num_days_active=35, cnt=34965),\n",
" Row(num_days_active=36, cnt=32984),\n",
" Row(num_days_active=37, cnt=31514),\n",
" Row(num_days_active=38, cnt=29389),\n",
" Row(num_days_active=39, cnt=27648),\n",
" Row(num_days_active=40, cnt=26283),\n",
" Row(num_days_active=41, cnt=25255),\n",
" Row(num_days_active=42, cnt=24126),\n",
" Row(num_days_active=43, cnt=22536),\n",
" Row(num_days_active=44, cnt=21242),\n",
" Row(num_days_active=45, cnt=19508),\n",
" Row(num_days_active=46, cnt=17694),\n",
" Row(num_days_active=47, cnt=17120),\n",
" Row(num_days_active=48, cnt=17024),\n",
" Row(num_days_active=49, cnt=17629),\n",
" Row(num_days_active=50, cnt=16640),\n",
" Row(num_days_active=51, cnt=15685),\n",
" Row(num_days_active=52, cnt=13770),\n",
" Row(num_days_active=53, cnt=11460),\n",
" Row(num_days_active=54, cnt=9984),\n",
" Row(num_days_active=55, cnt=8890),\n",
" Row(num_days_active=56, cnt=7619),\n",
" Row(num_days_active=57, cnt=6794),\n",
" Row(num_days_active=58, cnt=5898),\n",
" Row(num_days_active=59, cnt=5092),\n",
" Row(num_days_active=60, cnt=4031),\n",
" Row(num_days_active=61, cnt=3270),\n",
" Row(num_days_active=62, cnt=2343),\n",
" Row(num_days_active=63, cnt=1415)]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u5.orderBy(u5.num_days_active).collect()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Take a small sample to see what it looks like"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(client_id=u'000068d1-0677-4db0-8b7e-9b97c597c486', date=u'2016-08-23', profile_creation_date=17036, e10s_cohort_max=u'control', e10s_cohort_min=u'control', e10s_enabled_max=False, e10s_enabled_min=False),\n",
" Row(client_id=u'00007aab-2946-4279-8774-15d8d94289c3', date=u'2016-09-23', profile_creation_date=17041, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'0000b7d5-d45b-467c-a0d8-a4f2f264be2a', date=u'2016-08-29', profile_creation_date=17029, e10s_cohort_max=u'control', e10s_cohort_min=u'control', e10s_enabled_max=False, e10s_enabled_min=False)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u3.take(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make into RDD"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"r3 = u3.rdd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Group by client_id"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# mapValues(list) necessary to convert the iterable object into an actual list\n",
"\n",
"# for each client, get a list of rows containing all attributes\n",
"r6 = r3.groupBy(lambda x: x.client_id).mapValues(list)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"x = r6.take(2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'ec8ea196-d2a6-400b-8c01-da0daeae6a23',\n",
" [Row(client_id=u'ec8ea196-d2a6-400b-8c01-da0daeae6a23', date=u'2016-08-25', profile_creation_date=17038, e10s_cohort_max=u'control', e10s_cohort_min=u'control', e10s_enabled_max=False, e10s_enabled_min=False)]),\n",
" (u'566e988a-0dc1-4aa3-bc28-5a8754843c2f',\n",
" [Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-09-22', profile_creation_date=17028, e10s_cohort_max=u'control', e10s_cohort_min=u'control', e10s_enabled_max=False, e10s_enabled_min=False),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-12', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-09-25', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-13', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-10', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-09-23', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-14', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-09', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-09-24', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-09-28', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-11', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-09-27', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True),\n",
" Row(client_id=u'566e988a-0dc1-4aa3-bc28-5a8754843c2f', date=u'2016-10-08', profile_creation_date=17028, e10s_cohort_max=u'test', e10s_cohort_min=u'test', e10s_enabled_max=True, e10s_enabled_min=True)])]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# For each client_id, get profile_creation date (single number) and all attributes as lists\n",
"## Note: the lists are sorted by the ordering of date (subsession_start_date)"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\"\"\"\n",
"Helper function for the map\n",
"\n",
"For each client_id, get a dictionary with:\n",
"-client_id\n",
"-profile_creation_date\n",
"-a list of subsession_start_dates\n",
"-a list of e10s_cohort_max\n",
"-a list of e10s_cohort_min\n",
"-a list of e10s_enabled_max\n",
"-a list of e10s_enabled_min\n",
"\n",
"These lists are also ordered chronologically from the first date (order is preserved for all lists)\n",
"\"\"\"\n",
"def transform_rows_into_ordered_lists(row):\n",
" d = dict(zip( #dictionary of lists\n",
" [\"date\",\"e10s_cohort_max\",\"e10s_cohort_min\",\"e10s_enabled_max\",\"e10s_enabled_min\"],\n",
" zip(*sorted(zip( #sort the following lists\n",
" [d.date for d in row[1]], #date (subsession_start_date)\n",
" [d.e10s_cohort_max for d in row[1]], #e10s_cohort_max\n",
" [d.e10s_cohort_min for d in row[1]], #e10s_cohort_min\n",
" [d.e10s_enabled_max for d in row[1]], #e10s_enabled_max\n",
" [d.e10s_enabled_min for d in row[1]]), #e10s_enabled_min\n",
" key=operator.itemgetter(0)))))\n",
" d['client_id'] = row[0]\n",
" d['profile_creation_date'] = row[1][0][2]\n",
" return d"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"r7 = r6.map(transform_rows_into_ordered_lists) #by the ordering of the first list (date)"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[{'client_id': u'93a7731c-f26b-4aa4-82d4-0f95e5ffb6c5',\n",
" 'date': (u'2016-08-18',\n",
" u'2016-08-19',\n",
" u'2016-08-23',\n",
" u'2016-08-29',\n",
" u'2016-09-01',\n",
" u'2016-09-12',\n",
" u'2016-09-15',\n",
" u'2016-09-16',\n",
" u'2016-09-19',\n",
" u'2016-09-22',\n",
" u'2016-09-23',\n",
" u'2016-09-26',\n",
" u'2016-09-28',\n",
" u'2016-10-06',\n",
" u'2016-10-10',\n",
" u'2016-10-13'),\n",
" 'e10s_cohort_max': (u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test'),\n",
" 'e10s_cohort_min': (u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'control',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test',\n",
" u'test'),\n",
" 'e10s_enabled_max': (False,\n",
" False,\n",
" False,\n",
" False,\n",
" False,\n",
" False,\n",
" False,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True),\n",
" 'e10s_enabled_min': (False,\n",
" False,\n",
" False,\n",
" False,\n",
" False,\n",
" False,\n",
" False,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True,\n",
" True),\n",
" 'profile_creation_date': 17031}]"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r7.take(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each `client_id` in **r7** now has elements stored in a dictionary:\n",
"\n",
"- row['client_id'] -> client_id\n",
"- row['profile_creation_date'] -> profile_creation_date\n",
"- row['date'] -> list of date (subsession_start_date)\n",
"- row['e10s_cohort_max'] -> list of e10s_cohort_max\n",
"- row['e10s_cohort_min'] -> list of e10s_cohort_min\n",
"- row['e10s_enabled_max'] -> list of e10s_enabled_max\n",
"- row['e10s_enabled_min'] -> list of e10s_enabled_min\n",
"\n",
"**Note**: the 5 lists in this dictionary are *ordered* chronologically since the first subsession_start_date\n",
"\n",
"A `client_id` is in 'ignore' cohort iff\n",
"\n",
"- there was a switch in cohorts throughout the experiment\n",
"\n",
"A `client_id` is in 'control' cohort iff\n",
"\n",
"- there was no switch in cohorts\n",
"- e10s_cohort == 'control' for all entries\n",
"- e10s_enabled stays False (seems to follow suit)\n",
"\n",
"A `client_id` is in 'variation' cohort iff\n",
"\n",
"- there was no switch in cohorts\n",
"- e10s_enabled == True within 3 days of profile_creation_date\n",
"- e10s_enabled stays True for the rest of the list (since ordered by date)\n",
"- e10s_cohort stays 'test' for the rest of the list (seems to follow suit)"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\"\"\"\n",
"Determine whether a client_id has switched cohorts\n",
"We want to keep client_ids where, throughout the experiment, the e10s_cohort has remained the same\n",
"This is equivalent to saying that the set of e10s_cohort_max/min contains only one value\n",
"\"\"\"\n",
"def client_one_cohort(client):\n",
" if len(set(client['e10s_cohort_max']+client['e10s_cohort_min']))<=1:\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"\"\"\"\n",
"Determine whether a client_id is 'variation' or 'control'\n",
"If client_id passes the client_one_cohort() test, then all values in the lists e10s_cohort_max/min are the same\n",
" We therefore only need to check the first to determine the e10s_cohort\n",
"Also, we need that e10s_enabled_max/min has not varied throughout the experiment\n",
" We therefore check that the set of e10s_enabled_max/min contains only one value\n",
" For 'control', we also need to check that e10s_enabled is False, so we check e10s_enabled_max\n",
" For 'variation', we also need to check that e10s_enabled is True, so we check e10s_enabled_min\n",
"To determine whether the profile in 'test' switched e10s_enabled to True within 3 days of profile creation,\n",
"we check that the last e10s_enabled is True, and then get the date of the last switch in e10s_enabled\n",
" If this switch was within 3 days of profile creation, then the client_id is in 'variation'\n",
"\"\"\"\n",
"# np.where(np.array(client['e10s_enabled_min'][:-1]) != np.array(client['e10s_enabled_min'][1:]))[0][-1] is the last index of the change in e10s_enabled\n",
"def variation_or_control(client):\n",
" if client_one_cohort(client):\n",
" if client['e10s_cohort_max'][0] == 'control' and client['e10s_enabled_max'][0] == False and len(set(client['e10s_enabled_max']+client['e10s_enabled_min'])) <= 1:\n",
" return \"control\"\n",
" elif client['e10s_cohort_max'][0] == 'test' and client['e10s_enabled_min'][0] == True and len(set(client['e10s_enabled_max'] +client['e10s_enabled_min'])) <= 1:\n",
" return \"variation\"\n",
" elif client['e10s_cohort_max'][-1] == 'test' and client['e10s_enabled_min'][-1] == True and ((datetime.datetime.strptime(client['date'][np.where(np.array(client['e10s_enabled_min'][:-1]) != np.array(client['e10s_enabled_min'][1:]))[0][-1]], \"%Y-%m-%d\") - datetime.datetime(1970, 1, 1)).days - client['profile_creation_date']) <= 3:\n",
" return\"variation\"\n",
" else:\n",
" return \"changed_e10s_enabled\"\n",
" else:\n",
" return \"changed_e10s_cohort\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get some counts"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"8643851"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of unique client_ids at this point\n",
"r7.count() #8,643,851"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"r8 = r7.map(variation_or_control)"
]
},
{
"cell_type": "code",
"execution_count": 332,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"5673668"
]
},
"execution_count": 332,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of client_ids in 'control'\n",
"r8.filter(lambda x: x=='control').count() #5,673,668"
]
},
{
"cell_type": "code",
"execution_count": 333,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"420410"
]
},
"execution_count": 333,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of client_ids in 'variation'\n",
"r8.filter(lambda x: x=='variation').count() #420,410"
]
},
{
"cell_type": "code",
"execution_count": 334,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"214278"
]
},
"execution_count": 334,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of client_ids that changed e10s_enabled outside 3 days limit\n",
"r8.filter(lambda x: x=='changed_e10s_enabled').count() #214,278"
]
},
{
"cell_type": "code",
"execution_count": 335,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2335495"
]
},
"execution_count": 335,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of client_ids that changed e10s_cohort\n",
"r8.filter(lambda x: x=='changed_e10s_cohort').count() #2,335,495"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get the counts per day"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1) Get the group/cohort\n",
"\n",
"2) If it's 'control' or 'variation':\n",
" - (cohort, profile_creation_date, date) -> 1/0 (if user was active or not that **week**)\n",
"3) If not:\n",
" - ignore"
]
},
{
"cell_type": "code",
"execution_count": 336,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def generate_counts(client):\n",
" cohort = variation_or_control(client)\n",
" if cohort in [\"control\", \"variation\"]:\n",
" # profile_creation_date in YYYY-MM-DD format\n",
" pcd = DATE_1970_01_01 + datetime.timedelta(days=client['profile_creation_date'])\n",
" \n",
" week = range(11)\n",
" active = [0]*10\n",
" \n",
" for d in client['date']:\n",
" # indicate that profile was created (week0 counts profile creations)\n",
" active[0] = 1\n",
" # find week that contains the date\n",
" w = 1\n",
" while d > (pcd+datetime.timedelta(days=w*7-1)).strftime(\"%Y-%m-%d\"):\n",
" w+=1\n",
" # if the entry is 0, change it to 1\n",
" if active[w] == 0:\n",
" active[w] = 1\n",
" \n",
" for w, a in zip(week, active):\n",
" yield (pcd.strftime(\"%Y-%m-%d\"), cohort, w), a\n",
" else:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 337,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"r9 = r7.flatMap(generate_counts).reduceByKey(operator.add)"
]
},
{
"cell_type": "code",
"execution_count": 338,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"res = r9.collect()"
]
},
{
"cell_type": "code",
"execution_count": 324,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"300"
]
},
"execution_count": 324,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"\n",
"We should have 2*10*15 = 300 entries\n",
" 2 cohorts\n",
" 9 weeks of experiment\n",
" 15 days for profile creation\n",
"\"\"\"\n",
"len(res)"
]
},
{
"cell_type": "code",
"execution_count": 339,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(('2016-08-22', 'variation', 0), 41155),\n",
" (('2016-08-21', 'control', 2), 29898),\n",
" (('2016-08-22', 'control', 4), 20613),\n",
" (('2016-08-28', 'variation', 0), 25539),\n",
" (('2016-08-27', 'variation', 8), 4446),\n",
" (('2016-08-24', 'variation', 0), 43549),\n",
" (('2016-08-15', 'variation', 7), 5939),\n",
" (('2016-08-17', 'variation', 0), 16006),\n",
" (('2016-08-23', 'variation', 1), 29198),\n",
" (('2016-08-14', 'control', 6), 7754),\n",
" (('2016-08-26', 'variation', 1), 27451),\n",
" (('2016-08-19', 'control', 8), 8526),\n",
" (('2016-08-20', 'variation', 2), 10226),\n",
" (('2016-08-25', 'variation', 1), 30276),\n",
" (('2016-08-25', 'control', 0), 436623),\n",
" (('2016-08-16', 'variation', 1), 3946),\n",
" (('2016-08-27', 'control', 5), 6689),\n",
" (('2016-08-20', 'control', 1), 301287),\n",
" (('2016-08-18', 'variation', 6), 8422),\n",
" (('2016-08-22', 'variation', 8), 14611),\n",
" (('2016-08-17', 'control', 4), 21677),\n",
" (('2016-08-27', 'variation', 0), 27666),\n",
" (('2016-08-21', 'variation', 3), 8747),\n",
" (('2016-08-24', 'variation', 8), 12483),\n",
" (('2016-08-28', 'control', 4), 7943),\n",
" (('2016-08-23', 'variation', 9), 0),\n",
" (('2016-08-26', 'control', 6), 9365),\n",
" (('2016-08-18', 'control', 7), 11118),\n",
" (('2016-08-26', 'variation', 9), 0),\n",
" (('2016-08-19', 'control', 0), 395311),\n",
" (('2016-08-25', 'control', 8), 5577),\n",
" (('2016-08-15', 'control', 1), 374949),\n",
" (('2016-08-19', 'variation', 6), 12536),\n",
" (('2016-08-23', 'control', 0), 438756),\n",
" (('2016-08-24', 'variation', 7), 15858),\n",
" (('2016-08-14', 'variation', 7), 3376),\n",
" (('2016-08-24', 'control', 7), 12788),\n",
" (('2016-08-20', 'control', 8), 5262),\n",
" (('2016-08-16', 'control', 4), 21267),\n",
" (('2016-08-27', 'variation', 9), 0),\n",
" (('2016-08-27', 'control', 2), 17976),\n",
" (('2016-08-15', 'variation', 6), 5224),\n",
" (('2016-08-22', 'variation', 3), 15433),\n",
" (('2016-08-21', 'control', 3), 14618),\n",
" (('2016-08-15', 'control', 9), 7771),\n",
" (('2016-08-28', 'variation', 7), 8175),\n",
" (('2016-08-22', 'control', 3), 24681),\n",
" (('2016-08-23', 'control', 8), 8784),\n",
" (('2016-08-26', 'variation', 0), 38394),\n",
" (('2016-08-17', 'variation', 3), 4249),\n",
" (('2016-08-25', 'control', 1), 422695),\n",
" (('2016-08-23', 'variation', 2), 16666),\n",
" (('2016-08-14', 'control', 7), 6018),\n",
" (('2016-08-18', 'control', 0), 424235),\n",
" (('2016-08-18', 'variation', 7), 8944),\n",
" (('2016-08-20', 'control', 0), 307812),\n",
" (('2016-08-20', 'variation', 1), 18886),\n",
" (('2016-08-25', 'variation', 6), 15865),\n",
" (('2016-08-27', 'variation', 1), 20974),\n",
" (('2016-08-16', 'variation', 2), 3411),\n",
" (('2016-08-17', 'control', 3), 30914),\n",
" (('2016-08-21', 'variation', 0), 23270),\n",
" (('2016-08-26', 'variation', 8), 8443),\n",
" (('2016-08-28', 'control', 7), 4340),\n",
" (('2016-08-25', 'control', 9), 0),\n",
" (('2016-08-26', 'control', 5), 11445),\n",
" (('2016-08-18', 'control', 8), 9595),\n",
" (('2016-08-15', 'control', 0), 389692),\n",
" (('2016-08-20', 'variation', 9), 3837),\n",
" (('2016-08-19', 'control', 3), 23552),\n",
" (('2016-08-24', 'variation', 6), 16507),\n",
" (('2016-08-21', 'control', 4), 11034),\n",
" (('2016-08-19', 'variation', 5), 12081),\n",
" (('2016-08-23', 'control', 1), 424462),\n",
" (('2016-08-21', 'variation', 8), 7334),\n",
" (('2016-08-14', 'variation', 4), 2290),\n",
" (('2016-08-24', 'control', 6), 15173),\n",
" (('2016-08-14', 'control', 8), 5085),\n",
" (('2016-08-27', 'control', 3), 12014),\n",
" (('2016-08-16', 'control', 5), 18464),\n",
" (('2016-08-22', 'control', 2), 47248),\n",
" (('2016-08-28', 'variation', 6), 8653),\n",
" (('2016-08-15', 'control', 8), 9077),\n",
" (('2016-08-15', 'variation', 5), 4558),\n",
" (('2016-08-22', 'variation', 2), 16378),\n",
" (('2016-08-23', 'control', 9), 0),\n",
" (('2016-08-18', 'control', 1), 407629),\n",
" (('2016-08-26', 'variation', 3), 14560),\n",
" (('2016-08-17', 'variation', 2), 3509),\n",
" (('2016-08-24', 'control', 9), 0),\n",
" (('2016-08-25', 'variation', 7), 15014),\n",
" (('2016-08-25', 'control', 2), 34793),\n",
" (('2016-08-23', 'variation', 3), 16000),\n",
" (('2016-08-14', 'control', 0), 269373),\n",
" (('2016-08-18', 'variation', 4), 7471),\n",
" (('2016-08-20', 'control', 3), 16185),\n",
" (('2016-08-20', 'variation', 0), 25228),\n",
" (('2016-08-17', 'control', 2), 53099),\n",
" (('2016-08-27', 'variation', 2), 11497),\n",
" (('2016-08-16', 'variation', 3), 4122),\n",
" (('2016-08-23', 'control', 6), 13053),\n",
" (('2016-08-21', 'variation', 1), 17882),\n",
" (('2016-08-18', 'control', 9), 5422),\n",
" (('2016-08-28', 'control', 6), 4695),\n",
" (('2016-08-24', 'control', 1), 421743),\n",
" (('2016-08-26', 'control', 4), 14748),\n",
" (('2016-08-21', 'control', 5), 8566),\n",
" (('2016-08-22', 'control', 9), 0),\n",
" (('2016-08-15', 'control', 3), 32137),\n",
" (('2016-08-19', 'control', 2), 45753),\n",
" (('2016-08-20', 'variation', 8), 7951),\n",
" (('2016-08-24', 'variation', 5), 16410),\n",
" (('2016-08-19', 'variation', 4), 12161),\n",
" (('2016-08-14', 'control', 9), 4707),\n",
" (('2016-08-21', 'variation', 9), 0),\n",
" (('2016-08-14', 'variation', 5), 2501),\n",
" (('2016-08-23', 'variation', 4), 15542),\n",
" (('2016-08-27', 'control', 0), 298538),\n",
" (('2016-08-16', 'control', 2), 55126),\n",
" (('2016-08-28', 'variation', 5), 9252),\n",
" (('2016-08-22', 'control', 1), 419578),\n",
" (('2016-08-16', 'variation', 4), 4932),\n",
" (('2016-08-15', 'variation', 4), 4112),\n",
" (('2016-08-22', 'variation', 5), 15654),\n",
" (('2016-08-24', 'control', 8), 8659),\n",
" (('2016-08-21', 'variation', 6), 8185),\n",
" (('2016-08-14', 'control', 1), 260949),\n",
" (('2016-08-18', 'control', 2), 49677),\n",
" (('2016-08-26', 'variation', 2), 15458),\n",
" (('2016-08-17', 'variation', 5), 5325),\n",
" (('2016-08-25', 'variation', 4), 15730),\n",
" (('2016-08-25', 'control', 3), 22060),\n",
" (('2016-08-26', 'control', 3), 19069),\n",
" (('2016-08-27', 'control', 8), 1350),\n",
" (('2016-08-18', 'variation', 5), 7703),\n",
" (('2016-08-20', 'control', 2), 33312),\n",
" (('2016-08-20', 'variation', 7), 8526),\n",
" (('2016-08-23', 'control', 7), 11075),\n",
" (('2016-08-17', 'control', 1), 417002),\n",
" (('2016-08-27', 'variation', 3), 10212),\n",
" (('2016-08-24', 'control', 0), 445407),\n",
" (('2016-08-28', 'control', 1), 275460),\n",
" (('2016-08-21', 'control', 6), 6480),\n",
" (('2016-08-22', 'control', 8), 9592),\n",
" (('2016-08-15', 'control', 2), 50845),\n",
" (('2016-08-19', 'control', 5), 15306),\n",
" (('2016-08-17', 'control', 9), 7238),\n",
" (('2016-08-24', 'variation', 4), 15764),\n",
" (('2016-08-15', 'variation', 3), 3290),\n",
" (('2016-08-19', 'variation', 3), 12234),\n",
" (('2016-08-23', 'variation', 5), 16171),\n",
" (('2016-08-26', 'variation', 5), 14524),\n",
" (('2016-08-16', 'control', 3), 33160),\n",
" (('2016-08-28', 'control', 9), 0),\n",
" (('2016-08-14', 'variation', 2), 1790),\n",
" (('2016-08-16', 'variation', 5), 5380),\n",
" (('2016-08-27', 'control', 1), 293382),\n",
" (('2016-08-18', 'variation', 2), 6816),\n",
" (('2016-08-20', 'control', 5), 9908),\n",
" (('2016-08-22', 'variation', 4), 15151),\n",
" (('2016-08-28', 'variation', 4), 9284),\n",
" (('2016-08-22', 'control', 0), 434761),\n",
" (('2016-08-21', 'variation', 7), 7796),\n",
" (('2016-08-17', 'variation', 4), 4817),\n",
" (('2016-08-26', 'control', 2), 27355),\n",
" (('2016-08-14', 'control', 2), 35682),\n",
" (('2016-08-18', 'control', 3), 27252),\n",
" (('2016-08-20', 'variation', 6), 8853),\n",
" (('2016-08-25', 'variation', 5), 15978),\n",
" (('2016-08-25', 'control', 4), 17592),\n",
" (('2016-08-27', 'control', 9), 0),\n",
" (('2016-08-15', 'control', 5), 16465),\n",
" (('2016-08-23', 'control', 4), 20910),\n",
" (('2016-08-17', 'control', 0), 435088),\n",
" (('2016-08-27', 'variation', 4), 10116),\n",
" (('2016-08-28', 'control', 0), 279793),\n",
" (('2016-08-24', 'control', 3), 25293),\n",
" (('2016-08-16', 'control', 8), 10247),\n",
" (('2016-08-19', 'control', 4), 18709),\n",
" (('2016-08-15', 'variation', 2), 2735),\n",
" (('2016-08-21', 'control', 7), 5345),\n",
" (('2016-08-22', 'control', 7), 11047),\n",
" (('2016-08-28', 'variation', 3), 9626),\n",
" (('2016-08-19', 'variation', 2), 13003),\n",
" (('2016-08-17', 'control', 8), 10716),\n",
" (('2016-08-24', 'variation', 3), 16002),\n",
" (('2016-08-26', 'variation', 4), 14174),\n",
" (('2016-08-14', 'variation', 3), 1939),\n",
" (('2016-08-28', 'control', 8), 0),\n",
" (('2016-08-26', 'control', 9), 0),\n",
" (('2016-08-23', 'variation', 6), 16409),\n",
" (('2016-08-20', 'control', 4), 12376),\n",
" (('2016-08-18', 'variation', 3), 7081),\n",
" (('2016-08-16', 'control', 0), 428364),\n",
" (('2016-08-25', 'variation', 2), 17136),\n",
" (('2016-08-16', 'variation', 6), 6164),\n",
" (('2016-08-27', 'control', 6), 5388),\n",
" (('2016-08-22', 'variation', 7), 15460),\n",
" (('2016-08-17', 'control', 7), 12328),\n",
" (('2016-08-21', 'variation', 4), 8119),\n",
" (('2016-08-17', 'variation', 7), 6770),\n",
" (('2016-08-25', 'control', 5), 13664),\n",
" (('2016-08-26', 'control', 1), 393355),\n",
" (('2016-08-14', 'control', 3), 21276),\n",
" (('2016-08-18', 'control', 4), 20381),\n",
" (('2016-08-15', 'control', 4), 19276),\n",
" (('2016-08-20', 'variation', 5), 8721),\n",
" (('2016-08-27', 'variation', 5), 10172),\n",
" (('2016-08-15', 'variation', 9), 6134),\n",
" (('2016-08-23', 'control', 5), 16433),\n",
" (('2016-08-14', 'variation', 8), 3372),\n",
" (('2016-08-28', 'control', 3), 10786),\n",
" (('2016-08-24', 'control', 2), 40670),\n",
" (('2016-08-18', 'variation', 8), 8750),\n",
" (('2016-08-16', 'control', 9), 8008),\n",
" (('2016-08-28', 'variation', 2), 10854),\n",
" (('2016-08-22', 'control', 6), 13139),\n",
" (('2016-08-19', 'control', 7), 9883),\n",
" (('2016-08-24', 'variation', 2), 16575),\n",
" (('2016-08-15', 'variation', 1), 3377),\n",
" (('2016-08-21', 'control', 0), 285779),\n",
" (('2016-08-19', 'variation', 1), 22889),\n",
" (('2016-08-26', 'variation', 7), 13761),\n",
" (('2016-08-25', 'variation', 3), 16013),\n",
" (('2016-08-14', 'variation', 0), 8656),\n",
" (('2016-08-26', 'control', 8), 3577),\n",
" (('2016-08-23', 'variation', 7), 16022),\n",
" (('2016-08-14', 'control', 4), 12492),\n",
" (('2016-08-27', 'control', 7), 4824),\n",
" (('2016-08-20', 'control', 7), 6038),\n",
" (('2016-08-18', 'variation', 0), 22234),\n",
" (('2016-08-16', 'control', 1), 411311),\n",
" (('2016-08-17', 'control', 6), 14779),\n",
" (('2016-08-16', 'variation', 7), 7001),\n",
" (('2016-08-22', 'variation', 6), 15991),\n",
" (('2016-08-21', 'control', 8), 4898),\n",
" (('2016-08-19', 'variation', 9), 7303),\n",
" (('2016-08-21', 'variation', 5), 8158),\n",
" (('2016-08-18', 'control', 5), 16859),\n",
" (('2016-08-17', 'variation', 6), 6181),\n",
" (('2016-08-25', 'control', 6), 11224),\n",
" (('2016-08-26', 'control', 0), 404136),\n",
" (('2016-08-28', 'variation', 9), 0),\n",
" (('2016-08-15', 'control', 7), 10903),\n",
" (('2016-08-20', 'variation', 4), 8877),\n",
" (('2016-08-27', 'variation', 6), 9674),\n",
" (('2016-08-15', 'variation', 8), 6211),\n",
" (('2016-08-23', 'control', 2), 43786),\n",
" (('2016-08-17', 'variation', 9), 5752),\n",
" (('2016-08-25', 'variation', 8), 11089),\n",
" (('2016-08-14', 'variation', 9), 3327),\n",
" (('2016-08-28', 'control', 2), 16178),\n",
" (('2016-08-24', 'control', 5), 18306),\n",
" (('2016-08-18', 'variation', 9), 6532),\n",
" (('2016-08-16', 'control', 6), 14707),\n",
" (('2016-08-21', 'control', 1), 280516),\n",
" (('2016-08-28', 'variation', 1), 19914),\n",
" (('2016-08-22', 'control', 5), 16832),\n",
" (('2016-08-19', 'control', 6), 11877),\n",
" (('2016-08-16', 'variation', 8), 7131),\n",
" (('2016-08-24', 'variation', 1), 29457),\n",
" (('2016-08-15', 'variation', 0), 14357),\n",
" (('2016-08-22', 'variation', 1), 27730),\n",
" (('2016-08-19', 'variation', 0), 33669),\n",
" (('2016-08-14', 'control', 5), 10013),\n",
" (('2016-08-26', 'variation', 6), 14251),\n",
" (('2016-08-17', 'variation', 1), 4135),\n",
" (('2016-08-25', 'variation', 0), 42308),\n",
" (('2016-08-14', 'variation', 1), 2836),\n",
" (('2016-08-23', 'variation', 0), 42177),\n",
" (('2016-08-27', 'control', 4), 8824),\n",
" (('2016-08-20', 'control', 6), 7048),\n",
" (('2016-08-18', 'variation', 1), 10439),\n",
" (('2016-08-20', 'variation', 3), 9281),\n",
" (('2016-08-19', 'control', 9), 3600),\n",
" (('2016-08-21', 'control', 9), 0),\n",
" (('2016-08-17', 'control', 5), 18733),\n",
" (('2016-08-16', 'variation', 0), 16202),\n",
" (('2016-08-24', 'variation', 9), 0),\n",
" (('2016-08-22', 'variation', 9), 0),\n",
" (('2016-08-19', 'variation', 8), 11857),\n",
" (('2016-08-21', 'variation', 2), 9571),\n",
" (('2016-08-18', 'control', 6), 13158),\n",
" (('2016-08-28', 'control', 5), 5724),\n",
" (('2016-08-25', 'control', 7), 9709),\n",
" (('2016-08-26', 'control', 7), 8353),\n",
" (('2016-08-23', 'variation', 8), 14030),\n",
" (('2016-08-15', 'control', 6), 13417),\n",
" (('2016-08-28', 'variation', 8), 0),\n",
" (('2016-08-19', 'control', 1), 382059),\n",
" (('2016-08-23', 'control', 3), 25100),\n",
" (('2016-08-27', 'variation', 7), 9221),\n",
" (('2016-08-17', 'variation', 8), 6925),\n",
" (('2016-08-19', 'variation', 7), 12356),\n",
" (('2016-08-24', 'control', 4), 22173),\n",
" (('2016-08-16', 'control', 7), 12203),\n",
" (('2016-08-25', 'variation', 9), 0),\n",
" (('2016-08-14', 'variation', 6), 2844),\n",
" (('2016-08-16', 'variation', 9), 6460),\n",
" (('2016-08-20', 'control', 9), 1520)]"
]
},
"execution_count": 339,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Transform into CSV format"
]
},
{
"cell_type": "code",
"execution_count": 340,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"l_res = [[elem for elem in row[0]]+[row[1]] for row in res]"
]
},
{
"cell_type": "code",
"execution_count": 341,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 342,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(l_res, columns=[\"Profile_Creation_Date\",\"Cohort\",\"Week\",\"Count\"])"
]
},
{
"cell_type": "code",
"execution_count": 343,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = df.sort_values(['Profile_Creation_Date','Week','Cohort'])"
]
},
{
"cell_type": "code",
"execution_count": 344,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pivoted = df.set_index(['Profile_Creation_Date','Week','Cohort']).unstack([\"Week\",\"Cohort\"])"
]
},
{
"cell_type": "code",
"execution_count": 345,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"20\" halign=\"left\">Count</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Week</th>\n",
" <th colspan=\"2\" halign=\"left\">0</th>\n",
" <th colspan=\"2\" halign=\"left\">1</th>\n",
" <th colspan=\"2\" halign=\"left\">2</th>\n",
" <th colspan=\"2\" halign=\"left\">3</th>\n",
" <th colspan=\"2\" halign=\"left\">4</th>\n",
" <th colspan=\"2\" halign=\"left\">5</th>\n",
" <th colspan=\"2\" halign=\"left\">6</th>\n",
" <th colspan=\"2\" halign=\"left\">7</th>\n",
" <th colspan=\"2\" halign=\"left\">8</th>\n",
" <th colspan=\"2\" halign=\"left\">9</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Cohort</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" <th>control</th>\n",
" <th>variation</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Profile_Creation_Date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2016-08-14</th>\n",
" <td>269373</td>\n",
" <td>8656</td>\n",
" <td>260949</td>\n",
" <td>2836</td>\n",
" <td>35682</td>\n",
" <td>1790</td>\n",
" <td>21276</td>\n",
" <td>1939</td>\n",
" <td>12492</td>\n",
" <td>2290</td>\n",
" <td>10013</td>\n",
" <td>2501</td>\n",
" <td>7754</td>\n",
" <td>2844</td>\n",
" <td>6018</td>\n",
" <td>3376</td>\n",
" <td>5085</td>\n",
" <td>3372</td>\n",
" <td>4707</td>\n",
" <td>3327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-15</th>\n",
" <td>389692</td>\n",
" <td>14357</td>\n",
" <td>374949</td>\n",
" <td>3377</td>\n",
" <td>50845</td>\n",
" <td>2735</td>\n",
" <td>32137</td>\n",
" <td>3290</td>\n",
" <td>19276</td>\n",
" <td>4112</td>\n",
" <td>16465</td>\n",
" <td>4558</td>\n",
" <td>13417</td>\n",
" <td>5224</td>\n",
" <td>10903</td>\n",
" <td>5939</td>\n",
" <td>9077</td>\n",
" <td>6211</td>\n",
" <td>7771</td>\n",
" <td>6134</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-16</th>\n",
" <td>428364</td>\n",
" <td>16202</td>\n",
" <td>411311</td>\n",
" <td>3946</td>\n",
" <td>55126</td>\n",
" <td>3411</td>\n",
" <td>33160</td>\n",
" <td>4122</td>\n",
" <td>21267</td>\n",
" <td>4932</td>\n",
" <td>18464</td>\n",
" <td>5380</td>\n",
" <td>14707</td>\n",
" <td>6164</td>\n",
" <td>12203</td>\n",
" <td>7001</td>\n",
" <td>10247</td>\n",
" <td>7131</td>\n",
" <td>8008</td>\n",
" <td>6460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-17</th>\n",
" <td>435088</td>\n",
" <td>16006</td>\n",
" <td>417002</td>\n",
" <td>4135</td>\n",
" <td>53099</td>\n",
" <td>3509</td>\n",
" <td>30914</td>\n",
" <td>4249</td>\n",
" <td>21677</td>\n",
" <td>4817</td>\n",
" <td>18733</td>\n",
" <td>5325</td>\n",
" <td>14779</td>\n",
" <td>6181</td>\n",
" <td>12328</td>\n",
" <td>6770</td>\n",
" <td>10716</td>\n",
" <td>6925</td>\n",
" <td>7238</td>\n",
" <td>5752</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-18</th>\n",
" <td>424235</td>\n",
" <td>22234</td>\n",
" <td>407629</td>\n",
" <td>10439</td>\n",
" <td>49677</td>\n",
" <td>6816</td>\n",
" <td>27252</td>\n",
" <td>7081</td>\n",
" <td>20381</td>\n",
" <td>7471</td>\n",
" <td>16859</td>\n",
" <td>7703</td>\n",
" <td>13158</td>\n",
" <td>8422</td>\n",
" <td>11118</td>\n",
" <td>8944</td>\n",
" <td>9595</td>\n",
" <td>8750</td>\n",
" <td>5422</td>\n",
" <td>6532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-19</th>\n",
" <td>395311</td>\n",
" <td>33669</td>\n",
" <td>382059</td>\n",
" <td>22889</td>\n",
" <td>45753</td>\n",
" <td>13003</td>\n",
" <td>23552</td>\n",
" <td>12234</td>\n",
" <td>18709</td>\n",
" <td>12161</td>\n",
" <td>15306</td>\n",
" <td>12081</td>\n",
" <td>11877</td>\n",
" <td>12536</td>\n",
" <td>9883</td>\n",
" <td>12356</td>\n",
" <td>8526</td>\n",
" <td>11857</td>\n",
" <td>3600</td>\n",
" <td>7303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-20</th>\n",
" <td>307812</td>\n",
" <td>25228</td>\n",
" <td>301287</td>\n",
" <td>18886</td>\n",
" <td>33312</td>\n",
" <td>10226</td>\n",
" <td>16185</td>\n",
" <td>9281</td>\n",
" <td>12376</td>\n",
" <td>8877</td>\n",
" <td>9908</td>\n",
" <td>8721</td>\n",
" <td>7048</td>\n",
" <td>8853</td>\n",
" <td>6038</td>\n",
" <td>8526</td>\n",
" <td>5262</td>\n",
" <td>7951</td>\n",
" <td>1520</td>\n",
" <td>3837</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-21</th>\n",
" <td>285779</td>\n",
" <td>23270</td>\n",
" <td>280516</td>\n",
" <td>17882</td>\n",
" <td>29898</td>\n",
" <td>9571</td>\n",
" <td>14618</td>\n",
" <td>8747</td>\n",
" <td>11034</td>\n",
" <td>8119</td>\n",
" <td>8566</td>\n",
" <td>8158</td>\n",
" <td>6480</td>\n",
" <td>8185</td>\n",
" <td>5345</td>\n",
" <td>7796</td>\n",
" <td>4898</td>\n",
" <td>7334</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-22</th>\n",
" <td>434761</td>\n",
" <td>41155</td>\n",
" <td>419578</td>\n",
" <td>27730</td>\n",
" <td>47248</td>\n",
" <td>16378</td>\n",
" <td>24681</td>\n",
" <td>15433</td>\n",
" <td>20613</td>\n",
" <td>15151</td>\n",
" <td>16832</td>\n",
" <td>15654</td>\n",
" <td>13139</td>\n",
" <td>15991</td>\n",
" <td>11047</td>\n",
" <td>15460</td>\n",
" <td>9592</td>\n",
" <td>14611</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-23</th>\n",
" <td>438756</td>\n",
" <td>42177</td>\n",
" <td>424462</td>\n",
" <td>29198</td>\n",
" <td>43786</td>\n",
" <td>16666</td>\n",
" <td>25100</td>\n",
" <td>16000</td>\n",
" <td>20910</td>\n",
" <td>15542</td>\n",
" <td>16433</td>\n",
" <td>16171</td>\n",
" <td>13053</td>\n",
" <td>16409</td>\n",
" <td>11075</td>\n",
" <td>16022</td>\n",
" <td>8784</td>\n",
" <td>14030</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-24</th>\n",
" <td>445407</td>\n",
" <td>43549</td>\n",
" <td>421743</td>\n",
" <td>29457</td>\n",
" <td>40670</td>\n",
" <td>16575</td>\n",
" <td>25293</td>\n",
" <td>16002</td>\n",
" <td>22173</td>\n",
" <td>15764</td>\n",
" <td>18306</td>\n",
" <td>16410</td>\n",
" <td>15173</td>\n",
" <td>16507</td>\n",
" <td>12788</td>\n",
" <td>15858</td>\n",
" <td>8659</td>\n",
" <td>12483</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-25</th>\n",
" <td>436623</td>\n",
" <td>42308</td>\n",
" <td>422695</td>\n",
" <td>30276</td>\n",
" <td>34793</td>\n",
" <td>17136</td>\n",
" <td>22060</td>\n",
" <td>16013</td>\n",
" <td>17592</td>\n",
" <td>15730</td>\n",
" <td>13664</td>\n",
" <td>15978</td>\n",
" <td>11224</td>\n",
" <td>15865</td>\n",
" <td>9709</td>\n",
" <td>15014</td>\n",
" <td>5577</td>\n",
" <td>11089</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-26</th>\n",
" <td>404136</td>\n",
" <td>38394</td>\n",
" <td>393355</td>\n",
" <td>27451</td>\n",
" <td>27355</td>\n",
" <td>15458</td>\n",
" <td>19069</td>\n",
" <td>14560</td>\n",
" <td>14748</td>\n",
" <td>14174</td>\n",
" <td>11445</td>\n",
" <td>14524</td>\n",
" <td>9365</td>\n",
" <td>14251</td>\n",
" <td>8353</td>\n",
" <td>13761</td>\n",
" <td>3577</td>\n",
" <td>8443</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-27</th>\n",
" <td>298538</td>\n",
" <td>27666</td>\n",
" <td>293382</td>\n",
" <td>20974</td>\n",
" <td>17976</td>\n",
" <td>11497</td>\n",
" <td>12014</td>\n",
" <td>10212</td>\n",
" <td>8824</td>\n",
" <td>10116</td>\n",
" <td>6689</td>\n",
" <td>10172</td>\n",
" <td>5388</td>\n",
" <td>9674</td>\n",
" <td>4824</td>\n",
" <td>9221</td>\n",
" <td>1350</td>\n",
" <td>4446</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2016-08-28</th>\n",
" <td>279793</td>\n",
" <td>25539</td>\n",
" <td>275460</td>\n",
" <td>19914</td>\n",
" <td>16178</td>\n",
" <td>10854</td>\n",
" <td>10786</td>\n",
" <td>9626</td>\n",
" <td>7943</td>\n",
" <td>9284</td>\n",
" <td>5724</td>\n",
" <td>9252</td>\n",
" <td>4695</td>\n",
" <td>8653</td>\n",
" <td>4340</td>\n",
" <td>8175</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Count \\\n",
"Week 0 1 2 \n",
"Cohort control variation control variation control variation \n",
"Profile_Creation_Date \n",
"2016-08-14 269373 8656 260949 2836 35682 1790 \n",
"2016-08-15 389692 14357 374949 3377 50845 2735 \n",
"2016-08-16 428364 16202 411311 3946 55126 3411 \n",
"2016-08-17 435088 16006 417002 4135 53099 3509 \n",
"2016-08-18 424235 22234 407629 10439 49677 6816 \n",
"2016-08-19 395311 33669 382059 22889 45753 13003 \n",
"2016-08-20 307812 25228 301287 18886 33312 10226 \n",
"2016-08-21 285779 23270 280516 17882 29898 9571 \n",
"2016-08-22 434761 41155 419578 27730 47248 16378 \n",
"2016-08-23 438756 42177 424462 29198 43786 16666 \n",
"2016-08-24 445407 43549 421743 29457 40670 16575 \n",
"2016-08-25 436623 42308 422695 30276 34793 17136 \n",
"2016-08-26 404136 38394 393355 27451 27355 15458 \n",
"2016-08-27 298538 27666 293382 20974 17976 11497 \n",
"2016-08-28 279793 25539 275460 19914 16178 10854 \n",
"\n",
" \\\n",
"Week 3 4 5 \n",
"Cohort control variation control variation control variation \n",
"Profile_Creation_Date \n",
"2016-08-14 21276 1939 12492 2290 10013 2501 \n",
"2016-08-15 32137 3290 19276 4112 16465 4558 \n",
"2016-08-16 33160 4122 21267 4932 18464 5380 \n",
"2016-08-17 30914 4249 21677 4817 18733 5325 \n",
"2016-08-18 27252 7081 20381 7471 16859 7703 \n",
"2016-08-19 23552 12234 18709 12161 15306 12081 \n",
"2016-08-20 16185 9281 12376 8877 9908 8721 \n",
"2016-08-21 14618 8747 11034 8119 8566 8158 \n",
"2016-08-22 24681 15433 20613 15151 16832 15654 \n",
"2016-08-23 25100 16000 20910 15542 16433 16171 \n",
"2016-08-24 25293 16002 22173 15764 18306 16410 \n",
"2016-08-25 22060 16013 17592 15730 13664 15978 \n",
"2016-08-26 19069 14560 14748 14174 11445 14524 \n",
"2016-08-27 12014 10212 8824 10116 6689 10172 \n",
"2016-08-28 10786 9626 7943 9284 5724 9252 \n",
"\n",
" \\\n",
"Week 6 7 8 \n",
"Cohort control variation control variation control variation \n",
"Profile_Creation_Date \n",
"2016-08-14 7754 2844 6018 3376 5085 3372 \n",
"2016-08-15 13417 5224 10903 5939 9077 6211 \n",
"2016-08-16 14707 6164 12203 7001 10247 7131 \n",
"2016-08-17 14779 6181 12328 6770 10716 6925 \n",
"2016-08-18 13158 8422 11118 8944 9595 8750 \n",
"2016-08-19 11877 12536 9883 12356 8526 11857 \n",
"2016-08-20 7048 8853 6038 8526 5262 7951 \n",
"2016-08-21 6480 8185 5345 7796 4898 7334 \n",
"2016-08-22 13139 15991 11047 15460 9592 14611 \n",
"2016-08-23 13053 16409 11075 16022 8784 14030 \n",
"2016-08-24 15173 16507 12788 15858 8659 12483 \n",
"2016-08-25 11224 15865 9709 15014 5577 11089 \n",
"2016-08-26 9365 14251 8353 13761 3577 8443 \n",
"2016-08-27 5388 9674 4824 9221 1350 4446 \n",
"2016-08-28 4695 8653 4340 8175 0 0 \n",
"\n",
" \n",
"Week 9 \n",
"Cohort control variation \n",
"Profile_Creation_Date \n",
"2016-08-14 4707 3327 \n",
"2016-08-15 7771 6134 \n",
"2016-08-16 8008 6460 \n",
"2016-08-17 7238 5752 \n",
"2016-08-18 5422 6532 \n",
"2016-08-19 3600 7303 \n",
"2016-08-20 1520 3837 \n",
"2016-08-21 0 0 \n",
"2016-08-22 0 0 \n",
"2016-08-23 0 0 \n",
"2016-08-24 0 0 \n",
"2016-08-25 0 0 \n",
"2016-08-26 0 0 \n",
"2016-08-27 0 0 \n",
"2016-08-28 0 0 "
]
},
"execution_count": 345,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pivoted"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Export table to spreadsheet if necessary"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment