Skip to content

Instantly share code, notes, and snippets.

@louisguitton
Created May 24, 2020 10:44
Show Gist options
  • Save louisguitton/cfe2218d6109b45915dcb87580322882 to your computer and use it in GitHub Desktop.
Save louisguitton/cfe2218d6109b45915dcb87580322882 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"with open('api.questionstat - 2020-04-09.json') as fh:\n",
" data = json.loads(fh.read())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>model</th>\n",
" <th>pk</th>\n",
" <th>fields.question</th>\n",
" <th>fields.answer_choice</th>\n",
" <th>fields.created</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>api.questionstat</td>\n",
" <td>1730</td>\n",
" <td>62</td>\n",
" <td>b</td>\n",
" <td>2020-04-07 18:26:31.137000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>api.questionstat</td>\n",
" <td>1731</td>\n",
" <td>44</td>\n",
" <td>c</td>\n",
" <td>2020-04-07 18:37:05.718000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>api.questionstat</td>\n",
" <td>1732</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>2020-04-07 18:37:18.488000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>api.questionstat</td>\n",
" <td>1733</td>\n",
" <td>27</td>\n",
" <td>d</td>\n",
" <td>2020-04-07 18:38:00.057000+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>api.questionstat</td>\n",
" <td>1734</td>\n",
" <td>9</td>\n",
" <td>b</td>\n",
" <td>2020-04-07 18:50:10.325000+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" model pk fields.question fields.answer_choice \\\n",
"0 api.questionstat 1730 62 b \n",
"1 api.questionstat 1731 44 c \n",
"2 api.questionstat 1732 4 d \n",
"3 api.questionstat 1733 27 d \n",
"4 api.questionstat 1734 9 b \n",
"\n",
" fields.created \n",
"0 2020-04-07 18:26:31.137000+00:00 \n",
"1 2020-04-07 18:37:05.718000+00:00 \n",
"2 2020-04-07 18:37:18.488000+00:00 \n",
"3 2020-04-07 18:38:00.057000+00:00 \n",
"4 2020-04-07 18:50:10.325000+00:00 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_events__answers = pd.json_normalize(data)\n",
"raw_events__answers[\"fields.created\"] = raw_events__answers[\"fields.created\"].apply(pd.to_datetime)\n",
"raw_events__answers.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"engine = create_engine('sqlite:///raph.db')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"raw_events__answers.to_sql(name=\"raw_events__answers\", con=engine)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Avoir des stats journalière (1 row par jour, JSONField). metrics: nombre de réponses totales, par heure, par page, etc\n",
"- Avoir des stats par question (1 row par question, JSONField). metrics: nombre de réponses totales, par choix de réponse\n",
"- Futur: Avoir des stats par quizz, like/dislike, ..."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>answer_id</th>\n",
" <th>question_id</th>\n",
" <th>answer_choice</th>\n",
" <th>created_at</th>\n",
" <th>date_at</th>\n",
" <th>hour_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1730</td>\n",
" <td>62</td>\n",
" <td>b</td>\n",
" <td>2020-04-07 18:26:31.137000+00:00</td>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1731</td>\n",
" <td>44</td>\n",
" <td>c</td>\n",
" <td>2020-04-07 18:37:05.718000+00:00</td>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1732</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>2020-04-07 18:37:18.488000+00:00</td>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1733</td>\n",
" <td>27</td>\n",
" <td>d</td>\n",
" <td>2020-04-07 18:38:00.057000+00:00</td>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1734</td>\n",
" <td>9</td>\n",
" <td>b</td>\n",
" <td>2020-04-07 18:50:10.325000+00:00</td>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" answer_id question_id answer_choice created_at \\\n",
"0 1730 62 b 2020-04-07 18:26:31.137000+00:00 \n",
"1 1731 44 c 2020-04-07 18:37:05.718000+00:00 \n",
"2 1732 4 d 2020-04-07 18:37:18.488000+00:00 \n",
"3 1733 27 d 2020-04-07 18:38:00.057000+00:00 \n",
"4 1734 9 b 2020-04-07 18:50:10.325000+00:00 \n",
"\n",
" date_at hour_at \n",
"0 2020-04-07 18 \n",
"1 2020-04-07 18 \n",
"2 2020-04-07 18 \n",
"3 2020-04-07 18 \n",
"4 2020-04-07 18 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stg_events__answers = raw_events__answers.drop(columns=\"model\").rename(columns={\n",
" \"pk\": \"answer_id\", \n",
" \"fields.question\": \"question_id\", \n",
" \"fields.answer_choice\": \"answer_choice\", \n",
" \"fields.created\": \"created_at\"\n",
"})\n",
"stg_events__answers[\"date_at\"] = stg_events__answers.created_at.dt.date\n",
"stg_events__answers[\"hour_at\"] = stg_events__answers.created_at.dt.hour\n",
"stg_events__answers.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date_at</th>\n",
" <th>hour_at</th>\n",
" <th>question_id</th>\n",
" <th>answer_choice</th>\n",
" <th>answers_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" <td>4</td>\n",
" <td>d</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" <td>9</td>\n",
" <td>b</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" <td>19</td>\n",
" <td>c</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" <td>27</td>\n",
" <td>d</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2020-04-07</td>\n",
" <td>18</td>\n",
" <td>29</td>\n",
" <td>c</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date_at hour_at question_id answer_choice answers_count\n",
"0 2020-04-07 18 4 d 1\n",
"1 2020-04-07 18 9 b 1\n",
"2 2020-04-07 18 19 c 1\n",
"3 2020-04-07 18 27 d 1\n",
"4 2020-04-07 18 29 c 1"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fct_answers = stg_events__answers.groupby([\n",
" stg_events__answers.date_at,\n",
" stg_events__answers.hour_at,\n",
" stg_events__answers.question_id,\n",
" stg_events__answers.answer_choice\n",
"]).answer_id.agg(answers_count=\"count\").reset_index()\n",
"fct_answers.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date_at</th>\n",
" <th>answers_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-04-07</td>\n",
" <td>31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2020-04-08</td>\n",
" <td>36</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date_at answers_count\n",
"0 2020-04-07 31\n",
"1 2020-04-08 36"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"viz_day_stats = fct_answers.groupby(\"date_at\").answers_count.sum().reset_index()\n",
"viz_day_stats.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>answer_choice</th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" </tr>\n",
" <tr>\n",
" <th>question_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"answer_choice a b c d\n",
"question_id \n",
"1 NaN NaN 1.0 1.0\n",
"4 NaN NaN NaN 2.0\n",
"8 NaN NaN NaN 1.0\n",
"9 NaN 1.0 NaN NaN\n",
"10 NaN 1.0 NaN NaN"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"viz_question_stats = fct_answers.groupby([\"question_id\", \"answer_choice\"]).answers_count.sum().unstack()\n",
"viz_question_stats.head()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"stg_events__answers.to_sql(name=\"stg_events__answers\", con=engine)\n",
"fct_answers.to_sql(name=\"fct_answers\", con=engine)\n",
"viz_day_stats.to_sql(name=\"viz_day_stats\", con=engine)\n",
"viz_question_stats.to_sql(name=\"viz_question_stats\", con=engine)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment