Created
May 24, 2020 10:44
-
-
Save louisguitton/cfe2218d6109b45915dcb87580322882 to your computer and use it in GitHub Desktop.
data models for https://github.com/raphodn/know-your-planet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import pandas as pd\n", | |
"from sqlalchemy import create_engine" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open('api.questionstat - 2020-04-09.json') as fh:\n", | |
" data = json.loads(fh.read())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>model</th>\n", | |
" <th>pk</th>\n", | |
" <th>fields.question</th>\n", | |
" <th>fields.answer_choice</th>\n", | |
" <th>fields.created</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>api.questionstat</td>\n", | |
" <td>1730</td>\n", | |
" <td>62</td>\n", | |
" <td>b</td>\n", | |
" <td>2020-04-07 18:26:31.137000+00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>api.questionstat</td>\n", | |
" <td>1731</td>\n", | |
" <td>44</td>\n", | |
" <td>c</td>\n", | |
" <td>2020-04-07 18:37:05.718000+00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>api.questionstat</td>\n", | |
" <td>1732</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" <td>2020-04-07 18:37:18.488000+00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>api.questionstat</td>\n", | |
" <td>1733</td>\n", | |
" <td>27</td>\n", | |
" <td>d</td>\n", | |
" <td>2020-04-07 18:38:00.057000+00:00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>api.questionstat</td>\n", | |
" <td>1734</td>\n", | |
" <td>9</td>\n", | |
" <td>b</td>\n", | |
" <td>2020-04-07 18:50:10.325000+00:00</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" model pk fields.question fields.answer_choice \\\n", | |
"0 api.questionstat 1730 62 b \n", | |
"1 api.questionstat 1731 44 c \n", | |
"2 api.questionstat 1732 4 d \n", | |
"3 api.questionstat 1733 27 d \n", | |
"4 api.questionstat 1734 9 b \n", | |
"\n", | |
" fields.created \n", | |
"0 2020-04-07 18:26:31.137000+00:00 \n", | |
"1 2020-04-07 18:37:05.718000+00:00 \n", | |
"2 2020-04-07 18:37:18.488000+00:00 \n", | |
"3 2020-04-07 18:38:00.057000+00:00 \n", | |
"4 2020-04-07 18:50:10.325000+00:00 " | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"raw_events__answers = pd.json_normalize(data)\n", | |
"raw_events__answers[\"fields.created\"] = raw_events__answers[\"fields.created\"].apply(pd.to_datetime)\n", | |
"raw_events__answers.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"engine = create_engine('sqlite:///raph.db')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"raw_events__answers.to_sql(name=\"raw_events__answers\", con=engine)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Avoir des stats journalière (1 row par jour, JSONField). metrics: nombre de réponses totales, par heure, par page, etc\n", | |
"- Avoir des stats par question (1 row par question, JSONField). metrics: nombre de réponses totales, par choix de réponse\n", | |
"- Futur: Avoir des stats par quizz, like/dislike, ..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>answer_id</th>\n", | |
" <th>question_id</th>\n", | |
" <th>answer_choice</th>\n", | |
" <th>created_at</th>\n", | |
" <th>date_at</th>\n", | |
" <th>hour_at</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1730</td>\n", | |
" <td>62</td>\n", | |
" <td>b</td>\n", | |
" <td>2020-04-07 18:26:31.137000+00:00</td>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1731</td>\n", | |
" <td>44</td>\n", | |
" <td>c</td>\n", | |
" <td>2020-04-07 18:37:05.718000+00:00</td>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1732</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" <td>2020-04-07 18:37:18.488000+00:00</td>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1733</td>\n", | |
" <td>27</td>\n", | |
" <td>d</td>\n", | |
" <td>2020-04-07 18:38:00.057000+00:00</td>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1734</td>\n", | |
" <td>9</td>\n", | |
" <td>b</td>\n", | |
" <td>2020-04-07 18:50:10.325000+00:00</td>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" answer_id question_id answer_choice created_at \\\n", | |
"0 1730 62 b 2020-04-07 18:26:31.137000+00:00 \n", | |
"1 1731 44 c 2020-04-07 18:37:05.718000+00:00 \n", | |
"2 1732 4 d 2020-04-07 18:37:18.488000+00:00 \n", | |
"3 1733 27 d 2020-04-07 18:38:00.057000+00:00 \n", | |
"4 1734 9 b 2020-04-07 18:50:10.325000+00:00 \n", | |
"\n", | |
" date_at hour_at \n", | |
"0 2020-04-07 18 \n", | |
"1 2020-04-07 18 \n", | |
"2 2020-04-07 18 \n", | |
"3 2020-04-07 18 \n", | |
"4 2020-04-07 18 " | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stg_events__answers = raw_events__answers.drop(columns=\"model\").rename(columns={\n", | |
" \"pk\": \"answer_id\", \n", | |
" \"fields.question\": \"question_id\", \n", | |
" \"fields.answer_choice\": \"answer_choice\", \n", | |
" \"fields.created\": \"created_at\"\n", | |
"})\n", | |
"stg_events__answers[\"date_at\"] = stg_events__answers.created_at.dt.date\n", | |
"stg_events__answers[\"hour_at\"] = stg_events__answers.created_at.dt.hour\n", | |
"stg_events__answers.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date_at</th>\n", | |
" <th>hour_at</th>\n", | |
" <th>question_id</th>\n", | |
" <th>answer_choice</th>\n", | |
" <th>answers_count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" <td>9</td>\n", | |
" <td>b</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" <td>19</td>\n", | |
" <td>c</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" <td>27</td>\n", | |
" <td>d</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>18</td>\n", | |
" <td>29</td>\n", | |
" <td>c</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date_at hour_at question_id answer_choice answers_count\n", | |
"0 2020-04-07 18 4 d 1\n", | |
"1 2020-04-07 18 9 b 1\n", | |
"2 2020-04-07 18 19 c 1\n", | |
"3 2020-04-07 18 27 d 1\n", | |
"4 2020-04-07 18 29 c 1" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"fct_answers = stg_events__answers.groupby([\n", | |
" stg_events__answers.date_at,\n", | |
" stg_events__answers.hour_at,\n", | |
" stg_events__answers.question_id,\n", | |
" stg_events__answers.answer_choice\n", | |
"]).answer_id.agg(answers_count=\"count\").reset_index()\n", | |
"fct_answers.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date_at</th>\n", | |
" <th>answers_count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2020-04-07</td>\n", | |
" <td>31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2020-04-08</td>\n", | |
" <td>36</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date_at answers_count\n", | |
"0 2020-04-07 31\n", | |
"1 2020-04-08 36" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"viz_day_stats = fct_answers.groupby(\"date_at\").answers_count.sum().reset_index()\n", | |
"viz_day_stats.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>answer_choice</th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" <th>d</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>question_id</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.0</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>NaN</td>\n", | |
" <td>1.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>NaN</td>\n", | |
" <td>1.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"answer_choice a b c d\n", | |
"question_id \n", | |
"1 NaN NaN 1.0 1.0\n", | |
"4 NaN NaN NaN 2.0\n", | |
"8 NaN NaN NaN 1.0\n", | |
"9 NaN 1.0 NaN NaN\n", | |
"10 NaN 1.0 NaN NaN" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"viz_question_stats = fct_answers.groupby([\"question_id\", \"answer_choice\"]).answers_count.sum().unstack()\n", | |
"viz_question_stats.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"stg_events__answers.to_sql(name=\"stg_events__answers\", con=engine)\n", | |
"fct_answers.to_sql(name=\"fct_answers\", con=engine)\n", | |
"viz_day_stats.to_sql(name=\"viz_day_stats\", con=engine)\n", | |
"viz_question_stats.to_sql(name=\"viz_question_stats\", con=engine)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment