riiid-acp-pub/01_pre-read.ipynb
{
"cells": [
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "import gc, joblib, numpy as np, pandas as pd, pickle, enum\n\nfrom collections import defaultdict\nfrom concurrent.futures import ProcessPoolExecutor\nfrom pathlib import Path\nfrom scipy.sparse import coo_matrix, dok_matrix, lil_matrix, csr_matrix, bsr_matrix",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "in_d = Path('input')",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"heading_collapsed": true
},
"cell_type": "markdown",
"source": "# Questions"
},
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "We often want to refer to continuous vs categorical vars, so set them now:"
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "cats = ['bundle_id', 'correct_answer', 'part']\nconts = ['question_id']",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "By default Pandas can use unnecessarily large data types:"
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "questions_df = pd.read_csv(in_d / 'questions.csv', usecols=conts+cats+['tags'])\nquestions_df.info()",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 13523 entries, 0 to 13522\nData columns (total 5 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 question_id 13523 non-null int64 \n 1 bundle_id 13523 non-null int64 \n 2 correct_answer 13523 non-null int64 \n 3 part 13523 non-null int64 \n 4 tags 13522 non-null object\ndtypes: int64(4), object(1)\nmemory usage: 528.4+ KB\n"
}
]
},
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "fastai's tabular module has lots of functionality for dealing with tabular models and data..."
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "from fastai.tabular.all import *",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "...including `df_shrink_dtypes`, which tells you what is the most efficient dtype for each column: " | |
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "dtypes = df_shrink_dtypes(questions_df, obj2cat=False)\ndtypes",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "{'question_id': dtype('int16'),\n 'bundle_id': dtype('int16'),\n 'correct_answer': dtype('int8'),\n 'part': dtype('int8')}"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "If we load the data with these types, we will have a much more efficient data frame:"
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "questions_df = pd.read_csv(in_d / 'questions.csv', usecols=conts+cats+['tags'], dtype=dtypes)\nquestions_df.info()",
"execution_count": 70,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 13523 entries, 0 to 13522\nData columns (total 5 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 question_id 13523 non-null int16 \n 1 bundle_id 13523 non-null int16 \n 2 correct_answer 13523 non-null int8 \n 3 part 13523 non-null int8 \n 4 tags 13522 non-null object\ndtypes: int16(2), int8(2), object(1)\nmemory usage: 185.0+ KB\n"
}
]
},
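{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "Each question's `tags` field is a space-separated string of up to six tag ids. The next cell splits it into six `int16` columns (`tag_0` to `tag_5`), using `-1` for missing tags, and drops the original string column:"
},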
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "tags = [f'tag_{_}' for _ in range(6)]\nquestions_df[tags] = questions_df.tags.str.split(expand=True).fillna('-1').astype('int16')\nquestions_df.drop('tags', axis=1, inplace=True)",
"execution_count": 71,
"outputs": []
},
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "fastai automatically creates categorical variables that include a separate category for _missing_, which is important for both training and inference. The original solution did this manually, but we don't have to:"
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "questions_t = TabularPandas(questions_df, Categorify, cats+tags, conts)",
"execution_count": 72,
"outputs": []
},
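{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "As a quick check (a sketch using the same `procs.categorify.classes` accessor that appears later in this notebook), we can look at the classes for one column; the first class should be the `#na#` placeholder that `Categorify` adds for missing values:"
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "# Inspect the category mapping for 'part'; class 0 is expected to be '#na#'\nquestions_t.procs.categorify.classes['part']",
"execution_count": null,
"outputs": []
},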
{
"metadata": {
"hidden": true
},
"cell_type": "markdown",
"source": "We can now create the required dict:"
},
{
"metadata": {
"trusted": false,
"hidden": true
},
"cell_type": "code",
"source": "qc_d = {}\nfor i in range(len(questions_t)):\n row = questions_t.iloc[i]\n arr = row[['question_id', 'bundle_id', 'correct_answer', 'part'] + tags]\n qc_d[row['question_id']] = np.array(row, dtype=np.int16)", | |
"execution_count": 75, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false, | |
"hidden": true | |
}, | |
"cell_type": "code", | |
"source": "qc_d[0]", | |
"execution_count": 76, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "array([ 0, 1, 1, 1, 37, 60, 49, 7, 1, 1], dtype=int16)" | |
}, | |
"execution_count": 76, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{
"metadata": {},
"cell_type": "markdown",
"source": "# Lectures"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "We set _cats_ and _conts_ for _Lectures_ too, and then repeat the process..."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "cats = ['part', 'type_of']\nconts = ['lecture_id']",
"execution_count": 77,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "...except we can save a step by calling `df_shrink` directly to update the dtypes for us in one step:" | |
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "lectures_df = df_shrink(pd.read_csv(in_d / 'lectures.csv', usecols=conts+cats+['tag']), obj2cat=False)",
"execution_count": 78,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "lectures_df.rename({'tag':'tag_0'}, axis=1, inplace=True)\n\nfor i in range(1, 6): lectures_df[f'tag_{i}'] = pd.Series(0, index=lectures_df.index, dtype='uint8')\nassert lectures_df.isna().sum().sum() == 0",
"execution_count": 79,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "lectures_t = TabularPandas(lectures_df, Categorify, cats+tags, conts)",
"execution_count": 80,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "fastai stores a `CategoryMap` for each categorical column with the forward and reverse mappings:"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "qc = questions_t.procs.categorify.classes\nlc = lectures_t.procs.categorify.classes\nassert all(zip(lc['part'].items, qc['part'])) # all parts show up on both dfs", | |
"execution_count": 15, | |
"outputs": [] | |
}, | |
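{
"metadata": {},
"cell_type": "markdown",
"source": "For example (a minimal sketch, assuming `CategoryMap` exposes an `o2i` dict for the forward mapping and an `items` list for the reverse), we can inspect both directions for the `part` column:"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Reverse mapping: encoded index -> raw value (the class list itself)\nprint(qc['part'].items)\n# Forward mapping: raw value -> encoded index (assumes CategoryMap exposes o2i)\nprint(qc['part'].o2i)",
"execution_count": null,
"outputs": []
},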
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "lc_d = {}\nfor i in range(len(lectures_t)):\n row = lectures_t.iloc[i]\n arr = row[['lecture_id', 'part', 'type_of'] + tags]\n qc_d[row['lecture_id']] = np.array(row ,dtype=np.int16)", | |
"execution_count": 17, | |
"outputs": [] | |
}, | |
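{
"metadata": {},
"cell_type": "markdown",
"source": "Finally, the next cell builds an `IntEnum` over the lecture columns (starting at 0), giving each column a named integer position so that later code can index the lecture arrays by name rather than by bare position:"
},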
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "LCols = enum.IntEnum('LCols', lectures_df.columns.to_list(), start=0)",
"execution_count": 16,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3 (ipykernel)",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.9.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"toc": {
"base_numbering": 1,
"nav_menu": {
"height": "292px",
"width": "228px"
},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "283px"
},
"toc_section_display": true,
"toc_window_display": true
},
"gist": {
"id": "",
"data": {
"description": "riiid-acp-pub/01_pre-read.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}