Skip to content

Instantly share code, notes, and snippets.

@bnaul
Created November 22, 2017 00:59
Show Gist options
  • Save bnaul/e00ec418fbc3da6366008e0deb748eb6 to your computer and use it in GitHub Desktop.
Save bnaul/e00ec418fbc3da6366008e0deb748eb6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from personatrainer.scripts.generate_personas import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4305165 0.csv\r\n"
]
}
],
"source": [
"!wc -l 0.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 21min 50s, sys: 1.17 s, total: 21min 51s\n",
"Wall time: 21min 51s\n"
]
}
],
"source": [
"%%time\n",
"# Baseline load: no explicit format is given, so pandas has to work out the\n",
"# timestamp layout of both time columns on its own.\n",
"df = pd.read_csv(\n",
"    '0.csv',\n",
"    parse_dates=['enter_time', 'exit_time'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>uid</th>\n",
" <th>enter_time</th>\n",
" <th>exit_time</th>\n",
" <th>lat</th>\n",
" <th>lng</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7e4ec7c997fd3836ed6845ef145cfafe03e14455925a0a...</td>\n",
" <td>2017-08-24 12:48:21.000000</td>\n",
" <td>2017-08-24 12:53:49.000000</td>\n",
" <td>51.470015</td>\n",
" <td>-0.779697</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1a3ae5604c2d26b734f034a8956df50799ebf02d69de53...</td>\n",
" <td>2017-08-31 15:20:39.463415</td>\n",
" <td>2017-08-31 15:59:47.000000</td>\n",
" <td>32.115011</td>\n",
" <td>34.791570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e6f...</td>\n",
" <td>2017-09-11 15:12:25.000000</td>\n",
" <td>2017-09-11 18:19:26.481172</td>\n",
" <td>43.450645</td>\n",
" <td>-80.498186</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6d8480d66cad0c5c0bf52799084dd29752ce43a1663d8c...</td>\n",
" <td>2017-09-03 12:58:00.000000</td>\n",
" <td>2017-09-03 13:04:02.000000</td>\n",
" <td>44.053652</td>\n",
" <td>-78.674988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>c436a84188140154b4f2a127a7324b12f3bf5484989a17...</td>\n",
" <td>2017-09-25 02:08:49.000000</td>\n",
" <td>2017-09-25 02:13:53.000000</td>\n",
" <td>44.005362</td>\n",
" <td>-80.000066</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" uid \\\n",
"0 7e4ec7c997fd3836ed6845ef145cfafe03e14455925a0a... \n",
"1 1a3ae5604c2d26b734f034a8956df50799ebf02d69de53... \n",
"2 3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e6f... \n",
"3 6d8480d66cad0c5c0bf52799084dd29752ce43a1663d8c... \n",
"4 c436a84188140154b4f2a127a7324b12f3bf5484989a17... \n",
"\n",
" enter_time exit_time lat lng \n",
"0 2017-08-24 12:48:21.000000 2017-08-24 12:53:49.000000 51.470015 -0.779697 \n",
"1 2017-08-31 15:20:39.463415 2017-08-31 15:59:47.000000 32.115011 34.791570 \n",
"2 2017-09-11 15:12:25.000000 2017-09-11 18:19:26.481172 43.450645 -80.498186 \n",
"3 2017-09-03 12:58:00.000000 2017-09-03 13:04:02.000000 44.053652 -78.674988 \n",
"4 2017-09-25 02:08:49.000000 2017-09-25 02:13:53.000000 44.005362 -80.000066 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Reformat w/ consistent formats.\n",
"# Pin the timestamp layout explicitly: by default pandas picks the precision it\n",
"# writes from the data, so the on-disk format would otherwise be implicit.\n",
"df.to_csv('reformatted.csv', index=False, date_format='%Y-%m-%d %H:%M:%S.%f')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 14.1 s, sys: 1.49 s, total: 15.6 s\n",
"Wall time: 15.6 s\n"
]
}
],
"source": [
"%%time\n",
"# The timestamps in this data look like '2017-08-24 12:48:21.000000', so the\n",
"# explicit format must use '-' date separators and include fractional seconds.\n",
"# A format that does not match the data is rejected by strict parsers.\n",
"df = pd.read_csv('reformatted.csv', parse_dates=[1, 2],\n",
"                 date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S.%f'))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Reformat datetime -> int\n",
"df['enter_time'] = df['enter_time'].astype(int)\n",
"df['exit_time'] = df['exit_time'].astype(int)\n",
"df.to_csv('ints.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.13 s, sys: 644 ms, total: 8.78 s\n",
"Wall time: 8.78 s\n"
]
}
],
"source": [
"%%time\n",
"# w/o casting back to datetimes\n",
"df = pd.read_csv('ints.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.14 s, sys: 624 ms, total: 8.77 s\n",
"Wall time: 8.77 s\n"
]
}
],
"source": [
"%%time\n",
"# Load the integer timestamps, then convert both time columns back to\n",
"# datetime64[ns] in a single astype call.\n",
"df = pd.read_csv('ints.csv').astype({\n",
"    'enter_time': '<M8[ns]',\n",
"    'exit_time': '<M8[ns]',\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# serialize as np.recarray\n",
"arr = df.to_records(index=False)\n",
"arr = arr.astype([('uid', 'S64'), ('enter_time', '<M8[ns]'), ('exit_time', '<M8[ns]'), ('lat', '<f8'), ('lng', '<f8')])\n",
"np.save('df.npy', arr, allow_pickle=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 304 ms, sys: 468 ms, total: 772 ms\n",
"Wall time: 770 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>uid</th>\n",
" <th>enter_time</th>\n",
" <th>exit_time</th>\n",
" <th>lat</th>\n",
" <th>lng</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>b'7e4ec7c997fd3836ed6845ef145cfafe03e14455925a...</td>\n",
" <td>2017-08-24 12:48:21.000000</td>\n",
" <td>2017-08-24 12:53:49.000000</td>\n",
" <td>51.470015</td>\n",
" <td>-0.779697</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>b'1a3ae5604c2d26b734f034a8956df50799ebf02d69de...</td>\n",
" <td>2017-08-31 15:20:39.463415</td>\n",
" <td>2017-08-31 15:59:47.000000</td>\n",
" <td>32.115011</td>\n",
" <td>34.791570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>b'3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e...</td>\n",
" <td>2017-09-11 15:12:25.000000</td>\n",
" <td>2017-09-11 18:19:26.481172</td>\n",
" <td>43.450645</td>\n",
" <td>-80.498186</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>b'6d8480d66cad0c5c0bf52799084dd29752ce43a1663d...</td>\n",
" <td>2017-09-03 12:58:00.000000</td>\n",
" <td>2017-09-03 13:04:02.000000</td>\n",
" <td>44.053652</td>\n",
" <td>-78.674988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>b'c436a84188140154b4f2a127a7324b12f3bf5484989a...</td>\n",
" <td>2017-09-25 02:08:49.000000</td>\n",
" <td>2017-09-25 02:13:53.000000</td>\n",
" <td>44.005362</td>\n",
" <td>-80.000066</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" uid \\\n",
"0 b'7e4ec7c997fd3836ed6845ef145cfafe03e14455925a... \n",
"1 b'1a3ae5604c2d26b734f034a8956df50799ebf02d69de... \n",
"2 b'3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e... \n",
"3 b'6d8480d66cad0c5c0bf52799084dd29752ce43a1663d... \n",
"4 b'c436a84188140154b4f2a127a7324b12f3bf5484989a... \n",
"\n",
" enter_time exit_time lat lng \n",
"0 2017-08-24 12:48:21.000000 2017-08-24 12:53:49.000000 51.470015 -0.779697 \n",
"1 2017-08-31 15:20:39.463415 2017-08-31 15:59:47.000000 32.115011 34.791570 \n",
"2 2017-09-11 15:12:25.000000 2017-09-11 18:19:26.481172 43.450645 -80.498186 \n",
"3 2017-09-03 12:58:00.000000 2017-09-03 13:04:02.000000 44.053652 -78.674988 \n",
"4 2017-09-25 02:08:49.000000 2017-09-25 02:13:53.000000 44.005362 -80.000066 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Refuse pickled payloads on load, mirroring allow_pickle=False used when saving.\n",
"# Note: uid comes back as bytes (b'...') because it was stored with dtype 'S64'.\n",
"pd.DataFrame.from_records(np.load('df.npy', allow_pickle=False)).head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment