Created
November 22, 2017 00:59
-
-
Save bnaul/e00ec418fbc3da6366008e0deb748eb6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "from personatrainer.scripts.generate_personas import *" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "4305165 0.csv\r\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Count the lines in the raw CSV to get a sense of its size.\n", | |
| "!wc -l 0.csv" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 21min 50s, sys: 1.17 s, total: 21min 51s\n", | |
| "Wall time: 21min 51s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "df = pd.read_csv('0.csv', parse_dates=[1, 2])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>uid</th>\n", | |
| " <th>enter_time</th>\n", | |
| " <th>exit_time</th>\n", | |
| " <th>lat</th>\n", | |
| " <th>lng</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>7e4ec7c997fd3836ed6845ef145cfafe03e14455925a0a...</td>\n", | |
| " <td>2017-08-24 12:48:21.000000</td>\n", | |
| " <td>2017-08-24 12:53:49.000000</td>\n", | |
| " <td>51.470015</td>\n", | |
| " <td>-0.779697</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1a3ae5604c2d26b734f034a8956df50799ebf02d69de53...</td>\n", | |
| " <td>2017-08-31 15:20:39.463415</td>\n", | |
| " <td>2017-08-31 15:59:47.000000</td>\n", | |
| " <td>32.115011</td>\n", | |
| " <td>34.791570</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e6f...</td>\n", | |
| " <td>2017-09-11 15:12:25.000000</td>\n", | |
| " <td>2017-09-11 18:19:26.481172</td>\n", | |
| " <td>43.450645</td>\n", | |
| " <td>-80.498186</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>6d8480d66cad0c5c0bf52799084dd29752ce43a1663d8c...</td>\n", | |
| " <td>2017-09-03 12:58:00.000000</td>\n", | |
| " <td>2017-09-03 13:04:02.000000</td>\n", | |
| " <td>44.053652</td>\n", | |
| " <td>-78.674988</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>c436a84188140154b4f2a127a7324b12f3bf5484989a17...</td>\n", | |
| " <td>2017-09-25 02:08:49.000000</td>\n", | |
| " <td>2017-09-25 02:13:53.000000</td>\n", | |
| " <td>44.005362</td>\n", | |
| " <td>-80.000066</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " uid \\\n", | |
| "0 7e4ec7c997fd3836ed6845ef145cfafe03e14455925a0a... \n", | |
| "1 1a3ae5604c2d26b734f034a8956df50799ebf02d69de53... \n", | |
| "2 3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e6f... \n", | |
| "3 6d8480d66cad0c5c0bf52799084dd29752ce43a1663d8c... \n", | |
| "4 c436a84188140154b4f2a127a7324b12f3bf5484989a17... \n", | |
| "\n", | |
| " enter_time exit_time lat lng \n", | |
| "0 2017-08-24 12:48:21.000000 2017-08-24 12:53:49.000000 51.470015 -0.779697 \n", | |
| "1 2017-08-31 15:20:39.463415 2017-08-31 15:59:47.000000 32.115011 34.791570 \n", | |
| "2 2017-09-11 15:12:25.000000 2017-09-11 18:19:26.481172 43.450645 -80.498186 \n", | |
| "3 2017-09-03 12:58:00.000000 2017-09-03 13:04:02.000000 44.053652 -78.674988 \n", | |
| "4 2017-09-25 02:08:49.000000 2017-09-25 02:13:53.000000 44.005362 -80.000066 " | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Preview the first rows of the freshly parsed frame.\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Reformat w/ consistent formats\n", | |
| "df.to_csv('reformatted.csv', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 14.1 s, sys: 1.49 s, total: 15.6 s\n", | |
| "Wall time: 15.6 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "df = pd.read_csv('reformatted.csv', parse_dates=[1, 2],\n", | |
| " date_parser=lambda x: pd.to_datetime(x, format='%Y/%m/%d %H:%M:%S'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Reformat datetime -> int\n", | |
| "df['enter_time'] = df['enter_time'].astype(int)\n", | |
| "df['exit_time'] = df['exit_time'].astype(int)\n", | |
| "df.to_csv('ints.csv', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 8.13 s, sys: 644 ms, total: 8.78 s\n", | |
| "Wall time: 8.78 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "# w/o casting back to datetimes: the time columns are left as whatever\n", | |
| "# read_csv infers from the integer text in ints.csv\n", | |
| "df = pd.read_csv('ints.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 8.14 s, sys: 624 ms, total: 8.77 s\n", | |
| "Wall time: 8.77 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "# w/ casting back to datetimes: reload the integer CSV, then reinterpret\n", | |
| "# both time columns as nanosecond-resolution datetime64 values\n", | |
| "df = pd.read_csv('ints.csv')\n", | |
| "df['enter_time'] = df['enter_time'].astype('<M8[ns]')\n", | |
| "df['exit_time'] = df['exit_time'].astype('<M8[ns]')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# serialize as np.recarray\n", | |
| "arr = df.to_records(index=False)\n", | |
| "arr = arr.astype([('uid', 'S64'), ('enter_time', '<M8[ns]'), ('exit_time', '<M8[ns]'), ('lat', '<f8'), ('lng', '<f8')])\n", | |
| "np.save('df.npy', arr, allow_pickle=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 304 ms, sys: 468 ms, total: 772 ms\n", | |
| "Wall time: 770 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>uid</th>\n", | |
| " <th>enter_time</th>\n", | |
| " <th>exit_time</th>\n", | |
| " <th>lat</th>\n", | |
| " <th>lng</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>b'7e4ec7c997fd3836ed6845ef145cfafe03e14455925a...</td>\n", | |
| " <td>2017-08-24 12:48:21.000000</td>\n", | |
| " <td>2017-08-24 12:53:49.000000</td>\n", | |
| " <td>51.470015</td>\n", | |
| " <td>-0.779697</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>b'1a3ae5604c2d26b734f034a8956df50799ebf02d69de...</td>\n", | |
| " <td>2017-08-31 15:20:39.463415</td>\n", | |
| " <td>2017-08-31 15:59:47.000000</td>\n", | |
| " <td>32.115011</td>\n", | |
| " <td>34.791570</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>b'3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e...</td>\n", | |
| " <td>2017-09-11 15:12:25.000000</td>\n", | |
| " <td>2017-09-11 18:19:26.481172</td>\n", | |
| " <td>43.450645</td>\n", | |
| " <td>-80.498186</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>b'6d8480d66cad0c5c0bf52799084dd29752ce43a1663d...</td>\n", | |
| " <td>2017-09-03 12:58:00.000000</td>\n", | |
| " <td>2017-09-03 13:04:02.000000</td>\n", | |
| " <td>44.053652</td>\n", | |
| " <td>-78.674988</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>b'c436a84188140154b4f2a127a7324b12f3bf5484989a...</td>\n", | |
| " <td>2017-09-25 02:08:49.000000</td>\n", | |
| " <td>2017-09-25 02:13:53.000000</td>\n", | |
| " <td>44.005362</td>\n", | |
| " <td>-80.000066</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " uid \\\n", | |
| "0 b'7e4ec7c997fd3836ed6845ef145cfafe03e14455925a... \n", | |
| "1 b'1a3ae5604c2d26b734f034a8956df50799ebf02d69de... \n", | |
| "2 b'3421d00392ba7184c0a4973bcbc8b0911ff7de49a42e... \n", | |
| "3 b'6d8480d66cad0c5c0bf52799084dd29752ce43a1663d... \n", | |
| "4 b'c436a84188140154b4f2a127a7324b12f3bf5484989a... \n", | |
| "\n", | |
| " enter_time exit_time lat lng \n", | |
| "0 2017-08-24 12:48:21.000000 2017-08-24 12:53:49.000000 51.470015 -0.779697 \n", | |
| "1 2017-08-31 15:20:39.463415 2017-08-31 15:59:47.000000 32.115011 34.791570 \n", | |
| "2 2017-09-11 15:12:25.000000 2017-09-11 18:19:26.481172 43.450645 -80.498186 \n", | |
| "3 2017-09-03 12:58:00.000000 2017-09-03 13:04:02.000000 44.053652 -78.674988 \n", | |
| "4 2017-09-25 02:08:49.000000 2017-09-25 02:13:53.000000 44.005362 -80.000066 " | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "# Load the saved record array back into a DataFrame and preview it;\n", | |
| "# note in the output that uid comes back as bytes (b'...'), not str.\n", | |
| "pd.DataFrame.from_records(np.load('df.npy')).head()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment