Created
July 14, 2018 04:05
-
-
Save maartenbreddels/b962f427a571f849481067ac4b117acb to your computer and use it in GitHub Desktop.
scipy2018 lightning talk on vaex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"import pylab as plt\n", | |
"import numpy as np\n", | |
"import warnings; warnings.simplefilter('ignore')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Vaex: out of core dataframes\n", | |
"### By Maarten Breddels (freelance/independent) @ SciPy 2018\n", | |
" * conda install -c conda-forge vaex\n", | |
" * pip install --pre vaex\n", | |
" \n", | |
"# What is it?\n", | |
" * pandas like dataframe library for large datasets ($\\sim 10^9$ rows)\n", | |
" * mmap columnar data\n", | |
" * zero memory copy policy\n", | |
" * statistics on N-d grids ($+10^9$ rows/s)\n", | |
" * viz built in\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import vaex" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-rw-r--r--@ 1 maartenbreddels staff 23G Mar 29 2017 /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5\r\n" | |
] | |
} | |
], | |
"source": [ | |
"ls -lh /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>.vaex-description pre {\n", | |
" max-width : 450px;\n", | |
" white-space : nowrap;\n", | |
" overflow : hidden;\n", | |
" text-overflow: ellipsis;\n", | |
" }\n", | |
"\n", | |
" .vex-description pre:hover {\n", | |
" max-width : initial;\n", | |
" white-space: pre;\n", | |
" }</style>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table class='table-striped'><thead><tr><th>#</th><th>VendorID</th><th>dropoff_dayofweek</th><th>dropoff_hour</th><th>dropoff_latitude</th><th>dropoff_longitude</th><th>extra</th><th>fare_amount</th><th>improvement_surcharge</th><th>mta_tax</th><th>passenger_count</th><th>payment_type</th><th>pickup_dayofweek</th><th>pickup_hour</th><th>pickup_latitude</th><th>pickup_longitude</th><th>tip_amount</th><th>tolls_amount</th><th>total_amount</th><th>tpep_dropoff_datetime</th><th>tpep_pickup_datetime</th><th>trip_distance</th></tr></thead><tr><td><i style='opacity: 0.6'>0</i></td><td>2</td><td>3.0</td><td>19.0</td><td>40.750617980957031</td><td>-73.974784851074219</td><td>1.0</td><td>12.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>19.0</td><td>40.750110626220703</td><td>-73.993896484375</td><td>3.25</td><td>0.0</td><td>17.050000000000001</td><td>numpy.datetime64('2015-01-15T19:23:42.000000000')</td><td>numpy.datetime64('2015-01-15T19:05:39.000000000')</td><td>1.5900000000000001</td></tr><tr><td><i style='opacity: 0.6'>1</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.759109497070312</td><td>-73.994415283203125</td><td>0.5</td><td>14.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>5.0</td><td>20.0</td><td>40.7242431640625</td><td>-74.00164794921875</td><td>2.0</td><td>0.0</td><td>17.800000000000001</td><td>numpy.datetime64('2015-01-10T20:53:28.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>3.2999999999999998</td></tr><tr><td><i style='opacity: 
0.6'>2</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.824413299560547</td><td>-73.951820373535156</td><td>0.5</td><td>9.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.802787780761719</td><td>-73.963340759277344</td><td>0.0</td><td>0.0</td><td>10.800000000000001</td><td>numpy.datetime64('2015-01-10T20:43:41.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>1.8</td></tr><tr><td><i style='opacity: 0.6'>3</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.719985961914062</td><td>-74.004325866699233</td><td>0.5</td><td>3.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.713817596435547</td><td>-74.009086608886719</td><td>0.0</td><td>0.0</td><td>4.7999999999999998</td><td>numpy.datetime64('2015-01-10T20:35:31.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>0.5</td></tr><tr><td><i style='opacity: 0.6'>4</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.742652893066406</td><td>-74.004180908203125</td><td>0.5</td><td>15.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.762428283691406</td><td>-73.971176147460938</td><td>0.0</td><td>0.0</td><td>16.300000000000001</td><td>numpy.datetime64('2015-01-10T20:52:58.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>3.0</td></tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><tr><td><i style='opacity: 
0.6'>146,112,984</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.722469329833984</td><td>-73.986213684082031</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>5</td><td>1</td><td>3.0</td><td>23.0</td><td>40.720870971679688</td><td>-73.993812561035156</td><td>1.76</td><td>0.0</td><td>10.56</td><td>numpy.datetime64('2016-01-01T00:08:18.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:56.000000000')</td><td>1.2</td></tr><tr><td><i style='opacity: 0.6'>146,112,985</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.752388000488281</td><td>-73.93951416015625</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.760280609130852</td><td>-73.96527099609375</td><td>0.0</td><td>0.0</td><td>8.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:05:19.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:58.000000000')</td><td>2.0</td></tr><tr><td><i style='opacity: 0.6'>146,112,986</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.69329833984375</td><td>-73.988670349121094</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.739078521728523</td><td>-73.987297058105469</td><td>0.0</td><td>0.0</td><td>14.800000000000001</td><td>numpy.datetime64('2016-01-01T00:12:55.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>3.7999999999999998</td></tr><tr><td><i style='opacity: 0.6'>146,112,987</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.705322265625</td><td>-74.017120361328125</td><td>0.5</td><td>8.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>3.0</td><td>23.0</td><td>40.725692749023438</td><td>-73.99755859375</td><td>0.0</td><td>0.0</td><td>9.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:10:26.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.96</td></tr><tr><td><i style='opacity: 
0.6'>146,112,988</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.760570526123047</td><td>-73.990982055664062</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>23.0</td><td>40.767257690429688</td><td>-73.98439788818358</td><td>2.96</td><td>0.0</td><td>17.760000000000002</td><td>numpy.datetime64('2016-01-01T00:21:30.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.0600000000000001</td></tr></table>" | |
], | |
"text/plain": [ | |
"<vaex.hdf5.dataset.Hdf5MemoryMapped at 0x10c158da0>" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = vaex.open('/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5')\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Pandas like, but uses expressions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 1.59, 3.3 , 1.8 , ..., 3.8 , 1.96, 1.06])" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.trip_distance.values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<vaex.expression.Expression(expressions='trip_distance')> instance at 0x10d975710 values=[1.59, 3.3, 1.8, 0.5, 3.0 ... (total 146112989 values) ... 1.2, 2.0, 3.8, 1.96, 1.06] " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.trip_distance" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<vaex.expression.Expression(expressions='(tip_amount / total_amount)')> instance at 0x10d975828 values=[0.190615835777, 0.112359550562, 0.0, 0.0, 0.0 ... (total 146112989 values) ... 0.166666666667, 0.0, 0.0, 0.0, 0.166666666667] " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.tip_amount/df.total_amount" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'1,168,903,912'" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bytes_per_column = df.trip_distance.values.dtype.itemsize * len(df)\n", | |
"f\"{bytes_per_column:,}\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Expression / Virtual columns\n", | |
"![expres](./meme-expressions.jpg)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df['tip_percentage'] = df.tip_amount/df.total_amount" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<vaex.expression.Expression(expressions='tip_percentage')> instance at 0x10d975ac8 values=[0.190615835777, 0.112359550562, 0.0, 0.0, 0.0 ... (total 146112989 values) ... 0.166666666667, 0.0, 0.0, 0.0, 0.166666666667] " | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.tip_percentage" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>.vaex-description pre {\n", | |
" max-width : 450px;\n", | |
" white-space : nowrap;\n", | |
" overflow : hidden;\n", | |
" text-overflow: ellipsis;\n", | |
" }\n", | |
"\n", | |
" .vex-description pre:hover {\n", | |
" max-width : initial;\n", | |
" white-space: pre;\n", | |
" }</style>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table class='table-striped'><thead><tr><th>#</th><th>VendorID</th><th>dropoff_dayofweek</th><th>dropoff_hour</th><th>dropoff_latitude</th><th>dropoff_longitude</th><th>extra</th><th>fare_amount</th><th>improvement_surcharge</th><th>mta_tax</th><th>passenger_count</th><th>payment_type</th><th>pickup_dayofweek</th><th>pickup_hour</th><th>pickup_latitude</th><th>pickup_longitude</th><th>tip_amount</th><th>tolls_amount</th><th>total_amount</th><th>tpep_dropoff_datetime</th><th>tpep_pickup_datetime</th><th>trip_distance</th><th>tip_percentage</th></tr></thead><tr><td><i style='opacity: 0.6'>0</i></td><td>2</td><td>3.0</td><td>19.0</td><td>40.750617980957031</td><td>-73.974784851074219</td><td>1.0</td><td>12.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>19.0</td><td>40.750110626220703</td><td>-73.993896484375</td><td>3.25</td><td>0.0</td><td>17.050000000000001</td><td>numpy.datetime64('2015-01-15T19:23:42.000000000')</td><td>numpy.datetime64('2015-01-15T19:05:39.000000000')</td><td>1.5900000000000001</td><td>0.1906158357771261</td></tr><tr><td><i style='opacity: 0.6'>1</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.759109497070312</td><td>-73.994415283203125</td><td>0.5</td><td>14.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>5.0</td><td>20.0</td><td>40.7242431640625</td><td>-74.00164794921875</td><td>2.0</td><td>0.0</td><td>17.800000000000001</td><td>numpy.datetime64('2015-01-10T20:53:28.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>3.2999999999999998</td><td>0.11235955056179775</td></tr><tr><td><i style='opacity: 
0.6'>2</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.824413299560547</td><td>-73.951820373535156</td><td>0.5</td><td>9.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.802787780761719</td><td>-73.963340759277344</td><td>0.0</td><td>0.0</td><td>10.800000000000001</td><td>numpy.datetime64('2015-01-10T20:43:41.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>1.8</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>3</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.719985961914062</td><td>-74.004325866699233</td><td>0.5</td><td>3.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.713817596435547</td><td>-74.009086608886719</td><td>0.0</td><td>0.0</td><td>4.7999999999999998</td><td>numpy.datetime64('2015-01-10T20:35:31.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>0.5</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>4</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.742652893066406</td><td>-74.004180908203125</td><td>0.5</td><td>15.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.762428283691406</td><td>-73.971176147460938</td><td>0.0</td><td>0.0</td><td>16.300000000000001</td><td>numpy.datetime64('2015-01-10T20:52:58.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>3.0</td><td>0.0</td></tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><tr><td><i style='opacity: 
0.6'>146,112,984</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.722469329833984</td><td>-73.986213684082031</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>5</td><td>1</td><td>3.0</td><td>23.0</td><td>40.720870971679688</td><td>-73.993812561035156</td><td>1.76</td><td>0.0</td><td>10.56</td><td>numpy.datetime64('2016-01-01T00:08:18.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:56.000000000')</td><td>1.2</td><td>0.16666666666666666</td></tr><tr><td><i style='opacity: 0.6'>146,112,985</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.752388000488281</td><td>-73.93951416015625</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.760280609130852</td><td>-73.96527099609375</td><td>0.0</td><td>0.0</td><td>8.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:05:19.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:58.000000000')</td><td>2.0</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>146,112,986</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.69329833984375</td><td>-73.988670349121094</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.739078521728523</td><td>-73.987297058105469</td><td>0.0</td><td>0.0</td><td>14.800000000000001</td><td>numpy.datetime64('2016-01-01T00:12:55.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>3.7999999999999998</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>146,112,987</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.705322265625</td><td>-74.017120361328125</td><td>0.5</td><td>8.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>3.0</td><td>23.0</td><td>40.725692749023438</td><td>-73.99755859375</td><td>0.0</td><td>0.0</td><td>9.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:10:26.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.96</td><td>0.0</td></tr><tr><td><i 
style='opacity: 0.6'>146,112,988</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.760570526123047</td><td>-73.990982055664062</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>23.0</td><td>40.767257690429688</td><td>-73.98439788818358</td><td>2.96</td><td>0.0</td><td>17.760000000000002</td><td>numpy.datetime64('2016-01-01T00:21:30.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.0600000000000001</td><td>0.16666666666666666</td></tr></table>" | |
], | |
"text/plain": [ | |
"<vaex.hdf5.dataset.Hdf5MemoryMapped at 0x10c158da0>" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.096040300836903153" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.mean(df.tip_percentage)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Lazy is good\n", | |
"See item 10/11 of [\"10 Things I Hate About pandas\"](http://wesmckinney.com/blog/apache-arrow-pandas-internals/) by Wes McKinney\n", | |
"> When you write df[df.c < 0].d.sum(), pandas creates a temporary DataFrame df[df.c < 0] then sums the d column of that temporary object. If df contains a lot of columns, this is ridiculously wasteful." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.198073253803\n" | |
] | |
} | |
], | |
"source": [ | |
"# this would cost 46+GB RAM using Pandas\n", | |
"# and wouldn't be possible on this laptop\n", | |
"print(df[df.tip_amount > 10].tip_percentage.mean())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# N-d statistics and viz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 1., 4., 7., ..., 1., 1., 0.],\n", | |
" [ 8., 8., 14., ..., 0., 1., 0.],\n", | |
" [ 21., 8., 7., ..., 0., 1., 1.],\n", | |
" ..., \n", | |
" [ 1., 5., 0., ..., 0., 0., 0.],\n", | |
" [ 0., 0., 2., ..., 0., 0., 0.],\n", | |
" [ 0., 0., 1., ..., 0., 0., 0.]])" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"counts2d = df.count(binby=[df.pickup_longitude, df.pickup_latitude], shape=(128,128))\n", | |
"counts2d" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<matplotlib.image.AxesImage at 0x10800a710>" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x10c158940>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"plt.imshow(np.log(counts2d.T+1), origin='lower')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.plot_widget(df.pickup_longitude, df.pickup_latitude, f='log', controls_selection=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.08200438789770971" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.tip_percentage.mean(selection=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def arc_distance(theta_1, phi_1, theta_2, phi_2):\n", | |
"    \"\"\"Haversine-style arc distance between two points given in degrees.\n", | |
"\n", | |
"    theta_1/theta_2 feed the cos() terms, so they act as latitudes;\n", | |
"    phi_1/phi_2 act as longitudes. The central angle (radians) is scaled\n", | |
"    by 6400/1.6 -- presumably Earth's radius in km converted to miles.\n", | |
"    Only numpy functions and plain arithmetic are applied to the inputs.\n", | |
"    \"\"\"\n", | |
"    sin_sq_half_dtheta = np.sin((theta_2-theta_1)/2*np.pi/180)**2\n", | |
"    sin_sq_half_dphi = np.sin((phi_2-phi_1)/2*np.pi/180)**2\n", | |
"    cos_product = np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180)\n", | |
"    haversine = sin_sq_half_dtheta + cos_product * sin_sq_half_dphi\n", | |
"    central_angle = 2 * np.arctan2(np.sqrt(haversine), np.sqrt(1-haversine))\n", | |
"    return central_angle * 6400/1.6" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5167.5802938043762" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Distance between Groningen (NL) and Austin\n", | |
"arc_distance(53.1739086, 6.5990374, 30.2813584,-97.7558575)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# arc_distance takes (latitude, longitude) pairs: its theta arguments feed\n", | |
"# the cos() terms, as in the Groningen/Austin call, so latitude goes first.\n", | |
"df['arc_distance'] = arc_distance(df.pickup_latitude,\n", | |
"                                  df.pickup_longitude,\n", | |
"                                  df.dropoff_latitude,\n", | |
"                                  df.dropoff_longitude)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<vaex.expression.Expression(expressions='(((2 * arctan2(sqrt(((sin(((((dropoff_longitude - pickup_longitude) / 2) * 3.141592653589793) / 180)) ** 2) + ((cos(((pickup_longitude * 3.141592653589793) / 180)) * cos(((dropoff_longitude * 3.141592653589793) / 180))) * (sin(((((dropoff_latitude - pickup_latitude) / 2) * 3.141592653589793) / 180)) ** 2)))), sqrt((1 - ((sin(((((dropoff_longitude - pickup_longitude) / 2) * 3.141592653589793) / 180)) ** 2) + ((cos(((pickup_longitude * 3.141592653589793) / 180)) * cos(((dropoff_longitude * 3.141592653589793) / 180))) * (sin(((((dropoff_latitude - pickup_latitude) / 2) * 3.141592653589793) / 180)) ** 2))))))) * 6400) / 1.6)')> instance at 0x6cecd90b8 values=[1.33427949357, 0.839776114764, 0.906050493444, 0.352906059984, 2.33542574256 ... (total 146112989 values) ... 0.531393648669, 1.80460610131, 0.886795523869, 1.42076456204, 0.477360075018] " | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.arc_distance.expand()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.6 s, sys: 28 ms, total: 1.63 s\n", | |
"Wall time: 258 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0.096040300836910855" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.mean(df.tip_percentage)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 34.6 s, sys: 8.28 s, total: 42.8 s\n", | |
"Wall time: 9.14 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"1.8301208899585906" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.mean(df.arc_distance)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df['arc_distance_jit'] = df.arc_distance.jit_numba()\n", | |
"# df['arc_distance_jit'] = df.arc_distance.jit_pythran()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 15.2 s, sys: 48.9 ms, total: 15.2 s\n", | |
"Wall time: 2.22 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"1.8301208899586112" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df.mean(df.arc_distance_jit)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Vaex\n", | |
" * Known API (Pandas)\n", | |
" * Super fast\n", | |
" * Expressions, no direct computation\n", | |
" * does not waste RAM\n", | |
" * JIT-ing\n", | |
" * derivatives\n", | |
" * machine learning: no more pipelines\n", | |
" * Remote datasets\n", | |
" * expressions and statistics get transported, not data\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import vaex\n", | |
"ds = vaex.open('/Users/maartenbreddels/datasets/aquarius/Aq-A-2-999-shuffled.hdf5')\n", | |
"ds.set_active_fraction(0.2)\n", | |
"ds.plot_widget('x', 'y', 'z', f='log', extent=[[40, 60]]*3, backend='ipyvolume', shape=100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment