Skip to content

Instantly share code, notes, and snippets.

@maartenbreddels
Created July 14, 2018 04:05
Show Gist options
  • Save maartenbreddels/b962f427a571f849481067ac4b117acb to your computer and use it in GitHub Desktop.
Save maartenbreddels/b962f427a571f849481067ac4b117acb to your computer and use it in GitHub Desktop.
scipy2018 lightning talk on vaex
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import pylab as plt\n",
"import numpy as np\n",
"import warnings; warnings.simplefilter('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Vaex: out-of-core dataframes\n",
"### By Maarten Breddels (freelance/independent) @ SciPy 2018\n",
" * conda install -c conda-forge vaex\n",
" * pip install --pre vaex\n",
" \n",
"# What is it?\n",
" * pandas-like dataframe library for large datasets ($\\sim 10^9$ rows)\n",
" * memory-mapped (mmap) columnar data\n",
" * zero-memory-copy policy\n",
" * statistics on N-d grids ($>10^9$ rows/s)\n",
" * visualization built in\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import vaex"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-r--r--@ 1 maartenbreddels staff 23G Mar 29 2017 /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5\r\n"
]
}
],
"source": [
"ls -lh /Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>.vaex-description pre {\n",
" max-width : 450px;\n",
" white-space : nowrap;\n",
" overflow : hidden;\n",
" text-overflow: ellipsis;\n",
" }\n",
"\n",
" .vex-description pre:hover {\n",
" max-width : initial;\n",
" white-space: pre;\n",
" }</style>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<table class='table-striped'><thead><tr><th>#</th><th>VendorID</th><th>dropoff_dayofweek</th><th>dropoff_hour</th><th>dropoff_latitude</th><th>dropoff_longitude</th><th>extra</th><th>fare_amount</th><th>improvement_surcharge</th><th>mta_tax</th><th>passenger_count</th><th>payment_type</th><th>pickup_dayofweek</th><th>pickup_hour</th><th>pickup_latitude</th><th>pickup_longitude</th><th>tip_amount</th><th>tolls_amount</th><th>total_amount</th><th>tpep_dropoff_datetime</th><th>tpep_pickup_datetime</th><th>trip_distance</th></tr></thead><tr><td><i style='opacity: 0.6'>0</i></td><td>2</td><td>3.0</td><td>19.0</td><td>40.750617980957031</td><td>-73.974784851074219</td><td>1.0</td><td>12.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>19.0</td><td>40.750110626220703</td><td>-73.993896484375</td><td>3.25</td><td>0.0</td><td>17.050000000000001</td><td>numpy.datetime64('2015-01-15T19:23:42.000000000')</td><td>numpy.datetime64('2015-01-15T19:05:39.000000000')</td><td>1.5900000000000001</td></tr><tr><td><i style='opacity: 0.6'>1</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.759109497070312</td><td>-73.994415283203125</td><td>0.5</td><td>14.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>5.0</td><td>20.0</td><td>40.7242431640625</td><td>-74.00164794921875</td><td>2.0</td><td>0.0</td><td>17.800000000000001</td><td>numpy.datetime64('2015-01-10T20:53:28.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>3.2999999999999998</td></tr><tr><td><i style='opacity: 
0.6'>2</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.824413299560547</td><td>-73.951820373535156</td><td>0.5</td><td>9.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.802787780761719</td><td>-73.963340759277344</td><td>0.0</td><td>0.0</td><td>10.800000000000001</td><td>numpy.datetime64('2015-01-10T20:43:41.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>1.8</td></tr><tr><td><i style='opacity: 0.6'>3</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.719985961914062</td><td>-74.004325866699233</td><td>0.5</td><td>3.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.713817596435547</td><td>-74.009086608886719</td><td>0.0</td><td>0.0</td><td>4.7999999999999998</td><td>numpy.datetime64('2015-01-10T20:35:31.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>0.5</td></tr><tr><td><i style='opacity: 0.6'>4</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.742652893066406</td><td>-74.004180908203125</td><td>0.5</td><td>15.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.762428283691406</td><td>-73.971176147460938</td><td>0.0</td><td>0.0</td><td>16.300000000000001</td><td>numpy.datetime64('2015-01-10T20:52:58.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>3.0</td></tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><tr><td><i style='opacity: 
0.6'>146,112,984</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.722469329833984</td><td>-73.986213684082031</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>5</td><td>1</td><td>3.0</td><td>23.0</td><td>40.720870971679688</td><td>-73.993812561035156</td><td>1.76</td><td>0.0</td><td>10.56</td><td>numpy.datetime64('2016-01-01T00:08:18.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:56.000000000')</td><td>1.2</td></tr><tr><td><i style='opacity: 0.6'>146,112,985</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.752388000488281</td><td>-73.93951416015625</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.760280609130852</td><td>-73.96527099609375</td><td>0.0</td><td>0.0</td><td>8.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:05:19.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:58.000000000')</td><td>2.0</td></tr><tr><td><i style='opacity: 0.6'>146,112,986</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.69329833984375</td><td>-73.988670349121094</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.739078521728523</td><td>-73.987297058105469</td><td>0.0</td><td>0.0</td><td>14.800000000000001</td><td>numpy.datetime64('2016-01-01T00:12:55.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>3.7999999999999998</td></tr><tr><td><i style='opacity: 0.6'>146,112,987</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.705322265625</td><td>-74.017120361328125</td><td>0.5</td><td>8.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>3.0</td><td>23.0</td><td>40.725692749023438</td><td>-73.99755859375</td><td>0.0</td><td>0.0</td><td>9.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:10:26.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.96</td></tr><tr><td><i style='opacity: 
0.6'>146,112,988</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.760570526123047</td><td>-73.990982055664062</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>23.0</td><td>40.767257690429688</td><td>-73.98439788818358</td><td>2.96</td><td>0.0</td><td>17.760000000000002</td><td>numpy.datetime64('2016-01-01T00:21:30.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.0600000000000001</td></tr></table>"
],
"text/plain": [
"<vaex.hdf5.dataset.Hdf5MemoryMapped at 0x10c158da0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = vaex.open('/Users/maartenbreddels/datasets/nytaxi/nyc_taxi2015.hdf5')\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pandas-like, but uses expressions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1.59, 3.3 , 1.8 , ..., 3.8 , 1.96, 1.06])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.trip_distance.values"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<vaex.expression.Expression(expressions='trip_distance')> instance at 0x10d975710 values=[1.59, 3.3, 1.8, 0.5, 3.0 ... (total 146112989 values) ... 1.2, 2.0, 3.8, 1.96, 1.06] "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.trip_distance"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<vaex.expression.Expression(expressions='(tip_amount / total_amount)')> instance at 0x10d975828 values=[0.190615835777, 0.112359550562, 0.0, 0.0, 0.0 ... (total 146112989 values) ... 0.166666666667, 0.0, 0.0, 0.0, 0.166666666667] "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tip_amount/df.total_amount"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1,168,903,912'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bytes_per_column = df.trip_distance.values.dtype.itemsize * len(df)\n",
"f\"{bytes_per_column:,}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Expressions / Virtual columns\n",
"![expres](./meme-expressions.jpg)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df['tip_percentage'] = df.tip_amount/df.total_amount"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<vaex.expression.Expression(expressions='tip_percentage')> instance at 0x10d975ac8 values=[0.190615835777, 0.112359550562, 0.0, 0.0, 0.0 ... (total 146112989 values) ... 0.166666666667, 0.0, 0.0, 0.0, 0.166666666667] "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tip_percentage"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>.vaex-description pre {\n",
" max-width : 450px;\n",
" white-space : nowrap;\n",
" overflow : hidden;\n",
" text-overflow: ellipsis;\n",
" }\n",
"\n",
" .vex-description pre:hover {\n",
" max-width : initial;\n",
" white-space: pre;\n",
" }</style>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<table class='table-striped'><thead><tr><th>#</th><th>VendorID</th><th>dropoff_dayofweek</th><th>dropoff_hour</th><th>dropoff_latitude</th><th>dropoff_longitude</th><th>extra</th><th>fare_amount</th><th>improvement_surcharge</th><th>mta_tax</th><th>passenger_count</th><th>payment_type</th><th>pickup_dayofweek</th><th>pickup_hour</th><th>pickup_latitude</th><th>pickup_longitude</th><th>tip_amount</th><th>tolls_amount</th><th>total_amount</th><th>tpep_dropoff_datetime</th><th>tpep_pickup_datetime</th><th>trip_distance</th><th>tip_percentage</th></tr></thead><tr><td><i style='opacity: 0.6'>0</i></td><td>2</td><td>3.0</td><td>19.0</td><td>40.750617980957031</td><td>-73.974784851074219</td><td>1.0</td><td>12.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>19.0</td><td>40.750110626220703</td><td>-73.993896484375</td><td>3.25</td><td>0.0</td><td>17.050000000000001</td><td>numpy.datetime64('2015-01-15T19:23:42.000000000')</td><td>numpy.datetime64('2015-01-15T19:05:39.000000000')</td><td>1.5900000000000001</td><td>0.1906158357771261</td></tr><tr><td><i style='opacity: 0.6'>1</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.759109497070312</td><td>-73.994415283203125</td><td>0.5</td><td>14.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>5.0</td><td>20.0</td><td>40.7242431640625</td><td>-74.00164794921875</td><td>2.0</td><td>0.0</td><td>17.800000000000001</td><td>numpy.datetime64('2015-01-10T20:53:28.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>3.2999999999999998</td><td>0.11235955056179775</td></tr><tr><td><i style='opacity: 
0.6'>2</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.824413299560547</td><td>-73.951820373535156</td><td>0.5</td><td>9.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.802787780761719</td><td>-73.963340759277344</td><td>0.0</td><td>0.0</td><td>10.800000000000001</td><td>numpy.datetime64('2015-01-10T20:43:41.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:38.000000000')</td><td>1.8</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>3</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.719985961914062</td><td>-74.004325866699233</td><td>0.5</td><td>3.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.713817596435547</td><td>-74.009086608886719</td><td>0.0</td><td>0.0</td><td>4.7999999999999998</td><td>numpy.datetime64('2015-01-10T20:35:31.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>0.5</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>4</i></td><td>1</td><td>5.0</td><td>20.0</td><td>40.742652893066406</td><td>-74.004180908203125</td><td>0.5</td><td>15.0</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>5.0</td><td>20.0</td><td>40.762428283691406</td><td>-73.971176147460938</td><td>0.0</td><td>0.0</td><td>16.300000000000001</td><td>numpy.datetime64('2015-01-10T20:52:58.000000000')</td><td>numpy.datetime64('2015-01-10T20:33:39.000000000')</td><td>3.0</td><td>0.0</td></tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td><tr><td><i style='opacity: 
0.6'>146,112,984</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.722469329833984</td><td>-73.986213684082031</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>5</td><td>1</td><td>3.0</td><td>23.0</td><td>40.720870971679688</td><td>-73.993812561035156</td><td>1.76</td><td>0.0</td><td>10.56</td><td>numpy.datetime64('2016-01-01T00:08:18.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:56.000000000')</td><td>1.2</td><td>0.16666666666666666</td></tr><tr><td><i style='opacity: 0.6'>146,112,985</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.752388000488281</td><td>-73.93951416015625</td><td>0.5</td><td>7.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.760280609130852</td><td>-73.96527099609375</td><td>0.0</td><td>0.0</td><td>8.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:05:19.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:58.000000000')</td><td>2.0</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>146,112,986</i></td><td>1</td><td>4.0</td><td>0.0</td><td>40.69329833984375</td><td>-73.988670349121094</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>2</td><td>2</td><td>3.0</td><td>23.0</td><td>40.739078521728523</td><td>-73.987297058105469</td><td>0.0</td><td>0.0</td><td>14.800000000000001</td><td>numpy.datetime64('2016-01-01T00:12:55.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>3.7999999999999998</td><td>0.0</td></tr><tr><td><i style='opacity: 0.6'>146,112,987</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.705322265625</td><td>-74.017120361328125</td><td>0.5</td><td>8.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>2</td><td>3.0</td><td>23.0</td><td>40.725692749023438</td><td>-73.99755859375</td><td>0.0</td><td>0.0</td><td>9.8000000000000007</td><td>numpy.datetime64('2016-01-01T00:10:26.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.96</td><td>0.0</td></tr><tr><td><i 
style='opacity: 0.6'>146,112,988</i></td><td>2</td><td>4.0</td><td>0.0</td><td>40.760570526123047</td><td>-73.990982055664062</td><td>0.5</td><td>13.5</td><td>0.29999999999999999</td><td>0.5</td><td>1</td><td>1</td><td>3.0</td><td>23.0</td><td>40.767257690429688</td><td>-73.98439788818358</td><td>2.96</td><td>0.0</td><td>17.760000000000002</td><td>numpy.datetime64('2016-01-01T00:21:30.000000000')</td><td>numpy.datetime64('2015-12-31T23:59:59.000000000')</td><td>1.0600000000000001</td><td>0.16666666666666666</td></tr></table>"
],
"text/plain": [
"<vaex.hdf5.dataset.Hdf5MemoryMapped at 0x10c158da0>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.096040300836903153"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.mean(df.tip_percentage)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lazy is good\n",
"See item 10/11 of [\"10 Things I Hate About pandas\"](http://wesmckinney.com/blog/apache-arrow-pandas-internals/) by Wes McKinney\n",
"> When you write df[df.c < 0].d.sum(), pandas creates a temporary DataFrame df[df.c < 0] then sums the d column of that temporary object. If df contains a lot of columns, this is ridiculously wasteful."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.198073253803\n"
]
}
],
"source": [
"# this would cost 46+GB RAM using Pandas\n",
"# and wouldn't be possible on this laptop\n",
"print(df[df.tip_amount > 10].tip_percentage.mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# N-d statistics and viz"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1., 4., 7., ..., 1., 1., 0.],\n",
" [ 8., 8., 14., ..., 0., 1., 0.],\n",
" [ 21., 8., 7., ..., 0., 1., 1.],\n",
" ..., \n",
" [ 1., 5., 0., ..., 0., 0., 0.],\n",
" [ 0., 0., 2., ..., 0., 0., 0.],\n",
" [ 0., 0., 1., ..., 0., 0., 0.]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts2d = df.count(binby=[df.pickup_longitude, df.pickup_latitude], shape=(128,128))\n",
"counts2d"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.image.AxesImage at 0x10800a710>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x10c158940>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.imshow(np.log(counts2d.T+1), origin='lower')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.plot_widget(df.pickup_longitude, df.pickup_latitude, f='log', controls_selection=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.08200438789770971"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tip_percentage.mean(selection=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def arc_distance(theta_1, phi_1, theta_2, phi_2):\n",
" temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2\n",
" + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)\n",
" distance = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))\n",
" return distance * 6400/1.6"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5167.5802938043762"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distance between Groningen (NL) and Austin\n",
"arc_distance(53.1739086, 6.5990374, 30.2813584,-97.7558575)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"df['arc_distance'] = arc_distance(df.pickup_longitude,\n",
" df.pickup_latitude,\n",
" df.dropoff_longitude,\n",
" df.dropoff_latitude)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<vaex.expression.Expression(expressions='(((2 * arctan2(sqrt(((sin(((((dropoff_longitude - pickup_longitude) / 2) * 3.141592653589793) / 180)) ** 2) + ((cos(((pickup_longitude * 3.141592653589793) / 180)) * cos(((dropoff_longitude * 3.141592653589793) / 180))) * (sin(((((dropoff_latitude - pickup_latitude) / 2) * 3.141592653589793) / 180)) ** 2)))), sqrt((1 - ((sin(((((dropoff_longitude - pickup_longitude) / 2) * 3.141592653589793) / 180)) ** 2) + ((cos(((pickup_longitude * 3.141592653589793) / 180)) * cos(((dropoff_longitude * 3.141592653589793) / 180))) * (sin(((((dropoff_latitude - pickup_latitude) / 2) * 3.141592653589793) / 180)) ** 2))))))) * 6400) / 1.6)')> instance at 0x6cecd90b8 values=[1.33427949357, 0.839776114764, 0.906050493444, 0.352906059984, 2.33542574256 ... (total 146112989 values) ... 0.531393648669, 1.80460610131, 0.886795523869, 1.42076456204, 0.477360075018] "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.arc_distance.expand()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.6 s, sys: 28 ms, total: 1.63 s\n",
"Wall time: 258 ms\n"
]
},
{
"data": {
"text/plain": [
"0.096040300836910855"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"df.mean(df.tip_percentage)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 34.6 s, sys: 8.28 s, total: 42.8 s\n",
"Wall time: 9.14 s\n"
]
},
{
"data": {
"text/plain": [
"1.8301208899585906"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"df.mean(df.arc_distance)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"df['arc_distance_jit'] = df.arc_distance.jit_numba()\n",
"# df['arc_distance_jit'] = df.arc_distance.jit_pythran()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 15.2 s, sys: 48.9 ms, total: 15.2 s\n",
"Wall time: 2.22 s\n"
]
},
{
"data": {
"text/plain": [
"1.8301208899586112"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"df.mean(df.arc_distance_jit)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Vaex\n",
" * Familiar API (Pandas)\n",
" * Super fast\n",
" * Expressions, no direct computation\n",
" * does not waste RAM\n",
" * JIT-ing\n",
" * derivatives\n",
" * machine learning: no more pipelines\n",
" * Remote datasets\n",
" * expressions and statistics get transported, not data\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import vaex\n",
"ds = vaex.open('/Users/maartenbreddels/datasets/aquarius/Aq-A-2-999-shuffled.hdf5')\n",
"ds.set_active_fraction(0.2)\n",
"ds.plot_widget('x', 'y', 'z', f='log', extent=[[40, 60]]*3, backend='ipyvolume', shape=100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment