Skip to content

Instantly share code, notes, and snippets.

@jorisvandenbossche
Created February 13, 2020 15:26
Show Gist options
  • Select an option

  • Save jorisvandenbossche/9fe00cba02c221b7b15f8c43bd4a9b97 to your computer and use it in GitHub Desktop.

Select an option

Save jorisvandenbossche/9fe00cba02c221b7b15f8c43bd4a9b97 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Demo of a Dask reader for Arrow Datasets"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import dask\n",
"import dask.dataframe as dd\n",
"# `pa` is required by the partitioning cell below (pa.schema); without this\n",
"# import the notebook fails with a NameError on a fresh kernel.\n",
"import pyarrow as pa\n",
"import pyarrow.dataset as ds"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We use a subset of the NYC taxi data (6 months of 2016):"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Hive-flavoured partitioning with two int32 partition fields: year and month.\n",
"partitioning = ds.HivePartitioning(\n",
"    pa.schema(\n",
"        [\n",
"            pa.field(\"year\", pa.int32()),\n",
"            pa.field(\"month\", pa.int32()),\n",
"        ]\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Build a Dask DataFrame on top of the partitioned Arrow dataset directory.\n",
"ddf = dd.read_arrow_dataset(\n",
"    \"nyc-taxi-data/dask-partitioned/\",\n",
"    partitioning=partitioning,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><strong>Dask DataFrame Structure:</strong></div>\n",
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>vendor_id</th>\n",
" <th>pickup_at</th>\n",
" <th>dropoff_at</th>\n",
" <th>passenger_count</th>\n",
" <th>trip_distance</th>\n",
" <th>pickup_longitude</th>\n",
" <th>pickup_latitude</th>\n",
" <th>rate_code_id</th>\n",
" <th>store_and_fwd_flag</th>\n",
" <th>dropoff_longitude</th>\n",
" <th>dropoff_latitude</th>\n",
" <th>payment_type</th>\n",
" <th>fare_amount</th>\n",
" <th>extra</th>\n",
" <th>mta_tax</th>\n",
" <th>tip_amount</th>\n",
" <th>tolls_amount</th>\n",
" <th>improvement_surcharge</th>\n",
" <th>total_amount</th>\n",
" <th>index</th>\n",
" <th>year</th>\n",
" <th>month</th>\n",
" </tr>\n",
" <tr>\n",
" <th>npartitions=243</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th></th>\n",
" <td>object</td>\n",
" <td>datetime64[ns]</td>\n",
" <td>datetime64[ns]</td>\n",
" <td>int8</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>object</td>\n",
" <td>object</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>object</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>float32</td>\n",
" <td>int64</td>\n",
" <td>int32</td>\n",
" <td>int32</td>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
"<div>Dask Name: read-arrow-dataset, 243 tasks</div>"
],
"text/plain": [
"Dask DataFrame Structure:\n",
" vendor_id pickup_at dropoff_at passenger_count trip_distance pickup_longitude pickup_latitude rate_code_id store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount index year month\n",
"npartitions=243 \n",
" object datetime64[ns] datetime64[ns] int8 float32 float32 float32 object object float32 float32 object float32 float32 float32 float32 float32 float32 float32 int64 int32 int32\n",
" ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
" ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n",
"Dask Name: read-arrow-dataset, 243 tasks"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the Dask DataFrame structure (column names, dtypes, npartitions).\n",
"ddf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"How many rows? (Here I take the length of a single column rather than of the full dataframe, to avoid loading all of the data.)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.41 s, sys: 51.3 ms, total: 5.46 s\n",
"Wall time: 947 ms\n"
]
},
{
"data": {
"text/plain": [
"69406520"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Count rows through a single column so only that column has to be read.\n",
"len(ddf[\"trip_distance\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Average trip_distance?"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.8 s, sys: 86.3 ms, total: 5.88 s\n",
"Wall time: 1.1 s\n"
]
},
{
"data": {
"text/plain": [
"4.850022075555726"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Mean over the whole column; .compute() triggers the actual evaluation.\n",
"ddf[\"trip_distance\"].mean().compute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For now, a filter has to be passed directly to `read_arrow_dataset`:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Same dataset as above, restricted at read time to rows with passenger_count > 8.\n",
"ddf_subset = dd.read_arrow_dataset(\n",
"    \"nyc-taxi-data/dask-partitioned/\",\n",
"    partitioning=partitioning,\n",
"    filter=ds.field(\"passenger_count\") > 8,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 10s, sys: 831 ms, total: 1min 11s\n",
"Wall time: 10.3 s\n"
]
}
],
"source": [
"%%time\n",
"# Materialize the filtered subset as an in-memory pandas DataFrame.\n",
"res = ddf_subset.compute()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>vendor_id</th>\n",
" <th>pickup_at</th>\n",
" <th>dropoff_at</th>\n",
" <th>passenger_count</th>\n",
" <th>trip_distance</th>\n",
" <th>pickup_longitude</th>\n",
" <th>pickup_latitude</th>\n",
" <th>rate_code_id</th>\n",
" <th>store_and_fwd_flag</th>\n",
" <th>dropoff_longitude</th>\n",
" <th>...</th>\n",
" <th>fare_amount</th>\n",
" <th>extra</th>\n",
" <th>mta_tax</th>\n",
" <th>tip_amount</th>\n",
" <th>tolls_amount</th>\n",
" <th>improvement_surcharge</th>\n",
" <th>total_amount</th>\n",
" <th>index</th>\n",
" <th>year</th>\n",
" <th>month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2016-01-20 05:42:05</td>\n",
" <td>2016-01-20 06:12:38</td>\n",
" <td>9</td>\n",
" <td>20.60</td>\n",
" <td>-73.984482</td>\n",
" <td>40.759647</td>\n",
" <td>3</td>\n",
" <td>N</td>\n",
" <td>-74.177200</td>\n",
" <td>...</td>\n",
" <td>72.5</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>15.000000</td>\n",
" <td>10.50</td>\n",
" <td>0.3</td>\n",
" <td>98.800003</td>\n",
" <td>6960517</td>\n",
" <td>2016</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>2016-01-17 06:30:09</td>\n",
" <td>2016-01-17 06:59:28</td>\n",
" <td>9</td>\n",
" <td>23.83</td>\n",
" <td>-73.784081</td>\n",
" <td>40.646240</td>\n",
" <td>5</td>\n",
" <td>N</td>\n",
" <td>-73.835701</td>\n",
" <td>...</td>\n",
" <td>96.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>20.469999</td>\n",
" <td>5.54</td>\n",
" <td>0.3</td>\n",
" <td>122.809998</td>\n",
" <td>5232322</td>\n",
" <td>2016</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2016-01-04 20:00:18</td>\n",
" <td>2016-01-04 20:06:20</td>\n",
" <td>9</td>\n",
" <td>1.30</td>\n",
" <td>-73.981537</td>\n",
" <td>40.781200</td>\n",
" <td>1</td>\n",
" <td>N</td>\n",
" <td>-73.971748</td>\n",
" <td>...</td>\n",
" <td>7.0</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.700000</td>\n",
" <td>0.00</td>\n",
" <td>0.3</td>\n",
" <td>9.000000</td>\n",
" <td>1074037</td>\n",
" <td>2016</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2016-01-04 20:17:17</td>\n",
" <td>2016-01-04 20:24:00</td>\n",
" <td>9</td>\n",
" <td>1.00</td>\n",
" <td>-73.981506</td>\n",
" <td>40.780659</td>\n",
" <td>1</td>\n",
" <td>N</td>\n",
" <td>-73.981888</td>\n",
" <td>...</td>\n",
" <td>6.5</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.3</td>\n",
" <td>7.800000</td>\n",
" <td>1076580</td>\n",
" <td>2016</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2016-01-04 20:30:13</td>\n",
" <td>2016-01-04 20:33:59</td>\n",
" <td>9</td>\n",
" <td>0.50</td>\n",
" <td>-73.979240</td>\n",
" <td>40.757847</td>\n",
" <td>1</td>\n",
" <td>N</td>\n",
" <td>-73.985580</td>\n",
" <td>...</td>\n",
" <td>4.5</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.3</td>\n",
" <td>5.800000</td>\n",
" <td>1078458</td>\n",
" <td>2016</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2016-06-19 04:27:27</td>\n",
" <td>2016-06-19 04:45:50</td>\n",
" <td>9</td>\n",
" <td>9.80</td>\n",
" <td>-73.991066</td>\n",
" <td>40.750229</td>\n",
" <td>1</td>\n",
" <td>N</td>\n",
" <td>-73.861603</td>\n",
" <td>...</td>\n",
" <td>28.5</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.000000</td>\n",
" <td>5.54</td>\n",
" <td>0.3</td>\n",
" <td>35.340000</td>\n",
" <td>6874860</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>2016-06-14 22:22:00</td>\n",
" <td>2016-06-14 22:51:26</td>\n",
" <td>9</td>\n",
" <td>21.34</td>\n",
" <td>-74.007515</td>\n",
" <td>40.723526</td>\n",
" <td>5</td>\n",
" <td>N</td>\n",
" <td>-74.359749</td>\n",
" <td>...</td>\n",
" <td>95.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>10.50</td>\n",
" <td>0.3</td>\n",
" <td>105.800003</td>\n",
" <td>5240621</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2016-06-15 07:42:14</td>\n",
" <td>2016-06-15 07:42:27</td>\n",
" <td>9</td>\n",
" <td>0.00</td>\n",
" <td>-73.973999</td>\n",
" <td>40.750431</td>\n",
" <td>5</td>\n",
" <td>N</td>\n",
" <td>-73.973999</td>\n",
" <td>...</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>1.200000</td>\n",
" <td>0.00</td>\n",
" <td>0.3</td>\n",
" <td>11.000000</td>\n",
" <td>5317592</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>2016-06-27 16:04:58</td>\n",
" <td>2016-06-27 16:06:28</td>\n",
" <td>9</td>\n",
" <td>0.00</td>\n",
" <td>-74.181297</td>\n",
" <td>40.687801</td>\n",
" <td>5</td>\n",
" <td>N</td>\n",
" <td>-74.181290</td>\n",
" <td>...</td>\n",
" <td>90.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.3</td>\n",
" <td>90.800003</td>\n",
" <td>9250166</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2016-06-27 16:04:58</td>\n",
" <td>2016-06-27 16:06:28</td>\n",
" <td>9</td>\n",
" <td>0.00</td>\n",
" <td>-74.181297</td>\n",
" <td>40.687801</td>\n",
" <td>5</td>\n",
" <td>N</td>\n",
" <td>-74.181290</td>\n",
" <td>...</td>\n",
" <td>-90.0</td>\n",
" <td>0.0</td>\n",
" <td>-0.5</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>-0.3</td>\n",
" <td>-90.800003</td>\n",
" <td>9250167</td>\n",
" <td>2016</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>125 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" vendor_id pickup_at dropoff_at passenger_count \\\n",
"0 1 2016-01-20 05:42:05 2016-01-20 06:12:38 9 \n",
"0 2 2016-01-17 06:30:09 2016-01-17 06:59:28 9 \n",
"0 1 2016-01-04 20:00:18 2016-01-04 20:06:20 9 \n",
"1 1 2016-01-04 20:17:17 2016-01-04 20:24:00 9 \n",
"2 1 2016-01-04 20:30:13 2016-01-04 20:33:59 9 \n",
".. ... ... ... ... \n",
"0 1 2016-06-19 04:27:27 2016-06-19 04:45:50 9 \n",
"0 2 2016-06-14 22:22:00 2016-06-14 22:51:26 9 \n",
"1 2 2016-06-15 07:42:14 2016-06-15 07:42:27 9 \n",
"0 2 2016-06-27 16:04:58 2016-06-27 16:06:28 9 \n",
"1 2 2016-06-27 16:04:58 2016-06-27 16:06:28 9 \n",
"\n",
" trip_distance pickup_longitude pickup_latitude rate_code_id \\\n",
"0 20.60 -73.984482 40.759647 3 \n",
"0 23.83 -73.784081 40.646240 5 \n",
"0 1.30 -73.981537 40.781200 1 \n",
"1 1.00 -73.981506 40.780659 1 \n",
"2 0.50 -73.979240 40.757847 1 \n",
".. ... ... ... ... \n",
"0 9.80 -73.991066 40.750229 1 \n",
"0 21.34 -74.007515 40.723526 5 \n",
"1 0.00 -73.973999 40.750431 5 \n",
"0 0.00 -74.181297 40.687801 5 \n",
"1 0.00 -74.181297 40.687801 5 \n",
"\n",
" store_and_fwd_flag dropoff_longitude ... fare_amount extra mta_tax \\\n",
"0 N -74.177200 ... 72.5 0.5 0.0 \n",
"0 N -73.835701 ... 96.0 0.0 0.5 \n",
"0 N -73.971748 ... 7.0 0.5 0.5 \n",
"1 N -73.981888 ... 6.5 0.5 0.5 \n",
"2 N -73.985580 ... 4.5 0.5 0.5 \n",
".. ... ... ... ... ... ... \n",
"0 N -73.861603 ... 28.5 0.5 0.5 \n",
"0 N -74.359749 ... 95.0 0.0 0.0 \n",
"1 N -73.973999 ... 9.0 0.0 0.5 \n",
"0 N -74.181290 ... 90.0 0.0 0.5 \n",
"1 N -74.181290 ... -90.0 0.0 -0.5 \n",
"\n",
" tip_amount tolls_amount improvement_surcharge total_amount index \\\n",
"0 15.000000 10.50 0.3 98.800003 6960517 \n",
"0 20.469999 5.54 0.3 122.809998 5232322 \n",
"0 0.700000 0.00 0.3 9.000000 1074037 \n",
"1 0.000000 0.00 0.3 7.800000 1076580 \n",
"2 0.000000 0.00 0.3 5.800000 1078458 \n",
".. ... ... ... ... ... \n",
"0 0.000000 5.54 0.3 35.340000 6874860 \n",
"0 0.000000 10.50 0.3 105.800003 5240621 \n",
"1 1.200000 0.00 0.3 11.000000 5317592 \n",
"0 0.000000 0.00 0.3 90.800003 9250166 \n",
"1 0.000000 0.00 -0.3 -90.800003 9250167 \n",
"\n",
" year month \n",
"0 2016 1 \n",
"0 2016 1 \n",
"0 2016 1 \n",
"1 2016 1 \n",
"2 2016 1 \n",
".. ... ... \n",
"0 2016 6 \n",
"0 2016 6 \n",
"1 2016 6 \n",
"0 2016 6 \n",
"1 2016 6 \n",
"\n",
"[125 rows x 22 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the materialized result of the filtered read.\n",
"res"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This filter resulted in a pandas DataFrame of 125 rows.\n",
"\n",
"Getting a single column of that subset:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.35 s, sys: 71.7 ms, total: 4.42 s\n",
"Wall time: 864 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>trip_distance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>20.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>23.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9.80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>21.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>125 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" trip_distance\n",
"0 20.60\n",
"0 23.83\n",
"0 1.30\n",
"1 1.00\n",
"2 0.50\n",
".. ...\n",
"0 9.80\n",
"0 21.34\n",
"1 0.00\n",
"0 0.00\n",
"1 0.00\n",
"\n",
"[125 rows x 1 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Double brackets select a list of columns, so the result stays a one-column DataFrame.\n",
"ddf_subset[['trip_distance']].compute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (arrow-dev)",
"language": "python",
"name": "arrow-dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment