Created
February 13, 2020 15:26
-
-
Save jorisvandenbossche/9fe00cba02c221b7b15f8c43bd4a9b97 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Demo of a Dask reader for Arrow Datasets" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import dask\n", | |
| "import dask.dataframe as dd\n", | |
| "import pyarrow as pa\n", | |
| "import pyarrow.dataset as ds" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "We have a subset of the NYC taxi data (6 months of 2016), stored as a Hive-partitioned dataset (partitioned by `year` and `month`):" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "partitioning = ds.HivePartitioning(pa.schema([(\"year\", \"int32\"), (\"month\", \"int32\")]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "ddf = dd.read_arrow_dataset(\"nyc-taxi-data/dask-partitioned/\", partitioning=partitioning)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>vendor_id</th>\n", | |
| " <th>pickup_at</th>\n", | |
| " <th>dropoff_at</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>pickup_longitude</th>\n", | |
| " <th>pickup_latitude</th>\n", | |
| " <th>rate_code_id</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>dropoff_longitude</th>\n", | |
| " <th>dropoff_latitude</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>index</th>\n", | |
| " <th>year</th>\n", | |
| " <th>month</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>npartitions=243</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>object</td>\n", | |
| " <td>datetime64[ns]</td>\n", | |
| " <td>datetime64[ns]</td>\n", | |
| " <td>int8</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>object</td>\n", | |
| " <td>object</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>object</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>float32</td>\n", | |
| " <td>int64</td>\n", | |
| " <td>int32</td>\n", | |
| " <td>int32</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th></th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| "<div>Dask Name: read-arrow-dataset, 243 tasks</div>" | |
| ], | |
| "text/plain": [ | |
| "Dask DataFrame Structure:\n", | |
| " vendor_id pickup_at dropoff_at passenger_count trip_distance pickup_longitude pickup_latitude rate_code_id store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount index year month\n", | |
| "npartitions=243 \n", | |
| " object datetime64[ns] datetime64[ns] int8 float32 float32 float32 object object float32 float32 object float32 float32 float32 float32 float32 float32 float32 int64 int32 int32\n", | |
| " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
| "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
| " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
| " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
| "Dask Name: read-arrow-dataset, 243 tasks" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ddf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "How many rows? (Here I take the length of a single column rather than of the full dataframe, to avoid loading all of the data.)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 5.41 s, sys: 51.3 ms, total: 5.46 s\n", | |
| "Wall time: 947 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "69406520" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "len(ddf.trip_distance)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Average `trip_distance`?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 5.8 s, sys: 86.3 ms, total: 5.88 s\n", | |
| "Wall time: 1.1 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "4.850022075555726" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time ddf[\"trip_distance\"].mean().compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "For now, a filter has to be specified in the `read_arrow_dataset` call itself:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "ddf_subset = dd.read_arrow_dataset(\"nyc-taxi-data/dask-partitioned/\", partitioning=partitioning,\n", | |
| " filter=ds.field('passenger_count') > 8)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 1min 10s, sys: 831 ms, total: 1min 11s\n", | |
| "Wall time: 10.3 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "res = ddf_subset.compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>vendor_id</th>\n", | |
| " <th>pickup_at</th>\n", | |
| " <th>dropoff_at</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>pickup_longitude</th>\n", | |
| " <th>pickup_latitude</th>\n", | |
| " <th>rate_code_id</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>dropoff_longitude</th>\n", | |
| " <th>...</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>index</th>\n", | |
| " <th>year</th>\n", | |
| " <th>month</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2016-01-20 05:42:05</td>\n", | |
| " <td>2016-01-20 06:12:38</td>\n", | |
| " <td>9</td>\n", | |
| " <td>20.60</td>\n", | |
| " <td>-73.984482</td>\n", | |
| " <td>40.759647</td>\n", | |
| " <td>3</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-74.177200</td>\n", | |
| " <td>...</td>\n", | |
| " <td>72.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>15.000000</td>\n", | |
| " <td>10.50</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>98.800003</td>\n", | |
| " <td>6960517</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2016-01-17 06:30:09</td>\n", | |
| " <td>2016-01-17 06:59:28</td>\n", | |
| " <td>9</td>\n", | |
| " <td>23.83</td>\n", | |
| " <td>-73.784081</td>\n", | |
| " <td>40.646240</td>\n", | |
| " <td>5</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-73.835701</td>\n", | |
| " <td>...</td>\n", | |
| " <td>96.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>20.469999</td>\n", | |
| " <td>5.54</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>122.809998</td>\n", | |
| " <td>5232322</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2016-01-04 20:00:18</td>\n", | |
| " <td>2016-01-04 20:06:20</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1.30</td>\n", | |
| " <td>-73.981537</td>\n", | |
| " <td>40.781200</td>\n", | |
| " <td>1</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-73.971748</td>\n", | |
| " <td>...</td>\n", | |
| " <td>7.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.700000</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>9.000000</td>\n", | |
| " <td>1074037</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2016-01-04 20:17:17</td>\n", | |
| " <td>2016-01-04 20:24:00</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1.00</td>\n", | |
| " <td>-73.981506</td>\n", | |
| " <td>40.780659</td>\n", | |
| " <td>1</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-73.981888</td>\n", | |
| " <td>...</td>\n", | |
| " <td>6.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>7.800000</td>\n", | |
| " <td>1076580</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2016-01-04 20:30:13</td>\n", | |
| " <td>2016-01-04 20:33:59</td>\n", | |
| " <td>9</td>\n", | |
| " <td>0.50</td>\n", | |
| " <td>-73.979240</td>\n", | |
| " <td>40.757847</td>\n", | |
| " <td>1</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-73.985580</td>\n", | |
| " <td>...</td>\n", | |
| " <td>4.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>5.800000</td>\n", | |
| " <td>1078458</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2016-06-19 04:27:27</td>\n", | |
| " <td>2016-06-19 04:45:50</td>\n", | |
| " <td>9</td>\n", | |
| " <td>9.80</td>\n", | |
| " <td>-73.991066</td>\n", | |
| " <td>40.750229</td>\n", | |
| " <td>1</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-73.861603</td>\n", | |
| " <td>...</td>\n", | |
| " <td>28.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>5.54</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.340000</td>\n", | |
| " <td>6874860</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2016-06-14 22:22:00</td>\n", | |
| " <td>2016-06-14 22:51:26</td>\n", | |
| " <td>9</td>\n", | |
| " <td>21.34</td>\n", | |
| " <td>-74.007515</td>\n", | |
| " <td>40.723526</td>\n", | |
| " <td>5</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-74.359749</td>\n", | |
| " <td>...</td>\n", | |
| " <td>95.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>10.50</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>105.800003</td>\n", | |
| " <td>5240621</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2016-06-15 07:42:14</td>\n", | |
| " <td>2016-06-15 07:42:27</td>\n", | |
| " <td>9</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>-73.973999</td>\n", | |
| " <td>40.750431</td>\n", | |
| " <td>5</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-73.973999</td>\n", | |
| " <td>...</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>1.200000</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>11.000000</td>\n", | |
| " <td>5317592</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2016-06-27 16:04:58</td>\n", | |
| " <td>2016-06-27 16:06:28</td>\n", | |
| " <td>9</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>-74.181297</td>\n", | |
| " <td>40.687801</td>\n", | |
| " <td>5</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-74.181290</td>\n", | |
| " <td>...</td>\n", | |
| " <td>90.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>90.800003</td>\n", | |
| " <td>9250166</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2016-06-27 16:04:58</td>\n", | |
| " <td>2016-06-27 16:06:28</td>\n", | |
| " <td>9</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>-74.181297</td>\n", | |
| " <td>40.687801</td>\n", | |
| " <td>5</td>\n", | |
| " <td>N</td>\n", | |
| " <td>-74.181290</td>\n", | |
| " <td>...</td>\n", | |
| " <td>-90.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>-0.5</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>-0.3</td>\n", | |
| " <td>-90.800003</td>\n", | |
| " <td>9250167</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>125 rows × 22 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " vendor_id pickup_at dropoff_at passenger_count \\\n", | |
| "0 1 2016-01-20 05:42:05 2016-01-20 06:12:38 9 \n", | |
| "0 2 2016-01-17 06:30:09 2016-01-17 06:59:28 9 \n", | |
| "0 1 2016-01-04 20:00:18 2016-01-04 20:06:20 9 \n", | |
| "1 1 2016-01-04 20:17:17 2016-01-04 20:24:00 9 \n", | |
| "2 1 2016-01-04 20:30:13 2016-01-04 20:33:59 9 \n", | |
| ".. ... ... ... ... \n", | |
| "0 1 2016-06-19 04:27:27 2016-06-19 04:45:50 9 \n", | |
| "0 2 2016-06-14 22:22:00 2016-06-14 22:51:26 9 \n", | |
| "1 2 2016-06-15 07:42:14 2016-06-15 07:42:27 9 \n", | |
| "0 2 2016-06-27 16:04:58 2016-06-27 16:06:28 9 \n", | |
| "1 2 2016-06-27 16:04:58 2016-06-27 16:06:28 9 \n", | |
| "\n", | |
| " trip_distance pickup_longitude pickup_latitude rate_code_id \\\n", | |
| "0 20.60 -73.984482 40.759647 3 \n", | |
| "0 23.83 -73.784081 40.646240 5 \n", | |
| "0 1.30 -73.981537 40.781200 1 \n", | |
| "1 1.00 -73.981506 40.780659 1 \n", | |
| "2 0.50 -73.979240 40.757847 1 \n", | |
| ".. ... ... ... ... \n", | |
| "0 9.80 -73.991066 40.750229 1 \n", | |
| "0 21.34 -74.007515 40.723526 5 \n", | |
| "1 0.00 -73.973999 40.750431 5 \n", | |
| "0 0.00 -74.181297 40.687801 5 \n", | |
| "1 0.00 -74.181297 40.687801 5 \n", | |
| "\n", | |
| " store_and_fwd_flag dropoff_longitude ... fare_amount extra mta_tax \\\n", | |
| "0 N -74.177200 ... 72.5 0.5 0.0 \n", | |
| "0 N -73.835701 ... 96.0 0.0 0.5 \n", | |
| "0 N -73.971748 ... 7.0 0.5 0.5 \n", | |
| "1 N -73.981888 ... 6.5 0.5 0.5 \n", | |
| "2 N -73.985580 ... 4.5 0.5 0.5 \n", | |
| ".. ... ... ... ... ... ... \n", | |
| "0 N -73.861603 ... 28.5 0.5 0.5 \n", | |
| "0 N -74.359749 ... 95.0 0.0 0.0 \n", | |
| "1 N -73.973999 ... 9.0 0.0 0.5 \n", | |
| "0 N -74.181290 ... 90.0 0.0 0.5 \n", | |
| "1 N -74.181290 ... -90.0 0.0 -0.5 \n", | |
| "\n", | |
| " tip_amount tolls_amount improvement_surcharge total_amount index \\\n", | |
| "0 15.000000 10.50 0.3 98.800003 6960517 \n", | |
| "0 20.469999 5.54 0.3 122.809998 5232322 \n", | |
| "0 0.700000 0.00 0.3 9.000000 1074037 \n", | |
| "1 0.000000 0.00 0.3 7.800000 1076580 \n", | |
| "2 0.000000 0.00 0.3 5.800000 1078458 \n", | |
| ".. ... ... ... ... ... \n", | |
| "0 0.000000 5.54 0.3 35.340000 6874860 \n", | |
| "0 0.000000 10.50 0.3 105.800003 5240621 \n", | |
| "1 1.200000 0.00 0.3 11.000000 5317592 \n", | |
| "0 0.000000 0.00 0.3 90.800003 9250166 \n", | |
| "1 0.000000 0.00 -0.3 -90.800003 9250167 \n", | |
| "\n", | |
| " year month \n", | |
| "0 2016 1 \n", | |
| "0 2016 1 \n", | |
| "0 2016 1 \n", | |
| "1 2016 1 \n", | |
| "2 2016 1 \n", | |
| ".. ... ... \n", | |
| "0 2016 6 \n", | |
| "0 2016 6 \n", | |
| "1 2016 6 \n", | |
| "0 2016 6 \n", | |
| "1 2016 6 \n", | |
| "\n", | |
| "[125 rows x 22 columns]" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "res" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "This filter resulted in a pandas DataFrame of 125 rows.\n", | |
| "\n", | |
| "Getting a single column of that subset:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 4.35 s, sys: 71.7 ms, total: 4.42 s\n", | |
| "Wall time: 864 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>trip_distance</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>20.60</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>23.83</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1.30</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1.00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0.50</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>9.80</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>21.34</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>0.00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0.00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>0.00</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>125 rows × 1 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " trip_distance\n", | |
| "0 20.60\n", | |
| "0 23.83\n", | |
| "0 1.30\n", | |
| "1 1.00\n", | |
| "2 0.50\n", | |
| ".. ...\n", | |
| "0 9.80\n", | |
| "0 21.34\n", | |
| "1 0.00\n", | |
| "0 0.00\n", | |
| "1 0.00\n", | |
| "\n", | |
| "[125 rows x 1 columns]" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "ddf_subset[['trip_distance']].compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python (arrow-dev)", | |
| "language": "python", | |
| "name": "arrow-dev" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment