Last active
September 12, 2025 16:14
-
-
Save rjzamora/047afe6b4c4b38cd3c130d9ec1e81dbe to your computer and use it in GitHub Desktop.
Simple from_map example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "c1fbae3f-8cac-4246-8d22-aca9d43ac903", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<dask.config.set at 0x7fa1c79e25a0>" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pyarrow.dataset as ds\n", | |
"import dask.dataframe as dd\n", | |
"from dask import config\n", | |
"import cudf\n", | |
"\n", | |
"# This shouldn't be necessary, but setting\n", | |
"# the backend to make sure the necessary\n", | |
"# dispatch functions get registered.\n", | |
"config.set({\"dataframe.backend\": \"cudf\"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "4b8b67b9-6b9d-424c-9d7a-f331261f3b67", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"path = \"/datasets/rzamora/sample_csv\" # Or something like \"gs://pathtofolder/\"\n", | |
"## Uncomment to write example dataset\n", | |
"#from dask.datasets import timeseries\n", | |
"#timeseries()[[\"x\", \"y\", \"name\", \"id\"]].to_csv(path)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "5ee2b718-f833-4272-a228-a3f1d744a741", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Function that loads a fragment to cudf\n", | |
"def load_csv_partition(paths: list[str], **options):\n", | |
" # The dataset and/or to_table calls can probably be\n", | |
" # optimized for GCS performance.\n", | |
" # E.g. see: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table\n", | |
" dataset = ds.dataset(paths, format=\"csv\", **options)\n", | |
" return cudf.DataFrame.from_arrow(dataset.to_table())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "e3057e20-43f5-4c3d-bb3a-ca5513528c48", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create PyArrow dataset to collect a list of remote paths\n", | |
"# (There is probably a better way to do this if it's slow)\n", | |
"ds_options = {}\n", | |
"dataset = ds.dataset(path, format=\"csv\", **ds_options)\n", | |
"# NOTE: You may need to add back the gs:// prefix to each\n", | |
"# element of this list for GCS\n", | |
"all_paths = dataset.files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "134d374a-cc43-4ae5-a54f-76169e364786", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Choose how many files to map to each partition (stride).\n", | |
"# The ideal value will depend on the desired partition size.\n", | |
"stride = 3\n", | |
"partitions = [all_paths[i:i+stride] for i in range(0,len(all_paths), stride)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "4b97e3eb-a0f9-44b6-861a-62c32a23fa88", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create a cudf-backed Dask-DataFrame using from_map\n", | |
"df = dd.from_map(load_csv_partition, partitions, **ds_options)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "64f2ad34-1373-4268-a5d4-35d99c14745e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 timestamp x y name id\n", | |
"0 2000-01-01 00:00:00 0.414311 0.278401 George 976\n", | |
"1 2000-01-01 00:00:01 0.653929 0.273288 Alice 1023\n", | |
"2 2000-01-01 00:00:02 0.057673 -0.576633 Quinn 971\n", | |
"3 2000-01-01 00:00:03 -0.174379 -0.115820 Michael 1022\n", | |
"4 2000-01-01 00:00:04 -0.610007 0.607593 Zelda 938\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df.npartitions, df.head())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "6b5375f2-70cb-4bb4-9c98-e1d4a18e93f2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Simple from_map case. \n", | |
"df_simple = dd.from_map(cudf.read_csv, all_paths)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "595b83d4-7f7c-49d7-853f-963540a83f0d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"30 timestamp x y name id\n", | |
"0 2000-01-01 00:00:00 0.414311 0.278401 George 976\n", | |
"1 2000-01-01 00:00:01 0.653929 0.273288 Alice 1023\n", | |
"2 2000-01-01 00:00:02 0.057673 -0.576633 Quinn 971\n", | |
"3 2000-01-01 00:00:03 -0.174379 -0.115820 Michael 1022\n", | |
"4 2000-01-01 00:00:04 -0.610007 0.607593 Zelda 938\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df_simple.npartitions, df_simple.head())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "cd1e810d-2374-43b9-ab6e-c0e3ede885be", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment