Skip to content

Instantly share code, notes, and snippets.

@rjzamora
Last active September 12, 2025 16:14
Show Gist options
  • Save rjzamora/047afe6b4c4b38cd3c130d9ec1e81dbe to your computer and use it in GitHub Desktop.
Simple from_map example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c1fbae3f-8cac-4246-8d22-aca9d43ac903",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<dask.config.set at 0x7fa1c79e25a0>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pyarrow.dataset as ds\n",
"import dask.dataframe as dd\n",
"from dask import config\n",
"import cudf\n",
"\n",
"# This shouldn't be necessary, but we set\n",
"# the backend explicitly to make sure the\n",
"# necessary dispatch functions get registered.\n",
"config.set({\"dataframe.backend\": \"cudf\"})"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4b8b67b9-6b9d-424c-9d7a-f331261f3b67",
"metadata": {},
"outputs": [],
"source": [
"path = \"/datasets/rzamora/sample_csv\" # Or something like \"gs://pathtofolder/\"\n",
"## Uncomment to write example dataset\n",
"#from dask.datasets import timeseries\n",
"#timeseries()[[\"x\", \"y\", \"name\", \"id\"]].to_csv(path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5ee2b718-f833-4272-a228-a3f1d744a741",
"metadata": {},
"outputs": [],
"source": [
"# Function that loads a fragment to cudf\n",
"def load_csv_partition(paths: list[str], **options):\n",
" # The dataset and/or to_table calls can probably be\n",
" # optimized for GCS performance.\n",
" # E.g. see: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table\n",
" dataset = ds.dataset(paths, format=\"csv\", **options)\n",
" return cudf.DataFrame.from_arrow(dataset.to_table())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e3057e20-43f5-4c3d-bb3a-ca5513528c48",
"metadata": {},
"outputs": [],
"source": [
"# Create PyArrow dataset to collect a list of remote paths\n",
"# (There is probably a better way to do this if it's slow)\n",
"ds_options = {}\n",
"dataset = ds.dataset(path, format=\"csv\", **ds_options)\n",
"# NOTE: You may need to add back the gs:// prefix to each\n",
"# element of this list for GCS\n",
"all_paths = dataset.files"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "134d374a-cc43-4ae5-a54f-76169e364786",
"metadata": {},
"outputs": [],
"source": [
"# Choose how many files to map to each partition (stride).\n",
"# The ideal value will depend on the desired partition size.\n",
"stride = 3\n",
"partitions = [all_paths[i:i+stride] for i in range(0,len(all_paths), stride)]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4b97e3eb-a0f9-44b6-861a-62c32a23fa88",
"metadata": {},
"outputs": [],
"source": [
"# Create a cudf-backed Dask-DataFrame using from_map\n",
"df = dd.from_map(load_csv_partition, partitions, **ds_options)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "64f2ad34-1373-4268-a5d4-35d99c14745e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 timestamp x y name id\n",
"0 2000-01-01 00:00:00 0.414311 0.278401 George 976\n",
"1 2000-01-01 00:00:01 0.653929 0.273288 Alice 1023\n",
"2 2000-01-01 00:00:02 0.057673 -0.576633 Quinn 971\n",
"3 2000-01-01 00:00:03 -0.174379 -0.115820 Michael 1022\n",
"4 2000-01-01 00:00:04 -0.610007 0.607593 Zelda 938\n"
]
}
],
"source": [
"print(df.npartitions, df.head())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6b5375f2-70cb-4bb4-9c98-e1d4a18e93f2",
"metadata": {},
"outputs": [],
"source": [
"# Simple from_map case. \n",
"df_simple = dd.from_map(cudf.read_csv, all_paths)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "595b83d4-7f7c-49d7-853f-963540a83f0d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30 timestamp x y name id\n",
"0 2000-01-01 00:00:00 0.414311 0.278401 George 976\n",
"1 2000-01-01 00:00:01 0.653929 0.273288 Alice 1023\n",
"2 2000-01-01 00:00:02 0.057673 -0.576633 Quinn 971\n",
"3 2000-01-01 00:00:03 -0.174379 -0.115820 Michael 1022\n",
"4 2000-01-01 00:00:04 -0.610007 0.607593 Zelda 938\n"
]
}
],
"source": [
"print(df_simple.npartitions, df_simple.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd1e810d-2374-43b9-ab6e-c0e3ede885be",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment