rjzamora · September 12, 2025 16:14
diff --git a/from_map_csv.ipynb b/from_map_csv.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c1fbae3f-8cac-4246-8d22-aca9d43ac903",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow.dataset as ds\n",
    "import dask.dataframe as dd\n",
    "from dask import config\n",
    "import cudf\n",
    "\n",
    "# Set the cudf backend explicitly. This shouldn't be necessary,\n",
    "# but it makes sure the required dispatch functions get registered.\n",
    "# The trailing `;` keeps the `<dask.config.set at 0x...>` repr (whose\n",
    "# address differs on every run) out of the saved cell output.\n",
    "config.set({\"dataframe.backend\": \"cudf\"});"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4b8b67b9-6b9d-424c-9d7a-f331261f3b67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the CSV files: a local directory,\n",
    "# or something like \"gs://pathtofolder/\".\n",
    "path = \"/datasets/rzamora/sample_csv\"\n",
    "\n",
    "# To write an example dataset to `path`, uncomment:\n",
    "#from dask.datasets import timeseries\n",
    "#timeseries()[[\"x\", \"y\", \"name\", \"id\"]].to_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5ee2b718-f833-4272-a228-a3f1d744a741",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_csv_partition(paths: list[str], **options):\n",
    "    \"\"\"Read a group of CSV files into a single cudf DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    paths : list[str]\n",
    "        CSV files that together make up one partition.\n",
    "    **options\n",
    "        Extra keyword arguments forwarded to ``ds.dataset``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    cudf.DataFrame\n",
    "        Built with ``cudf.DataFrame.from_arrow`` from the Arrow\n",
    "        table of the dataset created over ``paths``.\n",
    "    \"\"\"\n",
    "    # The dataset and/or to_table calls can probably be tuned\n",
    "    # for GCS performance. E.g. see:\n",
    "    # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table\n",
    "    arrow_table = ds.dataset(paths, format=\"csv\", **options).to_table()\n",
    "    return cudf.DataFrame.from_arrow(arrow_table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e3057e20-43f5-4c3d-bb3a-ca5513528c48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let PyArrow build a dataset over `path` so that we can collect\n",
    "# the list of (possibly remote) file paths from it. There is\n",
    "# probably a better way to do this if it turns out to be slow.\n",
    "ds_options = {}\n",
    "dataset = ds.dataset(path, format=\"csv\", **ds_options)\n",
    "\n",
    "# NOTE: For GCS, you may need to add the gs:// prefix back\n",
    "# to each element of this list.\n",
    "all_paths = dataset.files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "134d374a-cc43-4ae5-a54f-76169e364786",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `stride` is the number of files mapped to each partition;\n",
    "# the ideal value depends on the desired partition size.\n",
    "stride = 3\n",
    "partitions = [\n",
    "    all_paths[start : start + stride]\n",
    "    for start in range(0, len(all_paths), stride)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4b97e3eb-a0f9-44b6-861a-62c32a23fa88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a cudf-backed Dask DataFrame with from_map, calling\n",
    "# load_csv_partition on each group of files in `partitions`.\n",
    "df = dd.from_map(\n",
    "    load_csv_partition,\n",
    "    partitions,\n",
    "    **ds_options,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "64f2ad34-1373-4268-a5d4-35d99c14745e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n",
      "            timestamp         x         y     name    id\n",
      "0 2000-01-01 00:00:00  0.414311  0.278401   George   976\n",
      "1 2000-01-01 00:00:01  0.653929  0.273288    Alice  1023\n",
      "2 2000-01-01 00:00:02  0.057673 -0.576633    Quinn   971\n",
      "3 2000-01-01 00:00:03 -0.174379 -0.115820  Michael  1022\n",
      "4 2000-01-01 00:00:04 -0.610007  0.607593    Zelda   938\n"
     ]
    }
   ],
   "source": [
    "# Print the partition count and the preview separately: passing both\n",
    "# to one print() puts the count on the frame's header line and shifts\n",
    "# the column names out of alignment with the rows.\n",
    "print(df.npartitions)\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6b5375f2-70cb-4bb4-9c98-e1d4a18e93f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simplest from_map usage: hand every file path in\n",
    "# `all_paths` straight to cudf.read_csv.\n",
    "df_simple = dd.from_map(cudf.read_csv, all_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "595b83d4-7f7c-49d7-853f-963540a83f0d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30\n",
      "             timestamp         x         y     name    id\n",
      "0  2000-01-01 00:00:00  0.414311  0.278401   George   976\n",
      "1  2000-01-01 00:00:01  0.653929  0.273288    Alice  1023\n",
      "2  2000-01-01 00:00:02  0.057673 -0.576633    Quinn   971\n",
      "3  2000-01-01 00:00:03 -0.174379 -0.115820  Michael  1022\n",
      "4  2000-01-01 00:00:04 -0.610007  0.607593    Zelda   938\n"
     ]
    }
   ],
   "source": [
    "# Print the partition count and the preview separately: passing both\n",
    "# to one print() puts the count on the frame's header line and shifts\n",
    "# the column names out of alignment with the rows.\n",
    "print(df_simple.npartitions)\n",
    "print(df_simple.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd1e810d-2374-43b9-ab6e-c0e3ede885be",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "c1fbae3f-8cac-4246-8d22-aca9d43ac903",
	"metadata": {},
	"outputs": [],
	"source": [
	"import pyarrow.dataset as ds\n",
	"import dask.dataframe as dd\n",
	"from dask import config\n",
	"import cudf\n",
	"\n",
	"# Set the cudf backend explicitly. This shouldn't be necessary,\n",
	"# but it makes sure the required dispatch functions get registered.\n",
	"# The trailing `;` keeps the `<dask.config.set at 0x...>` repr (whose\n",
	"# address differs on every run) out of the saved cell output.\n",
	"config.set({\"dataframe.backend\": \"cudf\"});"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "4b8b67b9-6b9d-424c-9d7a-f331261f3b67",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Location of the CSV files: a local directory,\n",
	"# or something like \"gs://pathtofolder/\".\n",
	"path = \"/datasets/rzamora/sample_csv\"\n",
	"\n",
	"# To write an example dataset to `path`, uncomment:\n",
	"#from dask.datasets import timeseries\n",
	"#timeseries()[[\"x\", \"y\", \"name\", \"id\"]].to_csv(path)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "5ee2b718-f833-4272-a228-a3f1d744a741",
	"metadata": {},
	"outputs": [],
	"source": [
	"def load_csv_partition(paths: list[str], **options):\n",
	"    \"\"\"Read a group of CSV files into a single cudf DataFrame.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    paths : list[str]\n",
	"        CSV files that together make up one partition.\n",
	"    **options\n",
	"        Extra keyword arguments forwarded to ``ds.dataset``.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    cudf.DataFrame\n",
	"        Built with ``cudf.DataFrame.from_arrow`` from the Arrow\n",
	"        table of the dataset created over ``paths``.\n",
	"    \"\"\"\n",
	"    # The dataset and/or to_table calls can probably be tuned\n",
	"    # for GCS performance. E.g. see:\n",
	"    # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table\n",
	"    arrow_table = ds.dataset(paths, format=\"csv\", **options).to_table()\n",
	"    return cudf.DataFrame.from_arrow(arrow_table)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "e3057e20-43f5-4c3d-bb3a-ca5513528c48",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Let PyArrow build a dataset over `path` so that we can collect\n",
	"# the list of (possibly remote) file paths from it. There is\n",
	"# probably a better way to do this if it turns out to be slow.\n",
	"ds_options = {}\n",
	"dataset = ds.dataset(path, format=\"csv\", **ds_options)\n",
	"\n",
	"# NOTE: For GCS, you may need to add the gs:// prefix back\n",
	"# to each element of this list.\n",
	"all_paths = dataset.files"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "134d374a-cc43-4ae5-a54f-76169e364786",
	"metadata": {},
	"outputs": [],
	"source": [
	"# `stride` is the number of files mapped to each partition;\n",
	"# the ideal value depends on the desired partition size.\n",
	"stride = 3\n",
	"partitions = [\n",
	"    all_paths[start : start + stride]\n",
	"    for start in range(0, len(all_paths), stride)\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "4b97e3eb-a0f9-44b6-861a-62c32a23fa88",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build a cudf-backed Dask DataFrame with from_map, calling\n",
	"# load_csv_partition on each group of files in `partitions`.\n",
	"df = dd.from_map(\n",
	"    load_csv_partition,\n",
	"    partitions,\n",
	"    **ds_options,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "64f2ad34-1373-4268-a5d4-35d99c14745e",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"10\n",
	"            timestamp         x         y     name    id\n",
	"0 2000-01-01 00:00:00  0.414311  0.278401   George   976\n",
	"1 2000-01-01 00:00:01  0.653929  0.273288    Alice  1023\n",
	"2 2000-01-01 00:00:02  0.057673 -0.576633    Quinn   971\n",
	"3 2000-01-01 00:00:03 -0.174379 -0.115820  Michael  1022\n",
	"4 2000-01-01 00:00:04 -0.610007  0.607593    Zelda   938\n"
	]
	}
	],
	"source": [
	"# Print the partition count and the preview separately: passing both\n",
	"# to one print() puts the count on the frame's header line and shifts\n",
	"# the column names out of alignment with the rows.\n",
	"print(df.npartitions)\n",
	"print(df.head())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "6b5375f2-70cb-4bb4-9c98-e1d4a18e93f2",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Simplest from_map usage: hand every file path in\n",
	"# `all_paths` straight to cudf.read_csv.\n",
	"df_simple = dd.from_map(cudf.read_csv, all_paths)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "595b83d4-7f7c-49d7-853f-963540a83f0d",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"30\n",
	"             timestamp         x         y     name    id\n",
	"0  2000-01-01 00:00:00  0.414311  0.278401   George   976\n",
	"1  2000-01-01 00:00:01  0.653929  0.273288    Alice  1023\n",
	"2  2000-01-01 00:00:02  0.057673 -0.576633    Quinn   971\n",
	"3  2000-01-01 00:00:03 -0.174379 -0.115820  Michael  1022\n",
	"4  2000-01-01 00:00:04 -0.610007  0.607593    Zelda   938\n"
	]
	}
	],
	"source": [
	"# Print the partition count and the preview separately: passing both\n",
	"# to one print() puts the count on the frame's header line and shifts\n",
	"# the column names out of alignment with the rows.\n",
	"print(df_simple.npartitions)\n",
	"print(df_simple.head())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "cd1e810d-2374-43b9-ab6e-c0e3ede885be",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}