Last active
December 17, 2024 03:29
-
-
Save NellyWhads/fdfb261a027be7e7bc87bec91d9e9035 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pprint import pprint\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import pyarrow as pa\n", | |
"import pyarrow.parquet as pq\n", | |
"import ray.data\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Flat Dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"num_samples = 5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>D</th>\n", | |
" <th>s</th>\n", | |
" <th>ms</th>\n", | |
" <th>us</th>\n", | |
" <th>ns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2026-05-01</td>\n", | |
" <td>2024-01-01 00:14:11</td>\n", | |
" <td>2024-01-01 00:00:00.851</td>\n", | |
" <td>2024-01-01 00:00:00.000851</td>\n", | |
" <td>2024-01-01 00:00:00.000000851</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2025-08-22</td>\n", | |
" <td>2024-01-01 00:09:59</td>\n", | |
" <td>2024-01-01 00:00:00.599</td>\n", | |
" <td>2024-01-01 00:00:00.000599</td>\n", | |
" <td>2024-01-01 00:00:00.000000599</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2024-12-04</td>\n", | |
" <td>2024-01-01 00:05:38</td>\n", | |
" <td>2024-01-01 00:00:00.338</td>\n", | |
" <td>2024-01-01 00:00:00.000338</td>\n", | |
" <td>2024-01-01 00:00:00.000000338</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2026-03-17</td>\n", | |
" <td>2024-01-01 00:13:26</td>\n", | |
" <td>2024-01-01 00:00:00.806</td>\n", | |
" <td>2024-01-01 00:00:00.000806</td>\n", | |
" <td>2024-01-01 00:00:00.000000806</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2025-05-25</td>\n", | |
" <td>2024-01-01 00:08:30</td>\n", | |
" <td>2024-01-01 00:00:00.510</td>\n", | |
" <td>2024-01-01 00:00:00.000510</td>\n", | |
" <td>2024-01-01 00:00:00.000000510</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" D s ms \\\n", | |
"0 2026-05-01 2024-01-01 00:14:11 2024-01-01 00:00:00.851 \n", | |
"1 2025-08-22 2024-01-01 00:09:59 2024-01-01 00:00:00.599 \n", | |
"2 2024-12-04 2024-01-01 00:05:38 2024-01-01 00:00:00.338 \n", | |
"3 2026-03-17 2024-01-01 00:13:26 2024-01-01 00:00:00.806 \n", | |
"4 2025-05-25 2024-01-01 00:08:30 2024-01-01 00:00:00.510 \n", | |
"\n", | |
" us ns \n", | |
"0 2024-01-01 00:00:00.000851 2024-01-01 00:00:00.000000851 \n", | |
"1 2024-01-01 00:00:00.000599 2024-01-01 00:00:00.000000599 \n", | |
"2 2024-01-01 00:00:00.000338 2024-01-01 00:00:00.000000338 \n", | |
"3 2024-01-01 00:00:00.000806 2024-01-01 00:00:00.000000806 \n", | |
"4 2024-01-01 00:00:00.000510 2024-01-01 00:00:00.000000510 " | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Create random timestamps with different precisions\n", | |
"base_timestamp = np.datetime64(\"2024-01-01\")\n", | |
"random_offsets = np.random.randint(0, 1000, size=num_samples)\n", | |
"\n", | |
"df = pd.DataFrame(\n", | |
" {\n", | |
" \"D\": [base_timestamp + np.timedelta64(offset, \"D\") for offset in random_offsets],\n", | |
" \"s\": [base_timestamp + np.timedelta64(offset, \"s\") for offset in random_offsets],\n", | |
" \"ms\": [base_timestamp + np.timedelta64(offset, \"ms\") for offset in random_offsets],\n", | |
" \"us\": [base_timestamp + np.timedelta64(offset, \"us\") for offset in random_offsets],\n", | |
" \"ns\": [base_timestamp + np.timedelta64(offset, \"ns\") for offset in random_offsets],\n", | |
" }\n", | |
")\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"parquet_file = \"test.parquet\"\n", | |
"pq.write_table(pa.Table.from_pandas(df), parquet_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:21,211\tINFO worker.py:1743 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8266 \u001b[39m\u001b[22m\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "84ea75106e8947e6867375122aedcc3f", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Parquet Files Sample 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:22,284\tINFO streaming_executor.py:115 -- Starting execution of Dataset. Full log is in /tmp/ray/session_2024-12-16_22-29-19_508831_9390/logs/ray-data.log\n", | |
"2024-12-16 22:29:22,284\tINFO streaming_executor.py:116 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=5]\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "681f02513e8a4c49a66eae7359caa9b6", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- ReadParquet->SplitBlocks(22) 1: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "0ef4e388dbce49409f0eb6b33ccc2a0c", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- limit=5 2: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "f7f8e787ae0243eab732e9497fece637", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Running 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>D</th>\n", | |
" <th>s</th>\n", | |
" <th>ms</th>\n", | |
" <th>us</th>\n", | |
" <th>ns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2026-05-01</td>\n", | |
" <td>2024-01-01 00:14:11</td>\n", | |
" <td>2024-01-01 00:00:00.851</td>\n", | |
" <td>2024-01-01 00:00:00.000851</td>\n", | |
" <td>2024-01-01 00:00:00.000000851</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2025-08-22</td>\n", | |
" <td>2024-01-01 00:09:59</td>\n", | |
" <td>2024-01-01 00:00:00.599</td>\n", | |
" <td>2024-01-01 00:00:00.000599</td>\n", | |
" <td>2024-01-01 00:00:00.000000599</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2024-12-04</td>\n", | |
" <td>2024-01-01 00:05:38</td>\n", | |
" <td>2024-01-01 00:00:00.338</td>\n", | |
" <td>2024-01-01 00:00:00.000338</td>\n", | |
" <td>2024-01-01 00:00:00.000000338</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2026-03-17</td>\n", | |
" <td>2024-01-01 00:13:26</td>\n", | |
" <td>2024-01-01 00:00:00.806</td>\n", | |
" <td>2024-01-01 00:00:00.000806</td>\n", | |
" <td>2024-01-01 00:00:00.000000806</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2025-05-25</td>\n", | |
" <td>2024-01-01 00:08:30</td>\n", | |
" <td>2024-01-01 00:00:00.510</td>\n", | |
" <td>2024-01-01 00:00:00.000510</td>\n", | |
" <td>2024-01-01 00:00:00.000000510</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" D s ms \\\n", | |
"0 2026-05-01 2024-01-01 00:14:11 2024-01-01 00:00:00.851 \n", | |
"1 2025-08-22 2024-01-01 00:09:59 2024-01-01 00:00:00.599 \n", | |
"2 2024-12-04 2024-01-01 00:05:38 2024-01-01 00:00:00.338 \n", | |
"3 2026-03-17 2024-01-01 00:13:26 2024-01-01 00:00:00.806 \n", | |
"4 2025-05-25 2024-01-01 00:08:30 2024-01-01 00:00:00.510 \n", | |
"\n", | |
" us ns \n", | |
"0 2024-01-01 00:00:00.000851 2024-01-01 00:00:00.000000851 \n", | |
"1 2024-01-01 00:00:00.000599 2024-01-01 00:00:00.000000599 \n", | |
"2 2024-01-01 00:00:00.000338 2024-01-01 00:00:00.000000338 \n", | |
"3 2024-01-01 00:00:00.000806 2024-01-01 00:00:00.000000806 \n", | |
"4 2024-01-01 00:00:00.000510 2024-01-01 00:00:00.000000510 " | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds = ray.data.read_parquet(parquet_file)\n", | |
"ds_df = ds.take_batch(num_samples, batch_format=\"pandas\")\n", | |
"ds_df\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:22,375\tINFO streaming_executor.py:115 -- Starting execution of Dataset. Full log is in /tmp/ray/session_2024-12-16_22-29-19_508831_9390/logs/ray-data.log\n", | |
"2024-12-16 22:29:22,375\tINFO streaming_executor.py:116 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(<lambda>)] -> LimitOperator[limit=5]\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "ad3c934bd9d24c298df039e40c6500cd", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- ReadParquet->SplitBlocks(22) 1: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "cc066808ae404b7a9698c66a2f6d3663", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- MapBatches(<lambda>) 2: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "89f5391354dc4de58bfd15f36ca24e58", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- limit=5 3: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b8da76afc62b4b3a94953df6f7fd963b", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Running 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>D</th>\n", | |
" <th>s</th>\n", | |
" <th>ms</th>\n", | |
" <th>us</th>\n", | |
" <th>ns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2026-05-01</td>\n", | |
" <td>2024-01-01 00:14:11</td>\n", | |
" <td>2024-01-01 00:00:00.851</td>\n", | |
" <td>2024-01-01 00:00:00.000851</td>\n", | |
" <td>2024-01-01 00:00:00.000000851</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2025-08-22</td>\n", | |
" <td>2024-01-01 00:09:59</td>\n", | |
" <td>2024-01-01 00:00:00.599</td>\n", | |
" <td>2024-01-01 00:00:00.000599</td>\n", | |
" <td>2024-01-01 00:00:00.000000599</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2024-12-04</td>\n", | |
" <td>2024-01-01 00:05:38</td>\n", | |
" <td>2024-01-01 00:00:00.338</td>\n", | |
" <td>2024-01-01 00:00:00.000338</td>\n", | |
" <td>2024-01-01 00:00:00.000000338</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2026-03-17</td>\n", | |
" <td>2024-01-01 00:13:26</td>\n", | |
" <td>2024-01-01 00:00:00.806</td>\n", | |
" <td>2024-01-01 00:00:00.000806</td>\n", | |
" <td>2024-01-01 00:00:00.000000806</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2025-05-25</td>\n", | |
" <td>2024-01-01 00:08:30</td>\n", | |
" <td>2024-01-01 00:00:00.510</td>\n", | |
" <td>2024-01-01 00:00:00.000510</td>\n", | |
" <td>2024-01-01 00:00:00.000000510</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" D s ms \\\n", | |
"0 2026-05-01 2024-01-01 00:14:11 2024-01-01 00:00:00.851 \n", | |
"1 2025-08-22 2024-01-01 00:09:59 2024-01-01 00:00:00.599 \n", | |
"2 2024-12-04 2024-01-01 00:05:38 2024-01-01 00:00:00.338 \n", | |
"3 2026-03-17 2024-01-01 00:13:26 2024-01-01 00:00:00.806 \n", | |
"4 2025-05-25 2024-01-01 00:08:30 2024-01-01 00:00:00.510 \n", | |
"\n", | |
" us ns \n", | |
"0 2024-01-01 00:00:00.000851 2024-01-01 00:00:00.000000851 \n", | |
"1 2024-01-01 00:00:00.000599 2024-01-01 00:00:00.000000599 \n", | |
"2 2024-01-01 00:00:00.000338 2024-01-01 00:00:00.000000338 \n", | |
"3 2024-01-01 00:00:00.000806 2024-01-01 00:00:00.000000806 \n", | |
"4 2024-01-01 00:00:00.000510 2024-01-01 00:00:00.000000510 " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"map_batches_ds = ds.map_batches(lambda x: x, batch_format=\"pandas\")\n", | |
"map_batches_ds_df = map_batches_ds.take_batch(num_samples, batch_format=\"pandas\")\n", | |
"map_batches_ds_df\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:23,065\tINFO streaming_executor.py:115 -- Starting execution of Dataset. Full log is in /tmp/ray/session_2024-12-16_22-29-19_508831_9390/logs/ray-data.log\n", | |
"2024-12-16 22:29:23,066\tINFO streaming_executor.py:116 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(<lambda>)] -> LimitOperator[limit=5]\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "a5ee77a3ec904f83ac418bcdd12d8879", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- ReadParquet->SplitBlocks(22) 1: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b59430b34ef24925b855f52fecbc1acc", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- MapBatches(<lambda>) 2: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "f7ae7506eefa4b09a11e0134aa67a756", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- limit=5 3: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "7a67b1e71cab460dbcbbcb6d22d90a29", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Running 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>D</th>\n", | |
" <th>s</th>\n", | |
" <th>ms</th>\n", | |
" <th>us</th>\n", | |
" <th>ns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2026-05-01</td>\n", | |
" <td>2024-01-01 00:14:11</td>\n", | |
" <td>2024-01-01 00:00:00.851</td>\n", | |
" <td>2024-01-01 00:00:00.000851</td>\n", | |
" <td>2024-01-01 00:00:00.000000851</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2025-08-22</td>\n", | |
" <td>2024-01-01 00:09:59</td>\n", | |
" <td>2024-01-01 00:00:00.599</td>\n", | |
" <td>2024-01-01 00:00:00.000599</td>\n", | |
" <td>2024-01-01 00:00:00.000000599</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2024-12-04</td>\n", | |
" <td>2024-01-01 00:05:38</td>\n", | |
" <td>2024-01-01 00:00:00.338</td>\n", | |
" <td>2024-01-01 00:00:00.000338</td>\n", | |
" <td>2024-01-01 00:00:00.000000338</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2026-03-17</td>\n", | |
" <td>2024-01-01 00:13:26</td>\n", | |
" <td>2024-01-01 00:00:00.806</td>\n", | |
" <td>2024-01-01 00:00:00.000806</td>\n", | |
" <td>2024-01-01 00:00:00.000000806</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2025-05-25</td>\n", | |
" <td>2024-01-01 00:08:30</td>\n", | |
" <td>2024-01-01 00:00:00.510</td>\n", | |
" <td>2024-01-01 00:00:00.000510</td>\n", | |
" <td>2024-01-01 00:00:00.000000510</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" D s ms \\\n", | |
"0 2026-05-01 2024-01-01 00:14:11 2024-01-01 00:00:00.851 \n", | |
"1 2025-08-22 2024-01-01 00:09:59 2024-01-01 00:00:00.599 \n", | |
"2 2024-12-04 2024-01-01 00:05:38 2024-01-01 00:00:00.338 \n", | |
"3 2026-03-17 2024-01-01 00:13:26 2024-01-01 00:00:00.806 \n", | |
"4 2025-05-25 2024-01-01 00:08:30 2024-01-01 00:00:00.510 \n", | |
"\n", | |
" us ns \n", | |
"0 2024-01-01 00:00:00.000851 2024-01-01 00:00:00.000000851 \n", | |
"1 2024-01-01 00:00:00.000599 2024-01-01 00:00:00.000000599 \n", | |
"2 2024-01-01 00:00:00.000338 2024-01-01 00:00:00.000000338 \n", | |
"3 2024-01-01 00:00:00.000806 2024-01-01 00:00:00.000000806 \n", | |
"4 2024-01-01 00:00:00.000510 2024-01-01 00:00:00.000000510 " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"map_batches_ds = ds.map_batches(lambda x: x)\n", | |
"map_batches_ds_df = map_batches_ds.take_batch(num_samples, batch_format=\"pandas\")\n", | |
"map_batches_ds_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:23,131\tINFO streaming_executor.py:115 -- Starting execution of Dataset. Full log is in /tmp/ray/session_2024-12-16_22-29-19_508831_9390/logs/ray-data.log\n", | |
"2024-12-16 22:29:23,131\tINFO streaming_executor.py:116 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[Map(<lambda>)] -> LimitOperator[limit=5]\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "69b9d68e14bb4c06ba789c16fd0b88b6", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- ReadParquet->SplitBlocks(22) 1: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "f62c45e7b7dc48bea2ae01774fd2ed3a", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- Map(<lambda>) 2: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "1034b5f238cf4697914bfc0e31e2ff9c", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- limit=5 3: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "1f76479c48cf47809562628e6b83b503", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Running 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>D</th>\n", | |
" <th>s</th>\n", | |
" <th>ms</th>\n", | |
" <th>us</th>\n", | |
" <th>ns</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2026-05-01</td>\n", | |
" <td>2024-01-01 00:14:11</td>\n", | |
" <td>2024-01-01 00:00:00.851</td>\n", | |
" <td>2024-01-01 00:00:00.000851</td>\n", | |
" <td>2024-01-01</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2025-08-22</td>\n", | |
" <td>2024-01-01 00:09:59</td>\n", | |
" <td>2024-01-01 00:00:00.599</td>\n", | |
" <td>2024-01-01 00:00:00.000599</td>\n", | |
" <td>2024-01-01</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2024-12-04</td>\n", | |
" <td>2024-01-01 00:05:38</td>\n", | |
" <td>2024-01-01 00:00:00.338</td>\n", | |
" <td>2024-01-01 00:00:00.000338</td>\n", | |
" <td>2024-01-01</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2026-03-17</td>\n", | |
" <td>2024-01-01 00:13:26</td>\n", | |
" <td>2024-01-01 00:00:00.806</td>\n", | |
" <td>2024-01-01 00:00:00.000806</td>\n", | |
" <td>2024-01-01</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2025-05-25</td>\n", | |
" <td>2024-01-01 00:08:30</td>\n", | |
" <td>2024-01-01 00:00:00.510</td>\n", | |
" <td>2024-01-01 00:00:00.000510</td>\n", | |
" <td>2024-01-01</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" D s ms \\\n", | |
"0 2026-05-01 2024-01-01 00:14:11 2024-01-01 00:00:00.851 \n", | |
"1 2025-08-22 2024-01-01 00:09:59 2024-01-01 00:00:00.599 \n", | |
"2 2024-12-04 2024-01-01 00:05:38 2024-01-01 00:00:00.338 \n", | |
"3 2026-03-17 2024-01-01 00:13:26 2024-01-01 00:00:00.806 \n", | |
"4 2025-05-25 2024-01-01 00:08:30 2024-01-01 00:00:00.510 \n", | |
"\n", | |
" us ns \n", | |
"0 2024-01-01 00:00:00.000851 2024-01-01 \n", | |
"1 2024-01-01 00:00:00.000599 2024-01-01 \n", | |
"2 2024-01-01 00:00:00.000338 2024-01-01 \n", | |
"3 2024-01-01 00:00:00.000806 2024-01-01 \n", | |
"4 2024-01-01 00:00:00.000510 2024-01-01 " | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"map_ds = ds.map(lambda x: x)\n", | |
"map_ds_df = map_ds.take_batch(num_samples, batch_format=\"pandas\")\n", | |
"map_ds_df\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Nested Dataframe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create random timestamps with different precisions using pandas first\n", | |
"base_timestamp = pd.Timestamp(\"2024-01-01\")\n", | |
"random_offsets = np.random.randint(0, 1000, size=num_samples)\n", | |
"\n", | |
"# Create pandas timestamps with different precisions\n", | |
"d_timestamps = [base_timestamp + pd.Timedelta(days=offset) for offset in random_offsets]\n", | |
"s_timestamps = [base_timestamp + pd.Timedelta(seconds=offset) for offset in random_offsets]\n", | |
"ms_timestamps = [base_timestamp + pd.Timedelta(milliseconds=offset) for offset in random_offsets]\n", | |
"us_timestamps = [base_timestamp + pd.Timedelta(microseconds=offset) for offset in random_offsets]\n", | |
"ns_timestamps = [base_timestamp + pd.Timedelta(nanoseconds=offset) for offset in random_offsets]\n", | |
"\n", | |
"# Convert to pyarrow arrays\n", | |
"d_array = pa.array(d_timestamps, type=pa.date32())\n", | |
"s_array = pa.array(s_timestamps, type=pa.timestamp(\"s\"))\n", | |
"ms_array = pa.array(ms_timestamps, type=pa.timestamp(\"ms\"))\n", | |
"us_array = pa.array(us_timestamps, type=pa.timestamp(\"us\"))\n", | |
"ns_array = pa.array(ns_timestamps, type=pa.timestamp(\"ns\"))\n", | |
"\n", | |
"# Create struct array\n", | |
"struct_array = pa.StructArray.from_arrays(\n", | |
" [d_array, s_array, ms_array, us_array, ns_array],\n", | |
" fields=[\n", | |
" pa.field(\"D\", pa.date32()),\n", | |
" pa.field(\"s\", pa.timestamp(\"s\")),\n", | |
" pa.field(\"ms\", pa.timestamp(\"ms\")),\n", | |
" pa.field(\"us\", pa.timestamp(\"us\")),\n", | |
" pa.field(\"ns\", pa.timestamp(\"ns\")),\n", | |
" ],\n", | |
")\n", | |
"\n", | |
"# Create table\n", | |
"table = pa.Table.from_arrays([struct_array], [\"timestamps\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'D': <pyarrow.Date32Scalar: datetime.date(2024, 9, 16)>,\n", | |
" 'ms': <pyarrow.TimestampScalar: '2024-01-01T00:00:00.259'>,\n", | |
" 'ns': <pyarrow.TimestampScalar: '2024-01-01T00:00:00.000000259'>,\n", | |
" 's': <pyarrow.TimestampScalar: '2024-01-01T00:04:19'>,\n", | |
" 'us': <pyarrow.TimestampScalar: '2024-01-01T00:00:00.000259'>}\n" | |
] | |
} | |
], | |
"source": [ | |
"pprint(dict(table[\"timestamps\"][0]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"parquet_file_2 = \"test_2.parquet\"\n", | |
"pq.write_table(table, parquet_file_2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pyarrow.Table\n", | |
"timestamps: struct<D: date32[day], s: timestamp[ms], ms: timestamp[ms], us: timestamp[us], ns: timestamp[ns]>\n", | |
" child 0, D: date32[day]\n", | |
" child 1, s: timestamp[ms]\n", | |
" child 2, ms: timestamp[ms]\n", | |
" child 3, us: timestamp[us]\n", | |
" child 4, ns: timestamp[ns]\n", | |
"----\n", | |
"timestamps: [\n", | |
" -- is_valid: all not null\n", | |
" -- child 0 type: date32[day]\n", | |
"[2024-09-16,2025-07-31,2025-11-03,2026-05-26,2025-10-09]\n", | |
" -- child 1 type: timestamp[ms]\n", | |
"[2024-01-01 00:04:19.000,2024-01-01 00:09:37.000,2024-01-01 00:11:12.000,2024-01-01 00:14:36.000,2024-01-01 00:10:47.000]\n", | |
" -- child 2 type: timestamp[ms]\n", | |
"[2024-01-01 00:00:00.259,2024-01-01 00:00:00.577,2024-01-01 00:00:00.672,2024-01-01 00:00:00.876,2024-01-01 00:00:00.647]\n", | |
" -- child 3 type: timestamp[us]\n", | |
"[2024-01-01 00:00:00.000259,2024-01-01 00:00:00.000577,2024-01-01 00:00:00.000672,2024-01-01 00:00:00.000876,2024-01-01 00:00:00.000647]\n", | |
" -- child 4 type: timestamp[ns]\n", | |
"[2024-01-01 00:00:00.000000259,2024-01-01 00:00:00.000000577,2024-01-01 00:00:00.000000672,2024-01-01 00:00:00.000000876,2024-01-01 00:00:00.000000647]]" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"table_2 = pq.read_table(parquet_file_2)\n", | |
"table_2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'D': <pyarrow.Date32Scalar: datetime.date(2024, 9, 16)>,\n", | |
" 'ms': <pyarrow.TimestampScalar: '2024-01-01T00:00:00.259'>,\n", | |
" 'ns': <pyarrow.TimestampScalar: '2024-01-01T00:00:00.000000259'>,\n", | |
" 's': <pyarrow.TimestampScalar: '2024-01-01T00:04:19.000'>,\n", | |
" 'us': <pyarrow.TimestampScalar: '2024-01-01T00:00:00.000259'>}\n" | |
] | |
} | |
], | |
"source": [ | |
"pprint(dict(table_2[\"timestamps\"][0]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "9e807375ab984ebfa3cd8ebc297ec529", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Parquet Files Sample 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:23,336\tINFO streaming_executor.py:115 -- Starting execution of Dataset. Full log is in /tmp/ray/session_2024-12-16_22-29-19_508831_9390/logs/ray-data.log\n", | |
"2024-12-16 22:29:23,336\tINFO streaming_executor.py:116 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=5]\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b1345fb2084843f0b514ba9892c695e7", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- ReadParquet->SplitBlocks(22) 1: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fafd5850162b4384b30707d2eb2617ef", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- limit=5 2: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "0e606e9a1ce24e34a82d6d6ee93a0edc", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Running 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>timestamps</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>{'D': 2024-09-16, 's': 2024-01-01 00:04:19, 'm...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>{'D': 2025-07-31, 's': 2024-01-01 00:09:37, 'm...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>{'D': 2025-11-03, 's': 2024-01-01 00:11:12, 'm...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>{'D': 2026-05-26, 's': 2024-01-01 00:14:36, 'm...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>{'D': 2025-10-09, 's': 2024-01-01 00:10:47, 'm...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" timestamps\n", | |
"0 {'D': 2024-09-16, 's': 2024-01-01 00:04:19, 'm...\n", | |
"1 {'D': 2025-07-31, 's': 2024-01-01 00:09:37, 'm...\n", | |
"2 {'D': 2025-11-03, 's': 2024-01-01 00:11:12, 'm...\n", | |
"3 {'D': 2026-05-26, 's': 2024-01-01 00:14:36, 'm...\n", | |
"4 {'D': 2025-10-09, 's': 2024-01-01 00:10:47, 'm..." | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds_2 = ray.data.read_parquet(parquet_file_2)\n", | |
"ds_2_df = ds_2.take_batch(num_samples, batch_format=\"pandas\")\n", | |
"ds_2_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'D': datetime.date(2024, 9, 16),\n", | |
" 'ms': datetime.datetime(2024, 1, 1, 0, 0, 0, 259000),\n", | |
" 'ns': 1704067200000000259,\n", | |
" 's': datetime.datetime(2024, 1, 1, 0, 4, 19),\n", | |
" 'us': datetime.datetime(2024, 1, 1, 0, 0, 0, 259)}\n" | |
] | |
} | |
], | |
"source": [ | |
"pprint(ds_2_df[\"timestamps\"][0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-12-16 22:29:23,418\tINFO streaming_executor.py:115 -- Starting execution of Dataset. Full log is in /tmp/ray/session_2024-12-16_22-29-19_508831_9390/logs/ray-data.log\n", | |
"2024-12-16 22:29:23,419\tINFO streaming_executor.py:116 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=5]\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "3a2091adf475407eae763298340db577", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- ReadParquet->SplitBlocks(22) 1: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "e128af6ab8ea42c28882a31632541618", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"- limit=5 2: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "1a2cc9a9ca0f4fe5adcdd45cf9c3195b", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Running 0: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'D': datetime.date(2024, 9, 16),\n", | |
" 'ms': datetime.datetime(2024, 1, 1, 0, 0, 0, 259000),\n", | |
" 'ns': 1704067200000000259,\n", | |
" 's': datetime.datetime(2024, 1, 1, 0, 4, 19),\n", | |
" 'us': datetime.datetime(2024, 1, 1, 0, 0, 0, 259)}\n" | |
] | |
} | |
], | |
"source": [ | |
"ds_2_batch = ds_2.take_batch(num_samples)\n", | |
"pprint(ds_2_batch[\"timestamps\"][0])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'D': datetime.date(2024, 9, 16),\n", | |
" 'ms': datetime.datetime(2024, 1, 1, 0, 0, 0, 259000),\n", | |
" 'ns': 1704067200000000259,\n", | |
" 's': datetime.datetime(2024, 1, 1, 0, 4, 19),\n", | |
" 'us': datetime.datetime(2024, 1, 1, 0, 0, 0, 259)}\n" | |
] | |
} | |
], | |
"source": [ | |
"df_2 = table_2.to_pandas()\n", | |
"pprint(df_2[\"timestamps\"][0])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": ".venv", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.15" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment