Created
October 8, 2020 21:40
-
-
Save ogrisel/48ab78ae7cb5269c4a9ddf645c655fc9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import dask.array as da\n", | |
"import dask.dataframe as ddf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"41666666.0" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Rough sizing: how many rows of 30 values at 4 bytes each (float32)\n", | |
"# fit in 5e9 bytes? The next cell uses 42e6 rows of 30 float32 columns.\n", | |
"5e9 // (4 * 30)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 8.68 ms, sys: 71 µs, total: 8.75 ms\n", | |
"Wall time: 9.26 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"data = da.random.normal(size=(int(42e6), 30)).astype(np.float32)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<tr>\n", | |
"<td>\n", | |
"<table>\n", | |
" <thead>\n", | |
" <tr><td> </td><th> Array </th><th> Chunk </th></tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr><th> Bytes </th><td> 5.04 GB </td> <td> 63.00 MB </td></tr>\n", | |
" <tr><th> Shape </th><td> (42000000, 30) </td> <td> (525000, 30) </td></tr>\n", | |
" <tr><th> Count </th><td> 160 Tasks </td><td> 80 Chunks </td></tr>\n", | |
" <tr><th> Type </th><td> float32 </td><td> numpy.ndarray </td></tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</td>\n", | |
"<td>\n", | |
"<svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
"\n", | |
" <!-- Horizontal lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"0\" y1=\"6\" x2=\"25\" y2=\"6\" />\n", | |
" <line x1=\"0\" y1=\"12\" x2=\"25\" y2=\"12\" />\n", | |
" <line x1=\"0\" y1=\"18\" x2=\"25\" y2=\"18\" />\n", | |
" <line x1=\"0\" y1=\"24\" x2=\"25\" y2=\"24\" />\n", | |
" <line x1=\"0\" y1=\"31\" x2=\"25\" y2=\"31\" />\n", | |
" <line x1=\"0\" y1=\"37\" x2=\"25\" y2=\"37\" />\n", | |
" <line x1=\"0\" y1=\"43\" x2=\"25\" y2=\"43\" />\n", | |
" <line x1=\"0\" y1=\"49\" x2=\"25\" y2=\"49\" />\n", | |
" <line x1=\"0\" y1=\"55\" x2=\"25\" y2=\"55\" />\n", | |
" <line x1=\"0\" y1=\"63\" x2=\"25\" y2=\"63\" />\n", | |
" <line x1=\"0\" y1=\"69\" x2=\"25\" y2=\"69\" />\n", | |
" <line x1=\"0\" y1=\"75\" x2=\"25\" y2=\"75\" />\n", | |
" <line x1=\"0\" y1=\"81\" x2=\"25\" y2=\"81\" />\n", | |
" <line x1=\"0\" y1=\"87\" x2=\"25\" y2=\"87\" />\n", | |
" <line x1=\"0\" y1=\"94\" x2=\"25\" y2=\"94\" />\n", | |
" <line x1=\"0\" y1=\"100\" x2=\"25\" y2=\"100\" />\n", | |
" <line x1=\"0\" y1=\"106\" x2=\"25\" y2=\"106\" />\n", | |
" <line x1=\"0\" y1=\"112\" x2=\"25\" y2=\"112\" />\n", | |
" <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Vertical lines -->\n", | |
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
" <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n", | |
"\n", | |
" <!-- Colored Rectangle -->\n", | |
" <polygon points=\"0.0,0.0 25.412616514582485,0.0 25.412616514582485,120.0 0.0,120.0\" style=\"fill:#8B4903A0;stroke-width:0\"/>\n", | |
"\n", | |
" <!-- Text -->\n", | |
" <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >30</text>\n", | |
" <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">42000000</text>\n", | |
"</svg>\n", | |
"</td>\n", | |
"</tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"dask.array<astype, shape=(42000000, 30), dtype=float32, chunksize=(525000, 30), chunktype=numpy.ndarray>" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Display the array's summary (total/chunk bytes, shape, dtype, chunking).\n", | |
"data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>c_00</th>\n", | |
" <th>c_01</th>\n", | |
" <th>c_02</th>\n", | |
" <th>c_03</th>\n", | |
" <th>c_04</th>\n", | |
" <th>c_05</th>\n", | |
" <th>c_06</th>\n", | |
" <th>c_07</th>\n", | |
" <th>c_08</th>\n", | |
" <th>c_09</th>\n", | |
" <th>c_10</th>\n", | |
" <th>c_11</th>\n", | |
" <th>c_12</th>\n", | |
" <th>c_13</th>\n", | |
" <th>c_14</th>\n", | |
" <th>c_15</th>\n", | |
" <th>c_16</th>\n", | |
" <th>c_17</th>\n", | |
" <th>c_18</th>\n", | |
" <th>c_19</th>\n", | |
" <th>c_20</th>\n", | |
" <th>c_21</th>\n", | |
" <th>c_22</th>\n", | |
" <th>c_23</th>\n", | |
" <th>c_24</th>\n", | |
" <th>c_25</th>\n", | |
" <th>c_26</th>\n", | |
" <th>c_27</th>\n", | |
" <th>c_28</th>\n", | |
" <th>c_29</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>npartitions=80</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>525000</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41475000</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41999999</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
"<div>Dask Name: from-dask, 240 tasks</div>" | |
], | |
"text/plain": [ | |
"Dask DataFrame Structure:\n", | |
" c_00 c_01 c_02 c_03 c_04 c_05 c_06 c_07 c_08 c_09 c_10 c_11 c_12 c_13 c_14 c_15 c_16 c_17 c_18 c_19 c_20 c_21 c_22 c_23 c_24 c_25 c_26 c_27 c_28 c_29\n", | |
"npartitions=80 \n", | |
"0 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32\n", | |
"525000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"41475000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"41999999 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"Dask Name: from-dask, 240 tasks" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"columns = [f\"c_{i:02d}\" for i in range(data.shape[1])]\n", | |
"df = ddf.from_dask_array(data, columns=columns)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 14s, sys: 10.4 s, total: 1min 25s\n", | |
"Wall time: 30.7 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Trigger the actual computation: generate the random data, cast it to\n", | |
"# 32-bit floats and write it to disk chunk by chunk as parquet files.\n", | |
"df.to_parquet(\"df.parquet\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Drop the references to the lazy array and dataframe; the cells below\n", | |
"# reload the data from the parquet files instead.\n", | |
"del df, data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total 5,1G\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 4,1K oct. 8 23:38 _common_metadata\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 218K oct. 8 23:38 _metadata\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.0.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.10.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.11.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.12.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.13.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.14.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.15.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.16.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.17.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.18.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.19.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.1.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.20.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.21.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.22.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.23.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.24.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.25.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.26.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.27.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.28.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.29.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.2.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.30.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.31.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.32.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.33.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.34.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.35.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.36.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.37.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.38.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.39.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.3.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.40.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.41.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.42.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.43.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.44.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.45.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.46.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.47.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.48.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.49.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.4.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.50.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.51.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.52.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.53.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.54.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.55.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.56.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.57.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.58.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.59.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.5.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.60.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.61.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.62.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.63.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.64.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.65.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.66.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.67.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.68.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.69.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.6.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.70.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.71.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.72.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.73.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.74.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.75.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.76.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.77.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.78.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.79.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.7.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.8.parquet\n", | |
"-rw-rw-r-- 1 ogrisel ogrisel 65M oct. 8 23:38 part.9.parquet\n" | |
] | |
} | |
], | |
"source": [ | |
"# List the files written by to_parquet, with human-readable sizes.\n", | |
"!ls -lh df.parquet" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Simulate a new Python program that starts from the parquet data stored on disk (hence the repeated imports)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import dask.dataframe as ddf\n", | |
"from dask_ml.preprocessing import MinMaxScaler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>c_00</th>\n", | |
" <th>c_01</th>\n", | |
" <th>c_02</th>\n", | |
" <th>c_03</th>\n", | |
" <th>c_04</th>\n", | |
" <th>c_05</th>\n", | |
" <th>c_06</th>\n", | |
" <th>c_07</th>\n", | |
" <th>c_08</th>\n", | |
" <th>c_09</th>\n", | |
" <th>c_10</th>\n", | |
" <th>c_11</th>\n", | |
" <th>c_12</th>\n", | |
" <th>c_13</th>\n", | |
" <th>c_14</th>\n", | |
" <th>c_15</th>\n", | |
" <th>c_16</th>\n", | |
" <th>c_17</th>\n", | |
" <th>c_18</th>\n", | |
" <th>c_19</th>\n", | |
" <th>c_20</th>\n", | |
" <th>c_21</th>\n", | |
" <th>c_22</th>\n", | |
" <th>c_23</th>\n", | |
" <th>c_24</th>\n", | |
" <th>c_25</th>\n", | |
" <th>c_26</th>\n", | |
" <th>c_27</th>\n", | |
" <th>c_28</th>\n", | |
" <th>c_29</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>npartitions=80</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>525000</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41475000</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41999999</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
"<div>Dask Name: read-parquet, 80 tasks</div>" | |
], | |
"text/plain": [ | |
"Dask DataFrame Structure:\n", | |
" c_00 c_01 c_02 c_03 c_04 c_05 c_06 c_07 c_08 c_09 c_10 c_11 c_12 c_13 c_14 c_15 c_16 c_17 c_18 c_19 c_20 c_21 c_22 c_23 c_24 c_25 c_26 c_27 c_28 c_29\n", | |
"npartitions=80 \n", | |
"0 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32\n", | |
"525000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"41475000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"41999999 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"Dask Name: read-parquet, 80 tasks" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Open the parquet dataset written earlier as a dask dataframe.\n", | |
"df = ddf.read_parquet(\"df.parquet\")\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 20.3 s, sys: 4.68 s, total: 25 s\n", | |
"Wall time: 8.03 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>c_00</th>\n", | |
" <th>c_01</th>\n", | |
" <th>c_02</th>\n", | |
" <th>c_03</th>\n", | |
" <th>c_04</th>\n", | |
" <th>c_05</th>\n", | |
" <th>c_06</th>\n", | |
" <th>c_07</th>\n", | |
" <th>c_08</th>\n", | |
" <th>c_09</th>\n", | |
" <th>c_10</th>\n", | |
" <th>c_11</th>\n", | |
" <th>c_12</th>\n", | |
" <th>c_13</th>\n", | |
" <th>c_14</th>\n", | |
" <th>c_15</th>\n", | |
" <th>c_16</th>\n", | |
" <th>c_17</th>\n", | |
" <th>c_18</th>\n", | |
" <th>c_19</th>\n", | |
" <th>c_20</th>\n", | |
" <th>c_21</th>\n", | |
" <th>c_22</th>\n", | |
" <th>c_23</th>\n", | |
" <th>c_24</th>\n", | |
" <th>c_25</th>\n", | |
" <th>c_26</th>\n", | |
" <th>c_27</th>\n", | |
" <th>c_28</th>\n", | |
" <th>c_29</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>npartitions=80</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" <td>float32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>525000</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41475000</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41999999</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
"<div>Dask Name: add, 240 tasks</div>" | |
], | |
"text/plain": [ | |
"Dask DataFrame Structure:\n", | |
" c_00 c_01 c_02 c_03 c_04 c_05 c_06 c_07 c_08 c_09 c_10 c_11 c_12 c_13 c_14 c_15 c_16 c_17 c_18 c_19 c_20 c_21 c_22 c_23 c_24 c_25 c_26 c_27 c_28 c_29\n", | |
"npartitions=80 \n", | |
"0 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32 float32\n", | |
"525000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"41475000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"41999999 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", | |
"Dask Name: add, 240 tasks" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time \n", | |
"data_scaled = MinMaxScaler().fit_transform(df)\n", | |
"data_scaled" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Materialize the scaled Dask DataFrame as a single contiguous NumPy array in memory." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 6.97 s, sys: 6.66 s, total: 13.6 s\n", | |
"Wall time: 7.09 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"data_scaled_numpy = data_scaled.to_dask_array().compute()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0.5636334 , 0.42943045, 0.42165464, ..., 0.58489186, 0.52082884,\n", | |
" 0.6474119 ],\n", | |
" [0.6436356 , 0.4364732 , 0.2791962 , ..., 0.5561331 , 0.5236513 ,\n", | |
" 0.51622826],\n", | |
" [0.5076153 , 0.35435796, 0.636679 , ..., 0.56980395, 0.46468422,\n", | |
" 0.4050402 ],\n", | |
" ...,\n", | |
" [0.57878864, 0.35109144, 0.65948766, ..., 0.49761993, 0.4737778 ,\n", | |
" 0.59080714],\n", | |
" [0.38248256, 0.44249856, 0.4731444 , ..., 0.4293675 , 0.41000423,\n", | |
" 0.48954332],\n", | |
" [0.59831095, 0.37868384, 0.43457246, ..., 0.49542814, 0.5613119 ,\n", | |
" 0.5574408 ]], dtype=float32)" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data_scaled_numpy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
" C_CONTIGUOUS : True\n", | |
" F_CONTIGUOUS : False\n", | |
" OWNDATA : True\n", | |
" WRITEABLE : True\n", | |
" ALIGNED : True\n", | |
" WRITEBACKIFCOPY : False\n", | |
" UPDATEIFCOPY : False" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data_scaled_numpy.flags" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5.04" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data_scaled_numpy.nbytes / 1e9" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment