Last active
March 14, 2019 05:12
-
-
Save kayush2O6/c004868c6cdee15698d2582b13a671e7 to your computer and use it in GitHub Desktop.
Final version with suggested solution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table style=\"border: 2px solid white;\">\n", | |
"<tr>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3>Client</h3>\n", | |
"<ul>\n", | |
" <li><b>Scheduler: </b>tcp://127.0.0.1:37180\n", | |
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n", | |
"</ul>\n", | |
"</td>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3>Cluster</h3>\n", | |
"<ul>\n", | |
" <li><b>Workers: </b>2</li>\n", | |
" <li><b>Cores: </b>2</li>\n", | |
" <li><b>Memory: </b>236.66 GB</li>\n", | |
"</ul>\n", | |
"</td>\n", | |
"</tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<Client: scheduler='tcp://127.0.0.1:37180' processes=2 cores=2>" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from dask.distributed import Client, wait\n", | |
"from dask_cuda import LocalCUDACluster\n", | |
"from dask.delayed import delayed\n", | |
"cluster = LocalCUDACluster()\n", | |
"client = Client(cluster)\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf\n", | |
"import numpy as np\n", | |
"from librmm_cffi import librmm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_table(filename):\n", | |
" names = ['a', 'b', 'c', 'd']\n", | |
" dtypes = ['int', 'str', 'str', 'str']\n", | |
" columns = cudf.io.csv.read_csv_strings(filename, delimiter='\\t',\n", | |
" names=names, dtype=dtypes,\n", | |
" skiprows=1)\n", | |
" return (columns, names)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def prepare_df(ret, item=0):\n", | |
" cols = ret[0]\n", | |
" names = ret[1]\n", | |
" size = cols[1].sublist([item]).lstrip('[').rstrip(']').split(',')[0].size()\n", | |
" gdf = cudf.dataframe.DataFrame()\n", | |
" for i in range(1, 4):\n", | |
" float_array = librmm.device_array(size, dtype=np.float32)\n", | |
" cols[i].sublist([item]).lstrip('[').rstrip(']').split(',')[0].stof(float_array.device_ctypes_pointer.value)\n", | |
" gdf[names[i]]=cudf.Series(float_array)\n", | |
" return gdf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ret = delayed(read_table)(\"foo.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\x80\\x03cdask.delayed\\nDelayed\\nq\\x00)\\x81q\\x01X/\\x00\\x00\\x00read_table-bcd5ed0b-4913-4888-adda-7a2576fef250q\\x02cdask.highlevelgraph\\nHighLevelGraph\\nq\\x03)\\x81q\\x04}q\\x05(X\\x06\\x00\\x00\\x00layersq\\x06}q\\x07h\\x02}q\\x08h\\x02c__main__\\nread_table\\nq\\tX\\x07\\x00\\x00\\x00foo.csvq\\n\\x86q\\x0bssX\\x0c\\x00\\x00\\x00dependenciesq\\x0c}q\\rh\\x02cbuiltins\\nset\\nq\\x0e]q\\x0f\\x85q\\x10Rq\\x11subN\\x87q\\x12b.'" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pickle\n", | |
"pickle.dumps(ret)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"gdf1 = delayed(prepare_df)(ret)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\x80\\x03cdask.delayed\\nDelayed\\nq\\x00)\\x81q\\x01X/\\x00\\x00\\x00prepare_df-4770a828-e3be-4814-80a5-bcece29e1136q\\x02cdask.highlevelgraph\\nHighLevelGraph\\nq\\x03)\\x81q\\x04}q\\x05(X\\x06\\x00\\x00\\x00layersq\\x06}q\\x07(h\\x02}q\\x08h\\x02c__main__\\nprepare_df\\nq\\tX/\\x00\\x00\\x00read_table-bcd5ed0b-4913-4888-adda-7a2576fef250q\\n\\x86q\\x0bsh\\n}q\\x0ch\\nc__main__\\nread_table\\nq\\rX\\x07\\x00\\x00\\x00foo.csvq\\x0e\\x86q\\x0fsuX\\x0c\\x00\\x00\\x00dependenciesq\\x10}q\\x11(h\\x02cbuiltins\\nset\\nq\\x12]q\\x13h\\na\\x85q\\x14Rq\\x15h\\nh\\x12]q\\x16\\x85q\\x17Rq\\x18uubN\\x87q\\x19b.'" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pickle.dumps(gdf1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/svg+xml": [ | |
"<svg height=\"396pt\" viewBox=\"0.00 0.00 113.71 396.21\" width=\"114pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", | |
"<g class=\"graph\" id=\"graph0\" transform=\"scale(1 1) rotate(0) translate(4 392.2117)\">\n", | |
"<title>%3</title>\n", | |
"<polygon fill=\"#ffffff\" points=\"-4,4 -4,-392.2117 109.7099,-392.2117 109.7099,4 -4,4\" stroke=\"transparent\"/>\n", | |
"<!-- -3842856131783809843 -->\n", | |
"<g class=\"node\" id=\"node1\">\n", | |
"<title>-3842856131783809843</title>\n", | |
"<polygon fill=\"none\" points=\"79.8549,-388.2117 25.8549,-388.2117 25.8549,-352.2117 79.8549,-352.2117 79.8549,-388.2117\" stroke=\"#000000\"/>\n", | |
"</g>\n", | |
"<!-- -8716603847870901542 -->\n", | |
"<g class=\"node\" id=\"node2\">\n", | |
"<title>-8716603847870901542</title>\n", | |
"<ellipse cx=\"52.8549\" cy=\"-263.3568\" fill=\"none\" rx=\"52.7103\" ry=\"52.7103\" stroke=\"#000000\"/>\n", | |
"<text fill=\"#000000\" font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"52.8549\" y=\"-259.1568\">prepare_df</text>\n", | |
"</g>\n", | |
"<!-- -8716603847870901542->-3842856131783809843 -->\n", | |
"<g class=\"edge\" id=\"edge1\">\n", | |
"<title>-8716603847870901542->-3842856131783809843</title>\n", | |
"<path d=\"M52.8549,-316.4712C52.8549,-325.2577 52.8549,-334.0554 52.8549,-341.9139\" fill=\"none\" stroke=\"#000000\"/>\n", | |
"<polygon fill=\"#000000\" points=\"49.355,-341.9584 52.8549,-351.9584 56.355,-341.9585 49.355,-341.9584\" stroke=\"#000000\"/>\n", | |
"</g>\n", | |
"<!-- -4317651821274558598 -->\n", | |
"<g class=\"node\" id=\"node3\">\n", | |
"<title>-4317651821274558598</title>\n", | |
"<polygon fill=\"none\" points=\"79.8549,-174.5018 25.8549,-174.5018 25.8549,-138.5018 79.8549,-138.5018 79.8549,-174.5018\" stroke=\"#000000\"/>\n", | |
"</g>\n", | |
"<!-- -4317651821274558598->-8716603847870901542 -->\n", | |
"<g class=\"edge\" id=\"edge2\">\n", | |
"<title>-4317651821274558598->-8716603847870901542</title>\n", | |
"<path d=\"M52.8549,-174.6283C52.8549,-181.887 52.8549,-190.7193 52.8549,-199.9642\" fill=\"none\" stroke=\"#000000\"/>\n", | |
"<polygon fill=\"#000000\" points=\"49.355,-200.1029 52.8549,-210.103 56.355,-200.103 49.355,-200.1029\" stroke=\"#000000\"/>\n", | |
"</g>\n", | |
"<!-- 8439453770685328357 -->\n", | |
"<g class=\"node\" id=\"node4\">\n", | |
"<title>8439453770685328357</title>\n", | |
"<ellipse cx=\"52.8549\" cy=\"-51.2509\" fill=\"none\" rx=\"51.003\" ry=\"51.003\" stroke=\"#000000\"/>\n", | |
"<text fill=\"#000000\" font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"52.8549\" y=\"-47.0509\">read_table</text>\n", | |
"</g>\n", | |
"<!-- 8439453770685328357->-4317651821274558598 -->\n", | |
"<g class=\"edge\" id=\"edge3\">\n", | |
"<title>8439453770685328357->-4317651821274558598</title>\n", | |
"<path d=\"M52.8549,-102.6431C52.8549,-111.3967 52.8549,-120.1958 52.8549,-128.0709\" fill=\"none\" stroke=\"#000000\"/>\n", | |
"<polygon fill=\"#000000\" points=\"49.355,-128.1459 52.8549,-138.146 56.355,-128.146 49.355,-128.1459\" stroke=\"#000000\"/>\n", | |
"</g>\n", | |
"</g>\n", | |
"</svg>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.SVG object>" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"gdf1.visualize(filename='gdf1.svg')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"gdf = gdf1.compute(scheduler='single-threaded')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" <th>d</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.1</td>\n", | |
" <td>2.0</td>\n", | |
" <td>0.4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2.2</td>\n", | |
" <td>3.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3.0</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" b c d\n", | |
"0 1.1 2.0 0.4\n", | |
"1 2.2 3.0 0.0\n", | |
"2 3.0 4.0 0.0" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"gdf.to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 7 in line 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
a b c d | |
12 [1.1,2.2,3.0] [2,3,4] [0.4, 0.2, 0.9] | |
15 [3.1,4.1] [3,2] [0.6, 0.8] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment