Skip to content

Instantly share code, notes, and snippets.

@gumdropsteve
Last active August 4, 2020 19:46
Show Gist options
  • Save gumdropsteve/36079fd4dcf614c9aff42191ae82ae12 to your computer and use it in GitHub Desktop.
Save gumdropsteve/36079fd4dcf614c9aff42191ae82ae12 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CSV to Parquet\n",
"BlazingSQL speedup over pandas.\n",
"\n",
"#### Download Dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2020-08-04 19:38:27-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n",
"Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.1.0\n",
"Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.1.0|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2725056295 (2.5G) [text/csv]\n",
"Saving to: ‘nf-chunk2.csv’\n",
"\n",
"nf-chunk2.csv 100%[===================>] 2.54G 51.9MB/s in 54s \n",
"\n",
"2020-08-04 19:39:24 (48.2 MB/s) - ‘nf-chunk2.csv’ saved [2725056295/2725056295]\n",
"\n"
]
}
],
"source": [
"from os import path\n",
"\n",
"if not path.exists('nf-chunk2.csv'): \n",
" !wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 34.2 s, sys: 6.65 s, total: 40.9 s\n",
"Wall time: 44.8 s\n"
]
}
],
"source": [
"%%time\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv('nf-chunk2.csv')\n",
"\n",
"df.to_parquet('pd_chunk.parquet', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## BlazingSQL"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BlazingContext ready\n",
"CPU times: user 7.03 s, sys: 2.96 s, total: 9.99 s\n",
"Wall time: 17.1 s\n"
]
}
],
"source": [
"%%time\n",
"from blazingsql import BlazingContext\n",
"bc = BlazingContext()\n",
"\n",
"bc.create_table('netflow', 'nf-chunk2.csv')\n",
"\n",
"bc.sql('select * from netflow').to_parquet('bsql_chunk.parquet', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset\n",
"The `nf-chunk2.csv` sample holds 21M rows of netflow data."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count(*)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>21526138</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count(*)\n",
"0 21526138"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bc.sql('select count(*) from netflow')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TimeSeconds</th>\n",
" <th>parsedDate</th>\n",
" <th>dateTimeStr</th>\n",
" <th>ipLayerProtocol</th>\n",
" <th>ipLayerProtocolCode</th>\n",
" <th>firstSeenSrcIp</th>\n",
" <th>firstSeenDestIp</th>\n",
" <th>firstSeenSrcPort</th>\n",
" <th>firstSeenDestPort</th>\n",
" <th>moreFragments</th>\n",
" <th>contFragments</th>\n",
" <th>durationSeconds</th>\n",
" <th>firstSeenSrcPayloadBytes</th>\n",
" <th>firstSeenDestPayloadBytes</th>\n",
" <th>firstSeenSrcTotalBytes</th>\n",
" <th>firstSeenDestTotalBytes</th>\n",
" <th>firstSeenSrcPacketCount</th>\n",
" <th>firstSeenDestPacketCount</th>\n",
" <th>recordForceOut</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.364948e+09</td>\n",
" <td>2013-04-03 00:11:50</td>\n",
" <td>2.013040e+13</td>\n",
" <td>6</td>\n",
" <td>TCP</td>\n",
" <td>10.38.37.13</td>\n",
" <td>172.20.0.3</td>\n",
" <td>42559</td>\n",
" <td>25</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>10</td>\n",
" <td>36</td>\n",
" <td>125</td>\n",
" <td>422</td>\n",
" <td>403</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.364948e+09</td>\n",
" <td>2013-04-03 00:11:53</td>\n",
" <td>2.013040e+13</td>\n",
" <td>6</td>\n",
" <td>TCP</td>\n",
" <td>10.13.77.49</td>\n",
" <td>172.30.0.4</td>\n",
" <td>42566</td>\n",
" <td>25</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>186</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.364948e+09</td>\n",
" <td>2013-04-03 00:11:54</td>\n",
" <td>2.013040e+13</td>\n",
" <td>17</td>\n",
" <td>UDP</td>\n",
" <td>172.10.0.40</td>\n",
" <td>172.255.255.255</td>\n",
" <td>138</td>\n",
" <td>138</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>201</td>\n",
" <td>0</td>\n",
" <td>243</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.364948e+09</td>\n",
" <td>2013-04-03 00:11:57</td>\n",
" <td>2.013040e+13</td>\n",
" <td>6</td>\n",
" <td>TCP</td>\n",
" <td>10.156.215.83</td>\n",
" <td>172.10.0.7</td>\n",
" <td>42593</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>170</td>\n",
" <td>336</td>\n",
" <td>448</td>\n",
" <td>506</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.364948e+09</td>\n",
" <td>2013-04-03 00:12:00</td>\n",
" <td>2.013040e+13</td>\n",
" <td>6</td>\n",
" <td>TCP</td>\n",
" <td>10.170.32.110</td>\n",
" <td>172.20.0.4</td>\n",
" <td>42612</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1870</td>\n",
" <td>79850</td>\n",
" <td>5730</td>\n",
" <td>84250</td>\n",
" <td>70</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TimeSeconds parsedDate dateTimeStr ipLayerProtocol \\\n",
"0 1.364948e+09 2013-04-03 00:11:50 2.013040e+13 6 \n",
"1 1.364948e+09 2013-04-03 00:11:53 2.013040e+13 6 \n",
"2 1.364948e+09 2013-04-03 00:11:54 2.013040e+13 17 \n",
"3 1.364948e+09 2013-04-03 00:11:57 2.013040e+13 6 \n",
"4 1.364948e+09 2013-04-03 00:12:00 2.013040e+13 6 \n",
"\n",
" ipLayerProtocolCode firstSeenSrcIp firstSeenDestIp firstSeenSrcPort \\\n",
"0 TCP 10.38.37.13 172.20.0.3 42559 \n",
"1 TCP 10.13.77.49 172.30.0.4 42566 \n",
"2 UDP 172.10.0.40 172.255.255.255 138 \n",
"3 TCP 10.156.215.83 172.10.0.7 42593 \n",
"4 TCP 10.170.32.110 172.20.0.4 42612 \n",
"\n",
" firstSeenDestPort moreFragments contFragments durationSeconds \\\n",
"0 25 0 0 10 \n",
"1 25 0 0 9 \n",
"2 138 0 0 0 \n",
"3 80 0 0 0 \n",
"4 80 0 0 3 \n",
"\n",
" firstSeenSrcPayloadBytes firstSeenDestPayloadBytes \\\n",
"0 36 125 \n",
"1 0 0 \n",
"2 201 0 \n",
"3 170 336 \n",
"4 1870 79850 \n",
"\n",
" firstSeenSrcTotalBytes firstSeenDestTotalBytes firstSeenSrcPacketCount \\\n",
"0 422 403 7 \n",
"1 186 0 3 \n",
"2 243 0 1 \n",
"3 448 506 5 \n",
"4 5730 84250 70 \n",
"\n",
" firstSeenDestPacketCount recordForceOut \n",
"0 5 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 3 0 \n",
"4 80 0 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bc.sql('select * from netflow limit 5')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "RAPIDS Nightly",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment