Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save vaclavdekanovsky/b2656ce7a14c082d4cde1f8862cc5de8 to your computer and use it in GitHub Desktop.

Select an option

Save vaclavdekanovsky/b2656ce7a14c082d4cde1f8862cc5de8 to your computer and use it in GitHub Desktop.
Running performance test to see the file size and read/write speed of various pandas .to_() methods
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T19:45:32.553194Z",
"start_time": "2020-08-18T19:45:32.549204Z"
}
},
"source": [
"# Performance Test of various data frame persistance methods"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:09.244116Z",
"start_time": "2020-08-18T21:28:09.241129Z"
}
},
"outputs": [],
"source": [
"dataset_size = 5000"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:10.485607Z",
"start_time": "2020-08-18T21:28:09.246110Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\v.dekanovsky\\Anaconda3\\lib\\site-packages\\xarray\\core\\merge.py:10: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
" PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)\n"
]
}
],
"source": [
"import random\n",
"import string\n",
"import datetime\n",
"import time\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import plotly.express as pe"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:10.497582Z",
"start_time": "2020-08-18T21:28:10.488601Z"
}
},
"outputs": [],
"source": [
"def get_random_string(length: int) -> str:\n",
" \"\"\"Generated random string up to the specific lenght\"\"\"\n",
" \n",
" letters = string.ascii_letters\n",
" result_str = ''.join([random.choice(letters) for i in range(random.randint(3,length))])\n",
" return result_str\n",
"\n",
"def get_random_sentence(words: int=5) -> str:\n",
" \"\"\"Generate random sentence - list of random works\"\"\"\n",
" \n",
" words = []\n",
" for i in range(5):\n",
" words.append(get_random_string(10)[:random.randint(0,10)])\n",
" return \" \".join(words)\n",
"\n",
"def random_date(start: str, end: str, format: str) -> str:\n",
" \"\"\"Generate random date of specified format\"\"\"\n",
" \n",
" earliest = datetime.datetime.strptime(start, format)\n",
" latest = datetime.datetime.strptime(end, format)\n",
" delta = latest - earliest\n",
" int_delta = (delta.days * 24 * 60 * 60) + delta.seconds \n",
" random_second = random.randrange(int_delta)\n",
" return (earliest + datetime.timedelta(seconds = random_second)).strftime(format)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:12.537172Z",
"start_time": "2020-08-18T21:28:10.501570Z"
}
},
"outputs": [],
"source": [
"# https://numpy.org/doc/stable/user/basics.types.html\n",
"data = []\n",
"for i in range(dataset_size):\n",
" data.append(\n",
" [random.randint(-127,127), # int8\n",
" random.randint(-32768,32767), # int16\n",
" random.randint(-2147483648,2147483647), # int32\n",
" random.randint(-9223372036854775808 ,9223372036854775807), # int64\n",
" random.randint(0,255), # uint8\n",
" round(random.uniform(0,10000),2),\n",
" round(random.uniform(0,1000000),2),\n",
" get_random_string(10),\n",
" get_random_sentence(5),\n",
" random.choice([get_random_string(10) for i in range(25)]),\n",
" random_date(\"1900-01-01\",\"2020-05-01\",\"%Y-%m-%d\"),\n",
" random_date(\"1900-01-01T00:00:00\",\"2020-05-01T23:59:59\",\"%Y-%m-%dT%H:%M:%S\"),\n",
" random.choice([True,False])])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:13.216329Z",
"start_time": "2020-08-18T21:28:12.539169Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5000 entries, 0 to 4999\n",
"Data columns (total 15 columns):\n",
"Int8 5000 non-null int8\n",
"Int16 5000 non-null int16\n",
"Int32 5000 non-null int32\n",
"Int64 5000 non-null int64\n",
"UInt8 5000 non-null uint8\n",
"Float32 5000 non-null float32\n",
"Float64 5000 non-null float64\n",
"String 5000 non-null object\n",
"Sentence 5000 non-null object\n",
"Category 5000 non-null category\n",
"Date 5000 non-null datetime64[ns]\n",
"DateTime 5000 non-null datetime64[ns]\n",
"Bool 5000 non-null bool\n",
"DateTime+Zone 5000 non-null datetime64[ns]\n",
"TimeDelta 5000 non-null timedelta64[ns]\n",
"dtypes: bool(1), category(1), datetime64[ns](3), float32(1), float64(1), int16(1), int32(1), int64(1), int8(1), object(2), timedelta64[ns](1), uint8(1)\n",
"memory usage: 584.9+ KB\n"
]
}
],
"source": [
"# create data frame from the list of generated random values\n",
"\n",
"column_names = [\"Int8\", \"Int16\", \"Int32\", \"Int64\", \"UInt8\", \"Float32\", \"Float64\", \n",
" \"String\", \"Sentence\", \"Category\", \"Date\", \"DateTime\", \"Bool\"]\n",
"df = pd.DataFrame(data, columns=column_names)\n",
"df[\"Int8\"] = df[\"Int8\"].astype(\"int8\")\n",
"df[\"Int16\"] = df[\"Int16\"].astype(\"int16\")\n",
"df[\"Int32\"] = df[\"Int32\"].astype(\"int32\")\n",
"df[\"UInt8\"] = df[\"UInt8\"].astype(\"uint8\")\n",
"df[\"Float32\"] = df[\"Float32\"].astype(\"float32\")\n",
"df[\"Category\"] = df[\"Category\"].astype(\"category\")\n",
"df[\"Date\"] = pd.to_datetime(df[\"Date\"], format=\"%Y-%m-%d\")\n",
"df[\"DateTime\"] = pd.to_datetime(df[\"DateTime\"], format=\"%Y-%m-%dT%H:%M:%S\")\n",
"df[\"DateTime+Zone\"] = pd.to_datetime(df[\"DateTime\"], format=\"%Y-%m-%dT%H:%M:%S\")\n",
"df[\"TimeDelta\"] = df[\"DateTime\"]-df[\"Date\"]\n",
"#df[\"DateTime+Zone\"] = df[\"DateTime+Zone\"].dt.tz_localize('Europe/Vienna')\n",
"\n",
"# store for size comparison\n",
"df.to_csv(\"benchmark.csv\", index=False)\n",
"orig_size = Path(\"benchmark.csv\").stat().st_size\n",
"\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:13.240295Z",
"start_time": "2020-08-18T21:28:13.218323Z"
}
},
"outputs": [],
"source": [
"# define the types of transformation to use\n",
"exporting_types = {\n",
" \"csv\": {\n",
" \"type\": \"CSV\",\n",
" \"extension\": \".csv\",\n",
" \"write_function\": pd.DataFrame.to_csv,\n",
" \"write_params\": {\"index\": False},\n",
" \"read_function\": pd.read_csv,\n",
" \"read_params\": {}\n",
" },\n",
" \"csv_zip\": {\n",
" \"type\": \"CSV zip\",\n",
" \"extension\": \".zip\",\n",
" \"write_function\": pd.DataFrame.to_csv,\n",
" \"write_params\": {\"index\": False, \"compression\": \"zip\"},\n",
" \"read_function\": pd.read_csv,\n",
" \"read_params\": {\"compression\": \"zip\"}\n",
" },\n",
" \"picklea\": {\n",
" \"type\": \"Pickle bz2\",\n",
" \"extension\": \".pkl.bz2\",\n",
" \"write_function\": pd.DataFrame.to_pickle,\n",
" \"write_params\": {\"compression\": \"bz2\"},\n",
" \"read_function\": pd.read_pickle,\n",
" \"read_params\": {\"compression\": \"bz2\"}\n",
" },\n",
" \"pickleb\": {\n",
" \"type\": \"Pickle gzip\",\n",
" \"extension\": \".pkl.gzip\",\n",
" \"write_function\": pd.DataFrame.to_pickle,\n",
" \"write_params\": {\"compression\": \"gzip\"},\n",
" \"read_function\": pd.read_pickle,\n",
" \"read_params\": {\"compression\": \"gzip\"}\n",
" },\n",
" \"picklec\": {\n",
" \"type\": \"Pickle zip\",\n",
" \"extension\": \".pkl.zip\",\n",
" \"write_function\": pd.DataFrame.to_pickle,\n",
" \"write_params\": {\"compression\": \"zip\"},\n",
" \"read_function\": pd.read_pickle,\n",
" \"read_params\": {\"compression\": \"zip\"}\n",
" },\n",
" \"pickled\": {\n",
" \"type\": \"Pickle infer\",\n",
" \"extension\": \".pkl\",\n",
" \"write_function\": pd.DataFrame.to_pickle,\n",
" \"write_params\": {},\n",
" \"read_function\": pd.read_pickle,\n",
" \"read_params\": {}\n",
" },\n",
" \"picklee\": {\n",
" \"type\": \"Pickle xz\",\n",
" \"extension\": \".pkl.xz\",\n",
" \"write_function\": pd.DataFrame.to_pickle,\n",
" \"write_params\": {\"compression\": \"xz\"},\n",
" \"read_function\": pd.read_pickle,\n",
" \"read_params\": {\"compression\": \"xz\"}\n",
" },\n",
" \"parquet_pyarrow\": {\n",
" \"type\": \"Parquet via PyArrow\",\n",
" \"extension\": \".parquet.gzip\",\n",
" \"write_function\": pd.DataFrame.to_parquet,\n",
" \"write_params\": {},\n",
" \"read_function\": pd.read_parquet,\n",
" \"read_params\": {}\n",
" },\n",
" \"parquet_fastparquet\": {\n",
" \"type\": \"Parquet via fastparquet\",\n",
" \"extension\": \".parquet.gzip\",\n",
" \"write_function\": pd.DataFrame.to_parquet,\n",
" \"write_params\": {\"engine\":\"fastparquet\",\"compression\":\"GZIP\"},\n",
" \"read_function\": pd.read_parquet,\n",
" \"read_params\": {\"engine\":\"fastparquet\"}\n",
" }, \n",
" \"Hdf5f\": {\n",
" \"type\": \"Hdf5 fixed\",\n",
" \"extension\": \".h5\",\n",
" \"write_function\": pd.DataFrame.to_hdf,\n",
" \"write_params\": {\"key\":\"df\", \"format\":\"fixed\"},\n",
" \"read_function\": pd.read_hdf,\n",
" \"read_params\": {\"key\":\"df\"}\n",
" },\n",
" \"Hdf5t\": {\n",
" \"type\": \"Hdf5 table\",\n",
" \"extension\": \".h5\",\n",
" \"write_function\": pd.DataFrame.to_hdf,\n",
" \"write_params\": {\"key\":\"df\", \"format\":\"table\"},\n",
" \"read_function\": pd.read_hdf,\n",
" \"read_params\": {\"key\":\"df\"}\n",
" },\n",
" \"Hdf5c\": {\n",
" \"type\": \"Hdf5 compressed\",\n",
" \"extension\": \".h5\",\n",
" \"write_function\": pd.DataFrame.to_hdf,\n",
" \"write_params\": {\"key\":\"df\", \"format\":\"table\", \"complevel\": 9, \"complib\": \"bzip2\"},\n",
" \"read_function\": pd.read_hdf,\n",
" \"read_params\": {\"key\":\"df\"}\n",
" },\n",
" \"Excel\": {\n",
" \"type\": \"Excel\",\n",
" \"extension\": \".xlsx\",\n",
" \"write_function\": pd.DataFrame.to_excel,\n",
" \"write_params\": {\"index\":False},\n",
" \"read_function\": pd.read_excel,\n",
" \"read_params\": {}\n",
" },\n",
" # to_sql and read_sql have too much different syntax\n",
" #\"SQL\": {\n",
" # \"type\": \"SQL\",\n",
" # \"extension\": \"\",\n",
" # \"write_function\": pd.DataFrame.to_sql,\n",
" # \"write_params\": {\"name\": \"pandas\", \"con\": engine},\n",
" # \"read_function\": pd.read_sql,\n",
" # \"read_params\": {\"name\": \"pandas\", \"con\": engine}\n",
" #},\n",
" # feather file gets blocked by ArrowIO after the first run\n",
" \"Feather\": {\n",
" \"type\": \"Feather\",\n",
" \"extension\": \".f\",\n",
" \"write_function\": pd.DataFrame.to_feather,\n",
" \"write_params\": {},\n",
" \"read_function\": pd.read_feather,\n",
" \"read_params\": {}\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:13.251235Z",
"start_time": "2020-08-18T21:28:13.243254Z"
}
},
"outputs": [],
"source": [
"def repeat(f: dict, repetitions: int, df: pd.DataFrame, file: str) -> dict:\n",
" \"\"\" Perform an operation specified by input dict number of repetition times on the data frame\"\"\"\n",
" \n",
" writes_elapsed = []\n",
" reads_elapsed = []\n",
" for r in range(repetitions):\n",
" \n",
" # write\n",
" start_time = time.time()\n",
" \n",
" # first parameter is self (the dataframe), second the path and then **kwargs\n",
" f[\"write_function\"](df, file, **f[\"write_params\"])\n",
" write_elapsed = time.time() - start_time\n",
" writes_elapsed.append(write_elapsed)\n",
" \n",
" # read\n",
" start_time = time.time()\n",
" df_loaded = f[\"read_function\"](file, **f[\"read_params\"])\n",
" read_elapsed = time.time() - start_time\n",
" reads_elapsed.append(read_elapsed)\n",
" \n",
" return {\"write\": sum(writes_elapsed)/len(writes_elapsed),\n",
" \"read\": sum(reads_elapsed)/len(reads_elapsed),\n",
" \"df_loaded\": df_loaded}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:13.263206Z",
"start_time": "2020-08-18T21:28:13.253229Z"
}
},
"outputs": [],
"source": [
"def statistics(df: pd.DataFrame, df_loaded: pd.DataFrame, new_file: str, \n",
" tp: str, benchmark: dict, out: dict, orig_size: int) -> pd.DataFrame:\n",
" \"\"\"Generate statistics based on performance test results\"\"\"\n",
" \n",
" comparison_df = pd.DataFrame({\"orig\": df.dtypes, \"new\": df_loaded.dtypes})\n",
" comparison_df[\"same\"] = comparison_df[\"orig\"]==comparison_df[\"new\"]\n",
" #mismatched_types = comparison_df[comparison_df[\"same\"]==False].index.to_list()\n",
" \n",
" new_size = Path(new_file).stat().st_size\n",
" \n",
" return pd.DataFrame({\n",
" \"dtype_preservation\": comparison_df[\"same\"].sum()/comparison_df[\"same\"].count(),\n",
" \"compression\": new_size/orig_size,\n",
" \"write_time\": out[\"write\"]/benchmark[\"write\"],\n",
" \"read_time\": out[\"read\"]/benchmark[\"read\"],\n",
" #\"mismatched_types\": mismatched_types\n",
" }, \n",
" index=[tp]), comparison_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:13.275172Z",
"start_time": "2020-08-18T21:28:13.265198Z"
}
},
"outputs": [],
"source": [
"def performance_test(exporting_types: dict, df: pd.DataFrame, \n",
" out_file: str=\"out\", repetitions: int = 7) -> pd.DataFrame:\n",
" \n",
" \"\"\"Run performance test for predefined dict of operations\"\"\"\n",
"\n",
" results = []\n",
" for k,v in exporting_types.items():\n",
"\n",
" # create a file name\n",
" new_file = out_file + v[\"extension\"]\n",
" \n",
" try:\n",
" # repeat the writing and reading several times\n",
" out = repeat(v, repetitions, df, new_file)\n",
"\n",
" # CSV is the first one and it's set as benchmark for reading and writing times\n",
" if v[\"type\"] == \"CSV\":\n",
" benchmark = out\n",
" df.to_csv(\"benchmark.csv\", index=False)\n",
" orig_size = Path(\"benchmark.csv\").stat().st_size\n",
"\n",
" # process the results - dtypes_preservation, compression, write and read_time\n",
" results.append(statistics(df, out[\"df_loaded\"], new_file, v[\"type\"], benchmark, out, orig_size)[0])\n",
" \n",
" except Exception as e:\n",
" print(f\"{k} failed - {e}\")\n",
"\n",
" return pd.concat(results)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:55.118672Z",
"start_time": "2020-08-18T21:28:13.277164Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"parquet_pyarrow failed - ('Unsupported numpy type 22', 'Conversion failed for column TimeDelta with type timedelta64[ns]')\n",
"Hdf5f failed - Cannot store a category dtype in a HDF5 dataset that uses format=\"fixed\". Use format=\"table\".\n",
"Feather failed - ('Unsupported numpy type 22', 'Conversion failed for column TimeDelta with type timedelta64[ns]')\n"
]
},
{
"data": {
"text/html": [
"<style type=\"text/css\" >\n",
"</style><table id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >dtype_preservation</th> <th class=\"col_heading level0 col1\" >compression</th> <th class=\"col_heading level0 col2\" >write_time</th> <th class=\"col_heading level0 col3\" >read_time</th> </tr></thead><tbody>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row0\" class=\"row_heading level0 row0\" >CSV</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col0\" class=\"data row0 col0\" >33.33%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col1\" class=\"data row0 col1\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col2\" class=\"data row0 col2\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col3\" class=\"data row0 col3\" >100.00%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row1\" class=\"row_heading level0 row1\" >CSV zip</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col0\" class=\"data row1 col0\" >33.33%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col1\" class=\"data row1 col1\" >48.30%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col2\" class=\"data row1 col2\" >136.77%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col3\" class=\"data row1 col3\" >137.51%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row2\" class=\"row_heading level0 row2\" >Pickle bz2</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col0\" class=\"data row2 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col1\" class=\"data row2 col1\" >44.04%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col2\" class=\"data row2 col2\" >37.22%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col3\" class=\"data row2 col3\" >140.98%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row3\" class=\"row_heading level0 row3\" >Pickle gzip</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col0\" class=\"data row3 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col1\" class=\"data row3 col1\" >44.47%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col2\" class=\"data row3 col2\" >25.71%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col3\" class=\"data row3 col3\" >38.20%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row4\" class=\"row_heading level0 row4\" >Pickle zip</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col0\" class=\"data row4 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col1\" class=\"data row4 col1\" >44.48%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col2\" class=\"data row4 col2\" >20.51%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col3\" class=\"data row4 col3\" >40.64%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row5\" class=\"row_heading level0 row5\" >Pickle infer</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col0\" class=\"data row5 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col1\" class=\"data row5 col1\" >55.01%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col2\" class=\"data row5 col2\" >1.91%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col3\" class=\"data row5 col3\" >24.70%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row6\" class=\"row_heading level0 row6\" >Pickle xz</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col0\" class=\"data row6 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col1\" class=\"data row6 col1\" >38.01%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col2\" class=\"data row6 col2\" >123.95%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col3\" class=\"data row6 col3\" >168.34%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row7\" class=\"row_heading level0 row7\" >Parquet via fastparquet</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col0\" class=\"data row7 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col1\" class=\"data row7 col1\" >43.66%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col2\" class=\"data row7 col2\" >192.23%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col3\" class=\"data row7 col3\" >330.38%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row8\" class=\"row_heading level0 row8\" >Hdf5 table</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col0\" class=\"data row8 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col1\" class=\"data row8 col1\" >25818.87%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col2\" class=\"data row8 col2\" >38.61%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col3\" class=\"data row8 col3\" >173.34%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row9\" class=\"row_heading level0 row9\" >Hdf5 compressed</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col0\" class=\"data row9 col0\" >100.00%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col1\" class=\"data row9 col1\" >25819.12%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col2\" class=\"data row9 col2\" >87.69%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col3\" class=\"data row9 col3\" >311.56%</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row10\" class=\"row_heading level0 row10\" >Excel</th>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col0\" class=\"data row10 col0\" >53.33%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col1\" class=\"data row10 col1\" >75.17%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col2\" class=\"data row10 col2\" >1307.60%</td>\n",
" <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col3\" class=\"data row10 col3\" >3539.83%</td>\n",
" </tr>\n",
" </tbody></table>"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x21fe73a5d08>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# performance test\n",
"performance_df = performance_test(exporting_types, df)\n",
"\n",
"# results\n",
"performance_df.style.format(\"{:.2%}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:56.196130Z",
"start_time": "2020-08-18T21:28:55.121632Z"
}
},
"outputs": [],
"source": [
"# display the graph with the results\n",
"fig = pe.bar(performance_df.T, barmode='group', text=\"value\")\n",
" \n",
"fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')\n",
"fig.update_layout(title=f\"Statistics for {dataset_size} records\")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-18T21:28:56.294900Z",
"start_time": "2020-08-18T21:28:56.201117Z"
}
},
"outputs": [],
"source": [
"fig = pe.bar(performance_df.loc[[\"CSV\",\"Pickle infer\",\"Parquet via fastparquet\", \"Hdf5 table\", \"Excel\"]].T, \n",
" barmode='group', text=\"value\")\n",
"fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')\n",
"fig.update_layout(title=f\"Statistics for {dataset_size} records - performance test 7 repetititons\", \n",
" yaxis={\"type\": \"log\", \"title\": \"value % (log scale)\"})\n",
"fig.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment