Created August 18, 2020 21:33
Running a performance test to see the file size and read/write speed of various pandas .to_*() methods.
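The notebook below generates a random 5000-row data frame, writes and reads it with each configured pandas writer/reader pair, and reports file size and timings relative to a plain CSV baseline. As orientation, here is a minimal sketch of the measurement pattern; the helper name and file path are illustrative, not taken from the notebook:

```python
import time
from pathlib import Path

import pandas as pd


def time_roundtrip(df: pd.DataFrame, path: str = "example.pkl") -> dict:
    """Time one write/read round trip and record the resulting file size (illustrative helper)."""
    start = time.time()
    df.to_pickle(path)                    # any pandas .to_*() writer can be timed the same way
    write_seconds = time.time() - start

    start = time.time()
    pd.read_pickle(path)                  # matching pd.read_*() reader
    read_seconds = time.time() - start

    return {"write_s": write_seconds,
            "read_s": read_seconds,
            "size_bytes": Path(path).stat().st_size}
```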
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T19:45:32.553194Z", | |
| "start_time": "2020-08-18T19:45:32.549204Z" | |
| } | |
| }, | |
| "source": [ | |
| "# Performance Test of various data frame persistance methods" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:09.244116Z", | |
| "start_time": "2020-08-18T21:28:09.241129Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "dataset_size = 5000" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:10.485607Z", | |
| "start_time": "2020-08-18T21:28:09.246110Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "C:\\Users\\v.dekanovsky\\Anaconda3\\lib\\site-packages\\xarray\\core\\merge.py:10: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", | |
| " PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import random\n", | |
| "import string\n", | |
| "import datetime\n", | |
| "import time\n", | |
| "import pandas as pd\n", | |
| "from pathlib import Path\n", | |
| "import plotly.express as pe" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:10.497582Z", | |
| "start_time": "2020-08-18T21:28:10.488601Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_random_string(length: int) -> str:\n", | |
| " \"\"\"Generated random string up to the specific lenght\"\"\"\n", | |
| " \n", | |
| " letters = string.ascii_letters\n", | |
| " result_str = ''.join([random.choice(letters) for i in range(random.randint(3,length))])\n", | |
| " return result_str\n", | |
| "\n", | |
| "def get_random_sentence(words: int=5) -> str:\n", | |
| " \"\"\"Generate random sentence - list of random works\"\"\"\n", | |
| " \n", | |
| " words = []\n", | |
| " for i in range(5):\n", | |
| " words.append(get_random_string(10)[:random.randint(0,10)])\n", | |
| " return \" \".join(words)\n", | |
| "\n", | |
| "def random_date(start: str, end: str, format: str) -> str:\n", | |
| " \"\"\"Generate random date of specified format\"\"\"\n", | |
| " \n", | |
| " earliest = datetime.datetime.strptime(start, format)\n", | |
| " latest = datetime.datetime.strptime(end, format)\n", | |
| " delta = latest - earliest\n", | |
| " int_delta = (delta.days * 24 * 60 * 60) + delta.seconds \n", | |
| " random_second = random.randrange(int_delta)\n", | |
| " return (earliest + datetime.timedelta(seconds = random_second)).strftime(format)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:12.537172Z", | |
| "start_time": "2020-08-18T21:28:10.501570Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# https://numpy.org/doc/stable/user/basics.types.html\n", | |
| "data = []\n", | |
| "for i in range(dataset_size):\n", | |
| " data.append(\n", | |
| " [random.randint(-127,127), # int8\n", | |
| " random.randint(-32768,32767), # int16\n", | |
| " random.randint(-2147483648,2147483647), # int32\n", | |
| " random.randint(-9223372036854775808 ,9223372036854775807), # int64\n", | |
| " random.randint(0,255), # uint8\n", | |
| " round(random.uniform(0,10000),2),\n", | |
| " round(random.uniform(0,1000000),2),\n", | |
| " get_random_string(10),\n", | |
| " get_random_sentence(5),\n", | |
| " random.choice([get_random_string(10) for i in range(25)]),\n", | |
| " random_date(\"1900-01-01\",\"2020-05-01\",\"%Y-%m-%d\"),\n", | |
| " random_date(\"1900-01-01T00:00:00\",\"2020-05-01T23:59:59\",\"%Y-%m-%dT%H:%M:%S\"),\n", | |
| " random.choice([True,False])])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.216329Z", | |
| "start_time": "2020-08-18T21:28:12.539169Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "RangeIndex: 5000 entries, 0 to 4999\n", | |
| "Data columns (total 15 columns):\n", | |
| "Int8 5000 non-null int8\n", | |
| "Int16 5000 non-null int16\n", | |
| "Int32 5000 non-null int32\n", | |
| "Int64 5000 non-null int64\n", | |
| "UInt8 5000 non-null uint8\n", | |
| "Float32 5000 non-null float32\n", | |
| "Float64 5000 non-null float64\n", | |
| "String 5000 non-null object\n", | |
| "Sentence 5000 non-null object\n", | |
| "Category 5000 non-null category\n", | |
| "Date 5000 non-null datetime64[ns]\n", | |
| "DateTime 5000 non-null datetime64[ns]\n", | |
| "Bool 5000 non-null bool\n", | |
| "DateTime+Zone 5000 non-null datetime64[ns]\n", | |
| "TimeDelta 5000 non-null timedelta64[ns]\n", | |
| "dtypes: bool(1), category(1), datetime64[ns](3), float32(1), float64(1), int16(1), int32(1), int64(1), int8(1), object(2), timedelta64[ns](1), uint8(1)\n", | |
| "memory usage: 584.9+ KB\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# create data frame from the list of generated random values\n", | |
| "\n", | |
| "column_names = [\"Int8\", \"Int16\", \"Int32\", \"Int64\", \"UInt8\", \"Float32\", \"Float64\", \n", | |
| " \"String\", \"Sentence\", \"Category\", \"Date\", \"DateTime\", \"Bool\"]\n", | |
| "df = pd.DataFrame(data, columns=column_names)\n", | |
| "df[\"Int8\"] = df[\"Int8\"].astype(\"int8\")\n", | |
| "df[\"Int16\"] = df[\"Int16\"].astype(\"int16\")\n", | |
| "df[\"Int32\"] = df[\"Int32\"].astype(\"int32\")\n", | |
| "df[\"UInt8\"] = df[\"UInt8\"].astype(\"uint8\")\n", | |
| "df[\"Float32\"] = df[\"Float32\"].astype(\"float32\")\n", | |
| "df[\"Category\"] = df[\"Category\"].astype(\"category\")\n", | |
| "df[\"Date\"] = pd.to_datetime(df[\"Date\"], format=\"%Y-%m-%d\")\n", | |
| "df[\"DateTime\"] = pd.to_datetime(df[\"DateTime\"], format=\"%Y-%m-%dT%H:%M:%S\")\n", | |
| "df[\"DateTime+Zone\"] = pd.to_datetime(df[\"DateTime\"], format=\"%Y-%m-%dT%H:%M:%S\")\n", | |
| "df[\"TimeDelta\"] = df[\"DateTime\"]-df[\"Date\"]\n", | |
| "#df[\"DateTime+Zone\"] = df[\"DateTime+Zone\"].dt.tz_localize('Europe/Vienna')\n", | |
| "\n", | |
| "# store for size comparison\n", | |
| "df.to_csv(\"benchmark.csv\", index=False)\n", | |
| "orig_size = Path(\"benchmark.csv\").stat().st_size\n", | |
| "\n", | |
| "df.info()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.240295Z", | |
| "start_time": "2020-08-18T21:28:13.218323Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# define the types of transformation to use\n", | |
| "exporting_types = {\n", | |
| " \"csv\": {\n", | |
| " \"type\": \"CSV\",\n", | |
| " \"extension\": \".csv\",\n", | |
| " \"write_function\": pd.DataFrame.to_csv,\n", | |
| " \"write_params\": {\"index\": False},\n", | |
| " \"read_function\": pd.read_csv,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " \"csv_zip\": {\n", | |
| " \"type\": \"CSV zip\",\n", | |
| " \"extension\": \".zip\",\n", | |
| " \"write_function\": pd.DataFrame.to_csv,\n", | |
| " \"write_params\": {\"index\": False, \"compression\": \"zip\"},\n", | |
| " \"read_function\": pd.read_csv,\n", | |
| " \"read_params\": {\"compression\": \"zip\"}\n", | |
| " },\n", | |
| " \"picklea\": {\n", | |
| " \"type\": \"Pickle bz2\",\n", | |
| " \"extension\": \".pkl.bz2\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"bz2\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"bz2\"}\n", | |
| " },\n", | |
| " \"pickleb\": {\n", | |
| " \"type\": \"Pickle gzip\",\n", | |
| " \"extension\": \".pkl.gzip\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"gzip\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"gzip\"}\n", | |
| " },\n", | |
| " \"picklec\": {\n", | |
| " \"type\": \"Pickle zip\",\n", | |
| " \"extension\": \".pkl.zip\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"zip\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"zip\"}\n", | |
| " },\n", | |
| " \"pickled\": {\n", | |
| " \"type\": \"Pickle infer\",\n", | |
| " \"extension\": \".pkl\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " \"picklee\": {\n", | |
| " \"type\": \"Pickle xz\",\n", | |
| " \"extension\": \".pkl.xz\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"xz\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"xz\"}\n", | |
| " },\n", | |
| " \"parquet_pyarrow\": {\n", | |
| " \"type\": \"Parquet via PyArrow\",\n", | |
| " \"extension\": \".parquet.gzip\",\n", | |
| " \"write_function\": pd.DataFrame.to_parquet,\n", | |
| " \"write_params\": {},\n", | |
| " \"read_function\": pd.read_parquet,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " \"parquet_fastparquet\": {\n", | |
| " \"type\": \"Parquet via fastparquet\",\n", | |
| " \"extension\": \".parquet.gzip\",\n", | |
| " \"write_function\": pd.DataFrame.to_parquet,\n", | |
| " \"write_params\": {\"engine\":\"fastparquet\",\"compression\":\"GZIP\"},\n", | |
| " \"read_function\": pd.read_parquet,\n", | |
| " \"read_params\": {\"engine\":\"fastparquet\"}\n", | |
| " }, \n", | |
| " \"Hdf5f\": {\n", | |
| " \"type\": \"Hdf5 fixed\",\n", | |
| " \"extension\": \".h5\",\n", | |
| " \"write_function\": pd.DataFrame.to_hdf,\n", | |
| " \"write_params\": {\"key\":\"df\", \"format\":\"fixed\"},\n", | |
| " \"read_function\": pd.read_hdf,\n", | |
| " \"read_params\": {\"key\":\"df\"}\n", | |
| " },\n", | |
| " \"Hdf5t\": {\n", | |
| " \"type\": \"Hdf5 table\",\n", | |
| " \"extension\": \".h5\",\n", | |
| " \"write_function\": pd.DataFrame.to_hdf,\n", | |
| " \"write_params\": {\"key\":\"df\", \"format\":\"table\"},\n", | |
| " \"read_function\": pd.read_hdf,\n", | |
| " \"read_params\": {\"key\":\"df\"}\n", | |
| " },\n", | |
| " \"Hdf5c\": {\n", | |
| " \"type\": \"Hdf5 compressed\",\n", | |
| " \"extension\": \".h5\",\n", | |
| " \"write_function\": pd.DataFrame.to_hdf,\n", | |
| " \"write_params\": {\"key\":\"df\", \"format\":\"table\", \"complevel\": 9, \"complib\": \"bzip2\"},\n", | |
| " \"read_function\": pd.read_hdf,\n", | |
| " \"read_params\": {\"key\":\"df\"}\n", | |
| " },\n", | |
| " \"Excel\": {\n", | |
| " \"type\": \"Excel\",\n", | |
| " \"extension\": \".xlsx\",\n", | |
| " \"write_function\": pd.DataFrame.to_excel,\n", | |
| " \"write_params\": {\"index\":False},\n", | |
| " \"read_function\": pd.read_excel,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " # to_sql and read_sql have too much different syntax\n", | |
| " #\"SQL\": {\n", | |
| " # \"type\": \"SQL\",\n", | |
| " # \"extension\": \"\",\n", | |
| " # \"write_function\": pd.DataFrame.to_sql,\n", | |
| " # \"write_params\": {\"name\": \"pandas\", \"con\": engine},\n", | |
| " # \"read_function\": pd.read_sql,\n", | |
| " # \"read_params\": {\"name\": \"pandas\", \"con\": engine}\n", | |
| " #},\n", | |
| " # feather file gets blocked by ArrowIO after the first run\n", | |
| " \"Feather\": {\n", | |
| " \"type\": \"Feather\",\n", | |
| " \"extension\": \".f\",\n", | |
| " \"write_function\": pd.DataFrame.to_feather,\n", | |
| " \"write_params\": {},\n", | |
| " \"read_function\": pd.read_feather,\n", | |
| " \"read_params\": {}\n", | |
| " }\n", | |
| "}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.251235Z", | |
| "start_time": "2020-08-18T21:28:13.243254Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def repeat(f: dict, repetitions: int, df: pd.DataFrame, file: str) -> dict:\n", | |
| " \"\"\" Perform an operation specified by input dict number of repetition times on the data frame\"\"\"\n", | |
| " \n", | |
| " writes_elapsed = []\n", | |
| " reads_elapsed = []\n", | |
| " for r in range(repetitions):\n", | |
| " \n", | |
| " # write\n", | |
| " start_time = time.time()\n", | |
| " \n", | |
| " # first parameter is self (the dataframe), second the path and then **kwargs\n", | |
| " f[\"write_function\"](df, file, **f[\"write_params\"])\n", | |
| " write_elapsed = time.time() - start_time\n", | |
| " writes_elapsed.append(write_elapsed)\n", | |
| " \n", | |
| " # read\n", | |
| " start_time = time.time()\n", | |
| " df_loaded = f[\"read_function\"](file, **f[\"read_params\"])\n", | |
| " read_elapsed = time.time() - start_time\n", | |
| " reads_elapsed.append(read_elapsed)\n", | |
| " \n", | |
| " return {\"write\": sum(writes_elapsed)/len(writes_elapsed),\n", | |
| " \"read\": sum(reads_elapsed)/len(reads_elapsed),\n", | |
| " \"df_loaded\": df_loaded}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.263206Z", | |
| "start_time": "2020-08-18T21:28:13.253229Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def statistics(df: pd.DataFrame, df_loaded: pd.DataFrame, new_file: str, \n", | |
| " tp: str, benchmark: dict, out: dict, orig_size: int) -> pd.DataFrame:\n", | |
| " \"\"\"Generate statistics based on performance test results\"\"\"\n", | |
| " \n", | |
| " comparison_df = pd.DataFrame({\"orig\": df.dtypes, \"new\": df_loaded.dtypes})\n", | |
| " comparison_df[\"same\"] = comparison_df[\"orig\"]==comparison_df[\"new\"]\n", | |
| " #mismatched_types = comparison_df[comparison_df[\"same\"]==False].index.to_list()\n", | |
| " \n", | |
| " new_size = Path(new_file).stat().st_size\n", | |
| " \n", | |
| " return pd.DataFrame({\n", | |
| " \"dtype_preservation\": comparison_df[\"same\"].sum()/comparison_df[\"same\"].count(),\n", | |
| " \"compression\": new_size/orig_size,\n", | |
| " \"write_time\": out[\"write\"]/benchmark[\"write\"],\n", | |
| " \"read_time\": out[\"read\"]/benchmark[\"read\"],\n", | |
| " #\"mismatched_types\": mismatched_types\n", | |
| " }, \n", | |
| " index=[tp]), comparison_df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.275172Z", | |
| "start_time": "2020-08-18T21:28:13.265198Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def performance_test(exporting_types: dict, df: pd.DataFrame, \n", | |
| " out_file: str=\"out\", repetitions: int = 7) -> pd.DataFrame:\n", | |
| " \n", | |
| " \"\"\"Run performance test for predefined dict of operations\"\"\"\n", | |
| "\n", | |
| " results = []\n", | |
| " for k,v in exporting_types.items():\n", | |
| "\n", | |
| " # create a file name\n", | |
| " new_file = out_file + v[\"extension\"]\n", | |
| " \n", | |
| " try:\n", | |
| " # repeat the writing and reading several times\n", | |
| " out = repeat(v, repetitions, df, new_file)\n", | |
| "\n", | |
| " # CSV is the first one and it's set as benchmark for reading and writing times\n", | |
| " if v[\"type\"] == \"CSV\":\n", | |
| " benchmark = out\n", | |
| " df.to_csv(\"benchmark.csv\", index=False)\n", | |
| " orig_size = Path(\"benchmark.csv\").stat().st_size\n", | |
| "\n", | |
| " # process the results - dtypes_preservation, compression, write and read_time\n", | |
| " results.append(statistics(df, out[\"df_loaded\"], new_file, v[\"type\"], benchmark, out, orig_size)[0])\n", | |
| " \n", | |
| " except Exception as e:\n", | |
| " print(f\"{k} failed - {e}\")\n", | |
| "\n", | |
| " return pd.concat(results)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:55.118672Z", | |
| "start_time": "2020-08-18T21:28:13.277164Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "parquet_pyarrow failed - ('Unsupported numpy type 22', 'Conversion failed for column TimeDelta with type timedelta64[ns]')\n", | |
| "Hdf5f failed - Cannot store a category dtype in a HDF5 dataset that uses format=\"fixed\". Use format=\"table\".\n", | |
| "Feather failed - ('Unsupported numpy type 22', 'Conversion failed for column TimeDelta with type timedelta64[ns]')\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<style type=\"text/css\" >\n", | |
| "</style><table id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >dtype_preservation</th> <th class=\"col_heading level0 col1\" >compression</th> <th class=\"col_heading level0 col2\" >write_time</th> <th class=\"col_heading level0 col3\" >read_time</th> </tr></thead><tbody>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row0\" class=\"row_heading level0 row0\" >CSV</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col0\" class=\"data row0 col0\" >33.33%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col1\" class=\"data row0 col1\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col2\" class=\"data row0 col2\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col3\" class=\"data row0 col3\" >100.00%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row1\" class=\"row_heading level0 row1\" >CSV zip</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col0\" class=\"data row1 col0\" >33.33%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col1\" class=\"data row1 col1\" >48.30%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col2\" class=\"data row1 col2\" >136.77%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col3\" class=\"data row1 col3\" >137.51%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row2\" class=\"row_heading level0 row2\" >Pickle bz2</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col0\" class=\"data row2 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col1\" class=\"data row2 col1\" >44.04%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col2\" class=\"data row2 col2\" >37.22%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col3\" class=\"data row2 col3\" >140.98%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row3\" class=\"row_heading level0 row3\" >Pickle gzip</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col0\" class=\"data row3 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col1\" class=\"data row3 col1\" >44.47%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col2\" class=\"data row3 col2\" >25.71%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col3\" class=\"data row3 col3\" >38.20%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row4\" class=\"row_heading level0 row4\" >Pickle zip</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col0\" class=\"data row4 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col1\" class=\"data row4 col1\" >44.48%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col2\" class=\"data row4 col2\" >20.51%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col3\" class=\"data row4 col3\" >40.64%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row5\" class=\"row_heading level0 row5\" >Pickle infer</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col0\" class=\"data row5 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col1\" class=\"data row5 col1\" >55.01%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col2\" class=\"data row5 col2\" >1.91%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col3\" class=\"data row5 col3\" >24.70%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row6\" class=\"row_heading level0 row6\" >Pickle xz</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col0\" class=\"data row6 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col1\" class=\"data row6 col1\" >38.01%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col2\" class=\"data row6 col2\" >123.95%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col3\" class=\"data row6 col3\" >168.34%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row7\" class=\"row_heading level0 row7\" >Parquet via fastparquet</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col0\" class=\"data row7 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col1\" class=\"data row7 col1\" >43.66%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col2\" class=\"data row7 col2\" >192.23%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col3\" class=\"data row7 col3\" >330.38%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row8\" class=\"row_heading level0 row8\" >Hdf5 table</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col0\" class=\"data row8 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col1\" class=\"data row8 col1\" >25818.87%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col2\" class=\"data row8 col2\" >38.61%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col3\" class=\"data row8 col3\" >173.34%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row9\" class=\"row_heading level0 row9\" >Hdf5 compressed</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col0\" class=\"data row9 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col1\" class=\"data row9 col1\" >25819.12%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col2\" class=\"data row9 col2\" >87.69%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col3\" class=\"data row9 col3\" >311.56%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row10\" class=\"row_heading level0 row10\" >Excel</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col0\" class=\"data row10 col0\" >53.33%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col1\" class=\"data row10 col1\" >75.17%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col2\" class=\"data row10 col2\" >1307.60%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col3\" class=\"data row10 col3\" >3539.83%</td>\n", | |
| " </tr>\n", | |
| " </tbody></table>" | |
| ], | |
| "text/plain": [ | |
| "<pandas.io.formats.style.Styler at 0x21fe73a5d08>" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# performance test\n", | |
| "performance_df = performance_test(exporting_types, df)\n", | |
| "\n", | |
| "# results\n", | |
| "performance_df.style.format(\"{:.2%}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:56.196130Z", | |
| "start_time": "2020-08-18T21:28:55.121632Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# display the graph with the results\n", | |
| "fig = pe.bar(performance_df.T, barmode='group', text=\"value\")\n", | |
| " \n", | |
| "fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')\n", | |
| "fig.update_layout(title=f\"Statistics for {dataset_size} records\")\n", | |
| "fig.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:56.294900Z", | |
| "start_time": "2020-08-18T21:28:56.201117Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "fig = pe.bar(performance_df.loc[[\"CSV\",\"Pickle infer\",\"Parquet via fastparquet\", \"Hdf5 table\", \"Excel\"]].T, \n", | |
| " barmode='group', text=\"value\")\n", | |
| "fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')\n", | |
| "fig.update_layout(title=f\"Statistics for {dataset_size} records - performance test 7 repetititons\", \n", | |
| " yaxis={\"type\": \"log\", \"title\": \"value % (log scale)\"})\n", | |
| "fig.show()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| }, | |
| "varInspector": { | |
| "cols": { | |
| "lenName": 16, | |
| "lenType": 16, | |
| "lenVar": 40 | |
| }, | |
| "kernels_config": { | |
| "python": { | |
| "delete_cmd_postfix": "", | |
| "delete_cmd_prefix": "del ", | |
| "library": "var_list.py", | |
| "varRefreshCmd": "print(var_dic_list())" | |
| }, | |
| "r": { | |
| "delete_cmd_postfix": ") ", | |
| "delete_cmd_prefix": "rm(", | |
| "library": "var_list.r", | |
| "varRefreshCmd": "cat(var_dic_list()) " | |
| } | |
| }, | |
| "types_to_exclude": [ | |
| "module", | |
| "function", | |
| "builtin_function_or_method", | |
| "instance", | |
| "_Feature" | |
| ], | |
| "window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
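The output above shows three formats failing: Parquet via pyarrow and Feather cannot serialise the timedelta64[ns] TimeDelta column, and the fixed-format HDF5 store rejects the category column. A possible workaround, not part of the original gist and one that deliberately gives up the offending dtypes, is to convert them before writing:

```python
# hypothetical pre-processing sketch (assumption: losing the timedelta/category dtypes is acceptable)
df_compat = df.copy()
df_compat["TimeDelta"] = df_compat["TimeDelta"].dt.total_seconds()  # timedelta64[ns] -> float64 seconds
df_compat["Category"] = df_compat["Category"].astype(str)           # category -> object

df_compat.to_parquet("out.parquet.gzip")             # now accepted by the default pyarrow engine
df_compat.to_feather("out.f")
df_compat.to_hdf("out.h5", key="df", format="fixed")
```

Note that this trades dtype fidelity for format support, so the dtype_preservation score of those formats would drop accordingly.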