Created August 18, 2020 21:33
Running a performance test to see the file size and read/write speed of various pandas .to_*() methods.
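The notebook below generates a random 5000-row data frame, writes and reads it with each configured pandas writer/reader pair, and reports file size and timings relative to a plain CSV baseline. As orientation, here is a minimal sketch of the measurement pattern; the helper name and file path are illustrative, not taken from the notebook:

```python
import time
from pathlib import Path

import pandas as pd


def time_roundtrip(df: pd.DataFrame, path: str = "example.pkl") -> dict:
    """Time one write/read round trip and record the resulting file size (illustrative helper)."""
    start = time.time()
    df.to_pickle(path)                    # any pandas .to_*() writer can be timed the same way
    write_seconds = time.time() - start

    start = time.time()
    pd.read_pickle(path)                  # matching pd.read_*() reader
    read_seconds = time.time() - start

    return {"write_s": write_seconds,
            "read_s": read_seconds,
            "size_bytes": Path(path).stat().st_size}
```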
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T19:45:32.553194Z", | |
| "start_time": "2020-08-18T19:45:32.549204Z" | |
| } | |
| }, | |
| "source": [ | |
| "# Performance Test of various data frame persistance methods" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:09.244116Z", | |
| "start_time": "2020-08-18T21:28:09.241129Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "dataset_size = 5000" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:10.485607Z", | |
| "start_time": "2020-08-18T21:28:09.246110Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "C:\\Users\\v.dekanovsky\\Anaconda3\\lib\\site-packages\\xarray\\core\\merge.py:10: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", | |
| " PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import random\n", | |
| "import string\n", | |
| "import datetime\n", | |
| "import time\n", | |
| "import pandas as pd\n", | |
| "from pathlib import Path\n", | |
| "import plotly.express as pe" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:10.497582Z", | |
| "start_time": "2020-08-18T21:28:10.488601Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_random_string(length: int) -> str:\n", | |
| " \"\"\"Generated random string up to the specific lenght\"\"\"\n", | |
| " \n", | |
| " letters = string.ascii_letters\n", | |
| " result_str = ''.join([random.choice(letters) for i in range(random.randint(3,length))])\n", | |
| " return result_str\n", | |
| "\n", | |
| "def get_random_sentence(words: int=5) -> str:\n", | |
| " \"\"\"Generate random sentence - list of random works\"\"\"\n", | |
| " \n", | |
| " words = []\n", | |
| " for i in range(5):\n", | |
| " words.append(get_random_string(10)[:random.randint(0,10)])\n", | |
| " return \" \".join(words)\n", | |
| "\n", | |
| "def random_date(start: str, end: str, format: str) -> str:\n", | |
| " \"\"\"Generate random date of specified format\"\"\"\n", | |
| " \n", | |
| " earliest = datetime.datetime.strptime(start, format)\n", | |
| " latest = datetime.datetime.strptime(end, format)\n", | |
| " delta = latest - earliest\n", | |
| " int_delta = (delta.days * 24 * 60 * 60) + delta.seconds \n", | |
| " random_second = random.randrange(int_delta)\n", | |
| " return (earliest + datetime.timedelta(seconds = random_second)).strftime(format)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:12.537172Z", | |
| "start_time": "2020-08-18T21:28:10.501570Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# https://numpy.org/doc/stable/user/basics.types.html\n", | |
| "data = []\n", | |
| "for i in range(dataset_size):\n", | |
| " data.append(\n", | |
| " [random.randint(-127,127), # int8\n", | |
| " random.randint(-32768,32767), # int16\n", | |
| " random.randint(-2147483648,2147483647), # int32\n", | |
| " random.randint(-9223372036854775808 ,9223372036854775807), # int64\n", | |
| " random.randint(0,255), # uint8\n", | |
| " round(random.uniform(0,10000),2),\n", | |
| " round(random.uniform(0,1000000),2),\n", | |
| " get_random_string(10),\n", | |
| " get_random_sentence(5),\n", | |
| " random.choice([get_random_string(10) for i in range(25)]),\n", | |
| " random_date(\"1900-01-01\",\"2020-05-01\",\"%Y-%m-%d\"),\n", | |
| " random_date(\"1900-01-01T00:00:00\",\"2020-05-01T23:59:59\",\"%Y-%m-%dT%H:%M:%S\"),\n", | |
| " random.choice([True,False])])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.216329Z", | |
| "start_time": "2020-08-18T21:28:12.539169Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "RangeIndex: 5000 entries, 0 to 4999\n", | |
| "Data columns (total 15 columns):\n", | |
| "Int8 5000 non-null int8\n", | |
| "Int16 5000 non-null int16\n", | |
| "Int32 5000 non-null int32\n", | |
| "Int64 5000 non-null int64\n", | |
| "UInt8 5000 non-null uint8\n", | |
| "Float32 5000 non-null float32\n", | |
| "Float64 5000 non-null float64\n", | |
| "String 5000 non-null object\n", | |
| "Sentence 5000 non-null object\n", | |
| "Category 5000 non-null category\n", | |
| "Date 5000 non-null datetime64[ns]\n", | |
| "DateTime 5000 non-null datetime64[ns]\n", | |
| "Bool 5000 non-null bool\n", | |
| "DateTime+Zone 5000 non-null datetime64[ns]\n", | |
| "TimeDelta 5000 non-null timedelta64[ns]\n", | |
| "dtypes: bool(1), category(1), datetime64[ns](3), float32(1), float64(1), int16(1), int32(1), int64(1), int8(1), object(2), timedelta64[ns](1), uint8(1)\n", | |
| "memory usage: 584.9+ KB\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# create data frame from the list of generated random values\n", | |
| "\n", | |
| "column_names = [\"Int8\", \"Int16\", \"Int32\", \"Int64\", \"UInt8\", \"Float32\", \"Float64\", \n", | |
| " \"String\", \"Sentence\", \"Category\", \"Date\", \"DateTime\", \"Bool\"]\n", | |
| "df = pd.DataFrame(data, columns=column_names)\n", | |
| "df[\"Int8\"] = df[\"Int8\"].astype(\"int8\")\n", | |
| "df[\"Int16\"] = df[\"Int16\"].astype(\"int16\")\n", | |
| "df[\"Int32\"] = df[\"Int32\"].astype(\"int32\")\n", | |
| "df[\"UInt8\"] = df[\"UInt8\"].astype(\"uint8\")\n", | |
| "df[\"Float32\"] = df[\"Float32\"].astype(\"float32\")\n", | |
| "df[\"Category\"] = df[\"Category\"].astype(\"category\")\n", | |
| "df[\"Date\"] = pd.to_datetime(df[\"Date\"], format=\"%Y-%m-%d\")\n", | |
| "df[\"DateTime\"] = pd.to_datetime(df[\"DateTime\"], format=\"%Y-%m-%dT%H:%M:%S\")\n", | |
| "df[\"DateTime+Zone\"] = pd.to_datetime(df[\"DateTime\"], format=\"%Y-%m-%dT%H:%M:%S\")\n", | |
| "df[\"TimeDelta\"] = df[\"DateTime\"]-df[\"Date\"]\n", | |
| "#df[\"DateTime+Zone\"] = df[\"DateTime+Zone\"].dt.tz_localize('Europe/Vienna')\n", | |
| "\n", | |
| "# store for size comparison\n", | |
| "df.to_csv(\"benchmark.csv\", index=False)\n", | |
| "orig_size = Path(\"benchmark.csv\").stat().st_size\n", | |
| "\n", | |
| "df.info()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.240295Z", | |
| "start_time": "2020-08-18T21:28:13.218323Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# define the types of transformation to use\n", | |
| "exporting_types = {\n", | |
| " \"csv\": {\n", | |
| " \"type\": \"CSV\",\n", | |
| " \"extension\": \".csv\",\n", | |
| " \"write_function\": pd.DataFrame.to_csv,\n", | |
| " \"write_params\": {\"index\": False},\n", | |
| " \"read_function\": pd.read_csv,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " \"csv_zip\": {\n", | |
| " \"type\": \"CSV zip\",\n", | |
| " \"extension\": \".zip\",\n", | |
| " \"write_function\": pd.DataFrame.to_csv,\n", | |
| " \"write_params\": {\"index\": False, \"compression\": \"zip\"},\n", | |
| " \"read_function\": pd.read_csv,\n", | |
| " \"read_params\": {\"compression\": \"zip\"}\n", | |
| " },\n", | |
| " \"picklea\": {\n", | |
| " \"type\": \"Pickle bz2\",\n", | |
| " \"extension\": \".pkl.bz2\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"bz2\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"bz2\"}\n", | |
| " },\n", | |
| " \"pickleb\": {\n", | |
| " \"type\": \"Pickle gzip\",\n", | |
| " \"extension\": \".pkl.gzip\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"gzip\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"gzip\"}\n", | |
| " },\n", | |
| " \"picklec\": {\n", | |
| " \"type\": \"Pickle zip\",\n", | |
| " \"extension\": \".pkl.zip\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"zip\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"zip\"}\n", | |
| " },\n", | |
| " \"pickled\": {\n", | |
| " \"type\": \"Pickle infer\",\n", | |
| " \"extension\": \".pkl\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " \"picklee\": {\n", | |
| " \"type\": \"Pickle xz\",\n", | |
| " \"extension\": \".pkl.xz\",\n", | |
| " \"write_function\": pd.DataFrame.to_pickle,\n", | |
| " \"write_params\": {\"compression\": \"xz\"},\n", | |
| " \"read_function\": pd.read_pickle,\n", | |
| " \"read_params\": {\"compression\": \"xz\"}\n", | |
| " },\n", | |
| " \"parquet_pyarrow\": {\n", | |
| " \"type\": \"Parquet via PyArrow\",\n", | |
| " \"extension\": \".parquet.gzip\",\n", | |
| " \"write_function\": pd.DataFrame.to_parquet,\n", | |
| " \"write_params\": {},\n", | |
| " \"read_function\": pd.read_parquet,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " \"parquet_fastparquet\": {\n", | |
| " \"type\": \"Parquet via fastparquet\",\n", | |
| " \"extension\": \".parquet.gzip\",\n", | |
| " \"write_function\": pd.DataFrame.to_parquet,\n", | |
| " \"write_params\": {\"engine\":\"fastparquet\",\"compression\":\"GZIP\"},\n", | |
| " \"read_function\": pd.read_parquet,\n", | |
| " \"read_params\": {\"engine\":\"fastparquet\"}\n", | |
| " }, \n", | |
| " \"Hdf5f\": {\n", | |
| " \"type\": \"Hdf5 fixed\",\n", | |
| " \"extension\": \".h5\",\n", | |
| " \"write_function\": pd.DataFrame.to_hdf,\n", | |
| " \"write_params\": {\"key\":\"df\", \"format\":\"fixed\"},\n", | |
| " \"read_function\": pd.read_hdf,\n", | |
| " \"read_params\": {\"key\":\"df\"}\n", | |
| " },\n", | |
| " \"Hdf5t\": {\n", | |
| " \"type\": \"Hdf5 table\",\n", | |
| " \"extension\": \".h5\",\n", | |
| " \"write_function\": pd.DataFrame.to_hdf,\n", | |
| " \"write_params\": {\"key\":\"df\", \"format\":\"table\"},\n", | |
| " \"read_function\": pd.read_hdf,\n", | |
| " \"read_params\": {\"key\":\"df\"}\n", | |
| " },\n", | |
| " \"Hdf5c\": {\n", | |
| " \"type\": \"Hdf5 compressed\",\n", | |
| " \"extension\": \".h5\",\n", | |
| " \"write_function\": pd.DataFrame.to_hdf,\n", | |
| " \"write_params\": {\"key\":\"df\", \"format\":\"table\", \"complevel\": 9, \"complib\": \"bzip2\"},\n", | |
| " \"read_function\": pd.read_hdf,\n", | |
| " \"read_params\": {\"key\":\"df\"}\n", | |
| " },\n", | |
| " \"Excel\": {\n", | |
| " \"type\": \"Excel\",\n", | |
| " \"extension\": \".xlsx\",\n", | |
| " \"write_function\": pd.DataFrame.to_excel,\n", | |
| " \"write_params\": {\"index\":False},\n", | |
| " \"read_function\": pd.read_excel,\n", | |
| " \"read_params\": {}\n", | |
| " },\n", | |
| " # to_sql and read_sql have too much different syntax\n", | |
| " #\"SQL\": {\n", | |
| " # \"type\": \"SQL\",\n", | |
| " # \"extension\": \"\",\n", | |
| " # \"write_function\": pd.DataFrame.to_sql,\n", | |
| " # \"write_params\": {\"name\": \"pandas\", \"con\": engine},\n", | |
| " # \"read_function\": pd.read_sql,\n", | |
| " # \"read_params\": {\"name\": \"pandas\", \"con\": engine}\n", | |
| " #},\n", | |
| " # feather file gets blocked by ArrowIO after the first run\n", | |
| " \"Feather\": {\n", | |
| " \"type\": \"Feather\",\n", | |
| " \"extension\": \".f\",\n", | |
| " \"write_function\": pd.DataFrame.to_feather,\n", | |
| " \"write_params\": {},\n", | |
| " \"read_function\": pd.read_feather,\n", | |
| " \"read_params\": {}\n", | |
| " }\n", | |
| "}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.251235Z", | |
| "start_time": "2020-08-18T21:28:13.243254Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def repeat(f: dict, repetitions: int, df: pd.DataFrame, file: str) -> dict:\n", | |
| " \"\"\" Perform an operation specified by input dict number of repetition times on the data frame\"\"\"\n", | |
| " \n", | |
| " writes_elapsed = []\n", | |
| " reads_elapsed = []\n", | |
| " for r in range(repetitions):\n", | |
| " \n", | |
| " # write\n", | |
| " start_time = time.time()\n", | |
| " \n", | |
| " # first parameter is self (the dataframe), second the path and then **kwargs\n", | |
| " f[\"write_function\"](df, file, **f[\"write_params\"])\n", | |
| " write_elapsed = time.time() - start_time\n", | |
| " writes_elapsed.append(write_elapsed)\n", | |
| " \n", | |
| " # read\n", | |
| " start_time = time.time()\n", | |
| " df_loaded = f[\"read_function\"](file, **f[\"read_params\"])\n", | |
| " read_elapsed = time.time() - start_time\n", | |
| " reads_elapsed.append(read_elapsed)\n", | |
| " \n", | |
| " return {\"write\": sum(writes_elapsed)/len(writes_elapsed),\n", | |
| " \"read\": sum(reads_elapsed)/len(reads_elapsed),\n", | |
| " \"df_loaded\": df_loaded}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.263206Z", | |
| "start_time": "2020-08-18T21:28:13.253229Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def statistics(df: pd.DataFrame, df_loaded: pd.DataFrame, new_file: str, \n", | |
| " tp: str, benchmark: dict, out: dict, orig_size: int) -> pd.DataFrame:\n", | |
| " \"\"\"Generate statistics based on performance test results\"\"\"\n", | |
| " \n", | |
| " comparison_df = pd.DataFrame({\"orig\": df.dtypes, \"new\": df_loaded.dtypes})\n", | |
| " comparison_df[\"same\"] = comparison_df[\"orig\"]==comparison_df[\"new\"]\n", | |
| " #mismatched_types = comparison_df[comparison_df[\"same\"]==False].index.to_list()\n", | |
| " \n", | |
| " new_size = Path(new_file).stat().st_size\n", | |
| " \n", | |
| " return pd.DataFrame({\n", | |
| " \"dtype_preservation\": comparison_df[\"same\"].sum()/comparison_df[\"same\"].count(),\n", | |
| " \"compression\": new_size/orig_size,\n", | |
| " \"write_time\": out[\"write\"]/benchmark[\"write\"],\n", | |
| " \"read_time\": out[\"read\"]/benchmark[\"read\"],\n", | |
| " #\"mismatched_types\": mismatched_types\n", | |
| " }, \n", | |
| " index=[tp]), comparison_df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:13.275172Z", | |
| "start_time": "2020-08-18T21:28:13.265198Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def performance_test(exporting_types: dict, df: pd.DataFrame, \n", | |
| " out_file: str=\"out\", repetitions: int = 7) -> pd.DataFrame:\n", | |
| " \n", | |
| " \"\"\"Run performance test for predefined dict of operations\"\"\"\n", | |
| "\n", | |
| " results = []\n", | |
| " for k,v in exporting_types.items():\n", | |
| "\n", | |
| " # create a file name\n", | |
| " new_file = out_file + v[\"extension\"]\n", | |
| " \n", | |
| " try:\n", | |
| " # repeat the writing and reading several times\n", | |
| " out = repeat(v, repetitions, df, new_file)\n", | |
| "\n", | |
| " # CSV is the first one and it's set as benchmark for reading and writing times\n", | |
| " if v[\"type\"] == \"CSV\":\n", | |
| " benchmark = out\n", | |
| " df.to_csv(\"benchmark.csv\", index=False)\n", | |
| " orig_size = Path(\"benchmark.csv\").stat().st_size\n", | |
| "\n", | |
| " # process the results - dtypes_preservation, compression, write and read_time\n", | |
| " results.append(statistics(df, out[\"df_loaded\"], new_file, v[\"type\"], benchmark, out, orig_size)[0])\n", | |
| " \n", | |
| " except Exception as e:\n", | |
| " print(f\"{k} failed - {e}\")\n", | |
| "\n", | |
| " return pd.concat(results)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:55.118672Z", | |
| "start_time": "2020-08-18T21:28:13.277164Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "parquet_pyarrow failed - ('Unsupported numpy type 22', 'Conversion failed for column TimeDelta with type timedelta64[ns]')\n", | |
| "Hdf5f failed - Cannot store a category dtype in a HDF5 dataset that uses format=\"fixed\". Use format=\"table\".\n", | |
| "Feather failed - ('Unsupported numpy type 22', 'Conversion failed for column TimeDelta with type timedelta64[ns]')\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<style type=\"text/css\" >\n", | |
| "</style><table id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >dtype_preservation</th> <th class=\"col_heading level0 col1\" >compression</th> <th class=\"col_heading level0 col2\" >write_time</th> <th class=\"col_heading level0 col3\" >read_time</th> </tr></thead><tbody>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row0\" class=\"row_heading level0 row0\" >CSV</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col0\" class=\"data row0 col0\" >33.33%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col1\" class=\"data row0 col1\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col2\" class=\"data row0 col2\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row0_col3\" class=\"data row0 col3\" >100.00%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row1\" class=\"row_heading level0 row1\" >CSV zip</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col0\" class=\"data row1 col0\" >33.33%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col1\" class=\"data row1 col1\" >48.30%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col2\" class=\"data row1 col2\" >136.77%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row1_col3\" class=\"data row1 col3\" >137.51%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row2\" class=\"row_heading level0 row2\" >Pickle bz2</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col0\" class=\"data row2 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col1\" class=\"data row2 col1\" >44.04%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col2\" class=\"data row2 col2\" >37.22%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row2_col3\" class=\"data row2 col3\" >140.98%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row3\" class=\"row_heading level0 row3\" >Pickle gzip</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col0\" class=\"data row3 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col1\" class=\"data row3 col1\" >44.47%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col2\" class=\"data row3 col2\" >25.71%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row3_col3\" class=\"data row3 col3\" >38.20%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row4\" class=\"row_heading level0 row4\" >Pickle zip</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col0\" class=\"data row4 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col1\" class=\"data row4 col1\" >44.48%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col2\" class=\"data row4 col2\" >20.51%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row4_col3\" class=\"data row4 col3\" >40.64%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row5\" class=\"row_heading level0 row5\" >Pickle infer</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col0\" class=\"data row5 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col1\" class=\"data row5 col1\" >55.01%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col2\" class=\"data row5 col2\" >1.91%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row5_col3\" class=\"data row5 col3\" >24.70%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row6\" class=\"row_heading level0 row6\" >Pickle xz</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col0\" class=\"data row6 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col1\" class=\"data row6 col1\" >38.01%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col2\" class=\"data row6 col2\" >123.95%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row6_col3\" class=\"data row6 col3\" >168.34%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row7\" class=\"row_heading level0 row7\" >Parquet via fastparquet</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col0\" class=\"data row7 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col1\" class=\"data row7 col1\" >43.66%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col2\" class=\"data row7 col2\" >192.23%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row7_col3\" class=\"data row7 col3\" >330.38%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row8\" class=\"row_heading level0 row8\" >Hdf5 table</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col0\" class=\"data row8 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col1\" class=\"data row8 col1\" >25818.87%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col2\" class=\"data row8 col2\" >38.61%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row8_col3\" class=\"data row8 col3\" >173.34%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row9\" class=\"row_heading level0 row9\" >Hdf5 compressed</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col0\" class=\"data row9 col0\" >100.00%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col1\" class=\"data row9 col1\" >25819.12%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col2\" class=\"data row9 col2\" >87.69%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row9_col3\" class=\"data row9 col3\" >311.56%</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6level0_row10\" class=\"row_heading level0 row10\" >Excel</th>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col0\" class=\"data row10 col0\" >53.33%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col1\" class=\"data row10 col1\" >75.17%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col2\" class=\"data row10 col2\" >1307.60%</td>\n", | |
| " <td id=\"T_d19b5a46_e199_11ea_ab65_50e08598ebb6row10_col3\" class=\"data row10 col3\" >3539.83%</td>\n", | |
| " </tr>\n", | |
| " </tbody></table>" | |
| ], | |
| "text/plain": [ | |
| "<pandas.io.formats.style.Styler at 0x21fe73a5d08>" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# performance test\n", | |
| "performance_df = performance_test(exporting_types, df)\n", | |
| "\n", | |
| "# results\n", | |
| "performance_df.style.format(\"{:.2%}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:56.196130Z", | |
| "start_time": "2020-08-18T21:28:55.121632Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# display the graph with the results\n", | |
| "fig = pe.bar(performance_df.T, barmode='group', text=\"value\")\n", | |
| " \n", | |
| "fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')\n", | |
| "fig.update_layout(title=f\"Statistics for {dataset_size} records\")\n", | |
| "fig.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-08-18T21:28:56.294900Z", | |
| "start_time": "2020-08-18T21:28:56.201117Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "fig = pe.bar(performance_df.loc[[\"CSV\",\"Pickle infer\",\"Parquet via fastparquet\", \"Hdf5 table\", \"Excel\"]].T, \n", | |
| " barmode='group', text=\"value\")\n", | |
| "fig.update_traces(texttemplate='%{text:.2%}', textposition='auto')\n", | |
| "fig.update_layout(title=f\"Statistics for {dataset_size} records - performance test 7 repetititons\", \n", | |
| " yaxis={\"type\": \"log\", \"title\": \"value % (log scale)\"})\n", | |
| "fig.show()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| }, | |
| "varInspector": { | |
| "cols": { | |
| "lenName": 16, | |
| "lenType": 16, | |
| "lenVar": 40 | |
| }, | |
| "kernels_config": { | |
| "python": { | |
| "delete_cmd_postfix": "", | |
| "delete_cmd_prefix": "del ", | |
| "library": "var_list.py", | |
| "varRefreshCmd": "print(var_dic_list())" | |
| }, | |
| "r": { | |
| "delete_cmd_postfix": ") ", | |
| "delete_cmd_prefix": "rm(", | |
| "library": "var_list.r", | |
| "varRefreshCmd": "cat(var_dic_list()) " | |
| } | |
| }, | |
| "types_to_exclude": [ | |
| "module", | |
| "function", | |
| "builtin_function_or_method", | |
| "instance", | |
| "_Feature" | |
| ], | |
| "window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
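The output above shows three formats failing: Parquet via pyarrow and Feather cannot serialise the timedelta64[ns] TimeDelta column, and the fixed-format HDF5 store rejects the category column. A possible workaround, not part of the original gist and one that deliberately gives up the offending dtypes, is to convert them before writing:

```python
# hypothetical pre-processing sketch (assumption: losing the timedelta/category dtypes is acceptable)
df_compat = df.copy()
df_compat["TimeDelta"] = df_compat["TimeDelta"].dt.total_seconds()  # timedelta64[ns] -> float64 seconds
df_compat["Category"] = df_compat["Category"].astype(str)           # category -> object

df_compat.to_parquet("out.parquet.gzip")             # now accepted by the default pyarrow engine
df_compat.to_feather("out.f")
df_compat.to_hdf("out.h5", key="df", format="fixed")
```

Note that this trades dtype fidelity for format support, so the dtype_preservation score of those formats would drop accordingly.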