Last active
March 2, 2020 22:00
-
-
Save xmnlab/8488e7b0b7988407521ce4cdc273fc4c to your computer and use it in GitHub Desktop.
Ibis-NTiles-OmniSciDB-vs-PostgreSQL.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"import traceback\n", | |
"\n", | |
"import ibis\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import sqlalchemy as sa" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'1.3.0+1.g0b65f4ad.dirty'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ibis.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def random_salary(v_min, v_max): \n", | |
" return float(random.randint(v_min, v_max))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def cursor2df(cursor):\n", | |
" col_names = [c.name for c in cursor.description]\n", | |
" return pd.DataFrame(cursor.fetchall(), columns=col_names)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"omniscidb_conf = dict(\n", | |
" host='localhost', port='6274', \n", | |
" user='admin', password='HyperInteractive', \n", | |
" database='ibis_testing'\n", | |
")\n", | |
"\n", | |
"omniscidb = ibis.omniscidb.connect(**omniscidb_conf)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'5.2.0dev-20200301-0e1eaa97b1'" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# show database version\n", | |
"omniscidb.con._client.get_version()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['diamonds', 'batting', 'awards_players', 'functional_alltypes', 'geo']" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"omniscidb.list_tables()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"psql_conf = dict(\n", | |
" host='localhost', \n", | |
" port='5432', \n", | |
" user='postgres', \n", | |
" password='postgres', \n", | |
" database='ibis_testing'\n", | |
")\n", | |
"psql = ibis.postgres.connect(**psql_conf)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['array_types',\n", | |
" 'awards_players',\n", | |
" 'batting',\n", | |
" 'diamonds',\n", | |
" 'films',\n", | |
" 'functional_alltypes',\n", | |
" 'geo',\n", | |
" 'geography_columns',\n", | |
" 'geometry_columns',\n", | |
" 'intervals',\n", | |
" 'not_supported_intervals',\n", | |
" 'raster_columns',\n", | |
" 'raster_overviews',\n", | |
" 'spatial_ref_sys',\n", | |
" 'tzone']" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"psql.list_tables()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Salary table analysis" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dept = {\n", | |
" 'AC': 'Accounting',\n", | |
" 'AD': 'Administration',\n", | |
" 'EX': 'Executive',\n", | |
" 'FI': 'Finance',\n", | |
" 'HR': 'Human Resorce',\n", | |
" 'IT': 'IT',\n", | |
" 'PU': 'Purchasing',\n", | |
" 'SA': 'Sales',\n", | |
" 'SH': 'Shipping'\n", | |
"}\n", | |
"\n", | |
"num_rows = 31\n", | |
"\n", | |
"df_test = pd.DataFrame({\n", | |
" 'first_name': ['FirstName{}'.format(i) for i in range(num_rows)],\n", | |
" 'last_name': ['LastName{}'.format(i) for i in range(num_rows)],\n", | |
" 'department_name': (\n", | |
" [dept['AC']] * 2 \n", | |
" + [dept['AD']] * 1\n", | |
" + [dept['EX']] * 3\n", | |
" + [dept['FI']] * 6\n", | |
" + [dept['HR']] * 1\n", | |
" + [dept['IT']] * 2\n", | |
" + [dept['PU']] * 3\n", | |
" + [dept['SA']] * 6\n", | |
" + [dept['SH']] * 7\n", | |
" ),\n", | |
" 'salary':[random_salary(1000, 12000) for i in range(num_rows)] \n", | |
"})\n", | |
"# display(df_test)\n", | |
"\n", | |
"# PSQL\n", | |
"sch = ibis.schema([\n", | |
" ('first_name', 'string'),\n", | |
" ('last_name', 'string'),\n", | |
" ('department_name', 'string'),\n", | |
" ('salary', 'float64'),\n", | |
"])\n", | |
"table_name = 'salary'\n", | |
"psql.drop_table(table_name, force=True)\n", | |
"psql.create_table(table_name, schema=sch)\n", | |
"df_test.to_sql(table_name, con=psql.con, if_exists='replace', index=False)\n", | |
"\n", | |
"# OMNISCIDB\n", | |
"omniscidb.drop_table(table_name, force=True)\n", | |
"omniscidb.create_table(table_name, schema=sch)\n", | |
"omniscidb.load_data(table_name, df_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# omniscidb\n", | |
"omni_t = omniscidb.table('salary')\n", | |
"omni_df = omni_t.execute()\n", | |
"# psql\n", | |
"psql_t = psql.table('salary')\n", | |
"psql_df = psql_t.execute()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# check if the table result is the same\n", | |
"pd.testing.assert_frame_equal(omni_df, psql_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"PSQL >>> SELECT t0.first_name, t0.last_name, t0.department_name, t0.salary, ntile(%(param_1)s) OVER (PARTITION BY t0.department_name ORDER BY t0.salary ASC) - %(param_2)s AS salary_group \n", | |
"FROM salary AS t0 \n", | |
"\n", | |
"OMNI >>> SELECT *,\n", | |
" ntile(2) OVER (PARTITION BY \"department_name\" ORDER BY \"salary\") - 1 AS salary_group\n", | |
"FROM salary\n" | |
] | |
} | |
], | |
"source": [ | |
"result_fn = lambda t, win: t.department_name.ntile(buckets=2).over(win)\n", | |
"expected_fn = lambda t: ntile(t.department_name, bucket=2)\n", | |
"\n", | |
"create_expr = lambda t: (\n", | |
" t.mutate(\n", | |
" salary_group=result_fn(\n", | |
" t,\n", | |
" win=ibis.window(\n", | |
" following=0,\n", | |
" group_by=[t.department_name],\n", | |
" order_by=[t.salary],\n", | |
" ),\n", | |
" )\n", | |
" )\n", | |
")\n", | |
"\n", | |
"omni_expr = create_expr(omni_t)\n", | |
"psql_expr = create_expr(psql_t)\n", | |
"\n", | |
"print('PSQL >>>', psql_expr.compile(), '\\n')\n", | |
"print('OMNI >>>', omni_expr.compile())\n", | |
"\n", | |
"k_sort = ['department_name', 'salary']\n", | |
"\n", | |
"omni_result = (\n", | |
" omni_expr.execute()\n", | |
" .drop(columns='rowid')\n", | |
" .sort_values(k_sort)\n", | |
" .reset_index(drop=True)\n", | |
")\n", | |
"psql_result = (\n", | |
" psql_expr.execute()\n", | |
" .sort_values(k_sort)\n", | |
" .reset_index(drop=True)\n", | |
")\n", | |
"\n", | |
"pd.testing.assert_frame_equal(omni_result, psql_result)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Functional All types table analysis" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"k = ['index', 'id', 'int_col', 'double_col', 'string_col', 'float_col']\n", | |
"\n", | |
"omni_alltypes = omniscidb.table('functional_alltypes')[k]\n", | |
"psql_alltypes = psql.table('functional_alltypes')[k]\n", | |
"\n", | |
"\n", | |
"# k = ['index', 'id', 'string_col', 'float_col']\n", | |
"\n", | |
"omni_alltypes_df = omni_alltypes.execute()\n", | |
"psql_alltypes_df = psql_alltypes.execute()\n", | |
"\n", | |
"pd.testing.assert_frame_equal(omni_alltypes_df, psql_alltypes_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"result_alltypes_fn = lambda t, win: t.string_col.ntile(buckets=7).over(win)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"OMNI >>> SELECT *, ntile(7) OVER (PARTITION BY \"string_col\" ORDER BY \"id\") - 1 AS val\n", | |
"FROM (\n", | |
" SELECT \"index\", \"id\", \"int_col\", \"double_col\", \"string_col\", \"float_col\"\n", | |
" FROM functional_alltypes\n", | |
") t0\n", | |
"\n", | |
"PSQL >>> SELECT t0.index, t0.id, t0.int_col, t0.double_col, t0.string_col, t0.float_col, ntile(%(param_1)s) OVER (PARTITION BY t0.string_col ORDER BY t0.id ASC) - %(param_2)s AS val \n", | |
"FROM (SELECT t1.index AS index, t1.id AS id, t1.int_col AS int_col, t1.double_col AS double_col, t1.string_col AS string_col, t1.float_col AS float_col \n", | |
"FROM functional_alltypes AS t1) AS t0\n" | |
] | |
} | |
], | |
"source": [ | |
"omni_alltypes_expr = omni_alltypes.mutate(\n", | |
" val=result_alltypes_fn(\n", | |
" omni_alltypes,\n", | |
" win=ibis.window(\n", | |
" following=0,\n", | |
" group_by=[omni_alltypes.string_col],\n", | |
" order_by=[omni_alltypes.id],\n", | |
" ),\n", | |
" )\n", | |
")\n", | |
"print('OMNI >>>', omni_alltypes_expr.compile())\n", | |
"\n", | |
"psql_alltypes_expr = psql_alltypes.mutate(\n", | |
" val=result_alltypes_fn(\n", | |
" psql_alltypes,\n", | |
" win=ibis.window(\n", | |
" following=0,\n", | |
" group_by=[psql_alltypes.string_col],\n", | |
" order_by=[psql_alltypes.id],\n", | |
" ),\n", | |
" )\n", | |
")\n", | |
"print('\\nPSQL >>>', psql_alltypes_expr.compile())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Traceback (most recent call last):\n", | |
" File \"<ipython-input-17-b8cd0ac59782>\", line 7, in <module>\n", | |
" psql_alltypes_result.val\n", | |
" File \"/home/xmn/miniconda3/envs/ibis-nb/lib/python3.7/site-packages/pandas/_testing.py\", line 1186, in assert_series_equal\n", | |
" obj=str(obj),\n", | |
" File \"pandas/_libs/testing.pyx\", line 65, in pandas._libs.testing.assert_almost_equal\n", | |
" File \"pandas/_libs/testing.pyx\", line 174, in pandas._libs.testing.assert_almost_equal\n", | |
" File \"/home/xmn/miniconda3/envs/ibis-nb/lib/python3.7/site-packages/pandas/_testing.py\", line 915, in raise_assert_detail\n", | |
" raise AssertionError(msg)\n", | |
"AssertionError: Series are different\n", | |
"\n", | |
"Series values are different (1.36986 %)\n", | |
"[left]: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...]\n", | |
"[right]: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...]\n" | |
] | |
} | |
], | |
"source": [ | |
"omni_alltypes_result = omni_alltypes_expr.execute().set_index('index').sort_index()\n", | |
"psql_alltypes_result = psql_alltypes_expr.execute().set_index('index').sort_index()\n", | |
"\n", | |
"try:\n", | |
" pd.testing.assert_series_equal(\n", | |
" omni_alltypes_result.val,\n", | |
" psql_alltypes_result.val\n", | |
" )\n", | |
"except Exception:\n", | |
" traceback.print_exc()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Debug\n", | |
"\n", | |
"Premiss:\n", | |
"\n", | |
"```\n", | |
"SELECT *, ntile(7) OVER (PARTITION BY \"string_col\" ORDER BY \"id\") - 1 AS val\n", | |
"FROM (\n", | |
" SELECT \"index\", \"id\", \"string_col\", \"float_col\"\n", | |
" FROM functional_alltypes\n", | |
") t0\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>string_col</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>8</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>9</td>\n", | |
" <td>730</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" string_col count\n", | |
"0 0 730\n", | |
"1 1 730\n", | |
"2 2 730\n", | |
"3 3 730\n", | |
"4 4 730\n", | |
"5 5 730\n", | |
"6 6 730\n", | |
"7 7 730\n", | |
"8 8 730\n", | |
"9 9 730" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t = omniscidb.table('functional_alltypes')\n", | |
"t.group_by('string_col').count().execute()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Grouped by string_col, there are total 10 groups (0 to 9), where each group has 730 rows.\n", | |
"\n", | |
"Using a bucket 7, ntile should divide 730 by 7, so the result should be\n", | |
"from 0 to 6 where each sub-group should countain 104 row and \n", | |
"the first 2 groups should contain 1 more row." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"104" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"730//7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"730 - 104*7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"string_col val\n", | |
"0 0 105\n", | |
" 1 105\n", | |
" 2 105\n", | |
" 3 105\n", | |
" 4 105\n", | |
" 5 105\n", | |
" 6 100\n", | |
"Name: val, dtype: int64" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"omni_alltypes_result.groupby(['string_col', 'val'])['val'].count()[:7]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"string_col val\n", | |
"0 0 105\n", | |
" 1 105\n", | |
" 2 104\n", | |
" 3 104\n", | |
" 4 104\n", | |
" 5 104\n", | |
" 6 104\n", | |
"Name: val, dtype: int64" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"psql_alltypes_result.groupby(['string_col', 'val'])['val'].count()[:7]" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment