Skip to content

Instantly share code, notes, and snippets.

@xmnlab
Last active March 2, 2020 22:00
Show Gist options
  • Save xmnlab/8488e7b0b7988407521ce4cdc273fc4c to your computer and use it in GitHub Desktop.
Save xmnlab/8488e7b0b7988407521ce4cdc273fc4c to your computer and use it in GitHub Desktop.
Ibis-NTiles-OmniSciDB-vs-PostgreSQL.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import traceback\n",
"\n",
"import ibis\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sqlalchemy as sa"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.3.0+1.g0b65f4ad.dirty'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibis.__version__"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def random_salary(v_min, v_max): \n",
" return float(random.randint(v_min, v_max))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def cursor2df(cursor):\n",
" col_names = [c.name for c in cursor.description]\n",
" return pd.DataFrame(cursor.fetchall(), columns=col_names)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"omniscidb_conf = dict(\n",
" host='localhost', port='6274', \n",
" user='admin', password='HyperInteractive', \n",
" database='ibis_testing'\n",
")\n",
"\n",
"omniscidb = ibis.omniscidb.connect(**omniscidb_conf)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'5.2.0dev-20200301-0e1eaa97b1'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show database version\n",
"omniscidb.con._client.get_version()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['diamonds', 'batting', 'awards_players', 'functional_alltypes', 'geo']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"omniscidb.list_tables()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"psql_conf = dict(\n",
" host='localhost', \n",
" port='5432', \n",
" user='postgres', \n",
" password='postgres', \n",
" database='ibis_testing'\n",
")\n",
"psql = ibis.postgres.connect(**psql_conf)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['array_types',\n",
" 'awards_players',\n",
" 'batting',\n",
" 'diamonds',\n",
" 'films',\n",
" 'functional_alltypes',\n",
" 'geo',\n",
" 'geography_columns',\n",
" 'geometry_columns',\n",
" 'intervals',\n",
" 'not_supported_intervals',\n",
" 'raster_columns',\n",
" 'raster_overviews',\n",
" 'spatial_ref_sys',\n",
" 'tzone']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"psql.list_tables()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Salary table analysis"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"dept = {\n",
" 'AC': 'Accounting',\n",
" 'AD': 'Administration',\n",
" 'EX': 'Executive',\n",
" 'FI': 'Finance',\n",
" 'HR': 'Human Resorce',\n",
" 'IT': 'IT',\n",
" 'PU': 'Purchasing',\n",
" 'SA': 'Sales',\n",
" 'SH': 'Shipping'\n",
"}\n",
"\n",
"num_rows = 31\n",
"\n",
"df_test = pd.DataFrame({\n",
" 'first_name': ['FirstName{}'.format(i) for i in range(num_rows)],\n",
" 'last_name': ['LastName{}'.format(i) for i in range(num_rows)],\n",
" 'department_name': (\n",
" [dept['AC']] * 2 \n",
" + [dept['AD']] * 1\n",
" + [dept['EX']] * 3\n",
" + [dept['FI']] * 6\n",
" + [dept['HR']] * 1\n",
" + [dept['IT']] * 2\n",
" + [dept['PU']] * 3\n",
" + [dept['SA']] * 6\n",
" + [dept['SH']] * 7\n",
" ),\n",
" 'salary':[random_salary(1000, 12000) for i in range(num_rows)] \n",
"})\n",
"# display(df_test)\n",
"\n",
"# PSQL\n",
"sch = ibis.schema([\n",
" ('first_name', 'string'),\n",
" ('last_name', 'string'),\n",
" ('department_name', 'string'),\n",
" ('salary', 'float64'),\n",
"])\n",
"table_name = 'salary'\n",
"psql.drop_table(table_name, force=True)\n",
"psql.create_table(table_name, schema=sch)\n",
"df_test.to_sql(table_name, con=psql.con, if_exists='replace', index=False)\n",
"\n",
"# OMNISCIDB\n",
"omniscidb.drop_table(table_name, force=True)\n",
"omniscidb.create_table(table_name, schema=sch)\n",
"omniscidb.load_data(table_name, df_test)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# omniscidb\n",
"omni_t = omniscidb.table('salary')\n",
"omni_df = omni_t.execute()\n",
"# psql\n",
"psql_t = psql.table('salary')\n",
"psql_df = psql_t.execute()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# check if the table result is the same\n",
"pd.testing.assert_frame_equal(omni_df, psql_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PSQL >>> SELECT t0.first_name, t0.last_name, t0.department_name, t0.salary, ntile(%(param_1)s) OVER (PARTITION BY t0.department_name ORDER BY t0.salary ASC) - %(param_2)s AS salary_group \n",
"FROM salary AS t0 \n",
"\n",
"OMNI >>> SELECT *,\n",
" ntile(2) OVER (PARTITION BY \"department_name\" ORDER BY \"salary\") - 1 AS salary_group\n",
"FROM salary\n"
]
}
],
"source": [
"result_fn = lambda t, win: t.department_name.ntile(buckets=2).over(win)\n",
"expected_fn = lambda t: ntile(t.department_name, bucket=2)\n",
"\n",
"create_expr = lambda t: (\n",
" t.mutate(\n",
" salary_group=result_fn(\n",
" t,\n",
" win=ibis.window(\n",
" following=0,\n",
" group_by=[t.department_name],\n",
" order_by=[t.salary],\n",
" ),\n",
" )\n",
" )\n",
")\n",
"\n",
"omni_expr = create_expr(omni_t)\n",
"psql_expr = create_expr(psql_t)\n",
"\n",
"print('PSQL >>>', psql_expr.compile(), '\\n')\n",
"print('OMNI >>>', omni_expr.compile())\n",
"\n",
"k_sort = ['department_name', 'salary']\n",
"\n",
"omni_result = (\n",
" omni_expr.execute()\n",
" .drop(columns='rowid')\n",
" .sort_values(k_sort)\n",
" .reset_index(drop=True)\n",
")\n",
"psql_result = (\n",
" psql_expr.execute()\n",
" .sort_values(k_sort)\n",
" .reset_index(drop=True)\n",
")\n",
"\n",
"pd.testing.assert_frame_equal(omni_result, psql_result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Functional All types table analysis"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"k = ['index', 'id', 'int_col', 'double_col', 'string_col', 'float_col']\n",
"\n",
"omni_alltypes = omniscidb.table('functional_alltypes')[k]\n",
"psql_alltypes = psql.table('functional_alltypes')[k]\n",
"\n",
"\n",
"# k = ['index', 'id', 'string_col', 'float_col']\n",
"\n",
"omni_alltypes_df = omni_alltypes.execute()\n",
"psql_alltypes_df = psql_alltypes.execute()\n",
"\n",
"pd.testing.assert_frame_equal(omni_alltypes_df, psql_alltypes_df)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"result_alltypes_fn = lambda t, win: t.string_col.ntile(buckets=7).over(win)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OMNI >>> SELECT *, ntile(7) OVER (PARTITION BY \"string_col\" ORDER BY \"id\") - 1 AS val\n",
"FROM (\n",
" SELECT \"index\", \"id\", \"int_col\", \"double_col\", \"string_col\", \"float_col\"\n",
" FROM functional_alltypes\n",
") t0\n",
"\n",
"PSQL >>> SELECT t0.index, t0.id, t0.int_col, t0.double_col, t0.string_col, t0.float_col, ntile(%(param_1)s) OVER (PARTITION BY t0.string_col ORDER BY t0.id ASC) - %(param_2)s AS val \n",
"FROM (SELECT t1.index AS index, t1.id AS id, t1.int_col AS int_col, t1.double_col AS double_col, t1.string_col AS string_col, t1.float_col AS float_col \n",
"FROM functional_alltypes AS t1) AS t0\n"
]
}
],
"source": [
"omni_alltypes_expr = omni_alltypes.mutate(\n",
" val=result_alltypes_fn(\n",
" omni_alltypes,\n",
" win=ibis.window(\n",
" following=0,\n",
" group_by=[omni_alltypes.string_col],\n",
" order_by=[omni_alltypes.id],\n",
" ),\n",
" )\n",
")\n",
"print('OMNI >>>', omni_alltypes_expr.compile())\n",
"\n",
"psql_alltypes_expr = psql_alltypes.mutate(\n",
" val=result_alltypes_fn(\n",
" psql_alltypes,\n",
" win=ibis.window(\n",
" following=0,\n",
" group_by=[psql_alltypes.string_col],\n",
" order_by=[psql_alltypes.id],\n",
" ),\n",
" )\n",
")\n",
"print('\\nPSQL >>>', psql_alltypes_expr.compile())"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Traceback (most recent call last):\n",
" File \"<ipython-input-17-b8cd0ac59782>\", line 7, in <module>\n",
" psql_alltypes_result.val\n",
" File \"/home/xmn/miniconda3/envs/ibis-nb/lib/python3.7/site-packages/pandas/_testing.py\", line 1186, in assert_series_equal\n",
" obj=str(obj),\n",
" File \"pandas/_libs/testing.pyx\", line 65, in pandas._libs.testing.assert_almost_equal\n",
" File \"pandas/_libs/testing.pyx\", line 174, in pandas._libs.testing.assert_almost_equal\n",
" File \"/home/xmn/miniconda3/envs/ibis-nb/lib/python3.7/site-packages/pandas/_testing.py\", line 915, in raise_assert_detail\n",
" raise AssertionError(msg)\n",
"AssertionError: Series are different\n",
"\n",
"Series values are different (1.36986 %)\n",
"[left]: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...]\n",
"[right]: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...]\n"
]
}
],
"source": [
"omni_alltypes_result = omni_alltypes_expr.execute().set_index('index').sort_index()\n",
"psql_alltypes_result = psql_alltypes_expr.execute().set_index('index').sort_index()\n",
"\n",
"try:\n",
" pd.testing.assert_series_equal(\n",
" omni_alltypes_result.val,\n",
" psql_alltypes_result.val\n",
" )\n",
"except Exception:\n",
" traceback.print_exc()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Debug\n",
"\n",
"Premiss:\n",
"\n",
"```\n",
"SELECT *, ntile(7) OVER (PARTITION BY \"string_col\" ORDER BY \"id\") - 1 AS val\n",
"FROM (\n",
" SELECT \"index\", \"id\", \"string_col\", \"float_col\"\n",
" FROM functional_alltypes\n",
") t0\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>string_col</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>6</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8</td>\n",
" <td>730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>9</td>\n",
" <td>730</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" string_col count\n",
"0 0 730\n",
"1 1 730\n",
"2 2 730\n",
"3 3 730\n",
"4 4 730\n",
"5 5 730\n",
"6 6 730\n",
"7 7 730\n",
"8 8 730\n",
"9 9 730"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t = omniscidb.table('functional_alltypes')\n",
"t.group_by('string_col').count().execute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Grouped by string_col, there are total 10 groups (0 to 9), where each group has 730 rows.\n",
"\n",
"Using a bucket 7, ntile should divide 730 by 7, so the result should be\n",
"from 0 to 6 where each sub-group should countain 104 row and \n",
"the first 2 groups should contain 1 more row."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"104"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"730//7"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"730 - 104*7"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"string_col val\n",
"0 0 105\n",
" 1 105\n",
" 2 105\n",
" 3 105\n",
" 4 105\n",
" 5 105\n",
" 6 100\n",
"Name: val, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"omni_alltypes_result.groupby(['string_col', 'val'])['val'].count()[:7]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"string_col val\n",
"0 0 105\n",
" 1 105\n",
" 2 104\n",
" 3 104\n",
" 4 104\n",
" 5 104\n",
" 6 104\n",
"Name: val, dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"psql_alltypes_result.groupby(['string_col', 'val'])['val'].count()[:7]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment