Created
December 23, 2015 11:35
-
-
Save thekensta/05cf3b6a2a81fc15d58c to your computer and use it in GitHub Desktop.
Window Function Comparison
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import requests\n", | |
"from lxml import html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# TODO get Spark / Hive data \n", | |
"google = \"https://cloud.google.com/bigquery/query-reference\"\n", | |
"hive = \"https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics\"\n", | |
"mssql = \"https://msdn.microsoft.com/en-GB/library/ms189461.aspx\"\n", | |
"oracle = \"http://docs.oracle.com/cd/E11882_01/server.112/e41084/functions004.htm#SQLRF06174\"\n", | |
"redshift = \"http://docs.aws.amazon.com/redshift/latest/dg/c_Window_functions.html\"\n", | |
"spark = \"https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html\"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"## Copy and pasted text here \n", | |
"## Simply too much of a pain in the arse to scrape these from sites for current purposes\n", | |
"list_of_supported_window_functions = \"\"\"\n", | |
"\n", | |
"AVG (numeric_expr)\n", | |
"COUNT (*)\n", | |
"COUNT ([DISTINCT] field)\n", | |
"MAX (field)\n", | |
"MIN (field)\n", | |
"STDDEV (numeric_expr)\n", | |
"SUM (field)\n", | |
"CUME_DIST ()\n", | |
"DENSE_RANK ()\n", | |
"FIRST_VALUE (<field_name>)\n", | |
"LAG (<expr>[, <offset>[, <default_value>]])\n", | |
"LAST_VALUE (<field_name>)\n", | |
"LEAD (<expr>[, <offset>[, <default_value>]])\n", | |
"NTH_VALUE (<expr>, <n>)\n", | |
"NTILE (<num_buckets>)\n", | |
"PERCENT_RANK ()\n", | |
"PERCENTILE_CONT (<percentile>)\n", | |
"PERCENTILE_DISC (<percentile>)\n", | |
"RANK ()\n", | |
"RATIO_TO_REPORT (<column>)\n", | |
"ROW_NUMBER ()\n", | |
"\n", | |
"AVG Window Function\n", | |
"COUNT Window Function\n", | |
"CUME_DIST Window Function\n", | |
"DENSE_RANK Window Function\n", | |
"FIRST_VALUE\n", | |
"LAST_VALUE Window Functions\n", | |
"LAG Window Function\n", | |
"LEAD Window Function\n", | |
"LISTAGG Window Function\n", | |
"MAX Window Function\n", | |
"MEDIAN Window Function\n", | |
"MIN Window Function\n", | |
"NTH_VALUE Window Function\n", | |
"NTILE Window Function\n", | |
"PERCENT_RANK Window Function\n", | |
"PERCENTILE_CONT Window Function\n", | |
"PERCENTILE_DISC Window Function\n", | |
"RANK Window Function\n", | |
"RATIO_TO_REPORT Window Function\n", | |
"ROW_NUMBER Window Function\n", | |
"STDDEV_SAMP\n", | |
"STDDEV_POP Window Functions\n", | |
"SUM Window Function\n", | |
"VAR_SAMP \n", | |
"VAR_POP Window Functions\n", | |
"DENSE_RANK\n", | |
"NTILE\n", | |
"PERCENT_RANK\n", | |
"RANK\n", | |
"ROW_NUMBER\n", | |
"\n", | |
"AVG *\n", | |
"CORR *\n", | |
"COUNT *\n", | |
"COVAR_POP *\n", | |
"COVAR_SAMP *\n", | |
"CUME_DIST\n", | |
"DENSE_RANK\n", | |
"FIRST\n", | |
"FIRST_VALUE *\n", | |
"LAG\n", | |
"LAST\n", | |
"LAST_VALUE *\n", | |
"LEAD\n", | |
"LISTAGG\n", | |
"MAX *\n", | |
"MIN *\n", | |
"NTH_VALUE *\n", | |
"NTILE\n", | |
"PERCENT_RANK\n", | |
"PERCENTILE_CONT\n", | |
"PERCENTILE_DISC\n", | |
"RANK\n", | |
"RATIO_TO_REPORT\n", | |
"REGR_* Functions *\n", | |
"ROW_NUMBER\n", | |
"STDDEV *\n", | |
"STDDEV_POP *\n", | |
"STDDEV_SAMP *\n", | |
"SUM *\n", | |
"VAR_POP *\n", | |
"VAR_SAMP *\n", | |
"VARIANCE *\n", | |
"\n", | |
"RANK\n", | |
"NTILE\n", | |
"DENSE_RANK\n", | |
"ROW_NUMBER\n", | |
"AVG \n", | |
"MIN \n", | |
"CHECKSUM_AGG \n", | |
"SUM \n", | |
"COUNT \n", | |
"STDEV \n", | |
"COUNT_BIG \n", | |
"STDEVP \n", | |
"GROUPING \n", | |
"VAR \n", | |
"GROUPING_ID \n", | |
"VARP \n", | |
"MAX\n", | |
"CUME_DIST (Transact-SQL)\n", | |
"LEAD (Transact-SQL)\n", | |
"FIRST_VALUE (Transact-SQL)\n", | |
"PERCENTILE_CONT (Transact-SQL)\n", | |
"LAG (Transact-SQL)\n", | |
"PERCENTILE_DISC (Transact-SQL)\n", | |
"LAST_VALUE (Transact-SQL)\n", | |
"PERCENT_RANK (Transact-SQL)\n", | |
"\n", | |
" \n", | |
"\"\"\".strip()\n", | |
"\n", | |
"bq, rs, orac, ms, _ = list_of_supported_window_functions.split(\"\\n\\n\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Clean up and merge the data using outer joins on the function name" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Clean up pasted text \n", | |
"bqfun = pd.DataFrame({\"Function\": [b.split(' ', 1)[0].strip() for b in StringIO.StringIO(bq)], \"Big Query\": 1}).drop_duplicates()\n", | |
"rsfun = pd.DataFrame({\"Function\": [r.split(' ', 1)[0].strip() for r in StringIO.StringIO(rs)], \"Redshift\": 1}).drop_duplicates()\n", | |
"orfun = pd.DataFrame({\"Function\": [r.split(' ', 1)[0].strip() for r in StringIO.StringIO(orac)], \"Oracle\": 1}).drop_duplicates()\n", | |
"msfun = pd.DataFrame({\"Function\": [r.split(' ', 1)[0].strip() for r in StringIO.StringIO(ms)], \"MS Sql\": 1}).drop_duplicates()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"dbfun = pd.merge(bqfun, rsfun, how='outer', on='Function')\n", | |
"dbfun = pd.merge(dbfun, msfun, how='outer', on='Function')\n", | |
"dbfun = pd.merge(dbfun, orfun, how='outer', on='Function')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Pivot and print the output" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Function</th>\n", | |
" <th>Big Query</th>\n", | |
" <th>MS Sql</th>\n", | |
" <th>Oracle</th>\n", | |
" <th>Redshift</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AVG</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>CHECKSUM_AGG</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CORR</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>COUNT</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>COUNT_BIG</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>COVAR_POP</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>COVAR_SAMP</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>CUME_DIST</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>DENSE_RANK</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>FIRST</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>FIRST_VALUE</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>GROUPING</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>GROUPING_ID</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>LAG</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>LAST</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>LAST_VALUE</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>LEAD</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>LISTAGG</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>MAX</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>MEDIAN</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>MIN</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>NTH_VALUE</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>NTILE</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>PERCENTILE_CONT</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>PERCENTILE_DISC</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>PERCENT_RANK</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>RANK</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>RATIO_TO_REPORT</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>REGR_*</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>ROW_NUMBER</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>30</th>\n", | |
" <td>STDDEV</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>STDDEV_POP</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>32</th>\n", | |
" <td>STDDEV_SAMP</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>33</th>\n", | |
" <td>STDEV</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>34</th>\n", | |
" <td>STDEVP</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td>SUM</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>36</th>\n", | |
" <td>VAR</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>37</th>\n", | |
" <td>VARIANCE</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>VARP</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>39</th>\n", | |
" <td>VAR_POP</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>40</th>\n", | |
" <td>VAR_SAMP</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Function Big Query MS Sql Oracle Redshift\n", | |
"0 AVG 1 1 1 1\n", | |
"1 CHECKSUM_AGG 0 1 0 0\n", | |
"2 CORR 0 0 1 0\n", | |
"3 COUNT 1 1 1 1\n", | |
"4 COUNT_BIG 0 1 0 0\n", | |
"5 COVAR_POP 0 0 1 0\n", | |
"6 COVAR_SAMP 0 0 1 0\n", | |
"7 CUME_DIST 1 1 1 1\n", | |
"8 DENSE_RANK 1 1 1 1\n", | |
"9 FIRST 0 0 1 0\n", | |
"10 FIRST_VALUE 1 1 1 1\n", | |
"11 GROUPING 0 1 0 0\n", | |
"12 GROUPING_ID 0 1 0 0\n", | |
"13 LAG 1 1 1 1\n", | |
"14 LAST 0 0 1 0\n", | |
"15 LAST_VALUE 1 1 1 1\n", | |
"16 LEAD 1 1 1 1\n", | |
"17 LISTAGG 0 0 1 1\n", | |
"18 MAX 1 1 1 1\n", | |
"19 MEDIAN 0 0 0 1\n", | |
"20 MIN 1 1 1 1\n", | |
"21 NTH_VALUE 1 0 1 1\n", | |
"22 NTILE 1 1 1 1\n", | |
"23 PERCENTILE_CONT 1 1 1 1\n", | |
"24 PERCENTILE_DISC 1 1 1 1\n", | |
"25 PERCENT_RANK 1 1 1 1\n", | |
"26 RANK 1 1 1 1\n", | |
"27 RATIO_TO_REPORT 1 0 1 1\n", | |
"28 REGR_* 0 0 1 0\n", | |
"29 ROW_NUMBER 1 1 1 1\n", | |
"30 STDDEV 1 0 1 0\n", | |
"31 STDDEV_POP 0 0 1 1\n", | |
"32 STDDEV_SAMP 0 0 1 1\n", | |
"33 STDEV 0 1 0 0\n", | |
"34 STDEVP 0 1 0 0\n", | |
"35 SUM 1 1 1 1\n", | |
"36 VAR 0 1 0 0\n", | |
"37 VARIANCE 0 0 1 0\n", | |
"38 VARP 0 1 0 0\n", | |
"39 VAR_POP 0 0 1 1\n", | |
"40 VAR_SAMP 0 0 1 1" | |
] | |
}, | |
"execution_count": 97, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dbfun.pivot_table(values=[\"Big Query\", \"Redshift\", \"Oracle\", \"MS Sql\"], index=\"Function\").reset_index().fillna(0)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment