Created
October 11, 2018 13:28
-
-
Save IllusiveMilkman/70c319d60756b78dc11366ffdb5127b3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Window Function examples | |
-- PostgreSQL conference South Africa 2018 | |
-- By Willem Booysen | |
-- Youtube: https://www.youtube.com/watch?v=blHEnrYwySE | |
-- Create database and templates for demo | |
DROP DATABASE IF EXISTS WindowFunctions; | |
CREATE DATABASE WindowFunctions; | |
-- Create table Payroll | |
DROP TABLE IF EXISTS Payroll CASCADE; | |
CREATE TABLE Payroll ( | |
emp_no INTEGER, | |
emp_name VARCHAR(20) NOT NULL, | |
dept_name VARCHAR(15) NOT NULL, | |
salary_amt DECIMAL (8,2) NOT NULL CHECK (salary_amt > 0.00) | |
); | |
-- Populate table Payroll | |
INSERT INTO Payroll (emp_no, emp_name, dept_name, salary_amt ) VALUES | |
(1, 'Mark Stone', 'Accounting', 16000.00), | |
(2, 'Maria Stone', 'Accounting', 13000.00), | |
(3, 'Geetha Singh', 'Accounting', 13000.00), | |
(4, 'Richard Hathaway', 'Accounting', 14000.00), | |
(5, 'Joseph Bastion', 'Accounting', 14000.00), | |
(6, 'Arthur Prince', 'Production',12000.00), | |
(7, 'Adele Morse', 'Production', 13000.00), | |
(8, 'Sheamus O Kelly', 'Production', 24000.00), | |
(9, 'Sheilah Flask', 'Production', 24000.00), | |
(10, 'Brian James', 'Production', 16000.00), | |
(11, 'Adam Scott', 'Production', 16000.00), | |
(12, 'Maurice Moss', 'IT',12000.00), | |
(13, 'Roy', 'IT', 12001.00), | |
(14, 'Jen Barber', 'IT', 28000.00), | |
(15, 'Richard Hammond', 'IT', 10000.00), | |
(16, 'James May', 'IT', 10000.00), | |
(18, 'Jeremy Clarkson', 'IT', 10000.00), | |
(17, 'John Doe', 'IT', 100000.00) --Note how emp_no was switched around here... | |
; | |
SELECT * FROM Payroll; | |
-- Examples: First the traditional method | |
-- Basic aggregate functions, using the department as the Group | |
SELECT | |
dept_name, | |
COUNT(*) AS dept_employee_count, | |
MIN(salary_amt) AS min_dept_salary, | |
MAX(salary_amt) AS max_dept_salary, | |
AVG(salary_amt)::DECIMAL(8,2) AS average_dept_salary, | |
SUM(salary_amt) AS total_dept_salaries | |
FROM Payroll | |
GROUP BY dept_name | |
ORDER BY dept_name | |
; | |
-- The individual rows have been grouped into subsets and returned in Department Groups as single rows. | |
-- If we wanted this information to be compared to each salaried employee, we would normally JOIN with a CTE | |
WITH Dept_stats AS ( | |
SELECT | |
dept_name, | |
COUNT(*) AS dept_employee_count, | |
MIN(salary_amt) AS min_dept_salary, | |
MAX(salary_amt) AS max_dept_salary, | |
AVG(salary_amt)::DECIMAL(8,2) AS average_dept_salary, | |
SUM(salary_amt) AS total_dept_salaries | |
FROM Payroll | |
GROUP BY dept_name | |
ORDER BY dept_name | |
) | |
SELECT | |
Payroll.*, | |
(Select count(*) from Payroll) AS total_employee_count, | |
Dept_stats.dept_employee_count, | |
Dept_stats.min_dept_salary, | |
Dept_stats.max_dept_salary, | |
Dept_stats.average_dept_salary, | |
Dept_stats.total_dept_salaries | |
FROM Payroll | |
LEFT OUTER JOIN Dept_stats ON (Payroll.dept_name = Dept_stats.dept_name) | |
ORDER BY Payroll.dept_name, emp_name | |
; | |
-- Window Function --> How clean is this in comparison? | |
SELECT | |
*, | |
COUNT(*) OVER () AS total_employee_count, | |
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count, | |
MIN(salary_amt) OVER (PARTITION BY dept_name) AS min_dept_salary, | |
MAX(salary_amt) OVER (PARTITION BY dept_name) AS max_dept_salary, | |
AVG(salary_amt) OVER (PARTITION BY dept_name)::DECIMAL(8,2) AS avg_dept_sal, | |
SUM(salary_amt) OVER (PARTITION BY dept_name) AS total_dept_salaries | |
FROM Payroll | |
ORDER BY dept_name, emp_name; | |
-- DEMO 1 complete | |
-- Basic Syntax: | |
-- A window function call always contains an OVER clause. | |
-- "( )" --> This is the Window | |
SELECT | |
*, | |
COUNT(*) OVER () AS total_employee_count, | |
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count | |
FROM Payroll | |
; | |
SELECT | |
*, | |
COUNT(*) OVER () AS total_employee_count, | |
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count | |
FROM Payroll | |
WHERE dept_name = 'IT' --> Windows into your data are affected by the WHERE clause. Windows cannot work outside your base data set. | |
; | |
SELECT | |
*, | |
COUNT(*) OVER () AS total_employee_count, | |
COUNT(*) OVER (PARTITION BY dept_name) AS dept_employee_count, | |
MIN(salary_amt) OVER (PARTITION BY dept_name) AS min_dept_salary, | |
MAX(salary_amt) OVER (PARTITION BY dept_name) AS max_dept_salary, | |
AVG(salary_amt) OVER (PARTITION BY dept_name)::DECIMAL(8,2) AS avg_dept_sal, | |
SUM(salary_amt) OVER (PARTITION BY dept_name) AS total_dept_salaries | |
FROM Payroll | |
ORDER BY dept_name, emp_name; | |
-- And now for some handy comparison figures... | |
SELECT | |
*, | |
(salary_amt / (SUM(salary_amt) OVER (PARTITION BY dept_name)) * 100)::DECIMAL(18,2) AS Dept_Percentage, | |
(salary_amt / (SUM(salary_amt) OVER () ) * 100)::DECIMAL(18,2) AS Company_Percentage | |
FROM Payroll | |
ORDER BY dept_name, dept_percentage; | |
-- ORDER BY examples here --> Skipping to ROW_NUMBER for the moment as this best illustrates the various Window orders. | |
-- and how a Window order is not affected by the outer SELECT order. | |
SELECT | |
*, | |
-- Note the difference between row_number and the emp_no in terms of when it was captured. | |
ROW_NUMBER() OVER () AS "Base Row No", | |
-- No Partition, just an ORDER BY, thus the whole base resultset is used. | |
ROW_NUMBER() OVER (ORDER BY salary_amt) AS "Salary Row No", | |
-- Order each partition first, then assign row numbers. | |
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt) AS "Dept,Salary Row No" | |
FROM Payroll | |
ORDER BY emp_no; | |
-- DEMO 2 complete | |
-- Row_Order, Rank & Dense_rank: | |
-- The natural order of the table as we declared it. | |
SELECT * | |
FROM Payroll | |
ORDER BY emp_no; | |
-- ROW_NUMBER a really good way to explain the relationship between Windows and Partitions: | |
SELECT *, | |
-- Row_Number assigns a unique integer to each row within your partition within your window. | |
ROW_NUMBER() OVER (), -- Note the difference between row_number and the emp_no in terms of when it was captured. | |
ROW_NUMBER() OVER (ORDER BY salary_amt), -- No Partition, just an ORDER BY, thus the whole base resultset is used. | |
ROW_NUMBER() OVER (PARTITION BY dept_name), | |
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt), | |
-- Ranks --> Equal values are ranked the same, creating gaps in numbering | |
RANK() OVER (), -- Ranks are useless without an ORDER BY | |
RANK() OVER (PARTITION BY dept_name), | |
RANK() OVER (ORDER BY salary_amt), | |
RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt), | |
-- Dense_Ranks --> Equal values are ranked the same, without gaps in numbering | |
DENSE_RANK() OVER (), | |
DENSE_RANK() OVER (PARTITION BY dept_name), | |
DENSE_RANK() OVER (ORDER BY salary_amt), | |
DENSE_RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt) | |
FROM Payroll | |
ORDER BY emp_no | |
; | |
-- GREAT! Let's find the top 2 earners in every department | |
-- ======================================================= | |
-- Solution with ROW_NUMBERS | |
WITH ctePayroll AS ( | |
SELECT *, | |
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_row | |
FROM Payroll | |
--WHERE ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) <= 2 --> WFs not allowed in WHERE clauses | |
) | |
SELECT * | |
FROM ctePayroll | |
WHERE dept_row <= 2 | |
; --> BUT hold on! What about people who earn the same salaries? | |
-- Solution with RANK | |
WITH ctePayroll AS ( | |
SELECT *, | |
RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_rank | |
FROM Payroll | |
) | |
SELECT * | |
FROM ctePayroll | |
WHERE dept_rank <= 2 | |
; --> HOWEVER, Production is just plain wrong??? | |
-- Solution with DENSE_RANK | |
WITH ctePayroll AS ( | |
SELECT *, | |
DENSE_RANK() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_rank | |
FROM Payroll | |
) | |
SELECT * | |
FROM ctePayroll | |
WHERE dept_rank <= 2 | |
--ORDER BY dept_rank --> Note how we can order based on CTE name, cannot do this without CTE | |
; | |
-- DEMO 3 complete | |
-- percent_rank and cume_rank | |
WITH base_data AS ( | |
SELECT generate_series(1,10) AS series | |
) | |
SELECT *, | |
(PERCENT_RANK() OVER (ORDER BY series))::DEC(8,2), -- to determine the relative standing of a value within a set or rows | |
(CUME_DIST() OVER (ORDER BY series))::DEC(8,2), -- Perfect for a 10-90 split in data | |
NTILE(2) OVER (ORDER BY series), -- Split your dataset into x parts | |
NTILE(3) OVER (ORDER BY series) | |
FROM base_data | |
ORDER BY series | |
; | |
--ntile with partitions | |
SELECT *, | |
--ntile(bucket size) | |
ntile(2) OVER (PARTITION BY dept_name ORDER BY salary_amt), | |
ntile(3) OVER (PARTITION BY dept_name ORDER BY salary_amt) | |
FROM Payroll | |
ORDER BY dept_name, salary_amt; | |
-- DEMO 4 complete | |
-- LAG(), LEAD() --> Offset from current row | |
-- Let's return the employee names before and after the current row: | |
SELECT emp_no, | |
dept_name, | |
emp_name, | |
LAG(emp_name) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Employee", | |
LEAD(emp_name) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Next Employee" | |
FROM Payroll | |
ORDER BY emp_no; | |
-- The syntax of LAG and LEAD | |
-- LAG/LEAD (column or expression [,offset] [,default]) | |
SELECT *, | |
LAG(emp_name) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Employee", | |
LAG(emp_name, 2) OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Offset 2", | |
LAG(emp_name, 2, 'nada...') OVER (PARTITION BY dept_name ORDER BY emp_no) AS "Previous Offset 2 with Defaults" | |
FROM Payroll | |
ORDER BY emp_no; | |
-- FIRST_VALUE(), LAST_VALUE() ... Offset relative to beginning/end of the window frame | |
SELECT *, | |
FIRST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Higest Earner", | |
MAX(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Highest Salary", | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Lowest Earner", --> Notice how LAST and MIN move with Window Frame | |
MIN(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "Lowest Salary", | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name) AS "Lowest Earner for real", | |
MIN(salary_amt) OVER (PARTITION BY dept_name) "Lowest Salary for real" | |
FROM Payroll | |
ORDER BY dept_name, salary_amt DESC | |
; | |
-- Now the previous one failed miserably, so why the Highest Earner worked, but not the Lowest? | |
-- Because the ORDER BY initiates the Roller Blind effect, thus MIN/MAX/FIRST/LAST is calculated as the Window Frame grows. | |
-- Demo Complete | |
-- UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING | |
-- and.. RANGE vs ROWS | |
SELECT *, | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS "ORDER BY - DEFAULT BEHAVIOUR", | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Lowest Earner - RANGE to Current Row", --> So ORDER BY default is RANGE | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Lowest Earner - ROWS to Current Row", --> Overriding default behaviour here | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS "Lowest Earner" --> Window functions are easy, there's almost no syntax or variations to remember... | |
FROM Payroll | |
; | |
-- The catch-all phrase "ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" | |
SELECT *, | |
FIRST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "Higest Earner", | |
LAST_VALUE(emp_name) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS "Lowest Earner" | |
FROM Payroll | |
; | |
-- To clearly illustrate the difference between ROWS and RANGE: | |
SELECT *, | |
SUM(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Running Totals with RANGE", | |
SUM(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "Running Totals with ROWS" --> BTW This is how we do running totals easily. | |
FROM Payroll | |
; | |
-- Getting really specific about what rows to include... | |
SELECT *, | |
sum(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) AS "The rest", | |
sum(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS "Prev, Current & Next", | |
sum(salary_amt) OVER (PARTITION BY dept_name ORDER BY salary_amt DESC | |
ROWS BETWEEN 1 FOLLOWING AND 2 FOLLOWING) AS "Next 2" | |
FROM Payroll | |
; | |
-- Demo complete | |
/* | |
And now for Running Totals... | |
*/ | |
DROP TABLE IF EXISTS running_totals; | |
CREATE TABLE running_totals AS ( | |
SELECT generate_series(1,5000) AS my_id, | |
(random()* 100)::decimal(15,2) AS Amount | |
) | |
; | |
-- Let's take a look at our data | |
SELECT * FROM running_totals; | |
SELECT count(*), sum(amount)::money FROM running_totals; | |
-- Traditional method | |
SELECT t1.my_id, | |
t1.amount, | |
SUM(t2.amount) AS running_total | |
FROM running_totals t1 | |
INNER JOIN running_totals t2 ON t1.my_id >= t2.my_id | |
GROUP BY t1.my_id, t1.amount | |
ORDER BY t1.my_id; | |
-- Window Functions | |
SELECT *, | |
SUM(amount) OVER (ORDER BY my_id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) | |
FROM running_totals | |
ORDER BY my_id; | |
--- DEMO COMPLETE | |
-- Watch out! | |
-- Issues with Distinct... | |
SELECT | |
*, | |
COUNT(DISTINCT dept_name) OVER () | |
FROM Payroll | |
ORDER BY dept_name, emp_name; | |
-- You'd have to use GROUP BY (and list every column in the table), or some trickery... | |
WITH dept_stats AS ( | |
SELECT | |
COUNT(DISTINCT dept_name) AS dept_count | |
FROM Payroll | |
) | |
SELECT *, | |
(SELECT * FROM dept_stats) | |
FROM Payroll; | |
--Using Window Funcitons in WHERE ... You shall not pass! | |
SELECT *, | |
ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) AS dept_row | |
FROM Payroll | |
WHERE ROW_NUMBER() OVER (PARTITION BY dept_name ORDER BY salary_amt DESC) <= 2 --> WFs not allowed in WHERE clauses | |
; | |
-- thanks :) |
Awesome! Enjoy the journey, Postgres is fantastic. Glad I could
contribute to the greater Postgres society. Window functions can be
overwhelming when you first come across them, but they help every now and
then. Good luck!
β¦On Tue, 27 Oct 2020 at 13:19, Oluwaseun Oyebade ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
I started learning Postgress a few days ago, precisely the 25th of
October. Your talk on windows function is enlightening for a non-accountant
π.
β
You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub
<https://gist.github.com/70c319d60756b78dc11366ffdb5127b3#gistcomment-3505735>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AFIIESQXO62UTWDP6J3WVB3SM2UCJANCNFSM4TAVUZOQ>
.
Great examples. The one on how row_number and rank may not give the correct output when getting top 2 earners demonstrates clearly the differences between row_number, rank and dense_rank.
Dear Jayadevanm
I truly appreciate the feedback. It's so nice to see my gists still
working for people around the world. Thank you for taking the time to
write the note, it means the world to me.
Kind regards
Willem
β¦On Sat, 24 Apr 2021 at 15:23, Jayadevan ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
Great examples. The one on how row_number and rank may not give the
correct output when getting top 2 earners demonstrates clearly the
differences between row_number, rank and dense_rank.
β
You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub
<https://gist.github.com/70c319d60756b78dc11366ffdb5127b3#gistcomment-3718380>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AFIIESQ4RLCGVFYTPTOM7GTTKLA43ANCNFSM4TAVUZOQ>
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I started learning Postgress a few days ago, precisely the 25th of October. Your talk on windows function is enlightening for a non-accountant π. Kindly share the link to the slides.