Created
March 11, 2022 06:59
-
-
Save zhangce/8b71ad118e80d48da4ba8b53007cef57 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
------------------------------------ | |
-- Script for Interactive Session -- | |
------------------------------------ | |
-- Have messages displayed in English | |
SET lc_messages TO 'en_US.UTF-8'; | |
-- CREATE Employees ----------------------------------------------------------- | |
CREATE TABLE employees ( | |
emp_no INT NOT NULL, | |
birth_date DATE NULL, | |
first_name VARCHAR(14) NOT NULL, | |
last_name VARCHAR(16) NOT NULL, | |
gender CHAR(1) NOT NULL DEFAULT '?', | |
hire_date DATE NOT NULL, | |
PRIMARY KEY (emp_no) | |
); | |
SELECT * FROM employees; | |
INSERT INTO employees | |
VALUES (10001, '1980-10-02', 'Ce', 'Zhang', 'M', '2016-01-01'); | |
INSERT INTO employees | |
(emp_no, birth_date, first_name, last_name, gender, hire_date) | |
VALUES (10002, '1980-10-02', 'Gustavo', 'Alonso', 'M', '2016-01-01'); | |
INSERT INTO employees | |
(emp_no, first_name, last_name, gender, hire_date) | |
VALUES (10003, 'Ingo', 'Mueller', 'M', '2016-01-01'); | |
SELECT * FROM employees; | |
-- Things that can go wrong: | |
-- Insert the same emp_no again (forbidden by primary key) | |
INSERT INTO employees | |
VALUES (10001, '1953-09-02', 'Georgi', 'Facello', 'M', '1986-06-26'); | |
-- Wrong type (int) | |
INSERT INTO employees | |
VALUES ('asdf', '1953-09-02', 'Georgi', 'Facello', 'M', '1986-06-26'); | |
-- Wrong type (date) | |
INSERT INTO employees | |
VALUES (10004, 'hello', 'Georgi', 'Facello', 'M', '1986-06-26'); | |
-- Wrong type (CHAR length) | |
INSERT INTO employees | |
VALUES (10004, '1953-09-02', 'Georgi', 'Facello', 'MM', '1986-06-26'); | |
-- Missing value (no first name) | |
INSERT INTO employees | |
(emp_no, birth_date, last_name, gender, hire_date) | |
VALUES (10004, '1953-09-02', 'Facello', 'M', '1986-06-26'); | |
-- Missing value with DEFAULT works! | |
INSERT INTO employees | |
(emp_no, birth_date, first_name, last_name, hire_date) | |
VALUES (10004, '1953-09-02', 'Georgi', 'Facello', '1986-06-26'); | |
SELECT * FROM employees; | |
-- More on SELECT ------------------------------------------------------------- | |
SELECT first_name, last_name FROM employees; -- projection | |
SELECT first_name AS fn, last_name AS ln FROM employees; -- rename columns | |
SELECT 'Hello', first_name, last_name FROM employees; -- select constants | |
SELECT emp_no, emp_no FROM employees; -- selection columns several times | |
SELECT emp_no AS no1, emp_no AS no2 FROM employees; -- rename | |
SELECT emp_no AS no1, emp_no * 2 AS no2 FROM employees; -- use expressions | |
SELECT emp_no AS no1, emp_no * 2 AS no2, emp_no + emp_no FROM employees; -- use anonymous expressions | |
SELECT first_name || ' ' || last_name FROM employees; -- use functions | |
SELECT first_name || ' ' || last_name AS name FROM employees; -- use functions | |
SELECT birth_date, birth_date + 5 FROM employees; -- special expressions for dates | |
SELECT AGE(birth_date) FROM employees; -- special functions for dates | |
-- CREATE Departments --------------------------------------------------------- | |
CREATE TABLE departments ( | |
dept_no INT NOT NULL, | |
dept_name VARCHAR(40) NOT NULL, | |
PRIMARY KEY (dept_no) | |
); | |
SELECT * FROM departments; | |
INSERT INTO departments VALUES (1, 'Sales'); | |
INSERT INTO departments (dept_name, dept_no) VALUES ('Human Resources', 2); | |
INSERT INTO departments VALUES (3, 'Marketing'), (4, 'Legal'); | |
INSERT INTO departments | |
SELECT 5, 'Development'; -- use SELECT to generate new row | |
SELECT * FROM departments; | |
-- Create dept_emp, which connects employees with departments | |
CREATE TABLE dept_emp ( | |
emp_no INT NOT NULL, | |
dept_no INT NOT NULL, | |
from_date DATE NOT NULL, | |
to_date DATE NOT NULL, | |
PRIMARY KEY (emp_no, dept_no, from_date) | |
); | |
SELECT * FROM dept_emp; | |
-- Use SELECT to connect every employee with a department | |
INSERT INTO dept_emp | |
SELECT emp_no, 5 AS dept_no, NOW(), '9999-01-01' FROM employees; | |
SELECT * FROM dept_emp; | |
-- Things that can go wrong: | |
-- Insert the same records again (forbidden by primary key) | |
INSERT INTO dept_emp | |
SELECT emp_no, 5 AS dept_no, NOW(), '9999-01-01' FROM employees; | |
-- Changing one of the three key attributes is enough | |
INSERT INTO dept_emp | |
SELECT emp_no, 5 AS dept_no, '1999-01-01', NOW() FROM employees; | |
-- Insert a record of a non-existing employee/department | |
-- currently works (we do not use no foreign keys yes!) | |
INSERT INTO dept_emp VALUES (10001, 6, '1980-01-01', '1999-01-01'); | |
-- JOINS ---------------------------------------------------------------------- | |
SELECT * FROM employees; | |
SELECT * FROM departments; | |
SELECT * FROM dept_emp; | |
SELECT * FROM employees, departments, dept_emp; -- start with a Cartesian product | |
SELECT * | |
FROM employees, departments, dept_emp | |
WHERE employees.emp_no = dept_emp.emp_no | |
AND dept_emp.dept_no = departments.dept_no; -- select only those rows that join | |
-- Project to most interesting columns | |
SELECT emp_no, first_name, last_name, dept_no, dept_name -- does not work: ambiguous columns! | |
FROM employees, departments, dept_emp | |
WHERE employees.emp_no = dept_emp.emp_no | |
AND dept_emp.dept_no = departments.dept_no; | |
SELECT employees.emp_no, first_name, last_name, | |
departments.dept_no, dept_name | |
from_date, to_date | |
FROM employees, departments, dept_emp -- better :) | |
WHERE employees.emp_no = dept_emp.emp_no | |
AND dept_emp.dept_no = departments.dept_no; | |
-- This returns too many records: | |
-- In our schema, time plays a role! | |
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date | |
FROM employees, departments, dept_emp | |
WHERE employees.emp_no = dept_emp.emp_no | |
AND dept_emp.dept_no = departments.dept_no | |
AND to_date > NOW(); -- only join on *current* dept_emp entries | |
-- Join variants | |
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date | |
FROM employees | |
CROSS JOIN departments -- make cartesian product explicit | |
CROSS JOIN dept_emp -- make cartesian product explicit | |
WHERE employees.emp_no = dept_emp.emp_no | |
AND dept_emp.dept_no = departments.dept_no | |
AND to_date > NOW(); | |
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date | |
FROM employees | |
JOIN dept_emp ON employees.emp_no = dept_emp.emp_no -- make join explicit | |
JOIN departments ON dept_emp.dept_no = departments.dept_no -- make join explicit | |
WHERE | |
to_date > NOW(); | |
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date | |
FROM employees | |
JOIN dept_emp USING (emp_no) -- join on equality | |
JOIN departments USING (dept_no) -- join on equality | |
WHERE | |
to_date > NOW(); | |
SELECT employees.emp_no, first_name, last_name, departments.dept_no, dept_name, to_date | |
FROM employees | |
NATURAL JOIN dept_emp -- join on equality | |
NATURAL JOIN departments -- of columns with the same name | |
WHERE | |
to_date > NOW(); | |
-- Aggregations and nesting --------------------------------------------------- | |
DROP TABLE employees; | |
DROP TABLE departments; | |
DROP TABLE dept_emp; | |
SELECT * FROM employees; -- does not work: table does not exist anymore! | |
-- (insert the data sets from our website here) | |
SELECT * FROM employees; -- should display something again | |
SELECT COUNT(*) FROM employees; -- 1024 | |
SELECT MAX(emp_no) FROM employees; | |
-- some last names occur several times, | |
-- so number of distinct values is less than 1024 | |
SELECT COUNT(DISTINCT last_name) FROM employees; | |
SELECT DISTINCT last_name FROM employees; | |
-- By sorting, we can see that SELECT DISTINCT actually removes duplicates | |
SELECT DISTINCT last_name | |
FROM employees | |
ORDER BY last_name DESC; | |
-- Side note: we can also sort by several columns | |
SELECT first_name, last_name | |
FROM employees | |
ORDER BY last_name ASC, first_name DESC; -- even in different directions | |
-- How to find how often each last name occurs? | |
SELECT last_name, COUNT(*) | |
FROM employees | |
GROUP BY last_name; | |
SELECT last_name, COUNT(*) | |
FROM employees | |
GROUP BY last_name | |
ORDER BY COUNT(*) DESC; -- common names on top | |
-- How could we show any first_name per group? | |
-- Apply an aggregate function | |
SELECT MAX(first_name) last_name, COUNT(*) as c | |
FROM employees | |
GROUP BY last_name | |
ORDER BY c DESC; | |
SELECT (ARRAY_AGG(first_name))[1], last_name, COUNT(*) as c | |
FROM employees | |
GROUP BY last_name | |
ORDER BY c DESC; | |
-- How to see only names occurring 4 times? | |
SELECT last_name, COUNT(*) | |
FROM employees | |
GROUP BY last_name | |
-- WHERE? No, WHERE is done *before* the grouping! | |
HAVING COUNT(*) = 4; | |
/* | |
Think of the WHERE predicate as a function applied on a single tuple at a time | |
which come from the FROM clause. | |
The COUNT colomn in this example does not exist yet when the result in the FROM clause is produced. | |
*/ | |
-- Why does this work? | |
SELECT * FROM | |
( | |
SELECT last_name, COUNT(*) AS c | |
FROM employees | |
GROUP BY last_name | |
) AS r1 | |
WHERE c = 4; -- We can use WHERE by using a subquery | |
-- How to select the most common name (without using the constant 4)? | |
-- Step 1: compute the number of occurrences of the most common name(s) | |
SELECT MAX(c) | |
FROM | |
( | |
SELECT DISTINCT COUNT(*) AS c | |
FROM employees | |
GROUP BY last_name | |
) AS r2; -- returns 4 | |
-- Step 2: use that query as subquery | |
SELECT * FROM | |
( | |
SELECT last_name, COUNT(*) AS c | |
FROM employees | |
GROUP BY last_name | |
) AS r1 | |
WHERE c = ( | |
SELECT MAX(c) | |
FROM | |
( | |
SELECT DISTINCT COUNT(*) AS c | |
FROM employees | |
GROUP BY last_name | |
) AS r2 | |
); | |
-- Using WITH clause | |
WITH name_counts AS | |
( | |
SELECT last_name, COUNT(*) as count | |
FROM employees | |
GROUP BY last_name | |
), max_name_count AS | |
( | |
SELECT MAX(count) as max_count | |
FROM name_counts --we can use name_counts here because it was defined before | |
) | |
SELECT last_name, count | |
FROM name_counts, max_name_count | |
WHERE count = max_count | |
ORDER BY max_count DESC; | |
-- Employees with the min salary per department | |
/* | |
Always try to reduce the complexity by dissecting the problem in smaller steps | |
*/ | |
-- Step 1: employees, salary, department | |
SELECT emp_no, salary, dept_no | |
FROM employees | |
JOIN salaries USING(emp_no) | |
JOIN dept_emp USING(emp_no) | |
ORDER BY emp_no; | |
-- Step 2: min salary, department | |
SELECT MIN(salary) AS sal, dept_no | |
FROM employees | |
JOIN salaries USING(emp_no) | |
JOIN dept_emp USING(emp_no) | |
GROUP BY dept_no | |
ORDER BY sal; | |
-- Step 3: Join on common columns | |
WITH emp_sal_dept AS | |
( | |
SELECT emp_no, salary, dept_no | |
FROM employees as e | |
JOIN salaries as s USING(emp_no) | |
JOIN dept_emp as de USING(emp_no) | |
ORDER BY emp_no | |
), min_salaries AS | |
( | |
SELECT MIN(salary) as salary, dept_no | |
FROM emp_sal_dept | |
GROUP BY dept_no | |
ORDER BY salary | |
) | |
SELECT emp_no, dept_no, salary | |
FROM emp_sal_dept | |
JOIN min_salaries USING(dept_no, salary) -- pay attention, we join on both attributes | |
ORDER BY dept_no; | |
/* | |
The above query is not accurate because the salaries and employments have to and from dates associated | |
with them. We need to join only the employements and the salaries which coexisted at the same time for a particular employee. | |
Drawing for emp_no = 10004, brackets means start and the end of the interval: | |
salaries: [ sal_1][ sal_2 ] | |
dept_emp: [ dept_1 ][ dept_2 ] | |
Now image employee 10004 always got a raise so sal_1 < sal_2 < sal_3. | |
If we join without looking at the temporal relation we would get as the result: | |
emp_no | salary | dept_no | |
10004 | sal_1 | dept_1 | |
10004 | sal_1 | dept_2 <--wrong | |
10004 | sal_2 | dept_1 | |
10004 | sal_2 | dept_2 | |
The sal_1 dept_2 combination never happend if we would look at the temporal relationship, the employee was earning sal_2 | |
during his work at dept_2. This means that once when we compute the minimal salary per department the entry (sal_1, dept_2) | |
might pull the minimal salary down for dept_2 even though there was never an employee working for that department for that | |
salary | |
*/ | |
-- Correct query | |
WITH emp_sal_dept AS | |
( | |
SELECT emp_no, salary, dept_no | |
FROM employees as e | |
JOIN salaries as s USING(emp_no) | |
JOIN dept_emp as de USING(emp_no) | |
WHERE (s.from_date, s.to_date) OVERLAPS (de.from_date, de.to_date) -- This eliminates incorrect entries | |
ORDER BY emp_no | |
), min_salaries AS | |
( | |
SELECT MIN(salary) as salary, dept_no | |
FROM emp_sal_dept | |
GROUP BY dept_no | |
ORDER BY salary | |
) | |
SELECT emp_no, dept_no, salary | |
FROM emp_sal_dept | |
JOIN min_salaries USING(dept_no, salary) | |
ORDER BY dept_no; | |
-- Correlated subquery in select | |
-- Show employees, their salary and the average salary for that department | |
-- Step 1: employee, salary, department: | |
SELECT emp_no, | |
salary, | |
dept_no | |
FROM employees AS e | |
JOIN salaries AS s USING(emp_no) | |
JOIN dept_emp AS de USING(emp_no) | |
WHERE (s.from_date, | |
s.to_date) OVERLAPS (de.from_date, | |
de.to_date) | |
ORDER BY emp_no; | |
-- Step 2: average salary for a particular department | |
WITH emp_sal_dept AS | |
( SELECT emp_no, | |
salary, | |
dept_no | |
FROM employees AS e | |
JOIN salaries AS s USING(emp_no) | |
JOIN dept_emp AS de USING(emp_no) | |
WHERE (s.from_date, | |
s.to_date) OVERLAPS (de.from_date, | |
de.to_date) | |
ORDER BY emp_no) | |
SELECT avg(salary) AS salary | |
FROM emp_sal_dept | |
WHERE dept_no = 'd005'; | |
-- Step 3: add the average to the employee, salary, department table | |
WITH emp_sal_dept AS | |
( SELECT emp_no, | |
salary, | |
dept_no | |
FROM employees AS e | |
JOIN salaries AS s USING(emp_no) | |
JOIN dept_emp AS de USING(emp_no) | |
WHERE (s.from_date, | |
s.to_date) OVERLAPS (de.from_date, | |
de.to_date) | |
ORDER BY emp_no) | |
SELECT emp_no, | |
salary, | |
( SELECT avg(salary) AS avg_salary | |
FROM emp_sal_dept | |
WHERE emp_sal_dept.dept_no = emp_sal_dept_o.dept_no --reference to the outer relation | |
) , dept_no | |
FROM emp_sal_dept AS emp_sal_dept_o; | |
-- Alternatively, solve with a join | |
WITH emp_sal_dept AS | |
( SELECT emp_no, | |
salary, | |
dept_no | |
FROM employees AS e | |
JOIN salaries AS s USING(emp_no) | |
JOIN dept_emp AS de USING(emp_no) | |
WHERE (s.from_date, | |
s.to_date) OVERLAPS (de.from_date, | |
de.to_date) | |
ORDER BY emp_no), | |
avg_salaries AS | |
( SELECT dept_no, | |
avg(salary) AS avg_salary | |
FROM emp_sal_dept | |
GROUP BY dept_no) | |
SELECT emp_no, | |
salary, | |
avg_salary, | |
dept_no | |
FROM emp_sal_dept | |
JOIN avg_salaries USING(dept_no); | |
-- Correlated query in the WHERE | |
-- Oldest Managers per department | |
WITH max_age_dept AS | |
( SELECT max(age(birth_date)) AS max_age, | |
dept_no | |
FROM employees AS e | |
JOIN dept_manager AS dm USING(emp_no) | |
GROUP BY dept_no | |
ORDER BY dept_no) | |
SELECT emp_no, | |
age(birth_date) | |
FROM employees AS eo | |
JOIN dept_manager AS dmo USING(emp_no) | |
WHERE age(eo.birth_date) >= | |
( SELECT max_age | |
FROM max_age_dept | |
WHERE dept_no = dmo.dept_no ); | |
-- Exercise: implement the above query with a join | |
/* | |
Try to write a correlated subquery in the FROM clause | |
Why does this not work? | |
*/ | |
WITH emp_sal_dept AS | |
( SELECT emp_no, | |
salary, | |
dept_no | |
FROM employees AS e | |
JOIN salaries AS s USING(emp_no) | |
JOIN dept_emp AS de USING(emp_no) | |
WHERE (s.from_date, | |
s.to_date) OVERLAPS (de.from_date, | |
de.to_date) | |
ORDER BY emp_no) | |
SELECT emp_no, | |
salary, | |
avg_salary, | |
dept_no | |
FROM emp_sal_dept AS emp_sal_dept_o, | |
( SELECT avg(salary) AS avg_salary | |
FROM emp_sal_dept | |
WHERE emp_sal_dept.dept_no = emp_sal_dept_o.dept_no ) AS average_salary; | |
/* | |
Answer: the subquery has to be evaluated before the outer query, | |
so emp_sal_dept_o is not available to the subquery yet | |
*/ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment