Last active
August 26, 2016 23:35
-
-
Save jpotts18/fe7087f04b74ec2b4bbac0857c548813 to your computer and use it in GitHub Desktop.
Data Vault / Kimball Modeling Exercise with Sakila Database
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
USE sakila; | |
-- Film | |
/* | |
This step represents moving the data from production tables into a new database. | |
This stage allows data manipulation without locking tables or impacting the production system. | |
*/ | |
DROP TABLE IF EXISTS src_film; | |
CREATE TABLE src_film LIKE film; | |
INSERT src_film SELECT * FROM film; | |
/* | |
The creation of hub tables insulates your data warehouse from external systems. | |
This step might sound like overkill but this table allows you to own your keys and provides a | |
mapping table for quick lookups | |
*/ | |
DROP TABLE IF EXISTS hub_film; | |
CREATE TABLE hub_film ( | |
`key` INT NOT NULL auto_increment primary key, | |
`id` INT NOT NULL, | |
`created_at` datetime, | |
`updated_at` datetime, | |
`source` varchar(100) | |
); | |
INSERT INTO hub_film (id, created_at, updated_at, source) | |
SELECT | |
film_id, | |
NOW(), | |
NOW(), | |
'sakila' | |
FROM src_film | |
ON DUPLICATE KEY UPDATE updated_at=NOW(); | |
-- Customer | |
DROP TABLE IF EXISTS src_customer; | |
CREATE TABLE src_customer LIKE customer; -- using LIKE copies the data types and indexes over | |
INSERT src_customer SELECT * FROM customer; | |
DROP TABLE IF EXISTS hub_customer; | |
CREATE TABLE hub_customer ( | |
`key` INT NOT NULL auto_increment primary key, | |
`id` INT NOT NULL, | |
`email` varchar(100), | |
`created_at` datetime, | |
`updated_at` datetime, | |
`source` varchar(100) | |
); | |
INSERT INTO hub_customer (id, email, created_at, updated_at, source) | |
SELECT | |
customer_id, | |
email, | |
NOW(), | |
NOW(), | |
'sakila' | |
FROM src_customer | |
ON DUPLICATE KEY UPDATE updated_at=NOW(); | |
/* | |
Dimension tables contain details about each instance of an object. | |
For example, the items dimension table would contain a record for each item sold in the store. | |
It might include information such as the cost of the item, the supplier, color, sizes, and similar data. | |
http://databases.about.com/od/datamining/a/Facts-Vs-Dimensions.htm | |
*/ | |
DROP TABLE IF EXISTS dim_customer; | |
CREATE TABLE dim_customer | |
SELECT | |
h.`key`, | |
c.*, | |
CONCAT(c.first_name,' ',c.last_name) as full_name | |
FROM src_customer c | |
JOIN hub_customer h ON c.customer_id = h.id; | |
-- Store | |
DROP TABLE IF EXISTS src_store; | |
CREATE TABLE src_store LIKE store; | |
INSERT src_store SELECT * FROM store; | |
DROP TABLE IF EXISTS hub_store; | |
CREATE TABLE hub_store LIKE hub_film; -- Easier to write | |
INSERT INTO hub_store (id, created_at, updated_at, source) | |
SELECT | |
s.store_id, | |
NOW(), | |
NOW(), | |
'sakila' | |
FROM src_store s | |
ON DUPLICATE KEY UPDATE updated_at=NOW(); | |
DROP TABLE IF EXISTS dim_store; | |
CREATE TABLE dim_store | |
SELECT | |
h.`key`, | |
s.* | |
FROM src_store s | |
JOIN hub_store h on s.store_id = h.id | |
DROP TABLE IF EXISTS fact_rental; | |
CREATE TABLE fact_rental ( | |
rental_id integer not null, | |
inventory_id integer not null, | |
customer_id integer not null, | |
film_id integer not null, | |
staff_id integer not null, | |
payment_id integer not null, | |
rental_date datetime, | |
payment_date datetime, | |
rental_return_date datetime, | |
rental_duration integer, | |
payment_amount integer, | |
rental_rate integer | |
); | |
INSERT INTO fact_rental ( | |
rental_id, | |
inventory_id, | |
customer_id, | |
film_id, | |
staff_id, | |
payment_id, | |
rental_date, | |
payment_date, | |
rental_return_date, | |
rental_duration, | |
payment_amount, | |
rental_rate | |
) | |
SELECT | |
-- Dimension | |
r.rental_id, | |
r.inventory_id, | |
r.customer_id, | |
f.film_id, | |
r.staff_id, | |
p.payment_id, | |
-- State transitions | |
r.rental_date, | |
p.payment_date, | |
r.return_date as rental_return_date, | |
-- | |
f.rental_duration, | |
p.amount as payment_amount, | |
f.rental_rate | |
FROM rental r | |
JOIN inventory i ON i.inventory_id = r.inventory_id | |
JOIN film f on i.film_id = f.film_id | |
JOIN payment p on p.rental_id = r.rental_id; | |
-- Check integrity | |
SELECT | |
(SELECT COUNT(*) FROM payment) payment_row_count, | |
(SELECT COUNT(*) FROM fact_rental) fact_rental_row_count, | |
(SELECT COUNT(*) FROM rental) rental_row_count; | |
-- Looks like we have 5 payments without rentals | |
SELECT * FROM payment WHERE rental_id IS NULL; | |
-- Aggregates | |
SELECT | |
COUNT(*) num_rental, | |
MAX(payment_amount) max_sale, | |
MIN(payment_amount) min_sale, | |
SUM(payment_amount) sales, | |
AVG(payment_amount) average_sale, | |
STD(payment_amount) std_sale | |
FROM fact_rental; | |
SELECT | |
f.customer_id, | |
COUNT(*) rentals, | |
SUM(payment_amount) sales, | |
c.* | |
FROM fact_rental f | |
JOIN customer c ON c.customer_id = f.customer_id | |
GROUP BY 1 | |
ORDER BY rentals DESC; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment