Created
December 18, 2012 07:10
-
-
Save rpietro/4325744 to your computer and use it in GitHub Desktop.
sqldf tutorial from http://goo.gl/dSuj8 - i just moved the data into the script (using dput) to make it self-contained
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Load sqldf package, which will load all others necessary | |
#By default, SQLite runs in background to do processing, could use others DB engines if you wanted | |
library("sqldf") | |
#Import employees data | |
employees <- structure(list(id = 1:20, lastname = structure(c(5L, 14L, 13L, 15L, 6L, 16L, 9L, 1L, 3L, 12L, 10L, 8L, 12L, 3L, 11L, 13L, 10L, 7L, 2L, 4L), .Label = c("a", "b", "c", "f", "g", "h", "i", "j", "n", "o", "p", "r", "s", "t", "w", "z"), class = "factor"), firstname = structure(c(12L, 6L, 5L, 12L, 11L, 15L, 9L, 18L, 17L, 7L, 8L, 10L, 4L, 14L, 19L, 16L, 1L, 13L, 2L, 3L), .Label = c("chris", "dima", "drew", "eric", "hila", "jason", "jeremy", "joe", "jon", "jowanza", "lashanda", "matt", "michael", "michelle", "randy", "rudi", "solon", "stewart", "tim"), class = "factor"), gender = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("f", "m"), class = "factor")), .Names = c("id", "lastname", "firstname", "gender"), class = "data.frame", row.names = c(NA, -20L)) | |
#Import orders data - Matt G, Jason T, Matt W, and La Shanda H go to sushi lunch | |
#taking a potential client with them | |
orders <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 5L, 5L, 5L, NA, NA), item = structure(c(8L, 9L, 12L, 7L, 10L, 13L, 5L, 11L, 8L, 15L, 13L, 16L, 1L, 3L, 14L, 6L, 15L, 2L, 4L), .Label = c("california roll", "chicken teriaki hibachi", "cucumber roll", "diet coke", "edamame", "firecracker roll", "keystone roll", "playboy roll", "rockstar roll", "salmon sashimi", "salmon skin roll", "spider roll", "tuna sashimi", "unagi roll", "unobtanium roll", "yellowtail hand roll"), class = "factor"), quantity_ordered = c(1L, 1L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), item_cost = c(12, 10, 8, 25, 3, 2.5, 6, 8, 12, 35, 2.5, 7, 4, 3.5, 6.5, 9, 35, 7.95, 1.95)), .Names = c("id", "item", "quantity_ordered", "item_cost"), class = "data.frame", row.names = c(NA, -19L)) | |
##### Single Table operations #### | |
#1 - Select all employees that are male | |
male_employees <- sqldf("SELECT * FROM employees WHERE gender = 'm'") | |
#2 - Get a count by first name | |
name_counts <- sqldf("SELECT firstname, COUNT (firstname) as occurances FROM employees GROUP BY firstname") | |
name_counts | |
#3 - Get a count by first name, excluding non-employees | |
name_counts_emponly <- sqldf("SELECT firstname, COUNT(firstname) as occurances | |
FROM employees | |
WHERE firstname != 'rudi' | |
GROUP BY firstname") | |
name_counts_emponly | |
#4 - Use a case statement to define a new data column of california employees, | |
# using "lower" to evaluate all names as lowercase to ensure case insensitivity | |
employees_cali <- sqldf("SELECT *, | |
CASE | |
WHEN lower(firstname) = 'stewart' THEN 1 | |
WHEN lower(firstname) = 'hila' THEN 1 | |
WHEN lower(firstname) = 'jon' THEN 1 | |
WHEN lower(firstname) = 'solon' THEN 1 | |
ELSE 0 | |
END as cali_emp | |
FROM employees | |
") | |
employees_cali | |
#5 - Sort employees_cali by cali_emp descending, first name ascending (ascending is default) | |
employees_cali_sorted <- sqldf("SELECT *, | |
CASE | |
WHEN lower(firstname) = 'stewart' THEN 1 | |
WHEN lower(firstname) = 'hila' THEN 1 | |
WHEN lower(firstname) = 'jon' THEN 1 | |
WHEN lower(firstname) = 'solon' THEN 1 | |
ELSE 0 | |
END as cali_emp | |
FROM employees | |
ORDER BY cali_emp DESC, firstname | |
") | |
##### Multi-Table operations #### | |
#1. Left join employees and orders table (keep all records from employees table, | |
# matching records from orders) | |
left_join <- sqldf("SELECT * | |
FROM employees a | |
LEFT JOIN orders b ON a.id=b.id | |
WHERE a.firstname != 'rudi' | |
") | |
#2. "Right join" isn't supported in sqldf package, but switching order of tables and left join | |
# is functionally equivalent | |
right_join_equiv <- sqldf("SELECT * | |
FROM orders b | |
LEFT JOIN employees a ON a.id=b.id | |
") | |
#3. Inner join...select only records that match both tables | |
inner_join <- sqldf("SELECT * | |
FROM employees a, orders b | |
WHERE a.id=b.id | |
") | |
#4. Matt G. sees bill, wonders how bill can be so low! | |
# Join orders to employees, find who is ordering items less than 10 dollars, sorted by lowest cost | |
inexpensive_items <- sqldf("SELECT * | |
FROM orders a | |
LEFT JOIN employees b ON a.id= b.id | |
WHERE item_cost < 10 | |
ORDER BY item_cost | |
") | |
#4a. Realizing some things are priced by piece, figure out who spent less than $20 on any one | |
# type of food | |
inexpensive_line_items <- sqldf("SELECT *, | |
(item_cost * quantity_ordered) as item_level_cost | |
FROM orders a | |
LEFT JOIN employees b ON a.id= b.id | |
WHERE item_level_cost < 20 | |
ORDER BY item_level_cost | |
") | |
#5. Realizing that even item level cost is wrong question, Matt G. wants to know whose total lunch < $30 | |
# Need to use GROUP BY to get totals by person, then use HAVING instead of WHERE because | |
# of the use of the GROUP BY summary function (WHERE is a record level operator) | |
lunch_under_30 <- sqldf("SELECT lastname, firstname, | |
SUM(item_cost * quantity_ordered) as lunch_cost | |
FROM orders a | |
LEFT JOIN employees b ON a.id= b.id | |
GROUP BY a.id | |
HAVING lunch_cost < 30 | |
") | |
#6. Matt G. wants to keep track of food consumption per person, particularly who the "lightweights" | |
# are in the group. Who's eating less than average on a cost basis? | |
# This requires a subquery to first determine the average cost of this meal, passing that value | |
# to the HAVING clause | |
#Subquery: returns a single value for the average lunch cost for employees (those with valid ID num) | |
#"SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost | |
# FROM orders WHERE id != 'NA'") | |
lower_than_average_cost <- sqldf("SELECT lastname, firstname, | |
SUM(item_cost * quantity_ordered) as lunch_cost | |
FROM orders a | |
LEFT JOIN employees b ON a.id= b.id | |
WHERE a.id != 'NA' | |
GROUP BY a.id | |
HAVING lunch_cost < ( | |
SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost | |
FROM orders | |
WHERE id != 'NA' | |
) | |
") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment