rpietro · December 18, 2012 07:10
diff --git a/r_sqldf_tutorial.r b/r_sqldf_tutorial.r
 #Load sqldf package, which will load all others necessary
 #By default, SQLite runs in background to do processing, could use others DB engines if you wanted

 library("sqldf")

 #Import employees data

 employees  <- structure(list(id = 1:20, lastname = structure(c(5L, 14L, 13L, 15L, 6L, 16L, 9L, 1L, 3L, 12L, 10L, 8L, 12L, 3L, 11L, 13L, 10L, 7L, 2L, 4L), .Label = c("a", "b", "c", "f", "g", "h", "i", "j", "n", "o", "p", "r", "s", "t", "w", "z"), class = "factor"), firstname = structure(c(12L, 6L, 5L, 12L, 11L, 15L, 9L, 18L, 17L, 7L, 8L, 10L, 4L, 14L, 19L, 16L, 1L, 13L, 2L, 3L), .Label = c("chris", "dima", "drew", "eric", "hila", "jason", "jeremy", "joe", "jon", "jowanza", "lashanda", "matt", "michael", "michelle", "randy", "rudi", "solon", "stewart", "tim"), class = "factor"), gender = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("f", "m"), class = "factor")), .Names = c("id", "lastname", "firstname", "gender"), class = "data.frame", row.names = c(NA, -20L))


 #Import orders data - Matt G, Jason T, Matt W, and La Shanda H go to sushi lunch
 #taking a potential client with them

 orders <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 5L, 5L, 5L, NA, NA), item = structure(c(8L, 9L, 12L, 7L, 10L, 13L, 5L, 11L, 8L, 15L, 13L, 16L, 1L, 3L, 14L, 6L, 15L, 2L, 4L), .Label = c("california roll", "chicken teriaki hibachi", "cucumber roll", "diet coke", "edamame", "firecracker roll", "keystone roll", "playboy roll", "rockstar roll", "salmon sashimi", "salmon skin roll", "spider roll", "tuna sashimi", "unagi roll", "unobtanium roll", "yellowtail hand roll"), class = "factor"), quantity_ordered = c(1L, 1L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), item_cost = c(12, 10, 8, 25, 3, 2.5, 6, 8, 12, 35, 2.5, 7, 4, 3.5, 6.5, 9, 35, 7.95, 1.95)), .Names = c("id", "item", "quantity_ordered", "item_cost"), class = "data.frame", row.names = c(NA, -19L))


 ##### Single Table operations ####

 #1 - Select all employees that are male

 male_employees <- sqldf("SELECT * FROM employees WHERE gender = 'm'")

 #2 - Get a count by first name

 name_counts <- sqldf("SELECT firstname, COUNT (firstname) as occurances FROM employees GROUP BY firstname")
 name_counts
 #3 - Get a count by first name, excluding non-employees

 name_counts_emponly <- sqldf("SELECT firstname, COUNT(firstname) as occurances
                             FROM employees 
                             WHERE firstname != 'rudi'
                             GROUP BY firstname")
 name_counts_emponly

 #4 - Use a case statement to define a new data column of california employees, 
 #    using "lower" to evaluate all names as lowercase to ensure case insensitivity

 employees_cali <- sqldf("SELECT *, 
                        CASE
                          WHEN lower(firstname) = 'stewart' THEN 1
                          WHEN lower(firstname) = 'hila' THEN 1
                          WHEN lower(firstname) = 'jon' THEN 1
                          WHEN lower(firstname) = 'solon' THEN 1
                          ELSE 0
                        END as cali_emp
                        FROM employees
                        ")
 employees_cali

 #5 - Sort employees_cali by cali_emp descending, first name ascending (ascending is default)

 employees_cali_sorted <- sqldf("SELECT *, 
                               CASE
                                  WHEN lower(firstname) = 'stewart' THEN 1
                                  WHEN lower(firstname) = 'hila' THEN 1
                                  WHEN lower(firstname) = 'jon' THEN 1
                                  WHEN lower(firstname) = 'solon' THEN 1
                                  ELSE 0
                                END as cali_emp
                                FROM employees
                                ORDER BY cali_emp DESC, firstname
                        ")

 ##### Multi-Table operations ####

 #1.  Left join employees and orders table (keep all records from employees table, 
 #    matching records from orders)

 left_join <- sqldf("SELECT * 
                   FROM employees a 
                   LEFT JOIN orders b ON a.id=b.id
                   WHERE a.firstname != 'rudi'
                   ")

 #2.  "Right join" isn't supported in sqldf package, but switching order of tables and left join
 #    is functionally equivalent

 right_join_equiv <- sqldf("SELECT * 
                            FROM orders b 
                            LEFT JOIN employees a ON a.id=b.id
                          ")

 #3.  Inner join...select only records that match both tables

 inner_join <- sqldf("SELECT * 
                   FROM employees a, orders b
                   WHERE a.id=b.id
                   ")

 #4.  Matt G. sees bill, wonders how bill can be so low!
 #    Join orders to employees, find who is ordering items less than 10 dollars, sorted by lowest cost

 inexpensive_items <- sqldf("SELECT * 
                         FROM orders a 
                         LEFT JOIN employees b ON a.id= b.id
                         WHERE item_cost < 10
                         ORDER BY item_cost
                         ")

 #4a.  Realizing some things are priced by piece, figure out who spent less than $20 on any one
 #     type of food

 inexpensive_line_items <- sqldf("SELECT *, 
                                (item_cost * quantity_ordered) as item_level_cost
                                 FROM orders a 
                                 LEFT JOIN employees b ON a.id= b.id
                                 WHERE item_level_cost < 20
                                 ORDER BY item_level_cost
                                ")

 #5.  Realizing that even item level cost is wrong question, Matt G. wants to know whose total lunch < $30
 #    Need to use GROUP BY to get totals by person, then use HAVING instead of WHERE because
 #    of the use of the GROUP BY summary function (WHERE is a record level operator)

 lunch_under_30 <- sqldf("SELECT lastname, firstname,
                         SUM(item_cost * quantity_ordered) as lunch_cost
                         FROM orders a 
                         LEFT JOIN employees b ON a.id= b.id
                         GROUP BY a.id
                         HAVING lunch_cost < 30
                         ")

 #6.  Matt G. wants to keep track of food consumption per person, particularly who the "lightweights"
 #    are in the group.  Who's eating less than average on a cost basis?
 #    This requires a subquery to first determine the average cost of this meal, passing that value
 #    to the HAVING clause

 #Subquery: returns a single value for the average lunch cost for employees (those with valid ID num)
 #"SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost
 # FROM orders WHERE id != 'NA'")

 lower_than_average_cost <- sqldf("SELECT lastname, firstname,
                                  SUM(item_cost * quantity_ordered) as lunch_cost
                                  FROM orders a 
                                  LEFT JOIN employees b ON a.id= b.id
                                  WHERE a.id != 'NA'
                                  GROUP BY a.id
                                  HAVING lunch_cost < (
                                                        SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost
                                                        FROM orders 
                                                        WHERE id != 'NA'
                                                      )
                                ")
	#Load sqldf package, which will load all others necessary
	#By default, SQLite runs in background to do processing, could use others DB engines if you wanted

	library("sqldf")

	#Import employees data

	employees <- structure(list(id = 1:20, lastname = structure(c(5L, 14L, 13L, 15L, 6L, 16L, 9L, 1L, 3L, 12L, 10L, 8L, 12L, 3L, 11L, 13L, 10L, 7L, 2L, 4L), .Label = c("a", "b", "c", "f", "g", "h", "i", "j", "n", "o", "p", "r", "s", "t", "w", "z"), class = "factor"), firstname = structure(c(12L, 6L, 5L, 12L, 11L, 15L, 9L, 18L, 17L, 7L, 8L, 10L, 4L, 14L, 19L, 16L, 1L, 13L, 2L, 3L), .Label = c("chris", "dima", "drew", "eric", "hila", "jason", "jeremy", "joe", "jon", "jowanza", "lashanda", "matt", "michael", "michelle", "randy", "rudi", "solon", "stewart", "tim"), class = "factor"), gender = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("f", "m"), class = "factor")), .Names = c("id", "lastname", "firstname", "gender"), class = "data.frame", row.names = c(NA, -20L))


	#Import orders data - Matt G, Jason T, Matt W, and La Shanda H go to sushi lunch
	#taking a potential client with them

	orders <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 5L, 5L, 5L, NA, NA), item = structure(c(8L, 9L, 12L, 7L, 10L, 13L, 5L, 11L, 8L, 15L, 13L, 16L, 1L, 3L, 14L, 6L, 15L, 2L, 4L), .Label = c("california roll", "chicken teriaki hibachi", "cucumber roll", "diet coke", "edamame", "firecracker roll", "keystone roll", "playboy roll", "rockstar roll", "salmon sashimi", "salmon skin roll", "spider roll", "tuna sashimi", "unagi roll", "unobtanium roll", "yellowtail hand roll"), class = "factor"), quantity_ordered = c(1L, 1L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), item_cost = c(12, 10, 8, 25, 3, 2.5, 6, 8, 12, 35, 2.5, 7, 4, 3.5, 6.5, 9, 35, 7.95, 1.95)), .Names = c("id", "item", "quantity_ordered", "item_cost"), class = "data.frame", row.names = c(NA, -19L))


	##### Single Table operations ####

	#1 - Select all employees that are male

	male_employees <- sqldf("SELECT * FROM employees WHERE gender = 'm'")

	#2 - Get a count by first name

	name_counts <- sqldf("SELECT firstname, COUNT (firstname) as occurances FROM employees GROUP BY firstname")
	name_counts
	#3 - Get a count by first name, excluding non-employees

	name_counts_emponly <- sqldf("SELECT firstname, COUNT(firstname) as occurances
	FROM employees
	WHERE firstname != 'rudi'
	GROUP BY firstname")
	name_counts_emponly

	#4 - Use a case statement to define a new data column of california employees,
	# using "lower" to evaluate all names as lowercase to ensure case insensitivity

	employees_cali <- sqldf("SELECT *,
	CASE
	WHEN lower(firstname) = 'stewart' THEN 1
	WHEN lower(firstname) = 'hila' THEN 1
	WHEN lower(firstname) = 'jon' THEN 1
	WHEN lower(firstname) = 'solon' THEN 1
	ELSE 0
	END as cali_emp
	FROM employees
	")
	employees_cali

	#5 - Sort employees_cali by cali_emp descending, first name ascending (ascending is default)

	employees_cali_sorted <- sqldf("SELECT *,
	CASE
	WHEN lower(firstname) = 'stewart' THEN 1
	WHEN lower(firstname) = 'hila' THEN 1
	WHEN lower(firstname) = 'jon' THEN 1
	WHEN lower(firstname) = 'solon' THEN 1
	ELSE 0
	END as cali_emp
	FROM employees
	ORDER BY cali_emp DESC, firstname
	")

	##### Multi-Table operations ####

	#1. Left join employees and orders table (keep all records from employees table,
	# matching records from orders)

	left_join <- sqldf("SELECT *
	FROM employees a
	LEFT JOIN orders b ON a.id=b.id
	WHERE a.firstname != 'rudi'
	")

	#2. "Right join" isn't supported in sqldf package, but switching order of tables and left join
	# is functionally equivalent

	right_join_equiv <- sqldf("SELECT *
	FROM orders b
	LEFT JOIN employees a ON a.id=b.id
	")

	#3. Inner join...select only records that match both tables

	inner_join <- sqldf("SELECT *
	FROM employees a, orders b
	WHERE a.id=b.id
	")

	#4. Matt G. sees bill, wonders how bill can be so low!
	# Join orders to employees, find who is ordering items less than 10 dollars, sorted by lowest cost

	inexpensive_items <- sqldf("SELECT *
	FROM orders a
	LEFT JOIN employees b ON a.id= b.id
	WHERE item_cost < 10
	ORDER BY item_cost
	")

	#4a. Realizing some things are priced by piece, figure out who spent less than $20 on any one
	# type of food

	inexpensive_line_items <- sqldf("SELECT *,
	(item_cost * quantity_ordered) as item_level_cost
	FROM orders a
	LEFT JOIN employees b ON a.id= b.id
	WHERE item_level_cost < 20
	ORDER BY item_level_cost
	")

	#5. Realizing that even item level cost is wrong question, Matt G. wants to know whose total lunch < $30
	# Need to use GROUP BY to get totals by person, then use HAVING instead of WHERE because
	# of the use of the GROUP BY summary function (WHERE is a record level operator)

	lunch_under_30 <- sqldf("SELECT lastname, firstname,
	SUM(item_cost * quantity_ordered) as lunch_cost
	FROM orders a
	LEFT JOIN employees b ON a.id= b.id
	GROUP BY a.id
	HAVING lunch_cost < 30
	")

	#6. Matt G. wants to keep track of food consumption per person, particularly who the "lightweights"
	# are in the group. Who's eating less than average on a cost basis?
	# This requires a subquery to first determine the average cost of this meal, passing that value
	# to the HAVING clause

	#Subquery: returns a single value for the average lunch cost for employees (those with valid ID num)
	#"SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost
	# FROM orders WHERE id != 'NA'")

	lower_than_average_cost <- sqldf("SELECT lastname, firstname,
	SUM(item_cost * quantity_ordered) as lunch_cost
	FROM orders a
	LEFT JOIN employees b ON a.id= b.id
	WHERE a.id != 'NA'
	GROUP BY a.id
	HAVING lunch_cost < (
	SELECT SUM(item_cost * quantity_ordered)/COUNT(DISTINCT id) as avg_lunch_cost
	FROM orders
	WHERE id != 'NA'
	)
	")