viniciusmss · December 30, 2019 14:54
diff --git a/Quiz 2.txt b/Quiz 2.txt
 # Quiz 2 -- More R Coding...

 ## This quiz covers competencies you should know by the first day of class.
 ## The first ten questions are intended to be basic.
 ## The final five questions are more advanced.
 ## An answer key will be provided by Monday, Aug 12.

 ## NOTE: Most students find it helpful to code in R using R-Studio.
 ## This involves installing R, and installing R-Studio, and then 
 ## running R-Studio. Google it if you run into problems, and 
 ## if Google can't answer your questions, feel free to email me.

 # Load the Matching library, as described at the beginning of the prior quiz
 # (Quiz 1). If you've never installed this Matching library before,
 # you'll have to begin by doing so -- one way is to use "install.packages()",
 # but there are also 'point-and-click' ways to do it.
 # So---the first step is to install the library, and the next step is to load it.
 library(Matching)  # attach the package so the help page and data set below resolve

 # The "Matching" library comes with a data set called
 # GerberGreenImai which relates to a series of published papers.
 # You can see some of those papers if (after loading the library)
 # you type:
 ?GerberGreenImai

 # This command opens the help page for the data set. At the bottom of the help page,
 # you'll see a list of 4 relevant published papers.

 # To load the data set into working memory, type:
 data(GerberGreenImai)

 # To take a quick look at the data set, type:
 head(GerberGreenImai)

 # Question 1 (using "dim")
 # What are the dimensions of the data set (how many rows and columns)?
 dim(GerberGreenImai)  # rows: 10829 columns: 26

 # Question 2 (using "names")
 # What are the variable names?
 names(GerberGreenImai)
 # [1] "PERSONS"  "WARD"     "QUESTION" "MAILGRP"  "PHONEGRP" "PERSNGRP" "APPEAL"   "CONTACT"  "MAILINGS" "AGE"     
 # [11] "MAJORPTY" "VOTE96.0" "VOTE96.1" "MAILCALL" "VOTED98"  "PHNSCRPT" "DIS.MC"   "DIS.PHN"  "PHN.C"    "PHNTRT1" 
 # [21] "PHNTRT2"  "PHN.C1"   "PHN.C2"   "NEW"      "phone"    "AGE2"   

 # Question 3 (using "is.na")
 # Is there any missing data? If so, how many elements are missing 
 # and which rows contain the missing data?
 any(is.na(GerberGreenImai))  # FALSE, which means no missing data.
 sum(is.na(GerberGreenImai))  # how many elements are missing? Zero.
 # Rows holding at least one NA; an empty result confirms that no row has any.
 which(rowSums(is.na(GerberGreenImai)) > 0)

 # Question 4 (using the "mode" function, or "is.factor")
 # In this data set, each column is a variable (e.g., "AGE", "WARD", etc.). 
 # Which of these variables (if any) is a factor? 
 # vapply() yields one TRUE/FALSE per column (type-checked as a single logical),
 # so the factor columns can be picked out by name instead of scanning a list.
 is_factor_col <- vapply(GerberGreenImai, is.factor, logical(1))
 names(GerberGreenImai)[is_factor_col]  # WARD is a factor

 # Question 5 (using the "class" function)
 # Code a for loop that outputs column name and variable type, per the below format:

 # column name #1  variable type #1
 # column name #2  variable type #2
 # column name #3  variable type #3
 # etc., etc., etc.,  type of each 
 # Walk the columns by position and print "<name> : <class>" on a line of its own.
 gg_cols <- names(GerberGreenImai)
 for (pos in seq_along(gg_cols)) {
   cat(gg_cols[pos], ":", class(GerberGreenImai[[gg_cols[pos]]]), "\n")
 }

 # Question 6
 # What do you think the variable AGE2 represents? 
 # Looks like age squared divided by 100

 # Question 7
 # Which individuals (and how many) voted in 1998?
 # Row positions of everyone whose VOTED98 value equals 1.
 with(GerberGreenImai, which(VOTED98 == 1))
 # Total of the VOTED98 column, read here as the number of 1998 voters
 # (this script treats VOTED98 as a 1/0 flag).
 with(GerberGreenImai, sum(VOTED98))

 # Question 8
 # Which individuals (and how many) have age > 35 and did NOT vote in 1998?
 # Build the filter once, then reuse it for the positions and for the count.
 older_nonvoter <- with(GerberGreenImai, AGE > 35 & VOTED98 == 0)
 which(older_nonvoter)
 sum(older_nonvoter)

 # Question 9
 # What line of code eliminates all rows that satisfy the conditions of Question 8?
 # A row is kept when it fails either half of the Question 8 test (De Morgan's
 # law). The filtered data is only printed; GerberGreenImai itself is unchanged.
 keep_row <- with(GerberGreenImai, AGE <= 35 | VOTED98 != 0)
 GerberGreenImai[keep_row, ]
 # or
 # subset(GerberGreenImai, !(GerberGreenImai$AGE > 35 & GerberGreenImai$VOTED98 == 0))

 # Question 10
 # Append a column that records a "1" if Question 8's conditions are satisfied, 0 if not.
 # Congrats, you just created a dummy variable! Google it if you have questions.
 GerberGreenImai[["myfirstdummy"]] <- with(
   GerberGreenImai,
   as.numeric(AGE > 35 & VOTED98 == 0)
 )

 ### More advanced questions
 # Question 11
 # Write a function that performs the algorithm you identified in Question 6.
 # Apply the rule guessed in Question 6: square the age, then divide by 100.
 # `age` may be one number or a numeric vector; the result has the same length.
 algo1 <- function(age) {
   squared <- age^2
   squared / 100
 }
 # and then try algo1(9) 
 # or try algo1(20) or whatever...

 # Question 12
 # Run a regression w/ AGE2 and AGE as dependent and independent vars (respectively).
 # Do you get statistically significant results? Is there something causal going on?
 # Does the model make predictions that are not possible in the real world? Explain.
 # Least-squares fit with AGE2 as the response and AGE as the only predictor.
 lm.age <- lm(formula = AGE2 ~ AGE, data = GerberGreenImai)
 # Show the coefficient table and fit statistics.
 summary(object = lm.age)
 # We get statistically significant results. There's no causation proper here since AGE2
 # is calculated directly from AGE. The model does make impossible predictions since the intercept is negative.
 # If AGE is zero, AGE2 will be -24.9. 

 # Question 13 (hint: see the help page for "formula")
 # Consider the following code:
 # Linear model for PHN.C1, fitted on rows 1 through 1000 only.
 # I(PERSONS*VOTE96.1) wraps the product in I() so that `*` is evaluated as
 # ordinary multiplication of the two columns rather than as formula syntax.
 reg1 <- lm(PHN.C1 ~ PERSONS + VOTE96.1 + NEW + MAJORPTY + AGE + 
             WARD + I(PERSONS*VOTE96.1), data=GerberGreenImai[c(1:1000),])
 # you can then type something like "summary(reg1)"...

 # This regression predicts whether or not a subject received a get-out-the-vote
 # phone call based on the first 1000 observations. 

 # What is meant by I(PERSONS*VOTE96.1)? Use the resulting regression
 # model to make a prediction for individuals in rows 1001 and 1002. Do these 
 # predictions make sense? Why or why not?

 # It is an interaction term between the number of persons in the household and whether
 # they voted in 1996. It captures the joint effect of the two variables on the predicted
 # outcome that is not explained by either variable in isolation.
 # Take rows 1001 and 1002 and score them with the reg1 model.
 testdata <- GerberGreenImai[1001:1002, ]
 predict(object = reg1, newdata = testdata)
 # These predictions make sense if they are interpreted as
 # probabilities of having received a phone call.

 # Question 14
 # Consider the "WARD" variable. Observe the first 20 elements.
 # What happens when you perform arithmetic or statistical operations on these
 # elements? Why? Provide intuition for the result you obtain.
 # Show the first twenty WARD entries.
 GerberGreenImai[["WARD"]][1:20]
 # Averaging WARD: R warns "argument is not numeric or logical" and returns NA.
 mean(GerberGreenImai[["WARD"]])
 # R does not interpret the WARD variable as a number since it is coded as a factor.
 # Hence mean() issues a warning and returns NA instead of a numeric result.

 # Question 15
 # Write code that takes the "WARD" variable and creates dummy variables for
 # each individual ward. We can call this "dummy-fying" a categorical variable.
 # Add one indicator column per WARD level, named after the level itself.
 # Each column is stored as numeric 1/0 (1 = the row belongs to that ward), the
 # same coding used for `myfirstdummy` in Question 10, rather than TRUE/FALSE.
 for (lvl in levels(GerberGreenImai$WARD)) {
   GerberGreenImai[[lvl]] <- as.numeric(GerberGreenImai$WARD == lvl)
 }

 # Given your answer to Question 14, do you see why you would want to "dummy-fy" 
 # this variable prior to performing statistical analysis on it (and
 # variables like this one)?

 # We dummify since there is no sensible way to translate different categories
 # of a factor variable into numbers that can be statistically analyzed. When we 
 # dummify, we are able to isolate the effect of being part of a certain category or not.
	# Quiz 2 -- More R Coding...

	## This quiz covers competencies you should know by the first day of class.
	## The first ten questions are intended to be basic.
	## The final five questions are more advanced.
	## An answer key will be provided by Monday, Aug 12.

	## NOTE: Most students find it helpful to code in R using R-Studio.
	## This involves installing R, and installing R-Studio, and then
	## running R-Studio. Google it if you run into problems, and
	## if Google can't answer your questions, feel free to email me.

	# Load the Matching library, as described at the beginning of the prior quiz
	# (Quiz 1). If you've never installed this Matching library before,
	# you'll have to begin by doing so -- one way is to use "install.packages()",
	# but there are also 'point-and-click' ways to do it.
	# So---the first step is to install the library, and the next step is to load it.
	library(Matching)  # attach the package so the help page and data set below resolve

	# The "Matching" library comes with a data set called
	# GerberGreenImai which relates to a series of published papers.
	# You can see some of those papers if (after loading the library)
	# you type:
	?GerberGreenImai

	# This command opens the help page for the data set. At the bottom of the help page,
	# you'll see a list of 4 relevant published papers.

	# To load the data set into working memory, type:
	data(GerberGreenImai)

	# To take a quick look at the data set, type:
	head(GerberGreenImai)

	# Question 1 (using "dim")
	# What are the dimensions of the data set (how many rows and columns)?
	dim(GerberGreenImai) # rows: 10829 columns: 26

	# Question 2 (using "names")
	# What are the variable names?
	names(GerberGreenImai)
	# [1] "PERSONS" "WARD" "QUESTION" "MAILGRP" "PHONEGRP" "PERSNGRP" "APPEAL" "CONTACT" "MAILINGS" "AGE"
	# [11] "MAJORPTY" "VOTE96.0" "VOTE96.1" "MAILCALL" "VOTED98" "PHNSCRPT" "DIS.MC" "DIS.PHN" "PHN.C" "PHNTRT1"
	# [21] "PHNTRT2" "PHN.C1" "PHN.C2" "NEW" "phone" "AGE2"

	# Question 3 (using "is.na")
	# Is there any missing data? If so, how many elements are missing
	# and which rows contain the missing data?
	any(is.na(GerberGreenImai)) # FALSE, which means no missing data.
	sum(is.na(GerberGreenImai)) # how many elements are missing? Zero.
	# Rows holding at least one NA; an empty result confirms that no row has any.
	which(rowSums(is.na(GerberGreenImai)) > 0)

	# Question 4 (using the "mode" function, or "is.factor")
	# In this data set, each column is a variable (e.g., "AGE", "WARD", etc.).
	# Which of these variables (if any) is a factor?
	# vapply() yields one TRUE/FALSE per column (type-checked as a single logical),
	# so the factor columns can be picked out by name instead of scanning a list.
	is_factor_col <- vapply(GerberGreenImai, is.factor, logical(1))
	names(GerberGreenImai)[is_factor_col] # WARD is a factor

	# Question 5 (using the "class" function)
	# Code a for loop that outputs column name and variable type, per the below format:

	# column name #1 variable type #1
	# column name #2 variable type #2
	# column name #3 variable type #3
	# etc., etc., etc., type of each
	# Walk the columns by position and print "<name> : <class>" on a line of its own.
	gg_cols <- names(GerberGreenImai)
	for (pos in seq_along(gg_cols)) {
	  cat(gg_cols[pos], ":", class(GerberGreenImai[[gg_cols[pos]]]), "\n")
	}

	# Question 6
	# What do you think the variable AGE2 represents?
	# Looks like age squared divided by 100

	# Question 7
	# Which individuals (and how many) voted in 1998?
	# Row positions of everyone whose VOTED98 value equals 1.
	with(GerberGreenImai, which(VOTED98 == 1))
	# Total of the VOTED98 column, read here as the number of 1998 voters
	# (this script treats VOTED98 as a 1/0 flag).
	with(GerberGreenImai, sum(VOTED98))

	# Question 8
	# Which individuals (and how many) have age > 35 and did NOT vote in 1998?
	# Build the filter once, then reuse it for the positions and for the count.
	older_nonvoter <- with(GerberGreenImai, AGE > 35 & VOTED98 == 0)
	which(older_nonvoter)
	sum(older_nonvoter)

	# Question 9
	# What line of code eliminates all rows that satisfy the conditions of Question 8?
	# A row is kept when it fails either half of the Question 8 test (De Morgan's
	# law). The filtered data is only printed; GerberGreenImai itself is unchanged.
	keep_row <- with(GerberGreenImai, AGE <= 35 | VOTED98 != 0)
	GerberGreenImai[keep_row, ]
	# or
	# subset(GerberGreenImai, !(GerberGreenImai$AGE > 35 & GerberGreenImai$VOTED98 == 0))

	# Question 10
	# Append a column that records a "1" if Question 8's conditions are satisfied, 0 if not.
	# Congrats, you just created a dummy variable! Google it if you have questions.
	GerberGreenImai[["myfirstdummy"]] <- with(
	  GerberGreenImai,
	  as.numeric(AGE > 35 & VOTED98 == 0)
	)

	### More advanced questions
	# Question 11
	# Write a function that performs the algorithm you identified in Question 6.
	# Apply the rule guessed in Question 6: square the age, then divide by 100.
	# `age` may be one number or a numeric vector; the result has the same length.
	algo1 <- function(age) {
	  squared <- age^2
	  squared / 100
	}
	# and then try algo1(9)
	# or try algo1(20) or whatever...

	# Question 12
	# Run a regression w/ AGE2 and AGE as dependent and independent vars (respectively).
	# Do you get statistically significant results? Is there something causal going on?
	# Does the model make predictions that are not possible in the real world? Explain.
	# Least-squares fit with AGE2 as the response and AGE as the only predictor.
	lm.age <- lm(formula = AGE2 ~ AGE, data = GerberGreenImai)
	# Show the coefficient table and fit statistics.
	summary(object = lm.age)
	# We get statistically significant results. There's no causation proper here since AGE2
	# is calculated directly from AGE. The model does make impossible predictions since the intercept is negative.
	# If AGE is zero, AGE2 will be -24.9.

	# Question 13 (hint: see the help page for "formula")
	# Consider the following code:
	# Linear model for PHN.C1, fitted on rows 1 through 1000 only.
	# I(PERSONS*VOTE96.1) wraps the product in I() so that `*` is evaluated as
	# ordinary multiplication of the two columns rather than as formula syntax.
	reg1 <- lm(PHN.C1 ~ PERSONS + VOTE96.1 + NEW + MAJORPTY + AGE +
	WARD + I(PERSONS*VOTE96.1), data=GerberGreenImai[c(1:1000),])
	# you can then type something like "summary(reg1)"...

	# This regression predicts whether or not a subject received a get-out-the-vote
	# phone call based on the first 1000 observations.

	# What is meant by I(PERSONS*VOTE96.1)? Use the resulting regression
	# model to make a prediction for individuals in rows 1001 and 1002. Do these
	# predictions make sense? Why or why not?

	# It is an interaction term between the number of persons in the household and whether
	# they voted in 1996. It captures the joint effect of the two variables on the predicted
	# outcome that is not explained by either variable in isolation.
	# Take rows 1001 and 1002 and score them with the reg1 model.
	testdata <- GerberGreenImai[1001:1002, ]
	predict(object = reg1, newdata = testdata)
	# These predictions make sense if they are interpreted as
	# probabilities of having received a phone call.

	# Question 14
	# Consider the "WARD" variable. Observe the first 20 elements.
	# What happens when you perform arithmetic or statistical operations on these
	# elements? Why? Provide intuition for the result you obtain.
	# Show the first twenty WARD entries.
	GerberGreenImai[["WARD"]][1:20]
	# Averaging WARD: R warns "argument is not numeric or logical" and returns NA.
	mean(GerberGreenImai[["WARD"]])
	# R does not interpret the WARD variable as a number since it is coded as a factor.
	# Hence mean() issues a warning and returns NA instead of a numeric result.

	# Question 15
	# Write code that takes the "WARD" variable and creates dummy variables for
	# each individual ward. We can call this "dummy-fying" a categorical variable.
	# Add one indicator column per WARD level, named after the level itself.
	# Each column is stored as numeric 1/0 (1 = the row belongs to that ward), the
	# same coding used for `myfirstdummy` in Question 10, rather than TRUE/FALSE.
	for (lvl in levels(GerberGreenImai$WARD)) {
	  GerberGreenImai[[lvl]] <- as.numeric(GerberGreenImai$WARD == lvl)
	}

	# Given your answer to Question 14, do you see why you would want to "dummy-fy"
	# this variable prior to performing statistical analysis on it (and
	# variables like this one)?

	# We dummify since there is no sensible way to translate different categories
	# of a factor variable into numbers that can be statistically analyzed. When we
	# dummify, we are able to isolate the effect of being part of a certain category or not.