######### RUN THE CODE BELOW IN R. RSTUDIO IS THE RECOMMENDED IDE. BOTH R AND RSTUDIO ARE FREE.
######### QUESTIONS SHOULD BE POSTED TO PIAZZA
######### THE ACTUAL ASSIGNMENT BEGINS ON LINE 71
### Multilateral Development Institution Data
foo <- read.csv("https://tinyurl.com/yb4phxx8") # read in the data
# column names
names(foo)
# dimensions of the data set
dim(foo)
# quick look at the data structure
head(foo)
# one thing to be very careful with (in this data set) is the use of dates. 8 columns involve dates.
# take note of the columns representing calendar dates
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
# these columns need some tweaking--I want to address missing values, calling the blank (empty)
# elements "NA" instead of leaving them blank, and I wish to tell R these are "Date" objects.
for (i in date.columns) # this "for loop" only loops through the "date.columns" -- no other columns.
{
  # identify which values are missing in the "i"th column of the foo data set
  which_values_are_missing <- which(as.character(foo[, i]) == "")
  # those values that are missing (blank) in the "i"th column are replaced by <NA>
  # because R knows how to handle "NA" -- NA means something special in R -- blanks are handled
  # more unpredictably (which is bad).
  foo[which_values_are_missing, i] <- NA
  # last step -- replace each of these columns (which is structured as a column of "factor" values)
  # with a column of dates -- i.e., convert them to an object of "class" = Date. They are dates, after all.
  # If you convert them to the Date class, R will know they are dates and you can manipulate
  # them in a simple, straightforward way. Otherwise, you won't be able to easily manipulate them
  # arithmetically. E.g., for simple Date operations, see lines 48-58 below...
  # **By the way, if you don't understand what a "factor" is in R, you should Google it.**
  foo[, i] <- as.Date(as.character(foo[, i]))
}
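# Aside: a quick illustration (with toy values) of why the as.character() step above
# matters. A factor stores its strings as integer codes, so converting a factor
# directly to Date can misbehave in older versions of R; going through character is
# the safe route.
f <- factor(c("1968-03-13", "1968-07-03"))
as.integer(f)             # the underlying integer codes (1 2), not dates
as.Date(as.character(f))  # the correct conversion to Date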
# Now R knows that these columns consist of dates.
# for example... Replicate this yourself...
foo[3,12]
# [1] "1968-03-13"
foo[4,12]
# [1] "1968-07-03"
foo[3,12] - foo[4,12]
# Time difference of -112 days
# Also, one additional helpful hint... How to eliminate rows with NAs...
# The "is.na" function--for more info, Google it or type ?is.na at the R command prompt in the console.
which.have.NAs <- which(is.na(foo$Rating)) # for which rows is the claim "is.na" a TRUE claim?
# Then, if you wanted to, e.g., remove all those rows, retaining only the rows with ratings...
new_foo <- foo[-which.have.NAs, ]
# Notice I called this tweaked data set "new_foo" instead of overwriting the original data set...
# It's a bit safer to do this, in case I decide I want to quickly revert to the original data set.
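# For reference, an equivalent one-liner that keeps only the rows with non-missing ratings:
# new_foo <- foo[!is.na(foo$Rating), ]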
###########################################################################
### ASSIGNMENT 1 -- You may want to read ALL the questions before you begin.
### NOTE: FOR ALL QUESTIONS BELOW, ONLY CONSIDER PROJECTS WITH
### non-missing "Circulation.Date" >= 2009-01-01.
### EXCLUDE ALL OTHER PROJECTS FROM YOUR ANALYSIS.
### YOU MUST provide a link to your R code. ------ DON'T FORGET TO DO THIS!!!!!!!!!!!!
# Take note of the column names: i.e., you can type: names(foo)
# fyi: the column called "Rating" is the success rating at completion. 0 = lowest, 3 = highest.
noNA_foo <- new_foo[!is.na(new_foo$CirculationDate), ]
df <- noNA_foo[which(noNA_foo$CirculationDate >= as.Date("2009-01-01")), ]
# Check:
summary(df$CirculationDate)
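# Optional sanity check: confirm that no NA dates or pre-2009 projects remain.
stopifnot(!any(is.na(df$CirculationDate)),
          min(df$CirculationDate) >= as.Date("2009-01-01"))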
# (1) ... (see prompt in the assignment)
# (a) Is this claim true? Explain.
# Some projects have no original completion dates.
# We need to disregard them.
mask <- which(is.na(df$OriginalCompletionDate))
df_withDates <- df[-mask,]
expected_duration <- mean(df_withDates$OriginalCompletionDate) - mean(df_withDates$ApprovalDate)
expected_duration
# Time difference of 650.9313 days -- about 1.78 years, i.e., roughly 80 days short of
# two full years. The claim is only partially true, depending on how strict your tolerance is.
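# For a quick sanity check, the same figure in years (365.25 accounts for leap years):
as.numeric(expected_duration) / 365.25  # roughly 1.78 years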
# (b)
# Create a circulation year column (as a number, so it can be used directly as a plot axis)
df_withDates$CirculationYear <- as.numeric(format(df_withDates$CirculationDate, "%Y"))
# Are there rows with NAs in the Revised Completion Date column?
sum(is.na(df_withDates$RevisedCompletionDate)) # No
# And a delay column
df_withDates$Delay <- df_withDates$RevisedCompletionDate - df_withDates$OriginalCompletionDate
# The simplest way to do it probably involves a for loop in which you iterate through
# circulation years (a sketch of that approach follows the dplyr version below). I'm
# providing a fancier solution first so that you can get used to reading more complex
# R code. The dplyr library provides a data processing pipeline that is very similar
# to data analysis in SQL. DataCamp has tons of resources on it.
library(dplyr)
delayByYear <- df_withDates %>% # %>% means "pass what is to the left to the function that follows"
  group_by(CirculationYear) %>%
  summarise(mean.delay = mean(Delay),
            median.delay = median(Delay),
            IQR.delay = quantile(Delay, 0.75) - quantile(Delay, 0.25))
delayByYear
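# For reference, a minimal base-R sketch of the for-loop approach mentioned above,
# computing the same three statistics per circulation year:
for (yr in sort(unique(df_withDates$CirculationYear))) {
  d <- as.numeric(df_withDates$Delay[df_withDates$CirculationYear == yr])
  cat(yr, "- mean:", round(mean(d), 1), " median:", median(d), " IQR:", IQR(d), "\n")
}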
# Not the prettiest plot, but you get the point.
plot(delayByYear$CirculationYear, delayByYear$mean.delay,
     pch = 15, col = "red", ylim = c(100, 800),
     xlab = "Circulation Year", ylab = "Delay (days)", main = "Project Delay")
points(delayByYear$CirculationYear, delayByYear$median.delay, pch = 16, col = "blue")
points(delayByYear$CirculationYear, delayByYear$IQR.delay, pch = 17, col = "green")
legend("bottomleft", pch = c(15, 16, 17),
       col = c("red", "blue", "green"),
       legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
grid(nx = NA, ny = NULL)
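# If you have the ggplot2 package installed, here is a sketch of the same plot in
# ggplot2 (entirely optional):
library(ggplot2)
ggplot(delayByYear, aes(x = CirculationYear)) +
  geom_point(aes(y = as.numeric(mean.delay), colour = "Mean Delay")) +
  geom_point(aes(y = as.numeric(median.delay), colour = "Median Delay")) +
  geom_point(aes(y = as.numeric(IQR.delay), colour = "IQR of Delays")) +
  labs(x = "Circulation Year", y = "Delay (days)", title = "Project Delay", colour = NULL)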
# RESPONSE:
# Project delays have fluctuated over the years. The average delay started at 658 days
# in 2009, decreased to 503 days by 2013, rose again to 627 days in 2015, and came back
# down to 566 days in 2018. The median delay also fluctuates, but tends to be well below
# the mean. The IQR follows a trend similar to the mean's.
# (c)
# I did a year-by-year analysis, but this was not necessary.
# Create an actual duration column
df_withDates$ActualDuration <- df_withDates$RevisedCompletionDate - df_withDates$ApprovalDate
# Actual duration statistics
mean(df_withDates$ActualDuration)
median(df_withDates$ActualDuration)
quantile(df_withDates$ActualDuration)
IQR(df_withDates$ActualDuration)
df_withDates$ExpectedDuration <- df_withDates$OriginalCompletionDate - df_withDates$ApprovalDate
mean(df_withDates$ExpectedDuration)
median(df_withDates$ExpectedDuration)
quantile(df_withDates$ExpectedDuration)
IQR(df_withDates$ExpectedDuration)
range(df_withDates$ExpectedDuration)
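# A compact way to eyeball both distributions side by side (in plain days):
summary(as.numeric(df_withDates$ActualDuration))
summary(as.numeric(df_withDates$ExpectedDuration))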
ActualDurationByYear <- df_withDates %>%
  group_by(CirculationYear) %>%
  summarise(mean.duration = mean(ActualDuration),
            median.duration = median(ActualDuration),
            IQR.duration = quantile(ActualDuration, 0.75) - quantile(ActualDuration, 0.25))
ActualDurationByYear
plot(ActualDurationByYear$CirculationYear,
     ActualDurationByYear$mean.duration, pch = 15, col = "red", ylim = c(300, 1300),
     xlab = "Circulation Year", ylab = "Duration (days)", main = "Actual Project Duration")
points(ActualDurationByYear$CirculationYear,
       ActualDurationByYear$median.duration, pch = 16, col = "blue")
points(ActualDurationByYear$CirculationYear,
       ActualDurationByYear$IQR.duration, pch = 17, col = "green")
legend("bottomleft", pch = c(15, 16, 17),
       col = c("red", "blue", "green"),
       legend = c("Mean", "Median", "IQR"))
grid(nx = NA, ny = NULL)
# RESPONSE:
# A year-by-year analysis analogous to the one shown in 1(b).
# (2)
# We want projects with a circulation date of 2010-01-01 or later
df2010 <- noNA_foo[which(noNA_foo$CirculationDate >= as.Date("2010-01-01")), ]
# The most straightforward way is...
print("Distribution of Project Ratings")
prop.table(table(df2010$Rating)) * 100
# 2.4% of projects received a rating of 0, 12.9% a rating of 1, 71% a rating of 2, and
# 13% a rating of 3. Hence, the majority of projects receive a rating of 2.
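# An optional quick visual of the same distribution:
barplot(prop.table(table(df2010$Rating)) * 100,
        xlab = "Rating", ylab = "% of projects",
        main = "Distribution of Project Ratings")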
# (3)
# Only PATA projects
df_PATA <- df2010[which(df2010$Type == "PATA"),]
print("Distribution of Project Ratings (PATA only)")
table(df_PATA$Rating) / length(df_PATA$Rating) * 100
# 1% of projects received a rating of 0, 8% a rating of 1, 72% a rating of 2, and 19% a rating of 3.
# We see that PATA projects are less often rated 0 or 1 and more often rated 3.
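# For a cleaner comparison, the same breakdown for non-PATA projects
# (df_nonPATA is a name introduced here just for illustration):
df_nonPATA <- df2010[which(df2010$Type != "PATA"), ]
table(df_nonPATA$Rating) / length(df_nonPATA$Rating) * 100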
# (4)
# There are several ways to do it. Here's a way:
df_TopQuant <- df[which(df$RevisedAmount >= quantile(df$RevisedAmount, 0.9)),]
df_BottomQuant <- df[which(df$RevisedAmount <= quantile(df$RevisedAmount, 0.1)),]
# Compare the ratings
(table(df_TopQuant$Rating) / length(df_TopQuant$Rating) * 100) -
  (table(df_BottomQuant$Rating) / length(df_BottomQuant$Rating) * 100)
# The differences seem to be quite small (at most about 2 percentage points).
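# Worth checking the group sizes before reading too much into small differences:
nrow(df_TopQuant); nrow(df_BottomQuant)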
# Compare other features
# I will look at counts instead of percentages because, with more categories,
# percentages can be misleading (e.g., a 100% difference that stems from a single
# project of a given type).
# Dept
table(df_TopQuant$Dept) - table(df_BottomQuant$Dept)
# More projects in the top quantile are within the SDCC, whereas
# more projects in the bottom quantile are in the SERD, ERCD and SARD departments.
# Cluster
(table(df_TopQuant$Cluster) / length(df_TopQuant$Cluster) * 100) -
  (table(df_BottomQuant$Cluster) / length(df_BottomQuant$Cluster) * 100)
# A few significant differences percentage-wise, but if you look at the counts you will
# see that there's a lot of missing data here, so not very conclusive.
# Country
(table(df_TopQuant$Country) / length(df_TopQuant$Country) * 100) -
  (table(df_BottomQuant$Country) / length(df_BottomQuant$Country) * 100)
# Several differences here.
# Hence, the budget seems to be uncorrelated with the rating. However, it might be
# associated with other project characteristics, such as country or department.
# Nevertheless, we would not be able to draw causal conclusions, given that the data
# are observational and neither observable nor unobservable characteristics are
# balanced across the "treatment" and "control" groups.
# (5)
# (a) decision problem or objective?
# R: Minimize project completion delays
# (b) lever or levers?
# R: Budget allocation (we may want to consider levers that describe how the budget is spent as well)
# (c) ideal RCT design?
# R: Allocate budgets randomly, which will in expectation make projects with larger and
# smaller budgets comparable (their other characteristics will overlap), and then measure
# the difference in delays.
# (d) dependent variable(s) and independent variable(s) in the model?
# R: The independent variable will be the budget allocated, whereas the dependent variable
# will be the project delay.
# (e) And---Why would running RCTs and modeling/optimizing over RCT results be preferable
# to using (observational, non-RCT) "foo" data?
# R: Because in observational data, projects with higher and lower budget allocations
# are not guaranteed to be comparable in expectation. For example, such projects may be
# deployed in different regions of the world, which might itself affect project delays.
# Hence, we cannot infer that a difference in outcomes is due to the difference in
# budget, given that there are confounding variables.