# Question 1a ----
# Load the project data set and take a first look at its structure.
foo <- read.csv("https://tinyurl.com/yb4phxx8")
names(foo)
dim(foo)
head(foo)

# Column positions that are converted to Date below.
# NOTE(review): positional indices break if the CSV layout changes; consider
# switching to column names once they are confirmed.
data.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)

# Empty strings are turned into NA first, then each column is parsed with
# as.Date().
foo[data.columns] <- lapply(foo[data.columns], function(column) {
  column <- as.character(column)
  column[column %in% ""] <- NA_character_
  as.Date(column)
})
head(foo)

# Keep projects circulated on or after 2008-01-01. which() drops rows whose
# CirculationDate is NA instead of turning them into all-NA rows.
latest_foo <- foo[which(foo$CirculationDate >= as.Date("2008-01-01")), ]
head(latest_foo)

# Drop rows without an original completion date. A logical mask is used
# because df[-which(cond), ] returns zero rows when nothing matches.
new_foo <- latest_foo[!is.na(latest_foo$OriginalCompletionDate), ]
head(new_foo)

# Planned duration: approval date to original completion date (difftime, days).
project_duration <- new_foo$OriginalCompletionDate - new_foo$ApprovalDate
mean(project_duration, na.rm = TRUE)
median(project_duration, na.rm = TRUE)
quantile(project_duration, na.rm = TRUE)

# Mean planned duration (in days) per circulation year. Grouping on the year
# replaces one hand-written subset per calendar year.
circulation_year <- format(new_foo$CirculationDate, "%Y")
mean_duration_by_year <- tapply(
  as.numeric(project_duration, units = "days"),
  circulation_year,
  mean,
  na.rm = TRUE
)
mean_duration_by_year

# One row per circulation year.
duration_days <- data.frame(
  year = as.integer(names(mean_duration_by_year)),
  mean_duration_days = as.vector(mean_duration_by_year)
)
duration_days

# Question 1b ----
# Compare the planned duration with the duration implied by the revised
# completion date.
project_duration <- new_foo$OriginalCompletionDate - new_foo$ApprovalDate
# abs() is kept from the original script: the gap between approval and revised
# completion is treated as a magnitude regardless of sign.
actual_duration <- abs(new_foo$RevisedCompletionDate - new_foo$ApprovalDate)
difference <- actual_duration - project_duration
difference
mean(difference, na.rm = TRUE)
median(difference, na.rm = TRUE)
quantile(difference, na.rm = TRUE)

# Helpers for Questions 2 and 3 ----

# Percentage of entries in `ratings` equal to each value in `rating_levels`.
# The denominator is the full length of `ratings`, NA entries included.
rating_shares <- function(ratings, rating_levels = 0:3) {
  counts <- vapply(
    rating_levels,
    function(level) sum(ratings == level, na.rm = TRUE),
    numeric(1)
  )
  counts / length(ratings) * 100
}

# Draw a pie chart of the rating shares, labelled with rounded percentages
# that are renormalised to the plotted slices. Returns the slices invisibly.
plot_rating_pie <- function(ratings, title) {
  slices <- rating_shares(ratings)
  pct <- round(slices / sum(slices) * 100)
  lbls <- paste0(c("0-rated", "1-rated", "2-rated", "3-rated"), " ", pct, "%")
  pie(slices, labels = lbls, col = rainbow(length(lbls)), main = title)
  invisible(slices)
}

# Question 2 ----
slices <- plot_rating_pie(latest_foo$Rating, "How the projects were rated")

# Question 3 ----
# Exclude PPTA projects. %in% never yields NA, so rows with a missing Type are
# kept, and the result is not emptied when there are no PPTA rows at all.
latest_foo_no_PPTA <- latest_foo[!(latest_foo$Type %in% "PPTA"), ]
slices <- plot_rating_pie(
  latest_foo_no_PPTA$Rating,
  "How the non-PPTA projects were rated"
)

# Question 4 ----
quantile(latest_foo$RevisedAmount, na.rm = TRUE)

# NOTE(review): 0.4 and 1 are presumably the 25% and 75% quantiles read off the
# output above -- confirm against the data before relying on them.
bottom_cutoff <- 0.4
top_cutoff <- 1
bottom25 <- latest_foo[which(latest_foo$RevisedAmount <= bottom_cutoff), ]
top25 <- latest_foo[which(latest_foo$RevisedAmount >= top_cutoff), ]

# Remove unrated projects from both groups so the two means are comparable.
bottom25_no_NA <- bottom25[!is.na(bottom25$Rating), ]
top25_no_NA <- top25[!is.na(top25$Rating), ]
mean(bottom25_no_NA$Rating)
mean(top25_no_NA$Rating)

# Group indicator: 0 = bottom group, 1 = top group. The group sizes come from
# the data, so the indicator always lines up with the ratings vector.
variables <- rep(c(0, 1), times = c(nrow(bottom25_no_NA), nrow(top25_no_NA)))
variables
ratings <- c(bottom25_no_NA$Rating, top25_no_NA$Rating)
ratings

x <- variables # binary variable (0/1)
y <- ratings   # ratings of the projects
plot(
  x, y,
  pch = 19, col = "blue", cex = 0.5,
  main = "The Effect of Budget Sizing on Project Ratings",
  xlab = "Budget Sizing", ylab = "Project Ratings",
  ylim = c(0, 3), xaxt = "n"
)
axis(side = 1, at = c(0, 1))

regression_1 <- lm(y ~ x)
summary(regression_1)
abline(regression_1, col = "red", lwd = 3) # regression line