# Question 1a ----
# Load the project data, convert the text date columns to Date, and keep the
# projects circulated from 2008 onwards that have an original completion date.

foo <- read.csv("https://tinyurl.com/yb4phxx8")

names(foo)
dim(foo)
head(foo)

# Positions of the columns that are parsed as dates below.
data.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
for (i in data.columns) {
  date_text <- as.character(foo[, i])
  # Empty strings stand for missing dates; make them NA before parsing.
  # %in% never yields NA, so values that are already NA are left untouched.
  date_text[date_text %in% ""] <- NA
  foo[, i] <- as.Date(date_text)
}
head(foo)

# which() drops NA comparisons, so rows without a circulation date are
# excluded here as well.
which.after.2008 <- which(foo$CirculationDate >= as.Date("2008-01-01"))
latest_foo <- foo[which.after.2008, ]
head(latest_foo)

# Row positions with a missing original completion date (kept for reference).
which.have.NAs <- which(is.na(latest_foo$OriginalCompletionDate))
# Filter with a logical mask rather than latest_foo[-which.have.NAs, ]:
# a negative index built from an empty which() selects zero rows, which would
# silently discard every project when no date is missing.
new_foo <- latest_foo[!is.na(latest_foo$OriginalCompletionDate), ]
head(new_foo)

# Planned duration of every project: original completion date minus approval
# date (a difftime in days).
project_duration <- new_foo$OriginalCompletionDate - new_foo$ApprovalDate
mean(project_duration)
median(project_duration)
quantile(project_duration)

# Group the projects by the calendar year of their circulation date.
# Fixing the factor levels keeps one (possibly empty) group per year from
# 2008 to 2018 and drops projects circulated outside that range.
years <- 2008:2018
circulation_year <- factor(
  format(new_foo$CirculationDate, "%Y"),
  levels = as.character(years)
)
just_by_year <- split(new_foo, circulation_year)

# Planned duration of the projects in each year.
project_duration_by_year <- lapply(
  just_by_year,
  function(projects) projects$OriginalCompletionDate - projects$ApprovalDate
)

# Mean planned duration per circulation year, printed with the year as label.
mean_duration_by_year <- lapply(project_duration_by_year, mean)
print(mean_duration_by_year)

# Keep the per-year objects available under their original names
# (just2008 ... just2018, project_duration_2008 ... project_duration_2018)
# so that code relying on them keeps working.
invisible(list2env(
  setNames(just_by_year, paste0("just", years)),
  envir = environment()
))
invisible(list2env(
  setNames(project_duration_by_year, paste0("project_duration_", years)),
  envir = environment()
))

# One-row data frame of the yearly means; the column names match the ones
# data.frame() derived from the former mean(project_duration_<year>) calls.
duration_days <- do.call(
  data.frame,
  setNames(
    mean_duration_by_year,
    paste0("mean.project_duration_", years, ".")
  )
)


# Question 1b ----
# Compare the planned duration of each project with the duration implied by
# its revised completion date.

# Planned duration: original completion date minus approval date.
project_duration <- new_foo$OriginalCompletionDate - new_foo$ApprovalDate
# Actual duration: revised completion date minus approval date. Subtracting in
# this order (instead of abs(approval - revised)) keeps the sign, so a revised
# completion that precedes approval is not turned into a positive duration.
actual_duration <- new_foo$RevisedCompletionDate - new_foo$ApprovalDate
# Delay of each project; algebraically revised minus original completion date.
difference <- actual_duration - project_duration
difference
mean(difference)
median(difference)
quantile(difference)

# Question 2 ----
# Pie chart of how the projects in latest_foo were rated (ratings 0 to 3).

all_ratings <- latest_foo$Rating
rating_levels <- 0:3

# Percentage of all rows (rows with a missing rating included in the
# denominator) that carry each rating.
share_by_level <- vapply(
  rating_levels,
  function(level) {
    length(which(all_ratings == level)) / length(all_ratings) * 100
  },
  numeric(1)
)
zero <- share_by_level[1]
one <- share_by_level[2]
two <- share_by_level[3]
three <- share_by_level[4]

slices <- c(zero, one, two, three)
# Label percentages are rescaled so they refer to the rated projects only.
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(
  c("0-rated", "1-rated", "2-rated", "3-rated"),
  " ", pct, "%"
)
pie(
  slices,
  labels = lbls,
  col = rainbow(length(lbls)),
  main = "How the projects were rated"
)

# Question 3 ----
# Same rating pie chart as before, restricted to projects whose Type is not
# "PPTA".

# Row positions of the PPTA projects (kept for reference).
PPTA <- which(latest_foo$Type == "PPTA")
# Drop them with a logical mask rather than latest_foo[-PPTA, ]: a negative
# index built from an empty which() selects zero rows, which would discard
# every project if no PPTA row were present. %in% treats a missing Type as
# "not PPTA", so such rows are kept, as before.
latest_foo_no_PPTA <- latest_foo[!(latest_foo$Type %in% "PPTA"), ]

# Percentage of the remaining rows (rows with a missing rating included in the
# denominator) that carry each rating.
n_no_PPTA <- length(latest_foo_no_PPTA$Rating)
zero <- length(which(latest_foo_no_PPTA$Rating == 0)) / n_no_PPTA * 100
one <- length(which(latest_foo_no_PPTA$Rating == 1)) / n_no_PPTA * 100
two <- length(which(latest_foo_no_PPTA$Rating == 2)) / n_no_PPTA * 100
three <- length(which(latest_foo_no_PPTA$Rating == 3)) / n_no_PPTA * 100

slices <- c(zero, one, two, three)
lbls <- c("0-rated", "1-rated", "2-rated", "3-rated")
# Label percentages are rescaled so they refer to the rated projects only.
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(lbls, " ", pct, "%") # add percentages to the labels
pie(slices, labels = lbls, col = rainbow(length(lbls)),
    main = "How the non-PPTA projects were rated")


# Question 4 ----
# Compare the ratings of small-budget and large-budget projects and regress
# the rating on a 0/1 budget-group indicator.

quantile(latest_foo$RevisedAmount)

# Budget cutoffs for the two groups.
# NOTE(review): presumably the 25% and 75% quantiles printed above - confirm,
# or derive them with quantile(latest_foo$RevisedAmount, c(0.25, 0.75)).
bottom_cutoff <- 0.4
top_cutoff <- 1

bottom25 <- latest_foo[latest_foo$RevisedAmount <= bottom_cutoff, ]
top25 <- latest_foo[latest_foo$RevisedAmount >= top_cutoff, ]

# Row positions of unrated projects in the bottom group (kept for reference).
which.are.NAs <- which(is.na(bottom25$Rating))
# Remove unrated projects from both groups with logical masks. A negative
# index built from an empty which() would select zero rows, and leaving NA
# ratings in the top group would make its mean NA.
bottom25_no_NA <- bottom25[!is.na(bottom25$Rating), ]
top25_no_NA <- top25[!is.na(top25$Rating), ]
mean(bottom25_no_NA$Rating)
mean(top25_no_NA$Rating)

# Group indicator: 0 represents bottom25, 1 represents top25.
# The lengths come from the data so they always match `ratings` below.
x_var <- rep(0, nrow(bottom25_no_NA))
y_var <- rep(1, nrow(top25_no_NA))
variables <- c(x_var, y_var)
variables

ratings <- c(bottom25_no_NA$Rating, top25_no_NA$Rating)
ratings

x <- variables # binary variable (0/1)
y <- ratings   # ratings of the projects

plot(x, y,
     pch = 19, col = "blue", cex = 0.5,
     main = "The Effect of Budget Sizing on Project Ratings",
     xlab = "Budget Sizing", ylab = "Project Ratings",
     ylim = c(0, 3),
     xaxt = "n") # x axis is drawn manually below

# Only the two group codes are meaningful tick positions.
axis(side = 1, at = c(0, 1))

regression_1 <- lm(y ~ x)
summary(regression_1)

abline(regression_1, col = "red", lwd = 3) # regression line