Created
January 25, 2019 17:19
-
-
Save steven-tey/8e08842e84b95bfc41db97e91301616d to your computer and use it in GitHub Desktop.
CS112 Assignment 1 - R Competency and The Drivetrain Approach to Decision Making
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Question 1a
# Load the project data set and take a first look at its structure.
foo <- read.csv("https://tinyurl.com/yb4phxx8")
names(foo)
dim(foo)
head(foo)

# Positions of the columns that are converted from text to Date below.
data.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)

# Convert all of those columns in one pass. Empty strings are turned into NA
# first so that as.Date() only ever sees date text or NA.
foo[data.columns] <- lapply(foo[data.columns], function(raw_column) {
  date_text <- as.character(raw_column)
  date_text[date_text %in% ""] <- NA
  as.Date(date_text)
})
head(foo)
# Keep only the projects circulated on or after 1 January 2008. which() drops
# rows whose circulation date is NA.
which.after.2008 <- which(foo$CirculationDate >= as.Date("2008-01-01"))
latest_foo <- foo[which.after.2008, ]
head(latest_foo)

# Drop the projects that have no original completion date.
# A logical mask is used instead of latest_foo[-which.have.NAs, ]: when no
# date is missing, which() returns integer(0) and a negative empty index
# selects zero rows, which would silently discard the whole data set.
has_completion_date <- !is.na(latest_foo$OriginalCompletionDate)
which.have.NAs <- which(!has_completion_date)
new_foo <- latest_foo[has_completion_date, ]
head(new_foo)
# Planned duration of every project: original completion date minus approval
# date. Subtracting two Date columns gives a difftime in days.
project_duration <- with(new_foo, OriginalCompletionDate - ApprovalDate)

# Summary statistics of the planned duration.
mean(project_duration)
median(project_duration)
quantile(project_duration)
# Mean planned project duration per circulation year, 2008 to 2018.
# The per-year work is done once over `report_years` instead of being written
# out separately for each of the eleven years.
report_years <- 2008:2018
circulation_year <- as.integer(format(new_foo$CirculationDate, "%Y"))

# One data frame of projects per calendar year of circulation. which() skips
# rows with an NA circulation date instead of turning them into all-NA rows.
projects_by_year <- lapply(report_years, function(yr) {
  new_foo[which(circulation_year == yr), ]
})
names(projects_by_year) <- paste0("just", report_years)

# Planned duration (original completion date minus approval date) of each
# year's projects.
duration_by_year <- lapply(projects_by_year, function(projects) {
  projects$OriginalCompletionDate - projects$ApprovalDate
})
names(duration_by_year) <- paste0("project_duration_", report_years)

# Also define just2008 ... just2018 and project_duration_2008 ...
# project_duration_2018 as individual variables, so code that refers to those
# names keeps working.
invisible(list2env(
  c(projects_by_year, duration_by_year),
  envir = environment()
))

# Print the mean planned duration of each year in turn. A year without any
# projects yields NaN.
mean_duration_by_year <- lapply(duration_by_year, mean)
for (duration_name in names(mean_duration_by_year)) {
  print(mean_duration_by_year[[duration_name]])
}

# One-row data frame holding one column of mean duration per year. The column
# names match the ones data.frame() derives from unnamed mean(...) arguments,
# e.g. "mean.project_duration_2008.".
duration_days <- do.call(
  data.frame,
  setNames(
    mean_duration_by_year,
    paste0("mean.", names(duration_by_year), ".")
  )
)
#Question 1b
# Planned duration: original completion date minus approval date.
project_duration <- with(new_foo, OriginalCompletionDate - ApprovalDate)

# Actual duration: absolute gap between approval and revised completion date.
# NOTE(review): abs() of (approval - revised) equals (revised - approval) only
# when the revised date is not before approval; it hides any row where it is.
# Confirm that this is intended.
actual_duration <- with(new_foo, abs(ApprovalDate - RevisedCompletionDate))

# Difference between the actual and the planned duration of every project,
# followed by its summary statistics.
difference <- actual_duration - project_duration
difference
mean(difference)
median(difference)
quantile(difference)
# Draw a pie chart of how projects were rated on the 0-3 scale.
#
# ratings: vector of project ratings; values other than 0, 1, 2, 3 (including
#          NA) are not drawn.
# title:   main title of the chart.
#
# Returns (invisibly) the share of each rating as a percentage of all entries
# in `ratings`. The label percentages are rescaled to the drawn slices, so
# they add up to roughly 100.
plot_rating_pie <- function(ratings, title) {
  rating_levels <- 0:3
  counts <- vapply(
    rating_levels,
    function(level) sum(ratings == level, na.rm = TRUE),
    numeric(1)
  )
  if (sum(counts) == 0) {
    stop("no projects rated 0-3 to plot", call. = FALSE)
  }
  slices <- counts / length(ratings) * 100
  pct <- round(slices / sum(slices) * 100)
  lbls <- paste0(rating_levels, "-rated ", pct, "%")
  pie(slices, labels = lbls, col = rainbow(length(lbls)), main = title)
  invisible(slices)
}

#Question 2
# Rating breakdown of all projects circulated since 2008.
slices <- plot_rating_pie(latest_foo$Rating, "How the projects were rated")

#Question 3
# Same breakdown without the projects of type "PPTA".
# A logical mask replaces latest_foo[-PPTA, ]: if no project is of that type,
# which() returns integer(0) and the negative empty index would drop every row.
PPTA <- which(latest_foo$Type == "PPTA")
latest_foo_no_PPTA <- latest_foo[!(latest_foo$Type %in% "PPTA"), ]
slices <- plot_rating_pie(
  latest_foo_no_PPTA$Rating,
  "How the non-PPTA projects were rated"
)
#Question 4
# Quartiles of the revised budget amount.
quantile(latest_foo$RevisedAmount)

# Budget cut-offs for the two groups that are compared.
# NOTE(review): 0.4 and 1 look like the 25% and 75% quantiles printed above,
# copied by hand -- confirm against that output whenever the data changes.
bottom_cutoff <- 0.4
top_cutoff <- 1
bottom25 <- latest_foo[latest_foo$RevisedAmount <= bottom_cutoff, ]
top25 <- latest_foo[latest_foo$RevisedAmount >= top_cutoff, ]

# Remove unrated projects from the low-budget group. A logical mask is used
# because bottom25[-which.are.NAs, ] would return zero rows if nothing is NA.
which.are.NAs <- which(is.na(bottom25$Rating))
bottom25_no_NA <- bottom25[!is.na(bottom25$Rating), ]

# Mean rating of each group. NA ratings are ignored in the high-budget group
# as well, so one unrated project cannot turn its mean into NA.
mean(bottom25_no_NA$Rating)
mean(top25$Rating, na.rm = TRUE)

#let 0 represent bottom25
#let 1 represent top25
# The group indicator is sized from the data rather than from hard-coded row
# counts, so it always has the same length as `ratings`.
x_var <- rep(0, times = nrow(bottom25_no_NA))
y_var <- rep(1, times = nrow(top25))
variables <- c(x_var, y_var)
variables
ratings <- c(bottom25_no_NA$Rating, top25$Rating)
ratings

x <- variables # binary variable (0/1)
y <- ratings # ratings of the projects
plot(
  x, y,
  pch = 19, col = "blue", cex = 0.5,
  main = "The Effect of Budget Sizing on Project Ratings",
  xlab = "Budget Sizing", ylab = "Project Ratings",
  ylim = c(0, 3),
  xaxt = "n"
)
axis(side = 1, at = c(0, 1))

# Regress the rating on the group indicator; the slope is the difference in
# mean rating between the two groups.
regression_1 <- lm(y ~ x)
summary(regression_1)
abline(regression_1, col = "red", lwd = 3) # regression line
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment