Created
September 23, 2018 12:51
-
-
Save viniciusmss/10cc410b5cbbc0196ccdcb500234895b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Importing the data | |
# I'm just importing the dataset after having transformed the data columns to numeric. | |
df <- read.csv("C:/Users/Vinic/Downloads/Copy of datasetforCS112 - TA.csv", | |
stringsAsFactors = F) | |
str(df) | |
# Converting char columns to numeric | |
df$cumulative.disbursements <- | |
as.numeric(df$cumulative.disbursements) | |
df$undisbursed.amount <- | |
as.numeric(df$undisbursed.amount) | |
str(df) | |
# Converting int columns to Date | |
df$approval.date <- | |
as.Date(df$approval.date, origin = as.Date('1899-12-30')) | |
df$implementation.start.date <- | |
as.Date(df$implementation.start.date, origin = as.Date('1899-12-30')) | |
df$original.completion.date <- | |
as.Date(df$original.completion.date, origin = as.Date('1899-12-30')) | |
df$revised.completion.date <- | |
as.Date(df$revised.completion.date, origin = as.Date('1899-12-30')) | |
str(df) | |
# Are the dates in the first rows of the dataset | |
# the same as in the Google Sheets spreadsheet? | |
head(df) | |
# This is the data entry error (see the revised completion date): | |
View(df[df$approval.date == as.Date("1984-06-06"),]) | |
# Exclude projects approved pre-1/1/1995 & post-1/1/2017 | |
to_be_excluded <- | |
which(df$approval.date < as.Date("1995-01-01") | | |
df$approval.date > as.Date("2016-12-31")) | |
df_subset <- df[-to_be_excluded,] | |
summary(df_subset) | |
# 1. How many unique projects are represented in this data set? | |
sprintf("There are %d unique projects in this dataset.", nrow(df_subset)) | |
# 2. What variables are measured for each project? | |
names(df_subset) | |
# 3. Does this data set contain missing data? | |
if (any(is.na(df_subset))) | |
print("The data set contains missing data.") | |
# 4. Approximately how many projects are approved each year? | |
# The most simple way of doing it... | |
sprintf( | |
"Approximately %.f projects are approved each year.", | |
nrow(df_subset)/length(unique(format(df_subset$approval.date, "%Y"))) | |
) | |
# A slightly better way of doing it, so you can see the change over years | |
library(dplyr) | |
df_subset$approval.year <- format(df_subset$approval.date, "%Y") | |
approval_by_year <- df_subset %>% | |
group_by(approval.year) %>% | |
summarise(project_count = n()) | |
approval_by_year | |
plot(approval_by_year) | |
# 5. Approximately how many projects are completed each year? | |
# As in the question above, | |
sprintf( | |
"Approximately %.f projects are completed each year.", | |
nrow(df_subset)/length(unique(format(df_subset$revised.completion.date, "%Y"))) | |
) | |
# Be reminded that if you do just this, the average will include | |
# projects that have not been completed yet. Hence, you should | |
# do the following: | |
# : | |
df_completion_subset <- df_subset[which(df_subset$status == "FINANCIALLY COMPLETED"),] | |
sprintf( | |
"Approximately %.f projects are completed each year.", | |
nrow(df_completion_subset)/length(unique(format(df_completion_subset$revised.completion.date, "%Y"))) | |
) | |
# For a more granular view... | |
df_completion_subset$completion.year <- | |
format(df_completion_subset$revised.completion.date, "%Y") | |
completion_by_year <- df_completion_subset %>% | |
group_by(completion.year) %>% | |
summarise(project_count = n()) | |
completion_by_year | |
plot(completion_by_year) | |
# 6. What fraction of projects have completion dates that are different from revised completion dates? | |
# For this comparison to work, we have to exclude projects in which either date is not available. | |
with_both_dates <- which(!is.na(df_subset$original.completion.date) & | |
!is.na(df_subset$revised.completion.date)) | |
df_subset_with_dates <- df_subset[with_both_dates,] | |
with_different_dates <- | |
which(df_subset_with_dates$original.completion.date != df_subset_with_dates$revised.completion.date) | |
fraction_completed_revised <- length(with_different_dates) / nrow(df_subset_with_dates) | |
sprintf("%.1f%% of projects have original completion dates different than the revised dates.", | |
fraction_completed_revised * 100) | |
# If you did not disconsider rows with missing data, that's fine. | |
# The way you treat NAs will depend on the context of your problem. | |
# 7. What does R's quantile function tell you about the distributions of... | |
probs <- c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1) | |
### i. project budgets? | |
quantile(df_subset$project.budget, probs) | |
# R: It can tell us how many projects are above/below a certain value. For example, | |
# the median is 0.6, so half of the projects have budgets larger than 0.6. | |
# 25% have budgets larger than 1, and only 5% larger than 2.4. The project | |
# with the largest budget is 47.03, whereas the smallest budget is 0.009. | |
### ii. cumulative disbursements ((for completed projects only)) | |
quantile(df_completion_subset$cumulative.disbursements, probs, na.rm=T) | |
### iii. start-dates (i.e., approval dates) | |
# This one is a little tricky... | |
as.Date(quantile(unclass(df_subset$approval.date), probs), origin = "1970-01-01") | |
### iv. end-dates | |
as.Date(quantile(unclass( | |
df_completion_subset$revised.completion.date), probs, na.rm=T), | |
origin = "1970-01-01") | |
### v. the difference between original end-dates and revised end-dates | |
revisions <- as.numeric(df_subset_with_dates$revised.completion.date) - | |
as.numeric(df_subset_with_dates$original.completion.date) | |
quantile(revisions, probs) | |
### vi. durations (i.e., from approval to revised completion dates) | |
with_revised_date <- which(!is.na(df_subset$revised.completion.date)) | |
df_revised_date_Subset <- df_subset[with_revised_date, ] | |
durations <- as.numeric(df_revised_date_Subset$revised.completion.date) - | |
as.numeric(df_revised_date_Subset$approval.date) | |
quantile(durations, probs) | |
# (here I'm using the assignment 1 version of this question) | |
# 8. Create a histogram or a probability density plot that shows the distribution of | |
# project budgets. Be sure to label the x axis and give your plot a title. | |
hist(df_subset$project.budget, breaks=100, xlim=c(0, 10), freq=F, xlab="Project Budget", | |
main = "Histogram of Project Budgets") | |
# 9. Restricting your analysis to completed projects for which we have a | |
# success/failure assessment, is there a relationship between success/failure and: | |
df_assessed_subset <- df_subset[which(!is.na(df_subset$success.rating)), ] | |
### ii. project-type? | |
library(tidyr) | |
success_by_type <- df_assessed_subset %>% | |
group_by(project.type, success.rating) %>% | |
summarise(project_count = n()) %>% | |
spread(success.rating, project_count) | |
success_by_type | |
# "PP" projects tend to fail much more often than "AD" or "RE" projects. | |
# alternatively... | |
lm.success <- lm(success.rating ~ project.type, data=df_assessed_subset) | |
summary(lm.success) | |
### i and iii. are not interpretable. | |
# 10. Approximately what fraction of projects get assessed at project completion, | |
# and has this changed over time? | |
sprintf("Approximately %.1f%% of projects get assessed at project completion.", | |
nrow(df_assessed_subset)/nrow(df_completion_subset)*100) | |
# Now, for change over time... | |
# Format the year of assessed projects | |
df_assessed_subset$completion.year <- format(df_assessed_subset$revised.completion.date, "%Y") | |
# Count how many projects were completed each year | |
assessed_by_year <- df_completion_subset %>% | |
group_by(completion.year) %>% | |
summarise(project.count = n()) %>% | |
filter(!is.na(completion.year)) | |
# Count how many projects were assessed each year | |
assessed.count <- df_assessed_subset %>% | |
group_by(completion.year) %>% | |
summarise(assessed.count = n()) %>% | |
filter(!is.na(completion.year)) %>% | |
select(assessed.count) | |
assessed_by_year <- data.frame(assessed_by_year, assessed.count) | |
assessed_by_year$percentage.assessed <- | |
100 * assessed_by_year$assessed.count / assessed_by_year$project.count | |
assessed_by_year <- assessed_by_year %>% | |
select(-assessed.count, -project.count) | |
View(assessed_by_year) | |
plot(assessed_by_year$completion.year, assessed_by_year$percentage.assessed) | |
# 11. Is there a type of project that does not get assessed at project completion? | |
assessed_by_project <- df_completion_subset %>% | |
filter(!is.na(success.rating)) %>% | |
group_by(project.type) %>% | |
summarise(projects.assessed = n()) | |
total_by_project <- df_completion_subset %>% | |
group_by(project.type) %>% | |
summarise(projects.total = n()) %>% | |
select(projects.total) | |
assessed_by_project <- data.frame(assessed_by_project, total_by_project) | |
assessed_by_project$percentage.assessed <- | |
100 * assessed_by_project$projects.assessed / assessed_by_project$projects.total | |
assessed_by_project <- assessed_by_project %>% | |
select(-projects.assessed, -projects.total) # PP doesn't get assessed | |
# 12. Has the fraction of projects assessed at project completion changed over | |
# time? Explain. | |
assessed_by_project_over_time <- df_completion_subset %>% | |
filter(!is.na(success.rating)) %>% | |
group_by(project.type, completion.year) %>% | |
filter(!is.na(completion.year)) %>% | |
summarise(projects.assessed = n()) | |
total_by_project_over_time <- df_completion_subset %>% | |
group_by(project.type, completion.year) %>% | |
filter(!is.na(completion.year)) %>% | |
summarise(projects.total = n()) | |
assessed_by_project_over_time <- full_join(assessed_by_project_over_time, | |
total_by_project_over_time, | |
by = c("project.type" = "project.type", "completion.year" = "completion.year")) | |
assessed_by_project_over_time[is.na(assessed_by_project_over_time)] <- 0 | |
assessed_by_project_over_time$percentage.assessed <- | |
100 * assessed_by_project_over_time$projects.assessed / assessed_by_project_over_time$projects.total | |
assessed_by_project_over_time <- assessed_by_project_over_time %>% | |
select(-projects.assessed, -projects.total) | |
View(assessed_by_project_over_time) | |
# 13. Has the distribution of project-types changed over time? | |
project_type_over_time <- df_completion_subset %>% | |
group_by(project.type, completion.year) %>% | |
filter(!is.na(completion.year)) %>% | |
summarise(projects.type.year = n()) | |
total_projects_over_time <- df_completion_subset %>% | |
group_by(completion.year) %>% | |
filter(!is.na(completion.year)) %>% | |
summarise(projects.year = n()) | |
project_type_over_time <- left_join(project_type_over_time, | |
total_projects_over_time, | |
by = c("completion.year" = "completion.year")) | |
project_type_over_time$percentage.assessed <- | |
100 * project_type_over_time$projects.type.year / project_type_over_time$projects.year | |
project_type_over_time <- project_type_over_time %>% | |
select(-projects.type.year, -projects.year) | |
View(project_type_over_time) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment