Skip to content

Instantly share code, notes, and snippets.

@viniciusmss
Created September 23, 2018 12:51
Show Gist options
  • Save viniciusmss/10cc410b5cbbc0196ccdcb500234895b to your computer and use it in GitHub Desktop.
Save viniciusmss/10cc410b5cbbc0196ccdcb500234895b to your computer and use it in GitHub Desktop.
# Importing the data
# I'm just importing the dataset after having transformed the data columns to numeric.
df <- read.csv("C:/Users/Vinic/Downloads/Copy of datasetforCS112 - TA.csv",
stringsAsFactors = F)
str(df)
# Converting char columns to numeric
df$cumulative.disbursements <-
as.numeric(df$cumulative.disbursements)
df$undisbursed.amount <-
as.numeric(df$undisbursed.amount)
str(df)
# Converting int columns to Date
df$approval.date <-
as.Date(df$approval.date, origin = as.Date('1899-12-30'))
df$implementation.start.date <-
as.Date(df$implementation.start.date, origin = as.Date('1899-12-30'))
df$original.completion.date <-
as.Date(df$original.completion.date, origin = as.Date('1899-12-30'))
df$revised.completion.date <-
as.Date(df$revised.completion.date, origin = as.Date('1899-12-30'))
str(df)
# Are the dates in the first rows of the dataset
# the same as in the Google Sheets spreadsheet?
head(df)
# This is the data entry error (see the revised completion date):
View(df[df$approval.date == as.Date("1984-06-06"),])
# Exclude projects approved pre-1/1/1995 & post-1/1/2017
to_be_excluded <-
which(df$approval.date < as.Date("1995-01-01") |
df$approval.date > as.Date("2016-12-31"))
df_subset <- df[-to_be_excluded,]
summary(df_subset)
# 1. How many unique projects are represented in this data set?
sprintf("There are %d unique projects in this dataset.", nrow(df_subset))
# 2. What variables are measured for each project?
names(df_subset)
# 3. Does this data set contain missing data?
if (any(is.na(df_subset)))
print("The data set contains missing data.")
# 4. Approximately how many projects are approved each year?
# The most simple way of doing it...
sprintf(
"Approximately %.f projects are approved each year.",
nrow(df_subset)/length(unique(format(df_subset$approval.date, "%Y")))
)
# A slightly better way of doing it, so you can see the change over years
library(dplyr)
df_subset$approval.year <- format(df_subset$approval.date, "%Y")
approval_by_year <- df_subset %>%
group_by(approval.year) %>%
summarise(project_count = n())
approval_by_year
plot(approval_by_year)
# 5. Approximately how many projects are completed each year?
# As in the question above,
sprintf(
"Approximately %.f projects are completed each year.",
nrow(df_subset)/length(unique(format(df_subset$revised.completion.date, "%Y")))
)
# Be reminded that if you do just this, the average will include
# projects that have not been completed yet. Hence, you should
# do the following:
# :
df_completion_subset <- df_subset[which(df_subset$status == "FINANCIALLY COMPLETED"),]
sprintf(
"Approximately %.f projects are completed each year.",
nrow(df_completion_subset)/length(unique(format(df_completion_subset$revised.completion.date, "%Y")))
)
# For a more granular view...
df_completion_subset$completion.year <-
format(df_completion_subset$revised.completion.date, "%Y")
completion_by_year <- df_completion_subset %>%
group_by(completion.year) %>%
summarise(project_count = n())
completion_by_year
plot(completion_by_year)
# 6. What fraction of projects have completion dates that are different from revised completion dates?
# For this comparison to work, we have to exclude projects in which either date is not available.
with_both_dates <- which(!is.na(df_subset$original.completion.date) &
!is.na(df_subset$revised.completion.date))
df_subset_with_dates <- df_subset[with_both_dates,]
with_different_dates <-
which(df_subset_with_dates$original.completion.date != df_subset_with_dates$revised.completion.date)
fraction_completed_revised <- length(with_different_dates) / nrow(df_subset_with_dates)
sprintf("%.1f%% of projects have original completion dates different than the revised dates.",
fraction_completed_revised * 100)
# If you did not disconsider rows with missing data, that's fine.
# The way you treat NAs will depend on the context of your problem.
# 7. What does R's quantile function tell you about the distributions of...
probs <- c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1)
### i. project budgets?
quantile(df_subset$project.budget, probs)
# R: It can tell us how many projects are above/below a certain value. For example,
# the median is 0.6, so half of the projects have budgets larger than 0.6.
# 25% have budgets larger than 1, and only 5% larger than 2.4. The project
# with the largest budget is 47.03, whereas the smallest budget is 0.009.
### ii. cumulative disbursements ((for completed projects only))
quantile(df_completion_subset$cumulative.disbursements, probs, na.rm=T)
### iii. start-dates (i.e., approval dates)
# This one is a little tricky...
as.Date(quantile(unclass(df_subset$approval.date), probs), origin = "1970-01-01")
### iv. end-dates
as.Date(quantile(unclass(
df_completion_subset$revised.completion.date), probs, na.rm=T),
origin = "1970-01-01")
### v. the difference between original end-dates and revised end-dates
revisions <- as.numeric(df_subset_with_dates$revised.completion.date) -
as.numeric(df_subset_with_dates$original.completion.date)
quantile(revisions, probs)
### vi. durations (i.e., from approval to revised completion dates)
with_revised_date <- which(!is.na(df_subset$revised.completion.date))
df_revised_date_Subset <- df_subset[with_revised_date, ]
durations <- as.numeric(df_revised_date_Subset$revised.completion.date) -
as.numeric(df_revised_date_Subset$approval.date)
quantile(durations, probs)
# (here I'm using the assignment 1 version of this question)
# 8. Create a histogram or a probability density plot that shows the distribution of
# project budgets. Be sure to label the x axis and give your plot a title.
hist(df_subset$project.budget, breaks=100, xlim=c(0, 10), freq=F, xlab="Project Budget",
main = "Histogram of Project Budgets")
# 9. Restricting your analysis to completed projects for which we have a
# success/failure assessment, is there a relationship between success/failure and:
df_assessed_subset <- df_subset[which(!is.na(df_subset$success.rating)), ]
### ii. project-type?
library(tidyr)
success_by_type <- df_assessed_subset %>%
group_by(project.type, success.rating) %>%
summarise(project_count = n()) %>%
spread(success.rating, project_count)
success_by_type
# "PP" projects tend to fail much more often than "AD" or "RE" projects.
# alternatively...
lm.success <- lm(success.rating ~ project.type, data=df_assessed_subset)
summary(lm.success)
### i and iii. are not interpretable.
# 10. Approximately what fraction of projects get assessed at project completion,
# and has this changed over time?
sprintf("Approximately %.1f%% of projects get assessed at project completion.",
nrow(df_assessed_subset)/nrow(df_completion_subset)*100)
# Now, for change over time...
# Format the year of assessed projects
df_assessed_subset$completion.year <- format(df_assessed_subset$revised.completion.date, "%Y")
# Count how many projects were completed each year
assessed_by_year <- df_completion_subset %>%
group_by(completion.year) %>%
summarise(project.count = n()) %>%
filter(!is.na(completion.year))
# Count how many projects were assessed each year
assessed.count <- df_assessed_subset %>%
group_by(completion.year) %>%
summarise(assessed.count = n()) %>%
filter(!is.na(completion.year)) %>%
select(assessed.count)
assessed_by_year <- data.frame(assessed_by_year, assessed.count)
assessed_by_year$percentage.assessed <-
100 * assessed_by_year$assessed.count / assessed_by_year$project.count
assessed_by_year <- assessed_by_year %>%
select(-assessed.count, -project.count)
View(assessed_by_year)
plot(assessed_by_year$completion.year, assessed_by_year$percentage.assessed)
# 11. Is there a type of project that does not get assessed at project completion?
assessed_by_project <- df_completion_subset %>%
filter(!is.na(success.rating)) %>%
group_by(project.type) %>%
summarise(projects.assessed = n())
total_by_project <- df_completion_subset %>%
group_by(project.type) %>%
summarise(projects.total = n()) %>%
select(projects.total)
assessed_by_project <- data.frame(assessed_by_project, total_by_project)
assessed_by_project$percentage.assessed <-
100 * assessed_by_project$projects.assessed / assessed_by_project$projects.total
assessed_by_project <- assessed_by_project %>%
select(-projects.assessed, -projects.total) # PP doesn't get assessed
# 12. Has the fraction of projects assessed at project completion changed over
# time? Explain.
assessed_by_project_over_time <- df_completion_subset %>%
filter(!is.na(success.rating)) %>%
group_by(project.type, completion.year) %>%
filter(!is.na(completion.year)) %>%
summarise(projects.assessed = n())
total_by_project_over_time <- df_completion_subset %>%
group_by(project.type, completion.year) %>%
filter(!is.na(completion.year)) %>%
summarise(projects.total = n())
assessed_by_project_over_time <- full_join(assessed_by_project_over_time,
total_by_project_over_time,
by = c("project.type" = "project.type", "completion.year" = "completion.year"))
assessed_by_project_over_time[is.na(assessed_by_project_over_time)] <- 0
assessed_by_project_over_time$percentage.assessed <-
100 * assessed_by_project_over_time$projects.assessed / assessed_by_project_over_time$projects.total
assessed_by_project_over_time <- assessed_by_project_over_time %>%
select(-projects.assessed, -projects.total)
View(assessed_by_project_over_time)
# 13. Has the distribution of project-types changed over time?
project_type_over_time <- df_completion_subset %>%
group_by(project.type, completion.year) %>%
filter(!is.na(completion.year)) %>%
summarise(projects.type.year = n())
total_projects_over_time <- df_completion_subset %>%
group_by(completion.year) %>%
filter(!is.na(completion.year)) %>%
summarise(projects.year = n())
project_type_over_time <- left_join(project_type_over_time,
total_projects_over_time,
by = c("completion.year" = "completion.year"))
project_type_over_time$percentage.assessed <-
100 * project_type_over_time$projects.type.year / project_type_over_time$projects.year
project_type_over_time <- project_type_over_time %>%
select(-projects.type.year, -projects.year)
View(project_type_over_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment