viniciusmss · September 23, 2018 12:51
diff --git a/Assignment_1.R b/Assignment_1.R
 # Importing the data
 # I'm just importing the dataset after having transformed the data columns to numeric.
 df <- read.csv("C:/Users/Vinic/Downloads/Copy of datasetforCS112 - TA.csv",
           stringsAsFactors = F)
 str(df)

 # Converting char columns to numeric
 df$cumulative.disbursements <-
  as.numeric(df$cumulative.disbursements)
 df$undisbursed.amount <-
  as.numeric(df$undisbursed.amount)
 str(df)

 # Converting int columns to Date
 df$approval.date <-
  as.Date(df$approval.date, origin = as.Date('1899-12-30'))
 df$implementation.start.date <-
  as.Date(df$implementation.start.date, origin = as.Date('1899-12-30'))
 df$original.completion.date <-
  as.Date(df$original.completion.date, origin = as.Date('1899-12-30'))
 df$revised.completion.date <-
  as.Date(df$revised.completion.date, origin = as.Date('1899-12-30'))
 str(df)

 # Are the dates in the first rows of the dataset
 # the same as in the Google Sheets spreadsheet?
 head(df)

 # This is the data entry error (see the revised completion date):
 View(df[df$approval.date == as.Date("1984-06-06"),])

 # Exclude projects approved pre-1/1/1995 & post-1/1/2017
 to_be_excluded <-
  which(df$approval.date < as.Date("1995-01-01") |
          df$approval.date > as.Date("2016-12-31"))

 df_subset <- df[-to_be_excluded,]
 summary(df_subset)

 # 1. How many unique projects are represented in this data set?
 sprintf("There are %d unique projects in this dataset.", nrow(df_subset))

 # 2. What variables are measured for each project?
 names(df_subset)

 # 3. Does this data set contain missing data?
 if (any(is.na(df_subset)))
  print("The data set contains missing data.")

 # 4. Approximately how many projects are approved each year?

 # The most simple way of doing it...
 sprintf(
  "Approximately %.f projects are approved each year.",
  nrow(df_subset)/length(unique(format(df_subset$approval.date, "%Y")))
 )

 # A slightly better way of doing it, so you can see the change over years
 library(dplyr)
 df_subset$approval.year <- format(df_subset$approval.date, "%Y")
 approval_by_year <- df_subset %>%
  group_by(approval.year) %>%
  summarise(project_count = n())
 approval_by_year
 plot(approval_by_year)

 # 5. Approximately how many projects are completed each year?

 # As in the question above,
 sprintf(
  "Approximately %.f projects are completed each year.",
  nrow(df_subset)/length(unique(format(df_subset$revised.completion.date, "%Y")))
 )
 # Be reminded that if you do just this, the average will include
 # projects that have not been completed yet. Hence, you should 
 # do the following:

 # :
 df_completion_subset <- df_subset[which(df_subset$status == "FINANCIALLY COMPLETED"),]
 sprintf(
  "Approximately %.f projects are completed each year.",
  nrow(df_completion_subset)/length(unique(format(df_completion_subset$revised.completion.date, "%Y")))
 )

 # For a more granular view...
 df_completion_subset$completion.year <-
  format(df_completion_subset$revised.completion.date, "%Y")
 completion_by_year <- df_completion_subset %>%
  group_by(completion.year) %>%
  summarise(project_count = n())
 completion_by_year
 plot(completion_by_year)

 # 6. What fraction of projects have completion dates that are different from revised completion dates?
 # For this comparison to work, we have to exclude projects in which either date is not available.

 with_both_dates <- which(!is.na(df_subset$original.completion.date) &
                           !is.na(df_subset$revised.completion.date))
 df_subset_with_dates <- df_subset[with_both_dates,]
 with_different_dates <- 
  which(df_subset_with_dates$original.completion.date != df_subset_with_dates$revised.completion.date)
 fraction_completed_revised <-   length(with_different_dates) /  nrow(df_subset_with_dates)
 sprintf("%.1f%% of projects have original completion dates different than the revised dates.",
        fraction_completed_revised * 100)

 # If you did not disconsider rows with missing data, that's fine.
 # The way you treat NAs will depend on the context of your problem.

 # 7. What does R's quantile function tell you about the distributions of...
 probs <- c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1)

 ### i. project budgets?
 quantile(df_subset$project.budget, probs)

 # R: It can tell us how many projects are above/below a certain value. For example,
 # the median is 0.6, so half of the projects have budgets larger than 0.6.
 # 25% have budgets larger than 1, and only 5% larger than 2.4. The project
 # with the largest budget is 47.03, whereas the smallest budget is 0.009.

 ### ii. cumulative disbursements ((for completed projects only))
 quantile(df_completion_subset$cumulative.disbursements, probs, na.rm=T)

 ### iii. start-dates (i.e., approval dates)
 # This one is a little tricky...
 as.Date(quantile(unclass(df_subset$approval.date), probs), origin = "1970-01-01")

 ### iv. end-dates
 as.Date(quantile(unclass(
  df_completion_subset$revised.completion.date), probs, na.rm=T), 
  origin = "1970-01-01")

 ### v. the difference between original end-dates and revised end-dates
 revisions <- as.numeric(df_subset_with_dates$revised.completion.date) -
  as.numeric(df_subset_with_dates$original.completion.date)
 quantile(revisions, probs)

 ### vi. durations (i.e., from approval to revised completion dates)
 with_revised_date <- which(!is.na(df_subset$revised.completion.date))
 df_revised_date_Subset <- df_subset[with_revised_date, ]
 durations <- as.numeric(df_revised_date_Subset$revised.completion.date) -
  as.numeric(df_revised_date_Subset$approval.date)
 quantile(durations, probs)

 # (here I'm using the assignment 1 version of this question)
 # 8. Create a histogram or a probability density plot that shows the distribution of
 # project budgets. Be sure to label the x axis and give your plot a title.
 hist(df_subset$project.budget, breaks=100, xlim=c(0, 10), freq=F, xlab="Project Budget",
     main = "Histogram of Project Budgets")

 # 9. Restricting your analysis to completed projects for which we have a 
 # success/failure assessment, is there a relationship between success/failure and:
 df_assessed_subset <- df_subset[which(!is.na(df_subset$success.rating)), ]

 ### ii. project-type?
 library(tidyr) 
 success_by_type <- df_assessed_subset %>%
  group_by(project.type, success.rating) %>%
  summarise(project_count = n()) %>%
  spread(success.rating, project_count)
 success_by_type
 # "PP" projects tend to fail much more often than "AD" or "RE" projects.

 # alternatively...

 lm.success <- lm(success.rating ~ project.type, data=df_assessed_subset)
 summary(lm.success)

 ### i and iii. are not interpretable.

 # 10. Approximately what fraction of projects get assessed at project completion, 
 # and has this changed over time?
 sprintf("Approximately %.1f%% of projects get assessed at project completion.",
        nrow(df_assessed_subset)/nrow(df_completion_subset)*100)

 # Now, for change over time...

 # Format the year of assessed projects
 df_assessed_subset$completion.year <- format(df_assessed_subset$revised.completion.date, "%Y")

 # Count how many projects were completed each year
 assessed_by_year <- df_completion_subset %>%
  group_by(completion.year) %>%
  summarise(project.count = n()) %>%
  filter(!is.na(completion.year))

 # Count how many projects were assessed each year
 assessed.count <- df_assessed_subset %>%
  group_by(completion.year) %>%
  summarise(assessed.count = n()) %>%
  filter(!is.na(completion.year)) %>%
  select(assessed.count)

 assessed_by_year <- data.frame(assessed_by_year, assessed.count)

 assessed_by_year$percentage.assessed <- 
  100 * assessed_by_year$assessed.count / assessed_by_year$project.count

 assessed_by_year <- assessed_by_year %>%
  select(-assessed.count, -project.count)

 View(assessed_by_year)
 plot(assessed_by_year$completion.year, assessed_by_year$percentage.assessed)

 # 11. Is there a type of project that does not get assessed at project completion?
 assessed_by_project <- df_completion_subset %>%
  filter(!is.na(success.rating)) %>%
  group_by(project.type) %>%
  summarise(projects.assessed = n())

 total_by_project <- df_completion_subset %>%
  group_by(project.type) %>%
  summarise(projects.total = n()) %>%
  select(projects.total)

 assessed_by_project <- data.frame(assessed_by_project, total_by_project)
 assessed_by_project$percentage.assessed <- 
  100 * assessed_by_project$projects.assessed / assessed_by_project$projects.total

 assessed_by_project <- assessed_by_project %>%
  select(-projects.assessed, -projects.total) # PP doesn't get assessed

 # 12. Has the fraction of projects assessed at project completion changed over
 # time? Explain.

 assessed_by_project_over_time <- df_completion_subset %>%
  filter(!is.na(success.rating)) %>%
  group_by(project.type, completion.year) %>%
  filter(!is.na(completion.year)) %>%
  summarise(projects.assessed = n())

 total_by_project_over_time <- df_completion_subset %>%
  group_by(project.type, completion.year) %>%
  filter(!is.na(completion.year)) %>%
  summarise(projects.total = n()) 

 assessed_by_project_over_time <- full_join(assessed_by_project_over_time, 
                                           total_by_project_over_time, 
          by = c("project.type" = "project.type", "completion.year" = "completion.year"))

 assessed_by_project_over_time[is.na(assessed_by_project_over_time)] <- 0
 assessed_by_project_over_time$percentage.assessed <- 
  100 * assessed_by_project_over_time$projects.assessed / assessed_by_project_over_time$projects.total

 assessed_by_project_over_time <- assessed_by_project_over_time %>%
  select(-projects.assessed, -projects.total)
 View(assessed_by_project_over_time)

 # 13. Has the distribution of project-types changed over time?

 project_type_over_time <- df_completion_subset %>%
  group_by(project.type, completion.year) %>%
  filter(!is.na(completion.year)) %>%
  summarise(projects.type.year = n())

 total_projects_over_time <- df_completion_subset %>%
  group_by(completion.year) %>%
  filter(!is.na(completion.year)) %>%
  summarise(projects.year = n()) 

 project_type_over_time <- left_join(project_type_over_time, 
                                    total_projects_over_time, 
                                           by = c("completion.year" = "completion.year"))

 project_type_over_time$percentage.assessed <- 
  100 * project_type_over_time$projects.type.year / project_type_over_time$projects.year

 project_type_over_time <- project_type_over_time %>%
  select(-projects.type.year, -projects.year)
 View(project_type_over_time)
	# Importing the data
	# I'm just importing the dataset after having transformed the data columns to numeric.
	df <- read.csv("C:/Users/Vinic/Downloads/Copy of datasetforCS112 - TA.csv",
	stringsAsFactors = F)
	str(df)

	# Converting char columns to numeric
	df$cumulative.disbursements <-
	as.numeric(df$cumulative.disbursements)
	df$undisbursed.amount <-
	as.numeric(df$undisbursed.amount)
	str(df)

	# Converting int columns to Date
	df$approval.date <-
	as.Date(df$approval.date, origin = as.Date('1899-12-30'))
	df$implementation.start.date <-
	as.Date(df$implementation.start.date, origin = as.Date('1899-12-30'))
	df$original.completion.date <-
	as.Date(df$original.completion.date, origin = as.Date('1899-12-30'))
	df$revised.completion.date <-
	as.Date(df$revised.completion.date, origin = as.Date('1899-12-30'))
	str(df)

	# Are the dates in the first rows of the dataset
	# the same as in the Google Sheets spreadsheet?
	head(df)

	# This is the data entry error (see the revised completion date):
	View(df[df$approval.date == as.Date("1984-06-06"),])

	# Exclude projects approved pre-1/1/1995 & post-1/1/2017
	to_be_excluded <-
	which(df$approval.date < as.Date("1995-01-01") \|
	df$approval.date > as.Date("2016-12-31"))

	df_subset <- df[-to_be_excluded,]
	summary(df_subset)

	# 1. How many unique projects are represented in this data set?
	sprintf("There are %d unique projects in this dataset.", nrow(df_subset))

	# 2. What variables are measured for each project?
	names(df_subset)

	# 3. Does this data set contain missing data?
	if (any(is.na(df_subset)))
	print("The data set contains missing data.")

	# 4. Approximately how many projects are approved each year?

	# The most simple way of doing it...
	sprintf(
	"Approximately %.f projects are approved each year.",
	nrow(df_subset)/length(unique(format(df_subset$approval.date, "%Y")))
	)

	# A slightly better way of doing it, so you can see the change over years
	library(dplyr)
	df_subset$approval.year <- format(df_subset$approval.date, "%Y")
	approval_by_year <- df_subset %>%
	group_by(approval.year) %>%
	summarise(project_count = n())
	approval_by_year
	plot(approval_by_year)

	# 5. Approximately how many projects are completed each year?

	# As in the question above,
	sprintf(
	"Approximately %.f projects are completed each year.",
	nrow(df_subset)/length(unique(format(df_subset$revised.completion.date, "%Y")))
	)
	# Be reminded that if you do just this, the average will include
	# projects that have not been completed yet. Hence, you should
	# do the following:

	# :
	df_completion_subset <- df_subset[which(df_subset$status == "FINANCIALLY COMPLETED"),]
	sprintf(
	"Approximately %.f projects are completed each year.",
	nrow(df_completion_subset)/length(unique(format(df_completion_subset$revised.completion.date, "%Y")))
	)

	# For a more granular view...
	df_completion_subset$completion.year <-
	format(df_completion_subset$revised.completion.date, "%Y")
	completion_by_year <- df_completion_subset %>%
	group_by(completion.year) %>%
	summarise(project_count = n())
	completion_by_year
	plot(completion_by_year)

	# 6. What fraction of projects have completion dates that are different from revised completion dates?
	# For this comparison to work, we have to exclude projects in which either date is not available.

	with_both_dates <- which(!is.na(df_subset$original.completion.date) &
	!is.na(df_subset$revised.completion.date))
	df_subset_with_dates <- df_subset[with_both_dates,]
	with_different_dates <-
	which(df_subset_with_dates$original.completion.date != df_subset_with_dates$revised.completion.date)
	fraction_completed_revised <- length(with_different_dates) / nrow(df_subset_with_dates)
	sprintf("%.1f%% of projects have original completion dates different than the revised dates.",
	fraction_completed_revised * 100)

	# If you did not disconsider rows with missing data, that's fine.
	# The way you treat NAs will depend on the context of your problem.

	# 7. What does R's quantile function tell you about the distributions of...
	probs <- c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1)

	### i. project budgets?
	quantile(df_subset$project.budget, probs)

	# R: It can tell us how many projects are above/below a certain value. For example,
	# the median is 0.6, so half of the projects have budgets larger than 0.6.
	# 25% have budgets larger than 1, and only 5% larger than 2.4. The project
	# with the largest budget is 47.03, whereas the smallest budget is 0.009.

	### ii. cumulative disbursements ((for completed projects only))
	quantile(df_completion_subset$cumulative.disbursements, probs, na.rm=T)

	### iii. start-dates (i.e., approval dates)
	# This one is a little tricky...
	as.Date(quantile(unclass(df_subset$approval.date), probs), origin = "1970-01-01")

	### iv. end-dates
	as.Date(quantile(unclass(
	df_completion_subset$revised.completion.date), probs, na.rm=T),
	origin = "1970-01-01")

	### v. the difference between original end-dates and revised end-dates
	revisions <- as.numeric(df_subset_with_dates$revised.completion.date) -
	as.numeric(df_subset_with_dates$original.completion.date)
	quantile(revisions, probs)

	### vi. durations (i.e., from approval to revised completion dates)
	with_revised_date <- which(!is.na(df_subset$revised.completion.date))
	df_revised_date_Subset <- df_subset[with_revised_date, ]
	durations <- as.numeric(df_revised_date_Subset$revised.completion.date) -
	as.numeric(df_revised_date_Subset$approval.date)
	quantile(durations, probs)

	# (here I'm using the assignment 1 version of this question)
	# 8. Create a histogram or a probability density plot that shows the distribution of
	# project budgets. Be sure to label the x axis and give your plot a title.
	hist(df_subset$project.budget, breaks=100, xlim=c(0, 10), freq=F, xlab="Project Budget",
	main = "Histogram of Project Budgets")

	# 9. Restricting your analysis to completed projects for which we have a
	# success/failure assessment, is there a relationship between success/failure and:
	df_assessed_subset <- df_subset[which(!is.na(df_subset$success.rating)), ]

	### ii. project-type?
	library(tidyr)
	success_by_type <- df_assessed_subset %>%
	group_by(project.type, success.rating) %>%
	summarise(project_count = n()) %>%
	spread(success.rating, project_count)
	success_by_type
	# "PP" projects tend to fail much more often than "AD" or "RE" projects.

	# alternatively...

	lm.success <- lm(success.rating ~ project.type, data=df_assessed_subset)
	summary(lm.success)

	### i and iii. are not interpretable.

	# 10. Approximately what fraction of projects get assessed at project completion,
	# and has this changed over time?
	sprintf("Approximately %.1f%% of projects get assessed at project completion.",
	nrow(df_assessed_subset)/nrow(df_completion_subset)*100)

	# Now, for change over time...

	# Format the year of assessed projects
	df_assessed_subset$completion.year <- format(df_assessed_subset$revised.completion.date, "%Y")

	# Count how many projects were completed each year
	assessed_by_year <- df_completion_subset %>%
	group_by(completion.year) %>%
	summarise(project.count = n()) %>%
	filter(!is.na(completion.year))

	# Count how many projects were assessed each year
	assessed.count <- df_assessed_subset %>%
	group_by(completion.year) %>%
	summarise(assessed.count = n()) %>%
	filter(!is.na(completion.year)) %>%
	select(assessed.count)

	assessed_by_year <- data.frame(assessed_by_year, assessed.count)

	assessed_by_year$percentage.assessed <-
	100 * assessed_by_year$assessed.count / assessed_by_year$project.count

	assessed_by_year <- assessed_by_year %>%
	select(-assessed.count, -project.count)

	View(assessed_by_year)
	plot(assessed_by_year$completion.year, assessed_by_year$percentage.assessed)

	# 11. Is there a type of project that does not get assessed at project completion?
	assessed_by_project <- df_completion_subset %>%
	filter(!is.na(success.rating)) %>%
	group_by(project.type) %>%
	summarise(projects.assessed = n())

	total_by_project <- df_completion_subset %>%
	group_by(project.type) %>%
	summarise(projects.total = n()) %>%
	select(projects.total)

	assessed_by_project <- data.frame(assessed_by_project, total_by_project)
	assessed_by_project$percentage.assessed <-
	100 * assessed_by_project$projects.assessed / assessed_by_project$projects.total

	assessed_by_project <- assessed_by_project %>%
	select(-projects.assessed, -projects.total) # PP doesn't get assessed

	# 12. Has the fraction of projects assessed at project completion changed over
	# time? Explain.

	assessed_by_project_over_time <- df_completion_subset %>%
	filter(!is.na(success.rating)) %>%
	group_by(project.type, completion.year) %>%
	filter(!is.na(completion.year)) %>%
	summarise(projects.assessed = n())

	total_by_project_over_time <- df_completion_subset %>%
	group_by(project.type, completion.year) %>%
	filter(!is.na(completion.year)) %>%
	summarise(projects.total = n())

	assessed_by_project_over_time <- full_join(assessed_by_project_over_time,
	total_by_project_over_time,
	by = c("project.type" = "project.type", "completion.year" = "completion.year"))

	assessed_by_project_over_time[is.na(assessed_by_project_over_time)] <- 0
	assessed_by_project_over_time$percentage.assessed <-
	100 * assessed_by_project_over_time$projects.assessed / assessed_by_project_over_time$projects.total

	assessed_by_project_over_time <- assessed_by_project_over_time %>%
	select(-projects.assessed, -projects.total)
	View(assessed_by_project_over_time)

	# 13. Has the distribution of project-types changed over time?

	project_type_over_time <- df_completion_subset %>%
	group_by(project.type, completion.year) %>%
	filter(!is.na(completion.year)) %>%
	summarise(projects.type.year = n())

	total_projects_over_time <- df_completion_subset %>%
	group_by(completion.year) %>%
	filter(!is.na(completion.year)) %>%
	summarise(projects.year = n())

	project_type_over_time <- left_join(project_type_over_time,
	total_projects_over_time,
	by = c("completion.year" = "completion.year"))

	project_type_over_time$percentage.assessed <-
	100 * project_type_over_time$projects.type.year / project_type_over_time$projects.year

	project_type_over_time <- project_type_over_time %>%
	select(-projects.type.year, -projects.year)
	View(project_type_over_time)