Yao-Jen Kuo yaojenkuo

Could that data be any tidier? It is always nice to meet a data enthusiast who is also a marathon runner and a ping-pong lover.

yaojenkuo / write_concatenated_df.py

Created December 21, 2017 17:05

taipei_marathon/write_concatenated_df.py

	# Sets up the inputs for concatenation: one results-page URL per year
	# (2011-2017, matching the date in each EventCode) and an empty df_list.
	# NOTE(review): presumably df_list collects one data frame per URL and
	# Race=MA selects the marathon race - confirm against the full gist.
	def write_concatenated_df():
	url_2011 = "https://www.run2pix.com/report/report_w.php?EventCode=20111218&Race=MA&sn=3"
	url_2012 = "https://www.run2pix.com/report/report_w.php?EventCode=20121216&Race=MA&sn=32"
	url_2013 = "https://www.run2pix.com/report/report_w.php?EventCode=20131215&Race=MA&sn=57"
	url_2014 = "https://www.run2pix.com/report/report_w.php?EventCode=20141221&Race=MA&sn=86"
	url_2015 = "https://www.run2pix.com/report/report_w.php?EventCode=20151220&Race=MA&sn=111"
	url_2016 = "https://www.run2pix.com/report/report_w.php?EventCode=20161218&Race=MA&sn=136"
	url_2017 = "https://www.run2pix.com/report/report_w.php?EventCode=20171217&Race=MA&sn=161"
	url_list = [url_2011, url_2012, url_2013, url_2014, url_2015, url_2016, url_2017]
	df_list = []

yaojenkuo / get_time_group.R

Created December 22, 2017 02:23

taipei_marathon/get_time_group.R

yaojenkuo / draw_ggplots.R

Last active December 22, 2017 07:37

taipei_marathon/draw_ggplots.R

yaojenkuo / get_specdata.R

Created February 13, 2018 08:12

	# get_specdata()
	# Download the specdata zip archive and extract it.
	#
	# Args:
	#   dest_file: path where the downloaded zip archive is saved.
	#   ex_dir:    directory the archive is extracted into.
	# Returns:
	#   The result of unzip() for the downloaded archive.
	# Raises:
	#   An error if download.file() reports a non-zero status.
	get_specdata <- function(dest_file, ex_dir) {
	  specdata_url <- "https://storage.googleapis.com/jhu_rprg/specdata.zip"
	  # download.file() cannot write into a directory that does not exist yet.
	  dir.create(dirname(dest_file), recursive = TRUE, showWarnings = FALSE)
	  # Request a binary transfer explicitly: the payload is a zip archive.
	  status <- download.file(specdata_url, destfile = dest_file, mode = "wb")
	  if (status != 0) {
	    # Do not try to unzip a missing or partial archive.
	    stop("Download failed with status ", status, ": ", specdata_url,
	         call. = FALSE)
	  }
	  unzip(dest_file, exdir = ex_dir)
	}
	get_specdata("~/Downloads/specdata.zip", "~")

yaojenkuo / pollutantmean.R

Created February 13, 2018 08:51

	# pollutantmean()
	# Reads the CSV files selected by `id` from `directory` and gathers the
	# values of the `pollutant` column from each of them into one vector.
	# NOTE: paste0() inserts no separator, so `directory` must already end
	# with a path separator for the built paths to be valid.
	pollutantmean <- function(directory, pollutant, id = 1:332) {
	csv_files <- list.files(directory) # use the built-in list.files() to build the CSV file paths
	csv_file_paths <- paste0(directory, csv_files)
	csv_file_paths <- csv_file_paths[id] # read selectively, according to the id argument
	df_list <- list()
	pollutant_vector <- vector()
	for (i in 1:length(csv_file_paths)) {
	df_list[[i]] <- read.csv(csv_file_paths[i])
	pollutant_vector <- c(pollutant_vector, df_list[[i]][, pollutant]) # append the pollutant values of each station that was read

yaojenkuo / complete.R

Created February 13, 2018 09:12

	# complete()
	# Reads the CSV files selected by `id` from `directory` and flags the
	# complete (NA-free) rows of each one with complete.cases().
	# NOTE: paste0() inserts no separator, so `directory` must already end
	# with a path separator for the built paths to be valid.
	complete <- function(directory, id = 1:332) {
	csv_files <- list.files(directory) # use the built-in list.files() to build the CSV file paths
	csv_file_paths <- paste0(directory, csv_files)
	csv_file_paths <- csv_file_paths[id] # read selectively, according to the id argument
	df_list <- list()
	nobs <- vector()
	for (i in 1:length(csv_file_paths)) {
	df_list[[i]] <- read.csv(csv_file_paths[i])
	is_complete <- complete.cases(df_list[[i]]) # rely directly on the complete.cases() function

yaojenkuo / corr.R

Last active February 13, 2018 09:46

	# corr()
	# Selects the CSV files in `directory` whose "nobs" value, as reported by
	# complete(), is strictly greater than `threshold`.
	# Returns NULL when `threshold` exceeds the largest "nobs" value.
	# NOTE: paste0() inserts no separator, so `directory` must already end
	# with a path separator for the built paths to be valid.
	corr <- function(directory, threshold = 0) {
	nobs <- complete(directory)[, "nobs"] # reuse the complete() function defined in the previous exercise
	if (threshold > max(nobs)) { # the requested threshold exceeds the largest complete-observation count of any station
	return(NULL)
	} else {
	df_to_read <- nobs > threshold
	csv_files <- list.files(directory) # use the built-in list.files() to build the CSV file paths
	csv_file_paths <- paste0(directory, csv_files)
	csv_file_paths <- csv_file_paths[df_to_read] # logical mask keeps the stations strictly above the threshold

yaojenkuo / get_hospital_data.R

Created February 14, 2018 07:48

	# get_hospital_data()
	# Download the hospital_data zip archive and extract it.
	#
	# Args:
	#   dest_file: path where the downloaded zip archive is saved.
	#   ex_dir:    directory the archive is extracted into.
	# Returns:
	#   The result of unzip() for the downloaded archive.
	# Raises:
	#   An error if download.file() reports a non-zero status.
	get_hospital_data <- function(dest_file, ex_dir) {
	  hospital_data_url <- "https://storage.googleapis.com/jhu_rprg/hospital_data.zip"
	  # download.file() cannot write into a directory that does not exist yet.
	  dir.create(dirname(dest_file), recursive = TRUE, showWarnings = FALSE)
	  # Request a binary transfer explicitly: the payload is a zip archive.
	  status <- download.file(hospital_data_url, destfile = dest_file, mode = "wb")
	  if (status != 0) {
	    # Do not try to unzip a missing or partial archive.
	    stop("Download failed with status ", status, ": ", hospital_data_url,
	         call. = FALSE)
	  }
	  unzip(dest_file, exdir = ex_dir)
	}
	get_hospital_data("~/Downloads/hospital_data.zip", "~/hospital_data")

yaojenkuo / draw_histogram.R

Created February 14, 2018 08:13

	# To draw a histogram
	# Plots column 11 of the outcome-of-care CSV twice: once with base
	# graphics and once with ggplot2.
	library(ggplot2)

	file_path <- "~/hospital_data/outcome-of-care-measures.csv"
	# Keep text columns as character vectors instead of factors.
	outcome_of_care_measures <- read.csv(file_path, stringsAsFactors = FALSE)
	# Coerce column 11 to numeric; entries that are not numbers become NA.
	# NOTE(review): presumably column 11 is the heart-attack mortality column
	# that the ggplot call below refers to by name - confirm against the CSV.
	outcome_of_care_measures[, 11] <- as.numeric(outcome_of_care_measures[, 11])
	hist(outcome_of_care_measures[, 11], xlab = "30 Day Death Mortality Rates From Heart Attack", col = rgb(1, 0, 0, 0.5), main = "", breaks = 40) # base plotting system
	# ggplot2 version, using the same bin count and fill colour.
	ggplot(outcome_of_care_measures, aes(x = Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack)) +
	geom_histogram(bins = 40, fill = rgb(1, 0, 0, 0.5)) +
	xlab("30 Day Death Mortality Rates From Heart Attack") +

yaojenkuo / best.R

Created February 14, 2018 09:11

	# best()
	# Loads the outcome-of-care CSV and coerces columns 11, 17 and 23 to
	# numeric before further processing.
	# NOTE(review): presumably those are the three outcome-rate columns that
	# `outcome` chooses between, filtered by `state` - confirm in the full gist.
	best <- function(state, outcome) {
	# Attached inside the function, so every call attaches both packages.
	library(dplyr)
	library(magrittr)

	file_path <- "~/hospital_data/outcome-of-care-measures.csv"
	# Keep text columns as character vectors instead of factors.
	outcome_of_care_measures <- read.csv(file_path, stringsAsFactors = FALSE)
	outcome_col_idx <- c(11, 17, 23)
	for (i in outcome_col_idx) {
	# Non-numeric entries become NA; the coercion warning is deliberately silenced.
	outcome_of_care_measures[, i] <- suppressWarnings(as.numeric(outcome_of_care_measures[, i]))