Skip to content

Instantly share code, notes, and snippets.

View yaojenkuo's full-sized avatar

Yao-Jen Kuo yaojenkuo

View GitHub Profile
@yaojenkuo
yaojenkuo / write_concatenated_df.py
Created December 21, 2017 17:05
taipei_marathon/write_concatenated_df.py
def write_concatenated_df():
url_2011 = "https://www.run2pix.com/report/report_w.php?EventCode=20111218&Race=MA&sn=3"
url_2012 = "https://www.run2pix.com/report/report_w.php?EventCode=20121216&Race=MA&sn=32"
url_2013 = "https://www.run2pix.com/report/report_w.php?EventCode=20131215&Race=MA&sn=57"
url_2014 = "https://www.run2pix.com/report/report_w.php?EventCode=20141221&Race=MA&sn=86"
url_2015 = "https://www.run2pix.com/report/report_w.php?EventCode=20151220&Race=MA&sn=111"
url_2016 = "https://www.run2pix.com/report/report_w.php?EventCode=20161218&Race=MA&sn=136"
url_2017 = "https://www.run2pix.com/report/report_w.php?EventCode=20171217&Race=MA&sn=161"
url_list = [url_2011, url_2012, url_2013, url_2014, url_2015, url_2016, url_2017]
df_list = []
@yaojenkuo
yaojenkuo / get_time_group.R
Created December 22, 2017 02:23
taipei_marathon/get_time_group.R
get_time_group <- function(x) {
sub_2_half <- 2.5 * 3600
sub_3_hour <- 3 * 3600
sub_3_half <- 3.5 * 3600
sub_4_hour <- 4 * 3600
if (x < sub_2_half) {
return("sub 2:30")
} else if (x < sub_3_hour) {
return("sub 3:00")
} else if (x < sub_3_half) {
@yaojenkuo
yaojenkuo / draw_ggplots.R
Last active December 22, 2017 07:37
taipei_marathon/draw_ggplots.R
# Packages used by the plotting script.
# The theme and scale helper packages are published under the plural names
# "ggthemes" and "scales"; the singular spellings make library() stop with
# "there is no package called ...".
library(ggplot2)
library(dplyr)
library(magrittr)
library(ggthemes)
library(scales)
library(plotly)

# Count the finishers for every year/gender combination.
# NOTE(review): `df` is not created in this snippet; it must already exist
# and provide `year` and `gender` columns.
runner_by_year_gender <- df %>%
  group_by(year, gender) %>%
  summarise(finishers = n())
# get_specdata()
# Download the specdata zip archive and extract it.
#
# Args:
#   dest_file: Path the downloaded zip archive is written to.
#   ex_dir:    Directory the archive is extracted into.
#
# Returns:
#   Whatever unzip() returns for the extraction (the function's last call).
get_specdata <- function(dest_file, ex_dir) {
  specdata_url <- "https://storage.googleapis.com/jhu_rprg/specdata.zip"
  # Make sure the folder that receives the archive exists; download.file()
  # does not create missing directories.
  dir.create(dirname(dest_file), recursive = TRUE, showWarnings = FALSE)
  # A zip archive is binary data, so request binary mode explicitly instead
  # of relying on platform-specific defaults.
  download.file(specdata_url, destfile = dest_file, mode = "wb")
  unzip(dest_file, exdir = ex_dir)
}
# Save the archive as ~/Downloads/specdata.zip and extract it into the home
# directory.
get_specdata("~/Downloads/specdata.zip", "~")
# pollutantmean()
pollutantmean <- function(directory, pollutant, id = 1:332) {
csv_files <- list.files(directory) # 使用內建函數 list.files() 建立出 CSV 檔案路徑
csv_file_paths <- paste0(directory, csv_files)
csv_file_paths <- csv_file_paths[id] # 依照輸入的 id 參數選擇性讀入
df_list <- list()
pollutant_vector <- vector()
for (i in 1:length(csv_file_paths)) {
df_list[[i]] <- read.csv(csv_file_paths[i])
pollutant_vector <- c(pollutant_vector, df_list[[i]][, pollutant]) # 將讀入測站的污染物資料合併起來
# complete()
complete <- function(directory, id = 1:332) {
csv_files <- list.files(directory) # 使用內建函數 list.files() 建立出 CSV 檔案路徑
csv_file_paths <- paste0(directory, csv_files)
csv_file_paths <- csv_file_paths[id] # 依照輸入的 id 參數選擇性讀入
df_list <- list()
nobs <- vector()
for (i in 1:length(csv_file_paths)) {
df_list[[i]] <- read.csv(csv_file_paths[i])
is_complete <- complete.cases(df_list[[i]]) # 直接引用 complete.cases() 函數
@yaojenkuo
yaojenkuo / corr.R
Last active February 13, 2018 09:46
# corr()
corr <- function(directory, threshold = 0) {
nobs <- complete(directory)[, "nobs"] # 使用上一題已經定義好的 complete() 函數
if (threshold > max(nobs)) { # 如果使用者輸入的門檻值超過所有測站的最大完整觀測值
return(NULL)
} else {
df_to_read <- nobs > threshold
csv_files <- list.files(directory) # 使用內建函數 list.files() 建立出 CSV 檔案路徑
csv_file_paths <- paste0(directory, csv_files)
csv_file_paths <- csv_file_paths[df_to_read] # 利用邏輯值選出大於等於門檻的測站
# get_hospital_data()
# Download the hospital data zip archive and extract it.
#
# Args:
#   dest_file: Path the downloaded zip archive is written to.
#   ex_dir:    Directory the archive is extracted into.
#
# Returns:
#   Whatever unzip() returns for the extraction (the function's last call).
get_hospital_data <- function(dest_file, ex_dir) {
  hospital_data_url <- "https://storage.googleapis.com/jhu_rprg/hospital_data.zip"
  # Make sure the folder that receives the archive exists; download.file()
  # does not create missing directories.
  dir.create(dirname(dest_file), recursive = TRUE, showWarnings = FALSE)
  # A zip archive is binary data, so request binary mode explicitly instead
  # of relying on platform-specific defaults.
  download.file(hospital_data_url, destfile = dest_file, mode = "wb")
  unzip(dest_file, exdir = ex_dir)
}
# Save the archive as ~/Downloads/hospital_data.zip and extract it into
# ~/hospital_data.
get_hospital_data("~/Downloads/hospital_data.zip", "~/hospital_data")
# To draw a histogram
library(ggplot2)
# Read the outcome-of-care measures, keeping text columns as character
# vectors rather than factors.
file_path <- "~/hospital_data/outcome-of-care-measures.csv"
outcome_of_care_measures <- read.csv(
  file = file_path,
  stringsAsFactors = FALSE
)

# Coerce column 11 to numeric before plotting; entries that cannot be parsed
# as numbers become NA.
outcome_of_care_measures[[11]] <- as.numeric(outcome_of_care_measures[[11]])

# Histogram drawn with the base plotting system.
hist(
  x = outcome_of_care_measures[[11]],
  breaks = 40,
  col = rgb(red = 1, green = 0, blue = 0, alpha = 0.5),
  main = "",
  xlab = "30 Day Death Mortality Rates From Heart Attack"
)
ggplot(outcome_of_care_measures, aes(x = Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack)) +
geom_histogram(bins = 40, fill = rgb(1, 0, 0, 0.5)) +
xlab("30 Day Death Mortality Rates From Heart Attack") +
# best()
best <- function(state, outcome) {
library(dplyr)
library(magrittr)
file_path <- "~/hospital_data/outcome-of-care-measures.csv"
outcome_of_care_measures <- read.csv(file_path, stringsAsFactors = FALSE)
outcome_col_idx <- c(11, 17, 23)
for (i in outcome_col_idx) {
outcome_of_care_measures[, i] <- suppressWarnings(as.numeric(outcome_of_care_measures[, i]))