Yao-Jen Kuo yaojenkuo

Could that data be any tidier? It is always nice to meet a data enthusiast who is also a marathon runner and a ping-pong lover.

322 followers · 42 following

Taipei, Taiwan
https://www.datainpoint.com

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

yaojenkuo / rankhospital.R

Created February 14, 2018 13:50

	# rankhospital()
	rankhospital <- function(state, outcome, num = "best") {
	library(dplyr)
	library(magrittr)

	file_path <- "~/hospital_data/outcome-of-care-measures.csv"
	outcome_of_care_measures <- read.csv(file_path, stringsAsFactors = FALSE)
	outcome_col_idx <- c(11, 17, 23)
	for (i in outcome_col_idx) {
	outcome_of_care_measures[, i] <- suppressWarnings(as.numeric(outcome_of_care_measures[, i]))

yaojenkuo / rankall.R

Last active February 14, 2018 14:07

	# rankall()
	rankall <- function(outcome, num = "best") {
	library(dplyr)
	library(magrittr)

	file_path <- "~/hospital_data/outcome-of-care-measures.csv"
	outcome_of_care_measures <- read.csv(file_path, stringsAsFactors = FALSE)
	outcome_col_idx <- c(11, 17, 23)
	for (i in outcome_col_idx) {
	outcome_of_care_measures[, i] <- suppressWarnings(as.numeric(outcome_of_care_measures[, i]))

yaojenkuo / run_analysis.R

Created March 1, 2018 21:47

	# run_analysis.R
	library(dplyr)
	library(magrittr)

	## get_data: Get required data into a list
	get_data <- function(dest_file, ex_dir) {
	data_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
	download.file(data_url, destfile = dest_file) # 下載壓縮檔
	unzip(dest_file, exdir = ex_dir) # 解壓縮
	file_names <- c("train/X_train.txt", "train/y_train.txt", "train/subject_train.txt", "test/X_test.txt", "test/y_test.txt", "test/subject_test.txt", "features.txt")

yaojenkuo / get_list_of_university_towns.py

Created March 30, 2018 03:24

# Use this dictionary to map two-letter acronyms to state names

states = {'OH': 'Ohio', 'KY': 'Kentucky', 'AS': 'American Samoa', 'NV': 'Nevada', 'WY': 'Wyoming', 'NA': 'National', 'AL': 'Alabama', 'MD': 'Maryland', 'AK': 'Alaska', 'UT': 'Utah', 'OR': 'Oregon', 'MT': 'Montana', 'IL': 'Illinois', 'TN': 'Tennessee', 'DC': 'District of Columbia', 'VT': 'Vermont', 'ID': 'Idaho', 'AR': 'Arkansas', 'ME': 'Maine', 'WA': 'Washington', 'HI': 'Hawaii', 'WI': 'Wisconsin', 'MI': 'Michigan', 'IN': 'Indiana', 'NJ': 'New Jersey', 'AZ': 'Arizona', 'GU': 'Guam', 'MS': 'Mississippi', 'PR': 'Puerto Rico', 'NC': 'North Carolina', 'TX': 'Texas', 'SD': 'South Dakota', 'MP': 'Northern Mariana Islands', 'IA': 'Iowa', 'MO': 'Missouri', 'CT': 'Connecticut', 'WV': 'West Virginia', 'SC': 'South Carolina', 'LA': 'Louisiana', 'KS': 'Kansas', 'NY': 'New York', 'NE': 'Nebraska', 'OK': 'Oklahoma', 'FL': 'Florida', 'CA': 'California', 'CO': 'Colorado', 'PA': 'Pennsylvania', 'DE': 'Delaware', 'NM': 'New Mexico', 'RI': 'Rhode Island', 'MN': 'Mi

yaojenkuo / get_recession_start.py

Created March 30, 2018 03:39

	def get_recession_flag():
	"""
	Returns a DataFrame with Recession Starts Flag
	"""
	df = pd.read_excel('https://storage.googleapis.com/um_ds_intro/gdplev.xls', skiprows=220, header=None)
	df = df.iloc[:, [4, 6]]
	df.columns = ['Quarterly', 'GDP in billions of chained 2009 dollars']
	# 取得 lagged 資料框 ------------
	df_shift = df.shift(1)
	df_concat = pd.concat([df, df_shift['GDP in billions of chained 2009 dollars']], axis = 1)

yaojenkuo / get_recession_end.py

Created March 30, 2018 03:46

	def get_recession_end():
	"""
	Returns the year and quarter of the recession end time as a
	string value in a format such as 2005q3
	"""
	df = get_recession_flag()
	recession_start_idx = df[df['is_recession_start'] == True]['Quarterly'].values[0]
	df = df.set_index('Quarterly')
	# 篩選衰退起始以後的觀測值 -------------
	df_after_recession = df.loc[recession_start_idx:, :].drop('is_recession_start', axis = 1)

yaojenkuo / get_recession_bottom.py

Created March 30, 2018 04:11

	def get_recession_bottom():
	"""
	Returns the year and quarter of the recession bottom time as a
	string value in a format such as 2005q3
	"""
	df = pd.read_excel('https://storage.googleapis.com/um_ds_intro/gdplev.xls', skiprows=220, header=None)
	df = df.iloc[:, [4, 6]]
	df.columns = ['Quarterly', 'GDP in billions of chained 2009 dollars']
	df = df.set_index('Quarterly')
	recession_starts = get_recession_start()

yaojenkuo / convert_housing_data_to_quarters.py

Created March 30, 2018 04:27

	def convert_housing_data_to_quarters():
	"""
	Converts the housing data to quarters and returns it as mean
	values in a dataframe. This dataframe should be a dataframe with
	columns for 2000q1 through 2016q3, and should have a multi-index
	in the shape of ["State","RegionName"].

	Note: Quarters are defined in the assignment description, they are
	not arbitrary three month periods.

yaojenkuo / run_ttest.py

Created March 30, 2018 04:59

	def run_ttest():
	"""
	First creates new data showing the decline or growth of housing prices
	between the recession start and the recession bottom. Then runs a ttest
	comparing the university town values to the non-university towns values,
	return whether the alternative hypothesis (that the two groups are different)
	is true or not as well as the p-value of the confidence.

	Return the tuple (different, p, better) where different=True if the t-test is
	True at a p<0.01 (we reject the null hypothesis), or different=False if

yaojenkuo / headless_firefox.py

Created April 2, 2018 16:15

	# Drive Firefox in headless mode through Selenium and print data read from an IMDb title page.
	from selenium import webdriver
	from selenium.webdriver.firefox.options import Options

	# Configure Firefox to run in headless mode.
	options = Options()
	options.set_headless(headless=True)
	# The geckodriver location is hard-coded to an absolute path; adjust it for the host running the script.
	driver = webdriver.Firefox(firefox_options=options, executable_path='/home/ubuntu/geckodriver')
	# Load the title page that the selectors below are applied to.
	driver.get("http://www.imdb.com/title/tt3783958/")
	# Print the text of the element matched by 'strong span', labelled as the rating.
	elem = driver.find_element_by_css_selector('strong span')
	print("Rating: {}".format(elem.text))
	# Rebind elem to the collection of elements matching '.subtext .itemprop';
	# NOTE(review): how this collection is consumed is not visible in this excerpt.
	elem = driver.find_elements_by_css_selector('.subtext .itemprop')

Older Newer