Last active
September 28, 2015 06:22
-
-
Save m-note/a395d1a938f0fbedfd44 to your computer and use it in GitHub Desktop.
Chapter 3の事例選択
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| title: "Dissertation_Case_Match" | |
| output: html_document | |
| --- | |
| パッケージの読み込み | |
| ```{r warning=FALSE} | |
| library(dplyr) | |
| ``` | |
| データの読み込み | |
| ```{r warning=FALSE} | |
| data <- read.csv("Data1.csv") | |
| ``` | |
| 国によって年数が違うと上手く分析できないのでそこを確認する。 | |
| すべての国が20年ずつだとわかった。 | |
| ```{r warning=FALSE} | |
| country_list <- unique(data[,"recipient"]) | |
| for (i in 1:length(country_list)){ | |
| print(paste(country_list[i], nrow(data[data$"recipient"==country_list[i],]))) | |
| } | |
| ``` | |
| idや落とす変数の設定をする | |
| ```{r warning=FALSE} | |
| dropvars <- c("year", "gwf_next", "gwf_prior", "gwf_nonautocracy", # 数字でない変数も落とす | |
| # ここ以下はNAがあるもの | |
| "p4_flag", "p4_democ", "p4_autoc", "p4_polity", "p4_durable", "p4_xrreg", | |
| "p4_xrcomp", "p4_xropen", "p4_xconst", "p4_parreg", "p4_parcomp", "fh_pr", "fh_cl", | |
| "fh_mean", "fh_inverse_mean", "fh_sum", "fh_inverse_sum", "fh_max", "fh_inverse_min", | |
| "fh_inverse_max", "gwf_spell", "gwf_disagree", "gwf_party", "gwf_military", "gwf_monarchy", | |
| "gwf_personal", "gwf_duration", "un_region.1", "un_continent.1", | |
| "cgv_collect", "cgv_nmil", "cgv_democracy", "WinningSelec_CountryCode", | |
| # ダミー変数など情報の重複分 | |
| "regime1", "regime2", "regime3", "regime4", "regime5", "regime6", "polity_lag", | |
| "polity_diff", "regime1.1", "regime2.1", "regime3.1", "regime4.1", "regime5.1", "regime6.1", | |
| "W_S", "WS_1", "WS_2", "WS_3", "WS_4", "WS_5", "WS_6", "WS_7", "WS_8", "WS_9", "WS_10", "WS_11", | |
| "WS_12", "WS_13", "WS_14", "WS_15", "WS_16", "WS_17", "WS_18", | |
| # システムは数値的に特異ですというエラーが出たので幾つか変数を落としてみる --> mahalanobis関数の問題だった | |
| #"un_region", "un_continent", "p4_fragment", "fh_status", "fh_min", "gwf_fail", "Aid_Dem", | |
| "GDP.per.capita.growth.annual", | |
| #"GDP.per.capita.constant.2005.US", | |
| "GDP.growth.annual", "Population.growth.annual", "Population.total", | |
| "Health.expenditure.public.of.government.expenditure", | |
| "Government.expenditure.on.education.total.of.government.expenditure", "Gross.savings.current.US", | |
| "Gross.domestic.savings.of.GDP", "Gross.savings.of.GDP", "GDP.per.capita.constant.2005.US", | |
| "Taxes.on.exports.of.tax.revenue", | |
| "Oil.rents.of.GDP.", | |
| "Total.natural.resources.rents.of.GDP", | |
| "cgv_regime", | |
| "gwf_regimetype", | |
| "WinningCoalition", "Selectorate", | |
| #"Corruption", | |
| "Tax.revenue.of.GDP" | |
| ) | |
| id <- "recipient" | |
| ``` | |
| 関数 | |
| ```{r warning=FALSE} | |
| caseMatchPD <- function(data, id, year=1, dropvars=NA, returnNum=NA, method="mahalanobis", treatment=NULL){ | |
| # data: 使うデータ | |
| # id: 国名のように識別になるもの | |
| # year: 上から同じ国に関して、yearの数字ずつ国が入っていると考える | |
| # デフォルトでは、各国一年ずつだと考えられているのでyear=1 | |
| # dropvars: 使わない列 | |
| # returnNum: いくつの結果を返すか | |
| # dropvarsに何か入っていれば取り除く (列の削除) | |
| if(is.na(dropvars[1])){}else{data <- data[, !(colnames(data) %in% dropvars)]} | |
| # uniqueなidベクトル | |
| id_unique <- unique(data[,id]) | |
| # treatmentがあるなら、データセットをtreatmentのあるなしで2つに分ける | |
| if (!is.null(treatment)){ | |
| ## Dataset with treatment | |
| data_T <- data[data[, treatment]==1, ] | |
| data_T <- data_T[ , !(colnames(data_T) %in% treatment)] | |
| data_NT <- data[data[, treatment]==0, ] | |
| data_NT <- data_NT[ , !(colnames(data_NT) %in% treatment)] | |
| } | |
| # 年数ごとにベクトルを作り、リストにまとめる / treatmentのあるなしで処理が違う | |
| if (is.null(treatment)){ | |
| ## Dataset without treatment | |
| for_time = 1 | |
| result_list <- as.list(NULL) # 結果をまとめるリスト | |
| for (i in seq(1, nrow(data), year)){ | |
| data_temp <- data[data[id]==as.character(id_unique[for_time]), # idが一致 | |
| !(colnames(data) %in% id)] # ベクトルを作るときにidは不要 | |
| result_list <- append(result_list, list(as.vector(t(data_temp)))) | |
| for_time = for_time + 1 | |
| } | |
| }else{ | |
| ## Dataset with treatment | |
| for_time = 1 | |
| id_unique_T <- unique(data_T[, id]) | |
| result_list_T <- as.list(NULL) # 結果をまとめるリスト | |
| for (i in seq(1, nrow(data_T), year)){ | |
| data_temp <- data_T[data_T[id]==id_unique_T[for_time], # idが一致 | |
| !(colnames(data_T) %in% id)] # ベクトルを作るときにidは不要 | |
| result_list_T <- append(result_list_T, list(as.vector(t(data_temp)))) | |
| for_time = for_time + 1 | |
| } | |
| ### 次にTreatmentが与えられていないもの | |
| for_time = 1 | |
| id_unique_NT <- unique(data_NT[, id]) | |
| result_list_NT <- as.list(NULL) # 結果をまとめるリスト | |
| for (i in seq(1, nrow(data_NT), year)){ | |
| data_temp <- data_NT[data_NT[id]==id_unique_NT[for_time], # idが一致 | |
| !(colnames(data_NT) %in% id)] # ベクトルを作るときにidは不要 | |
| result_list_NT <- append(result_list_NT, list(as.vector(t(data_temp)))) | |
| for_time = for_time + 1 | |
| } | |
| } | |
| # treatmentがないものは先にどのようなidの組み合わせがあるかを求める、一行が一つの組み合わせに対応 | |
| if (is.null(treatment)){ | |
| combination <- as.data.frame(t(combn(seq(1, length(id_unique), 1), 2))) | |
| print(paste("Number of combinations:", nrow(combination))) | |
| combi_length <- nrow(combination) | |
| } | |
| # 指定されたmethodごとに類似度を計算する | |
| if (method=="cos"){ | |
| # 組み合わせごとのcos類似度を求める | |
| result <- data.frame("id1" = NA, "id2" = NA, "similarity"=NA) | |
| for (s in 1:nrow(combination)){ | |
| country1_num <- combination[s, 1] | |
| country2_num <- combination[s, 2] | |
| country1_vec <- unlist(result_list[country1_num]) | |
| country2_vec <- unlist(result_list[country2_num]) | |
| cos <- country1_vec %*% country2_vec / sqrt(country1_vec%*%country1_vec * country2_vec%*%country2_vec) | |
| result <- rbind.data.frame(result, | |
| data.frame("id1" = id_unique[country1_num], "id2" = id_unique[country2_num], "similarity"=cos)) | |
| } | |
| result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す | |
| # dplyrでソートする | |
| result <- result %>% | |
| dplyr::arrange(desc(similarity)) | |
| } | |
| if(method=="euclid"){ | |
| # 組み合わせごとのユークリッド距離を求める | |
| result <- data.frame("id1" = NA, "id2" = NA, "distance"=NA) | |
| for (s in 1:nrow(combination)){ | |
| id1_num <- combination[s, 1] | |
| id2_num <- combination[s, 2] | |
| id1_vec <- unlist(result_list[id1_num]) | |
| id2_vec <- unlist(result_list[id2_num]) | |
| euc <- sqrt(sum((id1_vec - id2_vec) ^ 2)) | |
| result <- rbind.data.frame(result, | |
| data.frame("id1" = id_unique[id1_num], "id2" = id_unique[id2_num], "distance"=euc)) | |
| } | |
| result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す | |
| # dplyrでソートする | |
| result <- result %>% | |
| dplyr::arrange(distance) | |
| } | |
| if(method=="mahalanobis" & is.null(treatment)){ # Datast without treatment | |
| result <- data.frame("id1" = NA, "id2" = NA, "distance"=NA) | |
| # year=1とyear > 1でcovの作り方が変わってくる | |
| if (year > 1){ | |
| data_temp <- data[, !(colnames(data) %in% c(id, dropvars))] | |
| data_temp <- t(data.frame(result_list)) | |
| rownames(data_temp) <- NULL | |
| covData <- cov(data_temp) | |
| }else{covData <- cov(data[, !(colnames(data) %in% c(id, dropvars))])} | |
| t<-proc.time() # 時間を測る | |
| for (s in 1:nrow(combination)){ | |
| id1_num <- combination[s, 1] | |
| id2_num <- combination[s, 2] | |
| id1_vec <- unlist(result_list[id1_num]) | |
| id2_vec <- unlist(result_list[id2_num]) | |
| print(paste("Processing", s, "out of", combi_length, "/", id_unique[id1_num], id_unique[id2_num])) | |
| maha <- mahalanobis(id1_vec, id2_vec, covData, tol=1.5e-30) # set tolalence http://goo.gl/hmmia0 | |
| result <- rbind.data.frame(result, | |
| data.frame("id1" = id_unique[id1_num], "id2" = id_unique[id2_num], "distance"=maha)) | |
| if (maha < 0){ | |
| print("Warning: Mahalanobis distance is negative value") | |
| break | |
| } | |
| } | |
| print(proc.time()-t) # かかった時間 | |
| result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す | |
| # dplyrでソートする | |
| result <- result %>% | |
| dplyr::arrange(distance) | |
| # 負のマハラノビス距離は分散・共分散行列の逆行列が正しく計算されていないために生じる | |
| } | |
| if(method=="mahalanobis" & !is.null(treatment)){ # Datast with treatment | |
| result <- data.frame("id1" = NA, "id2" = NA, "distance"=NA) | |
| # year=1とyear > 1でcovの作り方が変わってくる | |
| if (year > 1){ | |
| data_temp <- data[, !(colnames(data) %in% c(id, dropvars, treatment))] | |
| data_temp <- t(data.frame(result_list)) | |
| rownames(data_temp) <- NULL | |
| covData <- cov(data_temp) | |
| }else{covData <- cov(data[, !(colnames(data) %in% c(id, dropvars, treatment))])} | |
| t<-proc.time() # 時間を測る | |
| for (i in 1:length(result_list_T)){ | |
| treatment_vec <- unlist(result_list_T[i]) | |
| for (s in 1:length(result_list_NT)){ | |
| not_treatment_vec <- unlist(result_list_NT[s]) | |
| maha <- mahalanobis(treatment_vec, not_treatment_vec, covData) | |
| result <- rbind.data.frame(result, | |
| data.frame("id1" = id_unique_T[i], "id2" = id_unique_NT[s], "distance"=maha)) | |
| } | |
| } | |
| # dplyrでソートする | |
| print(proc.time()-t) # かかった時間 | |
| result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す | |
| result <- result %>% | |
| dplyr::arrange(distance) | |
| } | |
| if(is.na(returnNum)){ | |
| return (result) | |
| }else{return (result[1:returnNum,])} | |
| } | |
| ``` | |
| 投入する変数が多すぎるとマハラノビス距離が負になることがあった(値が小さすぎるものとかがあるとエラーが出やすい) | |
| ```{r warning=FALSE, results="hide"} | |
| outcome <- caseMatchPD(data, id, year=20, dropvars=dropvars, method="mahalanobis") | |
| ``` | |
| ```{r} | |
| head(outcome) | |
| ``` | |
| ユークリッド距離を使うと結果は変わる(これはcaseMatchパッケージでも同じ) | |
| ```{r warning=FALSE} | |
| outcome2 <- caseMatchPD(data, id, year=20, dropvars=dropvars, method="euclid") | |
| head(outcome) | |
| ``` | |
| caseMatchパッケージのEUデータに対しても適用可能(結果は同じ) | |
| ```{r warning=FALSE, results="hide"} | |
| outcome3 <- caseMatchPD(EU, id_test, treatment="eu", year=1, dropvars=dropvars_test, method="mahalanobis") | |
| ``` | |
| ```{r} | |
| head(outcome3) | |
| ``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment