Skip to content

Instantly share code, notes, and snippets.

@m-note
Last active September 28, 2015 06:22
Show Gist options
  • Select an option

  • Save m-note/a395d1a938f0fbedfd44 to your computer and use it in GitHub Desktop.

Select an option

Save m-note/a395d1a938f0fbedfd44 to your computer and use it in GitHub Desktop.
Chapter 3の事例選択
---
title: "Dissertation_Case_Match"
output: html_document
---
パッケージの読み込み
```{r warning=FALSE}
library(dplyr)
```
データの読み込み
```{r warning=FALSE}
data <- read.csv("Data1.csv")
```
国によって年数が違うと上手く分析できないのでそこを確認する。
すべての国が20年ずつだとわかった。
```{r warning=FALSE}
country_list <- unique(data[,"recipient"])
for (i in 1:length(country_list)){
print(paste(country_list[i], nrow(data[data$"recipient"==country_list[i],])))
}
```
idや落とす変数の設定をする
```{r warning=FALSE}
dropvars <- c("year", "gwf_next", "gwf_prior", "gwf_nonautocracy", # 数字でない変数も落とす
# ここ以下はNAがあるもの
"p4_flag", "p4_democ", "p4_autoc", "p4_polity", "p4_durable", "p4_xrreg",
"p4_xrcomp", "p4_xropen", "p4_xconst", "p4_parreg", "p4_parcomp", "fh_pr", "fh_cl",
"fh_mean", "fh_inverse_mean", "fh_sum", "fh_inverse_sum", "fh_max", "fh_inverse_min",
"fh_inverse_max", "gwf_spell", "gwf_disagree", "gwf_party", "gwf_military", "gwf_monarchy",
"gwf_personal", "gwf_duration", "un_region.1", "un_continent.1",
"cgv_collect", "cgv_nmil", "cgv_democracy", "WinningSelec_CountryCode",
# ダミー変数など情報の重複分
"regime1", "regime2", "regime3", "regime4", "regime5", "regime6", "polity_lag",
"polity_diff", "regime1.1", "regime2.1", "regime3.1", "regime4.1", "regime5.1", "regime6.1",
"W_S", "WS_1", "WS_2", "WS_3", "WS_4", "WS_5", "WS_6", "WS_7", "WS_8", "WS_9", "WS_10", "WS_11",
"WS_12", "WS_13", "WS_14", "WS_15", "WS_16", "WS_17", "WS_18",
# システムは数値的に特異ですというエラーが出たので幾つか変数を落としてみる --> mahalanobis関数の問題だった
#"un_region", "un_continent", "p4_fragment", "fh_status", "fh_min", "gwf_fail", "Aid_Dem",
"GDP.per.capita.growth.annual",
#"GDP.per.capita.constant.2005.US",
"GDP.growth.annual", "Population.growth.annual", "Population.total",
"Health.expenditure.public.of.government.expenditure",
"Government.expenditure.on.education.total.of.government.expenditure", "Gross.savings.current.US",
"Gross.domestic.savings.of.GDP", "Gross.savings.of.GDP", "GDP.per.capita.constant.2005.US",
"Taxes.on.exports.of.tax.revenue",
"Oil.rents.of.GDP.",
"Total.natural.resources.rents.of.GDP",
"cgv_regime",
"gwf_regimetype",
"WinningCoalition", "Selectorate",
#"Corruption",
"Tax.revenue.of.GDP"
)
id <- "recipient"
```
関数
```{r warning=FALSE}
caseMatchPD <- function(data, id, year=1, dropvars=NA, returnNum=NA, method="mahalanobis", treatment=NULL){
# data: 使うデータ
# id: 国名のように識別になるもの
# year: 上から同じ国に関して、yearの数字ずつ国が入っていると考える
# デフォルトでは、各国一年ずつだと考えられているのでyear=1
# dropvars: 使わない列
# returnNum: いくつの結果を返すか
# dropvarsに何か入っていれば取り除く (列の削除)
if(is.na(dropvars[1])){}else{data <- data[, !(colnames(data) %in% dropvars)]}
# uniqueなidベクトル
id_unique <- unique(data[,id])
# treatmentがあるなら、データセットをtreatmentのあるなしで2つに分ける
if (!is.null(treatment)){
## Dataset with treatment
data_T <- data[data[, treatment]==1, ]
data_T <- data_T[ , !(colnames(data_T) %in% treatment)]
data_NT <- data[data[, treatment]==0, ]
data_NT <- data_NT[ , !(colnames(data_NT) %in% treatment)]
}
# 年数ごとにベクトルを作り、リストにまとめる / treatmentのあるなしで処理が違う
if (is.null(treatment)){
## Dataset without treatment
for_time = 1
result_list <- as.list(NULL) # 結果をまとめるリスト
for (i in seq(1, nrow(data), year)){
data_temp <- data[data[id]==as.character(id_unique[for_time]), # idが一致
!(colnames(data) %in% id)] # ベクトルを作るときにidは不要
result_list <- append(result_list, list(as.vector(t(data_temp))))
for_time = for_time + 1
}
}else{
## Dataset with treatment
for_time = 1
id_unique_T <- unique(data_T[, id])
result_list_T <- as.list(NULL) # 結果をまとめるリスト
for (i in seq(1, nrow(data_T), year)){
data_temp <- data_T[data_T[id]==id_unique_T[for_time], # idが一致
!(colnames(data_T) %in% id)] # ベクトルを作るときにidは不要
result_list_T <- append(result_list_T, list(as.vector(t(data_temp))))
for_time = for_time + 1
}
### 次にTreatmentが与えられていないもの
for_time = 1
id_unique_NT <- unique(data_NT[, id])
result_list_NT <- as.list(NULL) # 結果をまとめるリスト
for (i in seq(1, nrow(data_NT), year)){
data_temp <- data_NT[data_NT[id]==id_unique_NT[for_time], # idが一致
!(colnames(data_NT) %in% id)] # ベクトルを作るときにidは不要
result_list_NT <- append(result_list_NT, list(as.vector(t(data_temp))))
for_time = for_time + 1
}
}
# treatmentがないものは先にどのようなidの組み合わせがあるかを求める、一行が一つの組み合わせに対応
if (is.null(treatment)){
combination <- as.data.frame(t(combn(seq(1, length(id_unique), 1), 2)))
print(paste("Number of combinations:", nrow(combination)))
combi_length <- nrow(combination)
}
# 指定されたmethodごとに類似度を計算する
if (method=="cos"){
# 組み合わせごとのcos類似度を求める
result <- data.frame("id1" = NA, "id2" = NA, "similarity"=NA)
for (s in 1:nrow(combination)){
country1_num <- combination[s, 1]
country2_num <- combination[s, 2]
country1_vec <- unlist(result_list[country1_num])
country2_vec <- unlist(result_list[country2_num])
cos <- country1_vec %*% country2_vec / sqrt(country1_vec%*%country1_vec * country2_vec%*%country2_vec)
result <- rbind.data.frame(result,
data.frame("id1" = id_unique[country1_num], "id2" = id_unique[country2_num], "similarity"=cos))
}
result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す
# dplyrでソートする
result <- result %>%
dplyr::arrange(desc(similarity))
}
if(method=="euclid"){
# 組み合わせごとのユークリッド距離を求める
result <- data.frame("id1" = NA, "id2" = NA, "distance"=NA)
for (s in 1:nrow(combination)){
id1_num <- combination[s, 1]
id2_num <- combination[s, 2]
id1_vec <- unlist(result_list[id1_num])
id2_vec <- unlist(result_list[id2_num])
euc <- sqrt(sum((id1_vec - id2_vec) ^ 2))
result <- rbind.data.frame(result,
data.frame("id1" = id_unique[id1_num], "id2" = id_unique[id2_num], "distance"=euc))
}
result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す
# dplyrでソートする
result <- result %>%
dplyr::arrange(distance)
}
if(method=="mahalanobis" & is.null(treatment)){ # Datast without treatment
result <- data.frame("id1" = NA, "id2" = NA, "distance"=NA)
# year=1とyear > 1でcovの作り方が変わってくる
if (year > 1){
data_temp <- data[, !(colnames(data) %in% c(id, dropvars))]
data_temp <- t(data.frame(result_list))
rownames(data_temp) <- NULL
covData <- cov(data_temp)
}else{covData <- cov(data[, !(colnames(data) %in% c(id, dropvars))])}
t<-proc.time() # 時間を測る
for (s in 1:nrow(combination)){
id1_num <- combination[s, 1]
id2_num <- combination[s, 2]
id1_vec <- unlist(result_list[id1_num])
id2_vec <- unlist(result_list[id2_num])
print(paste("Processing", s, "out of", combi_length, "/", id_unique[id1_num], id_unique[id2_num]))
maha <- mahalanobis(id1_vec, id2_vec, covData, tol=1.5e-30) # set tolalence http://goo.gl/hmmia0
result <- rbind.data.frame(result,
data.frame("id1" = id_unique[id1_num], "id2" = id_unique[id2_num], "distance"=maha))
if (maha < 0){
print("Warning: Mahalanobis distance is negative value")
break
}
}
print(proc.time()-t) # かかった時間
result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す
# dplyrでソートする
result <- result %>%
dplyr::arrange(distance)
# 負のマハラノビス距離は分散・共分散行列の逆行列が正しく計算されていないために生じる
}
if(method=="mahalanobis" & !is.null(treatment)){ # Datast with treatment
result <- data.frame("id1" = NA, "id2" = NA, "distance"=NA)
# year=1とyear > 1でcovの作り方が変わってくる
if (year > 1){
data_temp <- data[, !(colnames(data) %in% c(id, dropvars, treatment))]
data_temp <- t(data.frame(result_list))
rownames(data_temp) <- NULL
covData <- cov(data_temp)
}else{covData <- cov(data[, !(colnames(data) %in% c(id, dropvars, treatment))])}
t<-proc.time() # 時間を測る
for (i in 1:length(result_list_T)){
treatment_vec <- unlist(result_list_T[i])
for (s in 1:length(result_list_NT)){
not_treatment_vec <- unlist(result_list_NT[s])
maha <- mahalanobis(treatment_vec, not_treatment_vec, covData)
result <- rbind.data.frame(result,
data.frame("id1" = id_unique_T[i], "id2" = id_unique_NT[s], "distance"=maha))
}
}
# dplyrでソートする
print(proc.time()-t) # かかった時間
result <- result[2:nrow(result),] # 初期化のときに付けたNAを消す
result <- result %>%
dplyr::arrange(distance)
}
if(is.na(returnNum)){
return (result)
}else{return (result[1:returnNum,])}
}
```
投入する変数が多すぎるとマハラノビス距離が負になることがあった(値が小さすぎるものとかがあるとエラーが出やすい)
```{r warning=FALSE, results="hide"}
outcome <- caseMatchPD(data, id, year=20, dropvars=dropvars, method="mahalanobis")
```
```{r}
head(outcome)
```
ユークリッド距離を使うと結果は変わる(これはcaseMatchパッケージでも同じ)
```{r warning=FALSE}
outcome2 <- caseMatchPD(data, id, year=20, dropvars=dropvars, method="euclid")
head(outcome)
```
caseMatchパッケージのEUデータに対しても適用可能(結果は同じ)
```{r warning=FALSE, results="hide"}
outcome3 <- caseMatchPD(EU, id_test, treatment="eu", year=1, dropvars=dropvars_test, method="mahalanobis")
```
```{r}
head(outcome3)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment