Skip to content

Instantly share code, notes, and snippets.

@gghatano
Last active March 25, 2020 02:37
Show Gist options
  • Save gghatano/ee530249becfab0e8e77 to your computer and use it in GitHub Desktop.
Save gghatano/ee530249becfab0e8e77 to your computer and use it in GitHub Desktop.
編集距離を使って名寄せをしたい (Bradley−Terryモデルで力士の強さを推定) ref: https://qiita.com/gg_hatano/items/20fece37c061bde6e3c1
# install.packages("MiscPsycho_1.6.tar", reposz = NULL, type = "source")
library("MiscPsycho")
## stringMatchを使えばいいらしいです。
stringMatch("hoge", "huge", normalize = "NO")
## 対戦相手
dat %>% filter(opponent == "日馬富") %>% head %>%
xtable %>% print(type="html")
dat_opponent %>% dim
## 紐つけたい2つの名前列(nameとopponent)をクロス結合して
## 名前の全組み合わせで編集距離を計算
dat_stringmatch =
dat_name %>%
merge(dat_opponent)
## 本当はdplyr::mutate(dist = stringMatch(name, opponent))としたかった
## 動かないので、無理矢理mapplyする
dist_col = mapply(FUN = function(x,y){return (stringMatch(x,y, normalize = "NO"))}, dat_stringmatch$name, dat_stringmatch$opponent)
dat_stringmatch$dist = dist_col
## 距離の最小値を求める
dat_mindist =
dat_stringmatch %>%
group_by(name) %>%
summarise(dist = min(dist))
## 編集距離が最小のものを抜き出す
## 最小値と名前で結合すればいいはず
dat_stringmatch %>%
merge(dat_mindist, by = c("name", "dist")) %>%
select(name, opponent) %>%
xtable %>% print(type="html")
dat_opponent_name =
dat_stringmatch %>%
merge(dat_mindist, by = c("name", "dist")) %>%
mutate(opponent_name=name) %>%
select(opponent_name, opponent)
dat_result =
dat %>%
merge(dat_opponent_name, by = "opponent") %>%
select(tournament, class, name, room_old, opponent_name, result)
library(BradleyTerry2)
dat_opponent_name =
dat_stringmatch %>%
merge(dat_mindist, by = c("name", "dist")) %>%
mutate(opponent_name=name) %>%
select(opponent_name, opponent)
dat_result =
dat %>%
merge(dat_opponent_name, by = "opponent") %>%
select(tournament, class, name, room_old, opponent_name, result)
## Loading required package: lme4
## Loading required package: Matrix
dat_winlose =
dat_result %>%
group_by(name, opponent_name) %>%
summarise(wins = sum(result == "W"), loses = sum(result=="L")) %>%
mutate(winner = ifelse(wins == 1, name, opponent_name)) %>%
mutate(loser = ifelse(wins == 0, name, opponent_name)) %>%
group_by(winner, loser, add=FALSE) %>%
summarise(wins = sum(wins), loses = 0)
sumoBT = BTm(outcome = cbind(wins, loses),
player1 = winner, player2 = loser,
data = dat_winlose)
## BT
sumoBTdf = BTabilities(sumoBT) %>% as.data.frame %>%
mutate(name = row.names(.)) %>%
arrange(desc(ability))
## 勝数を集計
dat_winlose_result =
dat_winlose %>% group_by(winner) %>%
summarise(wins = sum(wins)) %>%
mutate(name = winner) %>% select(-winner)
## BTモデルで推定された強さと勝数を較べてみます
sumoBTdf %>%
merge(dat_winlose_result, by = "name") %>%
arrange(desc(ability)) %>%
xtable %>% print(type="html")
dat_mindist =
dat_stringmatch %>%
group_by(name) %>%
summarise(dist = min(dist))
## 編集距離が距離が最小のものを抜き出す
## 最小値と名前で結合すればいいはず
dat_stringmatch %>%
merge(dat_mindist, by = c("name", "dist")) %>%
select(name, opponent)
## Loading required package: lme4
## Loading required package: Matrix
dat_winlose =
dat_result %>%
group_by(name, opponent_name) %>%
summarise(wins = sum(result == "W"), loses = sum(result=="L")) %>%
mutate(winner = ifelse(wins == 1, name, opponent_name)) %>%
mutate(loser = ifelse(wins == 0, name, opponent_name)) %>%
group_by(winner, loser, add=FALSE) %>%
summarise(wins = sum(wins), loses = 0)
sumoBT = BTm(outcome = cbind(wins, loses),
player1 = winner, player2 = loser,
data = dat_winlose)
## BT
sumoBTdf = BTabilities(sumoBT) %>% as.data.frame %>%
mutate(name = row.names(.)) %>%
arrange(desc(ability))
## 勝数を集計
dat_winlose_result =
dat_winlose %>% group_by(winner) %>%
summarise(wins = sum(wins)) %>%
mutate(name = winner) %>% select(-winner)
## BTモデルで推定された強さと勝数を較べてみます
sumoBTdf %>%
merge(dat_winlose_result, by = "name") %>%
arrange(desc(ability)) %>%
xtable %>% print(type="html")
## normalize = "NO"としないと、文字数で規格化された距離が出力されます。
stringMatch("hoge", "huge")
dat_mindist =
dat_stringmatch %>%
group_by(name) %>%
summarise(dist = min(dist))
## 編集距離が距離が最小のものを抜き出す
## 最小値と名前で結合すればいいはず
dat_stringmatch %>%
merge(dat_mindist, by = c("name", "dist")) %>%
select(name, opponent)
## [1] 0.75
utils::adist("hoge", "huge")
[,1]
[1,] 1
library("readr")
library("dplyr")
library("xtable")
dat = read_csv("winLoseDataTable.dat.no_juryou", col_names = FALSE)
## 名前をつける
names(dat) = c("tournament", "class", "name", "room_old", "opponent", "result")
## 実験用に、平成24年初場所のデータを使う
dat =
dat %>% filter(tournament == "H24-1")
## 内容確認
dat %>% head %>%
xtable %>% print(type="html")
## 名前
dat %>% filter(name == "日馬富士") %>% head %>%
xtable %>% print(type="html")
[,1]
[1,] 1
## 対戦相手
dat %>% filter(opponent == "日馬富") %>% head %>%
xtable %>% print(type="html")
## name列の要素
dat_name = dat %>% select(name) %>% unique
## opponent列の要素
dat_opponent = dat %>% select(opponent) %>% unique
## それぞれの長さ
dat_name %>% dim
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment