Skip to content

Instantly share code, notes, and snippets.

@gghatano
Created January 10, 2014 00:05
Show Gist options
  • Save gghatano/8344512 to your computer and use it in GitHub Desktop.
Save gghatano/8344512 to your computer and use it in GitHub Desktop.
Surrogate data test for MLB at-bat data
# サロゲートデータ法の実験をメジャーリーグのデータでやる
library(plyr)
library(MASS)
# Load the field-name table and the 2011 play-by-play data, then label the
# data columns with the "Header" column of the field table.
# NOTE(review): read.csv() defaults to header = TRUE, so the first line of
# all2011.csv is consumed as column names before they are overwritten here.
# If that file has no header row, its first record is lost - confirm.
fields <- read.csv("fields.csv")
data2011 <- read.csv("all2011.csv")
colnames(data2011) <- fields[, "Header"]
# For a 0/1 vector, return the lengths of the runs of 1s, in order.
# The vector is padded with a 0 on each side so that runs touching either
# end are also delimited by zeros; the distance between consecutive zeros,
# minus one, is the number of non-zero entries between them.
streaks <- function(y){
  padded <- c(0, y, 0)
  zero.pos <- seq_along(padded)[padded == 0]
  run.lengths <- diff(zero.pos) - 1
  # adjacent zeros give a gap of length 0, which is not a run
  run.lengths[run.lengths > 0]
}
# Randomly shuffle the 0/1 data and return the sum of squared lengths of the
# runs of 0s in the shuffled sequence.
random.mix <- function(y){
  shuffled <- sample(y)
  # streaks() counts runs of 1s, so flip 0 <-> 1 to measure the runs of 0s
  zero.runs <- streaks(1 - shuffled)
  sum(zero.runs^2)
}
# clump.test: permutation (surrogate data) test of streakiness for one batter.
#
# The statistic is the sum of squared lengths of the hitless runs in the
# batter's at-bat sequence. Its reference distribution is built by shuffling
# the observed hit/no-hit sequence, and the observed statistic is compared
# against it.
#
# Default number of shuffles: 1000 - 1, so that the observed value together
# with the shuffled values makes 1000 values in total.
reps <- 1000-1

# Arguments:
#   playerid  value matched against the BAT_ID column of `data`
#   data      data frame with the columns BAT_ID, AB_FL, H_FL and GAME_ID
#   n.reps    number of shuffles; defaults to the global `reps`
# Side effect: draws a histogram of the shuffled statistics with a vertical
#   line and a label at the observed statistic.
# Returns: (number of shuffled statistics strictly below the observed one
#   + 1) / (n.reps + 1), i.e. the relative rank of the observed statistic.
# Stops with an error when no row of `data` matches the player's at-bats.
clump.test <- function(playerid, data, n.reps = reps){
  # Explicit indexing instead of subset(): subset() evaluates its condition
  # inside `data` first, so a column named `playerid` would shadow the
  # argument. which() drops NA comparisons, as subset() does.
  ab.rows <- which(data$BAT_ID == playerid & data$AB_FL == TRUE)
  if (length(ab.rows) == 0) {
    stop("no at-bats found for player '", playerid, "'", call. = FALSE)
  }
  player.AB <- data[ab.rows, , drop = FALSE]

  # 1 when H_FL is positive, 0 otherwise
  hit <- ifelse(player.AB$H_FL > 0, 1, 0)

  # Characters 4-12 of GAME_ID are the sort key (presumably the game date
  # plus game number - confirm against the data description). order() keeps
  # tied rows in their original order.
  game.key <- substr(player.AB$GAME_ID, 4, 12)
  hit <- hit[order(game.key)]

  # Reference distribution from shuffled sequences
  ST <- replicate(n.reps, random.mix(hit))
  truehist(ST, xlab = "Clumpiness Statistic")

  # Observed statistic: sum of squared lengths of the hitless runs
  stat <- sum(streaks(1 - hit)^2)
  abline(v=stat, lwd = 3)
  # Label sits slightly to the right of the line at a fixed height
  text(stat * 1.08, 0.0015, paste(playerid), cex=1.2)

  # Position of the observed value among the sorted values (first position
  # on ties) equals the count of strictly smaller shuffled values plus one.
  (sum(ST < stat) + 1) / (n.reps + 1)
}
# Run the test for Ichiro (batter id "suzui001")
clump.test("suzui001", data2011)
# Run the test for Cano (batter id "canor001")
clump.test("canor001", data2011)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment