Created
January 10, 2014 00:05
-
-
Save gghatano/8344512 to your computer and use it in GitHub Desktop.
Surrogate data test for MLB at-bat data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Surrogate-data experiment on Major League Baseball at-bat data.
library(plyr)
library(MASS)

# Load the 2011 event table and label its columns with the names stored in
# the "Header" column of fields.csv.
# NOTE(review): read.csv() defaults to header = TRUE, so the first line of
# all2011.csv is consumed as column names before they are overwritten below.
# If that file has no header row, its first record is lost -- confirm.
fields <- read.csv("fields.csv")
data2011 <- read.csv("all2011.csv")
colnames(data2011) <- fields[, "Header"]
# For a 0/1 sequence, return the lengths of the runs of non-zero entries.
#
# y: vector of 0/1 values.
# Returns a numeric vector with one entry per run, in order of appearance;
# numeric(0) when y contains no non-zero entry (including empty y).
streaks <- function(y) {
  # Pad with a zero on each side so runs touching either end are closed off.
  padded <- c(0, y, 0)
  positions <- seq.int(from = 0, to = length(y) + 1)
  zero.at <- positions[padded == 0]
  # The gap between consecutive zeros, minus one, is the run length between
  # them; adjacent zeros give a gap of 0 and are dropped.
  run.len <- diff(zero.at) - 1
  run.len[run.len > 0]
}
# Randomly permute a 0/1 sequence and return the sum of squared lengths of
# its runs of zeros (the "clumpiness" statistic of the shuffled data).
#
# y: vector of 0/1 values.
# Returns a single number; it depends on the random permutation drawn.
random.mix <- function(y) {
  shuffled <- sample(y)
  # 1 - shuffled swaps zeros and ones, so streaks() measures the zero runs.
  zero.runs <- streaks(1 - shuffled)
  sum(zero.runs^2)
}
# clump.test: surrogate-data (permutation) test of hit "clumpiness" for one
# batter.
#
# The statistic is the sum of squared lengths of the runs of hitless at-bats.
# It is computed once on the observed at-bat sequence and n.reps times on
# random shuffles of that sequence; a histogram of the shuffled values is
# drawn with the observed value marked by a vertical line.
#
# Default number of shuffles: 1000 - 1, so that the observed value plus the
# shuffles make 1000 values in total.
reps <- 1000 - 1

# playerid: value matched against the BAT_ID column of `data`.
# data:     data frame with columns BAT_ID, AB_FL, H_FL and GAME_ID.
# n.reps:   number of random shuffles (defaults to the global `reps`).
# Returns the rank of the observed statistic among the n.reps + 1 values
# (ties resolved to the lowest rank), divided by n.reps + 1.
# Stops with an error when `data` holds no at-bat for `playerid`.
clump.test <- function(playerid, data, n.reps = reps) {
  stopifnot(length(n.reps) == 1, is.numeric(n.reps), n.reps >= 1)

  player.AB <- subset(data, BAT_ID == playerid & AB_FL == TRUE)
  if (nrow(player.AB) == 0) {
    # Without this guard an unknown ID falls through to the plotting code
    # with an all-zero statistic instead of a clear message.
    stop("no at-bats found for player '", playerid, "'", call. = FALSE)
  }

  # 1 when the at-bat produced a hit (H_FL > 0), 0 otherwise.
  player.AB$HIT <- as.numeric(player.AB$H_FL > 0)
  # Characters 4-12 of GAME_ID are used as the sort key.
  # NOTE(review): presumably the date plus game number -- confirm.
  player.AB$DATE <- substr(player.AB$GAME_ID, 4, 12)
  player.AB <- arrange(player.AB, DATE)

  # Null distribution of the statistic under random re-ordering.
  ST <- replicate(n.reps, random.mix(player.AB$HIT))
  truehist(ST, xlab = "Clumpiness Statistic")

  # Observed statistic: squared lengths of the hitless runs, summed.
  stat <- sum(streaks(1 - player.AB$HIT)^2)
  abline(v = stat, lwd = 3)
  text(stat * 1.08, 0.0015, paste(playerid), cex = 1.2)

  # Position of the first occurrence of `stat` in the sorted pooled values,
  # i.e. one more than the number of shuffled values strictly below it.
  frac <- (sum(ST < stat) + 1) / (n.reps + 1)
  frac
}
# Run the test for Ichiro Suzuki.
clump.test(playerid = "suzui001", data = data2011)
# Run the test for Robinson Cano.
clump.test(playerid = "canor001", data = data2011)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment