Created
July 31, 2015 08:18
-
-
Save hnagata/d2c607d6b4eba4f17304 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## grid / ggplot ---- | |
library(ggplot2) | |
library(grid) | |
make.grid <- function(row, col) { | |
grid.newpage() | |
l <- grid.layout(row, col) | |
v <- viewport(layout=l) | |
pushViewport(v) | |
} | |
print.at <- function(o, i, j) { | |
print(o, vp=viewport(layout.pos.row=i, layout.pos.col=j)) | |
} | |
end.grid <- function() { | |
popViewport() | |
} | |
plot.to.file <- FALSE | |
## データの読み込み ---- | |
dat <- read.csv("user.csv", fileEncoding="utf-8") | |
dat$date <- as.POSIXct(dat$date, tz="JST") | |
dat <- dat[order(dat$user, dat$date), ] | |
enam <- c("chartInterval", "chartStability", "chartExpressiveness", "chartRhythm", "chartVibratoLongtone") | |
## とりあえずlm ---- | |
lm.std <- lm(totalPoint ~ chartInterval + chartStability + chartExpressiveness + chartVibratoLongtone + chartRhythm, data=dat) | |
summary(lm.std) | |
## ノンパラlm ---- | |
mask.nonp <- rep(TRUE, nrow(dat)) | |
# 総合点100点を除外 (打ち切り対策) | |
mask.nonp <- mask.nonp & dat$totalPoint < 100 | |
## 各項目30以上だけ使う (singular 対策) | |
#mask.nonp <- mask.nonp & apply(dat[, enam], 1, function(row) all(row[enam] >= 30)) | |
# Singular 対策 | |
mask.nonp <- mask.nonp & dat$chartRhythm != 0 | |
mask.nonp <- mask.nonp & dat$chartRhythm != 2 | |
mask.nonp <- mask.nonp & dat$chartExpressiveness != 13 | |
# 推定 | |
do.lm <- function(mask.nonp, use.user=FALSE) { | |
enam <- if (use.user) c(enam, "user") else enam | |
dat.lm <- dat[mask.nonp, c("totalPoint", enam)] | |
for (en in enam) dat.lm[[en]] <- factor(dat.lm[[en]]) | |
design.lm <- data.frame(model.matrix(~ . - 1, dat.lm)) | |
if ("chartInterval0" %in% colnames(design.lm)) { | |
design.lm <- design.lm[, -which(colnames(design.lm) == "chartInterval0")] | |
} | |
if (use.user) { | |
ucol <- paste0("user", levels(dat$user)[-1]) | |
design.lm[, ucol] <- design.lm[, ucol] - as.numeric(dat$user[mask.nonp] == levels(dat$user)[1]) | |
} | |
lm(totalPoint ~ . - 1, design.lm) | |
} | |
lm.nonp <- do.lm(mask.nonp) | |
summary(lm.nonp) | |
# 係数のプロット | |
coef.nonp <- coef(lm.nonp) | |
print.coef <- function(coef.nonp, lower=40) { | |
value <- c(-1, lower : 100) | |
df <- data.frame( | |
elements = factor(rep(enam, each=length(value)), levels=enam), | |
value = rep(value, 5), | |
beta = as.vector(sapply(enam, function(en) { | |
coef.nonp[paste0(en, value)] | |
})) | |
) | |
f <- function(en) { | |
ggplot(df[df$elements == en | df$value == -1, ], aes(x=value, y=beta)) + | |
geom_point(aes(color=elements)) + | |
scale_x_continuous(en, limits=c(lower, 100), breaks=seq(lower, 100, 10)) + | |
theme(legend.position="none") | |
} | |
make.grid(3, 2) | |
print.at(f("chartInterval"), 1, 1) | |
print.at(f("chartStability"), 1, 2) | |
print.at(f("chartExpressiveness"), 2, 1) | |
print.at(f("chartRhythm"), 2, 2) | |
print.at(f("chartVibratoLongtone"), 3, 1) | |
end.grid() | |
} | |
#print.coef(coef.nonp) | |
# 残差の分布を見る | |
resid.nonp <- resid(lm.nonp) | |
print.resid.point <- function(resid.nonp, mask.nonp) { | |
df <- data.frame( | |
index = 1 : length(resid.nonp), | |
resid = resid.nonp, | |
user = factor(as.numeric(dat$user[mask.nonp]) %% 3, levels=0:2) | |
) | |
make.grid(1, 1) | |
print.at( | |
ggplot(df, aes(x=index, y=resid)) + | |
geom_point(aes(color=user), size=1) + | |
theme(legend.position="none"), | |
1, 1) | |
end.grid() | |
} | |
print.resid.hist <- function(resid.nonp) { | |
df <- data.frame(resid = resid.nonp) | |
make.grid(1, 1) | |
print.at( | |
ggplot(df, aes(x=resid)) + | |
geom_histogram(aes(fill=0)) + | |
theme(legend.position="none"), | |
1, 1) | |
end.grid() | |
} | |
if (plot.to.file) svg("total-resid-index.svg", width=6, height=4) | |
print.resid.point(resid.nonp, mask.nonp) | |
if (plot.to.file) dev.off() | |
if (plot.to.file) svg("total-hist-resid.svg", width=6, height=4) | |
print.resid.hist(resid.nonp) | |
if (plot.to.file) dev.off() | |
## 外れ値を除去 ---- | |
# ユーザーごとで 99.99% 点より外側のサンプルを除外 | |
quant.resid <- tapply(resid.nonp, dat$user[mask.nonp], function(r) quantile(r, probs=c(0.4, 0.5, 0.6))) | |
mask.ol <- mapply( | |
function(resid, user) { | |
quant <- quant.resid[[user]] | |
resid >= quant["50%"] - (quant["60%"] - quant["40%"]) * (qnorm(0.9999) / (2*qnorm(0.6))) | |
}, | |
resid.nonp, dat$user[mask.nonp] | |
) | |
mask.nonp2 <- mask.nonp | |
mask.nonp2[mask.nonp] <- mask.ol | |
# 判定された外れ値を見る | |
olresid.df <- data.frame( | |
index = 1 : length(resid.nonp), | |
resid = resid.nonp, | |
col = ifelse(mask.nonp2[mask.nonp], "darkgray", "red") | |
) | |
if (plot.to.file) svg("total-resid-index-ol.svg", width=6, height=4) | |
make.grid(1, 1) | |
print.at( | |
ggplot(olresid.df, aes(x=index, y=resid)) + | |
geom_point(color=olresid.df$col, size=1) + | |
theme(legend.position="none"), | |
1, 1) | |
end.grid() | |
if (plot.to.file) dev.off() | |
# 推定 | |
lm.nonp2 <- do.lm(mask.nonp2, use.user=TRUE) | |
summary(lm.nonp2) | |
# 係数のプロット | |
coef.nonp2 <- coef(lm.nonp2) | |
if (plot.to.file) svg("total-coef-elem.svg", width=8, height=12) | |
print.coef(coef.nonp2, lower=40) | |
if (plot.to.file) dev.off() | |
# 各項目 80 以上での拡大図 | |
df <- data.frame( | |
elements = factor(rep(enam, each=21), levels=enam), | |
value = rep(80 : 100, 5), | |
beta = as.vector(sapply(enam, function(en) { | |
ecoef <- coef.nonp2[paste0(en, 80 : 100)] | |
ecoef - max(ecoef, na.rm=TRUE) | |
})) | |
) | |
border <- 100 - sum(coef.nonp2[paste0(enam, 100)[-1]]) - coef.nonp2["chartInterval95"] | |
border | |
if (plot.to.file) svg("total-coef-elem-o80.svg", width=6, height=4) | |
make.grid(1, 1) | |
print.at( | |
ggplot(df, aes(x=value, y=beta, group=elements)) + | |
geom_hline(yintercept=border, color=1) + | |
geom_point(aes(color=elements)) + | |
geom_line(aes(color=elements)) + | |
xlim(c(80, 100)), | |
1, 1) | |
end.grid() | |
if (plot.to.file) dev.off() | |
# 残差の分布を見る | |
print.resid.hist(resid(lm.nonp2)) | |
# 裏加点の box-plot | |
ura.per.user <- coef.nonp2[paste0("user", levels(dat$user)[-1])] | |
ura.per.user <- c(-sum(ura.per.user), ura.per.user) | |
df <- data.frame( | |
ura = resid(lm.nonp2) + ura.per.user[as.numeric(dat$user[mask.nonp2])], | |
user = dat$user[mask.nonp2] | |
) | |
df <- df[order(ura.per.user[as.numeric(dat$user[mask.nonp2])], decreasing=TRUE), ] | |
df$user <- factor(as.numeric(df$user), levels=unique(as.numeric(df$user))) | |
df$ucol <- factor(as.numeric(df$user) %% 3, levels=0:2) | |
if (plot.to.file) svg("total-resid-user.svg", width=6, height=4) | |
make.grid(1, 1) | |
print.at( | |
ggplot(df, aes(user, ura)) + | |
geom_boxplot(aes(color=ucol), outlier.size=1) + | |
theme(legend.position="none", axis.ticks=element_blank(), axis.text.x=element_blank()), | |
1, 1) | |
end.grid() | |
if (plot.to.file) dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment