Last active
August 31, 2015 09:33
-
-
Save canard0328/b2f8aec2b9c286f53400 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# データの入手 | |
# Data obtained from http://biostat.mc.vanderbilt.edu/DataSets | |
data = read.csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv", | |
stringsAsFactors=F, na.strings=c("","NA")) | |
# データの確認 | |
# survived: 1(生存),0(死亡) | |
# pclass: 乗客の社会経済的地位(1:上流,2:中流,3:下流) | |
# name: 氏名 | |
# sex: 性別 | |
# age: 年齢 | |
# sibsp: 同乗したSibling/Spouseの数 | |
# parch: 同乗したParent/Childrenの数 | |
# ticket: チケットナンバー | |
# fare: 乗船料金 | |
# cabin: 船室番号 | |
# embarked: 乗船場(C = Cherbourg, Q = Queenstown; S = Southampton) | |
# boat: Lifeboat | |
# body: Body Identification Number | |
# home.dest: Home/Destination | |
str(data) | |
# 欠損値の確認 | |
sapply(data, function(x) sum(is.na(x))) / nrow(data) | |
# 特徴量の選択 | |
# 欠損値の多い特徴量や,分析に有効でなさそうな特徴量を削除します. | |
# ※本来特徴量の選択は分析の試行錯誤のなかで行うべきですが, | |
# 演習の都合上最初に行っています. | |
data2 <- subset(data, select=-c(name, ticket, cabin, boat, body, home.dest)) | |
# データの可視化 | |
# カテゴリデータ(pclass, embarked, sex)は積み上げ棒グラフにして | |
# 変数の割合と,変数内の生死の割合を可視化 | |
par(mfrow=c(2,2), mar=c(4,4,.5,.5)) | |
cnt <- table(data2$survived, data2$pclass) | |
barplot(cnt, horiz=T, legend=rownames(cnt), args.legend=list(x="bottomright"), | |
ylab="pclass") | |
cnt <- table(data2$survived, data2$embarked) | |
barplot(cnt, horiz=T, las=2, legend=rownames(cnt), args.legend=list(x="bottomright"), | |
ylab="embarked") | |
cnt <- table(data2$survived, data2$sex) | |
barplot(cnt, horiz=T, legend=rownames(cnt), args.legend=list(x="bottomright"), | |
ylab="sex") | |
# 数値データ(age, sibsp, parch, fare)は生/死ごとにヒストグラムで可視化 | |
par(mfrow=c(2,2), mar=c(4,4,.5,.5)) | |
hist(data2$age[data2$survived==0], breaks=20, xlab="age", main="", | |
col="#ff00ff40") | |
hist(data2$age[data2$survived==1], breaks=20, add=T, col="#0000ff40") | |
legend("topright", c("0", "1"), fill=c("#ff00ff40", "#0000ff40")) | |
hist(data2$sibsp[data2$survived==0], breaks=20, xlab="sibsp", main="", | |
col="#ff00ff40") | |
hist(data2$sibsp[data2$survived==1], breaks=10, add=T, col="#0000ff40") | |
legend("topright", c("0", "1"), fill=c("#ff00ff40", "#0000ff40")) | |
hist(data2$parch[data2$survived==0], breaks=20, xlab="parch", main="", | |
col="#ff00ff40") | |
hist(data2$parch[data2$survived==1], breaks=10, add=T, col="#0000ff40") | |
legend("topright", c("0", "1"), fill=c("#ff00ff40", "#0000ff40")) | |
hist(data2$fare[data2$survived==0], breaks=20, xlab="fare", main="", | |
col="#ff00ff40") | |
hist(data2$fare[data2$survived==1], breaks=40, add=T, col="#0000ff40") | |
legend("topright", c("0", "1"), fill=c("#ff00ff40", "#0000ff40")) | |
# 特徴量同士の関係を可視化 | |
# カテゴリデータ同士の関係はクロス集計(Cross tabulation)で可視化 | |
# ※parch, sibspは数値データですが,取りうる値が限られているためクロス集計を利用 | |
table(data2$embarked, data2$pclass, deparse.level=2) | |
table(data2$embarked, data2$sex, deparse.level=2) | |
table(data2$embarked, data2$sibsp, deparse.level=2) | |
table(data2$embarked, data2$parch, deparse.level=2) | |
table(data2$pclass, data2$sex, deparse.level=2) | |
table(data2$pclass, data2$sibsp, deparse.level=2) | |
table(data2$pclass, data2$parch, deparse.level=2) | |
table(data2$sex, data2$sibsp, deparse.level=2) | |
table(data2$sex, data2$parch, deparse.level=2) | |
table(data2$sibsp, data2$parch, deparse.level=2) | |
# 数値データとカテゴリデータの関係は箱ひげ図(boxplot)で可視化 | |
par(mfrow=c(3,2), mar=c(4,4,.5,.5)) | |
plot(factor(data2$embarked), data2$age, xlab="embarked", ylab="age") | |
plot(factor(data2$pclass), data2$age, xlab="pclass", ylab="age") | |
plot(factor(data2$sex), data2$age, xlab="sex", ylab="age") | |
plot(factor(data2$sibsp), data2$age, xlab="sibsp", ylab="age") | |
plot(factor(data2$parch), data2$age, xlab="parch", ylab="age") | |
plot(factor(data2$embarked), data2$fare, xlab="embarked", ylab="fare") | |
par(mfrow=c(2,2), mar=c(4,4,.5,.5)) | |
plot(factor(data2$pclass), data2$fare, xlab="pclass", ylab="fare") | |
plot(factor(data2$sex), data2$fare, xlab="sex", ylab="fare") | |
plot(factor(data2$sibsp), data2$fare, xlab="sibsp", ylab="fare") | |
plot(factor(data2$parch), data2$fare, xlab="parch", ylab="fare") | |
# 数値データ同士の関係は散布図(Scatter plot)で可視化 | |
par(mfrow=c(1,1), mar=c(4,4,.5,.5)) | |
plot(data2$age, data2$fare, xlab="age", ylab="fare") | |
# 欠損値の処理 | |
# 今回は,数値データは中央値で,カテゴリデータは最頻値で補間します. | |
# ただし,運賃(fare)は社会経済的地位(pclass)と相関があるため※ | |
# 等級ごとの中央値で補間します. | |
# ※pclassとfareの可視化結果(boxplot)参照 | |
age_median <- median(data2$age, na.rm=T) | |
embarked_mode <- names(which.max(table(data2$embarked))) | |
fare_median_c1 <- median(data2$fare[data2$pclass==1], na.rm=T) | |
fare_median_c2 <- median(data2$fare[data2$pclass==2], na.rm=T) | |
fare_median_c3 <- median(data2$fare[data2$pclass==3], na.rm=T) | |
data2$age[is.na(data2$age)] <- age_median | |
data2$embarked[is.na(data2$embarked)] <- embarked_mode | |
data2$fare[data2$pclass==1 & is.na(data2$fare)] <- fare_median_c1 | |
data2$fare[data2$pclass==2 & is.na(data2$fare)] <- fare_median_c2 | |
data2$fare[data2$pclass==3 & is.na(data2$fare)] <- fare_median_c3 | |
sapply(data2, function(x) sum(is.na(x))) / nrow(data2) | |
# カテゴリ変数の処理 | |
# Rでは特徴量の型を因子型(factor)にしておくと | |
# 分析時にダミー変数を利用して処理してくれることが多いです. | |
data2$pclass <- factor(data2$pclass) | |
data2$survived <- factor(data2$survived) | |
data2$embarked <- factor(data2$embarked) | |
data2$sex <- factor(data2$sex) | |
# データの標準化 | |
# 数値データを平均0,分散1に標準化します. | |
colMean <- as.numeric(sapply(data2[,sapply(data2, is.numeric)], mean)) | |
colSd <- as.numeric(sapply(data2[,sapply(data2, is.numeric)], sd)) | |
data2[,sapply(data2, is.numeric)] <- scale(data2[,sapply(data2, is.numeric)], | |
center=colMean, scale=colSd) | |
# モデリング | |
# 決定木(Decision tree)を使って生死を予測してみます. | |
# 予測結果は精度(accuracy)で評価します. | |
# 精度=正解数/データ数 | |
library(rpart) | |
clf0 <- rpart(survived~., data=data2, method="class", | |
control=rpart.control(cp=1e-10, xval=1, minsplit=1, minbucket=1)) | |
confusion_matrix <- table(data2$survived, predict(clf0, newdata=data2, type="class")) | |
sum(diag(confusion_matrix)) / sum(confusion_matrix) | |
# 0.9656226 | |
# グリッドサーチと交差検証 | |
# registerDoMCのcoresはPCのコア数に応じて設定してください. | |
library(caret, quietly=T) | |
library(doMC, quietly=T) | |
registerDoMC(cores=4) | |
set.seed(1) | |
clf1 <- train(survived~., data=data2, method="rpart", tuneLength=5, | |
trControl=trainControl(method="cv", number=10)) | |
clf1$result[which.max(clf1$results$Accuracy),] | |
# cp Accuracy Kappa AccuracySD KappaSD | |
# 0.005 0.8074985 0.5737025 0.0301985 0.07122872 | |
# 学習曲線 | |
calc_cv_train_score <- function(data, K, cp){ | |
n <- nrow(data) | |
K <- 10 | |
set.seed(1) | |
grp <- factor(sample.int(n) %/% ceiling(n / K) + 1) | |
score_cv <- c() | |
score_train <- c() | |
for(k in 1:K){ | |
clf_cv <- rpart(survived~., data=data[grp!=k,], method="class", cp=cp) | |
pred <- predict(clf_cv, newdata=data[grp!=k,], type="class") | |
cm <- table(data$survived[grp!=k], pred) | |
score_train <- c(score_train, sum(diag(cm)) / sum(cm)) | |
pred <- predict(clf_cv, newdata=data[grp==k,], type="class") | |
cm <- table(data$survived[grp==k], pred) | |
score_cv <- c(score_cv, sum(diag(cm)) / sum(cm)) | |
} | |
res <- list("score_train"=mean(score_train), "score_cv"=mean(score_cv)) | |
res | |
} | |
set.seed(1) | |
score_train <- c() | |
score_cv <- c() | |
for(r in seq(0.1, 1, 0.1)){ | |
res <- calc_cv_train_score(data2[sample.int(nrow(data2), r*nrow(data2)),], 5, | |
clf1$result$cp[which.max(clf1$results$Accuracy)]) | |
score_train <- c(score_train, res$score_train) | |
score_cv <- c(score_cv, res$score_cv) | |
} | |
par(mfrow=c(1,1), mar=c(4,4,1,.5)) | |
plot(seq(0.1, 1, 0.1)*nrow(data2), score_train, type="b", pch=16, col="red", | |
ylim=c(0.4, 1), xlab="Training examples", ylab="score", main="Learning curve") | |
par(new=T) | |
plot(seq(0.1, 1, 0.1)*nrow(data2), score_cv, type="b", pch=16, col="green", | |
ylim=c(0.4, 1), ann=F) | |
legend("bottomleft", c("Training score", "Cross-validation score"), lty=2, | |
pch=16, col=c("red", "green"), inset=.05, bty="n") | |
# 学習曲線をみると, | |
# ・訓練スコアが低い | |
# ・訓練スコアと交差検証スコアの差が小さい | |
# ことから,ハイバイアスな状態であると分かります. | |
# そこで,性能改善のために, | |
# ・柔軟性の高いモデルに変更する | |
# ・特徴量を追加する | |
# を検討してみます. | |
# モデルの変更 | |
# モデルをより柔軟性の高い, | |
# アンサンブル学習(ブースティング)を利用したgbmに変更してみます. | |
library(gbm, quietly=T) | |
set.seed(1) | |
clf2 <- train(survived~., data=data2, method="gbm", | |
trControl=trainControl(method="cv", number=10), | |
tuneLength=15, | |
verbose=FALSE) | |
clf2$result[which.max(clf2$results$Accuracy),] | |
# shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa | |
# 0.1 5 10 100 0.8174105 0.6002224 | |
# AccuracySD KappaSD | |
# 0.03272035 0.07538066 | |
#グリッドサーチ+交差検証 モデル変更 | |
# 0.807 0.817 | |
# 特徴量の追加,変更 | |
# 欠損率が高く利用を見送っていたcabinの情報を利用してみます. | |
# cabinは文字+数値という形なので,文字と数値に分離して利用します. | |
data3 <- data2 | |
data3$cabin <- sapply(as.character(data$cabin), | |
FUN=function(x){substring(strsplit(x, " ")[[1]][1], 1, 1)}) | |
data3$cabin[is.na(data3$cabin)] <- "unknown" | |
data3$cabin <- as.factor(data3$cabin) | |
data3$cabin_room <- sapply(as.character(data$cabin), | |
FUN=function(x){as.numeric(substring(strsplit(x, " ")[[1]][1], 2))}) | |
data3$cabin_room[is.na(data3$cabin_room)] <- 0 | |
set.seed(1) | |
suppressWarnings( | |
clf3 <- train(survived~., data=data3, method="gbm", | |
trControl=trainControl(method="cv", number=10), | |
tuneLength=15, | |
verbose=FALSE) | |
) | |
clf3$result[which.max(clf3$results$Accuracy),] | |
# shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa | |
# 0.1 7 10 50 0.8189607 0.6081866 | |
# AccuracySD KappaSD | |
# 0.02430674 0.05363552 | |
# グリッドサーチ+交差検証 モデル変更 cabin利用 | |
# 0.807 0.817 0.819 | |
# 単純に中央値で補間していた年齢の欠損値を,その他の特徴量を使って予測してみます. | |
data4 <- data3 | |
title <- as.character(sub(' ', '', sapply(data$name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]}))) | |
data4$immature <- ifelse(title %in% c("Master", "Mlle", "Miss"), 1, 0) | |
data4 <- subset(data4, select=c("pclass", "age", "sibsp", "parch", "fare", "immature")) | |
set.seed(1) | |
clf_age <- train(age~., data=data4[!is.na(data$age),], method="rpart", tuneLength=10, | |
trControl=trainControl(method="cv", number=10)) | |
clf_age$result[which.max(clf_age$results$RMSE),] | |
# cp RMSE Rsquared RMSESD RsquaredSD | |
#0.2105439 1.083399 0.1464117 0.1044172 0.04869529 | |
data3$age[is.na(data$age)] <- predict(clf_age, newdata=data4[is.na(data$age),]) | |
set.seed(1) | |
suppressWarnings( | |
clf4 <- train(survived~., data=data3, method="gbm", | |
trControl=trainControl(method="cv", number=10), | |
tuneLength = 15, | |
verbose=FALSE) | |
) | |
clf4$result[which.max(clf4$results$Accuracy),] | |
# shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa | |
# 0.1 4 10 150 0.8250617 0.6200446 | |
# AccuracySD KappaSD | |
# 0.03476065 0.07844335 | |
# グリッドサーチ+交差検証 モデル変更 cabin利用 age予測 | |
# 0.807 0.817 0.819 0.825 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment