Created
February 16, 2015 04:04
-
-
Save hnagata/7f13b506a808b5dcb5a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(glmnet) | |
library(msgps) | |
options(scipen=1) | |
dat.column <- read.csv("column.csv", head=FALSE, fileEncoding="UTF-8", colClasses="character") | |
varnames <- dat.column[, 2] | |
names(varnames) <- dat.column[, 1] | |
dat <- read.csv("data.csv", fileEncoding="UTF-8", colClasses=dat.column[, 3]) | |
dat <- dat[!duplicated(cbind(dat$price, dat$lat, dat$lon)), ] | |
dat$price[dat$id == "300056150013621013622"] <- NA # 価格の桁間違い? | |
dat$area[dat$id %in% c("300054130041305041305", "300105420003450004117")] <- NA # 面積の桁間違い? | |
dat$latitude[dat$id == "300059820006005006005"] <- NA # 緯度不正 | |
dat$longitude[dat$id == "300059820006005006005"] <- NA # 経度不正 | |
dim(dat) | |
dat.lm <- cbind( | |
log.price = log(dat$price), | |
dat[, c("time", "num.room", "S", "L", "D", "K", "area", "age", "floor", "num.floor", "days.published")], | |
type.mansion = as.numeric(dat$type == "マンション"), | |
type.house = as.numeric(grepl("一戸建て", dat$type)), | |
struct.mokuzo = as.numeric(dat$struct == "木造"), | |
struct.concrete = as.numeric(dat$struct == "ALC" | grepl("コンクリート", dat$struct)), | |
dir.N = as.numeric(dat$dir == "北"), | |
dir.NE = as.numeric(dat$dir == "北東"), | |
dir.E = as.numeric(dat$dir == "東"), | |
dir.SE = as.numeric(dat$dir == "南東"), | |
dir.S = as.numeric(dat$dir == "南"), | |
dir.SW = as.numeric(dat$dir == "南西"), | |
dir.W = as.numeric(dat$dir == "西"), | |
parking = as.numeric(dat$parking != "なし"), | |
dat[, paste0("C", 1:17)], | |
dat[, paste0("E", (1:99)[-39])] | |
) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "E89")] # 暖房機: E7冷房と高相関 (0.881) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "num.room")] # area と高相関 (0.836) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "E72")] # シャワー: E48洗濯機置き場と高相関 (0.835) | |
dat.lm$E92 <- as.numeric(dat.lm$E92 | dat.lm$E84) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "E84")] # 敷地内ゴミ置き場: E92専用ゴミ置き場と統合 (0.798) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "E53")] # 電話機: E88テレビと高相関 (0.769) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "E6")] # 可動間仕切り: E87アプローチライトと高相関 (0.763) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "type.mansion")] # struct.concrete と高相関 (0.725) | |
dat.lm <- dat.lm[, -which(colnames(dat.lm) == "num.floor")] # floor と高相関 (0.715) | |
dat.lm <- as.data.frame(scale(na.omit(dat.lm))) | |
dim(dat.lm) | |
varnames <- c( | |
varnames, | |
log.price = "log(価格)", | |
type.mansion = "マンション", type.house = "一戸建て", | |
struct.mokuzo = "木造", struct.concrete = "コンクリート建築", | |
dir.N = "北向き", dir.NE = "北東向き", dir.E = "東向き", dir.SE = "南東向き", | |
dir.S = "南向き", dir.SW = "南西向き", dir.W = "西向き" | |
) | |
## ---- | |
## check corelations | |
cor.dat <- cor(dat.lm) | |
large.cor <- matrix(abs(cor.dat) > 0.7, nrow(cor.dat)) & upper.tri(cor.dat) | |
large.cor.index <- arrayInd(which(large.cor), .dim=dim(large.cor)) | |
cbind( | |
matrix(colnames(dat.lm)[large.cor.index], nrow(large.cor.index)), | |
cor.dat[large.cor] | |
) | |
## ---- | |
## plot function | |
my.barplot.lm <- function(var, est, err=NULL) { | |
var <- factor(varnames[var], levels=varnames[var]) | |
ggdat <- data.frame(var=var, est=est, sign=est>=0) | |
if (!is.null(err)) ggdat$err = err | |
g <- ggplot(ggdat, aes(x=var, y=est, colour=sign, fill=sign)) | |
g <- g + coord_flip() | |
g <- g + scale_colour_manual(values=c(hsv(0, 1, 0.75), hsv(0.6, 1, 0.75))) | |
g <- g + scale_fill_manual(values=c(hsv(0, 0.5, 1), hsv(0.6, 0.5, 1))) | |
g <- g + theme(legend.position="none") | |
g <- g + scale_x_discrete(name="") | |
g <- g + scale_y_continuous(name="推定値") | |
g <- g + geom_bar(stat="identity") | |
if (!is.null(ggdat$err)) { | |
g <- g + geom_errorbar(aes(ymin=est-err, ymax=est+err), width=0.3) | |
} | |
g | |
} | |
## ---- | |
## lm | |
if (!file.exists("aiclm.Rdata")) { | |
aiclm.dat <- step(lm(log.price ~ ., data=dat.lm)) | |
save(aiclm.dat, file="aiclm.Rdata") | |
} else { | |
load("aiclm.Rdata") | |
} | |
coef.aiclm.dat <- summary(aiclm.dat)$coef[-1, ] | |
coef.aiclm.dat <- coef.aiclm.dat[order(coef.aiclm.dat[, 1]), ] | |
my.barplot.lm( | |
rownames(coef.aiclm.dat), coef.aiclm.dat[, 1], err=coef.aiclm.dat[, 2] * qnorm(0.975)) | |
ggsave("aiclm.svg", width=6, height=8, family="Meiryo") | |
if (!file.exists("biclm.Rdata")) { | |
biclm.dat <- step(lm(log.price ~ ., data=dat.lm), k=log(nrow(dat.lm))) | |
save(biclm.dat, file="biclm.Rdata") | |
} else { | |
load("biclm.Rdata") | |
} | |
coef.biclm.dat <- summary(biclm.dat)$coef[-1, ] | |
coef.biclm.dat <- coef.biclm.dat[order(coef.biclm.dat[, 1]), ] | |
my.barplot.lm( | |
rownames(coef.biclm.dat), coef.biclm.dat[, 1], err=coef.biclm.dat[, 2] * qnorm(0.975)) | |
## ---- | |
## glmnet | |
X <- as.matrix(dat.lm)[, -1] | |
y <- dat.lm[, 1] | |
glmnet.dat <- glmnet(X, y) | |
cri <- mapply(function(lambda, df) { | |
beta <- as.numeric(coef(glmnet.dat, lambda)) | |
resid <- drop(y - cbind(rep(1, nrow(X)), X) %*% beta) | |
log(var(resid)) + log(nrow(X)) * df / nrow(X) | |
}, glmnet.dat$lambda, glmnet.dat$df) | |
best.i <- which.min(cri) | |
coef.glmnet.dat <- drop(coef(glmnet.dat, glmnet.dat$lambda[best.i]))[-1] | |
coef.glmnet.dat <- coef.glmnet.dat[coef.glmnet.dat != 0] | |
coef.glmnet.dat <- sort(coef.glmnet.dat) | |
my.barplot.lm(names(coef.glmnet.dat), coef.glmnet.dat) | |
## ---- | |
## msgps | |
msgps.dat <- msgps(as.matrix(dat.lm)[, -1], dat.lm[, 1], penalty="alasso") | |
coef.msgps.dat <- drop(coef(msgps.dat, tuning=msgps.dat$dfbic_result$tuning))[-1] | |
coef.msgps.dat <- coef.msgps.dat[coef.msgps.dat != 0] | |
coef.msgps.dat <- sort(coef.msgps.dat) | |
my.barplot.lm(names(coef.msgps.dat), coef.msgps.dat) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment