Created
January 7, 2015 02:17
-
-
Save hillarysanders/22bc484b0d24f0600aaf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################################################################## | |
########################################################################################## | |
# PREMISE | |
# hills | |
########################################################################################## | |
# Project working directory. In this script only the commented-out source()
# call below uses a path relative to it; the CSV is read via an absolute path.
wd <- "~/Desktop/PREMISE/"
setwd(wd)
# source("Hillary_Premise/utils/env.R")
##########################################################################################
##########################################################################################
# Load the raw survey export and report its shape and column names.
x <- read.csv('~/Downloads/anand-google-electricity-results.csv')
dim(x)
print(colnames(x))

# Keep only the columns this analysis looks at.
wanted_columns <- c(
  'b_type', 'b_size', 'b_wall', 'b_roof', 'wired', 'image', 'o_uuid', 'timestamp',
  'g_ring_dis', 'g_ring_id', 'g_ring_name', 'e_sub', 'e_obj', 'e_rel', 'e_has_rel',
  'user_name', 'id'
)
x <- x[, wanted_columns]
# fix.factors() is a project helper that is not defined in this file
# (presumably it normalises factor columns -- TODO confirm).
x <- fix.factors(x)
# x$g_ring_id[x$g_ring_id=='-'] = '00000'

# Keep rows that have a known outcome and a real ring id ('-' marks "no ring").
# Rows whose ring id is NA are dropped as well.
has_outcome <- !is.na(x$wired)
has_ring <- !is.na(x$g_ring_id) & x$g_ring_id != '-'
x <- x[has_outcome & has_ring, ]

# Sort by ring so that each ring's rows are contiguous.
x <- x[order(x$g_ring_id), ]

# `info` keeps every selected column; `y` is the outcome; `x` is narrowed to
# the raw predictor columns.
info <- x
y <- x$wired
x <- x[, c('g_ring_dis', 'b_type', 'b_size', 'b_wall', 'b_roof')]
# 65.7 | |
# For every distinct wall value (in sorted order) print the value, then the
# correlation between "this building has that wall value" and the outcome `y`.
wall_types <- sort(unique(x$b_wall))
for (wall_type in wall_types) {
  has_wall_type <- x$b_wall == wall_type
  print(wall_type)
  print(cor(has_wall_type, y))
}
#################################################################################### | |
# CLEANING:
# Recode the categorical building attributes in `x` into numeric features, then
# drop every observation that is missing any feature, keeping `y` and `info`
# row-aligned with `x`.

# residence: 0 for 't_business', 1 for any other building type (NA stays NA).
x$residence <- as.numeric(x$b_type != 't_business')
x$b_type <- NULL

# b_size: ordinal rank small < medium < large. Blank or unrecognised values
# look up to NA (indexing a named vector by an unknown name yields NA).
size_rank <- c(s_small = 1, s_medium = 2, s_large = 3)
x$b_size <- unname(size_rank[as.character(x$b_size)])

# b_wall: 1 for brick or cement; 0 for mud, wood, sheet or blank.
# Any other value stays NA.
wall_raw <- as.character(x$b_wall)
wall_score <- rep(NA_real_, length(wall_raw))
wall_score[wall_raw %in% c('w_mud', 'w_wood', 'w_sheet', '')] <- 0
wall_score[wall_raw %in% c('w_brick', 'w_cement')] <- 1
x$b_wall <- wall_score

# b_roof: sheet = 1, tile = 2. Blank, 'r_other' and unrecognised values are NA.
roof_rank <- c(r_sheet = 1, r_tile = 2)
x$b_roof <- unname(roof_rank[as.character(x$b_roof)])
table(x$b_roof)

# Drop rows with an NA in any feature column. The mask is computed once, before
# anything is subset, so the three objects cannot fall out of alignment.
keep <- complete.cases(x)
y <- y[keep]
info <- info[keep, ]
x <- x[keep, ]
#################################################################################### | |
#################################################################################### | |
# do it: | |
# Leave-one-ring-out cross-validation: for each ring, fit a logistic regression
# on the observations from all other rings, predict the held-out ring, and
# record per-ring metrics in `acc`. Held-out predictions and outcomes are
# appended to `all.predictions` / `all.y` in ring order.
rings <- unique(info$g_ring_id)
acc <- data.frame(ring = rings, accuracy = NA, MSE = NA, abs.error = NA, n = NA, cor = NA,
                  precision.wired = NA, recall.wired = NA,
                  precision.unwired = NA, recall.unwired = NA,
                  stringsAsFactors = FALSE)
all.predictions <- NULL
all.y <- NULL
for (ring in rings) {
  cat(paste('\n Ring:', ring))
  test.idx <- info$g_ring_id == ring
  train.idx <- info$g_ring_id != ring
  x.train <- x[train.idx, ]
  x.test <- x[test.idx, ]
  y.train <- y[train.idx]
  y.test <- y[test.idx]

  # `y.train` is not a column of `x.train`; glm() resolves it from the
  # enclosing environment.
  fitted <- glm(formula = y.train ~ g_ring_dis + b_size + b_wall + b_roof + residence,
                data = x.train, family = binomial(link = "logit"))
  # fitted = glm(formula = y.train ~ b_size + b_wall,
  #              data = x.train, family = binomial(link = "logit"))

  # Predictions are on the link (log-odds) scale, so the cutoff of 0 used
  # below corresponds to a predicted probability of 0.5.
  predictions <- predict(fitted, newdata = x.test, type = "link")
  binary <- predictions > 0

  # Fix the levels so the confusion table is ALWAYS 2x2, even when the ring
  # contains a single true class or the model predicts a single class.
  # Without this, positional indexing reads the wrong cells (or NA).
  # `y` is treated as 0/1 here, as it is by cor() and `1 + all.y` elsewhere.
  tab <- table(factor(binary, levels = c(FALSE, TRUE)),
               factor(as.integer(y.test), levels = c(0L, 1L)))
  tn <- tab[1, 1]  # predicted unwired, truly unwired
  fp <- tab[2, 1]  # predicted wired,   truly unwired
  fn <- tab[1, 2]  # predicted unwired, truly wired
  tp <- tab[2, 2]  # predicted wired,   truly wired

  n <- length(y.test)
  cat(paste0('\nTrained ', length(y.train), ' obs from other rings to test ', n, ' obs from ring ', ring))
  n.correct <- tn + tp
  cat(paste0('\nUsing prob=0 as a cutoff, success rate was ', n.correct, '/', n, ': ', round(100 * n.correct / n, 1), '%'))

  ring.idx <- which(acc$ring == ring)
  acc$accuracy[ring.idx] <- n.correct / n
  # mse() is a project helper not defined in this file.
  # NOTE(review): it receives link-scale predictions, not probabilities --
  # confirm that is intended.
  acc$MSE[ring.idx] <- mse(x = predictions, y = y.test, normalize = FALSE, exp = 2)
  acc$abs.error[ring.idx] <- mse(x = predictions, y = y.test, normalize = FALSE, exp = 1)
  acc$n[ring.idx] <- n
  acc$cor[ring.idx] <- cor(predictions, y.test)
  # A metric is NaN (0/0) when its denominator class is absent from the ring.
  acc$precision.wired[ring.idx] <- tp / (tp + fp)
  acc$recall.wired[ring.idx] <- tp / (tp + fn)
  acc$precision.unwired[ring.idx] <- tn / (tn + fn)
  acc$recall.unwired[ring.idx] <- tn / (tn + fp)

  all.predictions <- c(all.predictions, predictions)
  all.y <- c(all.y, y.test)
}
# Pool the per-ring metrics in `acc` into overall figures, weighting each ring
# by its number of held-out observations. Rings whose metric is NA/NaN are
# left out of that metric's average.
ave.accuracy <- weighted.mean(x = acc$accuracy, w = acc$n, na.rm = TRUE)
ave.cor <- weighted.mean(x = acc$cor, w = acc$n, na.rm = TRUE)
ave.wired.precision <- weighted.mean(x = acc$precision.wired, w = acc$n, na.rm = TRUE)
ave.unwired.precision <- weighted.mean(x = acc$precision.unwired, w = acc$n, na.rm = TRUE)
ave.wired.recall <- weighted.mean(x = acc$recall.wired, w = acc$n, na.rm = TRUE)
ave.unwired.recall <- weighted.mean(x = acc$recall.unwired, w = acc$n, na.rm = TRUE)

# mean() averages only its first argument; a second positional argument is
# taken as `trim`. The two class-wise values must therefore be combined with
# c() to get their average.
ave.recall <- mean(c(ave.unwired.recall, ave.wired.recall))
ave.precision <- mean(c(ave.unwired.precision, ave.wired.precision))

cat(paste0('\n\n\n Average Accuracy (out of sample) = ', round(ave.accuracy * 100, 1), '%'))
cat(paste0('\nCorrelation between predictions and truth: ', round(ave.cor, 2)))
cat(paste0('\nAve Precision: ', round(ave.precision, 2)))
cat(paste0('\nAve Recall: ', round(ave.recall, 2)))
# Plot every out-of-sample prediction (link scale) in ring order, one shaded
# band per ring, coloured by the true outcome, with the decision cutoff at 0.
# get.colors() and draw.shape() are project helpers not defined in this file.
cols <- get.colors(length(rings))

# Empty canvas (cex = 0 hides the two corner points) spanning all predictions.
# NOTE(review): base::pretty() returns axis breakpoints rather than a formatted
# count; presumably a project helper masks it here -- confirm.
plot(c(1, (length(all.predictions))), range(all.predictions, na.rm = TRUE) + c(-.3, .1),
     xaxt = 'n', xlab = '',
     ylab = 'Does the Model think a building has Electricity?',
     main = paste0(pretty(length(all.predictions)), ' out-of-sample Predictions'),
     cex = 0)

# Row positions in `info` double as x positions in `all.predictions`; this
# relies on `info` being sorted by ring and on predictions being appended ring
# by ring.
for (i in 1:length(rings)) {
  ring.rows <- which(info$g_ring_id == rings[i])
  draw.shape(range(ring.rows), y1 = -2, y2 = 4, col = cols[i], border = NA)
  text(mean(ring.rows), min(all.predictions, na.rm = TRUE) - .6,
       labels = rings[i], cex = .65, xpd = TRUE, srt = 45)
}

# One dot per prediction: yellow when the building is wired, black otherwise.
points(all.predictions, col = c('black', 'yellow')[1 + all.y], pch = 19, cex = .4)
# Dashed line at the classification cutoff.
lines(c(1, length(all.y)), c(0, 0), lwd = 2, lty = 2)
legend('bottomright', legend = c('Wired', 'Not Wired'), col = c('yellow', 'black'),
       pch = 19, pt.cex = 1.5, bty = 'n', box.lwd = 0)
legend('bottomleft', legend = paste0(round(100 * ave.accuracy, 1), '% Accuracy'), bty = 'n')
#################################################################################### | |
#################################################################################### | |
#################################################################################### | |
#################################################################################### | |
# - Making vals into ranked numbers helped a tiny bit; the only thing that really matters is
# wall type. 66% accuracy.
# So, 65% is not that great, but it's okay. Basically, b_wall is the only thing that makes the prediction good. Type of roof
# is interesting, but doesn't seem to actually be that helpful (likely due to scarcity and errors).
# cor(y, x$b_roof) | |
# Correlation of each feature column in `x` with the outcome `y`, shown to
# two decimal places.
round(cor(x, y), digits = 2)
# I would have guessed that adding in e.g. b_size to b_wall in the predictors would help, but in this particular | |
# test it didn't. | |
#################################################################################### | |
#################################################################################### | |
#################################################################################### | |
#################################################################################### | |
# # ---> add geo clusters and then add that as a feature. | |
# Use KNN to extract clusters and then predict how wired houses are. | |
# - first you need to know where each transformer is | |
# | |
# or use streets on a grid to do this. | |
# (everything is an obs so a few rows = the transformers) | |
# | |
# end of week = position to | |
# 1) give them clusters and scores | |
# 2) tell them we're adding more signal so this is preliminary | |
# 3) bs some 'good candidate / bad candidate' thing
# also make bin/run work and play with a CSV in python!! run through the clean and index code
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment