Created
April 22, 2017 10:48
-
-
Save tap222/d8cb3747876b2b3727fd54ba872b0cd1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup and data loading ---------------------------------------------------
# The workspace is deliberately NOT wiped with rm(list = ls()): doing so
# destroys whatever the caller had loaded. Run this script in a fresh R
# session instead.
dir()  # list the working directory; train.csv and test.csv are read from it

suppressMessages(library(lubridate))
suppressMessages(library(caret))
suppressMessages(library(gbm))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(h2o))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(e1071))
suppressPackageStartupMessages(library(corrplot))
library(anytime)

# Upper bounds on the number of rows kept from each file.
max_train_rows <- 1000000L
max_test_rows <- 19918L

tr <- as.data.table(read.csv("train.csv"))
te <- as.data.table(read.csv("test.csv"))

# head() caps the row count without padding: indexing a data.table with
# 1:n when it has fewer than n rows would append all-NA rows.
tr <- head(tr, max_train_rows)
te <- head(te, max_test_rows)

str(tr)
str(te)
# Train: date-derived columns ------------------------------------------------
# Parse release_date with lubridate::ymd() and keep the first four characters
# of its printed form as the release year.
tr$release_date <- ymd(tr$release_date)
tr$year <- substring(tr$release_date, 1, 4)
# tr$year <- year(tr$release_date)

# Listening date derived from ts_listen via anytime::anydate().
tr$ts_listen_date <- anydate(tr$ts_listen)

# View() opens a data viewer; only do so when a user is actually present,
# otherwise batch runs (Rscript / R CMD BATCH) can fail or block here.
if (interactive()) {
  View(tr)
}
# tr$ts_listen <- anytime(tr$ts_listen)
# tr$ts_listen_time <- format(as.POSIXlt(tr$ts_listen), format = '%T')

# user_age and year are treated as categorical from here on.
tr$user_age <- as.factor(tr$user_age)
tr$year <- as.factor(tr$year)
# Test: date-derived columns -------------------------------------------------
# Mirrors the train preparation: parsed release date, release year taken from
# the first four characters of the date, and a listening date from ts_listen.
te$release_date <- ymd(te$release_date)
te$year <- substring(te$release_date, 1, 4)
# te$year <- year(te$release_date)
te$ts_listen_date <- anydate(te$ts_listen)

# Only open the data viewer in interactive sessions; batch runs cannot.
if (interactive()) {
  View(te)
}
# te$ts_listen <- anytime(te$ts_listen)
# te$ts_listen_time <- format(as.POSIXlt(te$ts_listen), format = '%T')

te$user_age <- as.factor(te$user_age)
te$year <- as.factor(te$year)
# te[, ts_listen_time := NULL]

# Placeholder for the unknown target. The column must be named `is_listened`
# (the original `is_listen` was a typo that created an unrelated column) and
# must be NA, because the combined train/test table is later split on
# is.na(is_listened).
te$is_listened <- NA
# Categorical conversion -----------------------------------------------------
# Identifier / category columns that are treated as factors. The SAME list is
# applied to train and test so that both tables end up with matching column
# types (previously media_id was only converted in train and album_id only in
# test, and platform_name was converted twice).
factor_cols <- c(
  "context_type", "platform_name", "user_gender", "platform_family",
  "listen_type", "artist_id", "genre_id", "media_id", "album_id"
)

# Only touch columns that actually exist in the respective table.
tr_factor_cols <- intersect(factor_cols, names(tr))
te_factor_cols <- intersect(factor_cols, names(te))

for (col in tr_factor_cols) {
  tr[[col]] <- as.factor(tr[[col]])
}
for (col in te_factor_cols) {
  te[[col]] <- as.factor(te[[col]])
}

# The target exists in the training data only.
tr$is_listened <- as.factor(tr$is_listened)

str(tr)
str(te)
# Skewness correction ----------------------------------------------------------
# Numeric columns whose skewness (e1071::skewness) exceeds `skew_threshold` in
# the TRAINING data are replaced by log(x + log_offset). The column choice and
# the offset are decided once, on train, and then applied unchanged to test so
# that a feature has the same meaning in both tables (previously train used
# log(x + 15), test used log(x + 10), and each table picked its own columns).
skew_threshold <- 2
log_offset <- 15

se <- colnames(tr)[vapply(tr, is.numeric, logical(1))]
# se <- se[!(se %in% c("context_type","platform_family","listen_type","artist_id","genre_id","album_id","year","month","day"))]

# train
ggplot(tr, aes(media_duration)) + geom_density(fill = 'lightblue', color = 'black')
skew <- vapply(
  tr[, se, with = FALSE],
  function(x) skewness(x, na.rm = TRUE),
  numeric(1)
)
skew
# Drop NA/NaN skewness (e.g. constant columns) before thresholding; an NA would
# otherwise become an NA column name and break the assignment below.
skew <- skew[!is.na(skew) & skew > skew_threshold]
skewed_cols <- names(skew)
if (length(skewed_cols) > 0) {
  tr[, (skewed_cols) := lapply(.SD, function(x) log(x + log_offset)),
     .SDcols = skewed_cols]
}
ggplot(tr, aes(media_duration)) + geom_density(fill = 'lightblue', color = 'black')

# test
ggplot(te, aes(media_duration)) + geom_density(fill = 'lightblue', color = 'black')
se <- colnames(te)[vapply(te, is.numeric, logical(1))]
# Test skewness is still computed and printed, but for inspection only.
skew_t <- vapply(
  te[, se, with = FALSE],
  function(x) skewness(x, na.rm = TRUE),
  numeric(1)
)
skew_t
# Apply the train-selected transformation to the matching numeric test columns.
skewed_cols_te <- intersect(skewed_cols, se)
if (length(skewed_cols_te) > 0) {
  te[, (skewed_cols_te) := lapply(.SD, function(x) log(x + log_offset)),
     .SDcols = skewed_cols_te]
}
ggplot(te, aes(media_duration)) + geom_density(fill = 'lightblue', color = 'black')
# One Hot Encoding --------------------------------------------------------
# Categorical columns expanded into indicator columns with model.matrix().
encode_cols <- c(
  "platform_name", "user_gender", "context_type", "platform_family",
  "listen_type", "user_age", "year"
)
tr_mod <- tr[, encode_cols, with = FALSE]
te_mod <- te[, encode_cols, with = FALSE]

tr_ex <- as.data.table(model.matrix(~ . + 0, data = tr_mod))
te_ex <- as.data.table(model.matrix(~ . + 0, data = te_mod))

# model.matrix() drops rows containing NA by default; cbind() below pairs rows
# by position, so a row-count mismatch would silently misalign features.
if (nrow(tr_ex) != nrow(tr) || nrow(te_ex) != nrow(te)) {
  stop("model.matrix() dropped rows (NA in a categorical column); ",
       "encoded rows no longer line up with the source tables",
       call. = FALSE)
}

# Keep only indicator columns present in BOTH tables. Selecting the common
# columns (rather than negating a setdiff) also works when the two sets are
# already identical, where `-character(0)` would raise an error.
common_cols <- intersect(colnames(tr_ex), colnames(te_ex))
tr_ex <- tr_ex[, common_cols, with = FALSE]
te_ex <- te_ex[, common_cols, with = FALSE]

new_tr <- cbind(tr, tr_ex)
new_te <- cbind(te, te_ex)

# Remove the raw categorical columns from the modelling tables.
drop_cols <- c(
  "platform_name", "user_gender", "context_type", "platform_family",
  "listen_type", "artist_id", "genre_id", "user_age"
)
new_tr[, (drop_cols) := NULL]
new_te[, (drop_cols) := NULL]

# Stack train and test; columns missing on one side are filled with NA.
co_data <- rbind(new_tr, new_te, fill = TRUE)

# Train rows carry a target value, test rows do not. (The test filter was
# previously identical to the train filter, so F_te was a copy of F_tr.)
F_tr <- co_data[!is.na(is_listened)]
F_te <- co_data[is.na(is_listened)]

rm(co_data, new_tr, new_te, tr_ex, te_ex)
gc()
# Feature Engineering -----------------------------------------------------
# All features below are assigned to F_tr / F_te by row position, i.e. this
# section relies on F_tr having the rows of `tr` and F_te the rows of `te`,
# in the same order.

# new_var_2: days between the listening date and the release date.
# Train and test now use the same direction (listen - release) and the same
# date format; the test version previously had the operands swapped and used
# "%Y/%m/%d", which does not match the "YYYY-MM-DD" text of a Date and
# therefore produced NA.
days_between <- function(later, earlier) {
  as.integer(
    as.Date(as.character(later), format = "%Y-%m-%d") -
      as.Date(as.character(earlier), format = "%Y-%m-%d")
  )
}
F_tr[, new_var_2 := days_between(tr$ts_listen_date, tr$release_date)]
F_te[, new_var_2 := days_between(te$ts_listen_date, te$release_date)]

# new_var_3 .. new_var_11: mean media_duration per level of a categorical
# column, computed on the TRAINING data and looked up for the test rows.
# (new_var_10, the per-year mean, was disabled in the original script and
# stays disabled.)
group_features <- c(
  new_var_3  = "genre_id",
  new_var_4  = "user_gender",
  new_var_5  = "context_type",
  new_var_6  = "platform_family",
  new_var_7  = "platform_name",
  new_var_8  = "listen_type",
  new_var_9  = "artist_id",
  new_var_11 = "user_age"
)

for (new_col in names(group_features)) {
  group_col <- group_features[[new_col]]

  # Per-row group mean on train, copied positionally into F_tr.
  tr[, (new_col) := mean(media_duration), by = c(group_col)]
  F_tr[, (new_col) := tr[[new_col]]]

  # One mean per level, then an explicit lookup for each test row. Matching on
  # the character form avoids any dependence on factor level order, and levels
  # unseen in train get NA. This replaces the chained joins whose clashing
  # V1 / i.V1 column names made later features read an earlier group's mean.
  grp_means <- tr[, .(grp_mean = mean(media_duration)), by = c(group_col)]
  idx <- match(
    as.character(te[[group_col]]),
    as.character(grp_means[[group_col]])
  )
  F_te[, (new_col) := grp_means$grp_mean[idx]]
}
# Machine Learning with H2o -----------------------------------------------
# Give H2O as much memory as you can spare for the computation: on an 8 GB
# laptop give it at least 6 GB and close all other applications while it runs.
# Adjust max_mem_size below to what the machine actually has.
h2o.init(nthreads = -1, max_mem_size = "10G")

h2o_tr <- as.h2o(F_tr)
h2o_te <- as.h2o(F_te)

# The target must be categorical for a classification model.
h2o_tr$is_listened <- h2o.asfactor(h2o_tr$is_listened)

# Create a validation frame -----------------------------------------------
# k-fold CV is avoided because the data set is large and it would take too
# long; an 80/20 hold-out split is used instead. The seed makes the split
# (and therefore the validation metrics) reproducible between runs.
split_seed <- 1234
xd <- h2o.splitFrame(h2o_tr, ratios = 0.8, seed = split_seed)
split_val <- xd[[2]]

# Response and predictors: every column of F_tr except the target and user_id.
y <- "is_listened"
x <- setdiff(colnames(F_tr), c(y, "user_id"))
# Training a GBM Model ----------------------------------------------------
# The model is fitted on the 80% part of the split (xd[[1]]) and validated on
# the held-out 20% (split_val). Training on the full h2o_tr frame, as before,
# would include the validation rows in the training data, so the validation
# AUC used for early stopping would be measured on rows the model has seen.
gbm_clf <- h2o.gbm(
  x = x,
  y = y,
  training_frame = xd[[1]],
  validation_frame = split_val,
  ignore_const_cols = TRUE,
  ntrees = 1000,
  max_depth = 20,
  stopping_rounds = 10,        # stop once the stopping metric stalls
  model_id = "gbm",
  stopping_metric = "AUC",
  learn_rate = 0.05,
  col_sample_rate_per_tree = 0.8,
  sample_rate = 0.8,
  learn_rate_annealing = 0.99  # learn rate is scaled by 0.99 after each tree
)
gbm_clf

# Score the test frame and write the submission file using the `p1` column
# of the prediction frame.
gbm_clf_pred <- as.data.table(h2o.predict(gbm_clf, h2o_te))
head(gbm_clf_pred, 10)
sub_pred1 <- data.table(sample_id = te$sample_id, is_listened = gbm_clf_pred$p1)
fwrite(sub_pred1, "h2o_gbm_u2.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment