Last active
December 28, 2015 17:29
-
-
Save alfard/7536685 to your computer and use it in GitHub Desktop.
Code Expedia Kaggle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---- Packages ----------------------------------------------------------------
library(gtools)
library(dummies)  # dummy() is only referenced from commented-out code below
library(som)      # provides normalize(), used for feature scaling later on

# ---- Load the training sample ------------------------------------------------
sample_train <- read.csv("/home/alfard/trainnrandom.csv", na.strings = "NA")
# Drop the column named "X" (presumably a row index written by an earlier
# write.csv -- TODO confirm against the file).
sample_train$X <- NULL

# ---- Relevance label ---------------------------------------------------------
# reli = 5 when the row was clicked and booked,
#        1 when it was clicked but not booked,
#        0 otherwise.
sample_train$s1 <- ifelse(
  sample_train$click_bool == 1 & sample_train$booking_bool == 0, 1, 0
)
sample_train$s2 <- ifelse(
  sample_train$click_bool == 1 & sample_train$booking_bool == 1, 5, 0
)
sample_train$reli <- sample_train$s1 + sample_train$s2

# Abandoned experiments, kept for reference:
# sasfreq(reli, sample_train)
# sample_train$reli <- factor(sample_train$reli)
# sampleSMOTE <- SMOTE(reli ~ ., sample_train, perc.over = 10000, perc.under = 100)
# sasfreq(reli, sampleSMOTE)

# ---- Shape the training frame ------------------------------------------------
# loop <- read.csv("~/Documents/loop.csv", na.strings = "NULL")
# Columns removed before modelling: the two raw outcome flags and the helper
# columns s1/s2 (all folded into reli), plus random_bool, position and
# gross_bookings_usd.
# letor('booking_bool', ST2)
# ST$prop_id <- factor(ST$prop_id)
drop_cols <- c(
  "click_bool", "booking_bool", "s1", "s2",
  "random_bool", "position", "gross_bookings_usd"
)
ST <- sample_train[, !(names(sample_train) %in% drop_cols), drop = FALSE]

# Within each search, put the most relevant rows first.
ST2 <- ST[order(ST$srch_id, -ST$reli), ]

# Move reli to the first column. Selecting by name (instead of the former
# positional c(50, 1:49)) keeps this correct whatever the column count is.
ST2 <- ST2[c("reli", setdiff(names(ST2), "reli"))]
#' Frequency table of one column, in the spirit of SAS PROC FREQ.
#'
#' Called with an unquoted column name, e.g. `sasfreq(reli, ST3)`.
#' Missing values are counted as their own level when present
#' (`exclude = NULL`).
#'
#' Unlike the former macro version, the result is returned to the caller and
#' no `temp` variable is created (or overwritten) in the calling environment.
#'
#' @param colonne Unquoted name of the column to tabulate.
#' @param tablo   Data frame containing that column.
#' @return A data frame with columns `Var1` (level), `Freq` (count) and
#'   `perc` (share of all rows in percent, rounded to 2 digits).
sasfreq <- function(colonne, tablo) {
  col_name <- deparse(substitute(colonne))
  if (!(col_name %in% names(tablo))) {
    stop("Column '", col_name, "' not found in the supplied data frame",
         call. = FALSE)
  }
  temp <- as.data.frame(table(tablo[[col_name]], exclude = NULL))
  temp$perc <- round((temp$Freq / sum(temp$Freq)) * 100, digits = 2)
  # Sorting by percentage was considered but never enabled:
  # temp <- temp[order(temp$perc), ]
  temp
}
# ---- Release intermediate frames ---------------------------------------------
# ST and sample_train are not used past this point; clearing them recovers
# memory. Only ST2 is worked on below.
ST <- NULL
sample_train <- NULL

###########################################################
# Feature preparation on ST2
#
# During exploration each column was inspected with sasfreq(<col>, ST2) or
# letor('<col>', ST2) (letor is not defined in this file); those calls and the
# dummy()-based encodings that were tried are not repeated here.
#
# Field notes carried over from the original comments:
#   site_id                      point of sale
#   visitor_location_country_id  country the search originates from
#   visitor_hist_starrating      type of hotel booked before this booking
#   prop_id                      "no use in using the hotel id: 41657";
#                                kept in the frame for later
#   prop_starrating              number of stars
#   prop_review_score            hotel rating
#   prop_brand_bool              dummy: belongs to a chain (left untouched)
#   prop_log_historical_price    log of the hotel's average price
#   price_usd                    price in dollars
#   promotion_flag               promotion flag (left untouched)
#   srch_destination_id          destination id
#   srch_room_count              number of rooms
#   srch_saturdy_night_bool      dummy (left untouched)
#   srch_query_affinity_score    log probability that the hotel is chosen
#   position, gross_bookings_usd absent from the test set
#   random_bool                  removed for modelling
###########################################################

# Relevance label as a factor.
ST2$reli <- factor(ST2$reli)

# ---- Missing-value indicators ------------------------------------------------
# For every column listed, add "<column>NA" holding 1 where the value is
# missing and 0 otherwise. Flags are created before any scaling so they see
# the raw values, and in this order so the new columns are appended in the
# same sequence as before.
competitor_fields <- c("rate", "inv", "rate_percent_diff")
flagged_cols <- c(
  "visitor_hist_starrating",
  "visitor_hist_adr_usd",
  "prop_review_score",
  paste0("comp", rep(1:8, each = 3), "_", competitor_fields)
)
for (src_col in flagged_cols) {
  ST2[[paste0(src_col, "NA")]] <- ifelse(is.na(ST2[[src_col]]), 1, 0)
}

# ---- Categorical columns -----------------------------------------------------
categorical_cols <- c(
  "site_id",
  "visitor_location_country_id",
  "prop_country_id",
  "prop_starrating",
  "prop_review_score",
  "srch_destination_id"
)
for (cat_col in categorical_cols) {
  ST2[[cat_col]] <- factor(ST2[[cat_col]])
}

# Frequency table of prop_id (see the field note above).
sasfreq(prop_id, ST2)
# propid <- ST2["prop_id"]

# ---- Scaled numeric columns --------------------------------------------------
# date_time is first reduced to characters 6-7 of the stamp, read as a number
# (the month if stamps look like "YYYY-MM-..." -- TODO confirm), then scaled
# like the other numeric columns.
ST2$date_time <- as.numeric(substr(ST2$date_time, 6, 7))

# normalize() comes from the som package; it presumably standardises the
# values to mean 0 / variance 1 -- verify against the package documentation.
scaled_cols <- c(
  "date_time",
  "visitor_hist_starrating",
  "visitor_hist_adr_usd",
  "prop_location_score1",
  "prop_location_score2",
  "prop_log_historical_price",
  "price_usd",
  "srch_length_of_stay",
  "srch_booking_window",
  "srch_adults_count",
  "srch_children_count",
  "srch_room_count",
  "srch_query_affinity_score",
  "orig_destination_distance",
  paste0("comp", 1:8, "_rate_percent_diff")
)
for (num_col in scaled_cols) {
  ST2[[num_col]] <- normalize(ST2[[num_col]], byrow = TRUE)
}
# ---- Training subset: first five rows of every search ------------------------
# V4 numbers the rows 1, 2, 3, ... inside each srch_id group, in the current
# row order of ST2. seq_along() is used rather than seq(): for a group with a
# single row, seq(x) would expand to 1:x instead of returning 1.
ST2$V4 <- with(ST2, ave(srch_id, srch_id, FUN = seq_along))
ST3 <- ST2[!(ST2$V4 > 5), ]

# Binary target for the model: 1 when the row was booked (reli == 5), else 0.
ST3$reli <- ifelse(ST3$reli == 5, 1, 0)
sasfreq(reli, ST3)

# Session checkpoints used while developing:
# load("train.RData")
# save.image("~/1238.RData")
# load("1238.RData")

# Reset result holders left over from a previous run. ST2 is deliberately NOT
# cleared here: it is still scored further down.
SOL <- NULL
SOL2 <- NULL
predictionsgbm <- NULL
predictionsgbmtest <- NULL
sample_test <- NULL
tt <- NULL

# ---- Fit the boosted model ---------------------------------------------------
library(gbm)
# The first half of ST3 is used for fitting and the rest as the test split
# (train.fraction = 0.5); the identifiers and the helper column V4 are
# excluded from the predictors.
tt <- gbm(reli ~ . - srch_destination_id - V4 - prop_id - srch_id,
          distribution = "bernoulli", data = ST3,
          n.trees = 2000, shrinkage = 0.01, cv.folds = 0, train.fraction = 0.5,
          verbose = TRUE)
# Number of trees selected on the test split.
best.iter <- gbm.perf(tt, method = "test")

# ---- Score the full training frame (prototype check) -------------------------
predictionsgbm <- data.frame(predict.gbm(tt, ST2, best.iter))
sasfreq(reli, ST2)
SS <- cbind(predictionsgbm, ST2$reli, ST2$srch_id)
# ST2 is no longer needed from here on; release its memory.
ST2 <- NULL

# ---- Score the test set and write the submission -----------------------------
# NOTE(review): T3 is not created anywhere in this file; it is presumably the
# test set prepared with the same feature steps as ST2 -- confirm.
if (!exists("T3")) {
  stop("T3 (the prepared test-set data frame) is not defined; ",
       "build it before scoring.", call. = FALSE)
}
predictionsgbmtest <- data.frame(predict.gbm(tt, T3, best.iter))

# Build the frame directly instead of cbind()-ing into a numeric matrix: that
# coerced the ids to double, and write.csv then renders values such as 100000
# as 1e+05. Column names V1..V3 are kept so the output header is unchanged.
SOL <- data.frame(V1 = predictionsgbmtest[[1]],
                  V2 = T3$srch_id,
                  V3 = T3$prop_id)
# Within each search (V2), highest predicted score first; the score itself is
# not written out.
SOL2 <- SOL[order(SOL$V2, -SOL$V1), ]
SOL2$V1 <- NULL
write.csv(SOL2, file = "MyData61000.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment