Skip to content

Instantly share code, notes, and snippets.

@flxw
Created January 25, 2018 19:51
Show Gist options
  • Save flxw/120747b159893ca54d89b3e1a62a89c4 to your computer and use it in GitHub Desktop.
Save flxw/120747b159893ca54d89b3e1a62a89c4 to your computer and use it in GitHub Desktop.
# Convert a numeric vector to z-scores: (x - mean(x)) / sd(x).
#
# Args:
#   x:     numeric vector.
#   na.rm: if TRUE, missing values are ignored when computing the mean and
#          the standard deviation (they stay NA in the result). The default
#          FALSE keeps the historical behaviour, where a single NA makes
#          every z-score NA.
#
# Returns:
#   A numeric vector of the same length as `x`. If sd(x) is 0 the division
#   yields NaN, and for fewer than two usable values sd() is NA.
standardize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
# Per-user return statistics shared across calls to
# ..read_and_preprocess_data_file(): that function overwrites this global
# (via `<<-`) whenever the file it reads has a `return` column, and merges the
# current value into every file it processes. Starts out empty.
..static_user_statistics <- NULL
# Read one sales CSV file and return it as a cleaned, feature-enriched
# data.table.
#
# Processing steps, in order:
#   1. item_size: letter sizes are lower-cased and the numeric sizes listed in
#      `size_map` are folded into xs/s/m/l/xl/xxl/xxxl; other values are kept.
#   2. user_dob: reduced to the year; "?" and years before 1920 are replaced
#      by the most frequent year, then values beyond +/- 3 standard deviations
#      are capped at the (rounded) 3-sd limits.
#   3. order_date / delivery_date: parsed as Dates; delivery_duration is their
#      difference in days, with missing or negative durations set to 2.
#   4. item_price: coerced to numeric, NA set to 59.9, values more than 3
#      standard deviations above the mean capped at that limit.
#   5. user_reg_date: parsed as Date; user_maturity is the number of days
#      between registration and order.
#   6. user_title / item_color: misspellings fixed, unknown values imputed.
#   7. month_of_delivery: numeric month of delivery_date (1 when missing).
#   8. All character columns become factors; price_and_age is
#      item_price * user_dob (note: user_dob holds the birth YEAR).
#   9. Per-user avg_return / nr_obs are merged in by user_id.
#
# Args:
#   fp: path of the CSV file, handed to read.csv().
#
# Returns:
#   A data.table with the cleaned columns plus delivery_duration,
#   user_maturity, month_of_delivery, price_and_age, avg_return and nr_obs.
#
# Side effects:
#   If the file has a `return` column, the global `..static_user_statistics`
#   is overwritten with statistics computed from this file. Files without a
#   `return` column reuse whatever an earlier call stored there, so a file
#   with `return` must be processed first; otherwise an error is raised.
..read_and_preprocess_data_file <- function(fp) {
  sales <- read.csv(fp, stringsAsFactors = FALSE)

  # --- item_size ------------------------------------------------------------
  # Raw label -> harmonised size. Values not listed here are left untouched.
  size_map <- c(
    "XS" = "xs", "S" = "s", "M" = "m", "L" = "l",
    "XL" = "xl", "XXL" = "xxl", "XXXL" = "xxxl",
    "34" = "xs", "35" = "xs",
    "36" = "s", "36+" = "s", "37" = "s", "37+" = "s",
    "38" = "m", "38+" = "m", "39" = "m", "39+" = "m",
    "40" = "m", "40+" = "m",
    "41" = "l", "41+" = "l", "42" = "l", "42+" = "l", "43" = "l",
    "43+" = "xl", "44" = "xl", "44+" = "xl", "45" = "xl", "45+" = "xl",
    "46" = "xl", "46+" = "xl",
    "47" = "xxl", "48" = "xxl", "50" = "xxl",
    "52" = "xxxl", "54" = "xxxl", "56" = "xxxl"
  )
  size_idx <- match(sales$item_size, names(size_map))
  size_hit <- !is.na(size_idx)
  sales$item_size[size_hit] <- unname(size_map[size_idx[size_hit]])

  # --- user_dob (year of birth) ---------------------------------------------
  dob_year <- substring(sales$user_dob, 1, 4)
  dob_year[dob_year %in% "?"] <- NA
  # Compare as numbers rather than relying on string collation.
  dob_year <- as.numeric(dob_year)
  dob_year[!is.na(dob_year) & dob_year < 1920] <- NA
  # Impute with the most frequent year; which.max() picks the earliest year on
  # a tie, so the replacement is always a single value.
  year_counts <- table(dob_year)
  mode_year <- as.numeric(names(year_counts)[which.max(year_counts)])
  dob_year[is.na(dob_year)] <- mode_year
  # Cap at +/- 3 sd. Both limits come from the same mean/sd as the z-scores,
  # i.e. they are computed before any value is capped.
  dob_z <- standardize(dob_year)
  dob_mean <- mean(dob_year)
  dob_sd <- sd(dob_year)
  dob_upper <- round(dob_mean + 3 * dob_sd, digits = 0)
  dob_lower <- round(dob_mean - 3 * dob_sd, digits = 0)
  dob_year[dob_z > 3] <- dob_upper
  dob_year[dob_z < -3] <- dob_lower
  sales$user_dob <- dob_year

  # --- order_date / delivery_date -------------------------------------------
  sales$order_date <- as.Date(sales$order_date, format = "%Y-%m-%d")
  # Unparseable delivery dates (e.g. "?") become NA here.
  sales$delivery_date <- as.Date(sales$delivery_date, format = "%Y-%m-%d")
  delivery_days <- as.numeric(
    difftime(sales$delivery_date, sales$order_date, units = "days")
  )
  # Missing or negative durations are replaced by 2 days
  # (NOTE(review): presumably a typical delivery time -- confirm).
  delivery_days[is.na(delivery_days) | delivery_days < 0] <- 2
  sales$delivery_duration <- delivery_days

  # --- item_price -----------------------------------------------------------
  sales$item_price <- as.numeric(sales$item_price)
  # 59.9 is hard-coded; the original code marked it as the most frequent value.
  sales$item_price[is.na(sales$item_price)] <- 59.9
  price_z <- standardize(sales$item_price)
  price_cap <- round(
    mean(sales$item_price) + 3 * sd(sales$item_price),
    digits = 2
  )
  # Only the upper tail is capped.
  sales$item_price[price_z > 3] <- price_cap

  # --- user_reg_date --------------------------------------------------------
  sales$user_reg_date <- as.Date(sales$user_reg_date, format = "%Y-%m-%d")
  sales$user_maturity <- as.numeric(
    difftime(sales$order_date, sales$user_reg_date, units = "days")
  )

  # --- user_title -----------------------------------------------------------
  title_unknown <- is.na(sales$user_title) |
    sales$user_title == "not reported"
  sales$user_title[title_unknown] <- "Mrs"

  # --- item_color -----------------------------------------------------------
  sales$item_color[sales$item_color %in% "blau"] <- "blue"
  sales$item_color[sales$item_color %in% "brwon"] <- "brown"
  sales$item_color[sales$item_color %in% "oliv"] <- "olive"
  # "black" is hard-coded; the original code marked it as the most frequent
  # value.
  color_unknown <- is.na(sales$item_color) | sales$item_color == "?"
  sales$item_color[color_unknown] <- "black"

  # --- month of delivery ----------------------------------------------------
  delivery_month <- format(sales$delivery_date, "%m")
  delivery_month[is.na(delivery_month)] <- "01"
  sales$month_of_delivery <- as.numeric(delivery_month)

  # --- factors and interaction feature --------------------------------------
  # List-style indexing keeps a data.frame even when only one column matches.
  is_chr <- vapply(sales, is.character, logical(1))
  if (any(is_chr)) {
    sales[is_chr] <- lapply(sales[is_chr], factor)
  }
  sales$price_and_age <- sales$item_price * sales$user_dob

  # --- data.table conversion and per-user statistics ------------------------
  sales <- data.table::data.table(sales)
  # Keys speed up later merging / sorting on the ID columns.
  data.table::setkey(sales, user_id, item_id, order_item_id)

  # The statistics use the target variable, so they are only (re)computed from
  # files that contain it and are reused for files that do not.
  if ("return" %in% colnames(sales)) {
    ..static_user_statistics <<- sales[
      ,
      list("avg_return" = mean(return), "nr_obs" = .N),
      by = "user_id"
    ]
  }
  if (is.null(..static_user_statistics)) {
    stop(
      "No per-user return statistics available: process a file that ",
      "contains a 'return' column first.",
      call. = FALSE
    )
  }
  sales <- merge(
    x = sales,
    y = ..static_user_statistics,
    by = "user_id",
    all.x = TRUE
  )

  # `return` is deliberately not converted to a factor here: the column does
  # not exist in every input file.
  sales
}
# Preprocess both data sets. The "known" file is read first because files with
# a `return` column populate the shared per-user statistics that are merged
# into every file (NOTE(review): presumably only the "known" file carries
# `return` -- confirm against the data).
df_known <- ..read_and_preprocess_data_file("data/BADS_WS1718_known.csv")
df_class <- ..read_and_preprocess_data_file("data/BADS_WS1718_class.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment