Skip to content

Instantly share code, notes, and snippets.

@ivaninkv
Created September 14, 2017 19:43
Show Gist options
  • Save ivaninkv/d9fd8793662cb94b7bb2cde83a31d0dd to your computer and use it in GitHub Desktop.
Save ivaninkv/d9fd8793662cb94b7bb2cde83a31d0dd to your computer and use it in GitHub Desktop.
MVideo Hackathon
# rm(list = ls())
# gc()
# загрузим все необходимые библиотеки
library(data.table)
library(tidyverse)
library(stringr)
library(magrittr)
library(tm)
library(text2vec)
library(xgboost)
# вспомогательные функции
IntToFactor <- function(dt, threshold = 10, inline = T, exclude = c()) {
subFunc <- function(dt_) {
int.col <- names(dt_)[sapply(dt_, is.integer)]
if (length(exclude) > 0) {
int.col <- int.col[!int.col %in% exclude]
}
if (length(int.col) > 0) {
for (feat in int.col) {
if (length(unique(dt_[[feat]])) <= threshold) {
set(dt_, j = feat, value = factor(dt_[[feat]]))
}
}
}
}
if (inline == TRUE) {
subFunc(dt)
}
else
{
dt.new <- copy(dt)
subFunc(dt.new)
return(dt.new)
}
}
# загрузим данные и словарь стоп слов
all.data <- readr::read_csv('X_train.csv')
sw.url <- 'https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt'
sw <- readr::read_csv(sw.url, col_names = F)$X1
rm(sw.url)
# добавим новые фичи
all.data %<>%
mutate(comment = paste(comment, ifelse(is.na(commentNegative), '', commentNegative))) %>%
mutate(comment = paste(comment, ifelse(is.na(commentPositive), '', commentPositive)))
all.data$comment <- str_to_lower(all.data$comment)
all.data$comment <- str_replace_all(all.data$comment, 'ё', 'е')
all.data$comment <- str_replace_all(all.data$comment, '\\(', ' ( ')
all.data$comment <- str_replace_all(all.data$comment, '\\)', ' ) ')
all.data$comment <- str_replace_all(all.data$comment, '[:digit:]', ' ')
all.data$comment <- removeWords(all.data$comment, sw)
all.data$comment <- removePunctuation(all.data$comment)
all.data$emotion <- grepl('!', all.data$comment)
all.data$smile <- grepl(':)|))|;)|;-)', all.data$comment)
all.data$antismile <- grepl(':\\(|\\(\\(', all.data$comment)
IntToFactor(all.data, threshold = 200)
# удалим неиспользуемые столбцы
setDT(all.data)
all.data[, c('sku',
'property',
'date',
'commentNegative',
'commentPositive') := NULL]
# используем нетекстовые переменные
dt <- all.data %>%
select(-comment, -reting)
dt <- model.matrix(~ . -1, data = dt)
dim(dt)
# используем текстовые переменные
all.data %<>% mutate(rank = row_number())
setDT(all.data)
setkey(all.data, rank)
it_train = itoken(all.data$comment,
preprocessor = tolower,
tokenizer = tokenizers::tokenize_words,
ids = all.data$rank,
progressbar = TRUE)
vocab = create_vocabulary(it_train, stopwords = sw)
vectorizer = vocab_vectorizer(vocab)
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 10,
doc_proportion_max = 0.5,
doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
dtm_train = create_dtm(it_train, vectorizer)
mat <- as.matrix(dtm_train)
dim(mat)
mat <- cbind(mat, dt)
rm(dt, pruned_vocab, vocab, dtm_train, sw, it_train, vectorizer, IntToFactor)
# xgboost
fold <- caret::createDataPartition(all.data$reting, times = 1, p = 0.7, list = F)
X <- mat[fold, ]
y <- all.data$reting[fold]
X_pred <- mat[-fold, ]
y_pred <- all.data$reting[-fold]
k <- 4 #
param <- list(
eta = 0.2/k,
nround = 50*k,
max_depth = 4,
colsample_bytree = 0.7,
subsample = 0.7,
min_child_weight = 6,
gamma = 4,
tree_method = 'auto',
objective = 'reg:linear'
)
# cv.res <- xgb.cv(data = X, label = y, boosting = 'dart',
# params = param, nrounds = param$nround, nfold = 5, verbose = 1L)
model <- xgboost(
data = X,
label = y,
params = param,
boosting = 'dart',
nrounds = param$nround,
print_every_n = 50,
early_stopping_rounds = 100
)
# rmse
sqrt(mean((y_pred - predict(model, X_pred))^2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment