Skip to content

Instantly share code, notes, and snippets.

@ivaninkv
Last active September 11, 2017 19:17
Show Gist options
  • Save ivaninkv/760cf9df9b5ed60a2a0d09bcaf9ee2d1 to your computer and use it in GitHub Desktop.
Save ivaninkv/760cf9df9b5ed60a2a0d09bcaf9ee2d1 to your computer and use it in GitHub Desktop.
# Reset the session: remove every (non-hidden) object from the current
# environment, then run the garbage collector.
# NOTE(review): when this script is source()d into an existing session this
# silently deletes the caller's objects - consider dropping it and running the
# script in a fresh R process instead.
rm(list = ls())
gc()
library(data.table)
library(tidyverse)
#library(tidytext)
library(tm)
library(stringr)
library(text2vec)
library(magrittr)
library(Matrix)
# read data
# Training set. The steps further down reference the columns `comment`,
# `commentNegative`, `commentPositive`, `sku`, `property` and `date`.
all.data <- readr::read_csv('X_train.csv')
#all.data <- fread('X_train_win.csv')
# Russian stop-word list fetched from GitHub. The file has no header row, so
# readr names the single column `X1`; it is pulled out as a plain vector.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
sw.url <- 'https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt'
sw <- readr::read_csv(sw.url, col_names = FALSE)$X1
# The URL is only needed for the download above.
rm(sw.url)
# add new feature
# Fold the negative/positive remarks into the main `comment` text. A missing
# field contributes an empty string, so the literal text "NA" never leaks into
# the combined comment (the main `comment` field is now guarded as well).
all.data <- all.data %>%
  mutate(comment = paste(ifelse(is.na(comment), '', comment),
                         ifelse(is.na(commentNegative), '', commentNegative),
                         ifelse(is.na(commentPositive), '', commentPositive)))

# Punctuation-based flags have to be derived BEFORE the clean-up below:
# padding the parentheses with spaces and removePunctuation() destroy every
# character these patterns look for, which made the flags constant FALSE.
# Literal parentheses are escaped - a bare ')' is a regex metacharacter.
all.data$emotion <- grepl('!', all.data$comment, fixed = TRUE)
all.data$smile <- grepl(':\\)|\\)\\)|;\\)|;-\\)', all.data$comment)
all.data$antismile <- grepl(':\\(|\\(\\(', all.data$comment)

# Text normalisation: lower-case, unify the two spellings of the same letter,
# detach parentheses from neighbouring words, blank out digits, then strip
# stop words and any remaining punctuation.
all.data$comment <- all.data$comment %>%
  str_to_lower() %>%
  str_replace_all('ё', 'е') %>%
  str_replace_all('\\(', ' ( ') %>%
  str_replace_all('\\)', ' ) ') %>%
  str_replace_all('[:digit:]', ' ') %>%
  removeWords(sw) %>%
  removePunctuation()

# NOTE(review): ds_IntToFactor is not defined in the visible part of this file
# and its result is discarded - presumably it converts low-cardinality integer
# columns to factors by reference; confirm, or assign the result.
ds_IntToFactor(all.data, threshold = 200)
# remove unused feature
# Convert to a data.table in place, then delete the columns that are no longer
# needed; assigning NULL through set() removes them by reference.
setDT(all.data)
data.table::set(
  all.data,
  j = c('sku', 'property', 'date', 'commentNegative', 'commentPositive'),
  value = NULL
)
# text2vec ----
# Give every row a sequential id; it serves as the document id in the DTM.
all.data <- mutate(all.data, rank = row_number())
# Re-establish the data.table by reference and key it on the id in one call.
setDT(all.data, key = 'rank')

# Token iterator over the cleaned comments, one document per row.
it_train <- itoken(
  all.data$comment,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  ids = all.data$rank,
  progressbar = TRUE
)

# Vocabulary (stop words excluded) -> vectorizer -> document-term matrix.
vocab <- create_vocabulary(it_train, stopwords = sw)
vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, vectorizer)

# Corpus-wide count per term as a two-column table with default row names.
freq_df_pos <- colSums(dtm_train)
freq_df_pos <- data.frame(
  word = names(freq_df_pos),
  freq = unname(freq_df_pos)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment