Last active
April 9, 2018 13:02
-
-
Save artemklevtsov/710191b96e87bdb14013712a93bfbf86 to your computer and use it in GitHub Desktop.
Telco-Customer-Churn
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ---- Загрузка пакетов ---- | |
library(readr) # чтение данных | |
library(dplyr) # манипулации с данными | |
library(tidyr) # манипулации с данными | |
library(rsample) # разбиение выборки | |
library(recipes) # препроцессинг | |
library(yardstick) # метрики модели | |
library(glmnet) # логистическая регрессия с регуляризацией | |
## ---- Загрузка данных ---- | |
data_url <- "https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-Telco-Customer-Churn.csv" | |
all_data <- read_csv(data_url) | |
all_data <- all_data %>% | |
select(-customerID) %>% | |
drop_na() %>% | |
mutate_if(funs(any(grepl("No .* service", .))), | |
funs(replace(., which(grepl("No .* service", .)), "No"))) %>% | |
mutate(Churn = factor(Churn, c("No", "Yes"))) %>% | |
select(Churn, everything()) | |
## ---- Разбиение выборки | |
set.seed(100) | |
train_test_split <- initial_split(all_data, prop = 0.8, strata = "Churn") | |
train_data <- training(train_test_split) | |
test_data <- testing(train_test_split) | |
## ---- Предобработка ---- | |
# Создаём сценарий обработки | |
rec_obj <- recipe(Churn ~ ., data = train_data) %>% | |
step_ratio(TotalCharges, denom = denom_vars(tenure)) %>% | |
step_bin2factor(SeniorCitizen, levels = c("Yes", "No")) %>% | |
step_discretize(tenure, options = list(cuts = 4)) %>% | |
step_discretize(MonthlyCharges, options = list(cuts = 3)) %>% | |
step_regex(PaymentMethod, pattern = "automatic", result = "PaymentMethodAuto") %>% | |
step_bin2factor(PaymentMethodAuto, levels = c("Yes", "No")) %>% | |
step_BoxCox(TotalCharges) %>% | |
step_center(all_numeric(), -all_outcomes()) %>% | |
step_scale(all_numeric(), -all_outcomes()) %>% | |
step_dummy(all_nominal(), -all_outcomes()) %>% | |
step_zv(all_predictors(), -all_outcomes()) %>% | |
# step_lincomb(all_predictors()) %>% | |
# step_corr(all_numeric(), -all_outcomes(), threshold = 0.99) %>% | |
prep(data = train_data) | |
# Извлечение целевых переменных | |
train_lab <- train_data %>% pull(Churn) | |
test_lab <- test_data %>% pull(Churn) | |
# Применение сценария | |
train_prep_data <- bake(rec_obj, train_data) %>% select(-Churn) | |
test_prep_data <- bake(rec_obj, test_data) %>% select(-Churn) | |
## ---- Обучение модели ---- | |
glm_fit <- cv.glmnet(as.matrix(train_prep_data), as.integer(train_lab == "Yes"), | |
family = "binomial", type.measure = "mse", | |
nfolds = 5) | |
## ---- Оценка модели ---- | |
# Предсказания вероятности | |
train_pred <- predict(glm_fit, as.matrix(train_prep_data), s = "lambda.min", type="response")[, 1] | |
test_pred <- predict(glm_fit, as.matrix(test_prep_data), s = "lambda.min", type="response")[, 1] | |
test_pred_data <- data_frame( | |
target = test_lab, | |
glm = test_pred, | |
glm_bin = factor(test_pred >= 0.5, c(FALSE, TRUE), c("No", "Yes")) | |
) | |
# Метрики | |
roc_auc(test_pred_data, target, glm) | |
accuracy(test_pred_data, target, glm_bin) | |
precision(test_pred_data, target, glm_bin) | |
recall(test_pred_data, target, glm_bin) | |
conf_mat(test_pred_data, target, glm_bin) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment