Skip to content

Instantly share code, notes, and snippets.

@benlistyg
Created June 21, 2019 21:22
Show Gist options
  • Save benlistyg/cb35d30a707d3ad4e0924d8e06a0cc6f to your computer and use it in GitHub Desktop.
Save benlistyg/cb35d30a707d3ad4e0924d8e06a0cc6f to your computer and use it in GitHub Desktop.
# Multinomial Logit Model vs Random Forest
# Predicting College Major from Items
library(data.table) # For fread function (v fast!)
library(dplyr) # For pre-processing
library(tm) # For cleaning text / pre-processing
library(nnet) #for MNL
library(randomForest) #For rf
# Symmetric difference of two vectors, returned in sorted order.
# Used below to derive the training rows from the full set of row indices
# and the sampled test rows.
#   x, y : vectors to compare
#   value: the sorted elements that occur in exactly one of `x` and `y`
outersect <- function(x, y) {
  in_either <- union(x, y)
  in_both <- intersect(x, y)
  sort(setdiff(in_either, in_both))
}
# Normalizing majors (entered as free response data).
# Builds a frequency table of the cleaned `major` strings as a data frame with
# two columns: `.` (the cleaned major) and `value` (how often it occurs),
# ordered from most to least frequent, with the single most frequent entry
# removed.
majors <- data.table::fread(
  input = file.path(getwd(), "data.csv"),
  sep = "\t",
  stringsAsFactors = FALSE
)$major %>%
  toupper() %>%
  trimws() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  table() %>%
  # Base conversion instead of melt(): none of the attached packages provides
  # a melt() method for `table` objects.
  as.data.frame(responseName = "value", stringsAsFactors = FALSE) %>%
  # Pin the column names explicitly; later code looks the majors up as
  # `majors$.`.
  setNames(c(".", "value")) %>%
  arrange(desc(value)) %>%
  # Drop the top row.
  # NOTE(review): presumably this is the empty / missing response -- confirm
  # against the data.
  .[-1, ]
# Final data set: one row per respondent whose cleaned major is among the
# selected entries of `majors`, reduced to six summed scale scores
# (R, I, A, S, E, C) plus the major as a factor.
vi_data <- data.table::fread(
  input = file.path(getwd(), "data.csv"),
  sep = "\t",
  stringsAsFactors = FALSE,
  # Read as a plain data.frame so that base-style `[rows, cols]` indexing
  # later in the script does not depend on the object's class.
  data.table = FALSE
) %>%
  # Clean MAJOR with the same steps that were used to build `majors`;
  # otherwise responses that differ only by whitespace, punctuation or digits
  # never match the lookup below and are silently dropped.
  mutate(MAJOR = removeNumbers(removePunctuation(trimws(toupper(major))))) %>%
  select(-major) %>%
  # NOTE(review): `majors` already has its first row removed, so `[2:10]`
  # skips one further entry -- confirm this is intended.
  filter(MAJOR %in% as.character(majors$.[2:10])) %>%
  # `[2:10]` yields NA when `majors` has fewer than 10 rows; keep NA out.
  filter(!is.na(MAJOR)) %>%
  select(R1:C8, MAJOR) %>%
  # Sum the item columns of each scale; `.` is the data piped into mutate().
  mutate(
    R = rowSums(select(., grep("R[0-9]", names(.)))),
    I = rowSums(select(., grep("I[0-9]", names(.)))),
    A = rowSums(select(., grep("A[0-9]", names(.)))),
    S = rowSums(select(., grep("S[0-9]", names(.)))),
    E = rowSums(select(., grep("E[0-9]", names(.)))),
    C = rowSums(select(., grep("C[0-9]", names(.))))
  ) %>%
  mutate(MAJOR = as.factor(MAJOR)) %>%
  select(R, I, A, S, E, C, MAJOR)
# Train / test split and model fitting.
# A fixed number of rows is held out for testing; all remaining rows are used
# to fit both models.
n_test <- 5000

# Fail early with a clear message if the data cannot supply the hold-out set
# and still leave rows to train on.
stopifnot(nrow(vi_data) > n_test)

# Fix the RNG state so the split (and the random forest) can be reproduced.
set.seed(42)

test_rows <- sample(seq_len(nrow(vi_data)), size = n_test, replace = FALSE)
train_rows <- outersect(x = seq_len(nrow(vi_data)), y = test_rows)

# Multinomial logit on the six scale scores.
mnl_vi <- multinom(MAJOR ~ R + I + A + S + E + C, data = vi_data[train_rows, ])
# Random forest on every column other than MAJOR.
rf_vi <- randomForest::randomForest(MAJOR ~ ., data = vi_data[train_rows, ])
# Confusion tables (actual MAJOR in rows, predicted MAJOR in columns) for both
# models on the held-out rows.

# Work on a plain data.frame and drop the response by name: a computed
# negative column index such as `-ncol(vi_data)` is only a column selection
# for data.frames, not for data.tables.
test_set <- as.data.frame(vi_data)[test_rows, ]
test_predictors <- test_set[, names(test_set) != "MAJOR", drop = FALSE]

# Cross-tabulate the observed majors against one model's predictions.
#   model: a fitted model with a predict() method accepting `newdata`
confusion_table <- function(model) {
  table(
    MAJOR = test_set$MAJOR,
    prediction = predict(model, newdata = test_predictors)
  )
}

list(
  `Multi-Nomial Logit` = confusion_table(mnl_vi),
  `Random Forest` = confusion_table(rf_vi)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment