Basic Text Classifier
" | |
Basic Text Classifier | |
- Takes a csv with a text column, and column of labels | |
- Splits into train and test | |
- Preprocesses text using tm/bag-of-words, 1/2-order Markov | |
- Uses SVM and Lasso | |
@author: Gaurav Sood | |
" | |
# Load libs | |
library(readr) # read in big data a bit quicker | |
library(tm) # pre_process text | |
library(glmnet) # for lasso | |
library(SnowballC) #for stemming | |
library(kernlab)# for svm | |
library(doMC) # parallel computing | |
" | |
Load data | |
Expected Data Format: | |
Let label column = labels | |
Say labels are of two classes: 1, 0 | |
Let text column = text | |
" | |
data <- read_csv("data.csv")
data <- as.data.frame(data)
colnames(data) <- make.names(colnames(data))

# Take out rows with no category labels
data <- subset(data, !is.na(labels))
" | |
Depending on the classification task, you may not need the entire training dataset. | |
Note: You can always plot how much accuracy improves as n increases) | |
Subsetting may also prove useful for computational reasons for some | |
If subsetting, stratified sampling not a bad idea if labels v. imbalanced | |
" | |
" | |
Stratified function takes: | |
data (data), | |
name of the label column (labels), | |
either proportion of rows from each label (p) or n per label (n_per_label), and | |
random seed (seed) for reproducibility | |
" | |
stratified <- function(data, labels, n_per_label, p = NA, seed = 314159265){
  # data <- data; labels="labels"; n_per_label=200; p <- NA; seed=314159265
  dsample <- NULL
  set.seed(seed)

  for(i in levels(as.factor(data[, labels]))){
    dsub <- data[data[, labels] == i, ]
    if(is.na(p)){
      dsub <- dsub[sample(1:nrow(dsub), n_per_label), ]
    } else {
      dsub <- dsub[sample(1:nrow(dsub), ceiling(nrow(dsub) * p)), ]
    }
    dsample <- c(dsample, row.names(dsub))
  }

  dsample
}
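# Example (not run): sample 10% of rows per label instead of a fixed n
# train_rows_p <- stratified(data, "labels", n_per_label = NA, p = 0.1)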
# Training data
train_rows <- stratified(data, "labels", n_per_label = 25000)
data_train <- data[train_rows, ]

# Test data (10k is more than enough)
# Match on row names rather than positions, since rows may have been dropped above
data_test <- data[!(row.names(data) %in% train_rows), ]
data_test <- data_test[sample(1:nrow(data_test), 10000), ]
# An abstract function to preprocess a text column
preprocess <- function(text_column){
  # Use tm to get a corpus
  corpus <- Corpus(VectorSource(text_column))

  # all lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # remove punctuation
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  # remove numbers
  corpus <- tm_map(corpus, content_transformer(removeNumbers))
  # remove stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # stem document
  corpus <- tm_map(corpus, stemDocument)
  # strip white spaces (always at the end)
  corpus <- tm_map(corpus, stripWhitespace)

  # return
  corpus
}
# Get preprocessed training and test data
train_corpus <- preprocess(data_train$text)
test_corpus  <- preprocess(data_test$text)

# Create a Document Term Matrix for train and test,
# including just bi- and tri-grams

# Bi-/Tri-gram tokenizer function (you can always get longer n-grams)
bitrigramtokeniser <- function(x){
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 3))
}
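# Quick sanity check (illustrative; the exact ordering of the output can vary):
# bitrigramtokeniser("new york city mayor")
# should return bi-/tri-grams such as "new york", "york city", "new york city", ...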
# Appropriate libraries
# On Windows, you may need to point to the JRE first:
# Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre7/')
library(rJava)
library(RWeka)
" | |
Remove remove words <=2 | |
TdIdf weighting | |
Infrequent (< than 1% of documents) and very frequent (> 80% of documents) terms not included | |
" | |
train_dtm <- DocumentTermMatrix(train_corpus, control = list(
  wordLengths = c(2, Inf),
  tokenize    = bitrigramtokeniser,
  weighting   = function(x) weightTfIdf(x, normalize = FALSE),
  bounds      = list(global = c(floor(length(train_corpus)*0.01), floor(length(train_corpus)*0.8)))))

test_dtm <- DocumentTermMatrix(test_corpus, control = list(
  wordLengths = c(2, Inf),
  tokenize    = bitrigramtokeniser,
  weighting   = function(x) weightTfIdf(x, normalize = FALSE),
  bounds      = list(global = c(floor(length(test_corpus)*0.001), floor(length(test_corpus)*0.8)))))
# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates a chi-square value for each phrase and keeps the phrases with the highest chi-square values.
Ideally you want to make variable selection part of cross-validation.

chisqTwo function takes:
  document term matrix (dtm),
  vector of labels (labels), and
  number of n-grams you want to keep (n_out)
"
chisqTwo <- function(dtm, labels, n_out = 2000){
  mat  <- as.matrix(dtm)

  cat1 <- colSums(mat[labels == T, ])        # total times each phrase is used in cat1
  cat2 <- colSums(mat[labels == F, ])        # total times each phrase is used in cat2

  n_cat1 <- sum(mat[labels == T, ]) - cat1   # total phrases in cat1 documents minus cat1
  n_cat2 <- sum(mat[labels == F, ]) - cat2   # total phrases in cat2 documents minus cat2

  num   <- (cat1*n_cat2 - cat2*n_cat1)^2
  den   <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
  chisq <- num/den

  # keep the n_out phrases with the highest chi-square values
  chi_order <- chisq[order(chisq, decreasing = TRUE)][1:n_out]

  mat[, colnames(mat) %in% names(chi_order)]
}
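# For reference, the statistic above is the usual chi-square for each phrase's
# 2x2 contingency table (up to the constant factor N = total phrase count):
#
#                 this phrase    all other phrases
#   labels == T      cat1             n_cat1
#   labels == F      cat2             n_cat2
#
# chisq = (cat1*n_cat2 - cat2*n_cat1)^2 /
#         ((cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2))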
" | |
With high dimensional data, test matrix may not have all the phrases training matrix has. | |
This function fixes that - so that test matrix has the same columns as training. | |
testmat takes column names of training matrix (train_mat_cols), and | |
test matrix (test_mat) | |
and outputs test_matrix with the same columns as training matrix | |
" | |
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){
  # train_mat_cols <- colnames(train_mat); test_mat <- as.matrix(test_dtm)

  # keep only phrases seen in training
  test_mat <- test_mat[, colnames(test_mat) %in% train_mat_cols, drop = FALSE]

  # add zero-filled columns for training phrases missing from the test matrix
  # (use a matrix, not a data.frame, so phrase names with spaces are preserved)
  miss_names <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
  if(length(miss_names) != 0){
    zero_mat <- matrix(0, nrow = nrow(test_mat), ncol = length(miss_names),
                       dimnames = list(NULL, miss_names))
    test_mat <- cbind(test_mat, zero_mat)
  }

  # reorder so columns line up with the training matrix
  as.matrix(test_mat)[, train_mat_cols, drop = FALSE]
}
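# Illustrative (hypothetical) behaviour: if the training matrix has columns
# c("new york", "big appl") and the test matrix only has "new york" and
# "los angel", testmat() drops "los angel", adds a zero-filled "big appl"
# column, and returns the columns in training order.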
# Train and test matrices
train_mat <- chisqTwo(train_dtm, data_train$labels)
test_mat  <- testmat(colnames(train_mat), as.matrix(test_dtm))

# Take out the heavy dtms in memory
rm(train_dtm)
rm(test_dtm)

# Run the garbage collector to free up memory
gc()
# Fit the model
# ~~~~~~~~~~~~~~~~~~~~

# Lasso (cross-validated lambda)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
registerDoMC(cores = 2)  # set cores
# parallel = TRUE makes cv.glmnet use the registered doMC backend
fit_lasso <- cv.glmnet(train_mat, data_train$labels, family = "binomial",
                       nfolds = 5, type.measure = "class", parallel = TRUE)
pred <- predict(fit_lasso, newx = test_mat, s = "lambda.min", type = "response")

# Prediction accuracy
table(pred > .5, data_test$labels)
sum(diag(table(pred > .5, data_test$labels)))/sum(table(pred > .5, data_test$labels))
# Plot cross-validation curve
pdf("cvCurve.pdf")
plot(fit_lasso)
dev.off()
# SVM
# ~~~~~~~~~~~~~~
# Fit on the same chi-square-selected matrices and labels as the lasso above
svp <- ksvm(train_mat, as.factor(data_train$labels), type = "C-svc",
            kernel = 'vanilladot', C = 100, scaled = c())
spred <- predict(svp, test_mat)
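# Prediction accuracy for the SVM, mirroring the lasso block above
# (spred holds predicted class labels)
table(spred, data_test$labels)
sum(diag(table(spred, data_test$labels)))/sum(table(spred, data_test$labels))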
With this code, I'm hitting this error after running lines 122-126 or 127-130:

Error in rep(seq_along(x), sapply(tflist, length)) :
  invalid 'times' argument

It seems to be coming from what I'm assuming is the function at lines 94-95, but I don't see why. Any suggestions? It's a relatively small dataset of only 1,000 observations, though each observation's text is about a paragraph long.