Skip to content

Instantly share code, notes, and snippets.

@koustuvsinha
Created November 13, 2015 19:57
Show Gist options
  • Save koustuvsinha/944eb5b929c7d2249aed to your computer and use it in GitHub Desktop.
Save koustuvsinha/944eb5b929c7d2249aed to your computer and use it in GitHub Desktop.
Twitter News Classification with SMOTE sampling
library(RTextTools)
library(DMwR)
library(stringi)
# ---- Load data ----
# Read the cleaned tweet table from the working directory. The code below
# relies on a `target` column that is compared against 1 and 0.
crime <- read.csv("CleanedDataNew.csv")
# Drop the `X` column (presumably the row-index column written out by
# write.csv -- TODO confirm).
crime$X <- NULL
nrow(crime)

# Work on a copy so that `crime` keeps the data as loaded.
crime_bal <- crime
crime_bal$target <- as.factor(crime_bal$target)

# Share of each class before any resampling.
class_counts <- table(crime_bal$target)
prop.table(class_counts)

# Absolute number of rows per class (target == 1 is reported as "Crime").
cat("Crime Nos")
is_crime <- crime_bal$target == 1
nrow(crime_bal[is_crime, ])
cat("Non Crime Nos")
is_non_crime <- crime_bal$target == 0
nrow(crime_bal[is_non_crime, ])
# ---- Rebalance classes with SMOTE ----
# Per the DMwR documentation, perc.over = 800 generates 8 synthetic cases for
# every original minority-class case, and perc.under = 100 then selects one
# majority-class case for every synthetic case generated.
crime_bal <- SMOTE(
  target ~ .,
  crime_bal,
  perc.over = 800,
  perc.under = 100
)
# Alternative setting kept for reference: 1 synthetic case per minority case
# and 2 majority cases per synthetic case.
# crime_bal <- SMOTE(target ~ .,crime_bal,perc.under = 200, perc.over = 100)
# crime_bal <- SMOTE(target ~ ., crime_bal, perc.over = 100, perc.under=200)

# Class shares and absolute counts after resampling.
class_counts <- table(crime_bal$target)
prop.table(class_counts)
cat("Crime Nos")
is_crime <- crime_bal$target == 1
nrow(crime_bal[is_crime, ])
cat("Non Crime Nos")
is_non_crime <- crime_bal$target == 0
nrow(crime_bal[is_non_crime, ])

# Shuffle the rows so that a later split by row position mixes both classes.
shuffled_rows <- sample(nrow(crime_bal))
crime_bal <- crime_bal[shuffled_rows, ]
# ---- Text preparation, training and evaluation ----
# Remove hashtags: a '#' followed by a run of non-whitespace characters.
crime_bal$text <- stri_replace_all(crime_bal$text, "", regex = "#\\S+")
# crime_bal <- crime

# One-column data frames holding the tweet text and the class labels.
training_data <- cbind.data.frame(crime_bal$text)
training_codes <- cbind.data.frame(crime_bal$target)

# Document-term matrix: lower-cased, stemmed, with punctuation, English
# stopwords and extra whitespace removed; numbers are kept.
matrix <- create_matrix(
  training_data,
  language = "english",
  removeNumbers = FALSE,
  stemWords = TRUE,
  removePunctuation = TRUE,
  removeStopwords = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE
)

# Rows 1:2999 form the test set and every remaining row is used for training.
# The upper bound follows the actual table size instead of a hard-coded 7293,
# so the split stays valid when the resampled data has a different row count.
stopifnot(nrow(crime_bal) > 2999)
container <- create_container(
  matrix,
  t(training_codes),
  trainSize = 3000:nrow(crime_bal),
  testSize = 1:2999,
  virgin = FALSE
)
# container <- create_container(matrix,t(training_codes),trainSize=2001:6326, testSize=1:2000,virgin=FALSE)
# container <- create_container(matrix,t(training_codes),trainSize=601:1328, testSize=1:600,virgin=FALSE)

# Train the SVM classifier only, then label the test rows with it.
models <- train_models(container, algorithms = "SVM")
results <- classify_models(container, models)

# Summary statistics for the test-set predictions.
analytics <- create_analytics(container, results)
analytics@ensemble_summary
# b_value = 1 presumably weights precision and recall equally in the F-score
# -- confirm against the RTextTools documentation.
create_precisionRecallSummary(container, results, b_value = 1)
# ---- Confusion counts on the test set ----
# The test set is the first 2999 rows of the shuffled table, i.e. the rows
# passed as testSize to create_container; row i of `results` is the
# prediction for row i of `crime_test`.
crime_test <- crime_bal[1:2999, ]

# false positive calculation
# Rows predicted as crime (label 1), split by their true label.
nrow(results[results$SVM_LABEL == 1, ])
rows <- which(results$SVM_LABEL == 1)
classified_as_crime <- crime_test[rows, ]
false_positive <- classified_as_crime$target == 0
true_positive <- classified_as_crime$target == 1
n_false_pos <- sum(false_positive, na.rm = TRUE)
n_true_pos <- sum(true_positive, na.rm = TRUE)
# Positions within classified_as_crime of the false positives.
false_pos_rows <- which(false_positive)

# false negative calculation
# Rows predicted as non-crime (label 0), split by their true label. These must
# be looked up in crime_test: indexing the original `crime` table here would
# compare predictions with unrelated, unshuffled rows.
nrow(results[results$SVM_LABEL == 0, ])
rows <- which(results$SVM_LABEL == 0)
classified_as_noncrime <- crime_test[rows, ]
false_negative <- classified_as_noncrime$target == 1
true_negative <- classified_as_noncrime$target == 0
n_false_neg <- sum(false_negative, na.rm = TRUE)
n_true_neg <- sum(true_negative, na.rm = TRUE)
# Positions within classified_as_noncrime of the false negatives.
false_neg_rows <- which(false_negative)
## ROCR Graph Calculation
library(ROCR)
# An ROC curve needs a score for the positive class and the TRUE labels.
# Passing results$SVM_LABEL as the labels would compare the model with its own
# predictions.
# NOTE(review): RTextTools appears to report SVM_PROB as the probability of
# the predicted label, so it is converted to a score for class 1 -- confirm.
svm_prob <- as.numeric(as.character(results$SVM_PROB))
crime_score <- ifelse(results$SVM_LABEL == 1, svm_prob, 1 - svm_prob)
# crime_test holds the test rows in the same order as `results`; with the
# default label ordering ROCR treats the higher level (1) as the positive class.
pred <- prediction(crime_score, crime_test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
@koustuvsinha
Copy link
Author

Dataset - CleanedDataNew.csv

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment