Created
March 21, 2013 00:43
-
-
Save alexhanna/5209830 to your computer and use it in GitHub Desktop.
Political classifier, largely adapted from Machine Learning for Hackers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# File-Name: polClassify.R | |
# Edited: 2013-03-20 | |
# Orig.Author: Drew Conway ([email protected]) | |
# | |
# Packages Used: tm, ggplot2 | |
# | |
# All source code is copyright (c) 2012, under the Simplified BSD License. | |
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php | |
# All images and materials produced by this code are licensed under the Creative Commons | |
# Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/ | |
# All rights reserved. | |
# NOTE: If you are running this in the R console you must use the 'setwd' command to set the | |
# working directory for the console to whereever you have saved this file prior to running. | |
# Otherwise you will see errors when loading data or saving figures! | |
## modified by Alex Hanna ([email protected]) for use with classifying political polarization | |
# Load libraries | |
library(tm) | |
library(ggplot2) | |
library(stringr) | |
colNames <- c( | |
"status_id", | |
"created_at", | |
"text", | |
"source", | |
"coords", | |
"user.id", | |
"user.name", | |
"user.screen_name", | |
"user.level", | |
"user.description", | |
"user.location", | |
"user.url", | |
"user.followers_count", | |
"user.friends_count", | |
"user.listed_count", | |
"user.statuses_count", | |
"rt.status_id", | |
"rt.created_at", | |
"rt.text", | |
"rt.source", | |
"rt.coords", | |
"rt.user.id", | |
"rt.user.name", | |
"rt.user.screen_name", | |
"rt.user.level", | |
"rt.user.description", | |
"rt.user.location", | |
"rt.user.url", | |
"rt.user.followers_count", | |
"rt.user.friends_count", | |
"rt.user.listed_count", | |
"rt.user.statuses_count" | |
) | |
# Return a single element vector of just the email body | |
# This is a very simple approach, as we are only using | |
# words as features | |
get.msg <- function(path) | |
{ | |
con <- file(path, open = "rt", encoding = "latin1") | |
text <- readLines(con) | |
# The message always begins after the first full line break | |
msg <- text[seq(which(text == "")[1] + 1, length(text), 1)] | |
close(con) | |
return(paste(msg, collapse = "\n")) | |
} | |
# Create a TermDocumentMatrix (TDM) from the corpus of SPAM email. | |
# The TDM control can be modified, and the sparsity level can be | |
# altered. This TDM is used to create the feature set used to do | |
# train our classifier. | |
get.tdm <- function(doc.vec) | |
{ | |
control <- list(stopwords = TRUE, | |
removePunctuation = TRUE, | |
removeNumbers = TRUE, | |
minDocFreq = 2) | |
doc.corpus <- Corpus(VectorSource(doc.vec)) | |
doc.dtm <- TermDocumentMatrix(doc.corpus, control) | |
return(doc.dtm) | |
} | |
# This function takes a file path to an email file and a string, | |
# the term parameter, and returns the count of that term in | |
# the email body. | |
count.word <- function(path, term) | |
{ | |
msg <- get.msg(path) | |
msg.corpus <- Corpus(VectorSource(msg)) | |
# Hard-coded TDM control | |
control <- list(stopwords = TRUE, | |
removePunctuation = TRUE, | |
removeNumbers = TRUE) | |
msg.tdm <- TermDocumentMatrix(msg.corpus, control) | |
word.freq <- rowSums(as.matrix(msg.tdm)) | |
term.freq <- word.freq[which(names(word.freq) == term)] | |
# We use ifelse here because term.freq = NA if nothing is found | |
return(ifelse(length(term.freq) > 0, term.freq, 0)) | |
} | |
# This is the our workhorse function for classifying email. It takes | |
# two required paramters: a file path to an email to classify, and | |
# a data frame of the trained data. The function also takes two | |
# optional parameters. First, a prior over the probability that an email | |
# is SPAM, which we set to 0.5 (naive), and constant value for the | |
# probability on words in the email that are not in our training data. | |
# The function returns the naive Bayes probability that the given email | |
# is SPAM. | |
classify.email <- function(msg, training.df, prior = 0.5, c = 1e-6) | |
{ | |
# Here, we use many of the support functions to get the | |
# email text data in a workable format | |
msg.tdm <- get.tdm(msg) | |
msg.freq <- rowSums(as.matrix(msg.tdm)) | |
# Find intersections of words | |
msg.match <- intersect(names(msg.freq), training.df$term) | |
# Now, we just perform the naive Bayes calculation | |
if(length(msg.match) < 1) | |
{ | |
return(prior * c ^ (length(msg.freq))) | |
} | |
else | |
{ | |
match.probs <- training.df$occurrence[match(msg.match, training.df$term)] | |
return(prior * prod(match.probs) * c ^ (length(msg.freq) - length(msg.match))) | |
} | |
} | |
df <- read.delim('/project/hanna/wirecall/wirecall.highdetail.csv', | |
header = F, sep = "\t", | |
col.names = colNames, | |
stringsAsFactors = F, quote = "") | |
df$user.screen_name <- tolower(df$user.screen_name) | |
## left and right users | |
left.users <- c("legaleagle", "spudlovr", "barrett4wi", "wisdems", "repgwenmoore", "markpocan") | |
right.users <- c("scottkwalker", "govwalker", "turnsc", "gopbrad", "repvos", | |
"tommyforwi", "afpwi", "tppatriots", "vickimckenna", "win8ive", "maciverwisc", | |
"mattbatzel", "gopbrad", "scottwalkerhq", "jbvanhollen", "ourcountrypac", | |
"pbartel", "BrickM", "wisgop", "rebeccaforreal", "danerepublicans", "the_rga", "prolifewi", | |
"mediatrackers") | |
left.docs <- df[which(with(df, user.screen_name %in% left.users)), ]$text | |
right.docs <- df[which(with(df, user.screen_name %in% right.users)), ]$text | |
## get lengths | |
left.length <- length(left.docs) | |
right.length <- length(right.docs) | |
## get a random sample for training | |
left.docs <- sample(left.docs, left.length) | |
right.docs <- sample(right.docs, right.length) | |
left.training <- left.docs[1:(left.length / 2)] | |
right.training <- right.docs[1:(right.length / 2)] | |
# With all of our support functions written, we can perform the classification. | |
# First, we create document corpus for left messages | |
# Create a DocumentTermMatrix from left vector | |
left.tdm <- get.tdm(left.training) | |
# Create a data frame that provides the feature set from the training LEFT data | |
left.matrix <- as.matrix(left.tdm) | |
left.counts <- rowSums(left.matrix) | |
left.df <- data.frame(cbind(names(left.counts), | |
as.numeric(left.counts)), | |
stringsAsFactors = FALSE) | |
names(left.df) <- c("term", "frequency") | |
left.df$frequency <- as.numeric(left.df$frequency) | |
left.occurrence <- sapply(1:nrow(left.matrix), | |
function(i) | |
{ | |
length(which(left.matrix[i, ] > 0)) / ncol(left.matrix) | |
}) | |
left.density <- left.df$frequency / sum(left.df$frequency) | |
# Add the term density and occurrence rate | |
left.df <- transform(left.df, | |
density = left.density, | |
occurrence = left.occurrence) | |
# Now do the same for the RIGHT email | |
right.tdm <- get.tdm(right.training) | |
right.matrix <- as.matrix(right.tdm) | |
right.counts <- rowSums(right.matrix) | |
right.df <- data.frame(cbind(names(right.counts), | |
as.numeric(right.counts)), | |
stringsAsFactors = FALSE) | |
names(right.df) <- c("term", "frequency") | |
right.df$frequency <- as.numeric(right.df$frequency) | |
right.occurrence <- sapply(1:nrow(right.matrix), | |
function(i) | |
{ | |
length(which(right.matrix[i, ] > 0)) / ncol(right.matrix) | |
}) | |
right.density <- right.df$frequency / sum(right.df$frequency) | |
right.df <- transform(right.df, | |
density = right.density, | |
occurrence = right.occurrence) | |
## peel off 10000 to classify | |
rest.docs <- sample(df$text, 10000) | |
# Run classifer against whole data set | |
rest.lefttest <- sapply(rest.docs[(left.length / 2):left.length], | |
function(p) classify.email(p, training.df = left.df)) | |
rest.righttest <- sapply(rest.docs[(left.length / 2):left.length], | |
function(p) classify.email(p, training.df = right.df)) | |
rest.res <- ifelse(rest.lefttest > rest.righttest, TRUE, FALSE) | |
summary(rest.res) | |
# Finally, attempt to classify the rest data using the classifer developed above. | |
# The rule is to classify a message as left if Pr(msg) = left > Pr(msg) = right | |
left.classifier <- function(msg) | |
{ | |
pr.left <- classify.email(msg, left.df, prior = 0.5) | |
pr.right <- classify.email(msg, right.df, prior = 0.5) | |
return(c(pr.left, pr.right, ifelse(pr.left > pr.right, 1, 0))) | |
} | |
# Classify them all! | |
left.class <- suppressWarnings(lapply(left.docs[(left.length / 2):left.length], | |
function(p) | |
{ | |
left.classifier(p) | |
})) | |
right.class <- suppressWarnings(lapply(right.docs[(right.length / 2):right.length], | |
function(p) | |
{ | |
left.classifier(p) | |
})) | |
# Create a single, final, data frame with all of the classification data in it | |
left.matrix <- do.call(rbind, left.class) | |
left.final <- cbind(left.matrix, "left") | |
right.matrix <- do.call(rbind, right.class) | |
right.final <- cbind(right.matrix, "right") | |
class.matrix <- rbind(left.final, right.final) | |
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE) | |
names(class.df) <- c("Pr.Left" ,"Pr.Right", "Class", "Type") | |
class.df$Pr.Left <- as.numeric(class.df$Pr.Left) | |
class.df$Pr.Right <- as.numeric(class.df$Pr.Right) | |
class.df$Class <- as.logical(as.numeric(class.df$Class)) | |
class.df$Type <- as.factor(class.df$Type) | |
# Create final plot of results | |
class.plot <- ggplot(class.df, aes(x = log(Pr.Left), log(Pr.Right))) + | |
geom_point(aes(color = Type, alpha = 0.5)) + | |
stat_abline(yintercept = 0, slope = 1) + | |
scale_color_manual(values = c("left" = "blue", | |
"right" = "red"), | |
name = "Message Type") + | |
scale_alpha(guide = "none") + | |
xlab("log[Pr(Left)]") + | |
ylab("log[Pr(Right)]") + | |
theme_bw() | |
ggsave(plot = class.plot, | |
filename = file.path("../img", "final_class_LR.png"), | |
height = 10, | |
width = 10) | |
get.results <- function(bool.vector) | |
{ | |
results <- c(length(bool.vector[which(bool.vector == TRUE)]) / length(bool.vector), | |
length(bool.vector[which(bool.vector == FALSE)]) / length(bool.vector)) | |
return(results) | |
} | |
# Save results as a 2x2 table | |
left.col <- get.results(subset(class.df, Type == "left")$Class) | |
right.col <- get.results(subset(class.df, Type == "right")$Class) | |
class.res <- rbind(left.col, right.col) | |
colnames(class.res) <- c("Left", "Right") | |
print(class.res) | |
# Save the training data for use in Chapter 4 | |
write.csv(left.df, file.path("data", "left_df.csv"), row.names = FALSE) | |
write.csv(right.df, file.path("data", "right_df.csv"), row.names = FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment