alexhanna · March 21, 2013 00:43
diff --git a/polClassify.R b/polClassify.R
 # File-Name:       polClassify.R           
 # Edited:          2013-03-20                          
 # Orig.Author:     Drew Conway ([email protected])
 # 
 # Packages Used:   tm, ggplot2
 # 

 # All source code is copyright (c) 2012, under the Simplified BSD License.  
 # For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php

 # All images and materials produced by this code are licensed under the Creative Commons 
 # Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/

 # All rights reserved.

 # NOTE: If you are running this in the R console you must use the 'setwd' command to set the 
 # working directory for the console to whereever you have saved this file prior to running.
 # Otherwise you will see errors when loading data or saving figures!

 ## modified by Alex Hanna ([email protected]) for use with classifying political polarization

 # Load libraries
 library(tm)
 library(ggplot2)
 library(stringr)

 colNames <- c(
 "status_id",
 "created_at",
 "text",
 "source",
 "coords",
 "user.id",
 "user.name",
 "user.screen_name",
 "user.level",
 "user.description",
 "user.location",
 "user.url",
 "user.followers_count",
 "user.friends_count",
 "user.listed_count",
 "user.statuses_count",
 "rt.status_id",
 "rt.created_at",
 "rt.text",
 "rt.source",
 "rt.coords",
 "rt.user.id",
 "rt.user.name",
 "rt.user.screen_name",
 "rt.user.level",
 "rt.user.description",
 "rt.user.location",
 "rt.user.url",
 "rt.user.followers_count",
 "rt.user.friends_count",
 "rt.user.listed_count",
 "rt.user.statuses_count"
        )


 # Return a single element vector of just the email body
 # This is a very simple approach, as we are only using 
 # words as features
 get.msg <- function(path)
 {
  con <- file(path, open = "rt", encoding = "latin1")
  text <- readLines(con)
  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  close(con)
  return(paste(msg, collapse = "\n"))
 }

 # Create a TermDocumentMatrix (TDM) from the corpus of SPAM email.
 # The TDM control can be modified, and the sparsity level can be 
 # altered.  This TDM is used to create the feature set used to do 
 # train our classifier.
 get.tdm <- function(doc.vec)
 {
  control <- list(stopwords = TRUE,
                  removePunctuation = TRUE,
                  removeNumbers = TRUE,
                  minDocFreq = 2)
  doc.corpus <- Corpus(VectorSource(doc.vec))
  doc.dtm <- TermDocumentMatrix(doc.corpus, control)
  return(doc.dtm)
 }

 # This function takes a file path to an email file and a string, 
 # the term parameter, and returns the count of that term in 
 # the email body.
 count.word <- function(path, term)
 {
  msg <- get.msg(path)
  msg.corpus <- Corpus(VectorSource(msg))
  # Hard-coded TDM control
  control <- list(stopwords = TRUE,
                  removePunctuation = TRUE,
                  removeNumbers = TRUE)
  msg.tdm <- TermDocumentMatrix(msg.corpus, control)
  word.freq <- rowSums(as.matrix(msg.tdm))
  term.freq <- word.freq[which(names(word.freq) == term)]
  # We use ifelse here because term.freq = NA if nothing is found
  return(ifelse(length(term.freq) > 0, term.freq, 0))
 }

 # This is the our workhorse function for classifying email.  It takes 
 # two required paramters: a file path to an email to classify, and
 # a data frame of the trained data.  The function also takes two 
 # optional parameters.  First, a prior over the probability that an email
 # is SPAM, which we set to 0.5 (naive), and constant value for the
 # probability on words in the email that are not in our training data.
 # The function returns the naive Bayes probability that the given email
 # is SPAM.  
 classify.email <- function(msg, training.df, prior = 0.5, c = 1e-6)
 {
  # Here, we use many of the support functions to get the
  # email text data in a workable format
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  # Find intersections of words
  msg.match <- intersect(names(msg.freq), training.df$term)
  # Now, we just perform the naive Bayes calculation
  if(length(msg.match) < 1)
  {
    return(prior * c ^ (length(msg.freq)))
  }
  else
  {
    match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
    return(prior * prod(match.probs) * c ^ (length(msg.freq) - length(msg.match)))
  }
 }

 df <- read.delim('/project/hanna/wirecall/wirecall.highdetail.csv', 
        header = F, sep = "\t",
        col.names = colNames,
        stringsAsFactors = F, quote = "")

 df$user.screen_name <- tolower(df$user.screen_name)

 ## left and right users
 left.users  <- c("legaleagle", "spudlovr", "barrett4wi", "wisdems", "repgwenmoore", "markpocan")
 right.users <- c("scottkwalker", "govwalker", "turnsc", "gopbrad", "repvos", 
  "tommyforwi", "afpwi", "tppatriots", "vickimckenna", "win8ive", "maciverwisc", 
  "mattbatzel", "gopbrad", "scottwalkerhq", "jbvanhollen", "ourcountrypac",
  "pbartel", "BrickM", "wisgop", "rebeccaforreal", "danerepublicans", "the_rga", "prolifewi",
  "mediatrackers")

 left.docs  <- df[which(with(df, user.screen_name %in% left.users)), ]$text
 right.docs <- df[which(with(df, user.screen_name %in% right.users)), ]$text

 ## get lengths
 left.length  <- length(left.docs)
 right.length <- length(right.docs)

 ## get a random sample for training
 left.docs  <- sample(left.docs, left.length)
 right.docs <- sample(right.docs, right.length)

 left.training  <- left.docs[1:(left.length / 2)]
 right.training <- right.docs[1:(right.length / 2)]

 # With all of our support functions written, we can perform the classification.
 # First, we create document corpus for left messages

 # Create a DocumentTermMatrix from left vector
 left.tdm <- get.tdm(left.training)

 # Create a data frame that provides the feature set from the training LEFT data
 left.matrix <- as.matrix(left.tdm)
 left.counts <- rowSums(left.matrix)
 left.df <- data.frame(cbind(names(left.counts),
                            as.numeric(left.counts)),
                      stringsAsFactors = FALSE)
 names(left.df) <- c("term", "frequency")
 left.df$frequency <- as.numeric(left.df$frequency)
 left.occurrence <- sapply(1:nrow(left.matrix),
                          function(i)
                          {
                            length(which(left.matrix[i, ] > 0)) / ncol(left.matrix)
                          })
 left.density <- left.df$frequency / sum(left.df$frequency)

 # Add the term density and occurrence rate
 left.df <- transform(left.df,
                     density = left.density,
                     occurrence = left.occurrence)

 # Now do the same for the RIGHT email
 right.tdm <- get.tdm(right.training)

 right.matrix <- as.matrix(right.tdm)
 right.counts <- rowSums(right.matrix)
 right.df <- data.frame(cbind(names(right.counts),
                               as.numeric(right.counts)),
                         stringsAsFactors = FALSE)
 names(right.df) <- c("term", "frequency")
 right.df$frequency <- as.numeric(right.df$frequency)
 right.occurrence <- sapply(1:nrow(right.matrix),
                            function(i)
                            {
                              length(which(right.matrix[i, ] > 0)) / ncol(right.matrix)
                            })
 right.density <- right.df$frequency / sum(right.df$frequency)

 right.df <- transform(right.df,
                        density = right.density,
                        occurrence = right.occurrence)

 ## peel off 10000 to classify
 rest.docs <- sample(df$text, 10000)

 # Run classifer against whole data set
 rest.lefttest <- sapply(rest.docs[(left.length / 2):left.length],
                           function(p) classify.email(p, training.df = left.df))
    
 rest.righttest <- sapply(rest.docs[(left.length / 2):left.length],
                          function(p) classify.email(p, training.df = right.df))
    
 rest.res <- ifelse(rest.lefttest > rest.righttest, TRUE, FALSE)
 summary(rest.res)

 # Finally, attempt to classify the rest data using the classifer developed above.
 # The rule is to classify a message as left if Pr(msg) = left > Pr(msg) = right
 left.classifier <- function(msg)
 {
  pr.left  <- classify.email(msg, left.df, prior = 0.5)
  pr.right <- classify.email(msg, right.df, prior = 0.5)
  return(c(pr.left, pr.right, ifelse(pr.left > pr.right, 1, 0)))
 }

 # Classify them all!
 left.class <- suppressWarnings(lapply(left.docs[(left.length / 2):left.length],
                                   function(p)
                                   {
                                     left.classifier(p)
                                   }))

 right.class <- suppressWarnings(lapply(right.docs[(right.length / 2):right.length],
                                   function(p)
                                   {
                                     left.classifier(p)
                                   }))

 # Create a single, final, data frame with all of the classification data in it
 left.matrix <- do.call(rbind, left.class)
 left.final <- cbind(left.matrix, "left")

 right.matrix <- do.call(rbind, right.class)
 right.final <- cbind(right.matrix, "right")

 class.matrix <- rbind(left.final, right.final)
 class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
 names(class.df) <- c("Pr.Left" ,"Pr.Right", "Class", "Type")
 class.df$Pr.Left <- as.numeric(class.df$Pr.Left)
 class.df$Pr.Right <- as.numeric(class.df$Pr.Right)
 class.df$Class <- as.logical(as.numeric(class.df$Class))
 class.df$Type <- as.factor(class.df$Type)

 # Create final plot of results
 class.plot <- ggplot(class.df, aes(x = log(Pr.Left), log(Pr.Right))) +
    geom_point(aes(color = Type, alpha = 0.5)) +
    stat_abline(yintercept = 0, slope = 1) +
    scale_color_manual(values = c("left" = "blue",
                                  "right" = "red"),
                       name = "Message Type") +
    scale_alpha(guide = "none") +
    xlab("log[Pr(Left)]") +
    ylab("log[Pr(Right)]") +
    theme_bw()

 ggsave(plot = class.plot,
       filename = file.path("../img", "final_class_LR.png"),
       height = 10,
       width = 10)

 get.results <- function(bool.vector)
 {
  results <- c(length(bool.vector[which(bool.vector == TRUE)]) / length(bool.vector),
               length(bool.vector[which(bool.vector == FALSE)]) / length(bool.vector))
  return(results)
 }

 # Save results as a 2x2 table
 left.col <- get.results(subset(class.df, Type == "left")$Class)
 right.col <- get.results(subset(class.df, Type == "right")$Class)

 class.res <- rbind(left.col, right.col)
 colnames(class.res) <- c("Left", "Right")
 print(class.res)

 # Save the training data for use in Chapter 4
 write.csv(left.df, file.path("data", "left_df.csv"), row.names = FALSE)
 write.csv(right.df, file.path("data", "right_df.csv"), row.names = FALSE)
	# File-Name: polClassify.R
	# Edited: 2013-03-20
	# Orig.Author: Drew Conway ([email protected])
	#
	# Packages Used: tm, ggplot2
	#

	# All source code is copyright (c) 2012, under the Simplified BSD License.
	# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php

	# All images and materials produced by this code are licensed under the Creative Commons
	# Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/

	# All rights reserved.

	# NOTE: If you are running this in the R console you must use the 'setwd' command to set the
	# working directory for the console to whereever you have saved this file prior to running.
	# Otherwise you will see errors when loading data or saving figures!

	## modified by Alex Hanna ([email protected]) for use with classifying political polarization

	# Load libraries
	library(tm)
	library(ggplot2)
	library(stringr)

	colNames <- c(
	"status_id",
	"created_at",
	"text",
	"source",
	"coords",
	"user.id",
	"user.name",
	"user.screen_name",
	"user.level",
	"user.description",
	"user.location",
	"user.url",
	"user.followers_count",
	"user.friends_count",
	"user.listed_count",
	"user.statuses_count",
	"rt.status_id",
	"rt.created_at",
	"rt.text",
	"rt.source",
	"rt.coords",
	"rt.user.id",
	"rt.user.name",
	"rt.user.screen_name",
	"rt.user.level",
	"rt.user.description",
	"rt.user.location",
	"rt.user.url",
	"rt.user.followers_count",
	"rt.user.friends_count",
	"rt.user.listed_count",
	"rt.user.statuses_count"
	)


	# Return a single element vector of just the email body
	# This is a very simple approach, as we are only using
	# words as features
	get.msg <- function(path)
	{
	con <- file(path, open = "rt", encoding = "latin1")
	text <- readLines(con)
	# The message always begins after the first full line break
	msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
	close(con)
	return(paste(msg, collapse = "\n"))
	}

	# Create a TermDocumentMatrix (TDM) from the corpus of SPAM email.
	# The TDM control can be modified, and the sparsity level can be
	# altered. This TDM is used to create the feature set used to do
	# train our classifier.
	get.tdm <- function(doc.vec)
	{
	control <- list(stopwords = TRUE,
	removePunctuation = TRUE,
	removeNumbers = TRUE,
	minDocFreq = 2)
	doc.corpus <- Corpus(VectorSource(doc.vec))
	doc.dtm <- TermDocumentMatrix(doc.corpus, control)
	return(doc.dtm)
	}

	# This function takes a file path to an email file and a string,
	# the term parameter, and returns the count of that term in
	# the email body.
	count.word <- function(path, term)
	{
	msg <- get.msg(path)
	msg.corpus <- Corpus(VectorSource(msg))
	# Hard-coded TDM control
	control <- list(stopwords = TRUE,
	removePunctuation = TRUE,
	removeNumbers = TRUE)
	msg.tdm <- TermDocumentMatrix(msg.corpus, control)
	word.freq <- rowSums(as.matrix(msg.tdm))
	term.freq <- word.freq[which(names(word.freq) == term)]
	# We use ifelse here because term.freq = NA if nothing is found
	return(ifelse(length(term.freq) > 0, term.freq, 0))
	}

	# This is the our workhorse function for classifying email. It takes
	# two required paramters: a file path to an email to classify, and
	# a data frame of the trained data. The function also takes two
	# optional parameters. First, a prior over the probability that an email
	# is SPAM, which we set to 0.5 (naive), and constant value for the
	# probability on words in the email that are not in our training data.
	# The function returns the naive Bayes probability that the given email
	# is SPAM.
	classify.email <- function(msg, training.df, prior = 0.5, c = 1e-6)
	{
	# Here, we use many of the support functions to get the
	# email text data in a workable format
	msg.tdm <- get.tdm(msg)
	msg.freq <- rowSums(as.matrix(msg.tdm))
	# Find intersections of words
	msg.match <- intersect(names(msg.freq), training.df$term)
	# Now, we just perform the naive Bayes calculation
	if(length(msg.match) < 1)
	{
	return(prior * c ^ (length(msg.freq)))
	}
	else
	{
	match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
	return(prior * prod(match.probs) * c ^ (length(msg.freq) - length(msg.match)))
	}
	}

	df <- read.delim('/project/hanna/wirecall/wirecall.highdetail.csv',
	header = F, sep = "\t",
	col.names = colNames,
	stringsAsFactors = F, quote = "")

	df$user.screen_name <- tolower(df$user.screen_name)

	## left and right users
	left.users <- c("legaleagle", "spudlovr", "barrett4wi", "wisdems", "repgwenmoore", "markpocan")
	right.users <- c("scottkwalker", "govwalker", "turnsc", "gopbrad", "repvos",
	"tommyforwi", "afpwi", "tppatriots", "vickimckenna", "win8ive", "maciverwisc",
	"mattbatzel", "gopbrad", "scottwalkerhq", "jbvanhollen", "ourcountrypac",
	"pbartel", "BrickM", "wisgop", "rebeccaforreal", "danerepublicans", "the_rga", "prolifewi",
	"mediatrackers")

	left.docs <- df[which(with(df, user.screen_name %in% left.users)), ]$text
	right.docs <- df[which(with(df, user.screen_name %in% right.users)), ]$text

	## get lengths
	left.length <- length(left.docs)
	right.length <- length(right.docs)

	## get a random sample for training
	left.docs <- sample(left.docs, left.length)
	right.docs <- sample(right.docs, right.length)

	left.training <- left.docs[1:(left.length / 2)]
	right.training <- right.docs[1:(right.length / 2)]

	# With all of our support functions written, we can perform the classification.
	# First, we create document corpus for left messages

	# Create a DocumentTermMatrix from left vector
	left.tdm <- get.tdm(left.training)

	# Create a data frame that provides the feature set from the training LEFT data
	left.matrix <- as.matrix(left.tdm)
	left.counts <- rowSums(left.matrix)
	left.df <- data.frame(cbind(names(left.counts),
	as.numeric(left.counts)),
	stringsAsFactors = FALSE)
	names(left.df) <- c("term", "frequency")
	left.df$frequency <- as.numeric(left.df$frequency)
	left.occurrence <- sapply(1:nrow(left.matrix),
	function(i)
	{
	length(which(left.matrix[i, ] > 0)) / ncol(left.matrix)
	})
	left.density <- left.df$frequency / sum(left.df$frequency)

	# Add the term density and occurrence rate
	left.df <- transform(left.df,
	density = left.density,
	occurrence = left.occurrence)

	# Now do the same for the RIGHT email
	right.tdm <- get.tdm(right.training)

	right.matrix <- as.matrix(right.tdm)
	right.counts <- rowSums(right.matrix)
	right.df <- data.frame(cbind(names(right.counts),
	as.numeric(right.counts)),
	stringsAsFactors = FALSE)
	names(right.df) <- c("term", "frequency")
	right.df$frequency <- as.numeric(right.df$frequency)
	right.occurrence <- sapply(1:nrow(right.matrix),
	function(i)
	{
	length(which(right.matrix[i, ] > 0)) / ncol(right.matrix)
	})
	right.density <- right.df$frequency / sum(right.df$frequency)

	right.df <- transform(right.df,
	density = right.density,
	occurrence = right.occurrence)

	## peel off 10000 to classify
	rest.docs <- sample(df$text, 10000)

	# Run classifer against whole data set
	rest.lefttest <- sapply(rest.docs[(left.length / 2):left.length],
	function(p) classify.email(p, training.df = left.df))

	rest.righttest <- sapply(rest.docs[(left.length / 2):left.length],
	function(p) classify.email(p, training.df = right.df))

	rest.res <- ifelse(rest.lefttest > rest.righttest, TRUE, FALSE)
	summary(rest.res)

	# Finally, attempt to classify the rest data using the classifer developed above.
	# The rule is to classify a message as left if Pr(msg) = left > Pr(msg) = right
	left.classifier <- function(msg)
	{
	pr.left <- classify.email(msg, left.df, prior = 0.5)
	pr.right <- classify.email(msg, right.df, prior = 0.5)
	return(c(pr.left, pr.right, ifelse(pr.left > pr.right, 1, 0)))
	}

	# Classify them all!
	left.class <- suppressWarnings(lapply(left.docs[(left.length / 2):left.length],
	function(p)
	{
	left.classifier(p)
	}))

	right.class <- suppressWarnings(lapply(right.docs[(right.length / 2):right.length],
	function(p)
	{
	left.classifier(p)
	}))

	# Create a single, final, data frame with all of the classification data in it
	left.matrix <- do.call(rbind, left.class)
	left.final <- cbind(left.matrix, "left")

	right.matrix <- do.call(rbind, right.class)
	right.final <- cbind(right.matrix, "right")

	class.matrix <- rbind(left.final, right.final)
	class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
	names(class.df) <- c("Pr.Left" ,"Pr.Right", "Class", "Type")
	class.df$Pr.Left <- as.numeric(class.df$Pr.Left)
	class.df$Pr.Right <- as.numeric(class.df$Pr.Right)
	class.df$Class <- as.logical(as.numeric(class.df$Class))
	class.df$Type <- as.factor(class.df$Type)

	# Create final plot of results
	class.plot <- ggplot(class.df, aes(x = log(Pr.Left), log(Pr.Right))) +
	geom_point(aes(color = Type, alpha = 0.5)) +
	stat_abline(yintercept = 0, slope = 1) +
	scale_color_manual(values = c("left" = "blue",
	"right" = "red"),
	name = "Message Type") +
	scale_alpha(guide = "none") +
	xlab("log[Pr(Left)]") +
	ylab("log[Pr(Right)]") +
	theme_bw()

	ggsave(plot = class.plot,
	filename = file.path("../img", "final_class_LR.png"),
	height = 10,
	width = 10)

	get.results <- function(bool.vector)
	{
	results <- c(length(bool.vector[which(bool.vector == TRUE)]) / length(bool.vector),
	length(bool.vector[which(bool.vector == FALSE)]) / length(bool.vector))
	return(results)
	}

	# Save results as a 2x2 table
	left.col <- get.results(subset(class.df, Type == "left")$Class)
	right.col <- get.results(subset(class.df, Type == "right")$Class)

	class.res <- rbind(left.col, right.col)
	colnames(class.res) <- c("Left", "Right")
	print(class.res)

	# Save the training data for use in Chapter 4
	write.csv(left.df, file.path("data", "left_df.csv"), row.names = FALSE)
	write.csv(right.df, file.path("data", "right_df.csv"), row.names = FALSE)