Enron Corpus Processing
library(stringr)
library(plyr)
library(tm)
library(tm.plugin.mail)
library(SnowballC)
library(topicmodels)
# At this point, the Python script should have been run,
# creating about 126 thousand txt files. I was very much afraid
# to import that many txt files into the tm package in R (my computer only
# has 8GB of RAM), so I decided to mark 60k of them for a sample and move the
# rest of them into a separate directory.
email_txts = list.files('data/')
email_txts_sample = sample(email_txts, 60000)
# The pattern is anchored and the dot escaped so that only the ".txt"
# extension gets replaced (a bare "." would match any character), and
# stringsAsFactors=F keeps the filenames as plain character vectors.
email_rename = data.frame(orig=email_txts_sample, new=sub("\\.txt$", ".rxr", email_txts_sample), stringsAsFactors=F)
file.rename(str_c('data/', email_rename$orig), str_c('data/', email_rename$new))
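# Optional sanity check: assuming no .rxr files existed in data/ beforehand,
# there should now be exactly 60000 renamed files.
stopifnot(length(list.files('data/', pattern = "\\.rxr$")) == 60000)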
# At this point, all of the non-sampled emails (still labelled .txt, not .rxr)
# need to go into a different directory. I created a directory that I called
# nonsampled/ and moved the files there via the terminal command "mv *.txt nonsampled/"
# (an R alternative is sketched below). It's very important that you don't try to do
# this via a file explorer, Windows or Linux, as the act of trying to display that
# many file icons is apparently very difficult for a regular machine :$
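# If you'd rather stay in R for the move, something like the following should
# work. This is just a sketch: it assumes data/ still holds the non-sampled
# .txt files and creates the nonsampled/ directory alongside them.
dir.create('data/nonsampled', showWarnings = FALSE)
leftover_txts = list.files('data/', pattern = "\\.txt$")
file.rename(str_c('data/', leftover_txts), str_c('data/nonsampled/', leftover_txts))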
enron = Corpus(DirSource("/home/inkhorn/enron/data"))
extendedstopwords=c("a","about","above","across","after","MIME Version","forwarded",
  "again","against","all","almost","alone","along","already","also","although","always",
  "am","among","an","and","another","any","anybody","anyone","anything","anywhere","are",
  "area","areas","aren't","around","as","ask","asked","asking","asks","at","away","b",
  "back","backed","backing","backs","be","became","because","become","becomes","been",
  "before","began","behind","being","beings","below","best","better","between","big",
  "both","but","by","c","came","can","cannot","can't","case","cases","certain","certainly",
  "clear","clearly","come","could","couldn't","d","did","didn't","differ","different",
  "differently","do","does","doesn't","doing","done","don't","down","downed","downing",
  "downs","during","e","each","early","either","end","ended","ending","ends","enough",
  "even","evenly","ever","every","everybody","everyone","everything","everywhere","f",
  "face","faces","fact","facts","far","felt","few","find","finds","first","for","four",
  "from","full","fully","further","furthered","furthering","furthers","g","gave","general",
  "generally","get","gets","give","given","gives","go","going","good","goods","got","great",
  "greater","greatest","group","grouped","grouping","groups","h","had","hadn't","has",
  "hasn't","have","haven't","having","he","he'd","he'll","her","here","here's","hers",
  "herself","he's","high","higher","highest","him","himself","his","how","however","how's",
  "i","i'd","if","i'll","i'm","important","in","interest","interested","interesting",
  "interests","into","is","isn't","it","its","it's","itself","i've","j","just","k","keep",
  "keeps","kind","knew","know","known","knows","l","large","largely","last","later","latest",
  "least","less","let","lets","let's","like","likely","long","longer","longest","m","made",
  "make","making","man","many","may","me","member","members","men","might","more","most",
  "mostly","mr","mrs","much","must","mustn't","my","myself","n","necessary","need","needed",
  "needing","needs","never","new","newer","newest","next","no","nobody","non","noone","nor",
  "not","nothing","now","nowhere","number","numbers","o","of","off","often","old","older",
  "oldest","on","once","one","only","open","opened","opening","opens","or","order","ordered",
  "ordering","orders","other","others","ought","our","ours","ourselves","out","over","own",
  "p","part","parted","parting","parts","per","perhaps","place","places","point","pointed",
  "pointing","points","possible","present","presented","presenting","presents","problem",
  "problems","put","puts","q","quite","r","rather","really","right","room","rooms","s",
  "said","same","saw","say","says","second","seconds","see","seem","seemed","seeming",
  "seems","sees","several","shall","shan't","she","she'd","she'll","she's","should",
  "shouldn't","show","showed","showing","shows","side","sides","since","small","smaller",
  "smallest","so","some","somebody","someone","something","somewhere","state","states",
  "still","such","sure","t","take","taken","than","that","that's","the","their","theirs",
  "them","themselves","then","there","therefore","there's","these","they","they'd",
  "they'll","they're","they've","thing","things","think","thinks","this","those","though",
  "thought","thoughts","three","through","thus","to","today","together","too","took",
  "toward","turn","turned","turning","turns","two","u","under","until","up","upon","us",
  "use","used","uses","v","very","w","want","wanted","wanting","wants","was","wasn't",
  "way","ways","we","we'd","well","we'll","wells","went","were","we're","weren't","we've",
  "what","what's","when","when's","where","where's","whether","which","while","who",
  "whole","whom","who's","whose","why","why's","will","with","within","without","won't",
  "work","worked","working","works","would","wouldn't","x","y","year","years","yes","yet",
  "you","you'd","you'll","young","younger","youngest","your","you're","yours","yourself",
  "yourselves","you've","z")
dtm.control = list(
  tolower = T,
  removePunctuation = T,
  removeNumbers = T,
  stopwords = c(stopwords("english"), extendedstopwords),
  stemming = T,
  wordLengths = c(3, Inf),
  weighting = weightTf)
dtm = DocumentTermMatrix(enron, control=dtm.control)
dtm = removeSparseTerms(dtm, 0.999)
# Drop any documents left empty after the stopword and sparsity filtering.
# slam::row_sums (slam is a dependency of tm) keeps the matrix sparse;
# calling as.matrix() on a 60k-document dtm would be punishing on 8GB of RAM.
dtm = dtm[slam::row_sums(dtm) > 0,]
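# Optional sanity check: peek at which terms survived the pruning.
# findFreqTerms lists terms whose total frequency is at least lowfreq;
# 1000 is an arbitrary threshold chosen just to eyeball the vocabulary.
findFreqTerms(dtm, lowfreq = 1000)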
k = 4
# Beware: this step takes a lot of patience! My computer was chugging along
# for probably 10 or so minutes before it completed the LDA here.
lda.model = LDA(dtm, k)
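# Note that the default VEM fit is stochastic, so the topics can shift between
# runs. For reproducible results you can pass a seed through the control list,
# e.g.:
# lda.model = LDA(dtm, k, control = list(seed = 1234))
# (Gibbs sampling, method = "Gibbs", is an alternative estimator in topicmodels.)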
# This enables you to examine the words that make up each topic that was
# calculated. Bear in mind that I've chosen to stem all words possible in
# this corpus, so some of the words output will look a little weird.
terms(lda.model, 20)
# Here I construct a dataframe that scores each document according to how closely
# its content matches up with each topic. The scores are posterior probabilities,
# so each document's scores sum to 1; the closer a score is to 1, the more likely
# that document's content matches up with that particular topic.
emails.topics = posterior(lda.model, dtm)$topics
df.emails.topics = as.data.frame(emails.topics)
df.emails.topics = cbind(email=as.character(rownames(df.emails.topics)),
                         df.emails.topics, stringsAsFactors=F)
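# One handy follow-up: rank the emails by their affinity to a given topic.
# This is a sketch; it assumes topic 1's scores sit in the second column,
# right after the email name column added above.
top.topic1 = df.emails.topics[order(df.emails.topics[[2]], decreasing = TRUE), ]
head(top.topic1$email, 10)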