Enron Corpus Processing
library(stringr)
library(plyr)
library(tm)
library(tm.plugin.mail)
library(SnowballC)
library(topicmodels)
# At this point, the Python script should have been run,
# creating about 126 thousand txt files. I was very much afraid
# to import that many txt files into the tm package in R (my computer only
# has 8GB of RAM), so I decided to mark 60k of them for a sample and move the
# rest of them into a separate directory.
email_txts = list.files('data/')
email_txts_sample = sample(email_txts, 60000)
# The pattern is anchored and the dot escaped so that only the ".txt"
# extension gets replaced (a bare "." would match any character), and
# stringsAsFactors=F keeps the filenames as plain character vectors.
email_rename = data.frame(orig=email_txts_sample, new=sub("\\.txt$", ".rxr", email_txts_sample), stringsAsFactors=F)
file.rename(str_c('data/', email_rename$orig), str_c('data/', email_rename$new))
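# Optional sanity check: assuming no .rxr files existed in data/ beforehand,
# there should now be exactly 60000 renamed files.
stopifnot(length(list.files('data/', pattern = "\\.rxr$")) == 60000)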
# At this point, all of the non-sampled emails (still labelled .txt, not .rxr)
# need to go into a different directory. I created a directory that I called
# nonsampled/ and moved the files there via the terminal command "mv *.txt nonsampled/"
# (an R alternative is sketched below). It's very important that you don't try to do
# this via a file explorer, Windows or Linux, as the act of trying to display that
# many file icons is apparently very difficult for a regular machine :$
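# If you'd rather stay in R for the move, something like the following should
# work. This is just a sketch: it assumes data/ still holds the non-sampled
# .txt files and creates the nonsampled/ directory alongside them.
dir.create('data/nonsampled', showWarnings = FALSE)
leftover_txts = list.files('data/', pattern = "\\.txt$")
file.rename(str_c('data/', leftover_txts), str_c('data/nonsampled/', leftover_txts))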
enron = Corpus(DirSource("/home/inkhorn/enron/data"))
extendedstopwords=c("a","about","above","across","after","MIME Version","forwarded",
  "again","against","all","almost","alone","along","already","also","although","always",
  "am","among","an","and","another","any","anybody","anyone","anything","anywhere","are",
  "area","areas","aren't","around","as","ask","asked","asking","asks","at","away","b",
  "back","backed","backing","backs","be","became","because","become","becomes","been",
  "before","began","behind","being","beings","below","best","better","between","big",
  "both","but","by","c","came","can","cannot","can't","case","cases","certain","certainly",
  "clear","clearly","come","could","couldn't","d","did","didn't","differ","different",
  "differently","do","does","doesn't","doing","done","don't","down","downed","downing",
  "downs","during","e","each","early","either","end","ended","ending","ends","enough",
  "even","evenly","ever","every","everybody","everyone","everything","everywhere","f",
  "face","faces","fact","facts","far","felt","few","find","finds","first","for","four",
  "from","full","fully","further","furthered","furthering","furthers","g","gave","general",
  "generally","get","gets","give","given","gives","go","going","good","goods","got","great",
  "greater","greatest","group","grouped","grouping","groups","h","had","hadn't","has",
  "hasn't","have","haven't","having","he","he'd","he'll","her","here","here's","hers",
  "herself","he's","high","higher","highest","him","himself","his","how","however","how's",
  "i","i'd","if","i'll","i'm","important","in","interest","interested","interesting",
  "interests","into","is","isn't","it","its","it's","itself","i've","j","just","k","keep",
  "keeps","kind","knew","know","known","knows","l","large","largely","last","later","latest",
  "least","less","let","lets","let's","like","likely","long","longer","longest","m","made",
  "make","making","man","many","may","me","member","members","men","might","more","most",
  "mostly","mr","mrs","much","must","mustn't","my","myself","n","necessary","need","needed",
  "needing","needs","never","new","newer","newest","next","no","nobody","non","noone","nor",
  "not","nothing","now","nowhere","number","numbers","o","of","off","often","old","older",
  "oldest","on","once","one","only","open","opened","opening","opens","or","order","ordered",
  "ordering","orders","other","others","ought","our","ours","ourselves","out","over","own",
  "p","part","parted","parting","parts","per","perhaps","place","places","point","pointed",
  "pointing","points","possible","present","presented","presenting","presents","problem",
  "problems","put","puts","q","quite","r","rather","really","right","room","rooms","s",
  "said","same","saw","say","says","second","seconds","see","seem","seemed","seeming",
  "seems","sees","several","shall","shan't","she","she'd","she'll","she's","should",
  "shouldn't","show","showed","showing","shows","side","sides","since","small","smaller",
  "smallest","so","some","somebody","someone","something","somewhere","state","states",
  "still","such","sure","t","take","taken","than","that","that's","the","their","theirs",
  "them","themselves","then","there","therefore","there's","these","they","they'd",
  "they'll","they're","they've","thing","things","think","thinks","this","those","though",
  "thought","thoughts","three","through","thus","to","today","together","too","took",
  "toward","turn","turned","turning","turns","two","u","under","until","up","upon","us",
  "use","used","uses","v","very","w","want","wanted","wanting","wants","was","wasn't",
  "way","ways","we","we'd","well","we'll","wells","went","were","we're","weren't","we've",
  "what","what's","when","when's","where","where's","whether","which","while","who",
  "whole","whom","who's","whose","why","why's","will","with","within","without","won't",
  "work","worked","working","works","would","wouldn't","x","y","year","years","yes","yet",
  "you","you'd","you'll","young","younger","youngest","your","you're","yours","yourself",
  "yourselves","you've","z")
dtm.control = list(
  tolower = T,
  removePunctuation = T,
  removeNumbers = T,
  stopwords = c(stopwords("english"), extendedstopwords),
  stemming = T,
  wordLengths = c(3, Inf),
  weighting = weightTf)
dtm = DocumentTermMatrix(enron, control=dtm.control)
dtm = removeSparseTerms(dtm, 0.999)
# Drop any documents left empty after the stopword and sparsity filtering.
# slam::row_sums (slam is a dependency of tm) keeps the matrix sparse;
# calling as.matrix() on a 60k-document dtm would be punishing on 8GB of RAM.
dtm = dtm[slam::row_sums(dtm) > 0,]
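# Optional sanity check: peek at which terms survived the pruning.
# findFreqTerms lists terms whose total frequency is at least lowfreq;
# 1000 is an arbitrary threshold chosen just to eyeball the vocabulary.
findFreqTerms(dtm, lowfreq = 1000)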
k = 4
# Beware: this step takes a lot of patience! My computer was chugging along
# for probably 10 or so minutes before it completed the LDA here.
lda.model = LDA(dtm, k)
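# Note that the default VEM fit is stochastic, so the topics can shift between
# runs. For reproducible results you can pass a seed through the control list,
# e.g.:
# lda.model = LDA(dtm, k, control = list(seed = 1234))
# (Gibbs sampling, method = "Gibbs", is an alternative estimator in topicmodels.)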
# This enables you to examine the words that make up each topic that was
# calculated. Bear in mind that I've chosen to stem all words possible in
# this corpus, so some of the words output will look a little weird.
terms(lda.model, 20)
# Here I construct a dataframe that scores each document according to how closely
# its content matches up with each topic. The scores are posterior probabilities,
# so each document's scores sum to 1; the closer a score is to 1, the more likely
# that document's content matches up with that particular topic.
emails.topics = posterior(lda.model, dtm)$topics
df.emails.topics = as.data.frame(emails.topics)
df.emails.topics = cbind(email=as.character(rownames(df.emails.topics)),
                         df.emails.topics, stringsAsFactors=F)
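# One handy follow-up: rank the emails by their affinity to a given topic.
# This is a sketch; it assumes topic 1's scores sit in the second column,
# right after the email name column added above.
top.topic1 = df.emails.topics[order(df.emails.topics[[2]], decreasing = TRUE), ]
head(top.topic1$email, 10)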