Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Created December 17, 2012 21:49
Show Gist options
  • Save alexstorer/4322612 to your computer and use it in GitHub Desktop.
Save alexstorer/4322612 to your computer and use it in GitHub Desktop.
Code to handle the Word documents (saved as HTML) and convert to CSV. Then, we do topic modeling with R.
import lxml.html
import csv
# Convert the table rows of a Word document saved as HTML ('Sampl.htm')
# into a CSV file ('Sampl.csv') with one column per field name below.
fnames = ['COMPNOS','FROMDATE','M1','M2','NARRATION']

# Context managers close both files even if parsing or writing raises.
with open('Sampl.htm') as f, open('Sampl.csv','w') as fw:
    tree = lxml.html.parse(f).getroot()
    dw = csv.DictWriter(fw, fnames)
    # NOTE(review): no header row is written here (writeheader() is never
    # called); presumably the first table row already holds the field
    # names -- confirm before adding one.
    rows = tree.xpath('.//tr')
    for r in rows:
        cols = r.xpath('.//td')
        # We expect exactly one cell per field name (five columns);
        # rows with any other cell count are skipped.
        if len(cols) == len(fnames):
            d = dict(zip(fnames, (c.text_content() for c in cols)))
            dw.writerow(d)
# Work from the project directory and read the CSV, keeping text columns
# as character vectors rather than factors.
setwd("~/Work/dlopez")
r <- read.csv("Sampl.csv", stringsAsFactors = FALSE)

# NOAWT is set before rJava/RWeka are loaded; the original author notes
# that this is a workaround for Macs.
Sys.setenv(NOAWT = TRUE)

# Load the packages used below (order kept as in the original script).
library(tm)
library(Snowball)
library(RWeka)
library(rJava)
library(RWekajars)
library(topicmodels)
library(XML)
#' Build a plain-text document from one source element.
#'
#' Intended to be used as a reader function (it is passed to `Corpus()`
#' through `readerControl` elsewhere in this script).
#'
#' @param elem Source element; only its `content` field is read.
#' @param language Language value passed through to the document.
#' @param id Identifier passed through to the document.
#' @return The object returned by `PlainTextDocument()`.
readbasic <- function(elem, language, id) {
  # ideally, add in metadata for each piece of lexisnexis
  # The document is returned directly (no unused local assignment), so
  # the result is a visible return value.
  PlainTextDocument(elem$content, id = id, language = language)
}
# Build a corpus with one document per NARRATION entry, read via readbasic.
corpus <- Corpus(
  VectorSource(r$NARRATION),
  readerControl = list(reader = readbasic)
)

# Document-term matrix: no stemming, stopwords removed, numbers and
# punctuation stripped.
# NOTE(review): newer tm releases may expect `wordLengths = c(3, Inf)`
# instead of `minWordLength` -- confirm against the installed version.
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stemming = FALSE, stopwords = TRUE,
    minWordLength = 3, removeNumbers = TRUE, removePunctuation = TRUE
  )
)
dim(dtm)

# Document frequency of each term: the number of documents in which the
# term has a non-zero count (dtm is stored in triplet form, `j` being the
# term index of each stored entry).
n_docs <- nrow(dtm)
doc_freq <- tabulate(dtm$j[dtm$v > 0], nbins = ncol(dtm))

# throw out any terms that occur in less than 10% of documents
# throw out any terms that occur in more than 80% of documents
keep_terms <- which(doc_freq >= 0.1 * n_docs & doc_freq <= 0.8 * n_docs)
if (length(keep_terms) == 0) {
  stop("No terms left after document-frequency filtering.", call. = FALSE)
}
sub_dtm <- dtm[, keep_terms]

# Drop documents that lost all of their terms in the filtering step, so
# that every row handed to the models has at least one non-zero entry.
keep_docs <- which(
  tabulate(sub_dtm$i[sub_dtm$v > 0], nbins = nrow(sub_dtm)) > 0
)
sub_dtm <- sub_dtm[keep_docs, ]
dim(sub_dtm)

# Number of topics and the seed shared by all model fits.
k <- 10
SEED <- 2010

# Fit four models on the same matrix: LDA via VEM, LDA via VEM with alpha
# held fixed, LDA via Gibbs sampling, and a correlated topic model.
sub_TM <- list(
  VEM = LDA(sub_dtm, k = k, control = list(seed = SEED)),
  VEM_fixed = LDA(
    sub_dtm,
    k = k,
    control = list(estimate.alpha = FALSE, seed = SEED)
  ),
  Gibbs = LDA(
    sub_dtm,
    k = k,
    method = "Gibbs",
    control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)
  ),
  CTM = CTM(
    sub_dtm,
    k = k,
    control = list(
      seed = SEED,
      var = list(tol = 10^-4),
      em = list(tol = 10^-3)
    )
  )
)

# Alpha of the two VEM fits (first two list entries); vapply pins the
# result to one numeric value per model.
vapply(sub_TM[1:2], slot, numeric(1), "alpha")

# Top five terms per topic for the VEM model.
Terms <- terms(sub_TM[["VEM"]], 5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment