Code to parse the Word documents (saved as HTML) and convert them to CSV. Then, we do topic modeling with R.
# Parse the HTML export of the Word document into a five-column CSV.
import lxml.html
import csv

f = open('Sampl.htm')
fw = open('Sampl.csv', 'w')
tree = lxml.html.parse(f).getroot()

# Column names for the output CSV, matching the table layout.
fnames = ['COMPNOS', 'FROMDATE', 'M1', 'M2', 'NARRATION']
dw = csv.DictWriter(fw, fnames)
dw.writeheader()  # write a header row so read.csv() in R picks up the column names

rows = tree.xpath('.//tr')
for r in rows:
    cols = r.xpath('.//td')
    # we expect each record row to have exactly five columns; skip everything else
    if len(cols) == 5:
        d = dict()
        for i in range(5):
            c = cols[i]
            d[fnames[i]] = c.text_content()
        dw.writerow(d)
f.close()
fw.close()
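
A quick way to confirm the conversion worked is to read the CSV back and inspect it. This is a minimal sanity-check sketch, not part of the original gist; it assumes only the Sampl.csv written above:

import csv

# Read the CSV back and report what was parsed (sanity check only).
with open('Sampl.csv') as fr:
    reader = csv.DictReader(fr)
    records = list(reader)
print(reader.fieldnames)               # should be the five column names
print(len(records), 'records parsed')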
# Topic modeling on the NARRATION column with tm and topicmodels.
setwd('~/Work/dlopez')
r <- read.csv('Sampl.csv', stringsAsFactors=FALSE)

# First, let's load up this pile of things.
Sys.setenv(NOAWT=TRUE)  # this is a workaround for Macs
library(tm)
library(Snowball)
library(RWeka)
library(rJava)
library(RWekajars)
library("topicmodels")
library("XML")

readbasic <- function (elem, language, id)
{
  # ideally, add in metadata for each piece of LexisNexis
  doc <- PlainTextDocument(elem$content, id = id, language = language)
}

corpus <- Corpus(VectorSource(r$NARRATION), readerControl = list(reader = readbasic))
dtm <- DocumentTermMatrix(corpus, control = list(stemming = FALSE, stopwords = TRUE,
                          minWordLength = 3, removeNumbers = TRUE, removePunctuation = TRUE))
dim(dtm)

# throw out any terms that occur in less than 10% of documents
# throw out any terms that occur in more than 80% of documents
doc_freq <- colSums(as.matrix(dtm) > 0) / nrow(dtm)
sub_dtm <- dtm[, doc_freq > 0.1 & doc_freq < 0.8]

# 10 topics
k <- 10
SEED <- 2010
sub_TM <-
  list(VEM = LDA(sub_dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(sub_dtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(sub_dtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(sub_dtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
sapply(sub_TM[1:2], slot, "alpha")
Terms <- terms(sub_TM[["VEM"]], 5)
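
To see what came out, the fitted models can be inspected. This is a small sketch beyond the original code, using only the sub_TM list and Terms computed above; topics() and logLik() are standard topicmodels accessors:

# Most likely topic for each document under the VEM fit
Topics <- topics(sub_TM[["VEM"]], 1)
table(Topics)

# Five most probable terms per topic, computed above
Terms

# Rough comparison of the four fits by log-likelihood
sapply(sub_TM, logLik)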