Code to parse the Word documents (saved as HTML) and convert them to CSV. Then, we do topic modeling with R.
# Parse the HTML export of the Word document into a five-column CSV.
import lxml.html
import csv

f = open('Sampl.htm')
fw = open('Sampl.csv', 'w')
tree = lxml.html.parse(f).getroot()

# Column names for the output CSV, matching the table layout.
fnames = ['COMPNOS', 'FROMDATE', 'M1', 'M2', 'NARRATION']
dw = csv.DictWriter(fw, fnames)
dw.writeheader()  # write a header row so read.csv() in R picks up the column names

rows = tree.xpath('.//tr')
for r in rows:
    cols = r.xpath('.//td')
    # we expect each record row to have exactly five columns; skip everything else
    if len(cols) == 5:
        d = dict()
        for i in range(5):
            c = cols[i]
            d[fnames[i]] = c.text_content()
        dw.writerow(d)
f.close()
fw.close()
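
A quick way to confirm the conversion worked is to read the CSV back and inspect it. This is a minimal sanity-check sketch, not part of the original gist; it assumes only the Sampl.csv written above:

import csv

# Read the CSV back and report what was parsed (sanity check only).
with open('Sampl.csv') as fr:
    reader = csv.DictReader(fr)
    records = list(reader)
print(reader.fieldnames)               # should be the five column names
print(len(records), 'records parsed')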
# Topic modeling on the NARRATION column with tm and topicmodels.
setwd('~/Work/dlopez')
r <- read.csv('Sampl.csv', stringsAsFactors=FALSE)

# First, let's load up this pile of things.
Sys.setenv(NOAWT=TRUE)  # this is a workaround for Macs
library(tm)
library(Snowball)
library(RWeka)
library(rJava)
library(RWekajars)
library("topicmodels")
library("XML")

readbasic <- function (elem, language, id)
{
  # ideally, add in metadata for each piece of LexisNexis
  doc <- PlainTextDocument(elem$content, id = id, language = language)
}

corpus <- Corpus(VectorSource(r$NARRATION), readerControl = list(reader = readbasic))
dtm <- DocumentTermMatrix(corpus, control = list(stemming = FALSE, stopwords = TRUE,
                          minWordLength = 3, removeNumbers = TRUE, removePunctuation = TRUE))
dim(dtm)

# throw out any terms that occur in less than 10% of documents
# throw out any terms that occur in more than 80% of documents
doc_freq <- colSums(as.matrix(dtm) > 0) / nrow(dtm)
sub_dtm <- dtm[, doc_freq > 0.1 & doc_freq < 0.8]

# 10 topics
k <- 10
SEED <- 2010
sub_TM <-
  list(VEM = LDA(sub_dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(sub_dtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(sub_dtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(sub_dtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
sapply(sub_TM[1:2], slot, "alpha")
Terms <- terms(sub_TM[["VEM"]], 5)
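
To see what came out, the fitted models can be inspected. This is a small sketch beyond the original code, using only the sub_TM list and Terms computed above; topics() and logLik() are standard topicmodels accessors:

# Most likely topic for each document under the VEM fit
Topics <- topics(sub_TM[["VEM"]], 1)
table(Topics)

# Five most probable terms per topic, computed above
Terms

# Rough comparison of the four fits by log-likelihood
sapply(sub_TM, logLik)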