Text Analysis Examples
# This is an exploration of topic modeling
# First, let's load up this pile of things.
Sys.setenv(NOAWT=TRUE)  # workaround for Macs
library(tm)
library(Snowball)
library(RWeka)
library(rJava)
library(RWekajars)
library(gdata)
library(slam)
library(MiscPsycho)
library(rpart)
library(topicmodels)
library(XML)
blogsreader <- function(elem, language, id) {
  PlainTextDocument(elem$content, id = id, language = language)
}
mydir <- '/Users/astorer/Work/psanghavi'
setwd(mydir)
npi <- read.xls('npi_0405.xlsx', sheet = "All NPIs", stringsAsFactors = FALSE)
irsdb <- read.csv('data-download-pub78.txt', sep = '|', stringsAsFactors = FALSE)
names(irsdb) <- c('id', 'name', 'city', 'state', 'country', 'var')
# save some space!
irsdb$id <- NULL
irsdb$country <- NULL
irsdb$var <- NULL
irsdb$name <- toupper(irsdb$name)
# let's remove punctuation and normalize common abbreviations
irsdb$name <- gsub(",", "", irsdb$name)
irsdb$name <- gsub("\\.", "", irsdb$name)
irsdb$name <- gsub("INCORPORATED", "INC", irsdb$name)
irsdb$name <- gsub("'", "", irsdb$name)
npi$NPI_Description <- gsub(",", "", npi$NPI_Description)
npi$NPI_Description <- gsub("\\.", "", npi$NPI_Description)
npi$NPI_Description <- gsub("'", "", npi$NPI_Description)
npi$NPI_Description <- gsub("INCORPORATED", "INC", npi$NPI_Description)
npi$plocstatename[npi$plocstatename == 'PUERTO RICO'] <- "PR"
all.np <- c()
all.states <- unique(npi$plocstatename)  # enumerate by the same field we subset on below
for (this.state in all.states) {
  print(paste("===>", this.state))
  irs.state <- subset(irsdb, state == this.state)
  npi.state <- subset(npi, plocstatename == this.state)
  for (i in 1:nrow(npi.state)) {
    my.name <- npi.state[i, "NPI_Description"]
    my.id <- npi.state[i, "PRF_NPI"]
    res <- my.name == irs.state$name  # exact string match against IRS names
    if (sum(res, na.rm = TRUE) > 0) {
      print(irs.state[res, "name"])
      all.np <- c(all.np, my.id)
    }
  }
}
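# The loop above requires exact string equality, so near-misses (stray spaces,
# one-character typos) never match. A rough sketch of a fuzzier alternative
# using base R's agrep(); the query string and max.distance = 1 are
# illustrative choices, not part of the original analysis:
fuzzy.hits <- agrep("AMERICAN RED CROSS", irsdb$name, max.distance = 1)
head(irsdb$name[fuzzy.hits])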
all.np <- data.frame(PRF_NPI = all.np)
merge(all.np, npi)  # view the matched NPI records (result not stored)
npi$nonprofit <- ifelse(npi$PRF_NPI %in% all.np$PRF_NPI, 1, 0)
missing <- paste(npi$NPI_Description) == ""
textvec <- paste(npi$NPI_Description, npi$porgnameoth, npi$aotitle)
textvec <- gsub("CHIEF FINANCIAL OFFICER", "CFO", textvec)
textvec <- gsub("CHIEF EXECUTIVE OFFICER", "CEO", textvec)
textvec <- gsub("ASSOCIATION", "ASSOC", textvec)
textvec <- gsub("/", " ", textvec)
textvec[missing] <- "MISSING"
corpus <- Corpus(VectorSource(textvec), readerControl = list(reader = blogsreader))
Sys.setlocale("LC_COLLATE", "C")
dtm <- DocumentTermMatrix(corpus, control = list(stemming = FALSE, stopwords = FALSE,
    minWordLength = 2, removeNumbers = FALSE, removePunctuation = TRUE))
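# Quick sanity checks (not in the original): number of documents and terms,
# plus a peek at one corner of the matrix
dim(dtm)
inspect(dtm[1:5, 1:10])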
# Machine learning things
# first remove terms that are too infrequent:
cs <- col_sums(dtm)
keepterms <- cs > 15
dtm_sub <- dtm[, keepterms]
# Put the dtm_sub back into the npi data frame
npidtm <- cbind(npi$Final.Assignment, as.data.frame(as.matrix(dtm_sub)))
names(npidtm)[1] <- "Final.Assignment"
npitruth <- npidtm[npidtm$Final.Assignment != "", ]
assignments <- npitruth$Final.Assignment
# remove terms we used explicitly
npitruth[, "fire"] <- NULL
npitruth[, "owner"] <- NULL
#npitruth[, "llc"] <- NULL
#npitruth[, "inc"] <- NULL
npitruth$Final.Assignment <- assignments
npi.tree <- rpart(Final.Assignment ~ ., npitruth)
plot(npi.tree)
text(npi.tree)
rsq.rpart(npi.tree)  # approximate R-squared vs. splits (mainly for regression trees)
printcp(npi.tree)    # display the complexity-parameter table
plotcp(npi.tree)     # visualize cross-validation results
predictions <- predict(npi.tree, npitruth, type = 'class')
#table(predictions < 0.5, npitruth$Final.Assignment)
table(predictions, npitruth$Final.Assignment)  # confusion matrix on the labeled data
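# A one-number summary of the table above: overall training accuracy
# (optimistic, since we predict on the same rows we fit on)
mean(as.character(predictions) == as.character(npitruth$Final.Assignment))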
# Other things we can do:
# 1) Try other classifiers (easy)
# 2) Resample input to have equal numbers of categories, e.g., less fire
#    (medium; sketched below)
# 3) Try a different number of words (easy-medium)
# 4) Use nonprofit as an input feature (easy)
# 5) PO Box (medium - must label as PO Box)
# 6) Remove small companies (medium)
# 7) Use PCA as input features (medium)
# 8) Do Idaho!
# 9) Collapse categories into the basic 3
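# A minimal sketch of idea 2, downsampling every class to the size of the
# smallest one before refitting (the seed is illustrative, not from the original):
set.seed(1)
grp <- split(npitruth, droplevels(factor(npitruth$Final.Assignment)))
n.min <- min(sapply(grp, nrow))
balanced <- do.call(rbind, lapply(grp, function(d) d[sample(nrow(d), n.min), ]))
balanced$Final.Assignment <- droplevels(factor(balanced$Final.Assignment))
npi.tree.bal <- rpart(Final.Assignment ~ ., balanced)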
dtm_sub_logical <- dtm_sub > 0
npidtm <- cbind(npi$Final.Assignment, as.data.frame(as.matrix(dtm_sub_logical)))
names(npidtm)[1] <- "Final.Assignment"
npitruth <- npidtm[npidtm$Final.Assignment != "", ]
npi.tree <- rpart(Final.Assignment ~ ., npitruth)
plot(npi.tree)
text(npi.tree)
# Descriptive things
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)
plot(dtm, terms = names(sorted_terms[1:20]), corThreshold = 0.10)  # requires Rgraphviz
# We can query the associations for any term
findAssocs(dtm, term = "hospital", 0.05)
# Let's do some text clustering on this
# first remove terms that are too infrequent:
cs <- col_sums(dtm)
keepterms <- cs > 15
dtm_sub <- dtm[, keepterms]
k <- 10
km <- kmeans(dtm_sub, k)
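# Note: kmeans() densifies the sparse dtm via as.matrix(), which can be costly
# on large corpora; the skmeans package (not used here) can cluster sparse
# term matrices directly, e.g. skmeans::skmeans(dtm_sub, k)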
plot.new()
for (i in 1:k) {
  sorted_terms <- sort(col_sums(dtm_sub[km$cluster == i, ]), decreasing = TRUE)
  barplot(sorted_terms[1:20], las = 2)
  title(paste("Cluster", i))
  #plot(dtm[km$cluster == i, ], terms = names(sorted_terms[1:20]), corThreshold = 0.075)
}
textvec[km$cluster == 1]  # the raw text assigned to cluster 1
# PCA?
pca <- princomp(dtm_sub)
# documents scoring above the mean on each of the first three components
textvec[pca$scores[, 1] > mean(pca$scores[, 1])]
textvec[pca$scores[, 2] > mean(pca$scores[, 2])]
textvec[pca$scores[, 3] > mean(pca$scores[, 3])]
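# A quick look (not in the original) at how much variance the leading
# components capture and which terms load on them
summary(pca)
pca$loadings[, 1:3]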
termnames <- dimnames(dtm_sub)$Terms
# sorted_terms still holds the last cluster's ranking from the loop above;
# recompute over the whole dtm before plotting
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)
plot(dtm, terms = names(sorted_terms[1:50]), corThreshold = 0.075)
ft <- findFreqTerms(dtm, lowfreq = 150, highfreq = 5000)
# We can get the most frequent terms
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)
# We can plot them
barplot(sorted_terms[1:20], las = 2)
# Binarize the dtm: every nonzero count becomes 1 (presence/absence)
binarydtm <- dtm
binarydtm$v <- binarydtm$v * 0 + 1
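# Equivalent shortcut, assuming this version of tm provides binary weighting:
# binarydtm <- weightBin(dtm)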
# k topics
k <- 3
SEED <- 2010
sub_TM <-
  list(VEM = LDA(binarydtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(binarydtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(binarydtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(binarydtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
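# Optional rough comparison of the four fits; topicmodels supplies logLik
# methods for LDA and CTM objects
sapply(sub_TM, logLik)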
sapply(sub_TM[1:2], slot, "alpha")
Terms <- terms(sub_TM[["VEM"]], 10)
# We can also plot the terms with lines indicating if they correlate
plot(dtm, terms = names(sorted_terms[1:30]), corThreshold = 0.075)
# Here are terms that correlate with "chief"
findAssocs(dtm, term = "chief", 0.1)
# Here is a plot of some debt terms and how they are related
plot(dtm, terms = names(findAssocs(dtm, term = "debt", 0.2)), corThreshold = 0.30)
### Let's try this again, with new .txt files:
# remove_HTML_regex is not defined above; here is a minimal regex-based
# stand-in (an assumption -- the original may have stripped HTML with a full
# parser, e.g. via the XML package):
remove_HTML_regex <- function(s) gsub("<[^>]+>", " ", s)
blogsreader <- function(elem, language, id) {
  stripped <- remove_HTML_regex(elem$content)
  PlainTextDocument(stripped, id = id, language = language)
}
mydir <- '/Library/Frameworks/R.framework/Versions/2.14/Resources/library/ReadMe/demofiles/clintonposts/'
corpus <- Corpus(DirSource(directory = mydir), readerControl = list(reader = blogsreader))
#corpus <- Corpus(VectorSource(sapply(JSS_papers[, "description"], remove_HTML_regex)))
Sys.setlocale("LC_COLLATE", "C")
blogs_dtm <- DocumentTermMatrix(corpus, control = list(stemming = TRUE, stopwords = TRUE,
    minWordLength = 3, removeNumbers = TRUE, removePunctuation = TRUE))
dim(blogs_dtm)
summary(col_sums(blogs_dtm))
# Mean tf-idf per term: normalized term frequency, averaged over the documents
# containing the term, times log2 inverse document frequency
term_tfidf <-
  tapply(blogs_dtm$v / row_sums(blogs_dtm)[blogs_dtm$i], blogs_dtm$j, mean) *
  log2(nDocs(blogs_dtm) / col_sums(blogs_dtm > 0))
summary(term_tfidf)
# keep only reasonably informative terms, then drop any now-empty documents
blogs_dtm <- blogs_dtm[, term_tfidf >= 0.1]
blogs_dtm <- blogs_dtm[row_sums(blogs_dtm) > 0, ]
summary(col_sums(blogs_dtm))
k <- 30
SEED <- 2010
blogs_TM <-
  list(VEM = LDA(blogs_dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(blogs_dtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(blogs_dtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(blogs_dtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
sapply(blogs_TM[1:2], slot, "alpha")
Terms <- terms(blogs_TM[["VEM"]], 5)
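# Optional: inspect the fitted VEM model -- per-document topic proportions
# from topicmodels' posterior(), alongside the top terms extracted above
topic_probs <- posterior(blogs_TM[["VEM"]])$topics
round(head(topic_probs), 3)
Terms[, 1:5]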