@alexstorer
Created April 5, 2013 21:49
Text Analysis Examples
# This is an exploration of topic modeling
# First, let's load the packages we need.
# Setting NOAWT is a workaround for rJava/AWT problems on Macs
Sys.setenv(NOAWT=TRUE)
library(tm)
library(Snowball)
library(RWeka)
library(rJava)
library(RWekajars)
library(gdata)
library(slam)
library(MiscPsycho)
library(rpart)
library(topicmodels)
library(XML)
# Reader that wraps each raw text element in a PlainTextDocument
blogsreader <- function(elem, language, id) {
  PlainTextDocument(elem$content,
                    id = id, language = language)
}
mydir <- '/Users/astorer/Work/psanghavi'
setwd(mydir)
npi <- read.xls('npi_0405.xlsx', sheet="All NPIs", stringsAsFactors=FALSE)      # NPI provider records
irsdb <- read.csv('data-download-pub78.txt', sep='|', stringsAsFactors=FALSE)   # IRS Publication 78 (tax-exempt organizations) extract
names(irsdb) <- c('id', 'name', 'city', 'state', 'country', 'var')
# save some space!
irsdb$id <- NULL
irsdb$country <- NULL
irsdb$var <- NULL
irsdb$name <- toupper(irsdb$name)
# let's remove punctuation and normalize common abbreviations
irsdb$name <- gsub(",","",irsdb$name)
irsdb$name <- gsub("\\.","",irsdb$name)
irsdb$name <- gsub("INCORPORATED","INC",irsdb$name)
irsdb$name <- gsub("'","",irsdb$name)
npi$NPI_Description <- gsub(",","",npi$NPI_Description)
npi$NPI_Description <- gsub("\\.","",npi$NPI_Description)
npi$NPI_Description <- gsub("'","",npi$NPI_Description)
npi$NPI_Description <- gsub("INCORPORATED","INC",npi$NPI_Description)
npi$plocstatename[npi$plocstatename=='PUERTO RICO'] <- "PR"
# Flag NPI organizations whose name exactly matches an IRS-listed nonprofit
# in the same state
all.np <- c()
all.states <- unique(npi$pmailstatename)
for (this.state in all.states) {
  print(paste("===>", this.state))
  irs.state <- subset(irsdb, state == this.state)
  npi.state <- subset(npi, plocstatename == this.state)
  for (i in seq_len(nrow(npi.state))) {
    my.name <- npi.state[i, "NPI_Description"]
    my.id <- npi.state[i, "PRF_NPI"]
    res <- my.name == irs.state$name
    if (sum(res, na.rm = TRUE) > 0) {
      print(irs.state[res, "name"])
      all.np <- c(all.np, my.id)
    }
  }
}
all.np <- data.frame(PRF_NPI = all.np)
merge(all.np, npi)   # print the matched NPI records for inspection
npi$nonprofit <- ifelse(npi$PRF_NPI %in% all.np$PRF_NPI, 1, 0)
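# Hedged alternative (not in the original gist): the state-by-state exact-match
# loop above can also be written as a single merge() on the cleaned name and
# state columns. This assumes NPI_Description/plocstatename line up with the
# IRS name/state fields; for exact matches it should flag essentially the same records.
irs.key <- unique(irsdb[, c("name", "state")])
matched <- merge(npi, irs.key,
                 by.x = c("NPI_Description", "plocstatename"),
                 by.y = c("name", "state"))
npi$nonprofit2 <- ifelse(npi$PRF_NPI %in% matched$PRF_NPI, 1, 0)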
# flag records with an empty description, then build one text field per record
missing <- paste(npi$NPI_Description) == ""
textvec <- paste(npi$NPI_Description, npi$porgnameoth, npi$aotitle)
textvec <- gsub("CHIEF FINANCIAL OFFICER","CFO",textvec)
textvec <- gsub("CHIEF EXECUTIVE OFFICER","CEO",textvec)
textvec <- gsub("ASSOCIATION","ASSOC",textvec)
textvec <- gsub("/"," ",textvec)
textvec[missing] <- "MISSING"
corpus <- Corpus(VectorSource(textvec),readerControl=list(reader = blogsreader))
Sys.setlocale("LC_COLLATE", "C")
dtm <- DocumentTermMatrix(corpus,
                          control = list(stemming = FALSE, stopwords = FALSE,
                                         minWordLength = 2, removeNumbers = FALSE,
                                         removePunctuation = TRUE))
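# Quick sanity check (an addition, not in the original gist): peek at a corner
# of the document-term matrix with tm's inspect(); the indices are arbitrary.
inspect(dtm[1:3, 1:8])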
# Machine learning things
# first remove terms that are too infrequent:
cs <- col_sums(dtm)
keepterms <- cs>15
dtm_sub <- dtm[,keepterms]
# Put the dtm_sub back into the npi data frame
npidtm <- cbind(npi$Final.Assignment,as.data.frame(as.matrix(dtm_sub)))
names(npidtm)[1] <- "Final.Assignment"
npitruth <- npidtm[""!=(npidtm$Final.Assignment),]
assignments<-npitruth$Final.Assignment
# remove terms we used explicitly
npitruth[,"fire"] <- NULL
npitruth[,"owner"] <- NULL
#npitruth[,"llc"] <- NULL
#npitruth[,"inc"] <- NULL
npitruth$Final.Assignment <- assignments
npi.tree <- rpart(Final.Assignment ~., npitruth)
plot(npi.tree)
text(npi.tree)
rsq.rpart(npi.tree)   # approximate R-squared plots (mainly useful for regression trees)
printcp(npi.tree)     # display the complexity-parameter table
plotcp(npi.tree)      # visualize cross-validation results
predictions <- predict(npi.tree,npitruth,type='class')
#table(predictions<0.5,npitruth$Final.Assignment)
table(predictions,npitruth$Final.Assignment)
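# A small added check (not in the original): overall in-sample accuracy of the
# tree. This is optimistic, since the tree is evaluated on the rows it was fit on.
mean(as.character(predictions) == npitruth$Final.Assignment)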
# Other things we can do:
# 1) Try other classifier (easy; see the sketch after this list)
# 2) Resample input to have equal numbers of categories, e.g., less fire (medium)
# 3) Try different number of words (easy-medium)
# 4) Use nonprofit as an input feature (easy)
# 5) PO Box (medium - must label as PO Box)
# 6) Remove small companies (medium)
# 7) Use PCA as input features (medium)
# 8) Do Idaho!
# 9) Collapse categories into the basic 3
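# Sketch for idea (1) above: try a random forest instead of a single tree.
# This is an added example, not part of the original analysis; it assumes the
# randomForest package is installed. The x/y interface avoids problems with
# term names that are not valid R identifiers.
library(randomForest)
rf.x <- npitruth[, names(npitruth) != "Final.Assignment"]
rf.y <- factor(npitruth$Final.Assignment)
rf.fit <- randomForest(x = rf.x, y = rf.y)
table(predict(rf.fit, rf.x), rf.y)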
# Refit the tree on binary presence/absence features instead of raw counts
dtm_sub_logical <- dtm_sub > 0
npidtm <- cbind(npi$Final.Assignment,as.data.frame(as.matrix(dtm_sub_logical)))
names(npidtm)[1] <- "Final.Assignment"
npitruth <- npidtm[""!=(npidtm$Final.Assignment),]
npi.tree <- rpart(Final.Assignment ~., npitruth)
plot(npi.tree)
text(npi.tree)
# Descriptive things
sorted_terms <- sort(col_sums(dtm), decreasing=TRUE)
# correlation graph of the 20 most frequent terms (plotting a dtm requires the Rgraphviz package)
plot(dtm, terms = names(sorted_terms[1:20]), corThreshold = 0.10)
# We can query the associations for any term
findAssocs(dtm,term="hospital",0.05)
# Let's do some text clustering on this
# first remove terms that are too infrequent:
cs <- col_sums(dtm)
keepterms <- cs>15
dtm_sub <- dtm[,keepterms]
k <- 10
# k-means clustering of the documents in term-count space
km <- kmeans(dtm_sub, k)
plot.new()
for (i in 1:k) {
  # most frequent terms within each cluster
  sorted_terms <- sort(col_sums(dtm_sub[km$cluster==i,]), decreasing=TRUE)
  barplot(sorted_terms[1:20], las=2)
  title(paste("Cluster", i))
  #plot(dtm[km$cluster==i,], terms = names(sorted_terms[1:20]), corThreshold = 0.075)
}
# look at the raw text assigned to cluster 1
textvec[km$cluster==1]
# PCA?
pca <- princomp(dtm_sub)
# documents with above-average scores on the first three components
textvec[pca$scores[,1]>mean(pca$scores[,1])]
textvec[pca$scores[,2]>mean(pca$scores[,2])]
textvec[pca$scores[,3]>mean(pca$scores[,3])]
termnames <- dimnames(dtm_sub)$Terms
plot(dtm, terms = names(sorted_terms[1:50]), corThreshold = 0.075)
ft <- findFreqTerms(dtm, lowfreq = 150, highfreq = 5000)
# We can get the most frequent terms
sorted_terms <- sort(col_sums(dtm),decreasing=T)
# We can plot them
barplot(sorted_terms[1:20],las=2)
# Binarize the dtm: every nonzero count becomes 1 (presence/absence)
binarydtm <- dtm
binarydtm$v <- binarydtm$v*0 + 1
# Fit topic models with k topics: VEM and Gibbs LDA, plus a correlated topic model
k <- 3
SEED <- 2010
sub_TM <-
  list(VEM = LDA(binarydtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(binarydtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(binarydtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(binarydtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
# Dirichlet alpha for the two VEM fits (estimated vs. held fixed)
sapply(sub_TM[1:2], slot, "alpha")
# top 10 terms per topic for the VEM fit
Terms <- terms(sub_TM[["VEM"]], 10)
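# Added note (not in the original gist): topics() from topicmodels gives the
# most likely topic for each document, which pairs with the top terms above.
Topic <- topics(sub_TM[["VEM"]], 1)
table(Topic)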
# We can also plot the terms with lines indicating if they correlate
plot(dtm, terms = names(sorted_terms[1:30]), corThreshold = 0.075)
# Here are terms that correlate with "chief"
findAssocs(dtm, term="chief", 0.1)
# Here is a plot of some debt terms and how they are related
plot(dtm, terms = names(findAssocs(dtm,term="debt",0.2)), corThreshold = 0.30)
### Let's try this again, with new .txt files:
# remove_HTML_regex() is assumed to be defined elsewhere; it should strip HTML
# markup from the raw text before it is wrapped in a PlainTextDocument.
blogsreader <- function(elem, language, id) {
  stripped <- remove_HTML_regex(elem$content)
  PlainTextDocument(stripped,
                    id = id, language = language)
}
mydir <- '/Library/Frameworks/R.framework/Versions/2.14/Resources/library/ReadMe/demofiles/clintonposts/'
corpus <- Corpus(DirSource(directory = mydir),readerControl=list(reader = blogsreader))
#corpus <- Corpus(VectorSource(sapply(JSS_papers[, "description"],remove_HTML_regex)))
Sys.setlocale("LC_COLLATE", "C")
blogs_dtm <- DocumentTermMatrix(corpus,
                                control = list(stemming = TRUE, stopwords = TRUE,
                                               minWordLength = 3, removeNumbers = TRUE,
                                               removePunctuation = TRUE))
dim(blogs_dtm)
summary(col_sums(blogs_dtm))
# mean tf-idf of each term over the documents that contain it; used below to
# drop uninformative terms
term_tfidf <-
  tapply(blogs_dtm$v/row_sums(blogs_dtm)[blogs_dtm$i], blogs_dtm$j, mean) *
  log2(nDocs(blogs_dtm)/col_sums(blogs_dtm > 0))
summary(term_tfidf)
blogs_dtm <- blogs_dtm[, term_tfidf >= 0.1]
blogs_dtm <- blogs_dtm[row_sums(blogs_dtm) > 0,]
summary(col_sums(blogs_dtm))
k <- 30
SEED <- 2010
blogs_TM <-
  list(VEM = LDA(blogs_dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(blogs_dtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(blogs_dtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(blogs_dtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
# Dirichlet alpha for the two VEM fits (estimated vs. held fixed)
sapply(blogs_TM[1:2], slot, "alpha")
# top 5 terms per topic for the VEM fit
Terms <- terms(blogs_TM[["VEM"]], 5)
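# Added follow-up (not in the original gist): compare the fitted models by
# log-likelihood, and look at the most likely topic per post for the VEM fit.
sapply(blogs_TM, logLik)
blogs_topics <- topics(blogs_TM[["VEM"]], 1)
table(blogs_topics)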