Text Analysis Examples
# This is an exploration of topic modeling
# First, let's load up this pile of things.
Sys.setenv(NOAWT=TRUE)  # workaround for Macs
library(tm)
library(Snowball)
library(RWeka)
library(rJava)
library(RWekajars)
library(gdata)
library(slam)
library(MiscPsycho)
library(rpart)
library(topicmodels)
library(XML)
blogsreader <- function(elem, language, id) {
  PlainTextDocument(elem$content, id = id, language = language)
}
mydir <- '/Users/astorer/Work/psanghavi'
setwd(mydir)
npi <- read.xls('npi_0405.xlsx', sheet = "All NPIs", stringsAsFactors = FALSE)
irsdb <- read.csv('data-download-pub78.txt', sep = '|', stringsAsFactors = FALSE)
names(irsdb) <- c('id', 'name', 'city', 'state', 'country', 'var')
# save some space!
irsdb$id <- NULL
irsdb$country <- NULL
irsdb$var <- NULL
irsdb$name <- toupper(irsdb$name)
# let's remove punctuation and normalize common abbreviations
irsdb$name <- gsub(",", "", irsdb$name)
irsdb$name <- gsub("\\.", "", irsdb$name)
irsdb$name <- gsub("INCORPORATED", "INC", irsdb$name)
irsdb$name <- gsub("'", "", irsdb$name)
npi$NPI_Description <- gsub(",", "", npi$NPI_Description)
npi$NPI_Description <- gsub("\\.", "", npi$NPI_Description)
npi$NPI_Description <- gsub("'", "", npi$NPI_Description)
npi$NPI_Description <- gsub("INCORPORATED", "INC", npi$NPI_Description)
npi$plocstatename[npi$plocstatename == 'PUERTO RICO'] <- "PR"
all.np <- c()
all.states <- unique(npi$plocstatename)  # enumerate by the same field we subset on below
for (this.state in all.states) {
  print(paste("===>", this.state))
  irs.state <- subset(irsdb, state == this.state)
  npi.state <- subset(npi, plocstatename == this.state)
  for (i in 1:nrow(npi.state)) {
    my.name <- npi.state[i, "NPI_Description"]
    my.id <- npi.state[i, "PRF_NPI"]
    res <- my.name == irs.state$name  # exact string match against IRS names
    if (sum(res, na.rm = TRUE) > 0) {
      print(irs.state[res, "name"])
      all.np <- c(all.np, my.id)
    }
  }
}
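# The loop above requires exact string equality, so near-misses (stray spaces,
# one-character typos) never match. A rough sketch of a fuzzier alternative
# using base R's agrep(); the query string and max.distance = 1 are
# illustrative choices, not part of the original analysis:
fuzzy.hits <- agrep("AMERICAN RED CROSS", irsdb$name, max.distance = 1)
head(irsdb$name[fuzzy.hits])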
all.np <- data.frame(PRF_NPI = all.np)
merge(all.np, npi)  # view the matched NPI records (result not stored)
npi$nonprofit <- ifelse(npi$PRF_NPI %in% all.np$PRF_NPI, 1, 0)
missing <- paste(npi$NPI_Description) == ""
textvec <- paste(npi$NPI_Description, npi$porgnameoth, npi$aotitle)
textvec <- gsub("CHIEF FINANCIAL OFFICER", "CFO", textvec)
textvec <- gsub("CHIEF EXECUTIVE OFFICER", "CEO", textvec)
textvec <- gsub("ASSOCIATION", "ASSOC", textvec)
textvec <- gsub("/", " ", textvec)
textvec[missing] <- "MISSING"
corpus <- Corpus(VectorSource(textvec), readerControl = list(reader = blogsreader))
Sys.setlocale("LC_COLLATE", "C")
dtm <- DocumentTermMatrix(corpus, control = list(stemming = FALSE, stopwords = FALSE,
    minWordLength = 2, removeNumbers = FALSE, removePunctuation = TRUE))
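# Quick sanity checks (not in the original): number of documents and terms,
# plus a peek at one corner of the matrix
dim(dtm)
inspect(dtm[1:5, 1:10])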
# Machine learning things
# first remove terms that are too infrequent:
cs <- col_sums(dtm)
keepterms <- cs > 15
dtm_sub <- dtm[, keepterms]
# Put the dtm_sub back into the npi data frame
npidtm <- cbind(npi$Final.Assignment, as.data.frame(as.matrix(dtm_sub)))
names(npidtm)[1] <- "Final.Assignment"
npitruth <- npidtm[npidtm$Final.Assignment != "", ]
assignments <- npitruth$Final.Assignment
# remove terms we used explicitly
npitruth[, "fire"] <- NULL
npitruth[, "owner"] <- NULL
#npitruth[, "llc"] <- NULL
#npitruth[, "inc"] <- NULL
npitruth$Final.Assignment <- assignments
npi.tree <- rpart(Final.Assignment ~ ., npitruth)
plot(npi.tree)
text(npi.tree)
rsq.rpart(npi.tree)  # approximate R-squared vs. splits (mainly for regression trees)
printcp(npi.tree)    # display the complexity-parameter table
plotcp(npi.tree)     # visualize cross-validation results
predictions <- predict(npi.tree, npitruth, type = 'class')
#table(predictions < 0.5, npitruth$Final.Assignment)
table(predictions, npitruth$Final.Assignment)  # confusion matrix on the labeled data
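# A one-number summary of the table above: overall training accuracy
# (optimistic, since we predict on the same rows we fit on)
mean(as.character(predictions) == as.character(npitruth$Final.Assignment))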
# Other things we can do:
# 1) Try other classifiers (easy)
# 2) Resample input to have equal numbers of categories, e.g., less fire
#    (medium; sketched below)
# 3) Try a different number of words (easy-medium)
# 4) Use nonprofit as an input feature (easy)
# 5) PO Box (medium - must label as PO Box)
# 6) Remove small companies (medium)
# 7) Use PCA as input features (medium)
# 8) Do Idaho!
# 9) Collapse categories into the basic 3
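# A minimal sketch of idea 2, downsampling every class to the size of the
# smallest one before refitting (the seed is illustrative, not from the original):
set.seed(1)
grp <- split(npitruth, droplevels(factor(npitruth$Final.Assignment)))
n.min <- min(sapply(grp, nrow))
balanced <- do.call(rbind, lapply(grp, function(d) d[sample(nrow(d), n.min), ]))
balanced$Final.Assignment <- droplevels(factor(balanced$Final.Assignment))
npi.tree.bal <- rpart(Final.Assignment ~ ., balanced)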
dtm_sub_logical <- dtm_sub > 0
npidtm <- cbind(npi$Final.Assignment, as.data.frame(as.matrix(dtm_sub_logical)))
names(npidtm)[1] <- "Final.Assignment"
npitruth <- npidtm[npidtm$Final.Assignment != "", ]
npi.tree <- rpart(Final.Assignment ~ ., npitruth)
plot(npi.tree)
text(npi.tree)
# Descriptive things
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)
plot(dtm, terms = names(sorted_terms[1:20]), corThreshold = 0.10)  # requires Rgraphviz
# We can query the associations for any term
findAssocs(dtm, term = "hospital", 0.05)
# Let's do some text clustering on this
# first remove terms that are too infrequent:
cs <- col_sums(dtm)
keepterms <- cs > 15
dtm_sub <- dtm[, keepterms]
k <- 10
km <- kmeans(dtm_sub, k)
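# Note: kmeans() densifies the sparse dtm via as.matrix(), which can be costly
# on large corpora; the skmeans package (not used here) can cluster sparse
# term matrices directly, e.g. skmeans::skmeans(dtm_sub, k)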
plot.new()
for (i in 1:k) {
  sorted_terms <- sort(col_sums(dtm_sub[km$cluster == i, ]), decreasing = TRUE)
  barplot(sorted_terms[1:20], las = 2)
  title(paste("Cluster", i))
  #plot(dtm[km$cluster == i, ], terms = names(sorted_terms[1:20]), corThreshold = 0.075)
}
textvec[km$cluster == 1]  # the raw text assigned to cluster 1
# PCA?
pca <- princomp(dtm_sub)
# documents scoring above the mean on each of the first three components
textvec[pca$scores[, 1] > mean(pca$scores[, 1])]
textvec[pca$scores[, 2] > mean(pca$scores[, 2])]
textvec[pca$scores[, 3] > mean(pca$scores[, 3])]
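# A quick look (not in the original) at how much variance the leading
# components capture and which terms load on them
summary(pca)
pca$loadings[, 1:3]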
termnames <- dimnames(dtm_sub)$Terms
# sorted_terms still holds the last cluster's ranking from the loop above;
# recompute over the whole dtm before plotting
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)
plot(dtm, terms = names(sorted_terms[1:50]), corThreshold = 0.075)
ft <- findFreqTerms(dtm, lowfreq = 150, highfreq = 5000)
# We can get the most frequent terms
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)
# We can plot them
barplot(sorted_terms[1:20], las = 2)
# Binarize the dtm: every nonzero count becomes 1 (presence/absence)
binarydtm <- dtm
binarydtm$v <- binarydtm$v * 0 + 1
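# Equivalent shortcut, assuming this version of tm provides binary weighting:
# binarydtm <- weightBin(dtm)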
# k topics
k <- 3
SEED <- 2010
sub_TM <-
  list(VEM = LDA(binarydtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(binarydtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(binarydtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(binarydtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
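# Optional rough comparison of the four fits; topicmodels supplies logLik
# methods for LDA and CTM objects
sapply(sub_TM, logLik)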
sapply(sub_TM[1:2], slot, "alpha")
Terms <- terms(sub_TM[["VEM"]], 10)
# We can also plot the terms with lines indicating if they correlate
plot(dtm, terms = names(sorted_terms[1:30]), corThreshold = 0.075)
# Here are terms that correlate with "chief"
findAssocs(dtm, term = "chief", 0.1)
# Here is a plot of some debt terms and how they are related
plot(dtm, terms = names(findAssocs(dtm, term = "debt", 0.2)), corThreshold = 0.30)
### Let's try this again, with new .txt files:
# remove_HTML_regex is not defined above; here is a minimal regex-based
# stand-in (an assumption -- the original may have stripped HTML with a full
# parser, e.g. via the XML package):
remove_HTML_regex <- function(s) gsub("<[^>]+>", " ", s)
blogsreader <- function(elem, language, id) {
  stripped <- remove_HTML_regex(elem$content)
  PlainTextDocument(stripped, id = id, language = language)
}
mydir <- '/Library/Frameworks/R.framework/Versions/2.14/Resources/library/ReadMe/demofiles/clintonposts/'
corpus <- Corpus(DirSource(directory = mydir), readerControl = list(reader = blogsreader))
#corpus <- Corpus(VectorSource(sapply(JSS_papers[, "description"], remove_HTML_regex)))
Sys.setlocale("LC_COLLATE", "C")
blogs_dtm <- DocumentTermMatrix(corpus, control = list(stemming = TRUE, stopwords = TRUE,
    minWordLength = 3, removeNumbers = TRUE, removePunctuation = TRUE))
dim(blogs_dtm)
summary(col_sums(blogs_dtm))
# Mean tf-idf per term: normalized term frequency, averaged over the documents
# containing the term, times log2 inverse document frequency
term_tfidf <-
  tapply(blogs_dtm$v / row_sums(blogs_dtm)[blogs_dtm$i], blogs_dtm$j, mean) *
  log2(nDocs(blogs_dtm) / col_sums(blogs_dtm > 0))
summary(term_tfidf)
# keep only reasonably informative terms, then drop any now-empty documents
blogs_dtm <- blogs_dtm[, term_tfidf >= 0.1]
blogs_dtm <- blogs_dtm[row_sums(blogs_dtm) > 0, ]
summary(col_sums(blogs_dtm))
k <- 30
SEED <- 2010
blogs_TM <-
  list(VEM = LDA(blogs_dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(blogs_dtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(blogs_dtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(blogs_dtm, k = k,
                 control = list(seed = SEED,
                                var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
sapply(blogs_TM[1:2], slot, "alpha")
Terms <- terms(blogs_TM[["VEM"]], 5)
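# Optional: inspect the fitted VEM model -- per-document topic proportions
# from topicmodels' posterior(), alongside the top terms extracted above
topic_probs <- posterior(blogs_TM[["VEM"]])$topics
round(head(topic_probs), 3)
Terms[, 1:5]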