Created
February 14, 2014 14:50
-
-
Save cigrainger/9002305 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Make the project root the working directory (absolute, machine-specific path).
| setwd("/Users/thiemo/Dropbox/mafundo/") | |
# Report, element-wise, whether each name in `mypkg` appears in the first
# column (package names) of installed.packages().
is_installed <- function(mypkg) {
  known_packages <- installed.packages()[, 1]
  match(mypkg, known_packages, nomatch = 0L) > 0L
}
# Attach every package named in `package_names`, installing any that
# is_installed() reports as missing first. Packages are handled one at a
# time, in the order given.
load_or_install <- function(package_names) {
  fetch_from_cran <- function(pkg) {
    install.packages(pkg, repos = "http://lib.stat.cmu.edu/R/CRAN")
  }
  for (pkg in package_names) {
    already_there <- is_installed(pkg)
    if (!already_there) {
      fetch_from_cran(pkg)
    }
    library(pkg, character.only = TRUE, quietly = TRUE, verbose = FALSE)
  }
}
# Return the last `rows` rows of `frame` (all rows if it has fewer).
#
# The previous index expression `nrow(frame)-rows:nrow(frame)` was parsed as
# `nrow(frame) - (rows:nrow(frame))` because `:` binds tighter than `-`, so it
# produced a descending index ending in 0 and returned the *first* rows in
# reverse order instead of the bottom ones.
#
# frame: a data.frame / matrix / data.table.
# rows:  number of trailing rows wanted (default 5).
bottom <- function(frame, rows = 5) {
  total <- nrow(frame)
  wanted <- min(max(rows, 0), total)
  # seq.int(to =, length.out =) yields (total - wanted + 1):total, and an
  # empty index when wanted is 0.
  frame[seq.int(to = total, length.out = wanted), ]
}
# Column-bind two matrices after aligning them on their row names.
#
# Rows present in only one input are added to the other, filled with `none`.
# The result keeps A's row order first, followed by rows that occur only in v.
#
# Fixes over the previous version:
#   * `v[match(...)]` had no row comma, so it picked single elements instead
#     of rows and silently dropped all but the first column of v;
#   * in the "same names" branch `rownames(A) <- names(v)` erased the row
#     names, because names() of a matrix is NULL.
#
# A, v: matrices with row names.
# none: fill value for cells that have no counterpart (default NA).
# Returns a matrix with the columns of A followed by the columns of v.
cbindM <- function(A, v, none = NA) {
  stopifnot(!is.null(rownames(A)), !is.null(rownames(v)))
  all_names <- union(rownames(A), rownames(v))

  # Pad `m` with `none`-filled rows for missing names, then order by all_names.
  align_rows <- function(m) {
    absent <- setdiff(all_names, rownames(m))
    if (length(absent) > 0) {
      filler <- matrix(
        data = none, nrow = length(absent), ncol = ncol(m),
        dimnames = list(absent, colnames(m))
      )
      m <- rbind(m, filler)
    }
    m[match(all_names, rownames(m)), , drop = FALSE]
  }

  cbind(align_rows(A), align_rows(v))
}
# Keep the first row for every distinct combination of the columns in `varvec`.
#
# data:   anything as.data.table() accepts.
# varvec: character vector of column names that define uniqueness.
# Returns a data.table.
#
# Uses FALSE rather than the `F` shorthand: this script later assigns a table
# to `F`, which would break `with=F` for any call made after that point.
DTUniqueBy <- function(data, varvec) {
  data <- as.data.table(data)
  key_columns <- data.frame(data[, varvec, with = FALSE])
  data[!duplicated(key_columns)]
}
# Join the elements of `txt` into one string, separated by `sepr`.
#
# Replaces a hand-written paste loop that iterated over `1:length(txt)`; for
# empty input that loop ran with i = 1 and i = 0 and returned "NA ".
# An empty `txt` now yields "".
#
# txt:  vector of strings (coerced with as.character by paste).
# sepr: separator placed between elements (default single space).
mergeString <- function(txt, sepr = " ") {
  paste(txt, collapse = sepr)
}
# Normalise free text for tokenisation.
#
# content:    character vector to clean.
# keepnum:    keep digits when TRUE; otherwise digits are deleted.
# keepperiod: when TRUE, "." and "," survive the punctuation strip.
# tolower:    lower-case the text when TRUE.
#
# Every character that is not alphanumeric (or "." / "," with keepperiod) is
# turned into a space, runs of spaces are collapsed, and leading/trailing
# spaces are removed. Returns a character vector of the same length.
preProcess <- function(content, keepnum=FALSE, keepperiod=FALSE,tolower=TRUE) {
  swap <- function(x, pattern, replacement) gsub(pattern, replacement, x)

  junk_pattern <- if (keepperiod == TRUE) "[^[:alnum:].,]" else "[^[:alnum:]]"
  content <- swap(content, junk_pattern, " ")
  content <- swap(content, "[[:space:]]+", " ")

  if (tolower == TRUE) {
    content <- tolower(content)
  }

  # Guarantee a single leading space before digits are stripped.
  content <- swap(content, "^ *", " ")
  if (keepnum == FALSE) {
    content <- swap(content, "([0-9])*", "")
  }

  # Final tidy-up, applied in the same order as always.
  tidy_rules <- list(
    c("^-*", ""),
    c(" +", " "),
    c(" *$", ""),
    c(" *$", ""),
    c("^ *", "")
  )
  for (rule in tidy_rules) {
    content <- swap(content, rule[1], rule[2])
  }
  content
}
# Tabulate word frequencies for one document.
#
# txt:    character vector; split on single spaces into tokens.
# EF:     value stored in the `docs` column (textmat passes the document index).
# stopws: optional vector of tokens to drop before cleaning.
# stem:   when TRUE, tokens are passed through wordStem() first.
#
# Returns a data.frame built from `docs`, `terms` and the sorted frequency
# table of the preProcess()-cleaned tokens.
wordfreq <- function(txt, EF=1, stopws=NULL,stem=FALSE) {
  tokens <- unlist(strsplit(txt, " ", fixed = TRUE))
  if (stem == TRUE) {
    tokens <- wordStem(tokens)
  }
  if (!is.null(stopws)) {
    is_stopword <- tokens %in% stopws
    tokens <- tokens[!is_stopword]
  }
  # The name `txt` is kept here on purpose: table() records the variable name
  # in its dimnames, which feeds into the resulting column names.
  txt <- preProcess(tokens)
  tab <- sort(table(txt), decreasing = TRUE)
  data.frame(docs = EF, terms = names(tab), Freq = tab, row.names = NULL)
}
# Build a term-by-document count table from a vector of documents.
#
# vec:   character vector, one element per document (defaults to the global A).
# stpws: optional stop-word vector forwarded to wordfreq().
#
# Each document is run through wordfreq() with its position as the document
# id; the per-document frames are stacked and cross-tabulated with xtabs(),
# then transposed.
#
# Changes: `SIMPLIFY = FALSE` instead of `F` (this script later assigns a
# table to `F`), and seq_along() instead of `1:length(vec)` so an empty `vec`
# is not paired with the indices 1 and 0.
textmat <- function (vec=A, stpws=NULL) {
  per_document <- mapply(
    wordfreq, vec, seq_along(vec),
    MoreArgs = list(stopws = stpws), SIMPLIFY = FALSE
  )
  names(per_document) <- NULL
  stacked <- do.call("rbind", per_document)
  t(xtabs(Freq ~ ., data = stacked))
}
# Coerce `vec` to numeric and substitute `replna` for every NA, including
# the NAs that as.numeric() produces for values it cannot parse.
makenumeric <- function(vec,replna=NA) {
  converted <- as.numeric(vec)
  replace(converted, is.na(converted), replna)
}
# makeCAPEXnice: turn a wide table P into a numeric matrix with one row per
# original column of P (from the third one on), plus `iyear` and `quarter`
# columns derived from those column names.
# NOTE(review): presumably the column names look like "Mar.99" / "Dec.05" — confirm against the input files.
| makeCAPEXnice <- function(P) { | |
# Transpose so the original columns of P become rows; round-trip through
# data.table and back to a plain matrix.
| P2 <- t(P) | |
| P2<-data.table(P2) | |
| P3<-as.matrix(P2) | |
# Row labels are the original column names of P.
| rownames(P3)<-names(P) | |
# Column labels are taken from the first row of P3 (the first column of P).
| rn <- NULL | |
| for(i in 1:ncol(P3)) { | |
| rn[i] <- paste(P3[1,i]) | |
| } | |
| colnames(P3) <- rn | |
# Drop the first two rows (the first two original columns of P).
| P3 <- P3[3:nrow(P3),] | |
# Map the month prefix in each row name to a quarter number 1-4.
| quarter <- gsub("Mar\\.([0-9]{2})",1,rownames(P3)) | |
| quarter <- gsub("Jun\\.([0-9]{2})",2,quarter) | |
| quarter <- gsub("Sep\\.([0-9]{2})",3,quarter) | |
| quarter <- as.numeric(gsub("Dec\\.([0-9]{2})",4,quarter)) | |
# Strip the month name (group 2 is empty when the first alternative matches),
# then remove any remaining dot, leaving the two-digit year.
| iyear <- gsub("(Mar|Jun|Sep|Dec\\.)|(Mar|Jun|Sep|Dec)([0-9]{2})","\\2",rownames(P3)) | |
| iyear <- gsub("\\.","",iyear) | |
# Expand two-digit years: 95-99 get a "19" prefix, 00-12 get a "20" prefix.
| iyear <- gsub("(99|98|97|96|95)","19\\1",iyear) | |
| iyear <- as.numeric(gsub("(00|01|02|03|04|05|06|07|08|09|10|11|12)","20\\1",iyear)) | |
# Convert every column to numeric, replacing unparseable values with 0.
| P3<-apply(P3,2,function(x) makenumeric(x,0)) | |
| P3<-cbind(P3,iyear,quarter) | |
# Return the matrix with its columns sorted alphabetically by name.
| P3[,order(colnames(P3))] | |
| } | |
# Reshape a wide table into long form: one row per (column, period).
#
# REG:     data.table with `iyear` and `quarter` columns plus one column per
#          series.
# varname: name given to the value column in the result (default "stalled").
#
# Returns a data.table with columns STATE_NAME, year, quarter and <varname>.
# As before, every column of REG is emitted as a series.
# NOTE(review): that includes `iyear` and `quarter` themselves — confirm
# whether callers filter those out.
#
# Changes:
#   * the row count comes from nrow(REG); the old `length(REG$i)` looked up a
#     column literally called "i" and only worked through `$` partial
#     matching on "iyear";
#   * no `with=F` (this script later assigns a table to `F`);
#   * pieces are collected in a list and bound once instead of rbind-ing
#     inside the loop.
reshapeCSV <- function(REG, varname="stalled") {
  long_names <- c("STATE_NAME", "year", "quarter", varname)
  n_rows <- nrow(REG)

  pieces <- lapply(names(REG), function(series) {
    piece <- data.table(
      STATE_NAME = rep(series, n_rows),
      year = REG$iyear,
      quarter = REG$quarter,
      value = REG[[series]]
    )
    setnames(piece, long_names)
    piece
  })

  data.table(rbindlist(pieces))
}
# Record which stories mention `key` as a whole word.
#
# A:   data.table with `story` (text searched) and `id` columns.
# key: keyword; it is embedded between \b anchors and used as a regex.
#
# For every matching row an INSERT into satp_association is sent over the
# global connection `con`. Returns the list of objects returned by
# dbSendQuery(), one per matching row (empty list when nothing matches).
#
# Change: `id` and `key` are passed through dbEscapeStrings() before being
# spliced into the SQL text, so a quote inside either value can no longer
# break (or alter) the statement.
findMatches <- function(A,key) {
  hits <- grep(paste("\\b",key,"\\b",sep=""),A$story)
  lapply(hits, function(row) {
    safe_id <- dbEscapeStrings(con, as.character(A[row]$id))
    safe_key <- dbEscapeStrings(con, as.character(key))
    statement <- paste(
      "INSERT INTO satp_association VALUES ('','", safe_id, "','", safe_key, "')",
      sep = ""
    )
    dbSendQuery(con, statement)
  })
}
# Same contract as findMatches(): insert one satp_association row for every
# story in `A` that contains `key` as a whole word. The body was a verbatim
# copy of findMatches(), so it now simply forwards to it.
findMatchesSentence <- function(A,key) {
  findMatches(A, key)
}
# Draw a choropleth of event counts for one year and save it as "<yr>.pdf".
#
# yr: year to plot.
# A:  data.table with `year` and `term` columns; rows are counted per term.
# I:  spatial object whose @data has a DISTRICT column matched against `term`.
#
# Returns the value of dev.off(), as before.
#
# Changes:
#   * `add = TRUE` instead of `add = T`: this script assigns a table to `T`,
#     which would be passed to plot() for any call made after that point;
#   * the pdf device is closed even when plotting fails part-way.
yearOnyearMaps <- function(yr,A,I) {
  # Count rows per term for the requested year.
  A2 <- A[year==yr,.N,by=c("term")]
  names(A2)[2]<-"ATTACKCOUNT"
  # Re-order the counts to follow the districts of the spatial object.
  district_pos <- match(I@data$DISTRICT, A2$term)
  A2 <- A2[district_pos,]
  row.names(A2)<-row.names(I)
  A3 <- spCbind(I, A2)

  pdf(paste(yr,".pdf",sep=""))
  device_closed <- FALSE
  on.exit(if (!device_closed) dev.off(), add = TRUE)

  plot(A3)
  nclr <- 12
  plotclr <- brewer.pal(nclr,"YlOrRd")
  intervals <- classIntervals(A3@data$ATTACKCOUNT, 8, style="fixed",fixedBreaks=c(0, 1, 4, 8, 16, 32, 64,128))
  colcode <- findColours(intervals, plotclr)
  plot(A3, col=colcode, add=TRUE)
  title(main=paste("Violence in ",yr,sep=""), sub="Fixed Class Intervals")
  legend("bottomleft", legend=names(attr(colcode, "table")),
         fill=attr(colcode, "palette"), cex=0.6, bty="n")

  device_closed <- TRUE
  dev.off()
}
# Generate a sequence of time points between two dates.
#
# starttime, endtime: dates given as 'YYYY-MM-DD' strings.
# timestep:           step passed to seq() as `by`; the special value
#                     "quarters" returns unique "<year>-<quarter>" labels
#                     built from a monthly sequence.
#
# Returns a POSIXct vector, or a character vector for "quarters".
timeVector <- function(starttime,endtime,timestep="months") {
  as_day <- function(x) as.POSIXct(strptime(x, '%Y-%m-%d'))
  starttime <- as_day(starttime)
  endtime <- as_day(endtime)

  if (timestep=="quarters") {
    monthly <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by="months")
    # Month number -> quarter number, one regex rule per quarter.
    quarter_rules <- list(
      list("(^[123]{1}$)", 1),
      list("(^[456]{1}$)", 2),
      list("(^[789]{1}$)", 3),
      list("(^[102]{2}$)", 4)
    )
    quarter <- month(monthly)
    for (rule in quarter_rules) {
      quarter <- gsub(rule[[1]], rule[[2]], quarter)
    }
    quarter <- as.numeric(quarter)
    ret <- unique(paste(year(monthly),quarter,sep="-"))
  } else {
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
  }
  ret
}
# Build the skeleton of a balanced panel: every element of `group` paired
# with every element of `timevec`.
#
# Returns a data.frame with columns `group` (sorted, as character input to
# data.frame) and `timevec`, with length(group) * length(timevec) rows.
panelStructure <- function(group,timevec) {
  n_groups <- length(group)
  n_times <- length(timevec)
  time_col <- rep(timevec, n_groups)
  group_col <- as.character(sort(rep(group, n_times)))
  panel <- data.frame(group_col, time_col)
  names(panel) <- c("group","timevec")
  panel
}
# ---- Session setup ----
# Switch to the project sub-directory (absolute, machine-specific path).
| setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff") | |
# Install (when missing) and attach every package listed.
| load_or_install(c("R.oo","stringr","classInt","rgdal", "maptools","XML", "sentiment","plyr","RMySQL","RTextTools", "topicmodels","corpora","ggplot2","tm","tm.plugin.sentiment","foreach","RColorBrewer","wordcloud","lsa","MASS","openNLP","openNLPmodels.en","data.table","depmixS4")) | |
# Tokenizer handed to DocumentTermMatrix below; asks for n-grams of 2 to 3 words.
# NOTE(review): NGramTokenizer / Weka_control are not defined in this file and
# presumably come from RWeka, which is not in the package list above — confirm.
| BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 3)) | |
# Open a MySQL connection to the "vanderbilt" database over a local socket.
| con <- dbConnect(MySQL(), username="root", password="", dbname="vanderbilt", unix.socket="/tmp/mysql.sock") | |
# ---- Load and clean the abstracts ("trivia" comparison corpus) ----
# Changes in this section:
#   * the three custom stop-phrases are now appended to the stop-word list;
#     a misplaced parenthesis used to pass them to preProcess() as its
#     `keepnum` argument, so they were never removed;
#   * head() caps the sample at 60000 rows without padding a shorter table
#     with NA rows the way `A[1:60000]` does;
#   * dbGetQuery() fetches all rows and releases the result set, which
#     fetch(dbSendQuery(...)) left open.
query <- "SELECT * FROM `abstracts` WHERE year(start) >= 1990"
A <- data.table(dbGetQuery(con, query))

# Shuffle, drop empty abstracts, keep at most 60000 rows.
A$rand <- runif(nrow(A), 0, 1)
A <- A[order(rand)]
A <- A[abstract != ""]
A <- head(A, 60000)

# Clean, strip stop words / boilerplate phrases, then clean again so that the
# gaps left by removed words are collapsed.
A$abstract <- preProcess(A$abstract)
abstract_stopwords <- c(
  preProcess(stopwords("en")),
  "featured scenes shown", "report introduced", "reported"
)
A$abstract <- removeWords(A$abstract, abstract_stopwords)
A$abstract <- preProcess(A$abstract)
A <- A[abstract != ""]

A$technology <- "trivia"
A$year <- A[, year(start)]
# ---- Load patent titles and their technology classes ----
# Re-point `con` at the "googleinsights" database.
# NOTE(review): the connection previously stored in `con` is overwritten with
# no visible dbDisconnect() — confirm whether it should be closed first.
| con <- dbConnect(MySQL(), username="root", password="", dbname="googleinsights", unix.socket="/tmp/mysql.sock") | |
# Every application title, left-joined to its patent class (if any).
| query<-"SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID" | |
| P<-data.table(fetch(dbSendQuery(con,query),-1)) | |
# T: rows with a TECHNOLOGY value; P: rows without one.
# NOTE(review): assigning to `T` masks the TRUE shorthand, so code that passes
# `T` as a logical (e.g. `add=T`) receives this table when run afterwards.
| T<-P[!is.na(TECHNOLOGY)] | |
| P<-P[is.na(TECHNOLOGY)] | |
# Keep a random draw of 100000 unclassified application ids
# (sample() stops with an error if fewer ids are available).
| PS<-P[APPLN_ID %in% sample(P$APPLN_ID,100000)] | |
| PST<-rbind(T,PS) | |
# Unclassified rows get the label "other".
| PST[is.na(TECHNOLOGY)]$TECHNOLOGY<-"other" | |
| PST$year<-PST[,year(APPLN_FILING_DATE)] | |
| techwords <- NULL | |
# Distinct technology labels, used as loop values further down.
| techn <- names(table(PST$TECHNOLOGY)) | |
# Clean titles, remove English stop words, then clean again.
| PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE) | |
| PST$APPLN_TITLE<-removeWords(PST$APPLN_TITLE,preProcess(stopwords("en"))) | |
| PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE) | |
# (Second reset of the accumulator; same value as above.)
| techwords <- NULL | |
# ---- Overall n-gram counts per technology (all years pooled) ----
# For each technology: build a term-frequency document-term matrix of its
# titles, drop terms sparser than 0.999, and append one
# (technology, term, total count) row per remaining term to `techwords`.
| for(i in techn) { | |
| cat(paste(i," ",sep=" ")) | |
| Bc <- Corpus(VectorSource(PST[TECHNOLOGY==i]$APPLN_TITLE)) | |
| cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf, | |
| removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer)) | |
| dtm2<-removeSparseTerms(cable_mat, 0.999) | |
| dtm2 <- as.matrix(dtm2) | |
# cbind of character and numeric vectors yields a character matrix; the counts
# are converted back to numeric after the loop.
| temp <- as.matrix(cbind(rep(i, ncol(dtm2)), colnames(dtm2),colSums(dtm2))) | |
| techwords <- rbind(techwords,temp) | |
| gc("free") | |
| } | |
| techwords<-data.table(techwords) | |
| setnames(techwords,c("V1","V2","V3"),c("technology","word","count")) | |
| techwords$count<-as.numeric(as.character(techwords$count)) | |
# Same counting for the abstracts corpus, labelled "trivia".
| Bc <- Corpus(VectorSource(A$abstract)) | |
| cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf, | |
| removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer)) | |
| dtm2<-removeSparseTerms(cable_mat, 0.999) | |
| dtm2 <- as.matrix(dtm2) | |
| temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), colnames(dtm2),colSums(dtm2))) | |
| temp<-data.table(temp) | |
| temp$V3<-as.numeric(as.character(temp$V3)) | |
| trivia<-temp | |
# Columns are matched by position because `trivia` still carries V1..V3 names.
| techwords<-rbind(techwords,trivia,use.names=FALSE) | |
# ---- n-gram counts per technology and year (window: years j-5 through j) ----
| techannualwords<-NULL | |
| for(j in 1995:2012) { | |
# "trivia" baseline: the first 1000 abstracts whose year falls in the window.
# NOTE(review): if the window holds fewer than 1000 rows, `[1:1000]` presumably
# pads with NA rows — confirm this is harmless for Corpus().
| temp<-A[year<=j & year >=j-5][1:1000]$abstract | |
| Bc <- Corpus(VectorSource(temp)) | |
| cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf, | |
| removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer)) | |
| dtm2<-removeSparseTerms(cable_mat, 0.999) | |
| dtm2 <- as.matrix(dtm2) | |
| temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)),rep(j, ncol(dtm2)), colnames(dtm2),colSums(dtm2))) | |
| techannualwords<-rbind(techannualwords,temp) | |
# Same counts for each technology, skipping windows without any title.
| for(i in techn) { | |
| cat(paste(i," ",j,sep=" ")) | |
| ks<-PST[TECHNOLOGY==i & year<=j & year>=j-5]$APPLN_TITLE | |
| if(length(ks) > 0) { | |
| Bc <- Corpus(VectorSource(ks)) | |
| cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf, | |
| removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer)) | |
| dtm2<-removeSparseTerms(cable_mat, 0.999) | |
| dtm2 <- as.matrix(dtm2) | |
| temp <- as.matrix(cbind(rep(i, ncol(dtm2)),rep(j, ncol(dtm2)), colnames(dtm2),colSums(dtm2))) | |
| techannualwords <- rbind(techannualwords,temp) | |
| gc("free") | |
| } | |
| } | |
| } | |
# Convert the accumulated character matrix into a typed table.
# (`year` is left as produced by data.table(); only word, technology and count
# are converted explicitly.)
| techannualwords<-data.table(techannualwords) | |
| setnames(techannualwords,c("V1","V2","V3","V4"),c("technology","year","word","count")) | |
| techannualwords$word<-as.character(techannualwords$word) | |
| techannualwords$technology<-as.character(techannualwords$technology) | |
| techannualwords$count<-as.numeric(as.character(techannualwords$count)) | |
# ---- Distinctive keywords per technology and year (2x2 chi-square) ----
# For each (year, technology): compare the word counts of that technology
# ("patentcount") with the summed counts of all other technologies in the same
# year ("anchorcount").
| Freqsannual <- NULL | |
| ###individual technology innovations | |
| for(j in 1995:2012) { | |
| for(i in techn) { | |
| cat(j," ",i,sep=" ") | |
| if(nrow(techannualwords[technology==i & year==j]) > 0) { | |
# Full outer join on word: V1 = count elsewhere, count = count in technology i.
| FreqVec<-merge(techannualwords[technology !=i & year==j][,sum(count),by=word],techannualwords[technology==i & year==j],by="word",all.x=TRUE,all.y=TRUE) | |
| #democrat - other | |
| #republican - solar | |
| setnames(FreqVec, c("V1","technology","count") ,c("anchorcount","technology","patentcount")) | |
# Words missing on one side count as zero occurrences there.
| FreqVec[is.na(anchorcount)]$anchorcount<-0 | |
| FreqVec[is.na(patentcount)]$patentcount<-0 | |
| anch<-FreqVec[,sum(anchorcount)] | |
| pat<-FreqVec[,sum(patentcount)] | |
# Chi-square statistic of the 2x2 table (this word vs. all other words) x
# (anchor vs. patent), computed per word.
| chis<-FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch *pat)*(anch+pat-anchorcount-patentcount)),by=word] | |
| FreqVec<-cbind(FreqVec,chis) | |
| names(FreqVec)[ncol(FreqVec)] <- "chis" | |
# Sign of the association: negative when the word is relatively more frequent
# on the patent side than on the anchor side.
| chissign<-FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word] | |
# NOTE(review): the code below relies on the year column being auto-named
# "rep(j, nrow(FreqVec))" and the sign column "V1" by cbind — confirm this
# holds for the installed data.table version.
| FreqVec<-cbind(rep(j,nrow(FreqVec)),FreqVec) | |
| FreqVec<-cbind(FreqVec,chissign) | |
| Freqsannual<-rbind(Freqsannual,FreqVec) | |
| rm(FreqVec) | |
| gc("free") | |
| } | |
| } | |
| } | |
| Freqsannual<-data.table(Freqsannual) | |
# Keep words over-represented in the technology (sign -1) with chi-square of
# at least 10 and at least 3 occurrences; the technology filter compares
# against the literal string "<NA>".
| FA <- Freqsannual[technology != "<NA>" & V1 == -1 & chis>= 10 & patentcount>=3] | |
| setnames(FA,"rep(j, nrow(FreqVec))","year") | |
# Export the selected columns (by position) and the per-word frequency table.
| write.csv(FA[,c(1,3,4,5,6,7,8,10),with=F],file="keywords-annual.csv") | |
| write.csv(data.frame(table(FA$word)), file="distinct-keywords.csv") | |
# ---- Clean-tech vs. other-technology keywords per year ----
# Collapse the technology labels into three broad classes:
# "trivia" -> notechnology, "other" -> othertechnology, everything else -> cleantech.
| techannualwords$broadclass <- "" | |
| techannualwords[technology=="trivia"]$broadclass = "notechnology" | |
| techannualwords[technology=="other"]$broadclass = "othertechnology" | |
| techannualwords[broadclass==""]$broadclass = "cleantech" | |
| Freqanytech <- NULL | |
| ###individual technology innovations | |
| for(j in 1995:2012) { | |
| cat(j," ",sep=" ") | |
# Full outer join on word: V1.x = othertechnology count, V1.y = cleantech count.
| FreqVec<-merge(techannualwords[broadclass =="othertechnology" & year==j][,sum(count),by=word],techannualwords[broadclass =="cleantech" & year==j][,sum(count),by=word],by="word",all.x=TRUE,all.y=TRUE) | |
| #democrat - other | |
| #republican - solar | |
| setnames(FreqVec, c("V1.x","V1.y") ,c("anchorcount","patentcount")) | |
# Words missing on one side count as zero occurrences there.
| FreqVec[is.na(anchorcount)]$anchorcount<-0 | |
| FreqVec[is.na(patentcount)]$patentcount<-0 | |
| anch<-FreqVec[,sum(anchorcount)] | |
| pat<-FreqVec[,sum(patentcount)] | |
# Per-word chi-square statistic of the 2x2 table (word vs. rest) x (anchor vs. patent).
| chis<-FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch *pat)*(anch+pat-anchorcount-patentcount)),by=word] | |
| FreqVec<-cbind(FreqVec,chis) | |
| names(FreqVec)[ncol(FreqVec)] <- "chis" | |
# Sign is -1 when the word is relatively more frequent on the cleantech side.
| chissign<-FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word] | |
# NOTE(review): later code expects cbind to auto-name the year column
# "rep(j, nrow(FreqVec))" and the sign column "V1" — confirm for the installed
# data.table version.
| FreqVec<-cbind(rep(j,nrow(FreqVec)),FreqVec) | |
| FreqVec<-cbind(FreqVec,chissign) | |
| Freqanytech<-rbind(Freqanytech,FreqVec) | |
| rm(FreqVec) | |
| gc("free") | |
| } | |
# ---- Export the clean-tech / non-clean-tech keyword lists ----
# Keep words with chi-square >= 5 and at least 3 occurrences on the
# clean-tech side, labelled by the sign of the association (column V1).
FAY <- Freqanytech[chis >= 5 & patentcount >= 3]
setnames(FAY, "rep(j, nrow(FreqVec))", "year")
FAY$innovationclass <- ""
FAY[V1 == 1]$innovationclass <- "noncleantech"
FAY[V1 == -1]$innovationclass <- "cleantech"

# Columns are selected by position; FALSE is spelled out instead of `F`.
write.csv(FAY[, c(1, 2, 3, 4, 6, 8, 9), with = FALSE], file = "innovation-keywords.csv")

# This export used to read `FACNC`, which is only created further down in the
# script, so a fresh run stopped here with "object 'FACNC' not found". The
# same selection (clean-tech side only) is now computed in place.
cleantech_words <- Freqanytech[chis >= 5 & patentcount >= 3 & V1 == -1]$word
write.csv(data.frame(table(cleantech_words)), file = "clean-nonclean-keywords.csv")
write.csv(data.frame(table(FAY$word)), file = "general-innovation-keywords.csv")
# ---- Word-cloud PDFs ----
# (1) General innovation keywords: one PDF per year with more than 10 keywords;
#     word size is driven by sqrt(chis).
| setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-generalinnovation/") | |
| for(j in 1995:2012) { | |
| fn = paste("innovation-",j,".pdf",sep="") | |
| if(nrow(FAY[ year == j ]) > 10) { | |
| pdf(fn) | |
| wordcloud(FAY[year == j ]$word,sqrt(FAY[year == j]$chis), scale=c(8,.3), min.freq=2) | |
| dev.off() | |
| } | |
| } | |
# (2) Clean-tech-only keywords: same thresholds as FAY, restricted to sign -1.
| FACNC <- Freqanytech[chis>= 5 & patentcount>=3 & V1 == -1 ] | |
| setnames(FACNC,"rep(j, nrow(FreqVec))","year") | |
| FACNC$innovationclass <- "cleantech" | |
| setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-clean-vs-nonclean/") | |
| for(j in 1995:2012) { | |
| fn = paste("cleannonclean-",j,".pdf",sep="") | |
| if(nrow(FACNC[ year == j ]) > 10) { | |
| pdf(fn) | |
| wordcloud(FACNC[year == j & innovationclass=="cleantech" ]$word,sqrt(FACNC[year == j & innovationclass=="cleantech" ]$chis), scale=c(8,.3), min.freq=2) | |
| dev.off() | |
| } | |
| } | |
# (3) Per-technology keywords: one PDF per (technology, year) with more than 10
#     keywords, captioned with the number of patents in the 6-year window.
| setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-annual/") | |
| for(i in techn) { | |
| for(j in 1995:2012) { | |
| fn = paste(i,"-",j,".pdf",sep="") | |
| if(nrow(FA[technology==i & year == j ]) > 10) { | |
| txt <- paste("Technology: ",i," - Number of patents (including prev 5 yrs): ", nrow(PST[TECHNOLOGY==i & year <= j & year >= j-5]), sep="") | |
| pdf(fn) | |
| wordcloud(FA[technology==i & year == j ]$word,sqrt(FA[technology==i & year == j]$chis), scale=c(8,.3), min.freq=2) | |
| mtext(txt,side=1,line=4, col="darkred") | |
| dev.off() | |
| } | |
| } | |
| } | |
# ---- n-gram counts per technology for single years (no multi-year window) ----
# Same procedure as the windowed counts, but only titles filed in year j are
# used; (technology, year) pairs without titles are skipped.
| annualwords<-NULL | |
| for(j in 1994:2012) { | |
| for(i in techn) { | |
| cat(paste(i," ",sep=" ")) | |
| ks<-PST[TECHNOLOGY==i & year==j]$APPLN_TITLE | |
| if(length(ks) > 0) { | |
| Bc <- Corpus(VectorSource(ks)) | |
| cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf, | |
| removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer)) | |
| dtm2<-removeSparseTerms(cable_mat, 0.999) | |
| dtm2 <- as.matrix(dtm2) | |
# One (technology, year, term, total count) row per remaining term.
| temp <- as.matrix(cbind(rep(i, ncol(dtm2)),rep(j, ncol(dtm2)), colnames(dtm2),colSums(dtm2))) | |
| annualwords <- rbind(annualwords,temp) | |
| gc("free") | |
| } | |
| } | |
| } | |
| annualwords<-data.table(annualwords) | |
| setnames(annualwords,c("V1","V2","V3","V4"),c("technology","year","word","count")) | |
# ---- Classify keyword phrases by technology (MAXENT + SVM) ----
# Changes in this section:
#   * the `RA` table is built after classify_models(); it used to reference
#     `results` eight statements before that object was created;
#   * the second, identical train_models() call at the very end was dropped
#     (same container, same algorithms — it only repeated the training);
#   * dbGetQuery() replaces fetch(dbSendQuery(...)) so the result set is
#     released.

# Keyword list; the columns `word` and `technology` are used below.
# NOTE(review): earlier steps write "keywords-annual.csv" and change the
# working directory several times — confirm this file name and location.
# NOTE(review): assigning to `F` masks the FALSE shorthand for anything run
# afterwards.
F <- read.csv("keywords.csv")
F <- data.table(F)

# Reload the patent titles and draw a fresh random training sample.
query <- "SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID"
P <- data.table(dbGetQuery(con, query))
T <- P[!is.na(TECHNOLOGY)]
P <- P[is.na(TECHNOLOGY)]
PS <- P[APPLN_ID %in% sample(P$APPLN_ID, 100000)]
PST <- rbind(T, PS)
PST$ordered <- runif(nrow(PST), 0, 1)
PST <- PST[order(ordered)]
PST2 <- PST[APPLN_ID %in% sample(PST$APPLN_ID, 30000)]
PST3 <- PST   # full table kept aside
PST <- PST2   # 30000-id subsample used for training

# Labels and title clean-up.
PST$techlabel <- as.factor(PST$TECHNOLOGY)
PST$APPLN_TITLE <- removeWords(PST$APPLN_TITLE, c(preProcess(stopwords()), "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "twelve", "thirteen", "fourteen", "another", "a", "an", "shot", "encounter", "dead", "including", "identified", "road", "at"))
PST$APPLN_TITLE <- preProcess(PST$APPLN_TITLE)
PST[is.na(techlabel)]$techlabel <- "other"
PST$label <- as.numeric(PST$techlabel)
PST <- PST[order(ordered)]

# Training documents are the patent titles; the keyword phrases are appended
# after them and form the (unlabelled, virgin = TRUE) test set.
addrow <- nrow(PST)
addrowp <- addrow + 1
appendwords <- as.character(F$word)
docs <- PST$APPLN_TITLE
docs <- append(docs, appendwords)
# NOTE(review): appending a character vector to a factor presumably turns the
# training labels into their integer codes; the numeric conversion of the
# predicted labels below appears to depend on that — confirm before changing.
labels <- PST$techlabel
appendlabel <- as.character(F$technology)
labels <- append(labels, appendlabel)

doc_matrix <- create_matrix(docs, language = "english", removeNumbers = TRUE, stemWords = TRUE, removePunctuation = TRUE, removeSparseTerms = 0.99995)
container <- create_container(doc_matrix, labels, trainSize = 1:addrow, testSize = addrowp:length(docs), virgin = TRUE)
models <- train_models(container, algorithms = c("MAXENT", "SVM"))
results <- classify_models(container, models)
analytics <- create_analytics(container, results)

# Keyword phrases next to their predicted labels and the label they were
# originally filed under, plus the mean of the two model probabilities.
RA <- cbind(appendwords, results)
RA <- cbind(RA, appendlabel)
RA$SVM_LABEL <- as.numeric(as.character(RA$SVM_LABEL))
RA$MAXENTROPY_LABEL <- as.numeric(as.character(RA$MAXENTROPY_LABEL))
RA$meanprob <- (RA$MAXENTROPY_PROB + RA$SVM_PROB) / 2

# NOTE(review): the hard-coded start row 7001 does not match the train/test
# split used above (test rows are the appended keywords) — verify that the
# row counts of both sides agree.
res <- cbind(PST[7001:nrow(PST)], analytics@document_summary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment