setwd("/Users/thiemo/Dropbox/mafundo/") | |
is_installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) | |
load_or_install<-function(package_names) | |
{ | |
for(package_name in package_names) | |
{ | |
if(!is_installed(package_name)) | |
{ | |
install.packages(package_name,repos="http://lib.stat.cmu.edu/R/CRAN") | |
} | |
library(package_name,character.only=TRUE,quietly=TRUE,verbose=FALSE) | |
} | |
} | |
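# Usage sketch: install any packages from the CRAN mirror above that are missing,
# then attach them, e.g.
#   load_or_install(c("data.table", "ggplot2"))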
# last `rows` rows of a data frame (cf. utils::tail)
bottom <- function(frame, rows=5) {
  frame[(nrow(frame) - rows + 1):nrow(frame), ]
}
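# Example (hypothetical data): bottom(mtcars, 3) returns the last three rows,
# equivalent to utils::tail(mtcars, 3).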
cbindM <-
  function(A, v, none=NA) {
    dif <- setdiff(union(rownames(A), rownames(v)), intersect(rownames(A), rownames(v)))
    # if the row-name sets are identical, a simple cbind will do
    if(length(dif)==0) {
      A <- cbind(A, v[match(rownames(A), rownames(v))])
      rownames(A) <- names(v)
    }
    else if(length(dif)>0) {
      # the row-name sets differ, so one matrix is longer / shorter;
      # dif tells us which row names belong only to A or only to v
      for(i in dif) {
        if(is.element(i, rownames(A))) {
          # row is in A but not in v, so pad v with a `none` row
          temp <- matrix(data=none, nrow=1, ncol=ncol(v), byrow=FALSE, dimnames=list(i))
          v <- rbind(v, temp)
        } else {
          # row is in v but not in A, so pad A with a `none` row
          temp <- matrix(data=none, nrow=1, ncol=ncol(A), byrow=FALSE, dimnames=list(i))
          A <- rbind(A, temp)
        }
      }
      A <- cbind(A, v[match(rownames(A), rownames(v))])
    }
    A
  }
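# Usage sketch (toy one-column matrices): cbindM() column-binds by row name and pads
# rows present in only one input with `none`, e.g.
#   A <- matrix(1:2, dimnames = list(c("a", "b")))
#   v <- matrix(3:4, dimnames = list(c("b", "c")))
#   cbindM(A, v)   # rows a, b, c; cells without a counterpart are NA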
DTUniqueBy <- function(data, varvec) {
  data <- as.data.table(data)
  data[!duplicated(data.frame(data[, varvec, with=F]))]
}
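# Usage sketch: keep one row per combination of the columns named in varvec, e.g.
#   DTUniqueBy(panel, c("STATE_NAME", "year", "quarter"))   # `panel` is hypothetical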
mergeString <- function(txt, sepr=" ") {
  for(i in 1:length(txt)) {
    if(i == 1) {
      str = paste(txt[i], sep=sepr)
    } else {
      str = paste(str, txt[i], sep=sepr)
    }
  }
  str
}
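# mergeString() collapses a character vector into one string and is equivalent to
# paste(txt, collapse = sepr), e.g. mergeString(c("clean", "energy")) gives "clean energy".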
preProcess <- function(content, keepnum=FALSE, keepperiod=FALSE, tolower=TRUE) {
  if(keepperiod==TRUE) {
    content <- gsub("[^[:alnum:].,]", " ", content)
  }
  else {
    content <- gsub("[^[:alnum:]]", " ", content)
  }
  content <- gsub("[[:space:]]+", " ", content)
  if(tolower==TRUE) {
    content <- tolower(content)
  }
  #content <- content[!grepl("^(| |\\.)*$", content)]
  content <- gsub("^ *", " ", content)
  if(keepnum==FALSE) {
    content <- gsub("([0-9])*", "", content)
  }
  content <- gsub("^-*", "", content)
  content <- gsub(" +", " ", content)
  content <- gsub(" *$", "", content)
  #content <- content[!grepl("^(| |\\.\\,)*$", content)]
  content <- gsub(" *$", "", content)
  content <- gsub("^ *", "", content)
  content
}
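# Example: preProcess(" Solar PANELS, 2nd gen. ") strips punctuation and digits,
# collapses whitespace and lower-cases, returning "solar panels nd gen".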
wordfreq <- function(txt, EF=1, stopws=NULL, stem=FALSE) {
  txt <- unlist(strsplit(txt, " ", fixed=TRUE))
  if(stem==TRUE) {
    txt <- wordStem(txt)
  }
  if(!is.null(stopws))
    txt = txt[!txt %in% stopws]
  txt <- preProcess(txt)
  tab <- sort(table(txt), decreasing=TRUE)
  return(data.frame(docs=EF, terms=names(tab), Freq=tab, row.names=NULL))
}
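# Usage sketch: wordfreq() tabulates term frequencies for a single document, tagged
# with the document index EF, e.g. wordfreq("solar cell solar module", EF = 1) counts
# "solar" twice and "cell" and "module" once each.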
textmat <- function(vec=A, stpws=NULL) {
  dummy <- mapply(wordfreq, vec, 1:length(vec), MoreArgs=list(stopws=stpws), SIMPLIFY=F)
  names(dummy) <- NULL
  dtm <- t(xtabs(Freq ~ ., data=do.call("rbind", dummy)))
  dtm
}
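# Usage sketch: textmat() is meant to map wordfreq() over a character vector of
# documents and pivot the stacked counts into a term-by-document matrix, e.g.
#   textmat(c("solar cell", "wind turbine"), stpws = stopwords("en"))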
makenumeric <- function(vec, replna=NA) {
  vec <- as.numeric(vec)
  vec[is.na(vec)] <- replna
  vec
}
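# Example: makenumeric(c("1.5", "n/a", "3"), replna = 0) gives 1.5 0 3; strings that
# cannot be parsed as numbers become the replacement value.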
makeCAPEXnice <- function(P) {
  P2 <- t(P)
  P2 <- data.table(P2)
  P3 <- as.matrix(P2)
  rownames(P3) <- names(P)
  rn <- NULL
  for(i in 1:ncol(P3)) {
    rn[i] <- paste(P3[1,i])
  }
  colnames(P3) <- rn
  P3 <- P3[3:nrow(P3),]
  quarter <- gsub("Mar\\.([0-9]{2})", 1, rownames(P3))
  quarter <- gsub("Jun\\.([0-9]{2})", 2, quarter)
  quarter <- gsub("Sep\\.([0-9]{2})", 3, quarter)
  quarter <- as.numeric(gsub("Dec\\.([0-9]{2})", 4, quarter))
  iyear <- gsub("(Mar|Jun|Sep|Dec\\.)|(Mar|Jun|Sep|Dec)([0-9]{2})", "\\2", rownames(P3))
  iyear <- gsub("\\.", "", iyear)
  iyear <- gsub("(99|98|97|96|95)", "19\\1", iyear)
  iyear <- as.numeric(gsub("(00|01|02|03|04|05|06|07|08|09|10|11|12)", "20\\1", iyear))
  P3 <- apply(P3, 2, function(x) makenumeric(x, 0))
  P3 <- cbind(P3, iyear, quarter)
  P3[, order(colnames(P3))]
}
reshapeCSV <- function(REG, varname="stalled") {
  out <- NULL
  for(i in names(REG)) {
    temp <- NULL
    temp <- cbind(STATE_NAME=rep(i, nrow(REG)), year=REG$iyear, quarter=REG$quarter, varname=REG[,i,with=F])
    names(temp) <- c("STATE_NAME", "year", "quarter", varname)
    if(is.null(out)) {
      out <- rbind(temp)
      names(out) <- c("STATE_NAME", "year", "quarter", varname)
    } else {
      out <- rbind(out, temp)
      names(out) <- c("STATE_NAME", "year", "quarter", varname)
    }
  }
  data.table(out)
}
findMatches <- function(A, key) {
  res <- grep(paste("\\b", key, "\\b", sep=""), A$story)
  sapply(res, function(x) dbSendQuery(con, paste("INSERT INTO satp_association VALUES ('','", A[x]$id, "','", key, "')", sep="")))
}
findMatchesSentence <- function(A, key) {
  res <- grep(paste("\\b", key, "\\b", sep=""), A$story)
  sapply(res, function(x) dbSendQuery(con, paste("INSERT INTO satp_association VALUES ('','", A[x]$id, "','", key, "')", sep="")))
}
yearOnyearMaps <- function(yr, A, I) {
  A2 <- A[year==yr, .N, by=c("term")]
  names(A2)[2] <- "ATTACKCOUNT"
  o <- match(I@data$DISTRICT, A2$term)
  A2 <- A2[o,]
  row.names(A2) <- row.names(I)
  A3 <- spCbind(I, A2)
  pdf(paste(yr, ".pdf", sep=""))
  plot(A3)
  nclr <- 12
  plotclr <- brewer.pal(nclr, "YlOrRd")
  class <- classIntervals(A3@data$ATTACKCOUNT, 8, style="fixed", fixedBreaks=c(0, 1, 4, 8, 16, 32, 64, 128))
  colcode <- findColours(class, plotclr)
  #class <- classIntervals(A3@data$ATTACKCOUNT, nclr, style="equal")
  #colcode <- findColours(class, plotclr)
  plot(A3, col=colcode, add=T)
  title(main=paste("Violence in ", yr, sep=""), sub="Fixed Class Intervals")
  legend("bottomleft", legend=names(attr(colcode, "table")),
         fill=attr(colcode, "palette"), cex=0.6, bty="n")
  dev.off()
}
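# Usage sketch (assumes objects built elsewhere in the project): A is a data.table of
# incidents with `year` and `term` (district name) columns, I is a SpatialPolygonsDataFrame
# with a DISTRICT field; yearOnyearMaps(2010, A, I) then writes 2010.pdf with a
# choropleth of incident counts on fixed class breaks.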
timeVector <- function(starttime, endtime, timestep="months") {
  starttime <- as.POSIXct(strptime(starttime, '%Y-%m-%d'))
  endtime <- as.POSIXct(strptime(endtime, '%Y-%m-%d'))
  if(timestep=="quarters") {
    timestep = "months"
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
    quarter <- gsub("(^[123]{1}$)", 1, month(ret))
    quarter <- gsub("(^[456]{1}$)", 2, quarter)
    quarter <- gsub("(^[789]{1}$)", 3, quarter)
    quarter <- as.numeric(gsub("(^[102]{2}$)", 4, quarter))
    ret <- paste(year(ret), quarter, sep="-")
    ret <- unique(ret)
  } else {
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
  }
  ret
}
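# Examples:
#   timeVector("2000-01-01", "2000-12-31")                         # monthly POSIXct sequence
#   timeVector("2000-01-01", "2000-12-31", timestep = "quarters")  # "2000-1" ... "2000-4"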
panelStructure <- function(group, timevec) {
  tt <- rep(timevec, length(group))
  tt2 <- as.character(sort(rep(group, length(timevec))))
  mat <- cbind("group"=data.frame(tt2), "timevec"=data.frame(tt))
  names(mat) <- c("group", "timevec")
  mat
}
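# Usage sketch: build a balanced group-by-period skeleton to merge sparse panel data onto, e.g.
#   panelStructure(c("Bihar", "Punjab"), timeVector("2000-01-01", "2001-12-31", "quarters"))
# returns one row per group-period pair.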
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff") | |
load_or_install(c("R.oo","stringr","classInt","rgdal", "maptools","XML", "sentiment","plyr","RMySQL","RTextTools", "topicmodels","corpora","ggplot2","tm","tm.plugin.sentiment","foreach","RColorBrewer","wordcloud","lsa","MASS","openNLP","openNLPmodels.en","data.table","depmixS4")) | |
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 3)) | |
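# BigramTokenizer() asks RWeka's NGramTokenizer for 2- and 3-grams; it is handed to
# DocumentTermMatrix(..., tokenize = BigramTokenizer) below so the term matrices contain
# short phrases rather than single words, e.g.
#   NGramTokenizer("solar water heater", Weka_control(min = 2, max = 3))
# returns the trigram and the two bigrams.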
con <- dbConnect(MySQL(), username="root", password="", dbname="vanderbilt", unix.socket="/tmp/mysql.sock")
query <- "SELECT * FROM `abstracts` WHERE year(start) >= 1990"
A <- data.table(fetch(dbSendQuery(con, query), -1))
A$rand <- runif(nrow(A), 0, 1)
A <- A[order(rand)]
A <- A[abstract != ""]
A <- A[1:60000]
A$abstract <- preProcess(A$abstract)
A$abstract <- removeWords(A$abstract, c(preProcess(stopwords("en")), "featured scenes shown", "report introduced", "reported"))
A$abstract <- preProcess(A$abstract)
A <- A[abstract != ""]
A$technology <- "trivia"
A$year <- A[, year(start)]
con <- dbConnect(MySQL(), username="root", password="", dbname="googleinsights", unix.socket="/tmp/mysql.sock")
query <- "SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID"
P <- data.table(fetch(dbSendQuery(con, query), -1))
T <- P[!is.na(TECHNOLOGY)]
P <- P[is.na(TECHNOLOGY)]
PS <- P[APPLN_ID %in% sample(P$APPLN_ID, 100000)]
PST <- rbind(T, PS)
PST[is.na(TECHNOLOGY)]$TECHNOLOGY <- "other"
PST$year <- PST[, year(APPLN_FILING_DATE)]
techwords <- NULL
techn <- names(table(PST$TECHNOLOGY))
PST$APPLN_TITLE <- preProcess(PST$APPLN_TITLE)
PST$APPLN_TITLE <- removeWords(PST$APPLN_TITLE, preProcess(stopwords("en")))
PST$APPLN_TITLE <- preProcess(PST$APPLN_TITLE)
techwords <- NULL
for(i in techn) {
  cat(paste(i, " ", sep=" "))
  Bc <- Corpus(VectorSource(PST[TECHNOLOGY==i]$APPLN_TITLE))
  cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
    removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
  dtm2 <- removeSparseTerms(cable_mat, 0.999)
  dtm2 <- as.matrix(dtm2)
  temp <- as.matrix(cbind(rep(i, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
  techwords <- rbind(techwords, temp)
  gc("free")
}
techwords <- data.table(techwords)
setnames(techwords, c("V1","V2","V3"), c("technology","word","count"))
techwords$count <- as.numeric(as.character(techwords$count))
Bc <- Corpus(VectorSource(A$abstract))
cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
  removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
dtm2 <- removeSparseTerms(cable_mat, 0.999)
dtm2 <- as.matrix(dtm2)
temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
temp <- data.table(temp)
temp$V3 <- as.numeric(as.character(temp$V3))
trivia <- temp
techwords <- rbind(techwords, trivia, use.names=FALSE)
techannualwords <- NULL
for(j in 1995:2012) {
  temp <- A[year<=j & year>=j-5][1:1000]$abstract
  Bc <- Corpus(VectorSource(temp))
  cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
    removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
  dtm2 <- removeSparseTerms(cable_mat, 0.999)
  dtm2 <- as.matrix(dtm2)
  temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
  techannualwords <- rbind(techannualwords, temp)
  for(i in techn) {
    cat(paste(i, " ", j, sep=" "))
    ks <- PST[TECHNOLOGY==i & year<=j & year>=j-5]$APPLN_TITLE
    if(length(ks) > 0) {
      Bc <- Corpus(VectorSource(ks))
      cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
        removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
      dtm2 <- removeSparseTerms(cable_mat, 0.999)
      dtm2 <- as.matrix(dtm2)
      temp <- as.matrix(cbind(rep(i, ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
      techannualwords <- rbind(techannualwords, temp)
      gc("free")
    }
  }
}
techannualwords <- data.table(techannualwords)
setnames(techannualwords, c("V1","V2","V3","V4"), c("technology","year","word","count"))
techannualwords$word <- as.character(techannualwords$word)
techannualwords$technology <- as.character(techannualwords$technology)
techannualwords$count <- as.numeric(as.character(techannualwords$count))
Freqsannual <- NULL
### individual technology innovations
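# For each word and year, the loop below contrasts the word's count in technology i
# (patentcount, with total pat) against its count in all other technologies
# (anchorcount, with total anch) and computes the standard 2x2 chi-square statistic
#   chi2 = N * (a*d - b*c)^2 / ((a+b) * (c+d) * (a+c) * (b+d))
# with a = anchorcount, b = anch - anchorcount, c = patentcount, d = pat - patentcount
# and N = anch + pat. chissign stores the sign of (a*d - b*c): it is negative when the
# word is relatively more frequent in technology i than in the anchor group, which is
# why only rows with V1 == -1 survive the FA filter further down.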
for(j in 1995:2012) {
  for(i in techn) {
    cat(j, " ", i, sep=" ")
    if(nrow(techannualwords[technology==i & year==j]) > 0) {
      FreqVec <- merge(techannualwords[technology!=i & year==j][,sum(count),by=word], techannualwords[technology==i & year==j], by="word", all.x=TRUE, all.y=TRUE)
      # anchor group: all other technologies; focal group: technology i
      setnames(FreqVec, c("V1","technology","count"), c("anchorcount","technology","patentcount"))
      FreqVec[is.na(anchorcount)]$anchorcount <- 0
      FreqVec[is.na(patentcount)]$patentcount <- 0
      anch <- FreqVec[,sum(anchorcount)]
      pat <- FreqVec[,sum(patentcount)]
      chis <- FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch*pat)*(anch+pat-anchorcount-patentcount)),by=word]
      FreqVec <- cbind(FreqVec, chis)
      names(FreqVec)[ncol(FreqVec)] <- "chis"
      chissign <- FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word]
      FreqVec <- cbind(rep(j,nrow(FreqVec)), FreqVec)
      FreqVec <- cbind(FreqVec, chissign)
      Freqsannual <- rbind(Freqsannual, FreqVec)
      rm(FreqVec)
      gc("free")
    }
  }
}
Freqsannual <- data.table(Freqsannual)
FA <- Freqsannual[technology != "<NA>" & V1 == -1 & chis >= 10 & patentcount >= 3]
setnames(FA, "rep(j, nrow(FreqVec))", "year")
write.csv(FA[,c(1,3,4,5,6,7,8,10),with=F], file="keywords-annual.csv")
write.csv(data.frame(table(FA$word)), file="distinct-keywords.csv")
techannualwords$broadclass <- ""
techannualwords[technology=="trivia"]$broadclass = "notechnology"
techannualwords[technology=="other"]$broadclass = "othertechnology"
techannualwords[broadclass==""]$broadclass = "cleantech"
Freqanytech <- NULL
### cleantech vs. other-technology keywords (same chi-square contrast, on the broad classes)
for(j in 1995:2012) {
  cat(j, " ", sep=" ")
  FreqVec <- merge(techannualwords[broadclass=="othertechnology" & year==j][,sum(count),by=word], techannualwords[broadclass=="cleantech" & year==j][,sum(count),by=word], by="word", all.x=TRUE, all.y=TRUE)
  # anchor group: other technology; focal group: cleantech
  setnames(FreqVec, c("V1.x","V1.y"), c("anchorcount","patentcount"))
  FreqVec[is.na(anchorcount)]$anchorcount <- 0
  FreqVec[is.na(patentcount)]$patentcount <- 0
  anch <- FreqVec[,sum(anchorcount)]
  pat <- FreqVec[,sum(patentcount)]
  chis <- FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch*pat)*(anch+pat-anchorcount-patentcount)),by=word]
  FreqVec <- cbind(FreqVec, chis)
  names(FreqVec)[ncol(FreqVec)] <- "chis"
  chissign <- FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word]
  FreqVec <- cbind(rep(j,nrow(FreqVec)), FreqVec)
  FreqVec <- cbind(FreqVec, chissign)
  Freqanytech <- rbind(Freqanytech, FreqVec)
  rm(FreqVec)
  gc("free")
}
FAY <- Freqanytech[chis >= 5 & patentcount >= 3]
setnames(FAY, "rep(j, nrow(FreqVec))", "year")
FAY$innovationclass <- ""
FAY[V1==1]$innovationclass <- "noncleantech"
FAY[V1==-1]$innovationclass <- "cleantech"
write.csv(FAY[,c(1,2,3,4,6,8,9),with=F], file="innovation-keywords.csv")
write.csv(data.frame(table(FAY$word)), file="general-innovation-keywords.csv")
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-generalinnovation/") | |
for(j in 1995:2012) { | |
fn = paste("innovation-",j,".pdf",sep="") | |
if(nrow(FAY[ year == j ]) > 10) { | |
pdf(fn) | |
wordcloud(FAY[year == j ]$word,sqrt(FAY[year == j]$chis), scale=c(8,.3), min.freq=2) | |
dev.off() | |
} | |
} | |
FACNC <- Freqanytech[chis >= 5 & patentcount >= 3 & V1 == -1]
setnames(FACNC, "rep(j, nrow(FreqVec))", "year")
FACNC$innovationclass <- "cleantech"
write.csv(data.frame(table(FACNC$word)), file="clean-nonclean-keywords.csv")
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-clean-vs-nonclean/")
for(j in 1995:2012) {
  fn = paste("cleannonclean-", j, ".pdf", sep="")
  if(nrow(FACNC[year == j]) > 10) {
    pdf(fn)
    wordcloud(FACNC[year == j & innovationclass=="cleantech"]$word, sqrt(FACNC[year == j & innovationclass=="cleantech"]$chis), scale=c(8,.3), min.freq=2)
    dev.off()
  }
}
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-annual/") | |
for(i in techn) { | |
for(j in 1995:2012) { | |
fn = paste(i,"-",j,".pdf",sep="") | |
if(nrow(FA[technology==i & year == j ]) > 10) { | |
txt <- paste("Technology: ",i," - Number of patents (including prev 5 yrs): ", nrow(PST[TECHNOLOGY==i & year <= j & year >= j-5]), sep="") | |
pdf(fn) | |
wordcloud(FA[technology==i & year == j ]$word,sqrt(FA[technology==i & year == j]$chis), scale=c(8,.3), min.freq=2) | |
mtext(txt,side=1,line=4, col="darkred") | |
dev.off() | |
} | |
} | |
} | |
annualwords <- NULL
for(j in 1994:2012) {
  for(i in techn) {
    cat(paste(i, " ", sep=" "))
    ks <- PST[TECHNOLOGY==i & year==j]$APPLN_TITLE
    if(length(ks) > 0) {
      Bc <- Corpus(VectorSource(ks))
      cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
        removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
      dtm2 <- removeSparseTerms(cable_mat, 0.999)
      dtm2 <- as.matrix(dtm2)
      temp <- as.matrix(cbind(rep(i, ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
      annualwords <- rbind(annualwords, temp)
      gc("free")
    }
  }
}
annualwords <- data.table(annualwords)
setnames(annualwords, c("V1","V2","V3","V4"), c("technology","year","word","count"))
F<-read.csv("keywords.csv") | |
F<-data.table(F) | |
query<-"SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID" | |
P<-data.table(fetch(dbSendQuery(con,query),-1)) | |
T<-P[!is.na(TECHNOLOGY)] | |
P<-P[is.na(TECHNOLOGY)] | |
PS<-P[APPLN_ID %in% sample(P$APPLN_ID,100000)] | |
PST<-rbind(T,PS) | |
PST$ordered<-runif(nrow(PST),0,1) | |
PST<-PST[order(ordered)] | |
PST2 <- PST[APPLN_ID %in% sample(PST$APPLN_ID,30000)] | |
PST3 <- PST | |
PST <- PST2 | |
PST$techlabel <- as.factor(PST$TECHNOLOGY) | |
PST$APPLN_TITLE<-removeWords(PST$APPLN_TITLE,c(preProcess(stopwords()),"one","two","three","four","five","six","seven","eight","nine","ten","twelve","thirteen","fourteen","another","a","an","shot","encounter","dead","including","identified","road","at")) | |
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE) | |
PST[is.na(techlabel)]$techlabel<-"other" | |
PST$label <- as.numeric(PST$techlabel) | |
PST<-PST[order(ordered)] | |
addrow <- nrow(PST)
addrowp = addrow + 1
appendwords <- as.character(F$word)
docs <- PST$APPLN_TITLE
docs <- append(docs, appendwords)
labels <- PST$techlabel
appendlabel <- as.character(F$technology)
labels <- append(labels, appendlabel)
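# The block below is the RTextTools step: `docs` holds the preprocessed patent titles
# followed by the candidate keywords read from keywords.csv, and `labels` holds the
# technology labels for the titles. The titles (rows 1:addrow) form the training set;
# the appended keywords are scored as unlabeled "virgin" test documents by maximum
# entropy and SVM models, and RA then pairs each keyword with its predicted technology
# label and probability.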
doc_matrix <- create_matrix(docs, language="english", removeNumbers=TRUE, stemWords=TRUE, removePunctuation=TRUE, removeSparseTerms=0.99995)
container <- create_container(doc_matrix, labels, trainSize=1:addrow, testSize=addrowp:length(docs), virgin=TRUE)
models <- train_models(container, algorithms=c("MAXENT","SVM"))
results <- classify_models(container, models)
analytics <- create_analytics(container, results)
RA <- cbind(appendwords, results)
RA <- cbind(RA, appendlabel)
RA$SVM_LABEL <- as.numeric(as.character(RA$SVM_LABEL))
RA$MAXENTROPY_LABEL <- as.numeric(as.character(RA$MAXENTROPY_LABEL))
RA$meanprob = (RA$MAXENTROPY_PROB + RA$SVM_PROB)/2
res <- cbind(PST[7001:nrow(PST)], analytics@document_summary)