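# Text-mining pipeline (R): helper functions, MySQL pulls of news abstracts and
# patent titles, chi-square keyword extraction by technology class and year,
# word-cloud output, and a supervised MAXENT/SVM keyword classification step.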
setwd("/Users/thiemo/Dropbox/mafundo/")
is_installed <- function(mypkg) is.element(mypkg, installed.packages()[,1])
load_or_install <- function(package_names) {
  for (package_name in package_names) {
    if (!is_installed(package_name)) {
      install.packages(package_name, repos="http://lib.stat.cmu.edu/R/CRAN")
    }
    library(package_name, character.only=TRUE, quietly=TRUE, verbose=FALSE)
  }
}
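# bottom: return the last few (`rows`) rows of a data frame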
bottom <- function(frame, rows=5) {
  # parentheses matter here: without them, `rows:nrow(frame)` is evaluated first
  frame[(nrow(frame) - rows):nrow(frame), ]
}
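# cbindM: cbind two matrices by row name, padding rows missing from either side with `none`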
cbindM <- function(A, v, none=NA) {
  dif <- setdiff(union(rownames(A), rownames(v)), intersect(rownames(A), rownames(v)))
  # if the row names are the same, a simple cbind will do
  if (length(dif) == 0) {
    A <- cbind(A, v[match(rownames(A), rownames(v))])
    rownames(A) <- names(v)
  } else if (length(dif) > 0) {
    # sets are not equal, so either matrix is longer / shorter;
    # dif tells us which row names are missing from A or from v
    for (i in dif) {
      if (is.element(i, rownames(A))) {
        # row is in A but not in v, so pad v
        temp <- matrix(data=none, nrow=1, ncol=ncol(v), byrow=FALSE, dimnames=list(i))
        v <- rbind(v, temp)
      } else {
        # row is in v but not in A, so pad A
        temp <- matrix(data=none, nrow=1, ncol=ncol(A), byrow=FALSE, dimnames=list(i))
        A <- rbind(A, temp)
      }
    }
    A <- cbind(A, v[match(rownames(A), rownames(v))])
  }
  A
}
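# DTUniqueBy: drop data.table rows that are duplicated on the columns in varvec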
DTUniqueBy <- function(data, varvec) {
  data <- as.data.table(data)
  data[!duplicated(data.frame(data[, varvec, with=F]))]
}
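# mergeString: collapse a character vector into one string (equivalent to paste(txt, collapse=sepr))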
mergeString <- function(txt, sepr=" ") {
  for (i in 1:length(txt)) {
    if (i == 1) {
      str = paste(txt[i], sep=sepr)
    } else {
      str = paste(str, txt[i], sep=sepr)
    }
  }
  str
}
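# preProcess: strip non-alphanumeric characters (optionally keeping periods/commas and digits),
# squeeze whitespace, and optionally lower-case the text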
preProcess <- function(content, keepnum=FALSE, keepperiod=FALSE, tolower=TRUE) {
  if (keepperiod==TRUE) {
    content <- gsub("[^[:alnum:].,]", " ", content)
  } else {
    content <- gsub("[^[:alnum:]]", " ", content)
  }
  content <- gsub("[[:space:]]+", " ", content)
  if (tolower==TRUE) {
    content <- tolower(content)
  }
  #content <- content[!grepl("^(| |\\.)*$", content)]
  content <- gsub("^ *", " ", content)
  if (keepnum==FALSE) {
    content <- gsub("([0-9])*", "", content)
  }
  content <- gsub("^-*", "", content)
  content <- gsub(" +", " ", content)
  content <- gsub(" *$", "", content)
  #content <- content[!grepl("^(| |\\.\\,)*$", content)]
  content <- gsub(" *$", "", content)
  content <- gsub("^ *", "", content)
  content
}
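# wordfreq: term-frequency table for one document; EF is the document id, with optional stemming and stop-word removal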
wordfreq <- function(txt, EF=1, stopws=NULL, stem=FALSE) {
  txt <- unlist(strsplit(txt, " ", fixed = TRUE))
  if (stem==TRUE) {
    txt <- wordStem(txt)
  }
  if (!is.null(stopws))
    txt = txt[!txt %in% stopws]
  txt <- preProcess(txt)
  tab <- sort(table(txt), decreasing = TRUE)
  return(data.frame(docs=EF, terms = names(tab), Freq = tab, row.names = NULL))
}
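# textmat: document-term matrix built by applying wordfreq to a vector of documents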
textmat <- function(vec=A, stpws=NULL) {
  dummy <- mapply(wordfreq, vec, 1:length(vec), MoreArgs=list(stopws=stpws), SIMPLIFY=F)
  names(dummy) <- NULL
  dtm <- t(xtabs(Freq ~ ., data = do.call("rbind", dummy)))
  dtm
}
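# makenumeric: coerce to numeric, replacing anything non-numeric with `replna`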
makenumeric <- function(vec, replna=NA) {
  vec <- as.numeric(vec)
  vec[is.na(vec)] <- replna
  vec
}
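# makeCAPEXnice: transpose the CAPEX table and turn Mar/Jun/Sep/Dec.yy row labels into quarter and year columns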
makeCAPEXnice <- function(P) {
  P2 <- t(P)
  P2 <- data.table(P2)
  P3 <- as.matrix(P2)
  rownames(P3) <- names(P)
  rn <- NULL
  for (i in 1:ncol(P3)) {
    rn[i] <- paste(P3[1,i])
  }
  colnames(P3) <- rn
  P3 <- P3[3:nrow(P3),]
  quarter <- gsub("Mar\\.([0-9]{2})", 1, rownames(P3))
  quarter <- gsub("Jun\\.([0-9]{2})", 2, quarter)
  quarter <- gsub("Sep\\.([0-9]{2})", 3, quarter)
  quarter <- as.numeric(gsub("Dec\\.([0-9]{2})", 4, quarter))
  iyear <- gsub("(Mar|Jun|Sep|Dec\\.)|(Mar|Jun|Sep|Dec)([0-9]{2})", "\\2", rownames(P3))
  iyear <- gsub("\\.", "", iyear)
  iyear <- gsub("(99|98|97|96|95)", "19\\1", iyear)
  iyear <- as.numeric(gsub("(00|01|02|03|04|05|06|07|08|09|10|11|12)", "20\\1", iyear))
  P3 <- apply(P3, 2, function(x) makenumeric(x, 0))
  P3 <- cbind(P3, iyear, quarter)
  P3[, order(colnames(P3))]
}
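# reshapeCSV: reshape a wide state-by-period table into a long STATE_NAME / year / quarter panel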
reshapeCSV <- function(REG, varname="stalled") {
  out <- NULL
  for (i in names(REG)) {
    temp <- NULL
    # one row per observation, labelled with the column (state) name
    temp <- cbind(STATE_NAME=rep(i, nrow(REG)), year=REG$iyear, quarter=REG$quarter, varname=REG[,i,with=F])
    names(temp) <- c("STATE_NAME","year","quarter",varname)
    if (is.null(out)) {
      out <- rbind(temp)
      names(out) <- c("STATE_NAME","year","quarter",varname)
    } else {
      out <- rbind(out, temp)
      names(out) <- c("STATE_NAME","year","quarter",varname)
    }
  }
  data.table(out)
}
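# findMatches: insert a satp_association row for every story matching `key`
# (findMatchesSentence is currently identical to findMatches)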
findMatches <- function(A, key) {
  res <- grep(paste("\\b",key,"\\b",sep=""), A$story)
  sapply(res, function(x) dbSendQuery(con, paste("INSERT INTO satp_association VALUES ('','",A[x]$id,"','",key,"')", sep="")))
}
findMatchesSentence <- function(A, key) {
  res <- grep(paste("\\b",key,"\\b",sep=""), A$story)
  sapply(res, function(x) dbSendQuery(con, paste("INSERT INTO satp_association VALUES ('','",A[x]$id,"','",key,"')", sep="")))
}
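# yearOnyearMaps: choropleth PDF of attack counts by district for one year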
yearOnyearMaps <- function(yr, A, I) {
  A2 <- A[year==yr,.N,by=c("term")]
  names(A2)[2] <- "ATTACKCOUNT"
  o <- match(I@data$DISTRICT, A2$term)
  A2 <- A2[o,]
  row.names(A2) <- row.names(I)
  A3 <- spCbind(I, A2)
  pdf(paste(yr,".pdf",sep=""))
  plot(A3)
  nclr <- 12
  plotclr <- brewer.pal(nclr, "YlOrRd")
  class <- classIntervals(A3@data$ATTACKCOUNT, 8, style="fixed", fixedBreaks=c(0, 1, 4, 8, 16, 32, 64, 128))
  colcode <- findColours(class, plotclr)
  #class <- classIntervals(A3@data$ATTACKCOUNT, nclr, style="equal")
  #colcode <- findColours(class, plotclr)
  plot(A3, col=colcode, add=T)
  title(main=paste("Violence in ",yr,sep=""), sub="Fixed Class Intervals")
  legend("bottomleft", legend=names(attr(colcode, "table")),
         fill=attr(colcode, "palette"), cex=0.6, bty="n")
  dev.off()
}
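# timeVector: sequence of dates between starttime and endtime, or year-quarter strings when timestep="quarters"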
timeVector <- function(starttime, endtime, timestep="months") {
  starttime <- as.POSIXct(strptime(starttime, '%Y-%m-%d'))
  endtime <- as.POSIXct(strptime(endtime, '%Y-%m-%d'))
  if (timestep=="quarters") {
    timestep = "months"
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
    quarter <- gsub("(^[123]{1}$)", 1, month(ret))
    quarter <- gsub("(^[456]{1}$)", 2, quarter)
    quarter <- gsub("(^[789]{1}$)", 3, quarter)
    quarter <- as.numeric(gsub("(^[102]{2}$)", 4, quarter))
    ret <- paste(year(ret), quarter, sep="-")
    ret <- unique(ret)
  } else {
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
  }
  ret
}
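# panelStructure: all group x time combinations as a two-column data frame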
panelStructure <- function(group, timevec) {
  tt <- rep(timevec, length(group))
  tt2 <- as.character(sort(rep(group, length(timevec))))
  mat <- cbind("group"=data.frame(tt2), "timevec"=data.frame(tt))
  names(mat) <- c("group","timevec")
  mat
}
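# --- main script -------------------------------------------------------------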
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff")
load_or_install(c("R.oo","stringr","classInt","rgdal", "maptools","XML", "sentiment","plyr","RMySQL","RTextTools", "topicmodels","corpora","ggplot2","tm","tm.plugin.sentiment","foreach","RColorBrewer","wordcloud","lsa","MASS","openNLP","openNLPmodels.en","data.table","depmixS4"))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 3))
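# pull abstracts filed from 1990 onward, shuffle, keep 60,000 non-empty ones, and clean the text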
con <- dbConnect(MySQL(), username="root", password="", dbname="vanderbilt", unix.socket="/tmp/mysql.sock")
query <- "SELECT * FROM `abstracts` WHERE year(start) >= 1990"
A<-data.table(fetch(dbSendQuery(con,query),-1))
A$rand <- runif(nrow(A),0,1)
A<-A[order(rand)]
A<-A[abstract!=""]
A<-A[1:60000]
A$abstract<-preProcess(A$abstract)
A$abstract<-removeWords(A$abstract, c(preProcess(stopwords("en")),"featured scenes shown","report introduced","reported"))
A$abstract<-preProcess(A$abstract)
A<-A[abstract!=""]
A$technology<-"trivia"
A$year<-A[,year(start)]
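# pull patent titles joined to technology classes; keep every classified patent plus a
# 100,000-application sample of unclassified ones (labelled "other"), and clean the titles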
con <- dbConnect(MySQL(), username="root", password="", dbname="googleinsights", unix.socket="/tmp/mysql.sock")
query<-"SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID"
P<-data.table(fetch(dbSendQuery(con,query),-1))
T<-P[!is.na(TECHNOLOGY)]
P<-P[is.na(TECHNOLOGY)]
PS<-P[APPLN_ID %in% sample(P$APPLN_ID,100000)]
PST<-rbind(T,PS)
PST[is.na(TECHNOLOGY)]$TECHNOLOGY<-"other"
PST$year<-PST[,year(APPLN_FILING_DATE)]
techwords <- NULL
techn <- names(table(PST$TECHNOLOGY))
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE)
PST$APPLN_TITLE<-removeWords(PST$APPLN_TITLE,preProcess(stopwords("en")))
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE)
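# 2-3-gram term counts of patent titles, by technology class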
techwords <- NULL
for (i in techn) {
  cat(paste(i," ",sep=" "))
  Bc <- Corpus(VectorSource(PST[TECHNOLOGY==i]$APPLN_TITLE))
  cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
    removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
  dtm2 <- removeSparseTerms(cable_mat, 0.999)
  dtm2 <- as.matrix(dtm2)
  temp <- as.matrix(cbind(rep(i, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
  techwords <- rbind(techwords, temp)
  gc("free")
}
techwords<-data.table(techwords)
setnames(techwords,c("V1","V2","V3"),c("technology","word","count"))
techwords$count<-as.numeric(as.character(techwords$count))
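# the same term counts for the news abstracts, labelled "trivia", appended to techwords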
Bc <- Corpus(VectorSource(A$abstract))
cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer))
dtm2<-removeSparseTerms(cable_mat, 0.999)
dtm2 <- as.matrix(dtm2)
temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), colnames(dtm2),colSums(dtm2)))
temp<-data.table(temp)
temp$V3<-as.numeric(as.character(temp$V3))
trivia<-temp
techwords<-rbind(techwords,trivia,use.names=FALSE)
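# rolling 5-year windows: term counts per technology class and year, plus a 1,000-abstract "trivia" baseline per window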
techannualwords<-NULL
for (j in 1995:2012) {
  temp <- A[year<=j & year>=j-5][1:1000]$abstract
  Bc <- Corpus(VectorSource(temp))
  cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
    removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
  dtm2 <- removeSparseTerms(cable_mat, 0.999)
  dtm2 <- as.matrix(dtm2)
  temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
  techannualwords <- rbind(techannualwords, temp)
  for (i in techn) {
    cat(paste(i," ",j,sep=" "))
    ks <- PST[TECHNOLOGY==i & year<=j & year>=j-5]$APPLN_TITLE
    if (length(ks) > 0) {
      Bc <- Corpus(VectorSource(ks))
      cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
        removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
      dtm2 <- removeSparseTerms(cable_mat, 0.999)
      dtm2 <- as.matrix(dtm2)
      temp <- as.matrix(cbind(rep(i, ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
      techannualwords <- rbind(techannualwords, temp)
      gc("free")
    }
  }
}
techannualwords<-data.table(techannualwords)
setnames(techannualwords,c("V1","V2","V3","V4"),c("technology","year","word","count"))
techannualwords$word<-as.character(techannualwords$word)
techannualwords$technology<-as.character(techannualwords$technology)
techannualwords$count<-as.numeric(as.character(techannualwords$count))
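# for each year and technology, score every term with the 2x2 chi-square statistic comparing
# its count in the focal technology (patentcount) against all other technologies (anchorcount);
# chissign records the direction of the association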
Freqsannual <- NULL
###individual technology innovations
for (j in 1995:2012) {
  for (i in techn) {
    cat(j," ",i,sep=" ")
    if (nrow(techannualwords[technology==i & year==j]) > 0) {
      FreqVec <- merge(techannualwords[technology!=i & year==j][,sum(count),by=word],
                       techannualwords[technology==i & year==j],
                       by="word", all.x=TRUE, all.y=TRUE)
      # anchorcount: term count across all other technologies; patentcount: term count in technology i
      setnames(FreqVec, c("V1","technology","count"), c("anchorcount","technology","patentcount"))
      FreqVec[is.na(anchorcount)]$anchorcount <- 0
      FreqVec[is.na(patentcount)]$patentcount <- 0
      anch <- FreqVec[,sum(anchorcount)]
      pat <- FreqVec[,sum(patentcount)]
      chis <- FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch*pat)*(anch+pat-anchorcount-patentcount)),by=word]
      FreqVec <- cbind(FreqVec, chis)
      names(FreqVec)[ncol(FreqVec)] <- "chis"
      chissign <- FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word]
      FreqVec <- cbind(rep(j, nrow(FreqVec)), FreqVec)
      FreqVec <- cbind(FreqVec, chissign)
      Freqsannual <- rbind(Freqsannual, FreqVec)
      rm(FreqVec)
      gc("free")
    }
  }
}
Freqsannual<-data.table(Freqsannual)
FA <- Freqsannual[technology != "<NA>" & V1 == -1 & chis>= 10 & patentcount>=3]
setnames(FA,"rep(j, nrow(FreqVec))","year")
write.csv(FA[,c(1,3,4,5,6,7,8,10),with=F],file="keywords-annual.csv")
write.csv(data.frame(table(FA$word)), file="distinct-keywords.csv")
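# collapse technologies into three broad classes: notechnology (abstracts), othertechnology, and cleantech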
techannualwords$broadclass <- ""
techannualwords[technology=="trivia"]$broadclass = "notechnology"
techannualwords[technology=="other"]$broadclass = "othertechnology"
techannualwords[broadclass==""]$broadclass = "cleantech"
Freqanytech <- NULL
### clean-tech vs. other-technology keywords, by year
for (j in 1995:2012) {
  cat(j," ",sep=" ")
  FreqVec <- merge(techannualwords[broadclass=="othertechnology" & year==j][,sum(count),by=word],
                   techannualwords[broadclass=="cleantech" & year==j][,sum(count),by=word],
                   by="word", all.x=TRUE, all.y=TRUE)
  # anchorcount: term count in other technologies; patentcount: term count in clean tech
  setnames(FreqVec, c("V1.x","V1.y"), c("anchorcount","patentcount"))
  FreqVec[is.na(anchorcount)]$anchorcount <- 0
  FreqVec[is.na(patentcount)]$patentcount <- 0
  anch <- FreqVec[,sum(anchorcount)]
  pat <- FreqVec[,sum(patentcount)]
  chis <- FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch*pat)*(anch+pat-anchorcount-patentcount)),by=word]
  FreqVec <- cbind(FreqVec, chis)
  names(FreqVec)[ncol(FreqVec)] <- "chis"
  chissign <- FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word]
  FreqVec <- cbind(rep(j, nrow(FreqVec)), FreqVec)
  FreqVec <- cbind(FreqVec, chissign)
  Freqanytech <- rbind(Freqanytech, FreqVec)
  rm(FreqVec)
  gc("free")
}
FAY <- Freqanytech[chis>= 5 & patentcount>=3]
setnames(FAY,"rep(j, nrow(FreqVec))","year")
FAY$innovationclass <- ""
FAY[V1==1]$innovationclass <- "noncleantech"
FAY[V1==-1]$innovationclass <- "cleantech"
write.csv(FAY[,c(1,2,3,4,6,8,9),with=F],file="innovation-keywords.csv")
# (the table of clean-tech keywords is written to clean-nonclean-keywords.csv below, once FACNC is defined)
write.csv(data.frame(table(FAY$word)), file="general-innovation-keywords.csv")
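# word clouds of general-innovation keywords, one PDF per year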
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-generalinnovation/")
for (j in 1995:2012) {
  fn = paste("innovation-",j,".pdf",sep="")
  if (nrow(FAY[year == j]) > 10) {
    pdf(fn)
    wordcloud(FAY[year == j]$word, sqrt(FAY[year == j]$chis), scale=c(8,.3), min.freq=2)
    dev.off()
  }
}
FACNC <- Freqanytech[chis>= 5 & patentcount>=3 & V1 == -1]
setnames(FACNC, "rep(j, nrow(FreqVec))", "year")
FACNC$innovationclass <- "cleantech"
write.csv(data.frame(table(FACNC$word)), file="clean-nonclean-keywords.csv")
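# word clouds of clean-tech keywords, one PDF per year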
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-clean-vs-nonclean/")
for (j in 1995:2012) {
  fn = paste("cleannonclean-",j,".pdf",sep="")
  if (nrow(FACNC[year == j]) > 10) {
    pdf(fn)
    wordcloud(FACNC[year == j & innovationclass=="cleantech"]$word, sqrt(FACNC[year == j & innovationclass=="cleantech"]$chis), scale=c(8,.3), min.freq=2)
    dev.off()
  }
}
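# word clouds of the annual per-technology keywords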
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-annual/")
for (i in techn) {
  for (j in 1995:2012) {
    fn = paste(i,"-",j,".pdf",sep="")
    if (nrow(FA[technology==i & year == j]) > 10) {
      txt <- paste("Technology: ",i," - Number of patents (including prev 5 yrs): ", nrow(PST[TECHNOLOGY==i & year <= j & year >= j-5]), sep="")
      pdf(fn)
      wordcloud(FA[technology==i & year == j]$word, sqrt(FA[technology==i & year == j]$chis), scale=c(8,.3), min.freq=2)
      mtext(txt, side=1, line=4, col="darkred")
      dev.off()
    }
  }
}
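# term counts per technology and single year (no rolling window)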
annualwords<-NULL
for (j in 1994:2012) {
  for (i in techn) {
    cat(paste(i," ",sep=" "))
    ks <- PST[TECHNOLOGY==i & year==j]$APPLN_TITLE
    if (length(ks) > 0) {
      Bc <- Corpus(VectorSource(ks))
      cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
        removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
      dtm2 <- removeSparseTerms(cable_mat, 0.999)
      dtm2 <- as.matrix(dtm2)
      temp <- as.matrix(cbind(rep(i, ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
      annualwords <- rbind(annualwords, temp)
      gc("free")
    }
  }
}
annualwords<-data.table(annualwords)
setnames(annualwords,c("V1","V2","V3","V4"),c("technology","year","word","count"))
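# supervised classification: train MAXENT and SVM models (RTextTools) on labelled patent titles
# and use them to assign a technology class to each keyword in keywords.csv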
F<-read.csv("keywords.csv")
F<-data.table(F)
query<-"SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID"
P<-data.table(fetch(dbSendQuery(con,query),-1))
T<-P[!is.na(TECHNOLOGY)]
P<-P[is.na(TECHNOLOGY)]
PS<-P[APPLN_ID %in% sample(P$APPLN_ID,100000)]
PST<-rbind(T,PS)
PST$ordered<-runif(nrow(PST),0,1)
PST<-PST[order(ordered)]
PST2 <- PST[APPLN_ID %in% sample(PST$APPLN_ID,30000)]
PST3 <- PST
PST <- PST2
PST$techlabel <- as.factor(PST$TECHNOLOGY)
PST$APPLN_TITLE<-removeWords(PST$APPLN_TITLE,c(preProcess(stopwords()),"one","two","three","four","five","six","seven","eight","nine","ten","twelve","thirteen","fourteen","another","a","an","shot","encounter","dead","including","identified","road","at"))
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE)
PST[is.na(techlabel)]$techlabel<-"other"
PST$label <- as.numeric(PST$techlabel)
PST<-PST[order(ordered)]
addrow<-nrow(PST)
addrowp = addrow+1
appendwords<-as.character(F$word)
docs <- PST$APPLN_TITLE
docs<-append(docs,appendwords)
labels <- PST$techlabel
appendlabel<-as.character(F$technology)
labels<-append(labels,appendlabel)
# the per-keyword prediction table (RA) is assembled below, after classify_models has produced `results`
doc_matrix <- create_matrix(docs,language="english", removeNumbers=TRUE,stemWords=TRUE,removePunctuation=TRUE,removeSparseTerms=0.99995)
container <- create_container(doc_matrix,labels, trainSize=1:addrow, testSize=addrowp:length(docs), virgin=TRUE)
models <- train_models(container, algorithms=c("MAXENT","SVM"))
results <- classify_models(container, models)
analytics <- create_analytics(container, results)
res<-cbind(PST[7001:nrow(PST)],analytics@document_summary)
# combine the appended keywords with their predicted labels and probabilities
RA <- cbind(appendwords, results)
RA <- cbind(RA, appendlabel)
RA$SVM_LABEL <- as.numeric(as.character(RA$SVM_LABEL))
RA$MAXENTROPY_LABEL <- as.numeric(as.character(RA$MAXENTROPY_LABEL))
RA$meanprob <- (RA$MAXENTROPY_PROB + RA$SVM_PROB)/2