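# Text-mining pipeline (R): helper functions, MySQL pulls of news abstracts and
# patent titles, chi-square keyword extraction by technology class and year,
# word-cloud output, and a supervised MAXENT/SVM keyword classification step.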
setwd("/Users/thiemo/Dropbox/mafundo/")
is_installed <- function(mypkg) is.element(mypkg, installed.packages()[,1])
load_or_install <- function(package_names) {
  for (package_name in package_names) {
    if (!is_installed(package_name)) {
      install.packages(package_name, repos="http://lib.stat.cmu.edu/R/CRAN")
    }
    library(package_name, character.only=TRUE, quietly=TRUE, verbose=FALSE)
  }
}
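# bottom: return the last few (`rows`) rows of a data frame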
bottom <- function(frame, rows=5) {
  # parentheses matter here: without them, `rows:nrow(frame)` is evaluated first
  frame[(nrow(frame) - rows):nrow(frame), ]
}
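# cbindM: cbind two matrices by row name, padding rows missing from either side with `none`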
cbindM <- function(A, v, none=NA) {
  dif <- setdiff(union(rownames(A), rownames(v)), intersect(rownames(A), rownames(v)))
  # if the row names are the same, a simple cbind will do
  if (length(dif) == 0) {
    A <- cbind(A, v[match(rownames(A), rownames(v))])
    rownames(A) <- names(v)
  } else if (length(dif) > 0) {
    # sets are not equal, so either matrix is longer / shorter;
    # dif tells us which row names are missing from A or from v
    for (i in dif) {
      if (is.element(i, rownames(A))) {
        # row is in A but not in v, so pad v
        temp <- matrix(data=none, nrow=1, ncol=ncol(v), byrow=FALSE, dimnames=list(i))
        v <- rbind(v, temp)
      } else {
        # row is in v but not in A, so pad A
        temp <- matrix(data=none, nrow=1, ncol=ncol(A), byrow=FALSE, dimnames=list(i))
        A <- rbind(A, temp)
      }
    }
    A <- cbind(A, v[match(rownames(A), rownames(v))])
  }
  A
}
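# DTUniqueBy: drop data.table rows that are duplicated on the columns in varvec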
DTUniqueBy <- function(data, varvec) {
  data <- as.data.table(data)
  data[!duplicated(data.frame(data[, varvec, with=F]))]
}
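# mergeString: collapse a character vector into one string (equivalent to paste(txt, collapse=sepr))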
mergeString <- function(txt, sepr=" ") {
  for (i in 1:length(txt)) {
    if (i == 1) {
      str = paste(txt[i], sep=sepr)
    } else {
      str = paste(str, txt[i], sep=sepr)
    }
  }
  str
}
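# preProcess: strip non-alphanumeric characters (optionally keeping periods/commas and digits),
# squeeze whitespace, and optionally lower-case the text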
preProcess <- function(content, keepnum=FALSE, keepperiod=FALSE, tolower=TRUE) {
  if (keepperiod==TRUE) {
    content <- gsub("[^[:alnum:].,]", " ", content)
  } else {
    content <- gsub("[^[:alnum:]]", " ", content)
  }
  content <- gsub("[[:space:]]+", " ", content)
  if (tolower==TRUE) {
    content <- tolower(content)
  }
  #content <- content[!grepl("^(| |\\.)*$", content)]
  content <- gsub("^ *", " ", content)
  if (keepnum==FALSE) {
    content <- gsub("([0-9])*", "", content)
  }
  content <- gsub("^-*", "", content)
  content <- gsub(" +", " ", content)
  content <- gsub(" *$", "", content)
  #content <- content[!grepl("^(| |\\.\\,)*$", content)]
  content <- gsub(" *$", "", content)
  content <- gsub("^ *", "", content)
  content
}
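# wordfreq: term-frequency table for one document; EF is the document id, with optional stemming and stop-word removal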
wordfreq <- function(txt, EF=1, stopws=NULL, stem=FALSE) {
  txt <- unlist(strsplit(txt, " ", fixed = TRUE))
  if (stem==TRUE) {
    txt <- wordStem(txt)
  }
  if (!is.null(stopws))
    txt = txt[!txt %in% stopws]
  txt <- preProcess(txt)
  tab <- sort(table(txt), decreasing = TRUE)
  return(data.frame(docs=EF, terms = names(tab), Freq = tab, row.names = NULL))
}
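# textmat: document-term matrix built by applying wordfreq to a vector of documents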
textmat <- function(vec=A, stpws=NULL) {
  dummy <- mapply(wordfreq, vec, 1:length(vec), MoreArgs=list(stopws=stpws), SIMPLIFY=F)
  names(dummy) <- NULL
  dtm <- t(xtabs(Freq ~ ., data = do.call("rbind", dummy)))
  dtm
}
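# makenumeric: coerce to numeric, replacing anything non-numeric with `replna`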
makenumeric <- function(vec, replna=NA) {
  vec <- as.numeric(vec)
  vec[is.na(vec)] <- replna
  vec
}
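# makeCAPEXnice: transpose the CAPEX table and turn Mar/Jun/Sep/Dec.yy row labels into quarter and year columns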
makeCAPEXnice <- function(P) {
  P2 <- t(P)
  P2 <- data.table(P2)
  P3 <- as.matrix(P2)
  rownames(P3) <- names(P)
  rn <- NULL
  for (i in 1:ncol(P3)) {
    rn[i] <- paste(P3[1,i])
  }
  colnames(P3) <- rn
  P3 <- P3[3:nrow(P3),]
  quarter <- gsub("Mar\\.([0-9]{2})", 1, rownames(P3))
  quarter <- gsub("Jun\\.([0-9]{2})", 2, quarter)
  quarter <- gsub("Sep\\.([0-9]{2})", 3, quarter)
  quarter <- as.numeric(gsub("Dec\\.([0-9]{2})", 4, quarter))
  iyear <- gsub("(Mar|Jun|Sep|Dec\\.)|(Mar|Jun|Sep|Dec)([0-9]{2})", "\\2", rownames(P3))
  iyear <- gsub("\\.", "", iyear)
  iyear <- gsub("(99|98|97|96|95)", "19\\1", iyear)
  iyear <- as.numeric(gsub("(00|01|02|03|04|05|06|07|08|09|10|11|12)", "20\\1", iyear))
  P3 <- apply(P3, 2, function(x) makenumeric(x, 0))
  P3 <- cbind(P3, iyear, quarter)
  P3[, order(colnames(P3))]
}
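# reshapeCSV: reshape a wide state-by-period table into a long STATE_NAME / year / quarter panel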
reshapeCSV <- function(REG, varname="stalled") {
  out <- NULL
  for (i in names(REG)) {
    temp <- NULL
    # one row per observation, labelled with the column (state) name
    temp <- cbind(STATE_NAME=rep(i, nrow(REG)), year=REG$iyear, quarter=REG$quarter, varname=REG[,i,with=F])
    names(temp) <- c("STATE_NAME","year","quarter",varname)
    if (is.null(out)) {
      out <- rbind(temp)
      names(out) <- c("STATE_NAME","year","quarter",varname)
    } else {
      out <- rbind(out, temp)
      names(out) <- c("STATE_NAME","year","quarter",varname)
    }
  }
  data.table(out)
}
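# findMatches: insert a satp_association row for every story matching `key`
# (findMatchesSentence is currently identical to findMatches)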
findMatches <- function(A, key) {
  res <- grep(paste("\\b",key,"\\b",sep=""), A$story)
  sapply(res, function(x) dbSendQuery(con, paste("INSERT INTO satp_association VALUES ('','",A[x]$id,"','",key,"')", sep="")))
}
findMatchesSentence <- function(A, key) {
  res <- grep(paste("\\b",key,"\\b",sep=""), A$story)
  sapply(res, function(x) dbSendQuery(con, paste("INSERT INTO satp_association VALUES ('','",A[x]$id,"','",key,"')", sep="")))
}
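# yearOnyearMaps: choropleth PDF of attack counts by district for one year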
yearOnyearMaps <- function(yr, A, I) {
  A2 <- A[year==yr,.N,by=c("term")]
  names(A2)[2] <- "ATTACKCOUNT"
  o <- match(I@data$DISTRICT, A2$term)
  A2 <- A2[o,]
  row.names(A2) <- row.names(I)
  A3 <- spCbind(I, A2)
  pdf(paste(yr,".pdf",sep=""))
  plot(A3)
  nclr <- 12
  plotclr <- brewer.pal(nclr, "YlOrRd")
  class <- classIntervals(A3@data$ATTACKCOUNT, 8, style="fixed", fixedBreaks=c(0, 1, 4, 8, 16, 32, 64, 128))
  colcode <- findColours(class, plotclr)
  #class <- classIntervals(A3@data$ATTACKCOUNT, nclr, style="equal")
  #colcode <- findColours(class, plotclr)
  plot(A3, col=colcode, add=T)
  title(main=paste("Violence in ",yr,sep=""), sub="Fixed Class Intervals")
  legend("bottomleft", legend=names(attr(colcode, "table")),
         fill=attr(colcode, "palette"), cex=0.6, bty="n")
  dev.off()
}
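# timeVector: sequence of dates between starttime and endtime, or year-quarter strings when timestep="quarters"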
timeVector <- function(starttime, endtime, timestep="months") {
  starttime <- as.POSIXct(strptime(starttime, '%Y-%m-%d'))
  endtime <- as.POSIXct(strptime(endtime, '%Y-%m-%d'))
  if (timestep=="quarters") {
    timestep = "months"
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
    quarter <- gsub("(^[123]{1}$)", 1, month(ret))
    quarter <- gsub("(^[456]{1}$)", 2, quarter)
    quarter <- gsub("(^[789]{1}$)", 3, quarter)
    quarter <- as.numeric(gsub("(^[102]{2}$)", 4, quarter))
    ret <- paste(year(ret), quarter, sep="-")
    ret <- unique(ret)
  } else {
    ret <- seq(from=as.POSIXct(starttime), to=as.POSIXct(endtime), by=timestep)
  }
  ret
}
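# panelStructure: all group x time combinations as a two-column data frame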
panelStructure <- function(group, timevec) {
  tt <- rep(timevec, length(group))
  tt2 <- as.character(sort(rep(group, length(timevec))))
  mat <- cbind("group"=data.frame(tt2), "timevec"=data.frame(tt))
  names(mat) <- c("group","timevec")
  mat
}
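# --- main script -------------------------------------------------------------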
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff")
load_or_install(c("R.oo","stringr","classInt","rgdal", "maptools","XML", "sentiment","plyr","RMySQL","RTextTools", "topicmodels","corpora","ggplot2","tm","tm.plugin.sentiment","foreach","RColorBrewer","wordcloud","lsa","MASS","openNLP","openNLPmodels.en","data.table","depmixS4"))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 3))
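# pull abstracts filed from 1990 onward, shuffle, keep 60,000 non-empty ones, and clean the text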
con <- dbConnect(MySQL(), username="root", password="", dbname="vanderbilt", unix.socket="/tmp/mysql.sock")
query <- "SELECT * FROM `abstracts` WHERE year(start) >= 1990"
A<-data.table(fetch(dbSendQuery(con,query),-1))
A$rand <- runif(nrow(A),0,1)
A<-A[order(rand)]
A<-A[abstract!=""]
A<-A[1:60000]
A$abstract<-preProcess(A$abstract)
A$abstract<-removeWords(A$abstract, c(preProcess(stopwords("en")),"featured scenes shown","report introduced","reported"))
A$abstract<-preProcess(A$abstract)
A<-A[abstract!=""]
A$technology<-"trivia"
A$year<-A[,year(start)]
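# pull patent titles joined to technology classes; keep every classified patent plus a
# 100,000-application sample of unclassified ones (labelled "other"), and clean the titles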
con <- dbConnect(MySQL(), username="root", password="", dbname="googleinsights", unix.socket="/tmp/mysql.sock")
query<-"SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID"
P<-data.table(fetch(dbSendQuery(con,query),-1))
T<-P[!is.na(TECHNOLOGY)]
P<-P[is.na(TECHNOLOGY)]
PS<-P[APPLN_ID %in% sample(P$APPLN_ID,100000)]
PST<-rbind(T,PS)
PST[is.na(TECHNOLOGY)]$TECHNOLOGY<-"other"
PST$year<-PST[,year(APPLN_FILING_DATE)]
techwords <- NULL
techn <- names(table(PST$TECHNOLOGY))
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE)
PST$APPLN_TITLE<-removeWords(PST$APPLN_TITLE,preProcess(stopwords("en")))
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE)
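# 2-3-gram term counts of patent titles, by technology class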
techwords <- NULL
for (i in techn) {
  cat(paste(i," ",sep=" "))
  Bc <- Corpus(VectorSource(PST[TECHNOLOGY==i]$APPLN_TITLE))
  cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
    removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
  dtm2 <- removeSparseTerms(cable_mat, 0.999)
  dtm2 <- as.matrix(dtm2)
  temp <- as.matrix(cbind(rep(i, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
  techwords <- rbind(techwords, temp)
  gc("free")
}
techwords<-data.table(techwords)
setnames(techwords,c("V1","V2","V3"),c("technology","word","count"))
techwords$count<-as.numeric(as.character(techwords$count))
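# the same term counts for the news abstracts, labelled "trivia", appended to techwords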
Bc <- Corpus(VectorSource(A$abstract))
cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30),tokenize = BigramTokenizer))
dtm2<-removeSparseTerms(cable_mat, 0.999)
dtm2 <- as.matrix(dtm2)
temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), colnames(dtm2),colSums(dtm2)))
temp<-data.table(temp)
temp$V3<-as.numeric(as.character(temp$V3))
trivia<-temp
techwords<-rbind(techwords,trivia,use.names=FALSE)
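# rolling 5-year windows: term counts per technology class and year, plus a 1,000-abstract "trivia" baseline per window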
techannualwords<-NULL
for (j in 1995:2012) {
  temp <- A[year<=j & year>=j-5][1:1000]$abstract
  Bc <- Corpus(VectorSource(temp))
  cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
    removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
  dtm2 <- removeSparseTerms(cable_mat, 0.999)
  dtm2 <- as.matrix(dtm2)
  temp <- as.matrix(cbind(rep("trivia", ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
  techannualwords <- rbind(techannualwords, temp)
  for (i in techn) {
    cat(paste(i," ",j,sep=" "))
    ks <- PST[TECHNOLOGY==i & year<=j & year>=j-5]$APPLN_TITLE
    if (length(ks) > 0) {
      Bc <- Corpus(VectorSource(ks))
      cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
        removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
      dtm2 <- removeSparseTerms(cable_mat, 0.999)
      dtm2 <- as.matrix(dtm2)
      temp <- as.matrix(cbind(rep(i, ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
      techannualwords <- rbind(techannualwords, temp)
      gc("free")
    }
  }
}
techannualwords<-data.table(techannualwords)
setnames(techannualwords,c("V1","V2","V3","V4"),c("technology","year","word","count"))
techannualwords$word<-as.character(techannualwords$word)
techannualwords$technology<-as.character(techannualwords$technology)
techannualwords$count<-as.numeric(as.character(techannualwords$count))
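# for each year and technology, score every term with the 2x2 chi-square statistic comparing
# its count in the focal technology (patentcount) against all other technologies (anchorcount);
# chissign records the direction of the association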
Freqsannual <- NULL
###individual technology innovations
for (j in 1995:2012) {
  for (i in techn) {
    cat(j," ",i,sep=" ")
    if (nrow(techannualwords[technology==i & year==j]) > 0) {
      FreqVec <- merge(techannualwords[technology!=i & year==j][,sum(count),by=word],
                       techannualwords[technology==i & year==j],
                       by="word", all.x=TRUE, all.y=TRUE)
      # anchorcount: term count across all other technologies; patentcount: term count in technology i
      setnames(FreqVec, c("V1","technology","count"), c("anchorcount","technology","patentcount"))
      FreqVec[is.na(anchorcount)]$anchorcount <- 0
      FreqVec[is.na(patentcount)]$patentcount <- 0
      anch <- FreqVec[,sum(anchorcount)]
      pat <- FreqVec[,sum(patentcount)]
      chis <- FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch*pat)*(anch+pat-anchorcount-patentcount)),by=word]
      FreqVec <- cbind(FreqVec, chis)
      names(FreqVec)[ncol(FreqVec)] <- "chis"
      chissign <- FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word]
      FreqVec <- cbind(rep(j, nrow(FreqVec)), FreqVec)
      FreqVec <- cbind(FreqVec, chissign)
      Freqsannual <- rbind(Freqsannual, FreqVec)
      rm(FreqVec)
      gc("free")
    }
  }
}
Freqsannual<-data.table(Freqsannual)
FA <- Freqsannual[technology != "<NA>" & V1 == -1 & chis>= 10 & patentcount>=3]
setnames(FA,"rep(j, nrow(FreqVec))","year")
write.csv(FA[,c(1,3,4,5,6,7,8,10),with=F],file="keywords-annual.csv")
write.csv(data.frame(table(FA$word)), file="distinct-keywords.csv")
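# collapse technologies into three broad classes: notechnology (abstracts), othertechnology, and cleantech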
techannualwords$broadclass <- ""
techannualwords[technology=="trivia"]$broadclass = "notechnology"
techannualwords[technology=="other"]$broadclass = "othertechnology"
techannualwords[broadclass==""]$broadclass = "cleantech"
Freqanytech <- NULL
### clean-tech vs. other-technology keywords, by year
for (j in 1995:2012) {
  cat(j," ",sep=" ")
  FreqVec <- merge(techannualwords[broadclass=="othertechnology" & year==j][,sum(count),by=word],
                   techannualwords[broadclass=="cleantech" & year==j][,sum(count),by=word],
                   by="word", all.x=TRUE, all.y=TRUE)
  # anchorcount: term count in other technologies; patentcount: term count in clean tech
  setnames(FreqVec, c("V1.x","V1.y"), c("anchorcount","patentcount"))
  FreqVec[is.na(anchorcount)]$anchorcount <- 0
  FreqVec[is.na(patentcount)]$patentcount <- 0
  anch <- FreqVec[,sum(anchorcount)]
  pat <- FreqVec[,sum(patentcount)]
  chis <- FreqVec[,(anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount))^2)/((anchorcount+patentcount)*(anch*pat)*(anch+pat-anchorcount-patentcount)),by=word]
  FreqVec <- cbind(FreqVec, chis)
  names(FreqVec)[ncol(FreqVec)] <- "chis"
  chissign <- FreqVec[,sign((anch+pat)*((anchorcount*(pat-patentcount) - patentcount*(anch-anchorcount)))),by=word]
  FreqVec <- cbind(rep(j, nrow(FreqVec)), FreqVec)
  FreqVec <- cbind(FreqVec, chissign)
  Freqanytech <- rbind(Freqanytech, FreqVec)
  rm(FreqVec)
  gc("free")
}
FAY <- Freqanytech[chis>= 5 & patentcount>=3]
setnames(FAY,"rep(j, nrow(FreqVec))","year")
FAY$innovationclass <- ""
FAY[V1==1]$innovationclass <- "noncleantech"
FAY[V1==-1]$innovationclass <- "cleantech"
write.csv(FAY[,c(1,2,3,4,6,8,9),with=F],file="innovation-keywords.csv")
# (the table of clean-tech keywords is written to clean-nonclean-keywords.csv below, once FACNC is defined)
write.csv(data.frame(table(FAY$word)), file="general-innovation-keywords.csv")
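# word clouds of general-innovation keywords, one PDF per year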
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-generalinnovation/")
for (j in 1995:2012) {
  fn = paste("innovation-",j,".pdf",sep="")
  if (nrow(FAY[year == j]) > 10) {
    pdf(fn)
    wordcloud(FAY[year == j]$word, sqrt(FAY[year == j]$chis), scale=c(8,.3), min.freq=2)
    dev.off()
  }
}
FACNC <- Freqanytech[chis>= 5 & patentcount>=3 & V1 == -1]
setnames(FACNC, "rep(j, nrow(FreqVec))", "year")
FACNC$innovationclass <- "cleantech"
write.csv(data.frame(table(FACNC$word)), file="clean-nonclean-keywords.csv")
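# word clouds of clean-tech keywords, one PDF per year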
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-clean-vs-nonclean/")
for (j in 1995:2012) {
  fn = paste("cleannonclean-",j,".pdf",sep="")
  if (nrow(FACNC[year == j]) > 10) {
    pdf(fn)
    wordcloud(FACNC[year == j & innovationclass=="cleantech"]$word, sqrt(FACNC[year == j & innovationclass=="cleantech"]$chis), scale=c(8,.3), min.freq=2)
    dev.off()
  }
}
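# word clouds of the annual per-technology keywords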
setwd("/Users/thiemo/Dropbox/mafundo/Googlestuff/Google keywords/keywords-annual/")
for (i in techn) {
  for (j in 1995:2012) {
    fn = paste(i,"-",j,".pdf",sep="")
    if (nrow(FA[technology==i & year == j]) > 10) {
      txt <- paste("Technology: ",i," - Number of patents (including prev 5 yrs): ", nrow(PST[TECHNOLOGY==i & year <= j & year >= j-5]), sep="")
      pdf(fn)
      wordcloud(FA[technology==i & year == j]$word, sqrt(FA[technology==i & year == j]$chis), scale=c(8,.3), min.freq=2)
      mtext(txt, side=1, line=4, col="darkred")
      dev.off()
    }
  }
}
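# term counts per technology and single year (no rolling window)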
annualwords<-NULL
for (j in 1994:2012) {
  for (i in techn) {
    cat(paste(i," ",sep=" "))
    ks <- PST[TECHNOLOGY==i & year==j]$APPLN_TITLE
    if (length(ks) > 0) {
      Bc <- Corpus(VectorSource(ks))
      cable_mat <- DocumentTermMatrix(Bc, control = list(weighting = weightTf,
        removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, 30), tokenize = BigramTokenizer))
      dtm2 <- removeSparseTerms(cable_mat, 0.999)
      dtm2 <- as.matrix(dtm2)
      temp <- as.matrix(cbind(rep(i, ncol(dtm2)), rep(j, ncol(dtm2)), colnames(dtm2), colSums(dtm2)))
      annualwords <- rbind(annualwords, temp)
      gc("free")
    }
  }
}
annualwords<-data.table(annualwords)
setnames(annualwords,c("V1","V2","V3","V4"),c("technology","year","word","count"))
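# supervised classification: train MAXENT and SVM models (RTextTools) on labelled patent titles
# and use them to assign a technology class to each keyword in keywords.csv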
F<-read.csv("keywords.csv")
F<-data.table(F)
query<-"SELECT * FROM `applntitlemerge` a LEFT JOIN patentclass p ON a.APPLN_ID=p.APPLN_ID"
P<-data.table(fetch(dbSendQuery(con,query),-1))
T<-P[!is.na(TECHNOLOGY)]
P<-P[is.na(TECHNOLOGY)]
PS<-P[APPLN_ID %in% sample(P$APPLN_ID,100000)]
PST<-rbind(T,PS)
PST$ordered<-runif(nrow(PST),0,1)
PST<-PST[order(ordered)]
PST2 <- PST[APPLN_ID %in% sample(PST$APPLN_ID,30000)]
PST3 <- PST
PST <- PST2
PST$techlabel <- as.factor(PST$TECHNOLOGY)
PST$APPLN_TITLE<-removeWords(PST$APPLN_TITLE,c(preProcess(stopwords()),"one","two","three","four","five","six","seven","eight","nine","ten","twelve","thirteen","fourteen","another","a","an","shot","encounter","dead","including","identified","road","at"))
PST$APPLN_TITLE<-preProcess(PST$APPLN_TITLE)
PST[is.na(techlabel)]$techlabel<-"other"
PST$label <- as.numeric(PST$techlabel)
PST<-PST[order(ordered)]
addrow<-nrow(PST)
addrowp = addrow+1
appendwords<-as.character(F$word)
docs <- PST$APPLN_TITLE
docs<-append(docs,appendwords)
labels <- PST$techlabel
appendlabel<-as.character(F$technology)
labels<-append(labels,appendlabel)
# the per-keyword prediction table (RA) is assembled below, after classify_models has produced `results`
doc_matrix <- create_matrix(docs,language="english", removeNumbers=TRUE,stemWords=TRUE,removePunctuation=TRUE,removeSparseTerms=0.99995)
container <- create_container(doc_matrix,labels, trainSize=1:addrow, testSize=addrowp:length(docs), virgin=TRUE)
models <- train_models(container, algorithms=c("MAXENT","SVM"))
results <- classify_models(container, models)
analytics <- create_analytics(container, results)
res<-cbind(PST[7001:nrow(PST)],analytics@document_summary)
# combine the appended keywords with their predicted labels and probabilities
RA <- cbind(appendwords, results)
RA <- cbind(RA, appendlabel)
RA$SVM_LABEL <- as.numeric(as.character(RA$SVM_LABEL))
RA$MAXENTROPY_LABEL <- as.numeric(as.character(RA$MAXENTROPY_LABEL))
RA$meanprob <- (RA$MAXENTROPY_PROB + RA$SVM_PROB)/2