Created
June 18, 2012 16:12
-
-
Save VikParuchuri/2949153 to your computer and use it in GitHub Desktop.
Generate Sentiment Plot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Terms whose per-period sentiment scores are plotted as individual lines.
term <- c("egypt", "jordan", "israel", "saudi")

# Assemble one numeric row per entry of all_score_frames:
# the matching date_max_list value followed by one score per term.
term_vec <- foreach(i = 1:length(all_score_frames), .combine = rbind) %do% {
  period_scores <- all_score_frames[[i]]
  score_row <- numeric(length(term))
  for (z in seq_along(score_row)) {
    matched <- period_scores[period_scores$term == term[z], "score"]
    # Missing scores and terms absent from this period both count as 0.
    matched[is.na(matched)] <- 0
    if (length(matched) == 0) {
      matched <- 0
    }
    score_row[z] <- round(matched, 5)
  }
  as.numeric(c(date_max_list[i], score_row))
}

# First column is labelled "year"; the remaining columns carry the term names.
term_vec <- as.data.frame(term_vec)
names(term_vec) <- c("year", term)

# Reshape with "year" as the id column so each term becomes a level of
# `variable` (presumably reshape/reshape2 melt -- confirm which is loaded).
term_df <- melt(term_vec, id.vars = "year")

# Mean score over every term in each period; drawn as the thick black line.
term_means <- sapply(all_score_frames, function(x) mean(x$score))

# Single font size shared by every text element of the plot.
text_size <- 40

# NOTE(review): opts()/theme_text() belong to the old ggplot2 API; newer
# releases use theme()/element_text() -- confirm the installed version.
ggplot(data = term_df, aes(x = year, y = value, colour = variable)) +
  geom_line(size = 1) +
  geom_line(
    aes(x = as.numeric(date_max_list), y = term_means),
    colour = "black",
    size = 1.5
  ) +
  ylab("sentiment") +
  opts(
    title = expression("US Sentiment (+/-) Over Time"),
    legend.text = theme_text(size = text_size),
    legend.title = theme_text(size = 0),
    plot.title = theme_text(size = text_size),
    axis.text.y = theme_text(size = text_size),
    axis.text.x = theme_text(size = text_size),
    axis.title.y = theme_text(size = text_size, angle = 90),
    axis.title.x = theme_text(size = text_size),
    legend.key.size = unit(2, "cm")
  )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds all_score_frames: one data frame of term sentiment scores per
# (date_min_list[z], date_max_list[z]) window.
#
# For each window the script
#   1. pulls the cables dated inside the window and samples at most
#      max_cables_to_sample of them,
#   2. lower-cases the text, strips newlines/carriage returns, and passes it
#      through sentDetect() and scan_tokenizer() (presumably sentence splitting
#      and whitespace tokenisation -- confirm against the loaded packages),
#   3. accumulates, for every term in full_term_list, a count vector of length
#      ri_cols built from 5 pseudo-random columns per text unit,
#   4. scores every non-AFINN term as
#      cosine(term, positive AFINN sum) - cosine(term, negative AFINN sum).
#
# Expects db_name, date_min_list, date_max_list, full_term_list and afinn_list
# (columns `word` and `score`) to be defined before this point.
load_or_install(c("RODBC","corpora","ggplot2","tm","foreach","RColorBrewer","wordcloud","lsa","MASS","openNLP"))
channel <- odbcConnect(db_name, uid = "", pwd = "")
all_score_frames <- list()
ri_cols <- 30000
max_cables_to_sample <- 15000

# Loop-invariant regex that matches newlines and carriage returns.
ppatterns <- c("\\n", "\\r")
strip_pattern <- paste("(", paste(ppatterns, collapse = "|"), ")", sep = "")

for (z in seq_along(date_min_list)) {
  date_min <- paste(date_min_list[z], "-01-01", sep = "")
  date_max <- paste(date_max_list[z], "-01-01", sep = "")
  print(date_min)
  cable_frame <- sqlQuery(
    channel,
    paste("SELECT * from cable WHERE date > '", date_min, "' AND date <'", date_max, "'", sep = ""),
    stringsAsFactors = FALSE,
    errors = TRUE
  )

  # sample.int(n, k) draws the same indices as sample(1:n, k) but is also
  # well-defined when the query returns no rows.
  sampled_indices <- sample.int(
    nrow(cable_frame),
    min(max_cables_to_sample, nrow(cable_frame))
  )
  combined <- tolower(gsub(strip_pattern, "", cable_frame$content[sampled_indices]))
  combined <- sentDetect(combined)
  combined <- combined[!is.na(combined)]
  combined <- combined[nchar(combined) > 5]
  tokenized_combined <- lapply(combined, scan_tokenizer)

  # One row per known term, one column per random-index dimension.
  ri_mat <- matrix(0, length(full_term_list), ri_cols)
  rownames(ri_mat) <- full_term_list
  gc()

  for (i in seq_along(combined)) {
    if (i %% 10000 == 0) {
      print(i)
    }
    tokens <- tokenized_combined[[i]]
    tokens <- tokens[nchar(tokens) > 4 & nchar(tokens) < 20]
    tokens <- unique(tokens[tokens %in% full_term_list])

    # Seeding with the unit index makes the 5 chosen columns reproducible.
    # The draw happens before the empty-token guard so the RNG state left
    # behind after the loop does not depend on which units were skipped.
    set.seed(i)
    s_inds <- sample.int(ri_cols, 5)
    if (length(tokens) == 0) {
      next
    }

    # Add 1 to the same 5 columns for every term of this unit. The previous
    # form, ri_mat[tokens, ] + sample_vec, recycled the length-ri_cols vector
    # down the columns of a k x ri_cols sub-matrix, so with k > 1 tokens the
    # 1s landed in different columns for each term.
    ri_mat[tokens, s_inds] <- ri_mat[tokens, s_inds] + 1
  }
  gc()

  # drop = FALSE keeps ri_mat a matrix even when a single row survives;
  # without it the subsets below fail with "incorrect number of dimensions".
  ri_mat <- ri_mat[rowSums(ri_mat) > 0, , drop = FALSE]
  gc()

  # Column sums over the rows of strongly negative / strongly positive AFINN
  # words (score < -2 and score > 2 respectively).
  is_negative <- rownames(ri_mat) %in% afinn_list$word[afinn_list$score < -2]
  is_positive <- rownames(ri_mat) %in% afinn_list$word[afinn_list$score > 2]
  neg_vec <- colSums(ri_mat[is_negative, , drop = FALSE])
  pos_vec <- colSums(ri_mat[is_positive, , drop = FALSE])

  # Only terms that are not themselves AFINN words get scored.
  ri_mat <- ri_mat[!rownames(ri_mat) %in% afinn_list$word, , drop = FALSE]
  neg_scores <- apply(ri_mat, 1, function(x) cosine(x, neg_vec))
  pos_scores <- apply(ri_mat, 1, function(x) cosine(x, pos_vec))
  score_frame <- data.frame(
    term = rownames(ri_mat),
    pos_scores,
    neg_scores,
    score = pos_scores - neg_scores
  )
  sorted_score_frame <- score_frame[order(score_frame$score), ]
  all_score_frames[[z]] <- sorted_score_frame

  # Release the large matrix before the next window allocates a new one.
  rm(ri_mat)
  gc()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Vik,
Thanks for this awesome R analysis. I was implementing it but ran into a lot of errors. I was able to debug most of them, but now I'm stuck on sentiment_score_generation.R, line 83. The error I encountered is:
Error in ri_mat[rownames(ri_mat) %in% afinn_list$word[afinn_list$score < : incorrect number of dimensions. Can you please help me debug this?