Last active
December 12, 2018 23:03
-
-
Save MattSandy/ed96fc71b833092ebaa708c88b6802d6 to your computer and use it in GitHub Desktop.
Use R and Tableau to Analyze Text from Presidential Debate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Transcript source:
# https://www.washingtonpost.com/news/the-fix/wp/2016/09/26/the-first-trump-clinton-presidential-debate-transcript-annotated/
# The CSV is read without a header row; the two columns are named right after.
transcript <- read.csv(
  file = "~/R/reddit/speach/import/transcript.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
names(transcript) <- c("candidate", "statement")
# Build `transcript_melted`: one row per spoken word, with the speaker in
# `candidate` (factor) and the lower-cased token in `word` (character).
#
# The work is vectorised: the previous version appended one row at a time with
# rbind() inside a nested loop (quadratic copying) and iterated over
# 1:nrow(transcript), which misbehaves when the transcript has no rows.

# Strip everything except letters, digits, spaces and apostrophes, then split
# each statement on single spaces into a vector of tokens.
cleaned_statements <- gsub("[^[:alnum:] \']", "", transcript[["statement"]])
statement_words <- strsplit(cleaned_statements, " ")

# Repeat each speaker once per token so both columns line up row for row.
transcript_melted <- data.frame(
  candidate = rep(transcript[["candidate"]], times = lengths(statement_words)),
  word = as.character(unlist(statement_words)),
  stringsAsFactors = FALSE
)

# Drop empty tokens (produced by consecutive or leading spaces) and missing
# values. Tokens cannot contain a space because the split is on " ".
has_word <- !is.na(transcript_melted$word) & transcript_melted$word != ""

# Removes Lester Holt from Candidates
is_candidate <- !is.na(transcript_melted$candidate) &
  transcript_melted$candidate != "LESTER HOLT"

transcript_melted <- transcript_melted[has_word & is_candidate, ]
transcript_melted$word <- tolower(transcript_melted$word)

# Factor levels are exactly the speakers that remain after filtering.
transcript_melted$candidate <- factor(transcript_melted$candidate)
# Frequency of every (candidate, word) pair, most frequent first.
# Columns are Var1 (candidate), Var2 (word) and Freq (count).
summary <- data.frame(table(transcript_melted$candidate, transcript_melted$word))
summary <- summary[order(-summary$Freq), ]
# head() shows at most 200 rows; indexing with 1:200 padded the output with
# all-NA rows whenever the table held fewer than 200 combinations.
print(head(summary, 200), row.names = FALSE)
# Find some differences
# Collect every word that one candidate used more than twice as often as the
# other. `export_table` receives two rows per such word (TRUMP first, then
# CLINTON) with the columns: speaker, word, that speaker's count, and the
# trump/clinton ratio.

# Tally each word per candidate once instead of rescanning the whole data
# frame twice for every word.
spoken_words <- unique(transcript_melted$word)
clinton_counts <- tabulate(
  match(
    transcript_melted$word[which(transcript_melted$candidate == "CLINTON")],
    spoken_words
  ),
  nbins = length(spoken_words)
)
trump_counts <- tabulate(
  match(
    transcript_melted$word[which(transcript_melted$candidate == "TRUMP")],
    spoken_words
  ),
  nbins = length(spoken_words)
)

# A zero count on one side yields Inf, which still counts as "more than
# double". A word said by neither of the two (e.g. only by some other speaker
# label) yields 0/0 = NaN; which() drops those instead of letting an NA reach
# if(), which would stop the script with an error.
lopsided <- which(
  clinton_counts / trump_counts > 2 | trump_counts / clinton_counts > 2
)

# Preallocate the result rather than growing it with rbind() on every hit.
export_table <- matrix(NA_character_, nrow = 2L * length(lopsided), ncol = 4L)
for (k in seq_along(lopsided)) {
  word <- spoken_words[lopsided[k]]
  # grab the counts
  clinton <- clinton_counts[lopsided[k]]
  trump <- trump_counts[lopsided[k]]
  print(word)
  # gets the number of times the word has been said by each candidate
  print(table((transcript_melted[which(transcript_melted$word == word), "candidate"])))
  # fill the pair of rows belonging to this word
  export_table[2L * k - 1L, ] <- c("TRUMP", word, trump, trump / clinton)
  export_table[2L * k, ] <- c("CLINTON", word, clinton, trump / clinton)
}
# Clinton said more
# Print every word that Clinton used more often than Trump, limited to words
# that Trump used at least once, together with the per-candidate counts.

# Tally each word per candidate once instead of rescanning the whole data
# frame twice for every word.
spoken_words <- unique(transcript_melted$word)
clinton_counts <- tabulate(
  match(
    transcript_melted$word[which(transcript_melted$candidate == "CLINTON")],
    spoken_words
  ),
  nbins = length(spoken_words)
)
trump_counts <- tabulate(
  match(
    transcript_melted$word[which(transcript_melted$candidate == "TRUMP")],
    spoken_words
  ),
  nbins = length(spoken_words)
)

# Elementwise & is the right operator here because this is a vector mask; the
# earlier per-word if() used & where the scalar && was called for.
clinton_led <- which(clinton_counts > trump_counts & trump_counts > 0)
for (word in spoken_words[clinton_led]) {
  print(word)
  print(table((transcript_melted[which(transcript_melted$word == word), "candidate"])))
}
# Write both result tables to CSV without a row-name column.
write.csv(
  x = export_table,
  file = "~/R/reddit/speach/export/export_table.csv",
  row.names = FALSE
)
write.csv(
  x = transcript_melted,
  file = "~/R/reddit/speach/export/transcript_melted.csv",
  row.names = FALSE
)
# fun bits
# How many words Trump said relative to Hillary (ratio of total word counts)
trump_total <- length(which(transcript_melted$candidate == "TRUMP"))
clinton_total <- length(which(transcript_melted$candidate == "CLINTON"))
trump_total / clinton_total
# Tremendous
# Per-candidate frequency rows for a few hand-picked words
summary[summary$Var2 %in% "tremendous", ]
summary[summary$Var2 %in% "very", ]
summary[summary$Var2 %in% "important", ]
summary[summary$Var2 %in% "wrong", ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment