Created
September 13, 2019 15:04
-
-
Save MattSandy/717ad43b4ce6d17f1e3140652d460abf to your computer and use it in GitHub Desktop.
Third Democratic Debate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(rvest) | |
library(data.table) | |
library(quanteda) | |
library(magrittr) | |
library(ggthemes) | |
# Address of the Washington Post transcript page, assembled from the site
# root and the article path
url <- paste0(
  "https://www.washingtonpost.com/",
  "politics/2019/09/13/transcript-third-democratic-debate/"
)
# Download the page and keep the text of every <p> that is a direct child of
# an <article> element, one character string per paragraph
transcript <- html_text(html_nodes(read_html(url), "article>p"))
# At a glance
transcript[1:5]
# Find where the first instance of an author occurs
# A paragraph qualifies when it contains a run of capital letters followed by
# a colon (a speaker label such as "WARREN:").
start <- which(str_detect(transcript, '^.*?[A-Z]+:'))[1]
# Without any speaker label `start` is NA, and `NA:length(transcript)` would
# fail with an opaque "NA/NaN argument" error; fail with a clear message
# instead.
if (is.na(start)) {
  stop(
    "No speaker label (capital letters followed by ':') found in transcript.",
    call. = FALSE
  )
}
# Drop the preamble
transcript <- transcript[start:length(transcript)]
# Build a data frame with one row per transcript paragraph and two character
# columns: `author` (the speaker) and `text` (what was said).
# The whole vector is processed at once instead of growing a matrix with
# rbind() inside a loop, which copies the accumulated rows on every pass.
df <- data.frame(
  # Speaker label: the first run of capital letters followed by ":", with
  # the colon stripped; NA when the paragraph carries no label of its own
  author = transcript %>%
    str_extract('[A-Z]+:') %>%
    trimws(whitespace = ":"),
  # The paragraph with everything up to and including the label removed;
  # paragraphs without a label are kept unchanged
  text = transcript %>%
    str_replace('^.*?[A-Z]+:', ''),
  stringsAsFactors = FALSE
) %>%
  # A paragraph without a label belongs to the most recent labelled speaker,
  # so carry the last seen author downwards
  fill(author, .direction = "down")
# Who got called on
# Number of paragraphs attributed to each speaker, largest count first
arrange(
  summarise(group_by(df, author), n = n()),
  -n
)
# Who had the most applause
# Per speaker, count the paragraphs whose text is exactly "(APPLAUSE)",
# sorted with the largest count first
applause <- arrange(
  summarise(
    group_by(filter(df, text == "(APPLAUSE)"), author),
    n = n()
  ),
  -n
)
# Horizontal bar chart; the factor levels are the reversed sort order so the
# speaker with the most applause is drawn at the top after coord_flip()
ggplot(
  applause,
  aes(x = factor(author, levels = rev(applause$author)), y = n)
) +
  geom_col() +
  coord_flip() +
  theme_fivethirtyeight() +
  theme(
    legend.position = "none",
    text = element_text(size = 9)
  ) +
  labs(
    title = "Applause at the Debate",
    subtitle = "Based on (APPLAUSE) Frequency in Transcript",
    caption = "The Third Democratic Debate @appupio"
  )
# Bigrams
# For every speaker, tally how many of their paragraphs contain each bigram
# (bigrams are de-duplicated within a paragraph before counting). Each list
# element is a data frame of bigram, `Freq` and `author`, or NULL when the
# speaker produced no bigrams at all.
bigrams <- lapply(unique(df$author), function(candidate) {
  paragraphs <- df %>%
    filter(author == candidate) %>%
    pull(text)
  grams <- lapply(paragraphs, function(paragraph) {
    # Strip ellipses, tokenise without numbers or punctuation, drop English
    # stop words, then build upper-cased two-token sequences
    cleaned <- str_remove_all(paragraph, '\\.\\.\\.')
    words <- tokens(cleaned, remove_numbers = TRUE, remove_punct = TRUE)
    words <- tokens_select(
      words,
      pattern = stopwords('en'),
      selection = 'remove'
    )
    unique(toupper(tokens_ngrams(words, n = 2)))
  })
  # `table` and `data.frame` are deliberately left in the pipe: the later
  # `select(Gram = '.', ...)` expects the bigram column to be called `.`,
  # which presumably comes from the piped placeholder
  tmp <- grams %>% unlist %>% table %>% data.frame
  if (nrow(tmp) == 0) {
    return(NULL)
  }
  tmp$author <- candidate
  tmp
})
# Label every list element with the speaker it belongs to
names(bigrams) <- unique(df$author)
# Warren's ten highest-count bigrams (ties included), lowest count first
arrange(top_n(bigrams$WARREN, 10, wt = Freq), Freq)
# Bigram Table ------------------------------------------------------------
# Stack the per-speaker tables into one long table with the columns
# Gram, Freq and author
bigram_table <- select(bind_rows(bigrams), Gram = '.', Freq, author)
# Every bigram as a plain vector, ordered by its total count over all
# speakers with the largest total first
top_grams <- local({
  totals <- summarise(group_by(bigram_table, Gram), Freq = sum(Freq))
  ranked <- totals[rev(order(totals$Freq)), "Gram"]
  as.vector(unlist(ranked))
})
top_grams[1:10]
# Clustering --------------------------------------------------------------
# Wide table: one row per bigram among the first 80 of `top_grams`, one
# column per speaker, cells holding that speaker's Freq
cluster_matrix <- bigram_table %>%
  filter(Gram %in% top_grams[1:80]) %>%
  group_by(Gram, author, Freq) %>%
  spread(author, Freq) %>%
  # Drop any grouping left over from group_by() so the indexing below
  # operates on a plain table
  ungroup()
# Bigram/speaker combinations absent from the long table come out of
# spread() as NA; count them as zero
cluster_matrix[is.na(cluster_matrix)] <- 0
# numerical columns (everything except the leading Gram column)
dat <- cluster_matrix[, -1] %>% as.data.frame
row.names(dat) <- cluster_matrix$Gram
# clustering: hierarchical clustering of the rows (bigrams) and of the
# columns (speakers) on the distances between their count profiles
row.order <- hclust(dist(dat))$order
col.order <- hclust(dist(t(dat)))$order
# re-order matrix according to clustering
dat_new <- dat[row.order, col.order, drop = FALSE]
# reshape into dataframe
# Base R is used here instead of melt(): the only melt() on the search path
# is data.table's, which handles a matrix solely through a deprecated
# redirect to reshape2, a package this script never loads.
# as.data.frame() on a table yields one row per cell, with factor columns
# whose levels follow the dimnames, so the clustered order is kept.
cluster_matrix <- as.data.frame(as.table(as.matrix(dat_new)))
names(cluster_matrix) <- c("Gram", "Candidate", "Freq")
# Plotting ----------------------------------------------------------------
# Heat map of the clustered table: speakers on the x axis, bigrams on the
# y axis, tile colour and the white label both showing Freq
ggplot(
  cluster_matrix,
  aes(x = Candidate, y = Gram, fill = Freq, label = Freq)
) +
  geom_tile() +
  scale_fill_viridis_c() +
  geom_text(color = "#FFFFFF", size = 2) +
  theme_fivethirtyeight() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    text = element_text(size = 9)
  ) +
  labs(
    title = "Most Used Bigrams",
    subtitle = "Top 80 Bigrams Selected by Cumulative Use, Stop Words Removed",
    caption = "The Third Democratic Debate @appupio"
  )
# No plot is passed, so ggsave() writes the most recent plot
ggsave("bigram.png", width = 6, height = 12)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment