Skip to content

Instantly share code, notes, and snippets.

@MattSandy
Created September 13, 2019 15:04
Show Gist options
  • Save MattSandy/717ad43b4ce6d17f1e3140652d460abf to your computer and use it in GitHub Desktop.
Save MattSandy/717ad43b4ce6d17f1e3140652d460abf to your computer and use it in GitHub Desktop.
Third Democratic Debate
library(tidyverse)
library(rvest)
library(data.table)
library(quanteda)
library(magrittr)
library(ggthemes)
# Address of the Washington Post transcript, split in two for readability
url <- paste0(
  "https://www.washingtonpost.com/",
  "politics/2019/09/13/transcript-third-democratic-debate/"
)
# Download the page and keep the text of every <p> that is a direct child of
# the <article> element
transcript <- html_text(html_nodes(read_html(url), "article>p"))
# At a glance
transcript[seq_len(5)]
# Index of the first paragraph that contains a speaker label, i.e. a run of
# capital letters immediately followed by a colon
start <- which(str_detect(transcript, '^.*?[A-Z]+:'))[1]
# Drop the preamble: keep everything from that paragraph to the end
transcript <- transcript[seq(start, length(transcript))]
# Split every paragraph into a speaker label and the remaining text.
#
# A paragraph containing a run of capital letters followed by a colon
# ("WARREN:") names its speaker; the label and anything before it are removed
# from the text. A paragraph without a label is a continuation and inherits the
# most recent speaker, which fill() carries downwards.
#
# The whole vector is processed at once instead of growing a matrix with
# rbind() inside a loop, so nothing depends on a loop-carried `author`
# variable, and both columns are plain character vectors on every R version.
# Paragraphs that precede the first label keep an NA author.
#
# NOTE(review): '[A-Z]+:' only matches unbroken capital letters, so a label
# such as "O'ROURKE:" would be recorded as "ROURKE" - confirm against the
# transcript before widening the pattern.
df <- data.frame(paragraph = transcript, stringsAsFactors = FALSE) %>%
  transmute(
    author = paragraph %>%
      str_extract('[A-Z]+:') %>%
      trimws(whitespace = ":"),
    text = str_replace(paragraph, '^.*?[A-Z]+:', '')
  ) %>%
  tidyr::fill(author)
# Who got called on: number of paragraphs attributed to each speaker,
# largest count first
df %>%
  group_by(author) %>%
  tally() %>%
  arrange(desc(n))
# Who had the most applause: count the paragraphs whose text is exactly
# "(APPLAUSE)", per speaker they are attributed to, largest count first
applause <- df %>%
  filter(text == "(APPLAUSE)") %>%
  group_by(author) %>%
  tally() %>%
  arrange(desc(n))
# Horizontal bar chart; the factor levels are reversed so that, after the
# flip, the speaker with the most applause is drawn at the top
ggplot(
  applause,
  aes(x = factor(author, levels = rev(applause$author)), y = n)
) +
  geom_col() +
  coord_flip() +
  theme_fivethirtyeight() +
  theme(
    legend.position = "none",
    text = element_text(size = 9)
  ) +
  labs(
    title = "Applause at the Debate",
    subtitle = "Based on (APPLAUSE) Frequency in Transcript",
    caption = "The Third Democratic Debate @appupio"
  )
# Bigrams
# One frequency table per speaker. Each paragraph contributes every distinct
# upper-cased bigram it contains once, so Freq is the number of that speaker's
# paragraphs in which the bigram occurs. Speakers with no bigrams yield NULL.
bigrams <- lapply(unique(df$author), function(candidate) {
  paragraphs <- filter(df, author == candidate)[["text"]]
  grams_by_paragraph <- lapply(paragraphs, function(text) {
    # Strip ellipses, tokenise without numbers and punctuation, drop English
    # stop words, then pair up the remaining tokens
    cleaned <- str_remove_all(text, '\\.\\.\\.')
    toks <- tokens(cleaned, remove_numbers = TRUE, remove_punct = TRUE)
    toks <- tokens_select(toks, pattern = stopwords('en'), selection = 'remove')
    unique(toupper(tokens_ngrams(toks, n = 2)))
  })
  # Keep table() on the right of a pipe: its input is then called ".", which
  # becomes the name of the bigram column in the resulting data frame
  tmp <- grams_by_paragraph %>% unlist %>% table %>% data.frame
  if (nrow(tmp) == 0) {
    return(NULL)
  }
  tmp$author <- candidate
  tmp
})
names(bigrams) <- unique(df$author)
# Ten most frequent bigrams for one speaker (ties included), ascending
bigrams$WARREN %>%
  top_n(10, wt = Freq) %>%
  arrange(Freq)
# Bigram Table ------------------------------------------------------------
# Stack the per-speaker tables into one long table and give the "." column
# produced by table() a proper name
bigram_table <- bind_rows(bigrams) %>%
  select(Gram = '.', Freq, author)
# All bigrams as a plain vector, ranked by their frequency summed over every
# speaker, highest first
top_grams <- bigram_table %>%
  group_by(Gram) %>%
  summarise(Freq = sum(Freq)) %>%
  {
    as.vector(unlist(.[rev(order(.$Freq)), "Gram"]))
  }
top_grams[seq_len(10)]
# Clustering --------------------------------------------------------------
# Wide table for the 80 most used bigrams: one row per bigram, one column per
# speaker. `fill = 0` gives a bigram a speaker never used a count of 0 rather
# than NA, and head() avoids padding the selection with NA when fewer than 80
# bigrams exist.
cluster_matrix <- bigram_table %>%
  filter(Gram %in% head(top_grams, 80)) %>%
  spread(author, Freq, fill = 0)
# numerical columns only, with the bigram as the row label
dat <- cluster_matrix %>%
  select(-Gram) %>%
  as.data.frame
row.names(dat) <- cluster_matrix$Gram
# clustering: order bigrams (rows) and speakers (columns) so that similar
# count profiles end up next to each other
row.order <- hclust(dist(dat))$order
col.order <- hclust(dist(t(dat)))$order
# re-order matrix according to clustering; drop = FALSE keeps the
# two-dimensional shape
dat_new <- dat[row.order, col.order, drop = FALSE]
# reshape into a long data frame with base R. data.table's melt() has no
# matrix method and only redirects to reshape2, which this script never
# loads. as.data.frame() on a table returns one row per cell, with factor
# levels in the clustered row/column order, so the plot axes keep that order.
cluster_matrix <- as.data.frame(as.table(as.matrix(dat_new)))
names(cluster_matrix) <- c("Gram", "Candidate", "Freq")
# Plotting ----------------------------------------------------------------
# Heat map with one tile per bigram/speaker pair, coloured by the count and
# labelled with it in white
ggplot(
  cluster_matrix,
  aes(x = Candidate, y = Gram, fill = Freq, label = Freq)
) +
  geom_tile() +
  scale_fill_viridis_c() +
  geom_text(color = "#FFFFFF", size = 2) +
  theme_fivethirtyeight() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",
    text = element_text(size = 9)
  ) +
  labs(
    title = "Most Used Bigrams",
    subtitle = "Top 80 Bigrams Selected by Cumulative Use, Stop Words Removed",
    caption = "The Third Democratic Debate @appupio"
  )
# Write the most recent plot to disk
ggsave(filename = "bigram.png", width = 6, height = 12)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment