MattSandy · September 13, 2019 15:04
diff --git a/run.R b/run.R
 library(tidyverse)
 library(rvest)
 library(data.table)
 library(quanteda)
 library(magrittr)
 library(ggthemes)

 # Address of the Washington Post page holding the debate transcript
 url <- "https://www.washingtonpost.com/" %>%
   paste0("politics/2019/09/13/transcript-third-democratic-debate/")

 # Download the page and keep the text of every <p> that is a direct child
 # of <article>
 transcript <- url %>%
   read_html() %>%
   html_nodes("article>p") %>%
   html_text()

 # At a glance (head() does not pad with NA when fewer than five paragraphs
 # were scraped)
 head(transcript, 5)

 # Find where the first instance of an author occurs, i.e. the first
 # paragraph containing an upper-case tag followed by a colon ("NAME:")
 start <- which(str_detect(transcript, "^.*?[A-Z]+:"))[1]

 # Without any speaker tag `start` is NA and the subsetting below would fail
 # with an unhelpful "NA/NaN argument" error, so stop with a clear message
 if (is.na(start)) {
   stop("No speaker tag found in the scraped transcript; ",
        "the page layout may have changed.", call. = FALSE)
 }

 # Drop the preamble
 transcript <- transcript[start:length(transcript)]

 # Split every paragraph into a speaker tag and the remaining text in one
 # vectorised pass instead of rbind()-ing a matrix row by row (which copies
 # the whole matrix on every iteration).
 #   author: the first "NAME:" tag found in the paragraph, colon stripped;
 #           NA when the paragraph carries no tag
 #   text:   the paragraph with everything up to and including the tag removed
 # fill() then carries the last seen speaker forward onto untagged paragraphs,
 # which is what the original loop did through its `author` variable.
 df <- data.frame(paragraph = transcript, stringsAsFactors = FALSE) %>%
   transmute(
     author = paragraph %>%
       str_extract("[A-Z]+:") %>%
       trimws(whitespace = ":"),
     text = str_replace(paragraph, "^.*?[A-Z]+:", "")
   ) %>%
   fill(author, .direction = "down")

 # Who got called on: number of paragraphs attributed to each speaker,
 # largest count first
 df %>%
   as_tibble() %>%
   count(author, sort = TRUE)

 # Who had the most applause: only paragraphs whose text is exactly
 # "(APPLAUSE)" are counted, per speaker, largest count first
 applause <- df %>%
   as_tibble() %>%
   filter(text == "(APPLAUSE)") %>%
   count(author, sort = TRUE)


 # Bar chart of the applause counts; the factor levels are the reversed
 # row order of `applause`, and coord_flip() turns the bars horizontal
 ggplot(applause, aes(x = factor(author, levels = rev(applause$author)), y = n)) +
   geom_col() +
   coord_flip() +
   theme_fivethirtyeight() +
   theme(text = element_text(size = 9),
         legend.position = "none") +
   labs(caption = "The Third Democratic Debate @appupio",
        subtitle = "Based on (APPLAUSE) Frequency in Transcript",
        title = "Applause at the Debate")

 # Bigrams
 # Builds one frequency table per speaker. The list is named by speaker up
 # front, so each result (or NULL) lands under the speaker's name.
 bigrams <- lapply(setNames(nm = unique(df$author)), function(candidate) {
   paragraphs <- df %>% filter(author == candidate) %>% .[["text"]]
   counts <- lapply(paragraphs, function(paragraph) {
     # Upper-cased bigrams of a single paragraph after removing "...",
     # numbers, punctuation and English stop words; unique() means a bigram
     # is counted at most once per paragraph
     paragraph %>%
       str_remove_all('\\.\\.\\.') %>%
       tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
       tokens_select(pattern = stopwords('en'), selection = 'remove') %>%
       tokens_ngrams(n = 2) %>%
       toupper %>%
       unique
   }) %>% unlist %>% table %>% data.frame
   # Piping into table() is deliberate: it names the label column "."
   if (nrow(counts) == 0) {
     return(NULL)
   }
   counts$author <- candidate
   counts
 })
 # Ten most frequent bigrams for WARREN (ties kept), ascending by count
 arrange(top_n(bigrams$WARREN, 10, wt = Freq), Freq)

 # Bigram Table ------------------------------------------------------------

 # Stack the per-speaker tables into one data frame; the column named "."
 # is renamed to Gram
 bigram_table <- select(bind_rows(bigrams), Gram = '.', Freq, author)

 # All bigrams as a plain character vector, ranked by their summed Freq
 # across speakers, highest first
 top_grams <- bigram_table %>%
   group_by(Gram) %>%
   summarise(Freq = sum(Freq)) %>%
   { as.character(.$Gram[rev(order(.$Freq))]) }
 top_grams[seq_len(10)]


 # Clustering --------------------------------------------------------------

 # Wide table: one row per top-80 bigram, one column per speaker, and 0 where
 # a speaker has no count for the bigram. pivot_wider() replaces the
 # superseded spread(); values_fill supplies the zeros directly, so no
 # blanket NA overwrite of the whole tibble is needed afterwards.
 cluster_matrix <- bigram_table %>%
   filter(Gram %in% head(top_grams, 80)) %>%
   pivot_wider(names_from = author, values_from = Freq,
               values_fill = list(Freq = 0L), names_sort = TRUE) %>%
   arrange(Gram) # deterministic row order going into the clustering
 # numerical columns
 dat <- cluster_matrix %>% select(-Gram) %>% as.data.frame
 row.names(dat) <- as.character(cluster_matrix$Gram)
 # clustering
 row.order <- hclust(dist(dat))$order
 col.order <- hclust(dist(t(dat)))$order
 # re-order matrix according to clustering
 dat_new <- dat[row.order, col.order]

 # reshape into dataframe
 # as.table() + as.data.frame() turns the matrix into long form with the
 # clustered row/column order kept as factor levels. This avoids calling
 # data.table's melt() on a matrix, which is meant for data.tables and only
 # handled matrices by redirecting to reshape2.
 cluster_matrix <- as.data.frame(as.table(as.matrix(dat_new)))
 names(cluster_matrix) <- c("Gram", "Candidate","Freq")


 # Plotting ----------------------------------------------------------------

 # Tile plot of Freq by Candidate (x) and Gram (y); each tile is coloured by
 # Freq and labelled with it in white
 ggplot(cluster_matrix,
        aes(x = Candidate, y = Gram, fill = Freq, label = Freq)) +
   geom_tile() +
   scale_fill_viridis_c() +
   geom_text(size = 2, color = "#FFFFFF") +
   theme_fivethirtyeight() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
         legend.position = "none",
         text = element_text(size = 9)) +
   labs(caption = "The Third Democratic Debate @appupio",
        subtitle = "Top 80 Bigrams Selected by Cumulative Use, Stop Words Removed",
        title = "Most Used Bigrams")

 # No plot object is passed, so ggsave() falls back to its default plot
 ggsave(filename = "bigram.png", width = 6, height = 12)
	library(tidyverse)
	library(rvest)
	library(data.table)
	library(quanteda)
	library(magrittr)
	library(ggthemes)

	# Address of the Washington Post page holding the debate transcript
	url <- "https://www.washingtonpost.com/" %>%
	  paste0("politics/2019/09/13/transcript-third-democratic-debate/")

	# Download the page and keep the text of every <p> that is a direct child
	# of <article>
	transcript <- url %>%
	  read_html() %>%
	  html_nodes("article>p") %>%
	  html_text()

	# At a glance (head() does not pad with NA when fewer than five paragraphs
	# were scraped)
	head(transcript, 5)

	# Find where the first instance of an author occurs, i.e. the first
	# paragraph containing an upper-case tag followed by a colon ("NAME:")
	start <- which(str_detect(transcript, "^.*?[A-Z]+:"))[1]

	# Without any speaker tag `start` is NA and the subsetting below would fail
	# with an unhelpful "NA/NaN argument" error, so stop with a clear message
	if (is.na(start)) {
	  stop("No speaker tag found in the scraped transcript; ",
	       "the page layout may have changed.", call. = FALSE)
	}

	# Drop the preamble
	transcript <- transcript[start:length(transcript)]

	# Split every paragraph into a speaker tag and the remaining text in one
	# vectorised pass instead of rbind()-ing a matrix row by row (which copies
	# the whole matrix on every iteration).
	#   author: the first "NAME:" tag found in the paragraph, colon stripped;
	#           NA when the paragraph carries no tag
	#   text:   the paragraph with everything up to and including the tag removed
	# fill() then carries the last seen speaker forward onto untagged paragraphs,
	# which is what the original loop did through its `author` variable.
	df <- data.frame(paragraph = transcript, stringsAsFactors = FALSE) %>%
	  transmute(
	    author = paragraph %>%
	      str_extract("[A-Z]+:") %>%
	      trimws(whitespace = ":"),
	    text = str_replace(paragraph, "^.*?[A-Z]+:", "")
	  ) %>%
	  fill(author, .direction = "down")

	# Who got called on: number of paragraphs attributed to each speaker,
	# largest count first
	df %>%
	  as_tibble() %>%
	  count(author, sort = TRUE)

	# Who had the most applause: only paragraphs whose text is exactly
	# "(APPLAUSE)" are counted, per speaker, largest count first
	applause <- df %>%
	  as_tibble() %>%
	  filter(text == "(APPLAUSE)") %>%
	  count(author, sort = TRUE)


	# Bar chart of the applause counts; the factor levels are the reversed
	# row order of `applause`, and coord_flip() turns the bars horizontal
	ggplot(applause, aes(x = factor(author, levels = rev(applause$author)), y = n)) +
	  geom_col() +
	  coord_flip() +
	  theme_fivethirtyeight() +
	  theme(text = element_text(size = 9),
	        legend.position = "none") +
	  labs(caption = "The Third Democratic Debate @appupio",
	       subtitle = "Based on (APPLAUSE) Frequency in Transcript",
	       title = "Applause at the Debate")

	# Bigrams
	# Builds one frequency table per speaker. The list is named by speaker up
	# front, so each result (or NULL) lands under the speaker's name.
	bigrams <- lapply(setNames(nm = unique(df$author)), function(candidate) {
	  paragraphs <- df %>% filter(author == candidate) %>% .[["text"]]
	  counts <- lapply(paragraphs, function(paragraph) {
	    # Upper-cased bigrams of a single paragraph after removing "...",
	    # numbers, punctuation and English stop words; unique() means a bigram
	    # is counted at most once per paragraph
	    paragraph %>%
	      str_remove_all('\\.\\.\\.') %>%
	      tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
	      tokens_select(pattern = stopwords('en'), selection = 'remove') %>%
	      tokens_ngrams(n = 2) %>%
	      toupper %>%
	      unique
	  }) %>% unlist %>% table %>% data.frame
	  # Piping into table() is deliberate: it names the label column "."
	  if (nrow(counts) == 0) {
	    return(NULL)
	  }
	  counts$author <- candidate
	  counts
	})
	# Ten most frequent bigrams for WARREN (ties kept), ascending by count
	arrange(top_n(bigrams$WARREN, 10, wt = Freq), Freq)

	# Bigram Table ------------------------------------------------------------

	# Stack the per-speaker tables into one data frame; the column named "."
	# is renamed to Gram
	bigram_table <- select(bind_rows(bigrams), Gram = '.', Freq, author)

	# All bigrams as a plain character vector, ranked by their summed Freq
	# across speakers, highest first
	top_grams <- bigram_table %>%
	  group_by(Gram) %>%
	  summarise(Freq = sum(Freq)) %>%
	  { as.character(.$Gram[rev(order(.$Freq))]) }
	top_grams[seq_len(10)]


	# Clustering --------------------------------------------------------------

	# Wide table: one row per top-80 bigram, one column per speaker, and 0 where
	# a speaker has no count for the bigram. pivot_wider() replaces the
	# superseded spread(); values_fill supplies the zeros directly, so no
	# blanket NA overwrite of the whole tibble is needed afterwards.
	cluster_matrix <- bigram_table %>%
	  filter(Gram %in% head(top_grams, 80)) %>%
	  pivot_wider(names_from = author, values_from = Freq,
	              values_fill = list(Freq = 0L), names_sort = TRUE) %>%
	  arrange(Gram) # deterministic row order going into the clustering
	# numerical columns
	dat <- cluster_matrix %>% select(-Gram) %>% as.data.frame
	row.names(dat) <- as.character(cluster_matrix$Gram)
	# clustering
	row.order <- hclust(dist(dat))$order
	col.order <- hclust(dist(t(dat)))$order
	# re-order matrix according to clustering
	dat_new <- dat[row.order, col.order]

	# reshape into dataframe
	# as.table() + as.data.frame() turns the matrix into long form with the
	# clustered row/column order kept as factor levels. This avoids calling
	# data.table's melt() on a matrix, which is meant for data.tables and only
	# handled matrices by redirecting to reshape2.
	cluster_matrix <- as.data.frame(as.table(as.matrix(dat_new)))
	names(cluster_matrix) <- c("Gram", "Candidate","Freq")


	# Plotting ----------------------------------------------------------------

	# Tile plot of Freq by Candidate (x) and Gram (y); each tile is coloured by
	# Freq and labelled with it in white
	ggplot(cluster_matrix,
	       aes(x = Candidate, y = Gram, fill = Freq, label = Freq)) +
	  geom_tile() +
	  scale_fill_viridis_c() +
	  geom_text(size = 2, color = "#FFFFFF") +
	  theme_fivethirtyeight() +
	  theme(axis.text.x = element_text(angle = 45, hjust = 1),
	        legend.position = "none",
	        text = element_text(size = 9)) +
	  labs(caption = "The Third Democratic Debate @appupio",
	       subtitle = "Top 80 Bigrams Selected by Cumulative Use, Stop Words Removed",
	       title = "Most Used Bigrams")

	# No plot object is passed, so ggsave() falls back to its default plot
	ggsave(filename = "bigram.png", width = 6, height = 12)