ofchurches · April 30, 2020 00:41
diff --git a/gistfile1.txt b/gistfile1.txt
 library(tidyverse)
 library(tidygraph)
 library(ggraph)

 grosses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/grosses.csv', guess_max = 40000)

 # links between theatres that have put on the same show
 # create the edge list
 edge_list <- grosses %>%
  select(show, theatre) %>%
  distinct() %>%
  # here the steps to getting an edge list are from https://stackoverflow.com/questions/34670145/generating-an-edge-list-from-id-and-grouping-vectors
  group_by(show) %>%
  filter(n() >= 2) %>%
  do(data.frame(t(combn(.$theatre, 2)), stringsAsFactors = FALSE)) %>%
  ungroup() %>%
  select(- show) %>%
  rename(from = X1, to = X2) %>%
  # here the steps to getting the edge weight are from: https://www.jessesadler.com/post/network-analysis-with-r/
  group_by(from, to) %>%
  summarise(weight = n()) %>% 
  ungroup() %>%
  filter(weight > 1)

 # Create graph using tidygraph
 graph <- as_tbl_graph(edge_list) %>%
  to_undirected() %>%
  activate(nodes) %>%
  mutate(centrality = centrality_authority()) %>%
  mutate(group = as.factor(group_edge_betweenness())) %>%
  group_by(group) %>%
  mutate(name_first = last(name, order_by = centrality)) %>%
  ungroup()


 # plot network using ggraph
 graph %>%
  ggraph(layout = 'linear') + 
  geom_edge_arc(aes(width = weight), 
                show.legend = FALSE, 
                colour = "grey") + 
  geom_node_text(aes(label = str_wrap(name, width = 30), 
                     colour = group), 
                 angle = 90, 
                 position = position_nudge(y = -.3), 
                 hjust = 1) +
  geom_node_point(aes(colour = group), 
                  size = 6) + 
  ylim(-4, 5) + 
  theme(plot.background = element_rect(fill = "white"), 
        panel.background = element_rect(fill = "white"), 
        title = element_text(colour = "black", 
                             size = 16)) + 
  guides(colour = FALSE, size = FALSE) + 
  labs(title = str_wrap("Network of Broadway theaters that have put on the same shows", 
                        width = 60))

 ggsave("broadway_network.png",
       scale = 2,
       width = 90,
       height = 90,
       units = "mm",
       dpi = 300)

 #look at the synopses
 library(tidytext)
 library(ggrepel)

 synopses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/synopses.csv')

 run_length <- grosses %>%
  count(show) %>%
  rename(run_weeks = n)

 opening_week <- grosses %>%
  group_by(show) %>%
  mutate(opening = first(week_ending)) %>%
  ungroup() %>%
  select(show, opening) %>%
  distinct()

 synopses %>%
  unnest_tokens(word, synopsis) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(show) %>%
  filter(n() > 5) %>%
  summarise(average_sentiment = mean(value)) %>%
  ungroup() %>%
  filter(average_sentiment > 2.5 | average_sentiment < -.8) %>%
  ggplot(aes(x = reorder(show, average_sentiment), y = average_sentiment, fill = average_sentiment > 0)) + 
  geom_col(show.legend = FALSE) + 
  coord_flip() + 
  labs(title = "The most positive and negative play synopses on Braodway", 
       y = "Average sentiment of the synopsis", 
       x = "") + 
  theme_minimal()

 synopses %>%
  unnest_tokens(word, synopsis) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(show) %>%
  filter(n() > 2) %>%
  summarise(average_sentiment = mean(value)) %>%
  ungroup() %>%
  inner_join(run_length) %>%
  inner_join(opening_week) %>%
  ggplot(aes(y = run_weeks, x = average_sentiment, label = show)) + 
  geom_jitter() + 
  geom_label_repel(aes(label = ifelse(run_weeks > 250, show, NA)))+ 
  labs(title = "Average synopsis sentiment by run length of musicals on Broadway", 
       x = "Average sentiment",
       y = "Run length (weeks)") + 
  theme_minimal()

 ggsave("broadway_sentiment_runs.png",
       scale = 2,
       width = 90,
       height = 90,
       units = "mm",
       dpi = 300)
	library(tidyverse)
	library(tidygraph)
	library(ggraph)

	grosses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/grosses.csv', guess_max = 40000)

	# links between theatres that have put on the same show
	# create the edge list
	edge_list <- grosses %>%
	select(show, theatre) %>%
	distinct() %>%
	# here the steps to getting an edge list are from https://stackoverflow.com/questions/34670145/generating-an-edge-list-from-id-and-grouping-vectors
	group_by(show) %>%
	filter(n() >= 2) %>%
	do(data.frame(t(combn(.$theatre, 2)), stringsAsFactors = FALSE)) %>%
	ungroup() %>%
	select(- show) %>%
	rename(from = X1, to = X2) %>%
	# here the steps to getting the edge weight are from: https://www.jessesadler.com/post/network-analysis-with-r/
	group_by(from, to) %>%
	summarise(weight = n()) %>%
	ungroup() %>%
	filter(weight > 1)

	# Create graph using tidygraph
	graph <- as_tbl_graph(edge_list) %>%
	to_undirected() %>%
	activate(nodes) %>%
	mutate(centrality = centrality_authority()) %>%
	mutate(group = as.factor(group_edge_betweenness())) %>%
	group_by(group) %>%
	mutate(name_first = last(name, order_by = centrality)) %>%
	ungroup()


	# plot network using ggraph
	graph %>%
	ggraph(layout = 'linear') +
	geom_edge_arc(aes(width = weight),
	show.legend = FALSE,
	colour = "grey") +
	geom_node_text(aes(label = str_wrap(name, width = 30),
	colour = group),
	angle = 90,
	position = position_nudge(y = -.3),
	hjust = 1) +
	geom_node_point(aes(colour = group),
	size = 6) +
	ylim(-4, 5) +
	theme(plot.background = element_rect(fill = "white"),
	panel.background = element_rect(fill = "white"),
	title = element_text(colour = "black",
	size = 16)) +
	guides(colour = FALSE, size = FALSE) +
	labs(title = str_wrap("Network of Broadway theaters that have put on the same shows",
	width = 60))

	ggsave("broadway_network.png",
	scale = 2,
	width = 90,
	height = 90,
	units = "mm",
	dpi = 300)

	#look at the synopses
	library(tidytext)
	library(ggrepel)

	synopses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/synopses.csv')

	run_length <- grosses %>%
	count(show) %>%
	rename(run_weeks = n)

	opening_week <- grosses %>%
	group_by(show) %>%
	mutate(opening = first(week_ending)) %>%
	ungroup() %>%
	select(show, opening) %>%
	distinct()

	synopses %>%
	unnest_tokens(word, synopsis) %>%
	inner_join(get_sentiments("afinn")) %>%
	group_by(show) %>%
	filter(n() > 5) %>%
	summarise(average_sentiment = mean(value)) %>%
	ungroup() %>%
	filter(average_sentiment > 2.5 \| average_sentiment < -.8) %>%
	ggplot(aes(x = reorder(show, average_sentiment), y = average_sentiment, fill = average_sentiment > 0)) +
	geom_col(show.legend = FALSE) +
	coord_flip() +
	labs(title = "The most positive and negative play synopses on Braodway",
	y = "Average sentiment of the synopsis",
	x = "") +
	theme_minimal()

	synopses %>%
	unnest_tokens(word, synopsis) %>%
	inner_join(get_sentiments("afinn")) %>%
	group_by(show) %>%
	filter(n() > 2) %>%
	summarise(average_sentiment = mean(value)) %>%
	ungroup() %>%
	inner_join(run_length) %>%
	inner_join(opening_week) %>%
	ggplot(aes(y = run_weeks, x = average_sentiment, label = show)) +
	geom_jitter() +
	geom_label_repel(aes(label = ifelse(run_weeks > 250, show, NA)))+
	labs(title = "Average synopsis sentiment by run length of musicals on Broadway",
	x = "Average sentiment",
	y = "Run length (weeks)") +
	theme_minimal()

	ggsave("broadway_sentiment_runs.png",
	scale = 2,
	width = 90,
	height = 90,
	units = "mm",
	dpi = 300)