aurora-mareviv · April 27, 2020 14:25
diff --git a/gistfile1.txt b/gistfile1.txt
 library(tidyverse)
 library(tidytext)
 library(wordcloud)
 library(reshape2)
 library(tidygraph)
 library(ggraph)

 # Load the GDPR text table from the TidyTuesday 2020-04-21 data set.
 gdpr_text <- readr::read_tsv(
   "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv"
 )

 # Sentiment comparison cloud, based on
 # https://www.tidytextmining.com/sentiment.html
 # Steps: split the `gdpr_text` column into one word per row, keep only the
 # words found in the Bing sentiment lexicon, count every (word, sentiment)
 # pair, reshape the counts into a word-by-sentiment matrix and draw it.
 gdpr_text %>%
   unnest_tokens(word, gdpr_text) %>%
   # Name the join key explicitly instead of relying on an implicit natural
   # join over whatever column names the two tables happen to share.
   inner_join(get_sentiments("bing"), by = "word") %>%
   count(word, sentiment, sort = TRUE) %>%
   acast(word ~ sentiment, value.var = "n", fill = 0) %>%
   comparison.cloud(
     colors = c("#F8766D", "#7CAE00"),
     max.words = 100,
     match.colors = TRUE
   )



 # Load the GDPR violations table from the TidyTuesday 2020-04-21 data set.
 gdpr_violations <- readr::read_tsv(
   "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv"
 )

 # Weighted edge list of articles that are cited together in one violation:
 # one row per unordered article pair, `weight` = number of violations in
 # which that pair occurs.
 edge_list <- gdpr_violations %>%
   # Article extraction is from https://juliasilge.com/blog/gdpr-violations/
   transmute(
     id,
     articles = str_extract_all(article_violated, "Art.[:digit:]+|Art. [:digit:]+")
   ) %>%
   unnest(articles) %>%
   # Keep each article once per violation; otherwise a violation that cites
   # the same article several times would pair the article with itself.
   distinct(id, articles) %>%
   # Pair generation follows
   # https://stackoverflow.com/questions/34670145/generating-an-edge-list-from-id-and-grouping-vectors
   group_by(id) %>%
   filter(n() >= 2) %>%
   # Sorting gives every unordered pair a single (from, to) orientation, so
   # A-B and B-A are accumulated into the same edge instead of two.
   do(data.frame(t(combn(sort(.$articles), 2)), stringsAsFactors = FALSE)) %>%
   ungroup() %>%
   select(-id) %>%
   rename(from = X1, to = X2) %>%
   # Edge weights as in https://www.jessesadler.com/post/network-analysis-with-r/
   # count() returns an ungrouped table, so no trailing ungroup() is needed.
   count(from, to, name = "weight")

 # Create graph using tidygraph
 # The graph is built as undirected from the start: tidygraph's
 # to_undirected() is a morpher intended for morph()/convert() and is not a
 # pipe step that returns a tbl_graph.
 graph <- as_tbl_graph(edge_list, directed = FALSE) %>%
   activate(nodes) %>%
   mutate(
     # Authority score of each article node.
     centrality = centrality_authority(),
     # Community membership from edge-betweenness clustering.
     group = as.factor(group_edge_betweenness())
   ) %>%
   group_by(group) %>%
   # Within each community, record the name of the node that comes last when
   # ordered by centrality (i.e. the highest-scoring one).
   mutate(name_first = last(name, order_by = centrality)) %>%
   ungroup()


 # Plot the network using ggraph: nodes laid out on a circle, edges drawn as
 # arcs whose opacity and width follow the edge weight, labels coloured by
 # community.
 network_plot <- graph %>%
   ggraph(layout = "linear", circular = TRUE) +
   geom_edge_arc(aes(alpha = weight, width = weight), show.legend = FALSE) +
   geom_node_label(aes(label = name, colour = group)) +
   theme_graph() +
   # "none" is the supported way to drop a legend; passing FALSE to guides()
   # is deprecated in current ggplot2.
   guides(colour = "none", size = "none") +
   labs(
     title = str_wrap(
       "Network of GDPR articles that co-occured in the same violations",
       width = 40
     )
   )

 # Display the plot, as the original top-level expression did.
 network_plot

 # Pass the plot explicitly so the saved file does not depend on whichever
 # plot happens to be the session's last_plot().
 ggsave(
   "gdpr_network.png",
   plot = network_plot,
   scale = 2,
   width = 90,
   height = 90,
   units = "mm",
   dpi = 300
 )
       
       # Wide 0/1 indicator table for the UpSet plot: one row per violation id,
       # one column per extracted article, 1 when the violation cites it.
       cooccurrence_df <- gdpr_violations %>%
         # Article extraction is from https://juliasilge.com/blog/gdpr-violations/
         transmute(
           id,
           articles = str_extract_all(article_violated, "Art.[:digit:]+|Art. [:digit:]+")
         ) %>%
         unnest(articles) %>%
         mutate(value = 1) %>%
         # An article cited twice in one violation must yield a single cell.
         distinct() %>%
         pivot_wider(
           names_from = articles,
           values_from = value,
           values_fill = list(value = 0)
         ) %>%
         # Converted to a plain data.frame before being handed to upset().
         as.data.frame()

       # UpSetR is never attached with library(), so call upset() through its
       # namespace; intersections are ordered by frequency.
       UpSetR::upset(cooccurrence_df, order.by = "freq")
	library(tidyverse)
	library(tidytext)
	library(wordcloud)
	library(reshape2)
	library(tidygraph)
	library(ggraph)

	# Load the GDPR text table from the TidyTuesday 2020-04-21 data set.
	gdpr_text <- readr::read_tsv(
	  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv"
	)

	# Sentiment comparison cloud, based on
	# https://www.tidytextmining.com/sentiment.html
	# Steps: split the `gdpr_text` column into one word per row, keep only the
	# words found in the Bing sentiment lexicon, count every (word, sentiment)
	# pair, reshape the counts into a word-by-sentiment matrix and draw it.
	gdpr_text %>%
	  unnest_tokens(word, gdpr_text) %>%
	  # Name the join key explicitly instead of relying on an implicit natural
	  # join over whatever column names the two tables happen to share.
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE) %>%
	  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
	  comparison.cloud(
	    colors = c("#F8766D", "#7CAE00"),
	    max.words = 100,
	    match.colors = TRUE
	  )



	# Load the GDPR violations table from the TidyTuesday 2020-04-21 data set.
	gdpr_violations <- readr::read_tsv(
	  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv"
	)

	# Weighted edge list of articles that are cited together in one violation:
	# one row per unordered article pair, `weight` = number of violations in
	# which that pair occurs.
	edge_list <- gdpr_violations %>%
	  # Article extraction is from https://juliasilge.com/blog/gdpr-violations/
	  # The alternation is a plain `|`: a backslash before it is an
	  # unrecognised escape in an R string and stops the script from parsing.
	  transmute(
	    id,
	    articles = str_extract_all(article_violated, "Art.[:digit:]+|Art. [:digit:]+")
	  ) %>%
	  unnest(articles) %>%
	  # Keep each article once per violation; otherwise a violation that cites
	  # the same article several times would pair the article with itself.
	  distinct(id, articles) %>%
	  # Pair generation follows
	  # https://stackoverflow.com/questions/34670145/generating-an-edge-list-from-id-and-grouping-vectors
	  group_by(id) %>%
	  filter(n() >= 2) %>%
	  # Sorting gives every unordered pair a single (from, to) orientation, so
	  # A-B and B-A are accumulated into the same edge instead of two.
	  do(data.frame(t(combn(sort(.$articles), 2)), stringsAsFactors = FALSE)) %>%
	  ungroup() %>%
	  select(-id) %>%
	  rename(from = X1, to = X2) %>%
	  # Edge weights as in https://www.jessesadler.com/post/network-analysis-with-r/
	  # count() returns an ungrouped table, so no trailing ungroup() is needed.
	  count(from, to, name = "weight")

	# Create graph using tidygraph
	# The graph is built as undirected from the start: tidygraph's
	# to_undirected() is a morpher intended for morph()/convert() and is not a
	# pipe step that returns a tbl_graph.
	graph <- as_tbl_graph(edge_list, directed = FALSE) %>%
	  activate(nodes) %>%
	  mutate(
	    # Authority score of each article node.
	    centrality = centrality_authority(),
	    # Community membership from edge-betweenness clustering.
	    group = as.factor(group_edge_betweenness())
	  ) %>%
	  group_by(group) %>%
	  # Within each community, record the name of the node that comes last when
	  # ordered by centrality (i.e. the highest-scoring one).
	  mutate(name_first = last(name, order_by = centrality)) %>%
	  ungroup()


	# Plot the network using ggraph: nodes laid out on a circle, edges drawn as
	# arcs whose opacity and width follow the edge weight, labels coloured by
	# community.
	network_plot <- graph %>%
	  ggraph(layout = "linear", circular = TRUE) +
	  geom_edge_arc(aes(alpha = weight, width = weight), show.legend = FALSE) +
	  geom_node_label(aes(label = name, colour = group)) +
	  theme_graph() +
	  # "none" is the supported way to drop a legend; passing FALSE to guides()
	  # is deprecated in current ggplot2.
	  guides(colour = "none", size = "none") +
	  labs(
	    title = str_wrap(
	      "Network of GDPR articles that co-occured in the same violations",
	      width = 40
	    )
	  )

	# Display the plot, as the original top-level expression did.
	network_plot

	# Pass the plot explicitly so the saved file does not depend on whichever
	# plot happens to be the session's last_plot().
	ggsave(
	  "gdpr_network.png",
	  plot = network_plot,
	  scale = 2,
	  width = 90,
	  height = 90,
	  units = "mm",
	  dpi = 300
	)

	# Wide 0/1 indicator table for the UpSet plot: one row per violation id,
	# one column per extracted article, 1 when the violation cites it.
	cooccurrence_df <- gdpr_violations %>%
	  # Article extraction is from https://juliasilge.com/blog/gdpr-violations/
	  # The alternation is a plain `|`: a backslash before it is an
	  # unrecognised escape in an R string and stops the script from parsing.
	  transmute(
	    id,
	    articles = str_extract_all(article_violated, "Art.[:digit:]+|Art. [:digit:]+")
	  ) %>%
	  unnest(articles) %>%
	  mutate(value = 1) %>%
	  # An article cited twice in one violation must yield a single cell.
	  distinct() %>%
	  pivot_wider(
	    names_from = articles,
	    values_from = value,
	    values_fill = list(value = 0)
	  ) %>%
	  # Converted to a plain data.frame before being handed to upset().
	  as.data.frame()

	# UpSetR is never attached with library(), so call upset() through its
	# namespace; intersections are ordered by frequency.
	UpSetR::upset(cooccurrence_df, order.by = "freq")
No results found