tillgrallert · October 1, 2021 15:00
diff --git a/wordclouds.r b/wordclouds.r
 library(tidyverse)
 library(lubridate) # for working with dates
 library(ggrepel)
 library(ggthemes) # install themes for ggplots
 library(ggwordcloud)
 library(here) # for easier locating of all files
 library(ragg) # for rendering Arabic
 library(RColorBrewer)
 library(wordcloud)
 library(arabicStemR)
 library(tm)
 # enable unicode
 Sys.setlocale("LC_ALL", "en_US.UTF-8")
 # set a general theme for all ggplots
 theme_set(theme_bw())

 # function to generate a frequency list
 f.frequency.list <- function(input) {
  docs <- Corpus(VectorSource(input))
  # build a term-document matrix to get a frequency list
  dtm <- TermDocumentMatrix(docs)
  m <- as.matrix(dtm)
  v <- sort(rowSums(m),decreasing=TRUE)
  output <- data.frame(word = names(v),freq=v)
  output
 }
 # variables for labels
 v.label.source = 'Data source: '
 v.label.license = 'Licence information'
 # fonts: select those available on your system
 font = "Baskerville"
 font.Arab = "Amiri" # for Arabic
 # font sizes
 size.Base.Px = 9
 ## font sizes are measured in mm
 size.Base.Mm = (5/14) * size.Base.Px
 # specify text sizes
 size.Title = 2
 size.Subtitle = 1.5
 size.Text = 1
 size.Title.Mm = size.Title * size.Base.Mm
 size.Subtitle.Mm = size.Subtitle * size.Base.Mm
 size.Text.Mm = size.Text * size.Base.Mm

 # funtion for generating wordclouds based on the frequency lists
 f.wordcloud.frequency <- function(input, max.values, label.text) {
  # process data: frequency list
  data.frequency <- f.frequency.list(input) %>%
    slice(1:max.values) %>% # limit the length of the data set
    # add some 90 degree angles to 20 % of all words
    dplyr::mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(80, 20))) 
    #dplyr::mutate(angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 1, 4, 1, 1)))
  # plot
  plot.base <- ggplot(data.frequency, aes(x = 1, y = 1, size = freq, label = word, colour = freq)) +
    scale_y_continuous(breaks = NULL) +
    scale_x_continuous(breaks = NULL)
  # labs
  layer.labs <- labs(x = "", y = "", 
                     title = paste("Wordcloud of the", max.values, "most frequent words in", label.text, sep = " "),
                     subtitle = v.label.source,
                     caption = v.label.license)
  layer.text.repel <- c(
    geom_text_repel(segment.size = 0, force = 10, max.overlaps = 500, family = font.Arab),
    scale_size(range = c(2, 22), guide = FALSE))
  layer.text.wordcloud <- c(
    geom_text_wordcloud(aes(angle = angle), # use the angle information
                        family = font.Arab, 
                        area_corr = F, # for frequency corresponding to area
                        eccentricity = 1, # to form a circle
                        rm_outside = TRUE, # if there are too many words, the smallest ones should be removed if they cannot fit onto the canvas
                        grid_margin = 0.5, seed = 43,
                        show.legend = T),
    scale_size_area(max_size = 20)
    #scale_radius(range = c(0, 30), limits = c(0, NA))
  )
  
  plot.base.final <- plot.base +
    layer.labs +
    scale_color_gradient(low = "darkgreen", high = "red") +
    guides(color = guide_colorbar("Frequency", order = 1),
           size = "none")+ #guide_legend("Frequency", order = 2)) +
    theme(
      text = element_text(family = font, face = "plain"),
      plot.title = element_text(size = size.Title.Mm),
      plot.subtitle = element_text(size = size.Subtitle.Mm),
      plot.caption = element_text(size = size.Text.Mm),
      legend.position = "bottom",
      panel.border = element_blank()) 
  plot.repel <- plot.base +
    layer.text.repel
  plot.wordcloud <- plot.base.final +
    layer.text.wordcloud
  #plot.repel
  plot.wordcloud
  # save output: with the latest update of ggplot2, ragg is not needed anymore and Arabic is correctly printed
  ggsave(plot = plot.wordcloud, filename = paste("wordcloud_", label.text,".png", sep = ""), units = "mm" , height = height.Plot, width = width.Plot, dpi = dpi.Plot)
 }

 # function for producing wordclouds for specific periods in a dataset with the above functions 
 f.wordcloud.period <- function(dataframe.input, onset, period, column, max.values, label.text) {
  terminus = onset + period - 1
  data.slice <- dataframe.input %>%
    dplyr::filter(year >= onset, year <= terminus) %>% # filter for period
    dplyr::select(column) %>% # select only one column
    dplyr::rename(text = 1) # rename the single selected column
  label = paste(label.text, " (", onset, "-", terminus, ")", sep = "")
 # call wordcloud function
  f.wordcloud.frequency(data.slice$text, max.values, label)
 }
	library(tidyverse)
	library(lubridate) # for working with dates
	library(ggrepel)
	library(ggthemes) # install themes for ggplots
	library(ggwordcloud)
	library(here) # for easier locating of all files
	library(ragg) # for rendering Arabic
	library(RColorBrewer)
	library(wordcloud)
	library(arabicStemR)
	library(tm)
	# enable unicode
	Sys.setlocale("LC_ALL", "en_US.UTF-8")
	# set a general theme for all ggplots
	theme_set(theme_bw())

	# function to generate a frequency list
	f.frequency.list <- function(input) {
	docs <- Corpus(VectorSource(input))
	# build a term-document matrix to get a frequency list
	dtm <- TermDocumentMatrix(docs)
	m <- as.matrix(dtm)
	v <- sort(rowSums(m),decreasing=TRUE)
	output <- data.frame(word = names(v),freq=v)
	output
	}
	# variables for labels
	v.label.source = 'Data source: '
	v.label.license = 'Licence information'
	# fonts: select those available on your system
	font = "Baskerville"
	font.Arab = "Amiri" # for Arabic
	# font sizes
	size.Base.Px = 9
	## font sizes are measured in mm
	size.Base.Mm = (5/14) * size.Base.Px
	# specify text sizes
	size.Title = 2
	size.Subtitle = 1.5
	size.Text = 1
	size.Title.Mm = size.Title * size.Base.Mm
	size.Subtitle.Mm = size.Subtitle * size.Base.Mm
	size.Text.Mm = size.Text * size.Base.Mm

	# funtion for generating wordclouds based on the frequency lists
	f.wordcloud.frequency <- function(input, max.values, label.text) {
	# process data: frequency list
	data.frequency <- f.frequency.list(input) %>%
	slice(1:max.values) %>% # limit the length of the data set
	# add some 90 degree angles to 20 % of all words
	dplyr::mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(80, 20)))
	#dplyr::mutate(angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 1, 4, 1, 1)))
	# plot
	plot.base <- ggplot(data.frequency, aes(x = 1, y = 1, size = freq, label = word, colour = freq)) +
	scale_y_continuous(breaks = NULL) +
	scale_x_continuous(breaks = NULL)
	# labs
	layer.labs <- labs(x = "", y = "",
	title = paste("Wordcloud of the", max.values, "most frequent words in", label.text, sep = " "),
	subtitle = v.label.source,
	caption = v.label.license)
	layer.text.repel <- c(
	geom_text_repel(segment.size = 0, force = 10, max.overlaps = 500, family = font.Arab),
	scale_size(range = c(2, 22), guide = FALSE))
	layer.text.wordcloud <- c(
	geom_text_wordcloud(aes(angle = angle), # use the angle information
	family = font.Arab,
	area_corr = F, # for frequency corresponding to area
	eccentricity = 1, # to form a circle
	rm_outside = TRUE, # if there are too many words, the smallest ones should be removed if they cannot fit onto the canvas
	grid_margin = 0.5, seed = 43,
	show.legend = T),
	scale_size_area(max_size = 20)
	#scale_radius(range = c(0, 30), limits = c(0, NA))
	)

	plot.base.final <- plot.base +
	layer.labs +
	scale_color_gradient(low = "darkgreen", high = "red") +
	guides(color = guide_colorbar("Frequency", order = 1),
	size = "none")+ #guide_legend("Frequency", order = 2)) +
	theme(
	text = element_text(family = font, face = "plain"),
	plot.title = element_text(size = size.Title.Mm),
	plot.subtitle = element_text(size = size.Subtitle.Mm),
	plot.caption = element_text(size = size.Text.Mm),
	legend.position = "bottom",
	panel.border = element_blank())
	plot.repel <- plot.base +
	layer.text.repel
	plot.wordcloud <- plot.base.final +
	layer.text.wordcloud
	#plot.repel
	plot.wordcloud
	# save output: with the latest update of ggplot2, ragg is not needed anymore and Arabic is correctly printed
	ggsave(plot = plot.wordcloud, filename = paste("wordcloud_", label.text,".png", sep = ""), units = "mm" , height = height.Plot, width = width.Plot, dpi = dpi.Plot)
	}

	# function for producing wordclouds for specific periods in a dataset with the above functions
	f.wordcloud.period <- function(dataframe.input, onset, period, column, max.values, label.text) {
	terminus = onset + period - 1
	data.slice <- dataframe.input %>%
	dplyr::filter(year >= onset, year <= terminus) %>% # filter for period
	dplyr::select(column) %>% # select only one column
	dplyr::rename(text = 1) # rename the single selected column
	label = paste(label.text, " (", onset, "-", terminus, ")", sep = "")
	# call wordcloud function
	f.wordcloud.frequency(data.slice$text, max.values, label)
	}