Skip to content

Instantly share code, notes, and snippets.

@tillgrallert
Last active October 1, 2021 15:00
Show Gist options
  • Save tillgrallert/9413bdf80eccc167f8c1269e6564837f to your computer and use it in GitHub Desktop.
Save tillgrallert/9413bdf80eccc167f8c1269e6564837f to your computer and use it in GitHub Desktop.
Wordclouds for frequency lists in R (with ggplot2)
library(tidyverse)
library(lubridate) # for working with dates
library(ggrepel)
library(ggthemes) # install themes for ggplots
library(ggwordcloud)
library(here) # for easier locating of all files
library(ragg) # for rendering Arabic
library(RColorBrewer)
library(wordcloud)
library(arabicStemR)
library(tm)
# enable unicode
Sys.setlocale("LC_ALL", "en_US.UTF-8")
# set a general theme for all ggplots
theme_set(theme_bw())
# function to generate a frequency list
f.frequency.list <- function(input) {
docs <- Corpus(VectorSource(input))
# build a term-document matrix to get a frequency list
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
output <- data.frame(word = names(v),freq=v)
output
}
# variables for labels
v.label.source = 'Data source: '
v.label.license = 'Licence information'
# fonts: select those available on your system
font = "Baskerville"
font.Arab = "Amiri" # for Arabic
# font sizes
size.Base.Px = 9
## font sizes are measured in mm
size.Base.Mm = (5/14) * size.Base.Px
# specify text sizes
size.Title = 2
size.Subtitle = 1.5
size.Text = 1
size.Title.Mm = size.Title * size.Base.Mm
size.Subtitle.Mm = size.Subtitle * size.Base.Mm
size.Text.Mm = size.Text * size.Base.Mm
# funtion for generating wordclouds based on the frequency lists
f.wordcloud.frequency <- function(input, max.values, label.text) {
# process data: frequency list
data.frequency <- f.frequency.list(input) %>%
slice(1:max.values) %>% # limit the length of the data set
# add some 90 degree angles to 20 % of all words
dplyr::mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(80, 20)))
#dplyr::mutate(angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 1, 4, 1, 1)))
# plot
plot.base <- ggplot(data.frequency, aes(x = 1, y = 1, size = freq, label = word, colour = freq)) +
scale_y_continuous(breaks = NULL) +
scale_x_continuous(breaks = NULL)
# labs
layer.labs <- labs(x = "", y = "",
title = paste("Wordcloud of the", max.values, "most frequent words in", label.text, sep = " "),
subtitle = v.label.source,
caption = v.label.license)
layer.text.repel <- c(
geom_text_repel(segment.size = 0, force = 10, max.overlaps = 500, family = font.Arab),
scale_size(range = c(2, 22), guide = FALSE))
layer.text.wordcloud <- c(
geom_text_wordcloud(aes(angle = angle), # use the angle information
family = font.Arab,
area_corr = F, # for frequency corresponding to area
eccentricity = 1, # to form a circle
rm_outside = TRUE, # if there are too many words, the smallest ones should be removed if they cannot fit onto the canvas
grid_margin = 0.5, seed = 43,
show.legend = T),
scale_size_area(max_size = 20)
#scale_radius(range = c(0, 30), limits = c(0, NA))
)
plot.base.final <- plot.base +
layer.labs +
scale_color_gradient(low = "darkgreen", high = "red") +
guides(color = guide_colorbar("Frequency", order = 1),
size = "none")+ #guide_legend("Frequency", order = 2)) +
theme(
text = element_text(family = font, face = "plain"),
plot.title = element_text(size = size.Title.Mm),
plot.subtitle = element_text(size = size.Subtitle.Mm),
plot.caption = element_text(size = size.Text.Mm),
legend.position = "bottom",
panel.border = element_blank())
plot.repel <- plot.base +
layer.text.repel
plot.wordcloud <- plot.base.final +
layer.text.wordcloud
#plot.repel
plot.wordcloud
# save output: with the latest update of ggplot2, ragg is not needed anymore and Arabic is correctly printed
ggsave(plot = plot.wordcloud, filename = paste("wordcloud_", label.text,".png", sep = ""), units = "mm" , height = height.Plot, width = width.Plot, dpi = dpi.Plot)
}
# function for producing wordclouds for specific periods in a dataset with the above functions
f.wordcloud.period <- function(dataframe.input, onset, period, column, max.values, label.text) {
terminus = onset + period - 1
data.slice <- dataframe.input %>%
dplyr::filter(year >= onset, year <= terminus) %>% # filter for period
dplyr::select(column) %>% # select only one column
dplyr::rename(text = 1) # rename the single selected column
label = paste(label.text, " (", onset, "-", terminus, ")", sep = "")
# call wordcloud function
f.wordcloud.frequency(data.slice$text, max.values, label)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment