Last active
October 1, 2021 15:00
-
-
Save tillgrallert/9413bdf80eccc167f8c1269e6564837f to your computer and use it in GitHub Desktop.
Wordclouds for frequency lists in R (with ggplot2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(lubridate) # for working with dates | |
library(ggrepel) | |
library(ggthemes) # install themes for ggplots | |
library(ggwordcloud) | |
library(here) # for easier locating of all files | |
library(ragg) # for rendering Arabic | |
library(RColorBrewer) | |
library(wordcloud) | |
library(arabicStemR) | |
library(tm) | |
# enable unicode | |
Sys.setlocale("LC_ALL", "en_US.UTF-8") | |
# set a general theme for all ggplots | |
theme_set(theme_bw()) | |
# function to generate a frequency list | |
f.frequency.list <- function(input) { | |
docs <- Corpus(VectorSource(input)) | |
# build a term-document matrix to get a frequency list | |
dtm <- TermDocumentMatrix(docs) | |
m <- as.matrix(dtm) | |
v <- sort(rowSums(m),decreasing=TRUE) | |
output <- data.frame(word = names(v),freq=v) | |
output | |
} | |
# variables for labels | |
v.label.source = 'Data source: ' | |
v.label.license = 'Licence information' | |
# fonts: select those available on your system | |
font = "Baskerville" | |
font.Arab = "Amiri" # for Arabic | |
# font sizes | |
size.Base.Px = 9 | |
## font sizes are measured in mm | |
size.Base.Mm = (5/14) * size.Base.Px | |
# specify text sizes | |
size.Title = 2 | |
size.Subtitle = 1.5 | |
size.Text = 1 | |
size.Title.Mm = size.Title * size.Base.Mm | |
size.Subtitle.Mm = size.Subtitle * size.Base.Mm | |
size.Text.Mm = size.Text * size.Base.Mm | |
# funtion for generating wordclouds based on the frequency lists | |
f.wordcloud.frequency <- function(input, max.values, label.text) { | |
# process data: frequency list | |
data.frequency <- f.frequency.list(input) %>% | |
slice(1:max.values) %>% # limit the length of the data set | |
# add some 90 degree angles to 20 % of all words | |
dplyr::mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(80, 20))) | |
#dplyr::mutate(angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 1, 4, 1, 1))) | |
# plot | |
plot.base <- ggplot(data.frequency, aes(x = 1, y = 1, size = freq, label = word, colour = freq)) + | |
scale_y_continuous(breaks = NULL) + | |
scale_x_continuous(breaks = NULL) | |
# labs | |
layer.labs <- labs(x = "", y = "", | |
title = paste("Wordcloud of the", max.values, "most frequent words in", label.text, sep = " "), | |
subtitle = v.label.source, | |
caption = v.label.license) | |
layer.text.repel <- c( | |
geom_text_repel(segment.size = 0, force = 10, max.overlaps = 500, family = font.Arab), | |
scale_size(range = c(2, 22), guide = FALSE)) | |
layer.text.wordcloud <- c( | |
geom_text_wordcloud(aes(angle = angle), # use the angle information | |
family = font.Arab, | |
area_corr = F, # for frequency corresponding to area | |
eccentricity = 1, # to form a circle | |
rm_outside = TRUE, # if there are too many words, the smallest ones should be removed if they cannot fit onto the canvas | |
grid_margin = 0.5, seed = 43, | |
show.legend = T), | |
scale_size_area(max_size = 20) | |
#scale_radius(range = c(0, 30), limits = c(0, NA)) | |
) | |
plot.base.final <- plot.base + | |
layer.labs + | |
scale_color_gradient(low = "darkgreen", high = "red") + | |
guides(color = guide_colorbar("Frequency", order = 1), | |
size = "none")+ #guide_legend("Frequency", order = 2)) + | |
theme( | |
text = element_text(family = font, face = "plain"), | |
plot.title = element_text(size = size.Title.Mm), | |
plot.subtitle = element_text(size = size.Subtitle.Mm), | |
plot.caption = element_text(size = size.Text.Mm), | |
legend.position = "bottom", | |
panel.border = element_blank()) | |
plot.repel <- plot.base + | |
layer.text.repel | |
plot.wordcloud <- plot.base.final + | |
layer.text.wordcloud | |
#plot.repel | |
plot.wordcloud | |
# save output: with the latest update of ggplot2, ragg is not needed anymore and Arabic is correctly printed | |
ggsave(plot = plot.wordcloud, filename = paste("wordcloud_", label.text,".png", sep = ""), units = "mm" , height = height.Plot, width = width.Plot, dpi = dpi.Plot) | |
} | |
# function for producing wordclouds for specific periods in a dataset with the above functions | |
f.wordcloud.period <- function(dataframe.input, onset, period, column, max.values, label.text) { | |
terminus = onset + period - 1 | |
data.slice <- dataframe.input %>% | |
dplyr::filter(year >= onset, year <= terminus) %>% # filter for period | |
dplyr::select(column) %>% # select only one column | |
dplyr::rename(text = 1) # rename the single selected column | |
label = paste(label.text, " (", onset, "-", terminus, ")", sep = "") | |
# call wordcloud function | |
f.wordcloud.frequency(data.slice$text, max.values, label) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment