jakeybob · July 15, 2024 11:30
diff --git a/cloud.R b/cloud.R
 library(tidyverse)
 library(ggwordcloud)
 library(tidytext)

 # https://github.com/lepennec/ggwordcloud
 # https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html

 # Download the plain-text book and normalise its whitespace.
 # str_squish() trims both ends and collapses every internal whitespace run
 # (including \r\n, \n and \r) into a single space, so a separate
 # newline-replacement pass is not needed.
 # NOTE(review): the whole download is used as-is, so any non-book text in the
 # file (e.g. licence boilerplate) is counted as well - confirm that is wanted.
 text <- read_file("https://www.gutenberg.org/cache/epub/84/pg84.txt") |>
   str_squish()

 # Wrap the text in a one-row tibble and split it into tokens, giving one row
 # per word in the `word` column.
 df <- unnest_tokens(
   tibble(text = text),
   output = word,
   input = text
 )

 # Stack every English stopword lexicon plus a few custom words into a single
 # table. A word present in several lexicons is kept once, tagged with the
 # first lexicon it appeared in (in the order listed below).
 stopwords <- bind_rows(
   get_stopwords("en", "snowball"),
   get_stopwords("en", "smart"),
   get_stopwords("en", "stopwords-iso"),
   get_stopwords("en", "marimo"),
   get_stopwords("en", "nltk"),
   tibble(word = c("my", "custom", "stopwords"), lexicon = "custom")
 ) |>
   group_by(word) |>
   summarise(lexicon = first(lexicon), .groups = "drop")

 # Drop every stopword, then count how often each remaining word occurs,
 # most frequent first. The join key is spelled out so the join stays on
 # `word` (and stays quiet) instead of relying on a natural join over
 # whatever columns the two tables happen to share.
 df_cleaned_word_freqs <- df |>
   anti_join(stopwords, by = "word") |>
   count(word, sort = TRUE)

 # Word cloud of the first `n` rows of the frequency table.
 max_size <- 15 # upper bound passed to scale_size_area(); may need tweaking
 n <- 30 # number of rows (words) to plot

 # Inside aes() `n` is the count column of df_cleaned_word_freqs; in
 # slice_head() it is the row limit assigned above.
 ggplot(
   slice_head(df_cleaned_word_freqs, n = n),
   aes(label = word, size = n, colour = n)
 ) +
   geom_text_wordcloud() +
   scale_colour_viridis_c(option = "magma", direction = -1, end = 0.9) +
   scale_size_area(max_size = max_size) +
   theme_minimal()
	library(tidyverse)
	library(ggwordcloud)
	library(tidytext)

	# https://github.com/lepennec/ggwordcloud
	# https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html

	# Download the text and flatten it: newline/return characters become spaces,
	# then runs of whitespace are squeezed to a single space.
	text <- read_file("https://www.gutenberg.org/cache/epub/84/pg84.txt") |>
	  str_replace_all(pattern = "\r\n|\n|\r", replacement = " ") |>
	  str_squish()

	# Put the text in a data frame, one row per word (column `word`).
	df <- tibble(text = text) |>
	  unnest_tokens(output = word, input = text)

	# Combine all the English stopword dictionaries plus a few custom words;
	# a word listed in several dictionaries is kept once, with the first
	# lexicon it came from.
	stopwords <- get_stopwords("en", "snowball") |>
	  bind_rows(get_stopwords("en", "smart")) |>
	  bind_rows(get_stopwords("en", "stopwords-iso")) |>
	  bind_rows(get_stopwords("en", "marimo")) |>
	  bind_rows(get_stopwords("en", "nltk")) |>
	  bind_rows(tibble(word = c("my", "custom", "stopwords"), lexicon = "custom")) |>
	  group_by(word) |>
	  summarise(lexicon = first(lexicon), .groups = "drop")

	# Remove stopwords and count the remaining word frequencies, highest first.
	df_cleaned_word_freqs <- df |>
	  anti_join(stopwords) |>
	  count(word) |>
	  arrange(desc(n))

	# Plot the n most common words in the text.
	n <- 30
	max_size <- 15 # size limit of plot, may need tweaking

	# In slice_head() `n` is the row limit set above; inside aes() `n` is the
	# count column of df_cleaned_word_freqs.
	df_cleaned_word_freqs |>
	  slice_head(n = n) |>
	  ggplot(aes(label = word, size = n, colour = n)) +
	  geom_text_wordcloud() +
	  scale_size_area(max_size = max_size) +
	  scale_colour_viridis_c(option = "magma", direction = -1, end = 0.9) +
	  theme_minimal()
No results found