billdenney · February 5, 2022 18:51
diff --git a/wordle.R b/wordle.R
 # Load all required libraries
 library(tidyverse)

 # Functions ####

 #' Load the possible word lists from the Wordle source
 #' 
 #' @param url The javascript source for finding the word lists
 #' @param match_possible_words A word that is within the word list for the
 #'   possible words (mainly, words that people are likely to know, the Wordle
 #'   developer was kind)
 #' @param match_all_words A word that is within the word list for all words
 #'   (more than just the possible words)
 #' @param expected_characters The number of characters expected in every word,
 #'   used as a quality check for the outputs
 #' @return A named list with names of "possible" and "all" for the possible
 #'   and all word lists
 #' @export
 load_wordle_word_lists <- function(
    url="https://www.powerlanguage.co.uk/wordle/main.c1506a22.js",
    match_possible_words="balmy", match_all_words="aahed",
    expected_characters=5) {
  word_source_raw <- readLines(url)
  ## There are two word lists in the source, one that is allowed words to be the
  ## correct answer, and the other that is all words allowed to be guessed
  two_lists <- c(possible=match_possible_words, all=match_all_words)
  two_lists_l <- list()
  for (current_word_type in names(two_lists)) {
    current_word <- two_lists[[current_word_type]]
    mask <- grepl(x=word_source_raw, pattern=current_word)
    word_source_words_raw <-
      gsub(
        x=word_source_raw[mask],
        pattern=sprintf('^.*\\[([",a-z]*%s[",a-z]*)\\].*$', current_word),
        replacement="\\1"
      )
    #word_source_words_raw <- gsub(x=word_source_raw[mask], pattern='^.*?\\[((?:\\"[a-z]{5}\\",?)+)\\].*?$', replacement="\\1")
    two_lists_l[[current_word_type]] <-
      gsub(x=strsplit(x=word_source_words_raw, split=",")[[1]], pattern='"', replacement="")
  }
  ## All words that are allowed
  words <- unname(unlist(two_lists_l))
  ## All words that are possible
  possible_words <- two_lists_l$balmy
  # Make sure that loading the data worked correctly (all words should have 5 characters)
  stopifnot(all(nchar(words) == expected_characters))
  words[nchar(words) != expected_characters]
  two_lists_l
 }

 #' Generate a data.frame with all the letter/position for all words
 #' 
 #' @param word The list of words (character vector)
 #' @return A data.frame with rownames of \code{word} and the first column "word"
 #'   containing \code{word}.  Then additional columns for each letter and
 #'   letter/position ("a", "a1", "a2", ...) with a boolean indicating if the
 #'   letter is in the word or the letter is in the word at the position.
 #' @examples
 #' add_word_attributes(c("binge", "splut"))
 #' @export
 wordle_has_letter_setup <- function(word) {
  stopifnot(length(unique(nchar(word))) == 1)
  ret <- data.frame(word=word)
  for (current_letter in letters) {
    ret[[current_letter]] <- grepl(x=word, pattern=current_letter, fixed=TRUE)
    for (current_idx in seq_len(nchar(word[1]))) {
      ret[[paste0(current_letter, current_idx)]] <- current_letter == substr(word, current_idx, current_idx)
    }
  }
  rownames(ret) <- word
  ret
 }

 #' Generate the "colors" for a word guess relative to the possible words
 #' 
 #' @param guess The word that was guessed
 #' @param possible The vector of possible words
 #' @return An integer vector where 0 indicates no in the word, 1 indicates
 #'   elsewhere in the word, and 2 indicates at the current location in the word.
 #' @export
 wordle_guess_colors <- function(current_guess, wordle_has_letter_df) {
  states <- rep(0, nrow(wordle_has_letter_df))
  for (idx in seq_len(nchar(current_guess))) {
    current_letter <- substr(current_guess, idx, idx)
    is_anywhere <- wordle_has_letter_df[, current_letter]
    is_here <- wordle_has_letter_df[, paste0(current_letter, idx)]
    states <- states*10 + (is_anywhere + is_here)
  }
  as.integer(states)
 }

 #' Generate a data.frame of "colors" where the first column are the possible
 #' words to guess and the remaining columns are the colors generated by the
 #' guessed word.
 #' 
 #' @inheritParams wordle_guess_colors 
 wordle_guess_colors_df <- function(possible, all_words) {
  wordle_has_letter_df <- wordle_has_letter_setup(possible)
  all_words <- sort(unique(c(possible, all_words)))
  ret <- data.frame(X=possible)
  pb <- txtProgressBar(min=0, max=length(all_words), style=3)
  for (current_all in all_words) {
    setTxtProgressBar(pb, value=getTxtProgressBar(pb) + 1)
    ret[[current_all]] <-
      wordle_guess_colors(
        current_guess=current_all,
        wordle_has_letter_df=wordle_has_letter_df
      )
  }
  ret
 }

 new_wordle_game <- function(correct_word, possible_df, expected_char=5) {
  stopifnot(is.character(correct_word))
  stopifnot(length(correct_word) == 1)
  stopifnot(nchar(correct_word) == expected_char)
  list(
    correct=correct_word,
    possible_df=possible_df,
    available_rows=rep(TRUE, nrow(possible_df)),
    available_cols=c(FALSE, rep(TRUE, ncol(possible_df) - 1))
  )
 }

 wordle_game_state <- function(game_state, guess, verbose=TRUE) {
  guess_value <- game_state$possible_df[game_state$possible_df$X == game_state$correct, guess]
  history_new <- setNames(guess_value, guess)
  game_state$history <- c(game_state$history, history_new)
  game_state$available_rows <- game_state$available_rows & game_state$possible_df[, guess] == guess_value
  # game_state$available_cols <-
  #   (names(game_state$available_cols) %in% game_state$possible_df$X)
  if (verbose) {
    message(
      "Number of possible words: ", sum(game_state$available_rows), "\n",
      "Number of possible guesses: ", sum(game_state$available_cols), "\n",
      "History: ", paste(names(game_state$history), game_state$history, sep="=", collapse=", ")
    )
  }
  game_state
 }

 wordle_choose_guess <- function(game_state, metric_fun, verbose=TRUE) {
  possible_words <- game_state$possible_df[[1]][game_state$available_rows]
  if (length(possible_words) < 3) {
    # If there are two choices, guess
    return(setNames(rep(-Inf, length(possible_words)), possible_words))
  }
  if (!is.list(metric_fun)) {
    metric_fun <- list(metric_fun)
  }
  if (verbose) {
    pb <- txtProgressBar(min=0, max=sum(game_state$available_cols), style=3)
    pb_metric_fun <- function(...) {
      setTxtProgressBar(pb, value=getTxtProgressBar(pb) + 1)
      metric_fun[[1]](...)
    }
  } else {
    pb_metric_fun <- metric_fun[[1]]
  }
  metrics <-
    sapply(
      # operate only on the available rows from the available columns
      X=game_state$possible_df[game_state$available_cols][game_state$available_rows, ],
      FUN=pb_metric_fun
    )
  # First prioritize improved metrics.  Then prioritize words that are possible
  # (so that you can get it in 1 more).
  ret_metrics <- metrics[order(metrics, -(names(metrics) %in% possible_words))]
  # If there are more than one metric functions defined, break top-level ties with the next metric function
  mask_equal_to_best <- ret_metrics == ret_metrics[[1]]
  if ((length(metric_fun) > 1) & sum(mask_equal_to_best) > 1) {
    metrics2 <-
      sapply(
        # operate only on the available rows from the available columns
        X=game_state$possible_df[names(ret_metrics)][game_state$available_rows, ],
        FUN=metric_fun[[2]]
      )
    ret <- ret_metrics[mask_equal_to_best][order(metrics2)]
  } else {
    ret <- ret_metrics
  }
  ret
 }

 wordle_choose_guess_cache <- function(metric_fun) {
  cache <- force(list())
  metric_fun <- force(metric_fun)
  function(game_state, verbose) {
    history_chr <- paste(names(game_state$history), game_state$history, sep="=", collapse=", ")
    if (history_chr %in% names(cache)) {
      if (verbose) message("Using cache for ", history_chr, ": ", names(cache[[history_chr]]))
    } else {
      # Only store the first guess, since that is all that is used.
      cache[[history_chr]] <<- wordle_choose_guess(game_state=game_state, metric_fun=metric_fun, verbose=verbose)[1]
    }
    cache[[history_chr]]
  }
 }

 # Autoplay ####

 wordle_autoplay <- function(game_state, guess_fun, initial_guess, verbose_guess=FALSE, verbose_state=FALSE) {
  game_state <- wordle_game_state(game_state, guess=initial_guess, verbose=verbose_state)
  while (game_state$history[length(game_state$history)] != 22222) {
    current_guess <- guess_fun(game_state=game_state, verbose=verbose_guess)
    game_state <- wordle_game_state(game_state, guess=names(current_guess)[1], verbose=verbose_state)
  }
  message(
    "Game complete for ", game_state$correct, "; ", length(game_state$history), " guesses; ",
    paste(names(game_state$history), game_state$history, sep="=", collapse=", ")
  )
  game_state
 }

 # Word selection metrics ####

 wordle_metric_smallest_big_group <- function(x) {
  max(summary(factor(x)))
 }

 wordle_metric_most_to_median_group <- function(x) {
  sf <- rev(sort(summary(factor(x))))
  -unname(which(cumsum(sf) > (length(x)/2))[1])
 }

 wordle_metric_smallest_median_group <- function(x) {
  sf <- sort(summary(factor(x)))
  unname(sf[which(cumsum(sf) > (length(x)/2))[1]])
 }

 guess_cache_smallest_big_group <- wordle_choose_guess_cache(metric_fun=wordle_metric_smallest_big_group)
 guess_cache_smallest_median_group <- wordle_choose_guess_cache(metric_fun=wordle_metric_smallest_median_group)
 guess_cache_smallest_big_then_median_group <- wordle_choose_guess_cache(metric_fun=list(wordle_metric_smallest_big_group, wordle_metric_smallest_median_group))

 # Game Setup ####

 words <- load_wordle_word_lists()
 wordle_guesses_possible <- wordle_guess_colors_df(possible=words$possible, all_words=words$all)

 # Start a game ####
 # game_pleat <- new_wordle_game(correct_word="pleat", possible_df=wordle_guesses_possible)
 # initial_guess_smallest_big_group <- wordle_choose_guess(game_state=game_pleat, metric_fun=wordle_metric_smallest_big_group)
 # initial_guess_most_to_median_group <- wordle_choose_guess(game_state=game_pleat, metric_fun=wordle_metric_most_to_median_group)
 # initial_guess_smallest_median_group <- wordle_choose_guess(game_state=game_pleat, metric_fun=wordle_metric_smallest_median_group)

 pb_all_words <- txtProgressBar(min=0, max=nrow(wordle_guesses_possible), style=3)
 ret <- list()
 for (current_word in sort(wordle_guesses_possible$X)) {
  setTxtProgressBar(pb_all_words, value=getTxtProgressBar(pb_all_words) + 1, title="All words")
  ret[[current_word]] <-
    wordle_autoplay(
      game_state=new_wordle_game(correct_word=current_word, possible_df=wordle_guesses_possible),
      guess_fun=guess_cache_smallest_big_group,
      initial_guess="raise"
    )
 }

 guess_distribution <- sapply(X=ret, FUN=function(x) length(x$history))
 ggplot2::ggplot(data=data.frame(distr=guess_distribution), aes(x=distr)) +
  geom_histogram()
	# Load all required libraries
	library(tidyverse)

	# Functions ####

	#' Load the possible word lists from the Wordle source
	#'
	#' @param url The javascript source for finding the word lists
	#' @param match_possible_words A word that is within the word list for the
	#' possible words (mainly, words that people are likely to know, the Wordle
	#' developer was kind)
	#' @param match_all_words A word that is within the word list for all words
	#' (more than just the possible words)
	#' @param expected_characters The number of characters expected in every word,
	#' used as a quality check for the outputs
	#' @return A named list with names of "possible" and "all" for the possible
	#' and all word lists
	#' @export
	load_wordle_word_lists <- function(
	url="https://www.powerlanguage.co.uk/wordle/main.c1506a22.js",
	match_possible_words="balmy", match_all_words="aahed",
	expected_characters=5) {
	word_source_raw <- readLines(url)
	## There are two word lists in the source, one that is allowed words to be the
	## correct answer, and the other that is all words allowed to be guessed
	two_lists <- c(possible=match_possible_words, all=match_all_words)
	two_lists_l <- list()
	for (current_word_type in names(two_lists)) {
	current_word <- two_lists[[current_word_type]]
	mask <- grepl(x=word_source_raw, pattern=current_word)
	word_source_words_raw <-
	gsub(
	x=word_source_raw[mask],
	pattern=sprintf('^.\\[([",a-z]%s[",a-z])\\].$', current_word),
	replacement="\\1"
	)
	#word_source_words_raw <- gsub(x=word_source_raw[mask], pattern='^.?\\[((?:\\"[a-z]{5}\\",?)+)\\].?$', replacement="\\1")
	two_lists_l[[current_word_type]] <-
	gsub(x=strsplit(x=word_source_words_raw, split=",")[[1]], pattern='"', replacement="")
	}
	## All words that are allowed
	words <- unname(unlist(two_lists_l))
	## All words that are possible
	possible_words <- two_lists_l$balmy
	# Make sure that loading the data worked correctly (all words should have 5 characters)
	stopifnot(all(nchar(words) == expected_characters))
	words[nchar(words) != expected_characters]
	two_lists_l
	}

	#' Generate a data.frame with all the letter/position for all words
	#'
	#' @param word The list of words (character vector)
	#' @return A data.frame with rownames of \code{word} and the first column "word"
	#' containing \code{word}. Then additional columns for each letter and
	#' letter/position ("a", "a1", "a2", ...) with a boolean indicating if the
	#' letter is in the word or the letter is in the word at the position.
	#' @examples
	#' add_word_attributes(c("binge", "splut"))
	#' @export
	wordle_has_letter_setup <- function(word) {
	stopifnot(length(unique(nchar(word))) == 1)
	ret <- data.frame(word=word)
	for (current_letter in letters) {
	ret[[current_letter]] <- grepl(x=word, pattern=current_letter, fixed=TRUE)
	for (current_idx in seq_len(nchar(word[1]))) {
	ret[[paste0(current_letter, current_idx)]] <- current_letter == substr(word, current_idx, current_idx)
	}
	}
	rownames(ret) <- word
	ret
	}

	#' Generate the "colors" for a word guess relative to the possible words
	#'
	#' @param guess The word that was guessed
	#' @param possible The vector of possible words
	#' @return An integer vector where 0 indicates no in the word, 1 indicates
	#' elsewhere in the word, and 2 indicates at the current location in the word.
	#' @export
	wordle_guess_colors <- function(current_guess, wordle_has_letter_df) {
	states <- rep(0, nrow(wordle_has_letter_df))
	for (idx in seq_len(nchar(current_guess))) {
	current_letter <- substr(current_guess, idx, idx)
	is_anywhere <- wordle_has_letter_df[, current_letter]
	is_here <- wordle_has_letter_df[, paste0(current_letter, idx)]
	states <- states*10 + (is_anywhere + is_here)
	}
	as.integer(states)
	}

	#' Generate a data.frame of "colors" where the first column are the possible
	#' words to guess and the remaining columns are the colors generated by the
	#' guessed word.
	#'
	#' @inheritParams wordle_guess_colors
	wordle_guess_colors_df <- function(possible, all_words) {
	wordle_has_letter_df <- wordle_has_letter_setup(possible)
	all_words <- sort(unique(c(possible, all_words)))
	ret <- data.frame(X=possible)
	pb <- txtProgressBar(min=0, max=length(all_words), style=3)
	for (current_all in all_words) {
	setTxtProgressBar(pb, value=getTxtProgressBar(pb) + 1)
	ret[[current_all]] <-
	wordle_guess_colors(
	current_guess=current_all,
	wordle_has_letter_df=wordle_has_letter_df
	)
	}
	ret
	}

	new_wordle_game <- function(correct_word, possible_df, expected_char=5) {
	stopifnot(is.character(correct_word))
	stopifnot(length(correct_word) == 1)
	stopifnot(nchar(correct_word) == expected_char)
	list(
	correct=correct_word,
	possible_df=possible_df,
	available_rows=rep(TRUE, nrow(possible_df)),
	available_cols=c(FALSE, rep(TRUE, ncol(possible_df) - 1))
	)
	}

	wordle_game_state <- function(game_state, guess, verbose=TRUE) {
	guess_value <- game_state$possible_df[game_state$possible_df$X == game_state$correct, guess]
	history_new <- setNames(guess_value, guess)
	game_state$history <- c(game_state$history, history_new)
	game_state$available_rows <- game_state$available_rows & game_state$possible_df[, guess] == guess_value
	# game_state$available_cols <-
	# (names(game_state$available_cols) %in% game_state$possible_df$X)
	if (verbose) {
	message(
	"Number of possible words: ", sum(game_state$available_rows), "\n",
	"Number of possible guesses: ", sum(game_state$available_cols), "\n",
	"History: ", paste(names(game_state$history), game_state$history, sep="=", collapse=", ")
	)
	}
	game_state
	}

	wordle_choose_guess <- function(game_state, metric_fun, verbose=TRUE) {
	possible_words <- game_state$possible_df[[1]][game_state$available_rows]
	if (length(possible_words) < 3) {
	# If there are two choices, guess
	return(setNames(rep(-Inf, length(possible_words)), possible_words))
	}
	if (!is.list(metric_fun)) {
	metric_fun <- list(metric_fun)
	}
	if (verbose) {
	pb <- txtProgressBar(min=0, max=sum(game_state$available_cols), style=3)
	pb_metric_fun <- function(...) {
	setTxtProgressBar(pb, value=getTxtProgressBar(pb) + 1)
	metric_fun[[1]](...)
	}
	} else {
	pb_metric_fun <- metric_fun[[1]]
	}
	metrics <-
	sapply(
	# operate only on the available rows from the available columns
	X=game_state$possible_df[game_state$available_cols][game_state$available_rows, ],
	FUN=pb_metric_fun
	)
	# First prioritize improved metrics. Then prioritize words that are possible
	# (so that you can get it in 1 more).
	ret_metrics <- metrics[order(metrics, -(names(metrics) %in% possible_words))]
	# If there are more than one metric functions defined, break top-level ties with the next metric function
	mask_equal_to_best <- ret_metrics == ret_metrics[[1]]
	if ((length(metric_fun) > 1) & sum(mask_equal_to_best) > 1) {
	metrics2 <-
	sapply(
	# operate only on the available rows from the available columns
	X=game_state$possible_df[names(ret_metrics)][game_state$available_rows, ],
	FUN=metric_fun[[2]]
	)
	ret <- ret_metrics[mask_equal_to_best][order(metrics2)]
	} else {
	ret <- ret_metrics
	}
	ret
	}

	wordle_choose_guess_cache <- function(metric_fun) {
	cache <- force(list())
	metric_fun <- force(metric_fun)
	function(game_state, verbose) {
	history_chr <- paste(names(game_state$history), game_state$history, sep="=", collapse=", ")
	if (history_chr %in% names(cache)) {
	if (verbose) message("Using cache for ", history_chr, ": ", names(cache[[history_chr]]))
	} else {
	# Only store the first guess, since that is all that is used.
	cache[[history_chr]] <<- wordle_choose_guess(game_state=game_state, metric_fun=metric_fun, verbose=verbose)[1]
	}
	cache[[history_chr]]
	}
	}

	# Autoplay ####

	wordle_autoplay <- function(game_state, guess_fun, initial_guess, verbose_guess=FALSE, verbose_state=FALSE) {
	game_state <- wordle_game_state(game_state, guess=initial_guess, verbose=verbose_state)
	while (game_state$history[length(game_state$history)] != 22222) {
	current_guess <- guess_fun(game_state=game_state, verbose=verbose_guess)
	game_state <- wordle_game_state(game_state, guess=names(current_guess)[1], verbose=verbose_state)
	}
	message(
	"Game complete for ", game_state$correct, "; ", length(game_state$history), " guesses; ",
	paste(names(game_state$history), game_state$history, sep="=", collapse=", ")
	)
	game_state
	}

	# Word selection metrics ####

	wordle_metric_smallest_big_group <- function(x) {
	max(summary(factor(x)))
	}

	wordle_metric_most_to_median_group <- function(x) {
	sf <- rev(sort(summary(factor(x))))
	-unname(which(cumsum(sf) > (length(x)/2))[1])
	}

	wordle_metric_smallest_median_group <- function(x) {
	sf <- sort(summary(factor(x)))
	unname(sf[which(cumsum(sf) > (length(x)/2))[1]])
	}

	guess_cache_smallest_big_group <- wordle_choose_guess_cache(metric_fun=wordle_metric_smallest_big_group)
	guess_cache_smallest_median_group <- wordle_choose_guess_cache(metric_fun=wordle_metric_smallest_median_group)
	guess_cache_smallest_big_then_median_group <- wordle_choose_guess_cache(metric_fun=list(wordle_metric_smallest_big_group, wordle_metric_smallest_median_group))

	# Game Setup ####

	words <- load_wordle_word_lists()
	wordle_guesses_possible <- wordle_guess_colors_df(possible=words$possible, all_words=words$all)

	# Start a game ####
	# game_pleat <- new_wordle_game(correct_word="pleat", possible_df=wordle_guesses_possible)
	# initial_guess_smallest_big_group <- wordle_choose_guess(game_state=game_pleat, metric_fun=wordle_metric_smallest_big_group)
	# initial_guess_most_to_median_group <- wordle_choose_guess(game_state=game_pleat, metric_fun=wordle_metric_most_to_median_group)
	# initial_guess_smallest_median_group <- wordle_choose_guess(game_state=game_pleat, metric_fun=wordle_metric_smallest_median_group)

	pb_all_words <- txtProgressBar(min=0, max=nrow(wordle_guesses_possible), style=3)
	ret <- list()
	for (current_word in sort(wordle_guesses_possible$X)) {
	setTxtProgressBar(pb_all_words, value=getTxtProgressBar(pb_all_words) + 1, title="All words")
	ret[[current_word]] <-
	wordle_autoplay(
	game_state=new_wordle_game(correct_word=current_word, possible_df=wordle_guesses_possible),
	guess_fun=guess_cache_smallest_big_group,
	initial_guess="raise"
	)
	}

	guess_distribution <- sapply(X=ret, FUN=function(x) length(x$history))
	ggplot2::ggplot(data=data.frame(distr=guess_distribution), aes(x=distr)) +
	geom_histogram()