Last active
October 6, 2023 19:39
-
-
Save jsundram/b3c2f3120645b25895f8b53faf36decf to your computer and use it in GitHub Desktop.
Letter-statistics visualizations for Wordle (using the short Wordle word list).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
from string import ascii_lowercase as ALPHABET | |
import json | |
import matplotlib.pyplot as plt | |
import numpy as np | |
""" | |
For https://www.powerlanguage.co.uk/wordle/. | |
Read more here: https://www.nytimes.com/2022/01/03/technology/wordle-word-game-creator.html | |
""" | |
# Apply the "Solarize_Light2" style sheet, then copy the axes face/edge
# colours into the savefig settings so saved images use the same colours.
plt.style.use("Solarize_Light2")
plt.rcParams.update(
    {
        "savefig.facecolor": plt.rcParams["axes.facecolor"],
        "savefig.edgecolor": plt.rcParams["axes.edgecolor"],
    }
)
def get_words(key='short'):
    """Return the list of words for the requested source.

    Key must be one of:
    1) "short": wordle's short list (2,314 entries)
    2) "long": wordle's long list
    3) "dict": /usr/share/dict/words of length 5 (not proper nouns)

    "short" and "long" are read from ``wordle.json`` (resolved relative to
    the current working directory) and lower-cased; if the file has no entry
    for the key, an empty list is returned.

    "dict" keeps only entries that are exactly five characters long and made
    up solely of the letters a-z. This drops capitalised entries as well as
    entries containing apostrophes or accented letters, which cannot be
    looked up in an a-z frequency table.

    Any other key returns an empty list.
    """
    if key in ('short', 'long'):
        # Explicit encoding so the result does not depend on the locale.
        with open('wordle.json', encoding='utf-8') as f:
            data = json.load(f)
        return [w.lower() for w in data.get(key, [])]

    if key == 'dict':
        letters = set(ALPHABET)
        # errors='replace' keeps a stray non-UTF-8 byte from aborting the
        # whole read; the affected entry is rejected by the a-z filter below.
        with open(
            '/usr/share/dict/words', encoding='utf-8', errors='replace'
        ) as f:
            candidates = [line.strip() for line in f]
        return [
            w for w in candidates
            if len(w) == 5 and letters.issuperset(w)
        ]

    return []
def histogram(words):
    """Draw a horizontal bar chart of character counts and save it.

    Every character of every word in ``words`` is tallied. Bars are listed
    in sorted character order from top to bottom, and the figure is written
    to ``histogram.png``.
    """
    tally = Counter()
    for word in words:
        tally.update(word)

    letters = sorted(tally)
    totals = [tally[letter] for letter in letters]
    # Descending positions put the first sorted character on the top bar.
    positions = list(range(len(letters) - 1, -1, -1))

    fig, ax = plt.subplots()
    ax.barh(y=positions, width=totals, tick_label=letters)
    ax.set_title("Letter Distribution from {:,} 5-letter Words".format(len(words)))
    plt.savefig("histogram.png")
def heatmap(words):
    """Save a heatmap of letter frequency by position to ``heatmap.png``.

    The grid comes from ``get_frequencies(words)``: one row per character
    position and one column per letter of ``ALPHABET``. Row labels are
    derived from the number of rows in that grid, so the tick labels always
    match the tick positions.

    Layout approach:
    https://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor
    """
    data = get_frequencies(words)
    n_positions, n_letters = data.shape

    fig, ax = plt.subplots(figsize=(10, 3))
    mesh = ax.pcolor(data, cmap=plt.cm.RdYlBu_r)

    # Put the major ticks at the middle of each cell.
    ax.set_xticks(np.arange(n_letters) + 0.5, minor=False)
    ax.set_yticks(np.arange(n_positions) + 0.5, minor=False)

    # Want a more natural, table-like display.
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    ax.set_xticklabels(ALPHABET, minor=False)
    # One 1-based label per row, matching the y ticks set above.
    ax.set_yticklabels(list(range(1, n_positions + 1)), minor=False)
    ax.set_aspect("equal")

    ax.set_title(
        "Letter Frequencies from {:,} 5-letter words by position in word".format(
            len(words)
        )
    )
    ax.set_ylabel("Position")
    ax.set_xlabel("Letter")

    # Make a legend that aligns with the heatmap
    # https://stackoverflow.com/a/18195921/2683
    ax_pos = ax.get_position()
    spacing, width = 0.01, 0.02
    cax = fig.add_axes([ax_pos.x1 + spacing, ax_pos.y0, width, ax_pos.height])
    fig.colorbar(mesh, cax=cax)

    plt.savefig("heatmap.png", dpi=300)
def get_frequencies(words):
    """Count how often each letter occurs at each position in ``words``.

    Returns a 2-D integer numpy array with one row per character position
    and one column per letter of ``ALPHABET``; ``table[p][i]`` is the number
    of words whose character at index ``p`` is ``ALPHABET[i]``.

    The row count is the length of the longest word, so an empty ``words``
    gives a table with zero rows instead of failing, and words of differing
    lengths are all counted.

    Raises:
        KeyError: if a word contains a character that is not in ``ALPHABET``.
    """
    n_positions = max((len(word) for word in words), default=0)
    table = np.zeros((n_positions, len(ALPHABET)), dtype=int)
    column_of = {letter: i for (i, letter) in enumerate(ALPHABET)}
    for word in words:
        for row, letter in enumerate(word):
            table[row, column_of[letter]] += 1
    return table
def main():
    """Load the short word list and render the histogram and the heatmap."""
    short_list = get_words('short')
    for render in (histogram, heatmap):
        render(short_list)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment