Skip to content

Instantly share code, notes, and snippets.

@iancward
Created January 24, 2022 00:05
Show Gist options
  • Save iancward/eaa4727a5058db51135977444655ca40 to your computer and use it in GitHub Desktop.
Save iancward/eaa4727a5058db51135977444655ca40 to your computer and use it in GitHub Desktop.
Python script to fetch a list of five-letter words and count how often each letter occurs at each position in the word. Useful for finding a start letter for Wordle.
#!/usr/bin/env python3
# stdlib
import re
import string
from collections import Counter
from operator import itemgetter
# third party
import requests
# Number of letter positions tabulated per word.
WORD_LENGTH = 5

# list of five-letter words from meaningpedia.com
# found it linked from Wikipedia:
# https://en.wikipedia.org/wiki/Lists_of_English_words#External_links
WORD_LIST_URL = "https://meaningpedia.com/5-letter-words?show=all"

# Words are pulled out of the page with a regex capture rather than a real
# HTML parse; crude, but it avoids an extra dependency and gets the job done.
WORD_PATTERN = re.compile(r'<span itemprop="name">(\w+)</span>')


def fetch_word_list(url=WORD_LIST_URL, timeout=30):
    """Download the word-list page and return the raw captured words.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A list of strings captured by ``WORD_PATTERN``, in page order.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of tabulating zero words.
    response.raise_for_status()
    return WORD_PATTERN.findall(response.text)


def normalize_words(raw_words, length=WORD_LENGTH):
    """Yield lower-cased words made of exactly ``length`` ASCII letters.

    The capture group ``\\w+`` also matches digits, underscores, non-ASCII
    letters and words of any length; anything that is not exactly ``length``
    characters from a-z is skipped so positional indexing stays valid.
    """
    alphabet = set(string.ascii_lowercase)
    for raw in raw_words:
        word = raw.strip().lower()
        if len(word) == length and alphabet.issuperset(word):
            yield word


def count_letters_by_position(words, length=WORD_LENGTH):
    """Count letter occurrences at each position.

    Args:
        words: Iterable of words; pass the output of ``normalize_words`` so
            every word has ``length`` letters.
        length: Number of positions to tabulate.

    Returns:
        A list of ``length`` Counters; element ``i`` maps each letter to the
        number of words having that letter at index ``i``.
    """
    counters = [Counter() for _ in range(length)]
    for word in words:
        for counter, letter in zip(counters, word):
            counter[letter] += 1
    return counters


def rank_letters(counter, alphabet=string.ascii_lowercase):
    """Return ``(letter, count)`` pairs sorted by descending count.

    Not every letter shows up in every position, so letters of ``alphabet``
    absent from ``counter`` are appended with a count of 0 (in alphabetical
    order). This guarantees one row per letter of the alphabet.
    """
    ranked = sorted(counter.items(), key=itemgetter(1), reverse=True)
    ranked += [(letter, 0) for letter in alphabet if letter not in counter]
    return ranked


def format_row(row):
    """Format one output line: tab-separated ``letter:count`` cells."""
    return '\t'.join(
        '{}:{}'.format(letter, count) for letter, count in row)


def main():
    """Fetch the word list, tabulate it, and print the ranked table."""
    print('Fetching word list')
    words = list(normalize_words(fetch_word_list()))
    if not words:
        # Most likely the page markup changed and the regex no longer matches.
        raise SystemExit(
            'No {}-letter words found at {}'.format(WORD_LENGTH, WORD_LIST_URL))

    for word in words:
        print('Tabulating: {}'.format(word))

    ranked = [rank_letters(counter)
              for counter in count_letters_by_position(words)]

    print('word Frequency by location:')
    # zip(*ranked) walks the per-position rankings in lockstep: row n holds
    # the n-th most frequent letter of every position.
    for row in zip(*ranked):
        print(format_row(row))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment