Created
January 24, 2022 00:05
-
-
Save iancward/eaa4727a5058db51135977444655ca40 to your computer and use it in GitHub Desktop.
Python script that fetches a list of five-letter words and counts how often each letter occurs at each position in the word. Useful for finding good starting letters for Wordle.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# stdlib | |
import re | |
import string | |
from collections import Counter | |
from operator import itemgetter | |
# third party | |
import requests | |
# Number of letters in every word we tabulate (Wordle uses five-letter words).
WORD_LENGTH = 5

# List of five-letter words from meaningpedia.com
# found it linked from Wikipedia:
# https://en.wikipedia.org/wiki/Lists_of_English_words#External_links
WORD_LIST_URL = "https://meaningpedia.com/5-letter-words?show=all"

# Each word on the page is wrapped in a <span itemprop="name"> element.
# A regex is a blunt way to scrape HTML, but it keeps the script free of
# extra dependencies.
WORD_PATTERN = re.compile(r'<span itemprop="name">(\w+)</span>')

# Letters that are allowed to appear in a tabulated word.
ALPHABET = frozenset(string.ascii_lowercase)


def extract_words(html):
    """Return the lowercase five-letter words found in *html*.

    Matches that are not exactly ``WORD_LENGTH`` ASCII letters are dropped:
    ``\\w`` also matches digits, underscores and non-ASCII letters, and such
    entries would either crash the positional indexing or add rows beyond
    the 26-letter table.
    """
    words = []
    for match in WORD_PATTERN.findall(html):
        word = match.lower().rstrip()
        if len(word) == WORD_LENGTH and set(word) <= ALPHABET:
            words.append(word)
    return words


def count_letters_by_position(words):
    """Return one ``Counter`` per letter position, counting letters in *words*.

    Every word is expected to be ``WORD_LENGTH`` characters long.
    """
    counters = [Counter() for _ in range(WORD_LENGTH)]
    for word in words:
        for counter, letter in zip(counters, word):
            counter[letter] += 1
    return counters


def rank_letters(counter):
    """Return ``(letter, count)`` pairs for all 26 letters, most frequent first.

    Letters that never occur at this position are appended with a count of
    zero, in alphabetical order, so every column has the same length.
    """
    ranked = sorted(counter.items(), key=itemgetter(1), reverse=True)
    ranked += [
        (letter, 0) for letter in string.ascii_lowercase
        if letter not in counter
    ]
    return ranked


def format_rows(columns):
    """Return one tab-separated ``letter:count`` line per rank across *columns*."""
    return [
        '\t'.join('{}:{}'.format(letter, count) for letter, count in row)
        for row in zip(*columns)
    ]


def main():
    """Fetch the word list, tabulate letters by position and print the table."""
    print('Fetching word list')
    # A timeout keeps the script from hanging forever on a stalled server;
    # raise_for_status stops us from parsing an HTTP error page as words.
    meaningpedia_resp = requests.get(WORD_LIST_URL, timeout=30)
    meaningpedia_resp.raise_for_status()

    word_list = extract_words(meaningpedia_resp.text)
    if not word_list:
        raise SystemExit(
            'No five-letter words found in the response; '
            'the page markup may have changed.'
        )

    for word in word_list:
        print('Tabulating: {}'.format(word))

    columns = [
        rank_letters(counter)
        for counter in count_letters_by_position(word_list)
    ]

    print('word Frequency by location:')
    for row in format_rows(columns):
        print(row)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment