Last active
November 8, 2018 17:21
-
-
Save ajbrock/0ede94afed21325e2cbf87e0e50763d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
# Corpus available here: https://pastebin.com/WqD6fAgu
# Corpus taken from https://dominionstrategy.com/all-cards/
# Load the saved card listing, keeping one list entry per line of HTML
# (line endings are preserved; they are stripped later during parsing).
with open('dominion_cards.html', mode='r') as rfile:
    x = list(rfile)
# Convenience function to count words, used later | |
def count_words(text):
    """Return the number of "words" in *text*.

    Every occurrence of the symbols ``+``, ``$`` and ``-`` counts as one
    word of its own and is then removed from the text. Slashes are treated
    as word separators. Whatever remains is split on single space
    characters, and each non-empty piece counts as one word.

    Only the space character separates words; other whitespace (tabs,
    newlines) does not.
    """
    count = 0
    # Each symbol is a word by itself; strip it afterwards so that the
    # token it was attached to (e.g. the "1" in "+1") is counted separately.
    for symbol in ('+', '$', '-'):
        count += text.count(symbol)
        text = text.replace(symbol, '')
    # "a/b" should count as two words
    text = text.replace('/', ' ')
    # Consecutive spaces yield empty strings from split(' '); skip those.
    count += sum(1 for word in text.split(' ') if word)
    return count
# Extract cards and format pythonically
def _strip_tags(line, max_tags=100):
    """Return *line* with its ``<...>`` HTML tags removed.

    At most *max_tags* tags are removed; if a ``<`` is still present after
    that, 'breaking for safety' is printed and the line is returned as-is.
    A ``<`` with no ``>`` after it is left in place.
    """
    for _ in range(max_tags):
        low = line.find('<')
        if low == -1:
            return line
        # Search for the closing bracket *after* the opening one; searching
        # from the start of the line would pick up a stray earlier '>' (or
        # -1) and duplicate text instead of removing the tag.
        high = line.find('>', low)
        if high == -1:
            return line
        line = line[:low] + line[high + 1:]
    if '<' in line:
        print('breaking for safety')
    return line


# c holds one list per table row; each list collects the cleaned-up text of
# the lines found between that row's <tr> and </tr>.
c = []
active = False
weasels = ['\n']
for s in x:
    # The <tr> indicates the start of a card
    if '<tr>' in s:
        active = True
        c.append([])
    elif '</tr>' in s:
        active = False
    elif active:
        s = _strip_tags(s)
        for word in weasels:
            s = s.replace(word, '')
        # Abbreviate so the phrase counts as a single word; the plural is
        # replaced first so no stray "s" is left behind.
        s = s.replace('Victory Points', 'VP')
        s = s.replace('Victory Point', 'VP')
        c[-1].append(s)

# Toss landmarks and other somesuch so and sos. The match is a substring
# test against the second field of each row (presumably the card-type
# column -- confirm against the corpus). Rows with fewer than two fields
# cannot be classified and are dropped rather than raising IndexError.
excluded_types = ('Boon', 'Landmark', 'Hex', 'State', 'Event', 'Castle',
                  'Ruins', 'Shelter')
c = [item for item in c
     if len(item) > 1
     and not any(word in item[1] for word in excluded_types)]
# Everything from the fourth field onwards is joined and counted as text
txt = [', '.join(item[3:]) for item in c]

# Count words
counts = [count_words(item) for item in txt]
order = np.argsort(counts)

# Print out num_display cards and their wordcounts; display more than ten so
# we can skip ones with mistakes or errors
num_display = 25
print('Displaying %d cards with lowest number of words...' % num_display)
# Index the plain lists directly: rows have differing lengths, and building
# an ndarray from ragged nested lists is an error in current NumPy.
for i in range(min(num_display, len(order))):
    print('#%d: %s, %s' % (i, c[order[i]], counts[order[i]]))

# Which ones I select
print('-------------------------------------')
print('-------------------------------------')
# My selected cards, and the total count. The indices are hand-picked
# positions in the sorted order printed above.
my_indices = [4, 7, 8, 9, 10, 11, 12, 13, 14, 15]
for num, i in enumerate(my_indices, 1):
    print('#%d: %s, %s' % (num, c[order[i]], counts[order[i]]))
print('Sum of all words is %d' % sum(counts[order[index]] for index in my_indices))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment