Skip to content

Instantly share code, notes, and snippets.

@dutc
Last active March 8, 2021 15:33
Show Gist options
  • Save dutc/6fc4590c4e24ef3f7465a811293feaa9 to your computer and use it in GitHub Desktop.
Save dutc/6fc4590c4e24ef3f7465a811293feaa9 to your computer and use it in GitHub Desktop.
“Python Expert” Newsletter (Mar 3, 2021): Learning Corner, Python Data Structure Fundamentals and “Concordance”
# original attendee solution, PRIOR to refactoring
# Input corpus for the reports below. It is consumed via `text.split()`, so
# the blank first and last lines inside the triple quotes yield no tokens.
text = '''
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
Stop, what are you talking about? Ha
Get my pretty name out of your mouth
We are not the same with or without
Don't talk 'bout me like how you might know how I feel
Top of the world, but your world isn't real
Your world's an ideal
So go have fun
I really couldn't care less
And you can give 'em my best, but just know
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
'''
def strip_punctuation(word):
    """Return `word` with every non-alphabetic character removed.

    Despite the name, anything for which `str.isalpha()` is false is
    dropped, not just punctuation (e.g. "you're" -> "youre",
    "think," -> "think", "123" -> "").
    """
    # One join instead of repeated `+=`, which may re-copy the partial
    # result on every iteration.
    return ''.join(c for c in word if c.isalpha())
# Words excluded from the reports. Entries are lower-case with punctuation
# already removed (hence "youre" and "im"), because words are tested against
# this collection only after case folding and punctuation stripping.
# A frozenset gives constant-time membership tests.
STOP_WORDS = frozenset({
    'the', 'and', 'not', 'my', 'no', 'is', 'i',
    'but', 'what', 'by', 'that', 'for', 'you', 'youre',
    'or', 'im',
})
# Normalise every whitespace-separated token: fold case, strip non-letters,
# then drop stop words. A token with no letters at all (e.g. a lone "-")
# normalises to the empty string; it is dropped as well so that '' can never
# be counted as a word.
words = text.split()
normalised = (strip_punctuation(w.lower()) for w in words)
cleaned_words = [w for w in normalised if w and w not in STOP_WORDS]
# Count occurrences with a dict (constant-time lookup per word) instead of
# scanning a list for every word. Dicts preserve insertion order, so the two
# parallel lists come out in first-appearance order: `seen[i]` is the i-th
# distinct word and `freq[i]` is how many times it occurs.
counts = {}
for w in cleaned_words:
    counts[w] = counts.get(w, 0) + 1
seen = list(counts)
freq = list(counts.values())
# Select the three most frequent words as two parallel lists, highest count
# first. The candidates are fully ranked because overwriting the first
# smaller slot of a fixed-size list discards the displaced entry instead of
# demoting it, which can lose a genuine runner-up. `sorted` is stable, so
# words with equal counts keep their first-appearance order.
ranked = sorted(zip(seen, freq), key=lambda pair: pair[1], reverse=True)[:3]
most_common_words = [w for w, _ in ranked]
most_common_counts = [c for _, c in ranked]
# Always expose exactly three slots; unused ones hold None / 0.
padding = 3 - len(ranked)
most_common_words += [None] * padding
most_common_counts += [0] * padding
# Report I: one line per slot of the top-three lists, under a centred banner.
print('Report I'.center(50, '-'))
for word, count in zip(most_common_words, most_common_counts):
    print(f'The word "{word}" appears {count} times.')
# Letter -> point value. Keys are lower-case letters; 'q' and 'z' have no
# entry.
tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8}


def score(word):
    """Return the sum of the tile values of the characters in `word`.

    Characters without an entry in `tile_scores` (upper-case letters,
    punctuation, 'q', 'z', ...) contribute 0. The empty string scores 0.
    """
    # A single lookup with a default replaces the `in` test plus indexing.
    return sum(tile_scores.get(c, 0) for c in word)
# Select the three words with the highest aggregate score (count x tile
# score) as three parallel lists, best first. The candidates are fully
# ranked because overwriting the first smaller slot of a fixed-size list
# discards the displaced entry instead of demoting it. `sorted` is stable,
# so ties keep first-appearance order.
scored = [(w, c, c * score(w)) for w, c in zip(seen, freq)]
ranked = sorted(
    # Entries with a zero aggregate never qualify; their slot stays empty.
    (entry for entry in scored if entry[2] > 0),
    key=lambda entry: entry[2],
    reverse=True,
)[:3]
highest_scoring_words = [w for w, _, _ in ranked]
highest_scoring_counts = [c for _, c, _ in ranked]
highest_scoring_scores = [s for _, _, s in ranked]
# Always expose exactly three slots; unused ones hold None / 0 / 0.
padding = 3 - len(ranked)
highest_scoring_words += [None] * padding
highest_scoring_counts += [0] * padding
highest_scoring_scores += [0] * padding
# Report II: one line per slot of the top-three-by-score lists.
print('Report II'.center(50, '-'))
# The loop variable is deliberately not named `score`: binding that name at
# module level would replace the `score()` function with an int.
for word, count, total in zip(highest_scoring_words,
                              highest_scoring_counts,
                              highest_scoring_scores):
    print(f'The word "{word}" appears {count} times with a score of {total}.')
#!/usr/bin/env python3
# TASK: compute the word frequencies in the below text
# and report on the three most common words that appear
# Sample text for the exercise. The starter loop below only splits it on
# whitespace, so the blank first and last lines inside the triple quotes
# yield no tokens.
text = '''
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
Stop, what are you talking about? Ha
Get my pretty name out of your mouth
We are not the same with or without
Don't talk 'bout me like how you might know how I feel
Top of the world, but your world isn't real
Your world's an ideal
So go have fun
I really couldn't care less
And you can give 'em my best, but just know
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
'''
# Starter code: break the text into whitespace-separated tokens and echo each
# one unchanged (case and punctuation are still attached at this point).
words = text.split()
for w in words:
    # `{w = }` prints the expression text followed by the repr of its value.
    print(f'{w = }')
#!/usr/bin/env python3
# TASK: use the below Scrabble-style scoring dictionary
# and compute the three highest aggregate scores
# (i.e., find the max by frequency × word-score)
# Sample text for the exercise. The starter loop below only splits it on
# whitespace, so the blank first and last lines inside the triple quotes
# yield no tokens.
text = '''
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
Stop, what are you talking about? Ha
Get my pretty name out of your mouth
We are not the same with or without
Don't talk 'bout me like how you might know how I feel
Top of the world, but your world isn't real
Your world's an ideal
So go have fun
I really couldn't care less
And you can give 'em my best, but just know
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
'''
# Letter values grouped by point tier. The ×N suffixes are not represented in
# `tile_scores` (presumably tile counts; verify against the exercise text).
# 1 pt: E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
# 2 pts: D×4, G×3
# 3 pts: B×2, C×2, M×2, P×2
# 4 pts: F×2, H×2, V×2, W×2, Y×2
# 5 pts: K×1
# 8 pts: J×1, X×1
# Letter -> point value, keyed by lower-case letter.
tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}
# Starter code: break the text into whitespace-separated tokens and echo each
# one unchanged; `tile_scores` is not used yet.
words = text.split()
for w in words:
    # `{w = }` prints the expression text followed by the repr of its value.
    print(f'{w = }')
#!/usr/bin/env python3
# TASK: repeat part1.py and part2.py above, but…
# - Normalise for case (“case folding”) so that “To” and “to” are considered same word.
# - Remove common “stop words” like “it” and “the”, as they will drown out more interesting words.
# - Strip punctuation, so that “it” and “it.” are considered the same word.
# - (BONUS: perform “conjugation folding” so that “know” and “knowing” are considered the same word.)
# Sample text for the exercise. The starter loop below only splits it on
# whitespace, so the blank first and last lines inside the triple quotes
# yield no tokens.
text = '''
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
Stop, what are you talking about? Ha
Get my pretty name out of your mouth
We are not the same with or without
Don't talk 'bout me like how you might know how I feel
Top of the world, but your world isn't real
Your world's an ideal
So go have fun
I really couldn't care less
And you can give 'em my best, but just know
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
'''
# Letter values grouped by point tier. The ×N suffixes are not represented in
# `tile_scores` (presumably tile counts; verify against the exercise text).
# 1 pt: E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
# 2 pts: D×4, G×3
# 3 pts: B×2, C×2, M×2, P×2
# 4 pts: F×2, H×2, V×2, W×2, Y×2
# 5 pts: K×1
# 8 pts: J×1, X×1
# Letter -> point value, keyed by lower-case letter.
tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}
# Starter code: the tokens are echoed raw; no case folding, stop-word removal
# or punctuation stripping has been applied yet.
words = text.split()
for w in words:
    # `{w = }` prints the expression text followed by the repr of its value.
    print(f'{w = }')
# instructor solution, AFTER refactoring
from collections import Counter
# Input corpus for both reports; it is tokenised with `text.split()`, so the
# blank first and last lines inside the triple quotes yield no tokens.
text = '''
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
Stop, what are you talking about? Ha
Get my pretty name out of your mouth
We are not the same with or without
Don't talk 'bout me like how you might know how I feel
Top of the world, but your world isn't real
Your world's an ideal
So go have fun
I really couldn't care less
And you can give 'em my best, but just know
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
I'm not your friend or anything,
You think that you're the man
I think, therefore, I am
'''
def strip_punctuation(word):
    """Keep only the alphabetic characters of `word`, in their original order."""
    return ''.join(filter(str.isalpha, word))
# Stop words, written as whitespace-separated text so the list is easy to
# edit. `str.split()` with no argument already discards all surrounding
# whitespace, so each piece is a bare word.
STOP_WORDS = set('''
the and not my no is i
but what by that for you youre
or im
'''.split())
# 1. data normalisation
# Each step wraps the previous one in a lazy generator; nothing is evaluated
# until `Counter` consumes the final pipeline.
words = text.split()
words = (w.lower() for w in words)
words = (strip_punctuation(w) for w in words)
# A token made only of non-letters normalises to ''. Drop it together with
# the stop words so the empty string is never counted as a word.
words = (w for w in words if w and w not in STOP_WORDS)
# 2. data processing
freq = Counter(words)
# 3a. data reporting
print('Report I'.center(50, '-'))
for word, count in freq.most_common(3):
    print(f'The word "{word}" appears {count} times.')
# # reconstruct `tile_scores` from raw input text
# :! sed -r s'/^\# //' | tee >( python ) >( sed -r 's/^/\# /') &> /dev/null
# tile_scores_text = '''
# 1 pt: E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
# 2 pts: D×4, G×3
# 3 pts: B×2, C×2, M×2, P×2
# 4 pts: F×2, H×2, V×2, W×2, Y×2
# 5 pts: K×1
# 8 pts: J×1, X×1
# '''
# tile_scores = {}
# for line in tile_scores_text.strip().splitlines():
# score, tiles = line.split(':')
# score, *_ = score.strip().split()
# score = int(score)
# tiles = tiles.strip().split(',')
# for t in (t.strip().split('×')[0].lower() for t in tiles):
# tile_scores[t] = score
# print(f'tile_scores = {dict(sorted(tile_scores.items()))}')
# Letter -> point value, keyed by lower-case letter. 'q' and 'z' have no
# entry.
tile_scores = {'a': 1, 'b': 3, 'c': 3, 'd': 2, 'e': 1, 'f': 4, 'g': 2, 'h': 4,
               'i': 1, 'j': 8, 'k': 5, 'l': 1, 'm': 3, 'n': 1, 'o': 1, 'p': 3,
               'r': 1, 's': 1, 't': 1, 'u': 1, 'v': 4, 'w': 4, 'x': 8, 'y': 4}


def score(word):
    """Return the sum of the tile values of the characters in `word`.

    Characters without an entry in `tile_scores` contribute 0. The explicit
    default matters: a bare `.get(c)` yields None for such characters, which
    `sum` cannot add.
    """
    return sum(tile_scores.get(c, 0) for c in word)
def by_score(entry):
    """Sort key for a `(word, count)` pair: the word's tile score times its count."""
    word, occurrences = entry
    per_occurrence = score(word)
    return occurrences * per_occurrence
# 3b. data reporting
print('Report II'.center(50, '-'))
# Rank every (word, count) pair by aggregate score, best first, then report
# the leading three.
ranked = sorted(freq.items(), key=by_score, reverse=True)
for word, count in ranked[:3]:
    print(f'The word "{word}" appears {count} times with a score of {count * score(word)}.')
@dutc
Copy link
Author

dutc commented Mar 3, 2021

For the full write-up and discussion, sign up for the “Python Expert” newsletter!

bit.ly/expert-python

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment