dutc · March 8, 2021 15:33 · dutc · Mar 3, 2021
diff --git a/original-solution.py b/original-solution.py
 # original attendee solution, PRIOR to refactoring

 text = '''
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am

 Stop, what are you talking about? Ha
 Get my pretty name out of your mouth
 We are not the same with or without
 Don't talk 'bout me like how you might know how I feel
 Top of the world, but your world isn't real
 Your world's an ideal

 So go have fun
 I really couldn't care less
 And you can give 'em my best, but just know

 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 '''

 def strip_punctuation(word):
    """Return `word` with every non-alphabetic character removed.

    Only characters for which ``str.isalpha()`` is true are kept, so
    apostrophes, commas and digits are dropped ("I'm," becomes "Im").
    """
    # str.join assembles the result in one pass instead of repeated `+=`.
    return ''.join(c for c in word if c.isalpha())

 # Words too common to be interesting.  A set makes each membership test
 # O(1) instead of a scan through a list.
 STOP_WORDS = {
    'the', 'and', 'not', 'my', 'no', 'is', 'i',
    'but', 'what', 'by', 'that', 'for', 'you', "youre",
    'or',
 }

 # Normalise every whitespace-separated token (lower-case, letters only),
 # then drop stop words and any token left empty by the punctuation strip.
 words = text.split()
 normalised = (strip_punctuation(w.lower()) for w in words)
 cleaned_words = [w for w in normalised if w and w not in STOP_WORDS]

 # Tally occurrences in a dict (O(1) per word) rather than scanning a list
 # twice per word.  Dicts preserve insertion order, so `seen` and `freq`
 # are still parallel lists in first-appearance order.
 counts = {}
 for w in cleaned_words:
    counts[w] = counts.get(w, 0) + 1
 seen = list(counts)
 freq = list(counts.values())

 # Rank by count, highest first, and keep the top three.  sorted() is
 # stable, so tied words stay in first-appearance order.  A full sort is
 # used because overwriting one slot of a running top-three would discard
 # the displaced word instead of demoting it.
 ranked = sorted(zip(seen, freq), key=lambda pair: pair[1], reverse=True)[:3]
 most_common_words = [w for w, _ in ranked]
 most_common_counts = [c for _, c in ranked]
 print('Report I'.center(50, '-'))
 for word, count in zip(most_common_words, most_common_counts):
    print(f'The word "{word}" appears {count} times.')

 # Scrabble-style letter values, keyed by lower-case letter.
 tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}

 def score(word):
    """Return the summed tile value of `word`.

    Characters that have no entry in `tile_scores` add nothing.
    """
    return sum(tile_scores.get(letter, 0) for letter in word)

 # Rank words by aggregate score (occurrences × tile value), highest first,
 # and keep the top three.  The stable full sort keeps ties in
 # first-appearance order and never drops a displaced entry, which
 # overwriting one slot of a running top-three would.
 scored = sorted(
    ((w, c, c * score(w)) for w, c in zip(seen, freq)),
    key=lambda entry: entry[2],
    reverse=True,
 )[:3]
 highest_scoring_words = [w for w, _, _ in scored]
 highest_scoring_counts = [c for _, c, _ in scored]
 highest_scoring_scores = [s for _, _, s in scored]
 print('Report II'.center(50, '-'))
 # Loop variable is `total`, not `score`, so the score() function above is
 # not rebound to an int at module level.
 for word, count, total in scored:
    print(f'The word "{word}" appears {count} times with a score of {total}.')
diff --git a/problem1.py b/problem1.py
 #!/usr/bin/env python3

 # TASK: compute the word frequencies in the below text
 #       and report on the three most common words that appear
 text = '''
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am

 Stop, what are you talking about? Ha
 Get my pretty name out of your mouth
 We are not the same with or without
 Don't talk 'bout me like how you might know how I feel
 Top of the world, but your world isn't real
 Your world's an ideal

 So go have fun
 I really couldn't care less
 And you can give 'em my best, but just know

 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 '''

 # Starter code: split the text on runs of whitespace and echo each token.
 # The `=` specifier makes every printed line read  w = <repr of token>.
 words = text.split()
 for w in words:
    print(f'{w = }')
diff --git a/problem2.py b/problem2.py
 #!/usr/bin/env python3

 # TASK: use the below Scrabble-style scoring dictionary
 #       and compute the three highest aggregate scores
 #       (i.e., find the max by frequency × word-score)

 text = '''
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am

 Stop, what are you talking about? Ha
 Get my pretty name out of your mouth
 We are not the same with or without
 Don't talk 'bout me like how you might know how I feel
 Top of the world, but your world isn't real
 Your world's an ideal

 So go have fun
 I really couldn't care less
 And you can give 'em my best, but just know

 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 '''

 #  1 pt:  E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
 #  2 pts: D×4, G×3
 #  3 pts: B×2, C×2, M×2, P×2
 #  4 pts: F×2, H×2, V×2, W×2, Y×2
 #  5 pts: K×1
 #  8 pts: J×1, X×1
 # Letter -> point value, keyed by lower-case letter (matches the table above).
 tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}


 # Starter code: split the text on runs of whitespace and echo each token.
 # The `=` specifier makes every printed line read  w = <repr of token>.
 words = text.split()
 for w in words:
    print(f'{w = }')
diff --git a/problem3.py b/problem3.py
 #!/usr/bin/env python3

 # TASK: repeat problem1.py and problem2.py above, but…
 #  - Normalise for case (“case folding”) so that “To” and “to” are considered same word.
 #  - Remove common “stop words” like “it” and “the”, as they will drown out more interesting words.
 #  - Strip punctuation, so that “it” and “it.” are considered the same word.
 #  - (BONUS: perform “conjugation folding” so that “know” and “knowing” are considered the same word.)

 text = '''
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am

 Stop, what are you talking about? Ha
 Get my pretty name out of your mouth
 We are not the same with or without
 Don't talk 'bout me like how you might know how I feel
 Top of the world, but your world isn't real
 Your world's an ideal

 So go have fun
 I really couldn't care less
 And you can give 'em my best, but just know

 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 '''

 #  1 pt:  E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
 #  2 pts: D×4, G×3
 #  3 pts: B×2, C×2, M×2, P×2
 #  4 pts: F×2, H×2, V×2, W×2, Y×2
 #  5 pts: K×1
 #  8 pts: J×1, X×1
 # Letter -> point value, keyed by lower-case letter (matches the table above).
 tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}


 # Starter code: split the text on runs of whitespace and echo each token.
 # The `=` specifier makes every printed line read  w = <repr of token>.
 words = text.split()
 for w in words:
    print(f'{w = }')
diff --git a/revised-solution.py b/revised-solution.py
 # instructor solution, AFTER refactoring

 from collections import Counter

 # Input text for the word-frequency and tile-score reports.
 text = '''
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am

 Stop, what are you talking about? Ha
 Get my pretty name out of your mouth
 We are not the same with or without
 Don't talk 'bout me like how you might know how I feel
 Top of the world, but your world isn't real
 Your world's an ideal

 So go have fun
 I really couldn't care less
 And you can give 'em my best, but just know

 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 I'm not your friend or anything, 
 You think that you're the man
 I think, therefore, I am
 '''

 def strip_punctuation(word):
    """Drop every character of `word` that is not a letter (per str.isalpha)."""
    letters = filter(str.isalpha, word)
    return ''.join(letters)

 # Stop words, kept as a whitespace-separated table so it is easy to edit;
 # str.split() already yields tokens with no surrounding whitespace.
 STOP_WORDS = set('''
    the and not my no is i
    but what by that for you youre
    or im
 '''.split())

 # 1. data normalisation
 #    A lazy pipeline: lower-case each token, keep letters only, then drop
 #    stop words.  A token made solely of punctuation normalises to '' and
 #    is dropped too, so it is never counted as a word.
 words = text.split()
 words = (w.lower() for w in words)
 words = (strip_punctuation(w) for w in words)
 words = (w for w in words if w and w not in STOP_WORDS)

 # 2. data processing
 #    Counter consumes the generator and tallies each remaining word.
 freq = Counter(words)

 # 3a. data reporting -- the three most frequent words, most frequent first
 print('Report I'.center(50, '-'))
 top_three = freq.most_common(3)
 for word, count in top_three:
    print(f'The word "{word}" appears {count} times.')

 # # reconstruct `tile_scores` from raw input text
 # :! sed -r s'/^\# //' | tee >( python ) >( sed -r 's/^/\# /') &> /dev/null
 # tile_scores_text = '''
 # 1 pt:  E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
 # 2 pts: D×4, G×3
 # 3 pts: B×2, C×2, M×2, P×2
 # 4 pts: F×2, H×2, V×2, W×2, Y×2
 # 5 pts: K×1
 # 8 pts: J×1, X×1
 # '''
 # tile_scores = {}
 # for line in tile_scores_text.strip().splitlines():
 #     score, tiles = line.split(':')
 #     score, *_ = score.strip().split()
 #     score = int(score)
 #     tiles = tiles.strip().split(',')
 #     for t in (t.strip().split('×')[0].lower() for t in tiles):
 #         tile_scores[t] = score
 # print(f'tile_scores = {dict(sorted(tile_scores.items()))}')
 # Scrabble-style letter values, keyed by lower-case letter.  Not every
 # letter has an entry ('q' and 'z' are absent).
 tile_scores = {'a': 1, 'b': 3, 'c': 3, 'd': 2, 'e': 1, 'f': 4, 'g': 2, 'h': 4,
               'i': 1, 'j': 8, 'k': 5, 'l': 1, 'm': 3, 'n': 1, 'o': 1, 'p': 3,
               'r': 1, 's': 1, 't': 1, 'u': 1, 'v': 4, 'w': 4, 'x': 8, 'y': 4}

 def score(word):
    """Return the total tile value of `word`.

    A character with no entry in `tile_scores` contributes 0; without the
    default, `dict.get` would yield None and `sum` would raise TypeError.
    """
    return sum(tile_scores.get(c, 0) for c in word)

 def by_score(entry):
    """Sort key for a (word, count) pair: the count times the word's score."""
    token, occurrences = entry
    return occurrences * score(token)

 # 3b. data reporting -- the three words with the largest count × score
 print('Report II'.center(50, '-'))
 ranked = sorted(freq.items(), key=by_score, reverse=True)
 for word, count in ranked[:3]:
    print(f'The word "{word}" appears {count} times with a score of {count * score(word)}.')
	# original attendee solution, PRIOR to refactoring

	text = '''
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am

	Stop, what are you talking about? Ha
	Get my pretty name out of your mouth
	We are not the same with or without
	Don't talk 'bout me like how you might know how I feel
	Top of the world, but your world isn't real
	Your world's an ideal

	So go have fun
	I really couldn't care less
	And you can give 'em my best, but just know

	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	'''

	def strip_punctuation(word):
	    """Return `word` with every non-alphabetic character removed.

	    Only characters for which ``str.isalpha()`` is true are kept, so
	    apostrophes, commas and digits are dropped ("I'm," becomes "Im").
	    """
	    # str.join assembles the result in one pass instead of repeated `+=`.
	    return ''.join(c for c in word if c.isalpha())

	# Words too common to be interesting.  A set makes each membership test
	# O(1) instead of a scan through a list.
	STOP_WORDS = {
	    'the', 'and', 'not', 'my', 'no', 'is', 'i',
	    'but', 'what', 'by', 'that', 'for', 'you', "youre",
	    'or',
	}

	# Normalise every whitespace-separated token (lower-case, letters only),
	# then drop stop words and any token left empty by the punctuation strip.
	words = text.split()
	normalised = (strip_punctuation(w.lower()) for w in words)
	cleaned_words = [w for w in normalised if w and w not in STOP_WORDS]

	# Tally occurrences in a dict (O(1) per word) rather than scanning a list
	# twice per word.  Dicts preserve insertion order, so `seen` and `freq`
	# are still parallel lists in first-appearance order.
	counts = {}
	for w in cleaned_words:
	    counts[w] = counts.get(w, 0) + 1
	seen = list(counts)
	freq = list(counts.values())

	# Rank by count, highest first, and keep the top three.  sorted() is
	# stable, so tied words stay in first-appearance order.  A full sort is
	# used because overwriting one slot of a running top-three would discard
	# the displaced word instead of demoting it.
	ranked = sorted(zip(seen, freq), key=lambda pair: pair[1], reverse=True)[:3]
	most_common_words = [w for w, _ in ranked]
	most_common_counts = [c for _, c in ranked]
	print('Report I'.center(50, '-'))
	for word, count in zip(most_common_words, most_common_counts):
	    print(f'The word "{word}" appears {count} times.')

	# Scrabble-style letter values, keyed by lower-case letter.
	tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
	               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
	               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}

	def score(word):
	    """Return the summed tile value of `word`.

	    Characters that have no entry in `tile_scores` add nothing.
	    """
	    return sum(tile_scores.get(c, 0) for c in word)

	# Rank words by aggregate score (occurrences × tile value), highest first,
	# and keep the top three.  The stable full sort keeps ties in
	# first-appearance order and never drops a displaced entry, which
	# overwriting one slot of a running top-three would.
	scored = sorted(
	    ((w, c, c * score(w)) for w, c in zip(seen, freq)),
	    key=lambda entry: entry[2],
	    reverse=True,
	)[:3]
	highest_scoring_words = [w for w, _, _ in scored]
	highest_scoring_counts = [c for _, c, _ in scored]
	highest_scoring_scores = [s for _, _, s in scored]
	print('Report II'.center(50, '-'))
	# Loop variable is `total`, not `score`, so the score() function above is
	# not rebound to an int at module level.
	for word, count, total in scored:
	    print(f'The word "{word}" appears {count} times with a score of {total}.')
	#!/usr/bin/env python3

	# TASK: compute the word frequencies in the below text
	# and report on the three most common words that appear
	text = '''
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am

	Stop, what are you talking about? Ha
	Get my pretty name out of your mouth
	We are not the same with or without
	Don't talk 'bout me like how you might know how I feel
	Top of the world, but your world isn't real
	Your world's an ideal

	So go have fun
	I really couldn't care less
	And you can give 'em my best, but just know

	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	'''

	# Starter code: split the text on runs of whitespace and echo each token.
	# The `=` specifier makes every printed line read  w = <repr of token>.
	words = text.split()
	for w in words:
	    print(f'{w = }')
	#!/usr/bin/env python3

	# TASK: use the below Scrabble-style scoring dictionary
	# and compute the three highest aggregate scores
	# (i.e., find the max by frequency × word-score)

	text = '''
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am

	Stop, what are you talking about? Ha
	Get my pretty name out of your mouth
	We are not the same with or without
	Don't talk 'bout me like how you might know how I feel
	Top of the world, but your world isn't real
	Your world's an ideal

	So go have fun
	I really couldn't care less
	And you can give 'em my best, but just know

	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	'''

	# 1 pt: E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
	# 2 pts: D×4, G×3
	# 3 pts: B×2, C×2, M×2, P×2
	# 4 pts: F×2, H×2, V×2, W×2, Y×2
	# 5 pts: K×1
	# 8 pts: J×1, X×1
	# Letter -> point value, keyed by lower-case letter (matches the table above).
	tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
	               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
	               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}


	# Starter code: split the text on runs of whitespace and echo each token.
	# The `=` specifier makes every printed line read  w = <repr of token>.
	words = text.split()
	for w in words:
	    print(f'{w = }')
	#!/usr/bin/env python3

	# TASK: repeat problem1.py and problem2.py above, but…
	# - Normalise for case (“case folding”) so that “To” and “to” are considered same word.
	# - Remove common “stop words” like “it” and “the”, as they will drown out more interesting words.
	# - Strip punctuation, so that “it” and “it.” are considered the same word.
	# - (BONUS: perform “conjugation folding” so that “know” and “knowing” are considered the same word.)

	text = '''
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am

	Stop, what are you talking about? Ha
	Get my pretty name out of your mouth
	We are not the same with or without
	Don't talk 'bout me like how you might know how I feel
	Top of the world, but your world isn't real
	Your world's an ideal

	So go have fun
	I really couldn't care less
	And you can give 'em my best, but just know

	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	'''

	# 1 pt: E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
	# 2 pts: D×4, G×3
	# 3 pts: B×2, C×2, M×2, P×2
	# 4 pts: F×2, H×2, V×2, W×2, Y×2
	# 5 pts: K×1
	# 8 pts: J×1, X×1
	# Letter -> point value, keyed by lower-case letter (matches the table above).
	tile_scores = {'e': 1, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'r': 1, 't': 1, 'l': 1,
	               's': 1, 'u': 1, 'd': 2, 'g': 2, 'b': 3, 'c': 3, 'm': 3, 'p': 3,
	               'f': 4, 'h': 4, 'v': 4, 'w': 4, 'y': 4, 'k': 5, 'j': 8, 'x': 8,}


	# Starter code: split the text on runs of whitespace and echo each token.
	# The `=` specifier makes every printed line read  w = <repr of token>.
	words = text.split()
	for w in words:
	    print(f'{w = }')
	# instructor solution, AFTER refactoring

	from collections import Counter

	# Input text for the word-frequency and tile-score reports.
	text = '''
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am

	Stop, what are you talking about? Ha
	Get my pretty name out of your mouth
	We are not the same with or without
	Don't talk 'bout me like how you might know how I feel
	Top of the world, but your world isn't real
	Your world's an ideal

	So go have fun
	I really couldn't care less
	And you can give 'em my best, but just know

	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	I'm not your friend or anything,
	You think that you're the man
	I think, therefore, I am
	'''

	def strip_punctuation(word):
	    """Drop every character of `word` that is not a letter (per str.isalpha)."""
	    return ''.join(c for c in word if c.isalpha())

	# Stop words, kept as a whitespace-separated table so it is easy to edit;
	# str.split() already yields tokens with no surrounding whitespace.
	STOP_WORDS = set('''
	the and not my no is i
	but what by that for you youre
	or im
	'''.split())

	# 1. data normalisation
	#    A lazy pipeline: lower-case each token, keep letters only, then drop
	#    stop words.  A token made solely of punctuation normalises to '' and
	#    is dropped too, so it is never counted as a word.
	words = text.split()
	words = (w.lower() for w in words)
	words = (strip_punctuation(w) for w in words)
	words = (w for w in words if w and w not in STOP_WORDS)

	# 2. data processing
	#    Counter consumes the generator and tallies each remaining word.
	freq = Counter(words)

	# 3a. data reporting -- the three most frequent words, most frequent first
	print('Report I'.center(50, '-'))
	for word, count in freq.most_common(3):
	    print(f'The word "{word}" appears {count} times.')

	# # reconstruct `tile_scores` from raw input text
	# :! sed -r s'/^\# //' | tee >( python ) >( sed -r 's/^/\# /') &> /dev/null
	# tile_scores_text = '''
	# 1 pt: E×12, A×9, I×9, O×8, N×6, R×6, T×6, L×4, S×4, U×4
	# 2 pts: D×4, G×3
	# 3 pts: B×2, C×2, M×2, P×2
	# 4 pts: F×2, H×2, V×2, W×2, Y×2
	# 5 pts: K×1
	# 8 pts: J×1, X×1
	# '''
	# tile_scores = {}
	# for line in tile_scores_text.strip().splitlines():
	# score, tiles = line.split(':')
	# score, *_ = score.strip().split()
	# score = int(score)
	# tiles = tiles.strip().split(',')
	# for t in (t.strip().split('×')[0].lower() for t in tiles):
	# tile_scores[t] = score
	# print(f'tile_scores = {dict(sorted(tile_scores.items()))}')
	# Scrabble-style letter values, keyed by lower-case letter.  Not every
	# letter has an entry ('q' and 'z' are absent).
	tile_scores = {'a': 1, 'b': 3, 'c': 3, 'd': 2, 'e': 1, 'f': 4, 'g': 2, 'h': 4,
	               'i': 1, 'j': 8, 'k': 5, 'l': 1, 'm': 3, 'n': 1, 'o': 1, 'p': 3,
	               'r': 1, 's': 1, 't': 1, 'u': 1, 'v': 4, 'w': 4, 'x': 8, 'y': 4}

	def score(word):
	    """Return the total tile value of `word`.

	    A character with no entry in `tile_scores` contributes 0; without the
	    default, `dict.get` would yield None and `sum` would raise TypeError.
	    """
	    return sum(tile_scores.get(c, 0) for c in word)

	def by_score(entry):
	    """Sort key for a (word, count) pair: the count times the word's score."""
	    word, count = entry
	    return count * score(word)

	# 3b. data reporting -- the three words with the largest count × score
	print('Report II'.center(50, '-'))
	for word, count in sorted(freq.items(), key=by_score, reverse=True)[:3]:
	    print(f'The word "{word}" appears {count} times with a score of {count * score(word)}.')