bradmontgomery · August 15, 2022 19:11 · salihkaragoz · Feb 2, 2018
diff --git a/count_words.py b/count_words.py
 """
 Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by
 L. Frank Baum.

 Available in (mostly) plain text at:
 https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt

 Note: This code also counts the words in the header, so it's not a *realistic*
 application, but more of a demonstration of Python's Counter.

 Running this code should give you something like this:

    $ python count_words.py

    The Top 10 words
    the: 2808
    and: 1630
    to: 1143
    of: 869
    a: 819
    I: 597
    was: 502
    you: 486
    in: 476
    he: 408

 """
 import re
 from collections import Counter
 from html.parser import HTMLParser

 import requests


 class PreParser(HTMLParser):
    """
    HTML parser that collects the text found inside ``<pre>`` elements.

    Feed a document with :meth:`feed`, then read the collected text from
    ``result``. ``result`` stays ``None`` when no text inside a ``<pre>``
    element was seen. See more in the html.parser docs:
    https://docs.python.org/3/library/html.parser.html

    """

    capture = False  # True while the parser is inside a <pre> element.
    result = None  # Text captured so far; None until the first chunk arrives.

    def handle_starttag(self, tag, attrs):
        """Start capturing when a ``<pre>`` tag opens."""
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        """Stop capturing when a ``</pre>`` tag closes."""
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        """Append *data* to ``result`` while inside a ``<pre>`` element."""
        if not self.capture:
            return
        # HTMLParser may deliver one run of text as several chunks (nested
        # tags, a stray "<", or input fed in pieces). Accumulate instead of
        # overwriting, otherwise only the last chunk would survive.
        if self.result is None:
            self.result = data
        else:
            self.result += data


 def main(n=10):
    """
    Fetch "The Wonderful Wizard of Oz" and print its ``n`` most common words.

    Downloads the archive.org page, extracts the text captured by
    ``PreParser``, removes everything except ASCII letters and spaces, and
    prints the total word count followed by the top ``n`` words. Prints a
    message and returns early if the download fails or no text was captured.
    """
    # Create a parser to parse the HTML document.
    parser = PreParser()

    # Now fetch some content & do a little cleanup
    url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
    resp = requests.get(url)
    if resp.status_code != 200:
        # The response body is `resp.text`; `resp.txt` does not exist and
        # would raise AttributeError instead of reporting the failure.
        print(f"Failed to fetch document: {resp.status_code}\n{resp.text}")
        return

    # Do some content cleaning...
    parser.feed(resp.text)
    if parser.result is None:
        # Nothing was captured, so there is nothing to count.
        print("No <pre> text found in the document.")
        return

    # Raw strings keep "\s" a regex escape, not an invalid string escape.
    content = re.sub(r"\s+", " ", parser.result)
    content = re.sub(r"[^A-Za-z ]+", "", content)
    words = content.split()
    print(f"Found {len(words):,} words!")

    # Start counting
    word_count = Counter(words)

    # The Top-N words
    print(f"The Top {n} words")
    for word, count in word_count.most_common(n):
        print(f"{word}: {count}")


 if __name__ == "__main__":
    main()
diff --git a/counter.py b/counter.py
 """
 This python file is exported from a Jupyter Notebook used
 during the MEMpy presentation on 2022-08-15.

 """
 #!/usr/bin/env python
 # coding: utf-8

 # # collections.Counter
 #
 # It's good stuff!
 #
 # Super-powerful utilities that do _very common_ operations.
 #

 # In[1]:


 from collections import Counter


 # ## what is `Counter`?
 #
 # - It's like a dictionary (keys & values)
 # - Keys -> The things you want to count
 # - Values -> The number of times the key appears in a _collection_ of stuff.

 # In[2]:


 # Before using Counter: a plain dict needs an explicit membership check
 # before a key can be incremented.
 c = {}
 if "widgets" in c:
    c["widgets"] += 1
 else:
    c["widgets"] = 1


 # OR, if you KNOW all of your keys, pre-seed them with 0...
 c = {
    "widgets": 0,
 }
 c["widgets"] += 1


 # Counter lets you start counting without knowing keys in advance
 c = Counter()
 c["widgets"] += 1
 c  # bare expression: displayed as the cell output in the notebook


 # # Counter behavior
 #
 # - Most of the dict methods are available.
 # - `.keys()` & `.values()`
 # - `in` operations

 # In[3]:


 c.keys()


 # In[4]:


 c.values()


 # In[5]:


 # Update will create new keys or adjust
 # counts for existing keys (counts are added, not replaced)
 c.update({"foo": 1})
 c


 # In[6]:


 c.update({"foo": 1})  # calling it a 2nd time ... "foo" is now 2
 c


 # In[7]:


 # you can create a Counter based on keyword arguments
 scores = Counter(grizzlies=134, warriors=95)
 scores


 # ## Where things start to get interesting
 #
 # You can create a Counter object from _any_ iterable!

 # In[8]:


 # Each distinct item becomes a key; its value is how often it appeared.
 c = Counter(["moe", "larry", "larry", "curly", "curly"])
 c


 # In[9]:


 # you can also go "backwards" ... get a "list" of elements based on their counts
 # (each key is repeated as many times as its count)
 list(c.elements())


 # ## most common occurrences?
 #
 # One of the best use-cases for a Counter!
 #

 # In[10]:


 # Most common letters in a string?
 # A string is an iterable of characters, so this counts letters;
 # most_common(3) gives the three highest (letter, count) pairs.
 word = "supercalifragilisticexpialidocious"
 Counter(word).most_common(3)


 # ## What are the 10 most common words in "The Wonderful Wizard of Oz"?

 # In[11]:


 import requests
 import re

 from html.parser import HTMLParser
 from collections import Counter


 # blergh, write a little HTML parser:
 # https://docs.python.org/3/library/html.parser.html
 class Parser(HTMLParser):
    """
    HTML parser that collects the text found inside ``<pre>`` elements.

    After ``feed()``, the collected text is available as ``result``;
    ``result`` stays ``None`` when no text inside ``<pre>`` was seen.
    """

    capture = False  # True while the parser is inside a <pre> element.
    result = None  # <--- we'll store some text here (None until text arrives).

    def handle_starttag(self, tag, attrs):
        """Start capturing when a ``<pre>`` tag opens."""
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        """Stop capturing when a ``</pre>`` tag closes."""
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        """Append *data* to ``result`` while inside a ``<pre>`` element."""
        if not self.capture:
            return
        # HTMLParser may deliver one run of text as several chunks (nested
        # tags, a stray "<", or input fed in pieces). Accumulate instead of
        # overwriting, otherwise only the last chunk would survive.
        if self.result is None:
            self.result = data
        else:
            self.result += data


 parser = Parser()


 # In[12]:


 # Now fetch some content & do a little cleanup
 url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
 resp = requests.get(url)
 # Fail loudly on an HTTP error instead of counting words in an error page.
 resp.raise_for_status()
 parser.feed(resp.text)

 # Raw strings keep "\s" a regex escape rather than an invalid string escape.
 # `or ""` covers a page with no <pre> text, where parser.result is still None.
 content = re.sub(r"\s+", " ", parser.result or "")
 # Drop everything except ASCII letters and spaces.
 content = re.sub(r"[^A-Za-z ]+", "", content)
 words = content.split()
 print(f"{len(words):,} words!")


 # In[13]:


 # Start counting (`words` is the list built by the cleanup cell above)
 word_count = Counter(words)
 word_count.most_common(10)  # the ten highest (word, count) pairs


 # ## ok that's neat, but....
 #
 # How can this help me build that RPG/MMO I've always wanted to build?

 # # Answer: Let's go shopping!

 # In[14]:


 # Set up your purse!
 purse = Counter(gold=1000, silver=500, copper=100)
 purse


 # In[15]:


 # Create some items in the shop (plain dicts mapping coin type -> price)
 shield = {"gold": 25}
 sword = {"gold": 100, "silver": 50}
 tunic = {"silver": 10, "copper": 50}


 # In[16]:


 # Let's make some purchases: subtract() lowers the purse's counts in place
 purse.subtract(shield)
 purse


 # In[17]:


 # Buy the sword.
 purse.subtract(sword)
 purse


 # In[18]:


 # Get the tunic too (purse is now gold=875, silver=440, copper=50)
 purse.subtract(tunic)
 purse


 # In[19]:


 # Buy a castle! It costs more than the purse holds.
 castle = {"gold": 50_000, "silver": 10_000, "copper": 350}
 purse.subtract(castle)
 purse  # whoops: every count is now negative


 # In[20]:


 # New in 3.10
 purse.total()  # -> Should sum all the values.


 # In[ ]:


 # or ... the same sum for Python versions without Counter.total()
 debt = sum(purse.values())
 print(f"We owe {debt:,}!")  # debt is negative at this point


 # In[ ]:


 purse.clear()  # reset! (removes every key)
 purse


 # ## Resources
 #
 # - Python Collections: https://docs.python.org/3/library/collections.html
 # - HTML Parser: https://docs.python.org/3/library/html.parser.html
 # - The 2013 version of this talk: https://speakerdeck.com/bkmontgomery/pythons-counter-collection?slide=40
 # - Sample Code: https://gist.github.com/bradmontgomery/4717521
 #
 # ### Other collections goodies!
 #
 # - ChainMap
 # - deque
 # - namedtuple
 # - defaultdict
 # - OrderedDict
 # - UserDict, UserList, UserString

 # # Thank you!
 #
 # Questions?
diff --git a/requirements.txt b/requirements.txt
 black==22.6.0
 certifi==2022.6.15
 isort==5.10.1
 requests==2.28.1
	"""
	Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by
	L. Frank Baum.

	Available in (mostly) plain text at:
	https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt

	Note: This code also counts the words in the header, so it's not a realistic
	application, but more of a demonstration of Python's Counter.

	Running this code should give you something like this:

	$ python count_words.py

	The Top 10 words
	the: 2808
	and: 1630
	to: 1143
	of: 869
	a: 819
	I: 597
	was: 502
	you: 486
	in: 476
	he: 408

	"""
	import re
	from collections import Counter
	from html.parser import HTMLParser

	import requests


	class PreParser(HTMLParser):
	    """
	    HTML parser that collects the text found inside ``<pre>`` elements.

	    Feed a document with :meth:`feed`, then read the collected text from
	    ``result``. ``result`` stays ``None`` when no text inside a ``<pre>``
	    element was seen. See more in the html.parser docs:
	    https://docs.python.org/3/library/html.parser.html

	    """

	    capture = False  # True while the parser is inside a <pre> element.
	    result = None  # Text captured so far; None until the first chunk arrives.

	    def handle_starttag(self, tag, attrs):
	        """Start capturing when a ``<pre>`` tag opens."""
	        if tag == "pre":
	            self.capture = True

	    def handle_endtag(self, tag):
	        """Stop capturing when a ``</pre>`` tag closes."""
	        if tag == "pre":
	            self.capture = False

	    def handle_data(self, data):
	        """Append *data* to ``result`` while inside a ``<pre>`` element."""
	        if not self.capture:
	            return
	        # HTMLParser may deliver one run of text as several chunks (nested
	        # tags, a stray "<", or input fed in pieces). Accumulate instead of
	        # overwriting, otherwise only the last chunk would survive.
	        if self.result is None:
	            self.result = data
	        else:
	            self.result += data


	def main(n=10):
	    """
	    Fetch "The Wonderful Wizard of Oz" and print its ``n`` most common words.

	    Downloads the archive.org page, extracts the text captured by
	    ``PreParser``, removes everything except ASCII letters and spaces, and
	    prints the total word count followed by the top ``n`` words. Prints a
	    message and returns early if the download fails or no text was captured.
	    """
	    # Create a parser to parse the HTML document.
	    parser = PreParser()

	    # Now fetch some content & do a little cleanup
	    url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
	    resp = requests.get(url)
	    if resp.status_code != 200:
	        # The response body is `resp.text`; `resp.txt` does not exist and
	        # would raise AttributeError instead of reporting the failure.
	        print(f"Failed to fetch document: {resp.status_code}\n{resp.text}")
	        return

	    # Do some content cleaning...
	    parser.feed(resp.text)
	    if parser.result is None:
	        # Nothing was captured, so there is nothing to count.
	        print("No <pre> text found in the document.")
	        return

	    # Raw strings keep "\s" a regex escape, not an invalid string escape.
	    content = re.sub(r"\s+", " ", parser.result)
	    content = re.sub(r"[^A-Za-z ]+", "", content)
	    words = content.split()
	    print(f"Found {len(words):,} words!")

	    # Start counting
	    word_count = Counter(words)

	    # The Top-N words
	    print(f"The Top {n} words")
	    for word, count in word_count.most_common(n):
	        print(f"{word}: {count}")


	if __name__ == "__main__":
	    main()
	"""
	This python file is exported from a Jupyter Notebook used
	during the MEMpy presentation on 2022-08-15.

	"""
	#!/usr/bin/env python
	# coding: utf-8

	# # collections.Counter
	#
	# It's good stuff!
	#
	# Super-powerful utilities that do _very common_ operations.
	#

	# In[1]:


	from collections import Counter


	# ## what is `Counter`?
	#
	# - It's like a dictionary (keys & values)
	# - Keys -> The things you want to count
	# - Values -> The number of times the key appears in a _collection_ of stuff.

	# In[2]:


	# Before using Counter: a plain dict needs an explicit membership check
	# before a key can be incremented.
	c = {}
	if "widgets" in c:
	    c["widgets"] += 1
	else:
	    c["widgets"] = 1


	# OR, if you KNOW all of your keys, pre-seed them with 0...
	c = {
	    "widgets": 0,
	}
	c["widgets"] += 1


	# Counter lets you start counting without knowing keys in advance
	c = Counter()
	c["widgets"] += 1
	c  # bare expression: displayed as the cell output in the notebook


	# # Counter behavior
	#
	# - Most of the dict methods are available.
	# - `.keys()` & `.values()`
	# - `in` operations

	# In[3]:


	c.keys()


	# In[4]:


	c.values()


	# In[5]:


	# Update will create new keys or adjust
	# counts for existing keys (counts are added, not replaced)
	c.update({"foo": 1})
	c


	# In[6]:


	c.update({"foo": 1}) # calling it a 2nd time ... "foo" is now 2
	c


	# In[7]:


	# you can create a Counter based on keyword arguments
	scores = Counter(grizzlies=134, warriors=95)
	scores


	# ## Where things start to get interesting
	#
	# You can create a Counter object from _any_ iterable!

	# In[8]:


	# Each distinct item becomes a key; its value is how often it appeared.
	c = Counter(["moe", "larry", "larry", "curly", "curly"])
	c


	# In[9]:


	# you can also go "backwards" ... get a "list" of elements based on their counts
	# (each key is repeated as many times as its count)
	list(c.elements())


	# ## most common occurrences?
	#
	# One of the best use-cases for a Counter!
	#

	# In[10]:


	# Most common letters in a string?
	# A string is an iterable of characters, so this counts letters;
	# most_common(3) gives the three highest (letter, count) pairs.
	word = "supercalifragilisticexpialidocious"
	Counter(word).most_common(3)


	# ## What are the 10 most common words in "The Wonderful Wizard of Oz"?

	# In[11]:


	import requests
	import re

	from html.parser import HTMLParser
	from collections import Counter


	# blergh, write a little HTML parser:
	# https://docs.python.org/3/library/html.parser.html
	class Parser(HTMLParser):
	    """
	    HTML parser that collects the text found inside ``<pre>`` elements.

	    After ``feed()``, the collected text is available as ``result``;
	    ``result`` stays ``None`` when no text inside ``<pre>`` was seen.
	    """

	    capture = False  # True while the parser is inside a <pre> element.
	    result = None  # <--- we'll store some text here (None until text arrives).

	    def handle_starttag(self, tag, attrs):
	        """Start capturing when a ``<pre>`` tag opens."""
	        if tag == "pre":
	            self.capture = True

	    def handle_endtag(self, tag):
	        """Stop capturing when a ``</pre>`` tag closes."""
	        if tag == "pre":
	            self.capture = False

	    def handle_data(self, data):
	        """Append *data* to ``result`` while inside a ``<pre>`` element."""
	        if not self.capture:
	            return
	        # HTMLParser may deliver one run of text as several chunks (nested
	        # tags, a stray "<", or input fed in pieces). Accumulate instead of
	        # overwriting, otherwise only the last chunk would survive.
	        if self.result is None:
	            self.result = data
	        else:
	            self.result += data


	parser = Parser()


	# In[12]:


	# Now fetch some content & do a little cleanup
	url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
	resp = requests.get(url)
	# Fail loudly on an HTTP error instead of counting words in an error page.
	resp.raise_for_status()
	parser.feed(resp.text)

	# Raw strings keep "\s" a regex escape rather than an invalid string escape.
	# `or ""` covers a page with no <pre> text, where parser.result is still None.
	content = re.sub(r"\s+", " ", parser.result or "")
	# Drop everything except ASCII letters and spaces.
	content = re.sub(r"[^A-Za-z ]+", "", content)
	words = content.split()
	print(f"{len(words):,} words!")


	# In[13]:


	# Start counting (`words` is the list built by the cleanup cell above)
	word_count = Counter(words)
	word_count.most_common(10) # the ten highest (word, count) pairs


	# ## ok that's neat, but....
	#
	# How can this help me build that RPG/MMO I've always wanted to build?

	# # Answer: Let's go shopping!

	# In[14]:


	# Set up your purse!
	purse = Counter(gold=1000, silver=500, copper=100)
	purse


	# In[15]:


	# Create some items in the shop (plain dicts mapping coin type -> price)
	shield = {"gold": 25}
	sword = {"gold": 100, "silver": 50}
	tunic = {"silver": 10, "copper": 50}


	# In[16]:


	# Let's make some purchases: subtract() lowers the purse's counts in place
	purse.subtract(shield)
	purse


	# In[17]:


	# Buy the sword.
	purse.subtract(sword)
	purse


	# In[18]:


	# Get the tunic too (purse is now gold=875, silver=440, copper=50)
	purse.subtract(tunic)
	purse


	# In[19]:


	# Buy a castle! It costs more than the purse holds.
	castle = {"gold": 50_000, "silver": 10_000, "copper": 350}
	purse.subtract(castle)
	purse # whoops: every count is now negative


	# In[20]:


	# New in 3.10
	purse.total() # -> Should sum all the values.


	# In[ ]:


	# or ... the same sum for Python versions without Counter.total()
	debt = sum(purse.values())
	print(f"We owe {debt:,}!") # debt is negative at this point


	# In[ ]:


	purse.clear() # reset! (removes every key)
	purse


	# ## Resources
	#
	# - Python Collections: https://docs.python.org/3/library/collections.html
	# - HTML Parser: https://docs.python.org/3/library/html.parser.html
	# - The 2013 version of this talk: https://speakerdeck.com/bkmontgomery/pythons-counter-collection?slide=40
	# - Sample Code: https://gist.github.com/bradmontgomery/4717521
	#
	# ### Other collections goodies!
	#
	# - ChainMap
	# - deque
	# - namedtuple
	# - defaultdict
	# - OrderedDict
	# - UserDict, UserList, UserString

	# # Thank you!
	#
	# Questions?
	black==22.6.0
	certifi==2022.6.15
	isort==5.10.1
	requests==2.28.1