Last active
August 15, 2022 19:11
-
-
Save bradmontgomery/4717521 to your computer and use it in GitHub Desktop.
playing with python's `collections.Counter`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by | |
L. Frank Baum. | |
Available in (mostly) plain text at: | |
https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt | |
Note: This code also counts the words in the header, so it's not a *realistic* | |
application, but more of a demonstration of Python's Counter.
Running this code should give you something like this: | |
$ python count_words.py | |
The Top 10 words | |
the: 2808 | |
and: 1630 | |
to: 1143 | |
of: 869 | |
a: 819 | |
I: 597 | |
was: 502 | |
you: 486 | |
in: 476 | |
he: 408 | |
""" | |
import re | |
from collections import Counter | |
from html.parser import HTMLParser | |
import requests | |
class PreParser(HTMLParser):
    """
    An HTML parser that collects the text found inside <pre></pre> tags.

    Feed it a document with ``feed()``, call ``close()`` so the parser
    processes anything it is still buffering, then read ``result``.
    See more in the html.parser docs:
    https://docs.python.org/3/library/html.parser.html

    Attributes:
        capture: True while the parser is between <pre> and </pre>.
        result: All text seen inside <pre> elements, joined in document
            order, or None when no such text has been seen.
    """

    capture = False
    result = None  # <--- we'll store some text here.

    def handle_starttag(self, tag, attrs):
        """Start capturing text when a <pre> tag opens."""
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        """Stop capturing text when a <pre> tag closes."""
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        """Append a run of text to ``result`` while inside a <pre> element."""
        if not self.capture:
            return
        # This method is called once per run of text, and any tag nested in
        # the <pre> (or a second <pre>) starts a new run. Accumulate the runs;
        # plain assignment would keep only the last one.
        self.result = data if self.result is None else self.result + data
def main(n=10):
    """
    Print the ``n`` most common words in "The Wonderful Wizard of Oz".

    Downloads the book page from archive.org, pulls the text out of its <pre>
    element, keeps only ASCII letters and spaces, and counts the resulting
    (case-sensitive) words with a Counter.

    Prints a message and returns early, without counting, when the download
    does not answer with HTTP 200 or when no <pre> text is found.

    Args:
        n: How many of the most common words to print.
    """
    # Now fetch some content & do a little cleanup
    url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        # The response body lives on `resp.text`; `resp.txt` does not exist and
        # would turn this error report into an AttributeError.
        print(f"Failed to fetch document: {resp.status_code}\n{resp.text}")
        return

    # Parse the HTML document and pull out the <pre> text.
    parser = PreParser()
    parser.feed(resp.text)
    parser.close()  # make the parser process any text it is still buffering
    if parser.result is None:
        # No <pre> text was captured; re.sub() below would fail on None.
        print("Failed to find any <pre> text in the document")
        return

    # Do some content cleaning: collapse whitespace first so that words on
    # separate lines stay separated, then drop everything but letters/spaces.
    content = re.sub(r"\s+", " ", parser.result)
    content = re.sub(r"[^A-Za-z ]+", "", content)
    words = content.split()
    print(f"Found {len(words):,} words!")

    # Start counting
    word_count = Counter(words)

    # The Top-N words
    print(f"The Top {n} words")
    for word, count in word_count.most_common(n):
        print(f"{word}: {count}")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This python file is exported from a Jupyter Notebook used | |
during the MEMpy presentation on 2022-08-15. | |
""" | |
#!/usr/bin/env python | |
# coding: utf-8 | |
# # collections.Counter | |
# | |
# It's good stuff! | |
# | |
# Super-powerful utilities that do _very common_ operations. | |
# | |
# In[1]: | |
from collections import Counter | |
# ## what is `Counter`? | |
# | |
# - It's like a dictionary (keys & values) | |
# - Keys -> The things you want to count | |
# - Values -> The number of times the key appears in a _collection_ of stuff. | |
# In[2]: | |
# Before using Counter: counting by hand with a plain dict.
# A key must already exist before "+=" can be applied to it,
# hence the membership test on every increment.
c = {} | |
if "widgets" in c: | |
c["widgets"] += 1 | |
else: | |
c["widgets"] = 1 | |
# OR, if you KNOW all of your keys, seed each one with zero up front...
c = { | |
"widgets": 0, | |
} | |
c["widgets"] += 1 | |
# Counter lets you start counting without knowing keys in advance:
# a missing key reads as 0, so "+=" works the first time a key is used.
c = Counter() | |
c["widgets"] += 1 | |
# Bare expression left over from the notebook cell, which displayed the value.
c | |
# # Counter behavior | |
# | |
# - Most of the dict methods are available. | |
# - `.keys()` & `.values()` | |
# - `in` operations | |
# In[3]: | |
# The keys are the things being counted.
c.keys() | |
# In[4]:
# The values are the counts themselves.
c.values() | |
# In[5]:
# Update will create new keys or adjust counts for existing keys.
# Unlike dict.update(), the given counts are added rather than replaced.
c.update({"foo": 1}) | |
c | |
# In[6]:
c.update({"foo": 1}) # calling it a 2nd time adds to the existing count
c | |
# In[7]:
# you can create a Counter based on keyword arguments
scores = Counter(grizzlies=134, warriors=95) | |
scores | |
# ## Where things start to get interesting
#
# You can create a Counter object from _any_ iterable of hashable items!
# In[8]:
# Each distinct element becomes a key; its count is how often it appears.
c = Counter(["moe", "larry", "larry", "curly", "curly"]) | |
c | |
# In[9]:
# you can also go "backwards" ... get a "list" of elements based on their counts
# (each element is repeated as many times as its count)
list(c.elements()) | |
# ## most common occurrences?
#
# One of the best use-cases for a Counter!
#
# In[10]:
# Most common letters in a string? A string is an iterable of characters,
# so Counter counts each letter; most_common(3) returns the top three
# as (letter, count) pairs.
word = "supercalifragilisticexpialidocious" | |
Counter(word).most_common(3) | |
# ## What are the 10 most common words in "The Wonderful Wizard of Oz"? | |
# In[11]: | |
import requests | |
import re | |
from html.parser import HTMLParser | |
from collections import Counter | |
# blergh, write a little HTML parser: | |
# https://docs.python.org/3/library/html.parser.html | |
class Parser(HTMLParser):
    """
    Collect the text found inside <pre></pre> tags of an HTML document.

    Call ``feed()`` with the document, then ``close()``, then read ``result``.

    Attributes:
        capture: True while the parser is between <pre> and </pre>.
        result: All text seen inside <pre> elements, joined in document
            order, or None when no such text has been seen.
    """

    capture = False
    result = None  # <--- we'll store some text here.

    def handle_starttag(self, tag, attrs):
        """Start capturing text when a <pre> tag opens."""
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        """Stop capturing text when a <pre> tag closes."""
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        """Append a run of text to ``result`` while inside a <pre> element."""
        if not self.capture:
            return
        # Called once per run of text; a tag nested inside the <pre> starts a
        # new run, so accumulate instead of keeping only the last run.
        self.result = data if self.result is None else self.result + data
parser = Parser()


# In[12]:


# Now fetch some content & do a little cleanup
url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
# Without a timeout, requests waits indefinitely on an unresponsive server.
resp = requests.get(url, timeout=30)
# Stop on an HTTP error instead of counting the words of an error page.
resp.raise_for_status()
parser.feed(resp.text)
parser.close()  # make the parser process any text it is still buffering
if parser.result is None:
    # Nothing was captured; re.sub() below would fail on None with a less
    # helpful TypeError.
    raise ValueError("no <pre> text found in the downloaded document")
# Collapse whitespace first so that words on separate lines stay separated,
# then drop everything except ASCII letters and spaces. The raw string keeps
# "\s" a regex escape instead of an invalid string escape.
content = re.sub(r"\s+", " ", parser.result)
content = re.sub(r"[^A-Za-z ]+", "", content)
words = content.split()
print(f"{len(words):,} words!")


# In[13]:


# Start counting: one key per distinct (case-sensitive) word.
word_count = Counter(words)
word_count.most_common(10)
# ## ok that's neat, but.... | |
# | |
# How can this help me build that RPG/MMO I've always wanted to build? | |
# # Answer: Let's go shopping! | |
# In[14]: | |
# Set up your purse! Each currency is a key, its balance is the count.
purse = Counter(gold=1000, silver=500, copper=100) | |
purse | |
# In[15]:
# Create some items in the shop: plain dicts mapping currency -> price.
shield = {"gold": 25} | |
sword = {"gold": 100, "silver": 50} | |
tunic = {"silver": 10, "copper": 50} | |
# In[16]:
# Let's make some purchases.
# subtract() lowers the matching balances in place and, unlike the "-"
# operator, keeps zero and negative results.
purse.subtract(shield) | |
purse | |
# In[17]:
# Buy the sword. (gold: 975 -> 875, silver: 500 -> 450)
purse.subtract(sword) | |
purse | |
# In[18]:
# Get the tunic too. (silver: 450 -> 440, copper: 100 -> 50)
purse.subtract(tunic) | |
purse | |
# In[19]:
# Buy a castle! It costs far more than the purse holds.
castle = {"gold": 50_000, "silver": 10_000, "copper": 350} | |
purse.subtract(castle) | |
purse # whoops: every balance is now negative
# In[20]:
# New in 3.10
purse.total() # -> Should sum all the values.
# In[ ]:
# or, on Python versions before 3.10, sum the values by hand.
# The balances are negative at this point (they add up to -58,985),
# so the printed amount carries a minus sign.
debt = sum(purse.values()) | |
print(f"We owe {debt:,}!") | |
# In[ ]:
purse.clear() # reset! removes every key, leaving an empty Counter
purse | |
# ## Resources | |
# | |
# - Python Collections: https://docs.python.org/3/library/collections.html | |
# - HTML Parser: https://docs.python.org/3/library/html.parser.html | |
# - The 2013 version of this talk: https://speakerdeck.com/bkmontgomery/pythons-counter-collection?slide=40 | |
# - Sample Code: https://gist.github.com/bradmontgomery/4717521 | |
# | |
# ### Other collections goodies! | |
# | |
# - ChainMap | |
# - deque | |
# - namedtuple | |
# - defaultdict | |
# - OrderedDict | |
# - UserDict, UserList, UserString | |
# # Thank you! | |
# | |
# Questions? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
black==22.6.0 | |
certifi==2022.6.15 | |
isort==5.10.1 | |
requests==2.28.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for sharing this code.
Can you update the download links? They seem to be broken.