Skip to content

Instantly share code, notes, and snippets.

@seifip
Forked from bradmontgomery/count_words.py
Created August 11, 2017 15:13
Show Gist options
  • Save seifip/038e7ad1bb87e86bb8723dfc92f93ab5 to your computer and use it in GitHub Desktop.
Save seifip/038e7ad1bb87e86bb8723dfc92f93ab5 to your computer and use it in GitHub Desktop.
playing with python's `collections.Counter`
"""
Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by
L. Frank Baum.
Available in plain text at:
https://ia700500.us.archive.org/2/items/thewonderfulwiza00055gut/wizoz10.txt
short link: http://bit.ly/thewonderfulwizard
Note: This code also counts the words in the header, so it's not a *realistic*
application, but more of a demonstration of python's Counter.
Running this code should give you something like this:
$ python count_words.py
The Top 10 words
the: 2808
and: 1630
to: 1143
of: 869
a: 819
I: 597
was: 502
you: 486
in: 476
he: 408
"""
from collections import Counter
import re
import urllib # for more pleasant http, use http://bit.ly/python-requests
def main(n=10, url='http://bit.ly/thewonderfulwizard'):
    """Print the ``n`` most common words of the text served at ``url``.

    The text is downloaded, runs of whitespace are collapsed to a single
    space, every character that is not an ASCII letter or a space is
    removed, and the remaining space-separated tokens are counted with a
    ``Counter``. Counting is case-sensitive ("The" and "the" are distinct).

    Args:
        n: How many of the most frequent words to print.
        url: Location of the plain-text document. Defaults to the short
            link for "The Wonderful Wizard of Oz".

    Returns:
        None. A header line followed by one ``word: count`` line per
        word is written to standard output.
    """
    # Imported locally so this function is self-contained on both
    # Python 3 (urllib.request.urlopen) and Python 2 (urllib.urlopen).
    from contextlib import closing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    # Download the content; closing() guarantees the response is released
    # even if read() raises.
    with closing(urlopen(url)) as response:
        raw = response.read()

    # read() returns bytes; decode once at the I/O boundary so the str
    # regexes below work. Undecodable bytes become U+FFFD, which the
    # non-alpha filter strips anyway.
    content = raw.decode('utf-8', 'replace')

    # Clean the content a little. Whitespace must be condensed to spaces
    # *before* the non-alpha filter, otherwise newlines/tabs would be
    # deleted and the words on either side of them fused together.
    content = re.sub(r'\s+', ' ', content)  # condense all whitespace
    content = re.sub(r'[^A-Za-z ]+', '', content)  # remove non-alpha chars
    words = content.split()

    # Start counting
    word_count = Counter(words)

    # The Top-N words
    print("The Top {0} words".format(n))
    for word, count in word_count.most_common(n):
        print("{0}: {1}".format(word, count))
# Run the demo with its default arguments only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()
"""
Playing with Python's `Counter`
- it's like a dictionary
- values can be positive/negative integers
- keys correspond to the things you want to count
"""
>>> from collections import Counter
>>> c = Counter() # Create a Counter
>>> c['widgets'] += 1 # start counting 'widgets'
>>> c
Counter({'widgets': 1})
# (most) regular dict methods are available
>>> c.keys()
['widgets']
>>> c.values()
[1]
>>> 'widgets' in c
True
# `update` will create new keys or adjust the counts for
# existing keys
>>> c.update({'foo': 1})
>>> c
Counter({'widgets': 1, 'foo': 1})
# calling `update` again will increment the value of 'foo'
>>> c.update({'foo': 1})
>>> c
Counter({'widgets': 1, 'foo': 2})
# You can create a Counter from an iterable
>>> c = Counter(['larry', 'moe', 'curly'])
>>> c
Counter({'larry': 1, 'curly': 1, 'moe': 1})
# Or you can pass in keyword args
>>> c = Counter(ravens=34, niners=31)
>>> c
Counter({'ravens': 34, 'niners': 31})
# `elements` gives you an iterator that yields a `key` for each
# `count`. (You can also create a counter from an iterable).
>>> colors = ['red', 'blue', 'yellow']
>>> c = Counter(colors)
>>> c
Counter({'blue': 1, 'yellow': 1, 'red': 1})
>>> c['red'] += 2 # Three 'red's
>>> c['blue'] += 1 # Two 'blue's
>>> c
Counter({'red': 3, 'blue': 2, 'yellow': 1})
>>> list(c.elements())
['blue', 'blue', 'yellow', 'red', 'red', 'red']
# Finding the N "most common" elements
>>> c.most_common(2)
[('red', 3), ('blue', 2)]
# Trick: Find the most common letters in a string:
>>> Counter('supercalifragilisticexpialidocious').most_common(3)
[('i', 7), ('a', 3), ('c', 3)]
# Subtracting counts
>>> money = {'gold': 1001, 'silver': 501, 'copper': 101}
>>> shield = {'gold': 25}
>>> sword = {'gold': 100, 'silver':50}
# initialize your bank
>>> c = Counter(money)
>>> c
Counter({'gold': 1001, 'silver': 501, 'copper': 101})
# Buy a shield
>>> c.subtract(shield)
>>> c
Counter({'gold': 976, 'silver': 501, 'copper': 101})
# Buy a sword
>>> c.subtract(sword)
>>> c
Counter({'gold': 876, 'silver': 451, 'copper': 101})
# Buy a Castle!
>>> castle = {'gold': 50000, 'silver': 9999, 'copper': 350}
>>> c.subtract(castle)
>>> c
Counter({'copper': -249, 'silver': -9548, 'gold': -49124})
# oops!
# start over!
>>> c.clear()
>>> c
Counter()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment