-
-
Save seifip/038e7ad1bb87e86bb8723dfc92f93ab5 to your computer and use it in GitHub Desktop.
playing with python's `collections.Counter`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by | |
L. Frank Baum. | |
Available in plain text at: | |
https://ia700500.us.archive.org/2/items/thewonderfulwiza00055gut/wizoz10.txt | |
short link: http://bit.ly/thewonderfulwizard | |
Note: This code also counts the words in the header, so it's not a *realistic* | |
application, but more of a demonstration of Python's Counter. | |
Running this code should give you something like this: | |
$ python count_words.py | |
The Top 10 words | |
the: 2808 | |
and: 1630 | |
to: 1143 | |
of: 869 | |
a: 819 | |
I: 597 | |
was: 502 | |
you: 486 | |
in: 476 | |
he: 408 | |
""" | |
from collections import Counter | |
import re | |
import urllib # for more pleasant http, use http://bit.ly/python-requests | |
def main(n=10, url='http://bit.ly/thewonderfulwizard'):
    """Print the ``n`` most common words found in the text at ``url``.

    The text is downloaded, every run of whitespace is collapsed to a single
    space, every character other than ASCII letters and spaces is dropped,
    and the remaining space-separated words are tallied with a ``Counter``.
    Counting is case-sensitive, so "The" and "the" are separate words.

    Args:
        n: How many of the most common words to print.
        url: Location of the plain-text document to analyse; defaults to the
            short link for "The Wonderful Wizard of Oz".
    """
    # Imported locally so the function runs on both Python 3
    # (urllib.request.urlopen) and Python 2 (urllib.urlopen) without relying
    # on what the module-level ``import urllib`` happens to expose.
    try:
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import urlopen

    # Download the content; try/finally (rather than ``with``) because the
    # Python 2 response object is not a context manager.
    response = urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()

    # The response body is bytes; the str regexes below need text. Bytes
    # that fail to decode become replacement characters, which the
    # non-alpha filter strips anyway.
    content = raw.decode('utf-8', 'replace') if isinstance(raw, bytes) else raw

    # Clean the content a little
    content = re.sub(r'\s+', ' ', content)  # condense all whitespace
    content = re.sub(r'[^A-Za-z ]+', '', content)  # remove non-alpha chars
    words = content.split()

    # Start counting
    word_count = Counter(words)

    # The Top-N words
    print("The Top {0} words".format(n))
    for word, count in word_count.most_common(n):
        print("{0}: {1}".format(word, count))
# Run the word count only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Playing with Python's `Counter` | |
- it's like a dictionary | |
- values can be positive/negative integers | |
- keys correspond to the things you want to count | |
""" | |
>>> from collections import Counter | |
>>> c = Counter() # Create a Counter | |
>>> c['widgets'] += 1 # start counting 'widgets' | |
>>> c | |
Counter({'widgets': 1}) | |
# (most) regular dict methods are available | |
>>> c.keys() | |
['widgets'] | |
>>> c.values() | |
[1] | |
>>> 'widgets' in c | |
True | |
# `update` will create new keys or adjust the counts for | |
# existing keys | |
>>> c.update({'foo': 1}) | |
>>> c | |
Counter({'widgets': 1, 'foo': 1}) | |
# calling `update` again will increment the value of 'foo' | |
>>> c.update({'foo': 1}) | |
>>> c | |
Counter({'widgets': 1, 'foo': 2}) | |
# You can create a Counter from an iterable | |
>>> c = Counter(['larry', 'moe', 'curly']) | |
>>> c | |
Counter({'larry': 1, 'curly': 1, 'moe': 1}) | |
# Or you can pass in keyword args | |
>>> c = Counter(ravens=34, niners=31) | |
>>> c | |
Counter({'ravens': 34, 'niners': 31}) | |
# `elements` gives you an iterator that yields a `key` for each | |
# `count`. (You can also create a counter from an iterable). | |
>>> colors = ['red', 'blue', 'yellow'] | |
>>> c = Counter(colors) | |
>>> c | |
Counter({'blue': 1, 'yellow': 1, 'red': 1}) | |
>>> c['red'] += 2 # Three 'red's | |
>>> c['blue'] += 1 # Two 'blue's | |
>>> c | |
Counter({'red': 3, 'blue': 2, 'yellow': 1}) | |
>>> list(c.elements()) | |
['blue', 'blue', 'yellow', 'red', 'red', 'red'] | |
# Finding the N "most common" elements | |
>>> c.most_common(2) | |
[('red', 3), ('blue', 2)] | |
# Trick: Find the most common letters in a string: | |
>>> Counter('supercalifragilisticexpialidocious').most_common(3) | |
[('i', 7), ('a', 3), ('c', 3)] | |
# Subtracting counts | |
>>> money = {'gold': 1001, 'silver': 501, 'copper': 101} | |
>>> shield = {'gold': 25} | |
>>> sword = {'gold': 100, 'silver':50} | |
# initialize your bank | |
>>> c = Counter(money) | |
>>> c | |
Counter({'gold': 1001, 'silver': 501, 'copper': 101}) | |
# Buy a shield | |
>>> c.subtract(shield) | |
>>> c | |
Counter({'gold': 976, 'silver': 501, 'copper': 101}) | |
# Buy a sword | |
>>> c.subtract(sword) | |
>>> c | |
Counter({'gold': 876, 'silver': 451, 'copper': 101}) | |
# Buy a Castle! | |
>>> castle = {'gold': 50000, 'silver': 9999, 'copper': 350} | |
>>> c.subtract(castle) | |
>>> c | |
Counter({'copper': -249, 'silver': -9548, 'gold': -49124}) | |
# oops! | |
# start over! | |
>>> c.clear() | |
>>> c | |
Counter() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment