Created
June 11, 2011 14:56
-
-
Save llimllib/1020630 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare the leading-digit distribution of the numbers found in a piece of
# text against the distribution predicted by Benford's law.
#
# The script runs unchanged on Python 2 and Python 3: print() is always called
# with a single pre-formatted string, and the total is kept as a float so that
# "/" is true division on both.
import re
from collections import Counter

testtext = """1 22 333 444 555 666 777 888 999
999 999 999 testing something else 1bananas"""

# Benford's law applies only to numbers, not to words starting with numbers
# like '1bananas', so let's use a regex to find all the numbers in the text.
#
# here's how to understand that regular expression:
#
# \b means "word boundary", and matches the empty string at the beginning or
#    end of a word
# \d means "digit", and matches any digit
# +  means "one or more"
#
# so we can read \b\d+\b as "a word boundary, followed by one or more digits,
# followed by a word boundary"
numbers = re.findall(r"\b\d+\b", testtext)
# numbers is now:
# ['1', '22', '333', '444', '555', '666', '777', '888', '999', '999', '999', '999']

# now we want to count the number of times each starting digit occurs. Counter
# (introduced in my previous gist) can consume the first characters directly.
number_counts = Counter(n[0] for n in numbers)

# now that we have our counts, let's calculate the percentage each digit
# represents. On Python 2 the total must be a float so that 10/3 == 3.33
# instead of 10/3 == 3; on Python 3 the conversion is harmless.
total_numbers = float(len(numbers))

# the percentage is just (count/total)*100 for each leading digit 1-9.
# If the text contained no numbers at all, report 0% everywhere instead of
# dividing by zero.
number_percentage = [
    (number_counts[digit] / total_numbers) * 100 if total_numbers else 0.0
    for digit in '123456789'
]

# expected percentage of numbers starting with 1, 2, ..., 9 under Benford's law
benfords_law = [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6]

# observed minus expected, in percentage points, for digits 1-9
difference = [
    observed - expected
    for observed, expected in zip(number_percentage, benfords_law)
]

for digit, expected, diff in zip('123456789', benfords_law, difference):
    print("You had %d %s's (expecting %.2f%%, difference %.2f%%)"
          % (number_counts[digit], digit, expected, diff))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment