llimllib · June 11, 2011 14:56
diff --git a/gistfile1.py b/gistfile1.py
 # Compare the leading-digit distribution of the numbers found in a sample text
 # with the distribution predicted by Benford's law, and print one report line
 # per leading digit 1-9.
 import re
 from collections import Counter

 testtext = """1 22 333 444 555 666 777 888 999
 999 999 999 testing something else 1bananas"""

 # Benford's law applies only to numbers, not to words starting with numbers like '1bananas',
 # so let's use a regex to find all the numbers in the text:
 numbers = re.findall(r"\b\d+\b", testtext)

 # here's how to understand that regular expression:
 #
 # \b means "word boundary", and matches the empty string at the beginning or end of a word
 # \d means "digit", and matches any digit
 # + means "one or more"
 #
 # so we can read \b\d+\b as "a word boundary, followed by one or more digits, followed by
 # a word boundary"
 #
 # numbers is now: ['1', '22', '333', '444', '555', '666', '777', '888', '999', '999', '999', '999']


 # now we want to count the number of times each starting digit occurs. Counter (the
 # object I introduced in my previous gist) can tally an iterable directly, so we
 # just hand it the first character of every number.
 #
 # NOTE: a number written with a leading zero (e.g. '007') is tallied under '0'; it
 # still counts towards the total below but is not part of the 1-9 report.
 number_counts = Counter(n[0] for n in numbers)


 # now that we have our counts, let's calculate the percentage each digit represents

 # we convert the total to a float so that, on Python 2, 10/3 == 3.33 instead of 10/3 == 3
 total_numbers = float(len(numbers))

 # now, the percentage is just (count/total)*100 for each leading digit
 number_percentage = [(number_counts[digit] / total_numbers) * 100 for digit in '123456789']

 # percentage of numbers Benford's law expects to start with 1, 2, ..., 9
 benfords_law = [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6]

 # observed minus expected percentage, in the same 1..9 order
 difference = [actual - expected for actual, expected in zip(number_percentage, benfords_law)]

 # Build the report first, then print it. The format string now closes the
 # parenthesis it opens, and print(line) with a single argument behaves the same
 # on Python 2 and Python 3.
 report_lines = [
     "You had %d %d's (expecting %.2f%%, difference %.2f%%)"
     % (number_counts[str(digit)], digit, expected, delta)
     for digit, expected, delta in zip(range(1, 10), benfords_law, difference)
 ]

 for line in report_lines:
     print(line)
	# Compare the leading-digit distribution of the numbers found in a sample text
	# with the distribution predicted by Benford's law, and print one report line
	# per leading digit 1-9.
	import re
	from collections import Counter

	testtext = """1 22 333 444 555 666 777 888 999
	999 999 999 testing something else 1bananas"""

	# Benford's law applies only to numbers, not to words starting with numbers like '1bananas',
	# so let's use a regex to find all the numbers in the text:
	numbers = re.findall(r"\b\d+\b", testtext)

	# here's how to understand that regular expression:
	#
	# \b means "word boundary", and matches the empty string at the beginning or end of a word
	# \d means "digit", and matches any digit
	# + means "one or more"
	#
	# so we can read \b\d+\b as "a word boundary, followed by one or more digits, followed by
	# a word boundary"
	#
	# numbers is now: ['1', '22', '333', '444', '555', '666', '777', '888', '999', '999', '999', '999']


	# now we want to count the number of times each starting digit occurs. Counter (the
	# object I introduced in my previous gist) can tally an iterable directly, so we
	# just hand it the first character of every number.
	#
	# NOTE: a number written with a leading zero (e.g. '007') is tallied under '0'; it
	# still counts towards the total below but is not part of the 1-9 report.
	number_counts = Counter(n[0] for n in numbers)


	# now that we have our counts, let's calculate the percentage each digit represents

	# we convert the total to a float so that, on Python 2, 10/3 == 3.33 instead of 10/3 == 3
	total_numbers = float(len(numbers))

	# now, the percentage is just (count/total)*100 for each leading digit
	number_percentage = [(number_counts[digit] / total_numbers) * 100 for digit in '123456789']

	# percentage of numbers Benford's law expects to start with 1, 2, ..., 9
	benfords_law = [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6]

	# observed minus expected percentage, in the same 1..9 order
	difference = [actual - expected for actual, expected in zip(number_percentage, benfords_law)]

	# Build the report first, then print it. The format string now closes the
	# parenthesis it opens, and print(line) with a single argument behaves the same
	# on Python 2 and Python 3.
	report_lines = [
		"You had %d %d's (expecting %.2f%%, difference %.2f%%)"
		% (number_counts[str(digit)], digit, expected, delta)
		for digit, expected, delta in zip(range(1, 10), benfords_law, difference)
	]

	# the loop body must be indented one level deeper than the `for` line
	for line in report_lines:
		print(line)