Asmageddon · December 25, 2011 12:55
diff --git a/passwordstrength.py b/passwordstrength.py
 import random, math
 import re

 def log_bin(value):
 	"""Return the base-2 logarithm of ``value``."""
 	return math.log(value, 2)

 def cumprod(values):
 	"""Return the product of every item in ``values`` (1 when it is empty)."""
 	product = 1
 	for factor in values:
 		product *= factor
 	return product

 def gen_password(charset, length):
 	"""Build a random password from ``length`` items drawn out of ``charset``.

 	``charset`` is any non-empty sequence of strings: a plain string yields
 	single characters, a list of words yields whole words. Items longer than
 	one character are capitalized so the attacker cannot assume the password
 	is all lowercase.

 	Items are drawn with ``random.SystemRandom`` (operating-system entropy)
 	instead of the module-level Mersenne Twister generator, whose output is
 	predictable and therefore unsuitable for generating secrets.
 	"""
 	chooser = random.SystemRandom()
 	parts = []
 	for _ in range(length):
 		item = chooser.choice(charset)
 		if len(item) > 1: #Not a single char, so it is a word
 			item = item.capitalize()
 		parts.append(item)
 	return "".join(parts)

 def format_size(size):
 	"""Format a byte count as a human-readable string with binary prefixes.

 	Values below 1024 are rendered as plain bytes ("512.00B"). Larger values
 	are divided by 1024 until they fit and tagged with the matching prefix
 	("2.00KiB", "3.50GiB", ...). Anything beyond the yotta range returns the
 	placeholder "much".
 	"""
 	size = float(size)
 	prefix_list = "KMGTPEZY" #Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta
 	prefix = -1 #-1 means plain bytes, otherwise an index into prefix_list

 	#Stop as soon as we run out of prefixes so infinite input terminates too
 	while size >= 1024 and prefix < len(prefix_list):
 		size /= 1024
 		prefix += 1

 	if prefix >= len(prefix_list):
 		return "much"
 	if prefix < 0:
 		return "%.2fB" % size
 	return "%.2f%siB" % (size, prefix_list[prefix])

 def num_separators(number):
 	"""Return ``number`` truncated to an integer, with "," between thousands.

 	Digits are grouped in threes from the right, so no separator is ever
 	placed in front of the first group (123 stays "123") and a minus sign is
 	kept in front of the grouped digits (-1234 becomes "-1,234").
 	"""
 	separator = ','
 	whole = int(number)
 	sign = "-" if whole < 0 else ""
 	digits = str(abs(whole))
 	groups = []
 	while digits:
 		groups.insert(0, digits[-3:]) #Peel off the lowest three digits
 		digits = digits[:-3]
 	return sign + separator.join(groups)

 def acronym(text): #From big letters found in it
 	"""Return the ASCII capital letters of ``text``, in order, as one string."""
 	return ''.join(letter for letter in text if 'A' <= letter <= 'Z')

 #Matches the possessive suffix ('s) at the very end of a word
 pattern = re.compile("'s$")

 #Generate list of words, skipping blank lines and words ending with 's.
 #The earlier pattern "^.*[^'][^s]$" also threw away every word ending in
 #a plain "s" and every one-letter word, which the comment never intended.
 word_list = []
 with open("/usr/share/dict/words", "r") as dictfile: #Closed on exit
 	for line in dictfile:
 		word = line.rstrip("\n") #Safe even when the last line has no newline
 		if word and not pattern.search(word):
 			word_list.append(word)

 #Count them and their average length
 word_count = len(word_list)
 average_word_length = sum(len(word) for word in word_list) / float(word_count)
 #Bits of entropy one word carries, spread over its average length
 word_entropy_per_char = log_bin(word_count) / average_word_length

 #Bytes needed to store one digest: 512 bits / 8
 hash_digest_size = 512 / 8 #For SHA512, in bytes

 #We're only using ASCII letters and numbers, no symbols except for underscore - a common set of allowed characters for passwords
 char_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
 char_count = len(char_list)
 #Entropy of one character picked from char_list: log2(char_count) bits
 char_entropy_per_char = log_bin(char_count)

 #Summary of the word dictionary
 print "%i words in our dictionary" % word_count
 print "  Of average length %.2f" % average_word_length
 print "  With entropy content of %.2f bits per character\n" % word_entropy_per_char

 #Summary of the character set; the last line reports the per-character
 #entropy of random characters as a percentage of that of random words
 print "We're using %i characters" % char_count
 print "  With entropy content of %.2f bits each" % char_entropy_per_char
 print "  Containing %.2f%% the entropy content of single character in a random word\n" % (char_entropy_per_char / word_entropy_per_char * 100)

 #Each entry is a tuple: (combinations, report text, disk usage of all hashes)
 results = []

 #Random characters first: password lengths 4 through 21
 for i in range(4, 22):
 	combinations = char_count**i
 	#Every candidate is stored as digest + plaintext + 1 delimiter byte (presumably newline)
 	disk_usage = combinations * (hash_digest_size + i + 1)
 	results.append((
 		combinations,
 		"%2i chars:      %.2e permutations, %s disk space" % (i, combinations, format_size(disk_usage)),
 		disk_usage,
 	))

 #Then 1 to 7 random words, assuming the attacker has the complete dictionary
 for i in range(1, 8):
 	combinations = word_count**i
 	#Plaintext size is estimated from the average word length
 	disk_usage = combinations * (hash_digest_size + i*average_word_length + 1)
 	results.append((
 		combinations,
 		"%i words(%2ich): %.2e permutations, %s disk space" % (i, math.ceil(i*average_word_length), combinations, format_size(disk_usage)),
 		disk_usage,
 	))

 #Disclaimer: I have no idea how much this would take on what hardware, so please adjust yourself
 hashing_speed = 9001.0 * 1000**2 #9001 million hashes per second

 print "With %s hashes per second: " % num_separators(hashing_speed)

 #Each entry says how many of the previous unit make up the next one
 #(60 seconds per minute, ..., 30 days per month, 12 months per year), so the
 #product of marks[:n] is the number of seconds in unit number n.
 #137000000 centuries is 13.7 billion years.
 marks = [60, 60, 24, 30, 12, 100, 137000000] #Minutes, hours, days, months, years, centuries, ages of universe

 #Unit names; one entry longer than marks because it starts with the base unit
 marks_text = ["seconds", "minutes", "hours", "days", "months", "years", "centuries", "ages of universe"]

 #Index into marks_text of the unit the report is currently printed in
 mark = 0

 print "\nExample passwords:"
 print "  12 random characters:"
 for i in range(0,5):
 	print "    %s  %s" % (gen_password(char_list, 12), gen_password(char_list, 12))
 print "  4 random words:"
 for i in range(0,5):
 	word = gen_password(word_list, 4)
 	print "    %s: %s (%ich)" % (acronym(word), word, len(word))
 print "  5 random words:"
 for i in range(0,5):
 	word = gen_password(word_list, 5)
 	print "    %s: %s (%ich)" % (acronym(word), word, len(word))
 print "  15 random characters:"
 for i in range(0,5):
 	print "    %s  %s" % (gen_password(char_list, 15), gen_password(char_list, 15))

 print "==Starting from seconds=="

 for i in sorted(results): #Sorted by amount of permutations
 	#Change our unit when we go over it's limit
 	while i[0] / hashing_speed > cumprod( marks[:mark+1] ) and mark <= len(marks)-1: #More than one mark may be passed in one step
 		mark += 1
 		print "==Going into %s==" % marks_text[mark]
 	print "%s (%.2f %s to brute force)" % ( i[1], i[0] / hashing_speed / cumprod(marks[:mark]), marks_text[mark] )

 print "Disk usage is amount of space to store all hashes"
 print "Brute force assumes worst case, averages to half of the value"

 #Closing remarks: caveats about the disk usage figures, then the conclusion
 print "\nNotes:"
 print "  With disk usage, I am assuming no compression is used on words."
 print "   Zipping my words file yielded decrease from 931 to 252 kilobytes"
 print "   Compressing would probably decrease size by ~2/3 of that,"
 print "   since hashes are pretty much incompressible"
 print "  Compressing random characters makes no significant difference"
 print "  I am not accounting for technological progress, but I assume"
 print "   that by time it becomes easy enough to crack these,"
 print "   most currently used algorithms will be irrelevant anyway"

 print "\nVerdict:"
 print "  While random characters contain way more entropy per character,"
 print "   a set of words is much easier to remember"
 #Estimated gain from inserting one digit into a 4-word password:
 #presumably one position per character (average_word_length * 4) times 10 digits
 num_increase = average_word_length * 4 * 10
 print "  Inserting randomly a single number into 4 words increases"
 print "   amount of permutations %i times (%i orders of magnitude)" % (num_increase, math.floor (math.log10(num_increase)) )
 #Compared with one extra random character, which multiplies permutations by char_count
 print "   %.2f%% of how much adding a single char to a random sequence does" % (num_increase * 100.0 / char_count)
 print "As such, I declare that using a sequence of 5 or even 4 random words"
 print " is a superior alternative to random characters,"
 print " since it is much easier for human brain to remember sequences"
 print " that have a meaning, and acronyms help further"
	import random, math
	import re

	def log_bin(value):
		"""Return the base-2 logarithm of ``value``."""
		return math.log(value) / math.log(2)

	def cumprod(values):
		"""Return the product of every item in ``values`` (1 when it is empty)."""
		result = 1
		for i in values:
			result *= i
		return result

	def gen_password(charset, length):
		"""Build a random password from ``length`` items drawn out of ``charset``.

		``charset`` is any non-empty sequence of strings: a plain string yields
		single characters, a list of words yields whole words. Items longer than
		one character are capitalized so the attacker cannot assume the password
		is all lowercase.

		Items are drawn with ``random.SystemRandom`` (operating-system entropy)
		instead of the module-level Mersenne Twister generator, whose output is
		predictable and therefore unsuitable for generating secrets.
		"""
		chooser = random.SystemRandom()
		parts = []
		for _ in range(length):
			item = chooser.choice(charset)
			if len(item) > 1: #Not a single char, so it is a word
				item = item.capitalize()
			parts.append(item)
		return "".join(parts)

	def format_size(size):
		"""Format a byte count as a human-readable string with binary prefixes.

		Values below 1024 are rendered as plain bytes ("512.00B"). Larger values
		are divided by 1024 until they fit and tagged with the matching prefix
		("2.00KiB", "3.50GiB", ...). Anything beyond the yotta range returns the
		placeholder "much".
		"""
		size = float(size)
		prefix_list = "KMGTPEZY" #Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta
		prefix = -1 #-1 means plain bytes, otherwise an index into prefix_list

		#Stop as soon as we run out of prefixes so infinite input terminates too
		while size >= 1024 and prefix < len(prefix_list):
			size /= 1024
			prefix += 1

		if prefix >= len(prefix_list):
			return "much"
		if prefix < 0:
			return "%.2fB" % size
		return "%.2f%siB" % (size, prefix_list[prefix])

	def num_separators(number):
		"""Return ``number`` truncated to an integer, with "," between thousands.

		Digits are grouped in threes from the right, so no separator is ever
		placed in front of the first group (123 stays "123") and a minus sign is
		kept in front of the grouped digits (-1234 becomes "-1,234").
		"""
		separator = ','
		whole = int(number)
		sign = "-" if whole < 0 else ""
		digits = str(abs(whole))
		groups = []
		while digits:
			groups.insert(0, digits[-3:]) #Peel off the lowest three digits
			digits = digits[:-3]
		return sign + separator.join(groups)

	def acronym(text): #From big letters found in it
		"""Return the ASCII capital letters of ``text``, in order, as one string."""
		pattern = re.compile("[A-Z]") #Simply find big letters ^^
		return ''.join( re.findall(pattern, text) )

	#Matches the possessive suffix ('s) at the very end of a word
	pattern = re.compile("'s$")

	#Generate list of words, skipping blank lines and words ending with 's.
	#The earlier pattern "^.*[^'][^s]$" also threw away every word ending in
	#a plain "s" and every one-letter word, which the comment never intended.
	word_list = []
	with open("/usr/share/dict/words", "r") as dictfile: #Closed on exit
		for line in dictfile:
			word = line.rstrip("\n") #Safe even when the last line has no newline
			if word and not pattern.search(word):
				word_list.append(word)

	#Count them and their average length
	word_count = len(word_list)
	average_word_length = sum(len(word) for word in word_list) / float(word_count)
	#Bits of entropy one word carries, spread over its average length
	word_entropy_per_char = log_bin(word_count) / average_word_length

	#Bytes needed to store one digest: 512 bits / 8
	hash_digest_size = 512 / 8 #For SHA512, in bytes

	#We're only using ASCII letters and numbers, no symbols except for underscore - a common set of allowed characters for passwords
	char_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
	char_count = len(char_list)
	#Entropy of one character picked from char_list: log2(char_count) bits
	char_entropy_per_char = log_bin(char_count)

	#Summary of the word dictionary
	print "%i words in our dictionary" % word_count
	print " Of average length %.2f" % average_word_length
	print " With entropy content of %.2f bits per character\n" % word_entropy_per_char

	#Summary of the character set; the last line reports the per-character
	#entropy of random characters as a percentage of that of random words
	print "We're using %i characters" % char_count
	print " With entropy content of %.2f bits each" % char_entropy_per_char
	print " Containing %.2f%% the entropy content of single character in a random word\n" % (char_entropy_per_char / word_entropy_per_char * 100)

	#Each entry is a tuple: (combinations, report text, disk usage of all hashes)
	results = []

	#Random characters first: password lengths 4 through 21
	for i in range(4, 22):
		combinations = char_count**i
		#Every candidate is stored as digest + plaintext + 1 delimiter byte (presumably newline)
		disk_usage = combinations * (hash_digest_size + i + 1)
		results.append((
			combinations,
			"%2i chars: %.2e permutations, %s disk space" % (i, combinations, format_size(disk_usage)),
			disk_usage,
		))

	#Then 1 to 7 random words, assuming the attacker has the complete dictionary
	for i in range(1, 8):
		combinations = word_count**i
		#Plaintext size is estimated from the average word length
		disk_usage = combinations * (hash_digest_size + i*average_word_length + 1)
		results.append((
			combinations,
			"%i words(%2ich): %.2e permutations, %s disk space" % (i, math.ceil(i*average_word_length), combinations, format_size(disk_usage)),
			disk_usage,
		))

	#Disclaimer: I have no idea how much this would take on what hardware, so please adjust yourself
	hashing_speed = 9001.0 * 1000**2 #9001 million hashes per second

	print "With %s hashes per second: " % num_separators(hashing_speed)

	#Each entry says how many of the previous unit make up the next one
	#(60 seconds per minute, ..., 30 days per month, 12 months per year), so the
	#product of marks[:n] is the number of seconds in unit number n.
	#137000000 centuries is 13.7 billion years.
	marks = [60, 60, 24, 30, 12, 100, 137000000] #Minutes, hours, days, months, years, centuries, ages of universe

	#Unit names; one entry longer than marks because it starts with the base unit
	marks_text = ["seconds", "minutes", "hours", "days", "months", "years", "centuries", "ages of universe"]

	#Index into marks_text of the unit the report is currently printed in
	mark = 0

	print "\nExample passwords:"
	print " 12 random characters:"
	for i in range(0,5):
	print " %s %s" % (gen_password(char_list, 12), gen_password(char_list, 12))
	print " 4 random words:"
	for i in range(0,5):
	word = gen_password(word_list, 4)
	print " %s: %s (%ich)" % (acronym(word), word, len(word))
	print " 5 random words:"
	for i in range(0,5):
	word = gen_password(word_list, 5)
	print " %s: %s (%ich)" % (acronym(word), word, len(word))
	print " 15 random characters:"
	for i in range(0,5):
	print " %s %s" % (gen_password(char_list, 15), gen_password(char_list, 15))

	print "==Starting from seconds=="

	for i in sorted(results): #Sorted by amount of permutations
	#Change our unit when we go over it's limit
	while i[0] / hashing_speed > cumprod( marks[:mark+1] ) and mark <= len(marks)-1: #More than one mark may be passed in one step
	mark += 1
	print "==Going into %s==" % marks_text[mark]
	print "%s (%.2f %s to brute force)" % ( i[1], i[0] / hashing_speed / cumprod(marks[:mark]), marks_text[mark] )

	print "Disk usage is amount of space to store all hashes"
	print "Brute force assumes worst case, averages to half of the value"

	#Closing remarks: caveats about the disk usage figures, then the conclusion
	print "\nNotes:"
	print " With disk usage, I am assuming no compression is used on words."
	print " Zipping my words file yielded decrease from 931 to 252 kilobytes"
	print " Compressing would probably decrease size by ~2/3 of that,"
	print " since hashes are pretty much incompressible"
	print " Compressing random characters makes no significant difference"
	print " I am not accounting for technological progress, but I assume"
	print " that by time it becomes easy enough to crack these,"
	print " most currently used algorithms will be irrelevant anyway"

	print "\nVerdict:"
	print " While random characters contain way more entropy per character,"
	print " a set of words is much easier to remember"
	#Estimated gain from inserting one digit into a 4-word password:
	#presumably one position per character (average_word_length * 4) times 10 digits
	num_increase = average_word_length * 4 * 10
	print " Inserting randomly a single number into 4 words increases"
	print " amount of permutations %i times (%i orders of magnitude)" % (num_increase, math.floor (math.log10(num_increase)) )
	#Compared with one extra random character, which multiplies permutations by char_count
	print " %.2f%% of how much adding a single char to a random sequence does" % (num_increase * 100.0 / char_count)
	print "As such, I declare that using a sequence of 5 or even 4 random words"
	print " is a superior alternative to random characters,"
	print " since it is much easier for human brain to remember sequences"
	print " that have a meaning, and acronyms help further"