"""Quick error analysis using Monte Carlo bootstrapping techniques to
test the statistical significance of the observations tweeted here:

https://twitter.com/Shogannai/status/509176510880575488
"""

import random
import collections


def error_bars(n_internal_ceos, n_ceos):
    """estimate the error in the reported percent of internally hired CEOs
    to identify which regions have statistically different
    results. This uses a Bootstrap Monte Carlo approach to estimate
    the 95% confidence interval.
    """

    # # if you were a good Bayesian, you'd account for your prior
    # # knowledge here in some way. In the absence of any information
    # # (no observations), I'd guess that half of CEOs come from inside
    # # a company
    # p_internal_expected = (n_internal_ceos + 1.0) / (n_ceos + 2.0)
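    # # (the (n + 1.0) / (N + 2.0) form above is Laplace's rule of
    # # succession: the posterior mean under a uniform Beta(1, 1) prior)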

    # it's pretty clear that this is how the percentage of internally
    # hired CEOs is calculated though, so I'll just go with
    # this. Switching this for the previous block of code doesn't
    # change the results in any meaningful way
    p_internal_expected = float(n_internal_ceos) / n_ceos

    # Simulate 10,000 "alternate universes" where CEOs are hired
    # internally at the expected rate. We count how many times each
    # discrete outcome occurs to estimate the error in
    # p_internal_expected above
    n_internal_estimated = collections.Counter()
    n_simulations = 10000
    for simulation in range(n_simulations):
        n_internal = 0
        for ceo in range(n_ceos):
            if random.random() < p_internal_expected:
                n_internal += 1
        n_internal_estimated[n_internal] += 1
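    # Each pass through the loop above is one draw from a
    # Binomial(n_ceos, p_internal_expected) distribution, so the whole
    # simulation could be vectorised -- a sketch, assuming numpy (which
    # this script does not otherwise need):
    #
    #     import numpy as np
    #     draws = np.random.binomial(n_ceos, p_internal_expected, n_simulations)
    #     n_internal_estimated = collections.Counter(draws.tolist())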

    # find the 95% confidence interval by starting at the mode of this
    # distribution and adding mass in a steepest-descent kind of way
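    # (growing the interval greedily from the mode like this approximates
    # the shortest, i.e. highest-density, 95% interval of the simulated
    # distribution)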
    n_internal_mode, count = n_internal_estimated.most_common(1)[0]
    below = above = n_internal_mode
    while float(count) / n_simulations < 0.95:
        if n_internal_estimated[below-1] > n_internal_estimated[above+1]:
            count += n_internal_estimated[below-1]
            below -= 1
        else:
            count += n_internal_estimated[above+1]
            above += 1
    return float(below)/n_ceos, float(above)/n_ceos, float(count)/n_simulations


def print_result(country, error):
    """quick function to print results in a pretty way"""
print "%20s\t%0.3f\t%0.3f" % ( |
|
country, |
|
error[0], |
|
error[1], |
|
) |
|


# Based on the figure, I back-calculated the number of internally
# hired CEOs. The columns below are the lower and upper bounds of the
# estimated 95% confidence interval on the fraction of internal hires.
print_result( "USA/Canada", error_bars(60, 78) ) |
|
print_result( "Western Europe", error_bars(36, 48) ) |
|
print_result( "Japan", error_bars(28, 29) ) |
|
print_result( "Other mature", error_bars(32, 46) ) |
|
print_result( "China", error_bars(26, 31) ) |
|
print_result( "Brazil, Russia, India", error_bars(20, 29) ) |
|
print_result( "Other emerging", error_bars(14, 25) ) |