"""Quick error analysis using Monte Carlo bootstrapping techniques to
test the statistical significance of the observations tweeted here:

https://twitter.com/Shogannai/status/509176510880575488
"""

import random
import collections


def error_bars(n_internal_ceos, n_ceos):
    """estimate the error in the reported percent of internally hired CEOs
    to identify which regions have statistically different
    results. This uses a Bootstrap Monte Carlo approach to estimate
    the 95% confidence interval.
    """

    # # if you were a good Bayesian, you'd account for your prior
    # # knowledge here in some way. In the absence of any information
    # # (no observations), I'd guess that half of CEOs come from inside
    # # a company
    # p_internal_expected = (n_internal_ceos + 1.0) / (n_ceos + 2.0)
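    # # (the (n + 1.0) / (N + 2.0) form above is Laplace's rule of
    # # succession: the posterior mean under a uniform Beta(1, 1) prior)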

    # it's pretty clear that this is how the percentage of internally
    # hired CEOs is calculated though, so I'll just go with
    # this. Switching this for the previous block of code doesn't
    # change the results in any meaningful way
    p_internal_expected = float(n_internal_ceos) / n_ceos

    # Simulate 10,000 "alternate universes" where CEOs are hired
    # internally at the expected rate. We count how many times each
    # discrete outcome occurs to estimate the error in
    # p_internal_expected above
    n_internal_estimated = collections.Counter()
    n_simulations = 10000
    for simulation in range(n_simulations):
        n_internal = 0
        for ceo in range(n_ceos):
            if random.random() < p_internal_expected:
                n_internal += 1
        n_internal_estimated[n_internal] += 1
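    # Each pass through the loop above is one draw from a
    # Binomial(n_ceos, p_internal_expected) distribution, so the whole
    # simulation could be vectorised -- a sketch, assuming numpy (which
    # this script does not otherwise need):
    #
    #     import numpy as np
    #     draws = np.random.binomial(n_ceos, p_internal_expected, n_simulations)
    #     n_internal_estimated = collections.Counter(draws.tolist())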

    # find the 95% confidence interval by starting at the mode of this
    # distribution and adding mass in a steepest-descent kind of way
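    # (growing the interval greedily from the mode like this approximates
    # the shortest, i.e. highest-density, 95% interval of the simulated
    # distribution)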
    n_internal_mode, count = n_internal_estimated.most_common(1)[0]
    below = above = n_internal_mode
    while float(count) / n_simulations < 0.95:
        if n_internal_estimated[below-1] > n_internal_estimated[above+1]:
            count += n_internal_estimated[below-1]
            below -= 1
        else:
            count += n_internal_estimated[above+1]
            above += 1
    return float(below)/n_ceos, float(above)/n_ceos, float(count)/n_simulations


def print_result(country, error):
    """quick function to print results in a pretty way"""
print "%20s\t%0.3f\t%0.3f" % ( |
|
country, |
|
error[0], |
|
error[1], |
|
) |
|


# Based on the figure, I back-calculated the number of internally
# hired CEOs. The columns below are the lower and upper bounds of the
# estimated 95% confidence interval on the fraction of internal hires.
print_result( "USA/Canada", error_bars(60, 78) ) |
|
print_result( "Western Europe", error_bars(36, 48) ) |
|
print_result( "Japan", error_bars(28, 29) ) |
|
print_result( "Other mature", error_bars(32, 46) ) |
|
print_result( "China", error_bars(26, 31) ) |
|
print_result( "Brazil, Russia, India", error_bars(20, 29) ) |
|
print_result( "Other emerging", error_bars(14, 25) ) |