Created
April 13, 2019 16:27
-
-
Save nebuta/fed447861b2e40ee8fcb122ff88b313a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import numpy as np | |
from scipy import stats | |
import matplotlib.pyplot as plt | |
import matplotlib | |
from matplotlib import rc | |
font = {'family':'IPAexGothic'} | |
rc('font', **font) | |
# https://stackoverflow.com/questions/39239087/run-a-chi-square-test-with-observation-and-expectation-counts-and-get-confidence | |
def diffprop(obs): | |
""" | |
`obs` must be a 2x2 numpy array. | |
Returns: | |
delta | |
The difference in proportions | |
ci | |
The Wald 95% confidence interval for delta | |
corrected_ci | |
Yates continuity correction for the 95% confidence interval of delta. | |
""" | |
n1, n2 = obs.sum(axis=1) | |
prop1 = obs[0,0] / n1 | |
prop2 = obs[1,0] / n2 | |
delta = prop1 - prop2 | |
# Wald 95% confidence interval for delta | |
se = np.sqrt(prop1*(1 - prop1)/n1 + prop2*(1 - prop2)/n2) | |
ci = (delta - 1.96*se, delta + 1.96*se) | |
# Yates continuity correction for confidence interval of delta | |
correction = 0.5*(1/n1 + 1/n2) | |
corrected_ci = (ci[0] - correction, ci[1] + correction) | |
return delta, ci, corrected_ci | |
def one_row(r,j): | |
# mt: male total, fp: female pass, ff: female fail, etc. | |
v = [int(s.replace(',','').replace('-','0')) for s in r[j:j+6]] | |
return {'mt': v[0],'ft': v[1], 'mp': v[3], 'fp': v[4], 'mf': v[0] - v[3], 'ff': v[1] - v[4]} | |
def get_of_year(v): | |
return np.array([[v['mp'],v['mf']],[v['fp'],v['ff']]]) | |
def get_confidence_interval(v): | |
m_bottom, m_up = stats.binom.interval(alpha=0.95, n=v['mt'], p=v['mp']/v['mt'], loc=0) | |
f_bottom, f_up = stats.binom.interval(alpha=0.95, n=v['ft'], p=v['fp']/v['ft'], loc=0) | |
return np.array( | |
[[v['mp']/v['mt'],m_bottom/v['mt'],m_up/v['mt']], | |
[v['fp']/v['ft'],f_bottom/v['ft'],f_up/v['ft']]]) | |
values = {} | |
with open('data.csv') as f: | |
reader = csv.reader(f) | |
for _ in range(3): | |
next(reader) | |
for row in reader: | |
if row[3] == '合計': | |
vss = [one_row(row,loc) for i,loc in enumerate([4,12,20,28,36,44])] | |
vs_total = {'mt': sum([v['mt'] for v in vss]),'ft': sum([v['ft'] for v in vss]), | |
'mp': sum([v['mp'] for v in vss]),'fp': sum([v['fp'] for v in vss]), | |
'mf': sum([v['mf'] for v in vss]),'ff': sum([v['ff'] for v in vss])} | |
vss.append(vs_total) | |
values[row[2]] = vss | |
count = 1 | |
figcount = 1 | |
plt.subplots_adjust(hspace=0.7) | |
delta_cis = [] | |
names = [] | |
# plt.figure(figsize=(15,12)) | |
for name,vss in values.items(): | |
for vs,year in zip(vss,[2018,2017,2016,2015,2014,2013,'Total']): | |
if year != 'Total': | |
continue | |
try: | |
if count > 12: | |
plt.savefig('fig %d.pdf' % figcount) | |
plt.clf() | |
figcount += 1 | |
# plt.show() | |
# plt.figure(figsize=(15,12)) | |
plt.subplots_adjust(hspace=0.7) | |
count = 1 | |
obs = get_of_year(vs) | |
chi2, p, dof, ex = stats.chi2_contingency(obs,correction=False) | |
print(year,name,'%.3f' % p) | |
print(obs) | |
m_ci,f_ci = get_confidence_interval(vs) | |
delta,delta_ci,_ = diffprop(obs) | |
delta_cis.append(delta_ci) | |
names.append(name) | |
print(delta,delta_ci) | |
ax = plt.subplot(3,4,count) | |
if count % 4 != 1: | |
ax.get_yaxis().set_visible(False) | |
plt.bar([0,1],[m_ci[0],f_ci[0]],tick_label=['男','女'],alpha=0.4) | |
plt.errorbar([0,1],[m_ci[0],f_ci[0]], yerr=[m_ci[0]-m_ci[1],f_ci[0]-f_ci[1]],fmt='.',markersize=0) | |
plt.title(name,fontsize=10) | |
plt.ylim([0,0.5]) | |
count += 1 | |
except Exception as e: | |
print('error',e) | |
plt.savefig('fig %d.pdf' % figcount) | |
plt.clf() | |
figcount += 1 | |
# plt.show() | |
plt.close() | |
# plt.figure() | |
fig, ax = plt.subplots(figsize=(7,14)) | |
ys = [(ds[0]+ds[1])/2 for ds in delta_cis] | |
yerr = [(ds[1]-ds[0])/2 for ds in delta_cis] | |
colors = [] | |
for y,n,e in zip(ys,range(len(names)),yerr): | |
c = (0,0,0) | |
if y - e > 0: | |
c = (0,0,1) | |
elif y + e < 0: | |
c = (1,0,0) | |
colors.append(c) | |
plt.errorbar(y,n,xerr=e,fmt='.',c=c,elinewidth=4,alpha=0.4) | |
ax.yaxis.set_ticks(np.arange(len(names))) | |
plt.plot([0,0],[-1,len(names)],ls='--',c=(0.3,0.3,0.3)) | |
plt.ylim([-1,len(names)]) | |
plt.xlim([-0.15,0.15]) | |
plt.tick_params(labeltop=True,top=True) | |
ax.set_yticklabels(names,fontsize=6) | |
for ytick, c in zip(ax.get_yticklabels(),colors): | |
ytick.set_color(c) | |
ax.invert_yaxis() | |
ax.set_aspect(aspect=0.01) | |
plt.savefig('all_difference.pdf') | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment