Skip to content

Instantly share code, notes, and snippets.

@musyoku
Created September 9, 2017 10:10
Show Gist options
  • Save musyoku/c4d59fc66bc39c66c8061cca2d966dda to your computer and use it in GitHub Desktop.
Save musyoku/c4d59fc66bc39c66c8061cca2d966dda to your computer and use it in GitHub Desktop.
import chainer
import numpy as np
train, test = chainer.datasets.get_mnist()
dataset = train._datasets[0] * 10 - 5
num_data = dataset.shape[0]
def calculate_parameter():
mean = np.mean(dataset, axis=0)
var = np.var(dataset, axis=0)
return mean, var
# 平均と分散を逐次的に計算するアルゴリズム
# https://mathwords.net/tikujiheikin
def _calculate_sample_mean_var(iteration=100, batchsize=64):
mean = 0
var = 0
for i in range(iteration):
indices = np.random.randint(0, num_data, size=batchsize)
samples = dataset.take(indices, axis=0)
sample_mean = np.mean(samples, axis=0)
sample_var = np.var(samples, axis=0)
scale = batchsize / ((i + 1) * batchsize)
new_mean = (i * mean + sample_mean) * scale
new_var = (i * (var + mean ** 2) + (sample_var + sample_mean ** 2)) * scale - new_mean ** 2
mean = new_mean
var = new_var
return mean, var
# 漸化式バージョン
# 精度が良い
# https://qpp.bitbucket.io/post/variance/
def _recurrence_calculate_sample_mean_var(iteration=100, batchsize=64):
mean = 0
nvar = 0 # 分散にnを掛けたもの
for i in range(iteration):
indices = np.random.randint(0, num_data, size=batchsize)
samples = dataset.take(indices, axis=0)
sample_sum = np.sum(samples, axis=0)
sample_squared_sum = np.sum(samples ** 2, axis=0)
new_mean = mean + (sample_sum - batchsize * mean) / ((i + 1) * batchsize)
new_nvar = nvar + sample_squared_sum - sample_sum * (new_mean + mean) + batchsize * new_mean * mean
mean = new_mean
nvar = new_nvar
return mean, nvar / ((i + 1) * batchsize) # nで割ると分散に戻る
# 標本平均・標本分散を返す
def calculate_sample_mean_var(iteration=100, batchsize=64, recurrence=True):
if recurrence:
return _recurrence_calculate_sample_mean_var(iteration, batchsize)
return _calculate_sample_mean_var(iteration, batchsize)
# 標本平均・不偏分散を返す
def calculate_mean_and_unbiased_var(iteration=100, batchsize=64, recurrence=True):
mean, var = calculate_sample_mean_var(iteration, batchsize, recurrence)
n = iteration * batchsize
scale = n / (n - 1)
return mean, scale * var
def main():
seed = np.random.randint(0, 20170909)
population_mean, population_var = calculate_parameter()
np.random.seed(seed)
sample_mean, sample_var = calculate_sample_mean_var(recurrence=False)
_, unbiased_var = calculate_mean_and_unbiased_var(recurrence=False)
print("naive")
print(" error(mean)")
print(" sample", np.mean(abs(population_mean - sample_mean)))
print(" error(var)")
print(" sample", np.mean(abs(population_var - sample_var)))
print(" unbiased", np.mean(abs(population_var - unbiased_var)))
np.random.seed(seed)
sample_mean, sample_var = calculate_sample_mean_var(recurrence=True)
_, unbiased_var = calculate_mean_and_unbiased_var(recurrence=True)
print("recurrence")
print(" error(mean)")
print(" sample", np.mean(abs(population_mean - sample_mean)))
print(" error(var)")
print(" sample", np.mean(abs(population_var - sample_var)))
print(" unbiased", np.mean(abs(population_var - unbiased_var)))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment