Last active June 14, 2020 18:43
backpropagation with numpy
import numpy as np
from sklearn.datasets import load_iris
def softmax(inputs):
    """Return the row-wise softmax of a 2-D array of logits.

    Args:
        inputs: array-like of shape [batch, n_classes].

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    inputs = np.asarray(inputs)
    # Softmax is invariant to adding a per-row constant; subtracting the row
    # maximum keeps np.exp from overflowing to inf (which would yield NaN).
    shifted = inputs - np.max(inputs, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)
def construct_net(in_dim, out_dim, hidden_dim=20):
    """Create the parameters of a network with one hidden layer.

    Weights are drawn uniformly from [-bound, bound] with
    bound = sqrt(6 / (fan_in + fan_out)); biases start at zero.

    Args:
        in_dim: number of input features.
        out_dim: number of output units (classes).
        hidden_dim: number of hidden units.

    Returns:
        [W1, b1, W2, b2] with shapes [in_dim, hidden_dim], [hidden_dim],
        [hidden_dim, out_dim] and [out_dim].
    """
    bound1 = np.sqrt(6.0 / (in_dim + hidden_dim))
    W1 = np.random.uniform(-bound1, bound1, size=[in_dim, hidden_dim])
    # Bias sizes follow the layer widths instead of being hard-coded, so
    # non-default hidden_dim / out_dim values produce consistent shapes.
    b1 = np.zeros(hidden_dim)

    bound2 = np.sqrt(6.0 / (hidden_dim + out_dim))
    W2 = np.random.uniform(-bound2, bound2, size=[hidden_dim, out_dim])
    b2 = np.zeros(out_dim)

    return [W1, b1, W2, b2]
def propagate(batch_X, batch_y, params):
    """Run one forward and backward pass of the two-layer ReLU network.

    Args:
        batch_X: inputs of shape [batch, in_dim].
        batch_y: integer class labels, one per row of batch_X.
        params: [W1, b1, W2, b2] as returned by construct_net.

    Returns:
        (p, loss, grads) where
        p is the predicted class probabilities, shape [batch, n_classes];
        loss is the negative log-likelihood averaged over the batch;
        grads is [dl_dW1, dl_db1, dl_dW2, dl_db2]. Note that the gradients
        are those of the *summed* per-sample loss (they are not divided by
        the batch size), while the reported loss is the batch mean.
    """
    W1, b1, W2, b2 = params
    batch_X = np.asarray(batch_X)
    batch_y = np.asarray(batch_y)

    # one-hot label; the class count comes from the output layer width
    n_classes = W2.shape[1]
    labels = np.zeros((len(batch_X), n_classes))
    labels[np.arange(len(batch_y)), batch_y] = 1

    # forward
    h1 = np.dot(batch_X, W1) + b1
    a1 = np.maximum(h1, 0.0)  # ReLU
    h2 = np.dot(a1, W2) + b2  # logits
    p = softmax(h2)

    # NLL loss: pick out the probability assigned to the true class
    loss = np.mean(-np.log(np.sum(p * labels, 1)))

    # backward
    dl_dh2 = p - labels  # [batch, n_classes]
    dl_dW2 = np.dot(a1.T, dl_dh2)
    dl_db2 = np.sum(dl_dh2, 0)
    dl_da1 = np.dot(dl_dh2, W2.T)
    da1_dh1 = (h1 > 0).astype(float)  # ReLU derivative
    dl_dh1 = dl_da1 * da1_dh1
    dl_dW1 = np.dot(batch_X.T, dl_dh1)
    dl_db1 = np.sum(dl_dh1, 0)

    return p, loss, [dl_dW1, dl_db1, dl_dW2, dl_db2]
def main():
    """Train the network on the Iris dataset and print the test accuracy.

    Prints the smoothed training loss every 50 steps, then the fraction of
    held-out samples classified correctly.
    """
    # prepare dataset
    iris = load_iris()
    dataset = np.array(iris.data, dtype=float)  # copy: normalised in place below
    targets = np.asarray(iris.target)
    # Normalisation uses the global mean/std over all features.
    dataset -= np.mean(dataset)
    dataset /= np.std(dataset)

    # Disjoint train/test split. Drawing test indices from a permutation
    # (rather than sampling with replacement) guarantees exactly test_size
    # distinct test samples and no overlap with the training set.
    data_size = len(dataset)
    test_size = int(0.2 * data_size)
    shuffled = np.random.permutation(data_size)
    test_idxs = shuffled[:test_size]
    train_idxs = shuffled[test_size:]

    train_X = dataset[train_idxs]
    train_y = targets[train_idxs]
    test_X = dataset[test_idxs]
    test_y = targets[test_idxs]

    params = construct_net(4, 3)

    # train
    batch_size = 16
    learning_rate = 0.003
    running_loss = None
    for step in range(1000):
        batch_idx = np.random.randint(0, len(train_X), size=batch_size)
        batch_X = train_X[batch_idx]
        batch_y = train_y[batch_idx]

        _, loss, grads = propagate(batch_X, batch_y, params)

        # exponential moving average of the loss, seeded with the first value
        if running_loss is None:
            running_loss = loss
        else:
            running_loss = 0.9 * running_loss + 0.1 * loss

        # update params (in-place on each array held in `params`)
        for param, grad in zip(params, grads):
            param -= learning_rate * grad

        if step % 50 == 0:
            print(step, running_loss)

    # evaluate
    predict, _, _ = propagate(test_X, test_y, params)
    predict = np.argmax(predict, 1)
    correct = float(np.sum(predict == test_y))
    print(correct / test_size)
# Run the training script only when executed directly, not on import.
if __name__ == '__main__':
    main()
@zchrissirhcz 感谢指出。

  • h 是 logits,p 是预测概率,已修正
  • 由于 labels 是 one-hot 的,两种实现是等效的
  • loss function 严格来说应该是定义在一个数据点上,只是在 DL 中 mini-batch 训练比较常见,有时候也会把 batch_size 写进去

