import Foundation   // pow, sqrt on Float
import TensorFlow   // Tensor

struct MNISTParameters : ParameterAggregate {
    var w1 = Tensor<Float>(randomNormal: [784, 30])
    var w2 = Tensor<Float>(randomNormal: [30, 10])

    // Compiler-synthesized:
    // static var allKeyPaths: [WritableKeyPath<MNISTParameters, Tensor<Float>>] {
    //     return [\MNISTParameters.w1, \MNISTParameters.w2]
    // }
    // Learn more about key paths here:
    // https://github.com/apple/swift-evolution/blob/master/proposals/0161-key-paths.md
}
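
// A minimal sketch (not part of the original example) showing how the
// synthesized `allKeyPaths` drives parameter updates: plain SGD written with
// the same key-path iteration the Adam optimizer below uses.
func sgdUpdate(_ parameters: inout MNISTParameters,
               withGradients gradients: MNISTParameters,
               learningRate: Float = 0.01) {
    for kp in MNISTParameters.allKeyPaths {
        parameters[keyPath: kp] -= learningRate * gradients[keyPath: kp]
    }
}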

struct AdamOptimizer {
    typealias Scalar = Float

    var learningRate: Scalar
    var beta1: Scalar
    var beta2: Scalar
    var epsilon: Scalar

    init(learningRate: Scalar = 0.001, beta1: Scalar = 0.9,
         beta2: Scalar = 0.999, epsilon: Scalar = 1e-8) {
        self.learningRate = learningRate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
    }

    // Optimizer state: the step count drives bias correction; the moment
    // estimates are created lazily on the first update.
    var step: Float = 0
    var firstMoments: MNISTParameters? = nil
    var secondMoments: MNISTParameters? = nil

    // `fitParameters` can be generalized to work with any
    // `ParameterAggregate`-conforming type when such types define a zero
    // initializer. There are multiple ways to enable this (e.g. conforming
    // `ParameterAggregate` to `VectorNumeric`).
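    // For instance (a sketch, not from the original design; the zero
    // initializer shown here is hypothetical):
    //
    //     protocol ParameterAggregate {
    //         associatedtype Parameter
    //         static var allKeyPaths: [WritableKeyPath<Self, Parameter>] { get }
    //         init(zerosLike other: Self)  // hypothetical requirement
    //     }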
    mutating func fitParameters(
        _ parameters: inout MNISTParameters,
        withGradients gradients: MNISTParameters
    ) {
        func initializeWithZerosIfNeeded(_ x: MNISTParameters?) -> MNISTParameters {
            return x ?? MNISTParameters(
                w1: Tensor(0).broadcast(like: parameters.w1),
                w2: Tensor(0).broadcast(like: parameters.w2)
            )
        }

        var firstMoments = initializeWithZerosIfNeeded(self.firstMoments)
        var secondMoments = initializeWithZerosIfNeeded(self.secondMoments)
        step += 1

        // The bias corrections depend only on the step count, so they are
        // computed once per update, outside the key path loop.
        let biasCorrection1 = 1 - pow(beta1, step)
        let biasCorrection2 = 1 - pow(beta2, step)
        let stepSize = learningRate * sqrt(biasCorrection2) / biasCorrection1

        // Iterating over `allKeyPaths` and applying key paths currently
        // produce sends/receives. It should be possible to eliminate
        // sends/receives eventually, by fully unrolling the loop at
        // compile time and implementing compile-time evaluation of key path
        // initialization and application. Read the key path design for more
        // information.
        for kp in MNISTParameters.allKeyPaths {
            firstMoments[keyPath: kp] =
                firstMoments[keyPath: kp] * beta1 + (1 - beta1) * gradients[keyPath: kp]
            secondMoments[keyPath: kp] =
                secondMoments[keyPath: kp] * beta2 + (1 - beta2) * gradients[keyPath: kp] * gradients[keyPath: kp]

            let denominator = sqrt(secondMoments[keyPath: kp]) + epsilon
            parameters[keyPath: kp] -= stepSize * firstMoments[keyPath: kp] / denominator
        }

        self.firstMoments = firstMoments
        self.secondMoments = secondMoments
    }
}
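
// A minimal usage sketch (not in the original). The all-zero "gradients" below
// exist only to exercise the API; in the Swift for TensorFlow design, real
// gradients would come from automatic differentiation.
var parameters = MNISTParameters()
var optimizer = AdamOptimizer()
let gradients = MNISTParameters(
    w1: Tensor(0).broadcast(like: parameters.w1),
    w2: Tensor(0).broadcast(like: parameters.w2)
)
optimizer.fitParameters(&parameters, withGradients: gradients)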