
Machine Learning in Elixir

Mix.install([
  {:axon, "~> 0.5"},
  {:nx, "~> 0.5"},
  {:exla, "~> 0.5"},
  {:explorer, "~> 0.5"},
  {:kino, "~> 0.8"},
  {:kino_explorer, "~> 0.1.21"},
  {:benchee, github: "bencheeorg/benchee", override: true},
  {:vega_lite, "~> 0.1.9"},
  {:kino_vega_lite, "~> 0.1.13"}
])

Prologue

require Explorer.DataFrame, as: DF

1. Make Machines That Learn

iris = Explorer.Datasets.iris()
normalized_iris =
  iris
  |> DF.mutate(
    for col <- across(~r/(sepal|petal)_(length|width)/) do
      # standardize each feature: subtract the mean, then divide by the standard
      # deviation (the parentheses around the subtraction matter for precedence)
      {col.name, (col - mean(col)) / standard_deviation(col)}
    end
  )
  |> DF.mutate(species: cast(species, :category))
for df <- [iris, normalized_iris] do
  Map.take(DF.dtypes(df), ["species"])
end
shuffled_normalized_iris = DF.shuffle(normalized_iris)
train_df = DF.slice(shuffled_normalized_iris, 0..119)
test_df = DF.slice(shuffled_normalized_iris, 120..149)
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

from_frame = fn frame ->
  # stack the four feature columns into one {rows, 4} tensor
  x = Nx.stack(frame[feature_columns], axis: -1)
  # one-hot encode the categorical species (0, 1, 2) by comparing against iota [0, 1, 2]
  y = frame["species"] |> Nx.stack(axis: -1) |> Nx.equal(Nx.iota({1, 3}, axis: -1))
  {x, y}
end
train_data = from_frame.(train_df)
test_data = from_frame.(test_df)
model =
  Axon.input("iris_features", shape: {nil, 4})
  |> Axon.dense(3, activation: :softmax)
plot =
  VegaLite.new()
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "step", type: :quantitative)
  |> VegaLite.encode_field(:y, "loss", type: :quantitative)
  |> Kino.VegaLite.new()
  |> Kino.render()

trained_model_state =
  model
  |> Axon.Loop.trainer(:categorical_cross_entropy, :sgd)
  |> Axon.Loop.metric(:accuracy)
  |> Axon.Loop.kino_vega_lite_plot(plot, "loss")
  |> Axon.Loop.run(Stream.repeatedly(fn -> train_data end), %{}, iterations: 500, epochs: 10)
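
With training done, the held-out rows can be scored the same way. A minimal evaluation sketch, assuming Axon's Axon.Loop.evaluator/1 and the test_data tuple built above:

model
|> Axon.Loop.evaluator()
|> Axon.Loop.metric(:accuracy)
|> Axon.Loop.run([test_data], trained_model_state)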

See the Table Protocol appendix below for an explanation of how a dataframe is converted to a tensor.

2. Get Comfortable with Nx

Nx.add(1, Nx.tensor([1, 2, 3]))
Nx.tensor([[1, 2, 3, 4]])
Nx.tensor([[1, 2, 3], [4, 5, 6]], names: [:x, :y])
a = Nx.tensor([[1, 2, 3], [4, 5, 6]])
Nx.to_binary(a)
<<1::64-signed-native, 2::64-signed-native, 3::64-signed-native>>
|> Nx.from_binary({:s, 64})
<<1::64-signed-native, 2::64-signed-native, 3::64-signed-native>>
|> Nx.from_binary({:s, 64})
|> Nx.reshape({1, 3}, names: [:x, :y])

Tensors are Immutable

...

a = Nx.tensor([1, 2, 3])

a
|> Nx.as_type({:f, 32})
|> Nx.reshape({1, 3, 1})
Nx.bitcast(a, {:f, 64})
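
Note that Nx.bitcast/2 reinterprets the raw s64 bits as f64 rather than converting the values, and none of these calls touched a itself. A quick check, not from the original notebook, that the binding still holds the original tensor:

# `a` was never rebound, so it keeps its original type and values
Nx.type(a)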

Element-wise Unary Operations

a = [-1, -2, -3, 0, 1, 2, 3]
Enum.map(a, &abs/1)
a = Nx.tensor([-1, -2, -3, 0, 1, 2, 3])
Nx.abs(a)
a = Nx.tensor([[[-1, -2, -3], [-4, -5, -6]], [[1, 2, 3], [4, 5, 6]]])
Nx.abs(a)

Element-wise Binary Operations

a = [1, 2, 3]
b = [4, 5, 6]
Enum.zip_with(a, b, fn x, y -> x + y end)
a = Nx.tensor([1, 2, 3])
b = Nx.tensor([4, 5, 6])
Nx.add(a, b)
Nx.add(5, Nx.tensor([1, 2, 3]))
Nx.add(Nx.tensor([1, 2, 3]), Nx.tensor([[4, 5, 6], [7, 8, 9]]))
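
The last call works because Nx broadcasts the {3} tensor across each row of the {2, 3} tensor. The same expansion can be written explicitly; a small sketch, not from the original notebook:

# broadcast the rank-1 tensor to {2, 3}, after which the shapes match exactly
Nx.tensor([1, 2, 3])
|> Nx.broadcast({2, 3})
|> Nx.add(Nx.tensor([[4, 5, 6], [7, 8, 9]]))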

Reductions

revenues = Nx.tensor([85, 76, 42, 34, 46, 23, 52, 99, 22, 32, 85, 51])
Nx.sum(revenues)
revenues =
  Nx.tensor(
    [
      [21, 64, 86, 26, 74, 81, 38, 79, 70, 48, 85, 33],
      [64, 82, 48, 39, 70, 71, 81, 53, 50, 67, 36, 50],
      [68, 74, 39, 78, 95, 62, 53, 21, 43, 59, 51, 88],
      [47, 74, 97, 51, 98, 47, 61, 36, 83, 55, 74, 43]
    ],
    names: [:year, :month]
  )
Nx.sum(revenues)
Nx.sum(revenues, axes: [:year])
Nx.sum(revenues, axes: [:month])
Nx.mean(revenues, axes: [:year])
# Nx.mean promotes integer input to the default float type (f32); cast explicitly for f64
|> Nx.as_type({:f, 64})

Going from def to defn

defmodule MyModule do
  import Nx.Defn

  defn adds_one(x) do
    # Nx.add(x, 1)
    (x + 1) |> print_value(label: "my value")
  end
end
MyModule.adds_one(Nx.tensor([1, 2, 3]))
defmodule Softmax do
  import Nx.Defn

  defn softmax(n) do
    Nx.exp(n) / Nx.sum(Nx.exp(n))
  end
end
key = Nx.Random.key(42)
{tensor, _key} = Nx.Random.uniform(key, shape: {1_000_000})

Benchee.run(
  %{
    "JIT with EXLA" => fn -> apply(EXLA.jit(&Softmax.softmax/1), [tensor]) end,
    "Regular Elixir" => fn ->
      Softmax.softmax(tensor)
    end
  },
  time: 10
)
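
The gap comes from compilation, not from the Softmax code itself. Instead of wrapping each call in EXLA.jit/1, EXLA can be made the default defn compiler; a sketch, assuming the EXLA dependency installed above:

# compile every defn invocation with EXLA from here on
Nx.Defn.global_default_options(compiler: EXLA)
Softmax.softmax(tensor)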

3. Harness the Power of Math

  • linear algebra
  • linear transformations
  • probability theory, decision theory and information theory
  • reasoning about uncertainty
  • frequentist
  • bayes
  • automatic differentiation
defmodule BerryFarm do
  import Nx.Defn

  defn profits(trees) do
    -((trees - 1) ** 4) + trees ** 3 + trees ** 2
  end

  defn profits_derivative(trees) do
    grad(trees, &profits/1)
  end
end

trees = Nx.linspace(0, 3, n: 100)
profits = BerryFarm.profits(trees)
profits_derivative = BerryFarm.profits_derivative(trees)

plot = fn ->
  import VegaLite

  new(title: "Berry Profits and Profits Rate of Change")
  |> data_from_values(
    trees: Nx.to_flat_list(trees),
    profits: Nx.to_flat_list(profits),
    profits_derivative: Nx.to_flat_list(profits_derivative)
  )
  |> layers([
    new()
    |> mark(:line, interpolate: :basis)
    |> encode_field(:x, "trees", type: :quantitative)
    |> encode_field(:y, "profits", type: :quantitative),
    new()
    |> mark(:line, interpolate: :basis)
    |> encode_field(:x, "trees", type: :quantitative)
    |> encode_field(:y, "profits_derivative", type: :quantitative)
    |> encode(:color, value: "#ff0000")
  ])
end

plot.()
defmodule GradFun do
  import Nx.Defn

  defn my_function(x) do
    import Nx

    sum(exp(cos(x)))
    |> print_expr()
  end

  defn grad_my_function(x) do
    grad(x, &my_function/1) |> print_expr()
  end
end

GradFun.grad_my_function(Nx.tensor([1.0, 2.0, 3.0]))
  • chain rule
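
The printed gradient expression is the chain rule applied element-wise: for $ f(x) = \sum_i e^{\cos(x_i)} $ the derivative of each term is

$ \frac{\partial f}{\partial x_i} = -\sin(x_i) \, e^{\cos(x_i)} $

which is exactly the expression grad builds symbolically before compilation.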

4. Optimize Everything

  • loss function
  • objective function
  • maximum likelihood estimation (MLE)
  • cross-entropy
  • mean squared error
  • convergence
  • regularize to generalize
  • overfitting, underfitting and capacity
  • complexity penalties
  • weight decay, L2 norm (distance) $ \sqrt{x^2 + y^2} $
  • regularized objective: $ \text{loss} + \lambda \cdot \text{penalty} $ (see the sketch after this list)
  • early stopping, validation set
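A minimal sketch of the L2 penalty from the list above, written as a standalone defn; the module name and lambda value are illustrative choices, not from the original notebook:

defmodule Regularized do
  import Nx.Defn

  # weight decay: add lambda * sum(params^2) to the data loss so large weights are penalized
  defn objective(params, x, y_true) do
    lambda = 1.0e-3
    y_pred = Nx.dot(x, params)
    mse = Nx.mean((y_true - y_pred) ** 2)
    mse + lambda * Nx.sum(params ** 2)
  end
end
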
key = Nx.Random.key(42)
{true_params, key} = Nx.Random.uniform(key, shape: {32})

true_function = fn params, x ->
  Nx.dot(x, params)
end

generate = fn true_function, key ->
  {x, key} = Nx.Random.uniform(key, shape: {10_000, 32})
  y = true_function.(true_params, x)
  data = Enum.zip(Nx.to_batched(x, 1), Nx.to_batched(y, 1))
  {data, key}
end

{train_data, key} = generate.(true_function, key)
{test_data, _key} = generate.(true_function, key)
# stream = Nx.to_batched(Nx.tensor([[1], [2]]), 1)
# Enum.each(stream, fn thing -> IO.inspect(thing, label: "thing") end)
defmodule SGD do
  import Nx.Defn

  defn init_random_params(key) do
    Nx.Random.uniform(key, shape: {32, 1})
  end

  defn model(params, x) do
    Nx.dot(x, params)
  end

  defn mean_squared_error(y_true, y_pred) do
    ((y_true - y_pred) ** 2)
    |> Nx.mean(axes: [-1])
  end

  defn loss(y_true, y_pred) do
    mean_squared_error(y_true, y_pred)
  end

  defn objective(params, x, y_true) do
    y_pred = model(params, x)
    loss(y_true, y_pred)
  end

  defn step(params, x, y_true) do
    {loss, grad} =
      value_and_grad(params, fn params ->
        objective(params, x, y_true)
      end)

    # learning rate
    alpha = 1.0e-2
    {loss, params - alpha * grad}
  end

  def eval(params, data) do
    data
    |> Enum.map(fn {x, y} ->
      pred = model(params, x)
      loss(y, pred)
    end)
    |> Enum.reduce(0, &Nx.add/2)
  end

  def train(data, iterations, key) do
    {params, _key} = init_random_params(key)
    loss = Nx.tensor(0.0)

    {_, trained_params} =
      for i <- 1..iterations, reduce: {loss, params} do
        {loss, params} ->
          for {{x, y}, j} <- Enum.with_index(data), reduce: {loss, params} do
            {loss, params} ->
              {batch_loss, new_params} = step(params, x, y)
              avg_loss = Nx.add(Nx.mean(batch_loss), loss) |> Nx.divide(j + 1)
              IO.write("\rEpoch: #{i}, Loss: #{Nx.to_number(avg_loss)}")
              {avg_loss, new_params}
          end
      end

    trained_params
  end
end
key = Nx.Random.key(100)
{random_params, _key} = SGD.init_random_params(key)
SGD.eval(random_params, test_data)
key = Nx.Random.key(0)
trained_params = SGD.train(train_data, 1, key)
SGD.eval(trained_params, test_data)
  • making it fail: train the same linear model on a nonlinear (cosine) target, where it cannot reach a low loss
key = Nx.Random.key(42)
true_function = fn params, x ->
  Nx.dot(x, params) |> Nx.cos()
end

{train_data, key} = generate.(true_function, key)
{test_data, _key} = generate.(true_function, key)

key = Nx.Random.key(0)
trained_params = SGD.train(train_data, 10, key)
SGD.eval(trained_params, test_data)
  • hyperparameter search
  • evolutionary algorithm
  • grid search (see the sketch below)
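
A minimal grid-search sketch over the number of training iterations, scoring each setting on the held-out test set. A fuller search would also vary the learning rate, which is hard-coded in SGD.step/3; the candidate values here are illustrative:

for iterations <- [1, 5, 10] do
  key = Nx.Random.key(0)
  params = SGD.train(train_data, iterations, key)
  {iterations, Nx.to_number(SGD.eval(params, test_data))}
end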

Appendix: Data Frames

DF.new(x: [1, 2, 3], y: [10, 20, 30], z: ["a", "b", "c"])
|> DF.mutate(x2: x + 1, x: count(x) + 1)

Appendix: Table Protocol

DF.new(sepal_length: [1, 2, 3], sepal_width: [4, 5, 6])
|> Table.to_rows()
|> Enum.to_list()
for %{"x" => x, "y" => y} <-
      Table.to_rows(
        DF.new(
          x: [1, 2, 3],
          y: [4, 5, 6]
        )
      ) do
  [x, y]
end
|> Nx.tensor()
DF.new(
  x: [1, 2, 3],
  y: [4, 5, 6]
)
|> Nx.stack(axis: -1)

Appendix: Nx Serving

defmodule MyDefn do
  import Nx.Defn

  defn print_and_multiply(x) do
    x = print_value(x, label: "debug")
    x * 2
  end
end
serving = Nx.Serving.new(fn opts -> Nx.Defn.jit(&MyDefn.print_and_multiply/1, opts) end)
batch = Nx.Batch.stack([Nx.tensor([1, 2, 3])])
Nx.Serving.run(serving, batch)
serving =
  Nx.Serving.jit(&MyDefn.print_and_multiply/1)
  |> Nx.Serving.client_preprocessing(fn input -> {Nx.Batch.stack(input), :client_info} end)
  |> Nx.Serving.client_postprocessing(fn {output, _metadata}, _client_info -> output end)

Nx.Serving.run(serving, [Nx.tensor([1, 2]), Nx.tensor([3, 4])])
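
Outside of a notebook, the same serving would normally run under a supervisor so concurrent callers share batches. A sketch using the child-spec form; the process name and batch size are illustrative:

children = [
  {Nx.Serving, serving: serving, name: MyServing, batch_size: 4}
]

Supervisor.start_link(children, strategy: :one_for_one)

# callers anywhere on the node submit work to the named serving;
# client_preprocessing stacks the list into a batch before execution
Nx.Serving.batched_run(MyServing, [Nx.tensor([1, 2]), Nx.tensor([3, 4])])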

Appendix: VegaLite

plot = fn ->
  import VegaLite

  new()
  |> data_from_url("https://vega.github.io/editor/data/weather.csv")
  |> transform(filter: "datum.location == 'Seattle'")
  |> concat([
    new()
    |> mark(:bar)
    |> encode_field(:x, "date", time_unit: :month, type: :ordinal)
    |> encode_field(:y, "precipitation", aggregate: :mean),
    new()
    |> mark(:point)
    |> encode_field(:x, "temp_min", bin: true)
    |> encode_field(:y, "temp_max", bin: true)
    |> encode(:size, aggregate: :count)
  ])
end

plot.()
alias VegaLite, as: Vl

chart =
  Vl.new(width: 400, height: 400)
  |> Vl.mark(:line)
  |> Vl.encode_field(:x, "x", type: :quantitative)
  |> Vl.encode_field(:y, "y", type: :quantitative)
  |> Kino.VegaLite.new()
  |> Kino.render()

for i <- 1..300 do
  point = %{x: i / 10, y: :math.sin(i / 10)}
  Kino.VegaLite.push(chart, point)
  Process.sleep(25)
end

:ok
Stream.zip(
  Nx.to_batched(Nx.tensor([1, 2, 3]), 1),
  Nx.to_batched(Nx.tensor([1, 2, 3]), 1)
)
|> Enum.take(2)
alias VegaLite, as: Vl

plot =
  Vl.new()
  |> Vl.mark(:line)
  |> Vl.encode_field(:x, "step", type: :quantitative)
  |> Vl.encode_field(:y, "loss", type: :quantitative)
  |> Kino.VegaLite.new()
  |> Kino.render()

# Sketch of a custom training loop: model_fn, objective_fn, optimizer and
# apply_updates are placeholders for the functions Axon builds for you.
# Axon.Loop.loop/1 takes the batch step function directly; the model itself
# is already baked into model_fn.
Axon.Loop.loop(fn {x, y}, state ->
  %{parameters: params, optimizer_state: optim_state} = state

  gradients = grad(params, &objective_fn.(&1, x, y))
  {updates, new_optim_state} = optimizer.(optim_state, params, gradients)

  new_params = apply_updates(params, updates)

  # Shown for simplicity; preds can be computed together with the gradients
  # by using value_and_grad instead
  preds = model_fn.(params, x)

  %{
    y_true: y,
    y_pred: preds,
    parameters: new_params,
    optimizer_state: new_optim_state
  }
end)
|> Axon.Loop.kino_vega_lite_plot(plot, "loss")
|> Axon.Loop.run(
  Stream.zip(
    1..100,
    Stream.map(1..100, &(&1 * 2))
  )
)