|
//! [@karpathy's min-char-rnn.py](https://gist.github.com/karpathy/d4dee566867f8291f086) in Rust.
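//!
//! Trains a character-level RNN with Adagrad on the text file passed as the first
//! command-line argument, printing the smoothed loss and a 200-character sample
//! every 100 iterations.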
|
|
|
use std::collections::{BTreeMap, BTreeSet}; |
|
use std::env; |
|
use std::error::Error; |
|
use std::fs::File; |
|
use std::io::Read; |
|
use std::iter::FromIterator; |
|
use std::ops::Mul; |
|
|
|
use indicatif::{ProgressBar, ProgressStyle}; |
|
use rand::distributions::{Distribution, Uniform, WeightedIndex}; |
|
use rand::{thread_rng, Rng}; |
|
use rulinalg::matrix::{BaseMatrix, Matrix}; |
|
|
|
#[allow(non_snake_case, clippy::many_single_char_names)] |
|
fn main() -> Result<(), Box<dyn Error>> { |
|
let mut rng = thread_rng(); |
|
|
|
let filename = env::args().nth(1).expect("Expected filename to be given"); |
|
let mut f = File::open(filename)?; |
|
let mut text = String::new(); |
|
f.read_to_string(&mut text)?; |
|
|
|
let chars = BTreeSet::from_iter(text.chars()); |
|
let vocab_size = chars.len(); |
|
let char_index = BTreeMap::from_iter(chars.iter().cloned().zip(0..vocab_size)); |
|
let inverted_index = BTreeMap::from_iter(char_index.clone().into_iter().map(|(k, v)| (v, k))); |
|
|
|
dbg!(vocab_size); |
|
|
|
let text: Vec<usize> = text.chars().map(|c| char_index[&c]).collect(); |
|
|
|
// hyperparameters |
|
|
|
let hidden_size = 100; |
|
let seq_length = 25; |
|
let learning_rate = 0.1; |
|
|
|
// model parameters |
|
|
|
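    // weights are initialized from uniform [0, 1) samples scaled by 0.01
    // (min-char-rnn.py draws Gaussian noise scaled by 0.01 here)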
let dist = Uniform::new(0.0, 1.0); |
|
|
|
// weights |
|
|
|
let mut W_ih = Matrix::new( |
|
vocab_size, |
|
hidden_size, |
|
rng.sample_iter(&dist) |
|
.take(vocab_size * hidden_size) |
|
.map(|v| v * 0.01) |
|
.collect::<Vec<f64>>(), |
|
); |
|
let mut W_hh = Matrix::new( |
|
hidden_size, |
|
hidden_size, |
|
rng.sample_iter(&dist) |
|
.take(hidden_size * hidden_size) |
|
.map(|v| v * 0.01) |
|
.collect::<Vec<f64>>(), |
|
); |
|
let mut W_hy = Matrix::new( |
|
hidden_size, |
|
vocab_size, |
|
rng.sample_iter(&dist) |
|
.take(hidden_size * vocab_size) |
|
.map(|v| v * 0.01) |
|
.collect::<Vec<f64>>(), |
|
); |
|
|
|
// biases |
|
|
|
let mut b_h = Matrix::zeros(1, hidden_size); |
|
let mut b_y = Matrix::zeros(1, vocab_size); |
|
|
|
// state |
|
|
|
let mut h = Matrix::zeros(1, hidden_size); |
|
|
|
    // memory variables for Adagrad (per-parameter sums of squared gradients)
|
|
|
let mut m_W_ih = Matrix::zeros(vocab_size, hidden_size); |
|
let mut m_W_hh = Matrix::zeros(hidden_size, hidden_size); |
|
let mut m_W_hy = Matrix::zeros(hidden_size, vocab_size); |
|
let mut m_b_h = Matrix::zeros(1, hidden_size); |
|
let mut m_b_y = Matrix::zeros(1, vocab_size); |
|
|
|
// iteration variables |
|
|
|
let (mut n, mut p) = (0, 0); |
|
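    // start the smoothed loss at the cross-entropy of a uniform prediction,
    // i.e. ln(vocab_size) per character over one sequence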
let mut smooth_loss = -(1.0 / (vocab_size as f64)).ln() * (seq_length as f64); |
|
|
|
let progress = ProgressBar::new_spinner(); |
|
progress.set_style(ProgressStyle::default_spinner().template("{spinner:.green} {msg}")); |
|
|
|
loop { |
|
if p + seq_length + 1 >= text.len() || n == 0 { |
|
h = Matrix::zeros(1, hidden_size); |
|
p = 0; |
|
} |
|
|
|
        // NOTE: instead of a separate loss function (lossFun in the Python original),
        // the forward and backward passes are inlined here; the pieces around them
        // live in small helper functions at the bottom of the file.
|
|
|
let mut loss = 0.0; |
|
|
|
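        // per-step cache of (input one-hot, hidden state, output probabilities);
        // index 0 holds the initial hidden state so the backward pass can reach h_{t-1}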
let mut ts = Vec::new(); |
|
ts.push((None, h.clone(), None)); |
|
|
|
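        // forward pass: h_t = tanh(x_t*W_ih + h_{t-1}*W_hh + b_h),
        //               y_t = softmax(h_t*W_hy + b_y),
        //               loss = -sum over t of ln(y_t[target_t])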
for t in 0..seq_length { |
|
let input = text[p + t]; |
|
let target = text[p + t + 1]; |
|
|
|
let x = one_hot(input, vocab_size); |
|
|
|
h = (&x).mul(&W_ih) + h.mul(&W_hh) + &b_h; |
|
h = tanh(h); |
|
|
|
let y = (&h).mul(&W_hy) + &b_y; |
|
let y = softmax(y); |
|
|
|
loss += -y[[0, target]].ln(); |
|
|
|
ts.push((Some(x), h.clone(), Some(y))); |
|
} |
|
|
|
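        // gradient accumulators, one per parameter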
let mut d_W_ih = Matrix::zeros(vocab_size, hidden_size); |
|
let mut d_W_hh = Matrix::zeros(hidden_size, hidden_size); |
|
let mut d_W_hy = Matrix::zeros(hidden_size, vocab_size); |
|
let mut d_b_h = Matrix::zeros(1, hidden_size); |
|
let mut d_b_y = Matrix::zeros(1, vocab_size); |
|
|
|
        // gradient flowing back into the hidden state from the next time step
|
let mut d_h_next = Matrix::zeros(1, hidden_size); |
|
|
|
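        // backpropagation through time, from the last step back to the first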
for t in (1..=seq_length).rev() { |
|
let target = text[p + t]; |
|
|
|
let (x, h, y) = &ts[t]; |
|
let mut dy = y.as_ref().unwrap().clone(); |
|
dy[[0, target]] -= 1.0; |
|
|
|
d_W_hy += h.transpose().mul(&dy); |
|
d_b_y += &dy; |
|
|
|
let dh = dy.mul(W_hy.transpose()) + &d_h_next; |
|
let dh = dh.elemul(&tanh_derivative(h)); |
|
|
|
let (_, prevh, _) = &ts[t - 1]; |
|
|
|
d_W_hh += prevh.transpose().mul(&dh); |
|
d_b_h += &dh; |
|
|
|
d_W_ih += x.as_ref().unwrap().transpose().mul(&dh); |
|
|
|
d_h_next = dh.mul(W_hh.transpose()); |
|
} |
|
|
|
        // NOTE: gradient clipping is skipped here; a sketch of the clipping step the
        // Python original performs is at the end of this file.
|
|
|
smooth_loss = smooth_loss * 0.999 + loss * 0.001; |
|
if n % 100 == 0 { |
|
progress.println(format!("iteration: {}, loss: {}", n, smooth_loss)); |
|
|
|
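            // sample 200 characters from the model, seeding with the current
            // hidden state and the character at the current position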
let mut h_copy = h.clone(); |
|
let mut value = text[p]; |
|
let mut sample = String::new(); |
|
sample.push(inverted_index[&value]); |
|
|
|
for _ in 0..200 { |
|
let x = one_hot(value, vocab_size); |
|
|
|
h_copy = tanh(x.mul(&W_ih) + (&h_copy).mul(&W_hh) + &b_h); |
|
|
|
                let y = softmax((&h_copy).mul(&W_hy) + &b_y);
|
let y = y.into_vec(); |
|
|
|
let dist = WeightedIndex::new(&y)?; |
|
value = dist.sample(&mut rng); |
|
sample.push(inverted_index[&value]); |
|
} |
|
|
|
progress.println(format!( |
|
"sampled prediction:\n----------\n{}\n----------\n", |
|
sample |
|
)); |
|
} |
|
|
|
progress.set_message(&format!("loss: {}", smooth_loss)); |
|
|
|
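        // Adagrad: accumulate squared gradients, then scale each update by
        // learning_rate / sqrt(accumulator + 1e-8)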
m_W_ih += (&d_W_ih).elemul(&d_W_ih); |
|
m_W_hh += (&d_W_hh).elemul(&d_W_hh); |
|
m_W_hy += (&d_W_hy).elemul(&d_W_hy); |
|
m_b_h += (&d_b_h).elemul(&d_b_h); |
|
m_b_y += (&d_b_y).elemul(&d_b_y); |
|
|
|
W_ih -= adagrad_update(d_W_ih, &m_W_ih, learning_rate); |
|
W_hh -= adagrad_update(d_W_hh, &m_W_hh, learning_rate); |
|
W_hy -= adagrad_update(d_W_hy, &m_W_hy, learning_rate); |
|
b_h -= adagrad_update(d_b_h, &m_b_h, learning_rate); |
|
b_y -= adagrad_update(d_b_y, &m_b_y, learning_rate); |
|
|
|
p += seq_length; |
|
n += 1; |
|
} |
|
} |
|
|
|
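/// Builds a 1 x t one-hot row vector with a 1.0 at index `v`.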
fn one_hot(v: usize, t: usize) -> Matrix<f64> { |
|
let mut data = vec![0.0; t]; |
|
data[v] = 1.0; |
|
Matrix::new(1, t, data) |
|
} |
|
|
|
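/// Applies tanh element-wise, consuming and returning the matrix.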
fn tanh(mut m: Matrix<f64>) -> Matrix<f64> { |
|
for i in 0..m.rows() { |
|
for j in 0..m.cols() { |
|
m[[i, j]] = m[[i, j]].tanh(); |
|
} |
|
} |
|
|
|
m |
|
} |
|
|
|
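/// Applies a row-wise softmax: each row is exponentiated and normalized to sum to 1.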
fn softmax(mut m: Matrix<f64>) -> Matrix<f64> { |
|
for i in 0..m.rows() { |
|
let mut s = 0.0; |
|
|
|
for j in 0..m.cols() { |
|
m[[i, j]] = m[[i, j]].exp(); |
|
s += m[[i, j]]; |
|
} |
|
|
|
for j in 0..m.cols() { |
|
m[[i, j]] /= s; |
|
} |
|
} |
|
|
|
m |
|
} |
|
|
|
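/// Derivative of tanh expressed in terms of its output: 1 - tanh(x)^2,
/// where `m` already holds tanh(x).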
fn tanh_derivative(m: &Matrix<f64>) -> Matrix<f64> { |
|
let mut ans = Matrix::zeros(m.rows(), m.cols()); |
|
|
|
for i in 0..m.rows() { |
|
for j in 0..m.cols() { |
|
ans[[i, j]] = 1.0 - (m[[i, j]] * m[[i, j]]); |
|
} |
|
} |
|
|
|
ans |
|
} |
|
|
|
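/// Scales a gradient into an Adagrad update: learning_rate * d / sqrt(m + 1e-8),
/// where `m` is the running sum of squared gradients.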
fn adagrad_update(mut d: Matrix<f64>, m: &Matrix<f64>, learning_rate: f64) -> Matrix<f64> { |
|
let (r, c) = (m.rows(), m.cols()); |
|
|
|
for i in 0..r { |
|
for j in 0..c { |
|
d[[i, j]] = (learning_rate * d[[i, j]]) / (m[[i, j]] + 1e-8).sqrt(); |
|
} |
|
} |
|
|
|
d |
|
} |
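
// The Python original clips every gradient element into [-5, 5] (np.clip) before the
// Adagrad step; that step is skipped in the training loop above. Below is a minimal
// sketch of what it could look like with rulinalg. The function name and its placement
// are assumptions, not part of the port; it is left unused and would be called on each
// d_* matrix right before the squared-gradient accumulators are updated, e.g.
// `let d_W_ih = clip_gradients(d_W_ih, 5.0);`.
#[allow(dead_code)]
fn clip_gradients(mut d: Matrix<f64>, limit: f64) -> Matrix<f64> {
    for i in 0..d.rows() {
        for j in 0..d.cols() {
            // clamp each element into [-limit, limit] to keep gradients from exploding
            d[[i, j]] = d[[i, j]].max(-limit).min(limit);
        }
    }

    d
}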