Skip to content

Instantly share code, notes, and snippets.

@jef-sure
Last active February 20, 2026 17:06
Show Gist options
  • Select an option

  • Save jef-sure/c0ff8bdf894a77fb4dce4351b4076524 to your computer and use it in GitHub Desktop.

Select an option

Save jef-sure/c0ff8bdf894a77fb4dce4351b4076524 to your computer and use it in GitHub Desktop.
/* -*- C++ -*-
* File: dht_nn_network.h
* Copyright 2026 Anton Petrusevich
*
* Neural network for DHT demosaicing direction refinement.
* Architecture: input_size -> N hidden layers (LeakyReLU α=0.01) -> 3 (softmax H/V + sigmoid S)
* Input: (2*patch_r+1)^2 patch x 16 features/pixel
* Output: logit(HOR), logit(VER) [softmax], logit(sharpness) [sigmoid]
*
* Patch radius, number and sizes of hidden layers are runtime-configurable.
* The binary file format is self-describing (stores architecture).
*
* This code is licensed under one of two licenses as you choose:
*
* 1. GNU LESSER GENERAL PUBLIC LICENSE version 2.1
* (See file LICENSE.LGPL provided in LibRaw distribution archive for details).
*
* 2. COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
* (See file LICENSE.CDDL provided in LibRaw distribution archive for details).
*/
#ifndef DHT_NN_NETWORK_H
#define DHT_NN_NETWORK_H
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstdint>
#include <alloca.h>
struct DHTNNNetwork
{
    /* Fixed constants */
    static const int FEAT_PER_PX = 16; // nraw[3], one-hot bayer[3], hue_h, hue_v, hue_change_h, hue_change_v, ld_ud, ld_lr, ld_lu, ld_ld, ld_ru, ld_rd
    static const int OUTPUT_SIZE = 3;  // HOR=0, VER=1, S=2 (sharpness)
    static const uint32_t MAGIC = 0x4E4E4448;  // "HDNN"
    static const uint32_t VERSION = 16;        // v16: LeakyReLU (alpha=0.01) instead of ReLU
    /* Sanity bound for the self-describing file header: input + hidden + output
     * layers. Protects load() against huge allocations from corrupt files. */
    static const uint32_t MAX_FILE_LAYERS = 1024;
    /* Runtime architecture (set at construction or load time) */
    int patch_r;    // e.g. 2, 3, 4, ...
    int patch_side; // 2*patch_r + 1
    int patch_size; // patch_side^2
    int input_size; // patch_size * FEAT_PER_PX
    int n_hidden;   // number of hidden layers
    int *hidden;    // hidden[n_hidden]: size of each hidden layer
    /* Dynamically allocated weight arrays (row-major).
     * num_weights() = n_hidden + 1 weight matrices:
     *   W[0]:        input_size       -> hidden[0]
     *   W[l]:        hidden[l-1]      -> hidden[l]   (for 0 < l < n_hidden)
     *   W[n_hidden]: hidden[n_hidden-1] -> OUTPUT_SIZE
     */
    float **W, **b;
    /* Adam optimizer state */
    float **mW, **mb; // 1st moment
    float **vW, **vb; // 2nd moment
    long long t;      // timestep
    bool weights_allocated;
    bool optimizer_allocated;

    /* Helpers: layer l has weight_in(l) inputs and weight_out(l) outputs. */
    int num_weights() const { return n_hidden + 1; }
    int weight_out(int l) const { return (l < n_hidden) ? hidden[l] : OUTPUT_SIZE; }
    int weight_in(int l) const { return (l == 0) ? input_size : hidden[l - 1]; }

    /* Default: patch_r=5 (11x11), 4 hidden layers (256, 128, 64, 32) */
    DHTNNNetwork(int _patch_r = 5, int _n_hidden = 4, const int *_hidden = NULL)
        : patch_r(0), patch_side(0), patch_size(0), input_size(0),
          n_hidden(0), hidden(NULL), W(NULL), b(NULL),
          mW(NULL), mb(NULL), vW(NULL), vb(NULL), t(0),
          weights_allocated(false), optimizer_allocated(false)
    {
        static const int default_hidden[] = {256, 128, 64, 32};
        if (!_hidden) { _hidden = default_hidden; _n_hidden = 4; }
        set_arch(_patch_r, _n_hidden, _hidden);
        init_random();
    }
    ~DHTNNNetwork()
    {
        free_optimizer();
        free_weights();
        free(hidden);
    }
    /* Reconfigure architecture — reallocates weight arrays.
     * Frees the old weights (and optimizer state) before n_hidden changes,
     * since free_weights() relies on the old num_weights(). */
    void set_arch(int _patch_r, int _n_hidden, const int *_hidden)
    {
        free_weights();
        patch_r = _patch_r;
        patch_side = 2 * patch_r + 1;
        patch_size = patch_side * patch_side;
        input_size = patch_size * FEAT_PER_PX;
        n_hidden = _n_hidden;
        free(hidden);
        hidden = (int *)malloc(n_hidden * sizeof(int));
        for (int i = 0; i < n_hidden; i++)
            hidden[i] = _hidden[i];
        alloc_weights();
    }
    /* Total number of trainable parameters (weights + biases). */
    int total_weights() const
    {
        int total = 0;
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
            total += weight_out(l) * weight_in(l) + weight_out(l);
        return total;
    }
    void alloc_weights()
    {
        if (weights_allocated) return;
        int nw = num_weights();
        W = (float **)calloc(nw, sizeof(float *));
        b = (float **)calloc(nw, sizeof(float *));
        for (int l = 0; l < nw; l++)
        {
            W[l] = (float *)calloc(weight_out(l) * weight_in(l), sizeof(float));
            b[l] = (float *)calloc(weight_out(l), sizeof(float));
        }
        weights_allocated = true;
    }
    void free_weights()
    {
        free_optimizer();
        if (!weights_allocated) return;
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
        {
            free(W[l]); free(b[l]);
        }
        free(W); free(b);
        W = b = NULL;
        weights_allocated = false;
    }
    /* He (Kaiming) uniform initialization: W ~ U[-s, s] with s = sqrt(6/fan_in).
     * Suited to (Leaky)ReLU activations; biases start at zero. */
    void init_random()
    {
        if (!weights_allocated) return;
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
        {
            int in_sz = weight_in(l);
            int out_sz = weight_out(l);
            float scale = sqrtf(6.0f / in_sz);
            for (int i = 0; i < out_sz * in_sz; i++)
                W[l][i] = scale * ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
            for (int i = 0; i < out_sz; i++)
                b[l][i] = 0.0f;
        }
    }
    /*
     * Forward pass (inference).
     * input[input_size] -> output[OUTPUT_SIZE] (raw logits)
     * Returns argmax of the H/V pair: 0=HOR, 1=VER.
     * Uses two ping-pong alloca buffers for intermediate activations
     * (sized by the largest hidden layer — small, stack-safe).
     */
    int forward(const float *input, float *output) const
    {
        int nw = num_weights();
        int max_h = OUTPUT_SIZE;
        for (int i = 0; i < n_hidden; i++)
            if (hidden[i] > max_h) max_h = hidden[i];
        float *buf[2];
        buf[0] = (float *)alloca(max_h * sizeof(float));
        buf[1] = (float *)alloca(max_h * sizeof(float));
        const float *prev = input;
        for (int l = 0; l < nw; l++)
        {
            int out_sz = weight_out(l);
            int in_sz = weight_in(l);
            /* Hidden layers alternate between the two buffers; the last
             * (output) layer writes straight into output[] as raw logits. */
            float *cur = (l < nw - 1) ? buf[l & 1] : output;
            for (int o = 0; o < out_sz; o++)
            {
                float sum = b[l][o];
                const float *row = W[l] + o * in_sz;
                for (int k = 0; k < in_sz; k++)
                    sum += prev[k] * row[k];
                /* LeakyReLU (alpha=0.01) on hidden layers only */
                cur[o] = (l < nw - 1) ? (sum > 0.0f ? sum : 0.01f * sum) : sum;
            }
            prev = cur;
        }
        return output[0] > output[1] ? 0 : 1;
    }
    /*
     * Forward pass with saved activations for backpropagation.
     * Caller must provide h[n_hidden] arrays, each h[l] of size hidden[l],
     * and output[OUTPUT_SIZE]. h[l] stores POST-activation values (LeakyReLU
     * is monotonic, so h[l][k] > 0 iff the pre-activation was > 0 — the
     * backward pass relies on this to compute the derivative).
     */
    int forward_train(const float *input, float **h, float *output) const
    {
        int nw = num_weights();
        const float *prev = input;
        for (int l = 0; l < nw; l++)
        {
            int out_sz = weight_out(l);
            int in_sz = weight_in(l);
            float *cur = (l < n_hidden) ? h[l] : output;
            for (int o = 0; o < out_sz; o++)
            {
                float sum = b[l][o];
                const float *row = W[l] + o * in_sz;
                for (int k = 0; k < in_sz; k++)
                    sum += prev[k] * row[k];
                cur[o] = (l < n_hidden) ? (sum > 0.0f ? sum : 0.01f * sum) : sum;
            }
            prev = cur;
        }
        return output[0] > output[1] ? 0 : 1;
    }
    /* Allocate Adam optimizer buffers (zero-initialized, timestep reset). */
    void alloc_optimizer()
    {
        if (optimizer_allocated) return;
        if (!weights_allocated) alloc_weights();
        int nw = num_weights();
        mW = (float **)calloc(nw, sizeof(float *));
        mb = (float **)calloc(nw, sizeof(float *));
        vW = (float **)calloc(nw, sizeof(float *));
        vb = (float **)calloc(nw, sizeof(float *));
        for (int l = 0; l < nw; l++)
        {
            int szW = weight_out(l) * weight_in(l);
            int szb = weight_out(l);
            mW[l] = (float *)calloc(szW, sizeof(float));
            mb[l] = (float *)calloc(szb, sizeof(float));
            vW[l] = (float *)calloc(szW, sizeof(float));
            vb[l] = (float *)calloc(szb, sizeof(float));
        }
        optimizer_allocated = true;
        t = 0;
    }
    void free_optimizer()
    {
        if (!optimizer_allocated) return;
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
        {
            free(mW[l]); free(mb[l]);
            free(vW[l]); free(vb[l]);
        }
        free(mW); free(mb);
        free(vW); free(vb);
        mW = mb = vW = vb = NULL;
        optimizer_allocated = false;
        t = 0;
    }
    /*
     * Adam parameter update (single-threaded).
     * m[] and v[] are the moment buffers; lr_t must already carry the
     * bias-correction factor (see adam_lr_t).
     */
    static void adam_update(float *param, float *m, float *v, const float *grad,
                            int size, float lr_t)
    {
        const float beta1 = 0.9f;
        const float beta2 = 0.999f;
        const float eps = 1e-8f;
        for (int i = 0; i < size; i++)
        {
            float g = grad[i];
            m[i] = beta1 * m[i] + (1.0f - beta1) * g;
            v[i] = beta2 * v[i] + (1.0f - beta2) * g * g;
            param[i] += -lr_t * m[i] / (sqrtf(v[i]) + eps);
        }
    }
    /* Compute the bias-corrected learning rate for a given timestep
     * (lr * sqrt(1-beta2^t) / (1-beta1^t), per Kingma & Ba). */
    static float adam_lr_t(float lr, long long t)
    {
        const float beta1 = 0.9f;
        const float beta2 = 0.999f;
        float b1t = powf(beta1, (float)t);
        float b2t = powf(beta2, (float)t);
        return lr * sqrtf(1.0f - b2t) / (1.0f - b1t);
    }
    /*
     * Compute buffer sizes needed by backward().
     * max_act: maximum activation size (for delta buffers and bias grad)
     * max_wt:  maximum weight matrix size (for weight grad buffer)
     */
    void backprop_buf_sizes(int &max_act, int &max_wt) const
    {
        int nw = num_weights();
        max_act = OUTPUT_SIZE;
        max_wt = 0;
        for (int l = 0; l < nw; l++)
        {
            int out_sz = weight_out(l), in_sz = weight_in(l);
            if (out_sz > max_act) max_act = out_sz;
            if (in_sz > max_act) max_act = in_sz;
            int wsz = out_sz * in_sz;
            if (wsz > max_wt) max_wt = wsz;
        }
    }
    /*
     * Backward pass with Adam update for a single sample.
     * thread_mW/mb/vW/vb: per-thread moment buffers (same shape as W/b).
     * bp_delta_a/b, bp_gW, bp_gb: pre-allocated temp buffers from backprop_buf_sizes().
     * iter: global sample timestep for bias-correction (computed once per call).
     * targets[3]: soft labels {target_h, target_v, target_s}
     */
    void backward(const float *input, float * const *h, const float *output,
                  const float *targets, float lr, long long iter,
                  float **thread_mW, float **thread_mb,
                  float **thread_vW, float **thread_vb,
                  float *bp_delta_a, float *bp_delta_b,
                  float *bp_gW, float *bp_gb)
    {
        int nw = num_weights();
        /* Pre-compute bias-corrected lr for this timestep (one powf pair total) */
        float lr_t = adam_lr_t(lr, iter);
        /* Softmax of H/V logits → CE gradient; sigmoid of sharpness → BCE gradient */
        float max_hv = output[0] > output[1] ? output[0] : output[1];
        float exp_h = expf(output[0] - max_hv);
        float exp_v = expf(output[1] - max_hv);
        float sum_hv = exp_h + exp_v;
        float p_s = 1.0f / (1.0f + expf(-output[2]));
        /* dL/d(output): softmax CE for [0,1], sigmoid BCE for [2] */
        float dout[OUTPUT_SIZE];
        dout[0] = exp_h / sum_hv - targets[0];
        dout[1] = exp_v / sum_hv - targets[1];
        dout[2] = p_s - targets[2];
        float *delta = bp_delta_a;
        float *new_delta = bp_delta_b;
        float *gW = bp_gW;
        float *gb = bp_gb;
        memcpy(delta, dout, OUTPUT_SIZE * sizeof(float));
        for (int l = nw - 1; l >= 0; l--)
        {
            int out_sz = weight_out(l);
            int in_sz = weight_in(l);
            const float *layer_input = (l == 0) ? input : h[l - 1];
            /* Compute weight and bias gradients */
            for (int o = 0; o < out_sz; o++)
            {
                gb[o] = delta[o];
                for (int k = 0; k < in_sz; k++)
                    gW[o * in_sz + k] = delta[o] * layer_input[k];
            }
            /* Propagate delta to previous hidden layer (before weight update) */
            if (l > 0)
            {
                for (int k = 0; k < in_sz; k++)
                {
                    float sum = 0.0f;
                    for (int o = 0; o < out_sz; o++)
                        sum += W[l][o * in_sz + k] * delta[o];
                    /* LeakyReLU derivative from the POST-activation sign */
                    new_delta[k] = h[l - 1][k] > 0.0f ? sum : 0.01f * sum;
                }
            }
            /* Adam update using thread-local moments (no race on m/v) */
            adam_update(W[l], thread_mW[l], thread_vW[l], gW, out_sz * in_sz, lr_t);
            adam_update(b[l], thread_mb[l], thread_vb[l], gb, out_sz, lr_t);
            /* Swap delta buffers */
            if (l > 0)
            {
                float *tmp = delta;
                delta = new_delta;
                new_delta = tmp;
            }
        }
    }
    /*
     * Accumulate gradients for a single sample into per-thread buffers.
     * Thread-safe: only reads from W[] (shared), writes to acc_gW/acc_gb (thread-local).
     * acc_gW[l][out*in], acc_gb[l][out]: caller must zero before first call in a batch.
     * bp_delta_a, bp_delta_b: pre-allocated temp buffers (from backprop_buf_sizes max_act).
     * targets[3]: soft labels {target_h, target_v, target_s}
     * sample_weight: multiplier on the gradient (default 1.0f). Higher = more influence.
     */
    void accumulate_grads(const float *input, float * const *h, const float *output,
                          const float *targets, float sample_weight,
                          float **acc_gW, float **acc_gb,
                          float *bp_delta_a, float *bp_delta_b) const
    {
        int nw = num_weights();
        float max_hv = output[0] > output[1] ? output[0] : output[1];
        float exp_h = expf(output[0] - max_hv);
        float exp_v = expf(output[1] - max_hv);
        float sum_hv = exp_h + exp_v;
        float p_s = 1.0f / (1.0f + expf(-output[2]));
        float dout[OUTPUT_SIZE];
        dout[0] = (exp_h / sum_hv - targets[0]) * sample_weight;
        dout[1] = (exp_v / sum_hv - targets[1]) * sample_weight;
        dout[2] = (p_s - targets[2]) * sample_weight;
        float *delta = bp_delta_a;
        float *new_delta = bp_delta_b;
        memcpy(delta, dout, OUTPUT_SIZE * sizeof(float));
        for (int l = nw - 1; l >= 0; l--)
        {
            int out_sz = weight_out(l);
            int in_sz = weight_in(l);
            const float *layer_input = (l == 0) ? input : h[l - 1];
            for (int o = 0; o < out_sz; o++)
            {
                float d = delta[o];
                acc_gb[l][o] += d;
                for (int k = 0; k < in_sz; k++)
                    acc_gW[l][o * in_sz + k] += d * layer_input[k];
            }
            if (l > 0)
            {
                for (int k = 0; k < in_sz; k++)
                {
                    float sum = 0.0f;
                    for (int o = 0; o < out_sz; o++)
                        sum += W[l][o * in_sz + k] * delta[o];
                    new_delta[k] = h[l - 1][k] > 0.0f ? sum : 0.01f * sum;
                }
                float *tmp = delta;
                delta = new_delta;
                new_delta = tmp;
            }
        }
    }
    /*
     * Apply Adam update to all layers using pre-averaged gradients.
     * avg_gW[l], avg_gb[l]: gradient arrays (already divided by batch size).
     * Modifies W[], b[], mW[], mb[], vW[], vb[].
     */
    void adam_update_all(float **avg_gW, float **avg_gb, float lr, long long iter)
    {
        float lr_t = adam_lr_t(lr, iter);
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
        {
            adam_update(W[l], mW[l], vW[l], avg_gW[l], weight_out(l) * weight_in(l), lr_t);
            adam_update(b[l], mb[l], vb[l], avg_gb[l], weight_out(l), lr_t);
        }
    }
    /*
     * Save inference weights (and optional optimizer state) to binary file.
     * Format: magic(4) | version(4) | endian_check(4) | n_layers(4) | layer_sizes[n_layers] | weights
     * The top version bit (0x80000000) flags appended optimizer state.
     * Returns false on open failure or any short write (e.g. disk full) —
     * every fwrite and the final fclose are checked.
     */
    bool save(const char *path, bool save_optimizer = false) const
    {
        FILE *f = fopen(path, "wb");
        if (!f) return false;
        bool ok = true;
        uint32_t magic = MAGIC;
        uint32_t ver = VERSION | (save_optimizer && optimizer_allocated ? 0x80000000u : 0);
        float endian_check = 1.0f;
        uint32_t n_layers = (uint32_t)(n_hidden + 2);
        ok = ok && fwrite(&magic, 4, 1, f) == 1;
        ok = ok && fwrite(&ver, 4, 1, f) == 1;
        ok = ok && fwrite(&endian_check, 4, 1, f) == 1;
        ok = ok && fwrite(&n_layers, 4, 1, f) == 1;
        /* Layer sizes: [input_size, hidden[0], ..., hidden[n_hidden-1], OUTPUT_SIZE] */
        uint32_t lsz;
        lsz = (uint32_t)input_size;
        ok = ok && fwrite(&lsz, 4, 1, f) == 1;
        for (int i = 0; i < n_hidden; i++)
        {
            lsz = (uint32_t)hidden[i];
            ok = ok && fwrite(&lsz, 4, 1, f) == 1;
        }
        lsz = (uint32_t)OUTPUT_SIZE;
        ok = ok && fwrite(&lsz, 4, 1, f) == 1;
        /* Weights */
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
        {
            size_t szW = (size_t)weight_out(l) * (size_t)weight_in(l);
            size_t szb = (size_t)weight_out(l);
            ok = ok && fwrite(W[l], sizeof(float), szW, f) == szW;
            ok = ok && fwrite(b[l], sizeof(float), szb, f) == szb;
        }
        /* Optimizer state */
        if (ver & 0x80000000u)
        {
            ok = ok && fwrite(&t, sizeof(long long), 1, f) == 1;
            for (int l = 0; l < nw; l++)
            {
                size_t szW = (size_t)weight_out(l) * (size_t)weight_in(l);
                size_t szb = (size_t)weight_out(l);
                ok = ok && fwrite(mW[l], sizeof(float), szW, f) == szW;
                ok = ok && fwrite(mb[l], sizeof(float), szb, f) == szb;
                ok = ok && fwrite(vW[l], sizeof(float), szW, f) == szW;
                ok = ok && fwrite(vb[l], sizeof(float), szb, f) == szb;
            }
        }
        if (fclose(f) != 0) /* fclose flushes; a failed flush is a failed save */
            ok = false;
        return ok;
    }
    /*
     * Load inference weights from binary file.
     * Reconfigures architecture to match the file.
     * Returns false on any format error, version mismatch, allocation failure
     * or short read (including truncated optimizer state). Note: once the
     * header validates, the network is re-architected even if a later read
     * fails — the object stays allocated/consistent but its weights are
     * then undefined.
     */
    bool load(const char *path)
    {
        FILE *f = fopen(path, "rb");
        if (!f) return false;
        uint32_t magic, ver;
        float endian_check;
        if (fread(&magic, 4, 1, f) != 1 || magic != MAGIC)
        { fclose(f); return false; }
        if (fread(&ver, 4, 1, f) != 1)
        { fclose(f); return false; }
        uint32_t file_ver = ver & 0x7FFFFFFFu;
        bool has_optimizer = (ver & 0x80000000u) != 0;
        if (file_ver != VERSION) // Only the current format version is compatible
        { fclose(f); return false; }
        if (fread(&endian_check, 4, 1, f) != 1 || endian_check != 1.0f)
        { fclose(f); return false; }
        uint32_t n_layers;
        /* Bound n_layers so a corrupt header cannot trigger a huge malloc */
        if (fread(&n_layers, 4, 1, f) != 1 || n_layers < 2 || n_layers > MAX_FILE_LAYERS)
        { fclose(f); return false; }
        uint32_t *lsizes = (uint32_t *)malloc(n_layers * sizeof(uint32_t));
        if (!lsizes)
        { fclose(f); return false; }
        if (fread(lsizes, 4, n_layers, f) != n_layers)
        { free(lsizes); fclose(f); return false; }
        if (lsizes[n_layers - 1] != (uint32_t)OUTPUT_SIZE)
        { free(lsizes); fclose(f); return false; }
        /* Derive patch_r from input_size: must be FEAT_PER_PX * (odd side)^2 */
        int file_input = (int)lsizes[0];
        if (file_input % FEAT_PER_PX != 0)
        { free(lsizes); fclose(f); return false; }
        int ps = file_input / FEAT_PER_PX;
        int side = (int)roundf(sqrtf((float)ps));
        if (side * side != ps || side < 1 || (side & 1) == 0)
        { free(lsizes); fclose(f); return false; }
        int pr = (side - 1) / 2;
        int file_nh = (int)n_layers - 2;
        int *file_hidden = NULL;
        if (file_nh > 0)
        {
            file_hidden = (int *)malloc(file_nh * sizeof(int));
            if (!file_hidden)
            { free(lsizes); fclose(f); return false; }
            for (int i = 0; i < file_nh; i++)
                file_hidden[i] = (int)lsizes[i + 1];
        }
        free(lsizes);
        /* Reconfigure to match file */
        set_arch(pr, file_nh, file_hidden);
        free(file_hidden);
        size_t r = 0;
        int nw = num_weights();
        for (int l = 0; l < nw; l++)
        {
            r += fread(W[l], sizeof(float), weight_out(l) * weight_in(l), f);
            r += fread(b[l], sizeof(float), weight_out(l), f);
        }
        if (r != (size_t)total_weights()) // truncated weight section
        { fclose(f); return false; }
        if (has_optimizer)
        {
            alloc_optimizer();
            /* Expected items: t(1) + m/v for every weight and bias
             * = 1 + 2 * total_weights() */
            size_t expected = 1 + 2 * (size_t)total_weights();
            size_t r2 = 0;
            r2 += fread(&t, sizeof(long long), 1, f);
            for (int l = 0; l < nw; l++)
            {
                int szW = weight_out(l) * weight_in(l);
                int szb = weight_out(l);
                r2 += fread(mW[l], sizeof(float), szW, f);
                r2 += fread(mb[l], sizeof(float), szb, f);
                r2 += fread(vW[l], sizeof(float), szW, f);
                r2 += fread(vb[l], sizeof(float), szb, f);
            }
            if (r2 != expected) // truncated optimizer state: drop it, fail
            { free_optimizer(); fclose(f); return false; }
        }
        else
        {
            /* File carries no optimizer state: discard any stale moments so a
             * later training run starts from zero-initialized Adam buffers. */
            if (optimizer_allocated)
                free_optimizer();
        }
        fclose(f);
        return true;
    }
    /*
     * Save training checkpoint (weights + optimizer state).
     */
    bool save_checkpoint(const char *path) const
    {
        return save(path, true);
    }
    /*
     * Load from file — auto-detects checkpoint vs inference-only via the
     * optimizer flag bit in the version word. Reconfigures architecture
     * to match the file.
     */
    bool load_checkpoint(const char *path)
    {
        return load(path);
    }
};
#endif /* DHT_NN_NETWORK_H */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment