Skip to content

Instantly share code, notes, and snippets.

@jef-sure
Last active February 20, 2026 17:06
Show Gist options
  • Select an option

  • Save jef-sure/64fb93774aca34c92cd5b1807eabcddd to your computer and use it in GitHub Desktop.

Select an option

Save jef-sure/64fb93774aca34c92cd5b1807eabcddd to your computer and use it in GitHub Desktop.
/* -*- C++ -*-
* File: dht_nn_demosaic.cpp
* Copyright 2026 Anton Petrusevich
* Created: Tue Feb 10, 2026
*
* This code is licensed under one of two licenses as you choose:
*
* 1. GNU LESSER GENERAL PUBLIC LICENSE version 2.1
* (See file LICENSE.LGPL provided in LibRaw distribution archive for
* details).
*
* 2. COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0
* (See file LICENSE.CDDL provided in LibRaw distribution archive for
* details).
*
*/
#include "../../internal/dmp_include.h"
#include "../../internal/dht_nn_network.h"
#include "../../internal/dht_nn_cl.h"
#include <cmath>
#include <vector>
#include <algorithm>
#include <random>
#include <string>
/* Full-scale value of a 16-bit raw sample. */
const float RawChannelMaximum = 65535.0f;
/* Normalization divisor for the log2-encoded float domain: raw values are
 * mapped through log2 (see raw_to_float) and divided by 16 = log2(65536),
 * so the working range is approximately [0, 1]. */
const float FloatChannelMaximum = 16.0f;
/* Compose the weights/checkpoint filename for a network configuration,
 * e.g. "dht_nn_v10_p5_L4.weights" or "dht_nn_v10_p5_L4.ckpt".
 * The name encodes the format version, patch radius and hidden-layer
 * count so that incompatible files are never loaded by mistake. */
static std::string dht_nn_filename(const DHTNNNetwork &net, const char *ext)
{
  char name[128];
  snprintf(name, sizeof(name), "dht_nn_v%u_p%d_L%d.%s", DHTNNNetwork::VERSION,
           net.patch_r, net.n_hidden, ext);
  return std::string(name);
}
/* Convert a 16-bit raw sample into the normalized log2 working domain.
 * Values below RT go through a scaled sqrt instead of log2 so the curve
 * stays finite near zero and is continuous at raw == RT:
 * k * sqrt(8) == log2(8) == 3 with k = 3 / sqrt(8) = 1.06066...
 * Returns a value in roughly [0, 1] after division by FloatChannelMaximum.
 * Fix: the crossover constant RT was declared but the magic literal 8 was
 * used in the comparison; use RT (same value, same behavior). */
static inline float raw_to_float(ushort raw)
{
  const static ushort RT = 8;               // sqrt/log2 crossover point
  const static float k = 1.06066017177982f; // 3 / sqrt(8)
  return (raw < RT ? k * sqrtf(raw) : log2f(raw)) / FloatChannelMaximum;
}
/* Inverse of raw_to_float(): map a normalized log2-domain value back to a
 * 16-bit raw sample. Below the crossover (3.0 in un-normalized units) the
 * sqrt branch is inverted by squaring; above it, exp2 undoes the log2. */
static inline ushort float_to_raw(float val)
{
  const static float k = 1.06066017177982f;
  const static float threshold = 3.0f; // k * sqrt(8) = log2(8) = 3.0
  float u = val * FloatChannelMaximum;
  if (u < threshold)
    return (ushort)((u * u) / (k * k));
  return (ushort)exp2f(u);
}
/* Absolute difference of two log-domain values. */
static inline float calc_dist(float a, float b)
{
  return fabsf(b - a);
}
/* Reciprocal of a distance, capped at 1e6 for (near-)zero distances so
 * weight computations never divide by zero or overflow. */
static inline float safe_inv_dist(float d)
{
  if (d > 1e-6f)
    return 1.f / d;
  return 1e6f;
}
/*
 * Working state for the DHT-NN demosaic: a margin-padded float copy of the
 * Bayer mosaic (log2-normalized via raw_to_float()), a per-pixel direction
 * map, per-channel value bounds, and optional buffers used by the
 * neural-network refinement stages.
 */
struct DHTNNF
{
/* Padded working-buffer dimensions: image size plus 2*margin each way. */
int nr_height, nr_width;
static const int nr_topmargin = 4, nr_leftmargin = 4;
/* Padded working image, one float per RGB channel per pixel. */
float (*nraw)[3];
/* Per-channel extrema of the nonzero input samples (set in the ctor,
 * used to clamp interpolation results). */
float channel_maximum[3];
float channel_minimum[3];
LibRaw &libraw;
/* Per-pixel flags stored in ndir. */
enum
{
HVSH = 1,  /* sharp horizontal/vertical edge */
HOR = 2,   /* horizontal gradient direction */
VER = 4,   /* vertical gradient direction */
HORSH = HOR | HVSH,
VERSH = VER | HVSH,
DIASH = 8, /* sharp diagonal edge */
LURD = 16, /* left-up -> right-down diagonal */
RULD = 32, /* right-up -> left-down diagonal */
LURDSH = LURD | DIASH,
RULDSH = RULD | DIASH,
HOT = 64   /* hot (defective) pixel, see hide_hots() */
};
/* Hot-pixel confirmation: |a - b| > 0.375 in the normalized log2 domain
 * (0.375 * 16 = 6 stops of brightness difference).
 * NOTE(review): keeps the historical "Treshold" spelling. */
static inline bool TresholdHot(float a, float b) throw()
{
return fabsf(a - b) > 0.375f; // 6 stops between values
}
/* Sharpness threshold for the horizontal/vertical direction metric. */
static inline float Tg(void) throw()
{
return 0.5f; // 8 stops between values
}
static inline float T(void) throw()
{
// Diagonal edge sharpness threshold. In the original DHT algorithm (integer raw space),
// this was approximately sqrt(2) ≈ 1.4 representing a brightness ratio threshold.
// In normalized float space (0-1, log₂-encoded), 0.1 corresponds to ~1.6 stops or 3× brightness.
return 0.1f;
}
/* Per-pixel direction/state map (flags from the enum above). */
char *ndir;
float (*nn_feat)[16]; // precomputed per-pixel features: nraw[3], bayer[3], hue_h, hue_v, hue_change_h, hue_change_v, ld_ud, ld_lr, ld_lu, ld_ld, ld_ru, ld_rd
bool *nn_green_written; // true for pixels where nn_blend_green already wrote green (make_gline skips these)
/* Linear index of (row, col) inside the padded working buffers. */
inline int nr_offset(int row, int col) throw() { return (row * nr_width + col); }
/* Horizontal/vertical direction metric at a non-green site (known color
 * kc). Compares green-vs-color consistency above/below against
 * left/right; returns HOR or VER, OR-ed with HVSH when the two metrics
 * differ by more than Tg(). */
int get_hv_grb(int x, int y, int kc)
{
float hv1 = nraw[nr_offset(y - 1, x)][1] - (nraw[nr_offset(y - 2, x)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
float hv2 = nraw[nr_offset(y + 1, x)][1] - (nraw[nr_offset(y + 2, x)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
float kv = calc_dist(hv1, hv2) *
calc_dist(nraw[nr_offset(y, x)][kc] * nraw[nr_offset(y, x)][kc], (nraw[nr_offset(y - 2, x)][kc] * nraw[nr_offset(y + 2, x)][kc]));
// raise the metric to the 8th power to strongly favor the smoother axis
kv *= kv;
kv *= kv;
kv *= kv;
float dv = kv * calc_dist(nraw[nr_offset(y - 3, x)][1] * nraw[nr_offset(y + 3, x)][1], nraw[nr_offset(y - 1, x)][1] * nraw[nr_offset(y + 1, x)][1]);
float hh1 = nraw[nr_offset(y, x - 1)][1] - (nraw[nr_offset(y, x - 2)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
float hh2 = nraw[nr_offset(y, x + 1)][1] - (nraw[nr_offset(y, x + 2)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
float kh = calc_dist(hh1, hh2) *
calc_dist(nraw[nr_offset(y, x)][kc] * nraw[nr_offset(y, x)][kc], (nraw[nr_offset(y, x - 2)][kc] * nraw[nr_offset(y, x + 2)][kc]));
kh *= kh;
kh *= kh;
kh *= kh;
float dh = kh * calc_dist(nraw[nr_offset(y, x - 3)][1] * nraw[nr_offset(y, x + 3)][1], nraw[nr_offset(y, x - 1)][1] * nraw[nr_offset(y, x + 1)][1]);
float e = calc_dist(dh, dv);
char d = dh < dv ? (e > Tg() ? HORSH : HOR) : (e > Tg() ? VERSH : VER);
return d;
}
/* Same metric as get_hv_grb() but evaluated at a green site: hc is the
 * known non-green color on this row, hc ^ 2 the one on adjacent rows. */
int get_hv_rbg(int x, int y, int hc)
{
float hv1 = nraw[nr_offset(y - 1, x)][hc ^ 2] - (nraw[nr_offset(y - 2, x)][1] + nraw[nr_offset(y, x)][1]) * 0.5f;
float hv2 = nraw[nr_offset(y + 1, x)][hc ^ 2] - (nraw[nr_offset(y + 2, x)][1] + nraw[nr_offset(y, x)][1]) * 0.5f;
float kv =
calc_dist(hv1, hv2) * calc_dist(nraw[nr_offset(y, x)][1] * nraw[nr_offset(y, x)][1], (nraw[nr_offset(y - 2, x)][1] * nraw[nr_offset(y + 2, x)][1]));
// 8th power, as in get_hv_grb()
kv *= kv;
kv *= kv;
kv *= kv;
float dv = kv * calc_dist(nraw[nr_offset(y - 3, x)][hc ^ 2] * nraw[nr_offset(y + 3, x)][hc ^ 2],
nraw[nr_offset(y - 1, x)][hc ^ 2] * nraw[nr_offset(y + 1, x)][hc ^ 2]);
float hh1 = nraw[nr_offset(y, x - 1)][hc] - (nraw[nr_offset(y, x - 2)][1] + nraw[nr_offset(y, x)][1]) * 0.5f;
float hh2 = nraw[nr_offset(y, x + 1)][hc] - (nraw[nr_offset(y, x + 2)][1] + nraw[nr_offset(y, x)][1]) * 0.5f;
float kh =
calc_dist(hh1, hh2) * calc_dist(nraw[nr_offset(y, x)][1] * nraw[nr_offset(y, x)][1], (nraw[nr_offset(y, x - 2)][1] * nraw[nr_offset(y, x + 2)][1]));
kh *= kh;
kh *= kh;
kh *= kh;
float dh = kh * calc_dist(nraw[nr_offset(y, x - 3)][hc] * nraw[nr_offset(y, x + 3)][hc], nraw[nr_offset(y, x - 1)][hc] * nraw[nr_offset(y, x + 1)][hc]);
float e = calc_dist(dh, dv);
char d = dh < dv ? (e > Tg() ? HORSH : HOR) : (e > Tg() ? VERSH : VER);
return d;
}
/**
 * @brief Determines the optimal diagonal interpolation direction for non-green (red/blue) pixel locations.
 *
 * This function analyzes two competing diagonal directions at a Bayer pattern location where
 * a non-green color (red or blue, specified by kc) is known, to decide which diagonal path
 * provides the smoothest gradient for interpolation purposes.
 *
 * The two diagonal directions evaluated are:
 * - LURD (Left-Up to Right-Down): compares pixels at (y-1,x-1) and (y+1,x+1)
 * - RULD (Right-Up to Left-Down): compares pixels at (y-1,x+1) and (y+1,x-1)
 *
 * Direction selection methodology:
 * 1. Computes green-to-color ratios at diagonal neighbors to assess color consistency
 * 2. Multiplies ratio differences by geometric mean differences of green values
 * 3. Selects the direction with the SMALLER metric (smoother gradient)
 * 4. If the difference between metrics exceeds T() threshold, marks edge as "sharp" (DIASH flag)
 *
 * The T() threshold controls diagonal edge sharpness detection. In the original DHT algorithm
 * (integer raw space 0-65535), this was approximately sqrt(2) ≈ 1.4, representing a brightness
 * ratio threshold. In the current normalized float space (0-1, log₂-encoded), the equivalent
 * value needs recalculation based on the perceptual encoding used by raw_to_float().
 *
 * @param x Horizontal coordinate in the normalized raw array (includes margin offset)
 * @param y Vertical coordinate in the normalized raw array (includes margin offset)
 * @param kc Color channel index (0=red, 2=blue) of the known color at position (y,x)
 *
 * @return Direction flags: LURD or RULD for the selected diagonal, with DIASH bit set
 * if the edge is sharp (|dlurd - druld| > T())
 *
 * @see get_diag_rbg() — Diagonal direction detection for green pixel locations
 * @see get_hv_grb() — Horizontal/vertical direction detection for non-green pixels
 * @see T() — Threshold function for diagonal edge sharpness (currently 0.1 = ~1.6 stops)
 */
int get_diag_grb(int x, int y, int kc)
{
float hlu = nraw[nr_offset(y - 1, x - 1)][1] / nraw[nr_offset(y - 1, x - 1)][kc];
float hrd = nraw[nr_offset(y + 1, x + 1)][1] / nraw[nr_offset(y + 1, x + 1)][kc];
float dlurd = calc_dist(hlu, hrd) *
calc_dist(nraw[nr_offset(y - 1, x - 1)][1] * nraw[nr_offset(y + 1, x + 1)][1], nraw[nr_offset(y, x)][1] * nraw[nr_offset(y, x)][1]);
float druld = calc_dist(hlu, hrd) *
calc_dist(nraw[nr_offset(y - 1, x + 1)][1] * nraw[nr_offset(y + 1, x - 1)][1], nraw[nr_offset(y, x)][1] * nraw[nr_offset(y, x)][1]);
float e = calc_dist(dlurd, druld);
char d = druld < dlurd ? (e > T() ? RULDSH : RULD) : (e > T() ? LURDSH : LURD);
return d;
}
/* Diagonal direction metric at a green site: compares the products of the
 * two diagonal green pairs against the squared center green. The hue term
 * of get_diag_grb() is not needed here, so hc is unused. */
int get_diag_rbg(int x, int y, int /* hc */)
{
float dlurd = calc_dist(nraw[nr_offset(y - 1, x - 1)][1] * nraw[nr_offset(y + 1, x + 1)][1], nraw[nr_offset(y, x)][1] * nraw[nr_offset(y, x)][1]);
float druld = calc_dist(nraw[nr_offset(y - 1, x + 1)][1] * nraw[nr_offset(y + 1, x - 1)][1], nraw[nr_offset(y, x)][1] * nraw[nr_offset(y, x)][1]);
float e = calc_dist(dlurd, druld);
char d = druld < dlurd ? (e > T() ? RULDSH : RULD) : (e > T() ? LURDSH : LURD);
return d;
}
/* Softly compress a value ec that overshoots base (sqrt-shaped rolloff). */
static inline float scale_over(float ec, float base)
{
float s = base * .4f;
float o = ec - base;
return base + sqrt(s * (o + s)) - s;
}
/* Softly compress a value ec that undershoots base. */
static inline float scale_under(float ec, float base)
{
float s = base * .6f;
float o = base - ec;
return base - sqrt(s * (o + s)) + s;
}
~DHTNNF();
DHTNNF(LibRaw &_libraw);
/* Output / pipeline stages (implemented below in this file). */
void copy_to_image();
void make_greens();
void make_diag_dirs();
void make_hv_dirs();
void refine_hv_dirs(int i, int js, const char *ndir_src = NULL);
void refine_diag_dirs(int i, int js, const char *ndir_src = NULL);
void refine_ihv_dirs(int i, const char *ndir_src = NULL);
void refine_idiag_dirs(int i, const char *ndir_src = NULL);
/* Debug visualization helpers. */
void illustrate_dirs();
void illustrate_dline(int i);
void make_hv_dline(int i);
void make_diag_dline(int i);
void make_gline(int i);
void make_rbdiag(int i);
void make_rbhv(int i);
void make_rb();
void hide_hots();
void restore_hots();
/* Neural-network stages (implemented elsewhere; declarations only here). */
void build_nn_features();
void extract_patch(int i, int j, float *patch, int patch_r);
float compute_green_forced(int i, int j, int kc, bool use_ver);
void nn_refine_hv_dirs(DHTNNNetwork &net);
void nn_blend_green(DHTNNNetwork &net);
void train_nn(DHTNNNetwork &net, float lr, float (*gt_rgb)[3], int gt_w, int gt_h);
void benchmark_nn(DHTNNNetwork &net);
};
typedef float float3[3]; /* one RGB pixel: three contiguous floats */
/*
* информация о цветах копируется во float в общем то с одной целью -- чтобы
* вместо 0 можно было писать 0.5, что больше подходит для вычисления яркостной
* разницы. причина: в целых числах разница в 1 стоп составляет ряд 8,4,2,1,0 --
последнее число должно быть 0.5, которое непредставимо в целых числах. так же
* это изменение позволяет не думать о специальных случаях деления на ноль.
*
* альтернативное решение: умножить все данные на 2 и в младший бит внести 1.
* правда, всё равно придётся следить, чтобы при интерпретации зелёного цвета не
* получился 0 при округлении, иначе проблема при интерпретации синих и красных.
*
*/
/* Build the padded working buffers from the LibRaw image: convert each known
 * Bayer sample to the normalized log2 domain with raw_to_float(), store it in
 * its channel slot, and track per-channel min/max over nonzero samples. */
DHTNNF::DHTNNF(LibRaw &_libraw) : libraw(_libraw), nn_feat(NULL), nn_green_written(NULL)
{
nr_height = libraw.imgdata.sizes.iheight + nr_topmargin * 2;
nr_width = libraw.imgdata.sizes.iwidth + nr_leftmargin * 2;
// NOTE(review): malloc/calloc results are not checked; an allocation failure
// would crash on first write — confirm whether callers guard against this.
nraw = (float3 *)malloc(nr_height * nr_width * sizeof(float3));
int iwidth = libraw.imgdata.sizes.iwidth;
ndir = (char *)calloc(nr_height * nr_width, 1);
// start extrema inverted so any observed sample updates them
channel_maximum[0] = channel_maximum[1] = channel_maximum[2] = 0.0f;
channel_minimum[0] = channel_minimum[1] = channel_minimum[2] = 1.0f;
for (int i = 0; i < nr_height * nr_width; ++i)
nraw[i][0] = nraw[i][1] = nraw[i][2] = 0;
for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i)
{
// cache one period of CFA colors; assumes the pattern period divides 48
// columns (indexed below via j % 48) — TODO confirm for all sensor layouts
int col_cache[48];
for (int j = 0; j < 48; ++j)
{
int l = libraw.COLOR(i, j);
if (l == 3) l = 1; // both greens are equal
col_cache[j] = l;
}
for (int j = 0; j < iwidth; ++j)
{
int l = col_cache[j % 48];
unsigned short c = libraw.imgdata.image[i * iwidth + j][l];
float fv = raw_to_float(c);
nraw[nr_offset(i + nr_topmargin, j + nr_leftmargin)][l] = fv;
if (c != 0) // zero samples (masked / missing) are excluded from the extrema
{
if (fv > channel_maximum[l]) channel_maximum[l] = fv;
if (fv < channel_minimum[l]) channel_minimum[l] = fv;
}
}
}
}
/**
* @brief Detects and temporarily replaces hot (defective) pixels in the raw image data.
*
* Hot pixels are sensor defects that produce abnormally high or low values compared
* to their surroundings. This method identifies them by checking whether a pixel's
* value is a strict local extremum (either maximum or minimum) among all 8 of its
* nearest neighbors — including both same-color pixels at distance 2 and green
* pixels at distance 1 in the Bayer pattern.
*
* For each candidate hot pixel, an average of the 8 surrounding same-color pixels
* (at distance 2 in a cross+X pattern) is computed. If the difference between the
* pixel's value and this average exceeds the hot pixel threshold (0.375 in
* log-domain, corresponding to ~6 stops of brightness difference), the pixel is
* flagged with the HOT bit in the direction map.
*
* The flagged pixel's value is then replaced with an interpolated estimate: the
* method compares vertical and horizontal gradient consistency (using products of
* neighboring color and green values) and selects the direction with the smoother
* gradient. The replacement value is the average of the two same-color neighbors
* along that direction.
*
* This two-phase approach (non-green pixels first, then green pixels) processes
* each Bayer color plane independently. The original hot pixel values are preserved
* in the raw image buffer (libraw.imgdata.image) and can be restored later via
* @ref restore_hots() after demosaicing is complete, ensuring that the hot pixel
* suppression only aids the interpolation process without permanently altering
* the original captured data.
*
* @note This method is parallelized with OpenMP over image rows.
*
* @see restore_hots() — restores original values of pixels flagged as HOT.
* @see TresholdHot() — the threshold function used to confirm hot pixel candidates.
*/
void DHTNNF::hide_hots()
{
int iwidth = libraw.imgdata.sizes.iwidth;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i)
{
int js = libraw.COLOR(i, 0) & 1;
int kc = libraw.COLOR(i, js);
/*
 * js -- starting x parity that lands off the known green
 * kc -- the known (non-green) color at those sites
 */
/* Phase 1: red/blue sites of this row. */
for (int j = js; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
float c = nraw[nr_offset(y, x)][kc];
/* Candidate hot pixel: strict local extremum versus the 4 same-color
 * neighbors at distance 2 and the 4 green neighbors at distance 1. */
if ((c > nraw[nr_offset(y, x + 2)][kc] && c > nraw[nr_offset(y, x - 2)][kc] && c > nraw[nr_offset(y - 2, x)][kc] &&
c > nraw[nr_offset(y + 2, x)][kc] && c > nraw[nr_offset(y, x + 1)][1] && c > nraw[nr_offset(y, x - 1)][1] &&
c > nraw[nr_offset(y - 1, x)][1] && c > nraw[nr_offset(y + 1, x)][1]) ||
(c < nraw[nr_offset(y, x + 2)][kc] && c < nraw[nr_offset(y, x - 2)][kc] && c < nraw[nr_offset(y - 2, x)][kc] &&
c < nraw[nr_offset(y + 2, x)][kc] && c < nraw[nr_offset(y, x + 1)][1] && c < nraw[nr_offset(y, x - 1)][1] &&
c < nraw[nr_offset(y - 1, x)][1] && c < nraw[nr_offset(y + 1, x)][1]))
{
/* Average of the 8 surrounding same-color samples at distance 2. */
float avg = 0;
for (int k = -2; k < 3; k += 2)
for (int m = -2; m < 3; m += 2)
if (m == 0 && k == 0) continue;
else avg += nraw[nr_offset(y + k, x + m)][kc];
avg /= 8;
if (TresholdHot(c, avg))
{
ndir[nr_offset(y, x)] |= HOT;
/* Replace along the smoother axis (smaller cross-difference). */
float dv =
calc_dist(nraw[nr_offset(y - 2, x)][kc] * nraw[nr_offset(y - 1, x)][1], nraw[nr_offset(y + 2, x)][kc] * nraw[nr_offset(y + 1, x)][1]);
float dh =
calc_dist(nraw[nr_offset(y, x - 2)][kc] * nraw[nr_offset(y, x - 1)][1], nraw[nr_offset(y, x + 2)][kc] * nraw[nr_offset(y, x + 1)][1]);
if (dv > dh) nraw[nr_offset(y, x)][kc] = (nraw[nr_offset(y, x + 2)][kc] + nraw[nr_offset(y, x - 2)][kc]) / 2;
else nraw[nr_offset(y, x)][kc] = (nraw[nr_offset(y - 2, x)][kc] + nraw[nr_offset(y + 2, x)][kc]) / 2;
}
}
}
/* Phase 2: green sites of this row (opposite column parity). */
for (int j = js ^ 1; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
float c = nraw[nr_offset(y, x)][1];
/* Same extremum test; horizontal non-green neighbors carry kc, the
 * vertical ones carry the other color (kc ^ 2). */
if ((c > nraw[nr_offset(y, x + 2)][1] && c > nraw[nr_offset(y, x - 2)][1] && c > nraw[nr_offset(y - 2, x)][1] && c > nraw[nr_offset(y + 2, x)][1] &&
c > nraw[nr_offset(y, x + 1)][kc] && c > nraw[nr_offset(y, x - 1)][kc] && c > nraw[nr_offset(y - 1, x)][kc ^ 2] &&
c > nraw[nr_offset(y + 1, x)][kc ^ 2]) ||
(c < nraw[nr_offset(y, x + 2)][1] && c < nraw[nr_offset(y, x - 2)][1] && c < nraw[nr_offset(y - 2, x)][1] && c < nraw[nr_offset(y + 2, x)][1] &&
c < nraw[nr_offset(y, x + 1)][kc] && c < nraw[nr_offset(y, x - 1)][kc] && c < nraw[nr_offset(y - 1, x)][kc ^ 2] &&
c < nraw[nr_offset(y + 1, x)][kc ^ 2]))
{
float avg = 0;
for (int k = -2; k < 3; k += 2)
for (int m = -2; m < 3; m += 2)
if (k == 0 && m == 0) continue;
else avg += nraw[nr_offset(y + k, x + m)][1];
avg /= 8;
if (TresholdHot(c, avg))
{
ndir[nr_offset(y, x)] |= HOT;
float dv = calc_dist(nraw[nr_offset(y - 2, x)][1] * nraw[nr_offset(y - 1, x)][kc ^ 2],
nraw[nr_offset(y + 2, x)][1] * nraw[nr_offset(y + 1, x)][kc ^ 2]);
float dh =
calc_dist(nraw[nr_offset(y, x - 2)][1] * nraw[nr_offset(y, x - 1)][kc], nraw[nr_offset(y, x + 2)][1] * nraw[nr_offset(y, x + 1)][kc]);
if (dv > dh) nraw[nr_offset(y, x)][1] = (nraw[nr_offset(y, x + 2)][1] + nraw[nr_offset(y, x - 2)][1]) / 2;
else nraw[nr_offset(y, x)][1] = (nraw[nr_offset(y - 2, x)][1] + nraw[nr_offset(y + 2, x)][1]) / 2;
}
}
}
}
}
/* Undo hide_hots(): for every pixel flagged HOT, re-read the original raw
 * sample from libraw.imgdata.image and convert it back into nraw, so the
 * captured data is preserved after demosaicing. */
void DHTNNF::restore_hots()
{
  int iwidth = libraw.imgdata.sizes.iwidth;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided) collapse(2)
#endif
  for (int row = 0; row < libraw.imgdata.sizes.iheight; ++row)
  {
    for (int col = 0; col < iwidth; ++col)
    {
      int y = row + nr_topmargin;
      int x = col + nr_leftmargin;
      if (!(ndir[nr_offset(y, x)] & HOT))
        continue;
      int c = libraw.COLOR(row, col);
      if (c == 3)
        c = 1; // both greens share channel 1
      nraw[nr_offset(y, x)][c] = raw_to_float(libraw.imgdata.image[row * iwidth + col][c]);
    }
  }
}
void DHTNNF::make_diag_dirs()
{
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i)
{
make_diag_dline(i);
}
// #if defined(LIBRAW_USE_OPENMP)
// #pragma omp parallel for schedule(guided)
// #endif
// for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i) {
// refine_diag_dirs(i, i & 1);
// }
// #if defined(LIBRAW_USE_OPENMP)
// #pragma omp parallel for schedule(guided)
// #endif
// for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i) {
// refine_diag_dirs(i, (i & 1) ^ 1);
// }
char *snap = (char *)malloc(nr_height * nr_width);
memcpy(snap, ndir, nr_height * nr_width);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i)
{
refine_idiag_dirs(i, snap);
}
free(snap);
}
void DHTNNF::make_hv_dirs()
{
int iheight = libraw.imgdata.sizes.iheight;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < iheight; ++i)
{
make_hv_dline(i);
}
/* Refinement passes use a snapshot for reads to avoid data races
* when parallelized: each thread reads neighbor directions from the
* snapshot while writing updated directions to ndir (Jacobi iteration). */
char *snap = (char *)malloc(nr_height * nr_width);
memcpy(snap, ndir, nr_height * nr_width);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < iheight; ++i)
{
refine_hv_dirs(i, i & 1, snap);
}
memcpy(snap, ndir, nr_height * nr_width);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < iheight; ++i)
{
refine_hv_dirs(i, (i & 1) ^ 1, snap);
}
memcpy(snap, ndir, nr_height * nr_width);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < iheight; ++i)
{
refine_ihv_dirs(i, snap);
}
free(snap);
}
void DHTNNF::refine_hv_dirs(int i, int js, const char *ndir_src)
{
const char *nd = ndir_src ? ndir_src : ndir;
int iwidth = libraw.imgdata.sizes.iwidth;
for (int j = js; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
if (nd[nr_offset(y, x)] & HVSH) continue;
int nv = (nd[nr_offset(y - 1, x)] & VER) + (nd[nr_offset(y + 1, x)] & VER) + (nd[nr_offset(y, x - 1)] & VER) + (nd[nr_offset(y, x + 1)] & VER);
int nh = (nd[nr_offset(y - 1, x)] & HOR) + (nd[nr_offset(y + 1, x)] & HOR) + (nd[nr_offset(y, x - 1)] & HOR) + (nd[nr_offset(y, x + 1)] & HOR);
bool codir = (nd[nr_offset(y, x)] & VER) ? ((nd[nr_offset(y - 1, x)] & VER) || (nd[nr_offset(y + 1, x)] & VER))
: ((nd[nr_offset(y, x - 1)] & HOR) || (nd[nr_offset(y, x + 1)] & HOR));
nv /= VER;
nh /= HOR;
if ((nd[nr_offset(y, x)] & VER) && (nh > 2 && !codir))
{
ndir[nr_offset(y, x)] &= ~VER;
ndir[nr_offset(y, x)] |= HOR;
}
if ((nd[nr_offset(y, x)] & HOR) && (nv > 2 && !codir))
{
ndir[nr_offset(y, x)] &= ~HOR;
ndir[nr_offset(y, x)] |= VER;
}
}
}
void DHTNNF::refine_ihv_dirs(int i, const char *ndir_src)
{
const char *nd = ndir_src ? ndir_src : ndir;
int iwidth = libraw.imgdata.sizes.iwidth;
for (int j = 0; j < iwidth; j++)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
if (nd[nr_offset(y, x)] & HVSH) continue;
int nv = (nd[nr_offset(y - 1, x)] & VER) + (nd[nr_offset(y + 1, x)] & VER) + (nd[nr_offset(y, x - 1)] & VER) + (nd[nr_offset(y, x + 1)] & VER);
int nh = (nd[nr_offset(y - 1, x)] & HOR) + (nd[nr_offset(y + 1, x)] & HOR) + (nd[nr_offset(y, x - 1)] & HOR) + (nd[nr_offset(y, x + 1)] & HOR);
nv /= VER;
nh /= HOR;
if ((nd[nr_offset(y, x)] & VER) && nh > 3)
{
ndir[nr_offset(y, x)] &= ~VER;
ndir[nr_offset(y, x)] |= HOR;
}
if ((nd[nr_offset(y, x)] & HOR) && nv > 3)
{
ndir[nr_offset(y, x)] &= ~HOR;
ndir[nr_offset(y, x)] |= VER;
}
}
}
/*
 * For every interpolation point we compute an oriented gradient metric in
 * both (horizontal and vertical) directions and compare them. If the metric
 * in one direction is significantly larger than in the other, the point is
 * marked as oriented in the smoother direction; if the metrics differ by
 * more than the sharpness threshold, the point is additionally marked sharp.
 *
 * The gradient metric uses products of neighboring color and green values,
 * which accounts for both brightness and color changes. The larger the
 * difference between the two directional metrics, the more confidently the
 * gradient orientation can be determined.
 *
 * This direction information is used later, during interpolation, to choose
 * the best way to reconstruct the missing color channels, preserving detail
 * and reducing demosaic artifacts.
 */
void DHTNNF::make_hv_dline(int i)
{
int iwidth = libraw.imgdata.sizes.iwidth;
int js = libraw.COLOR(i, 0) & 1;
int kc = libraw.COLOR(i, js);
/*
 * js -- starting x parity that lands off the known green
 * kc -- the known (non-green) color at the interpolation point
 */
for (int j = 0; j < iwidth; j++)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
char d = 0;
if ((j & 1) == js)
{
d = get_hv_grb(x, y, kc); // non-green site
}
else
{
d = get_hv_rbg(x, y, kc); // green site
}
ndir[nr_offset(y, x)] |= d;
}
}
/* Compute diagonal gradient direction flags (LURD/RULD, plus DIASH when
 * sharp) for every pixel of row i and OR them into ndir. Non-green sites
 * use get_diag_grb(), green sites use get_diag_rbg(). */
void DHTNNF::make_diag_dline(int i)
{
  int iwidth = libraw.imgdata.sizes.iwidth;
  int js = libraw.COLOR(i, 0) & 1; // column parity that misses the known green
  int kc = libraw.COLOR(i, js);    // known non-green color on this row
  int y = i + nr_topmargin;
  for (int j = 0; j < iwidth; j++)
  {
    int x = j + nr_leftmargin;
    char d;
    if ((j & 1) == js)
      d = (char)get_diag_grb(x, y, kc);
    else
      d = (char)get_diag_rbg(x, y, kc);
    ndir[nr_offset(y, x)] |= d;
  }
}
/* One checkerboard pass (column phase js) of diagonal direction refinement
 * for row i: a non-sharp pixel whose 8-neighborhood strongly votes
 * (count > 4) for the opposite diagonal -- and which has no co-oriented
 * neighbor along its own diagonal -- gets flipped. Neighbor reads come from
 * ndir_src (snapshot) when provided; writes go to ndir. */
void DHTNNF::refine_diag_dirs(int i, int js, const char *ndir_src)
{
const char *nd = ndir_src ? ndir_src : ndir;
int iwidth = libraw.imgdata.sizes.iwidth;
for (int j = js; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
if (nd[nr_offset(y, x)] & DIASH) continue;
/* Count LURD and RULD votes among all 8 neighbors. */
int nv = (nd[nr_offset(y - 1, x)] & LURD) + (nd[nr_offset(y + 1, x)] & LURD) + (nd[nr_offset(y, x - 1)] & LURD) + (nd[nr_offset(y, x + 1)] & LURD) +
(nd[nr_offset(y - 1, x - 1)] & LURD) + (nd[nr_offset(y - 1, x + 1)] & LURD) + (nd[nr_offset(y + 1, x - 1)] & LURD) +
(nd[nr_offset(y + 1, x + 1)] & LURD);
int nh = (nd[nr_offset(y - 1, x)] & RULD) + (nd[nr_offset(y + 1, x)] & RULD) + (nd[nr_offset(y, x - 1)] & RULD) + (nd[nr_offset(y, x + 1)] & RULD) +
(nd[nr_offset(y - 1, x - 1)] & RULD) + (nd[nr_offset(y - 1, x + 1)] & RULD) + (nd[nr_offset(y + 1, x - 1)] & RULD) +
(nd[nr_offset(y + 1, x + 1)] & RULD);
/* Does either neighbor along this pixel's own diagonal agree with it? */
bool codir = (nd[nr_offset(y, x)] & LURD) ? ((nd[nr_offset(y - 1, x - 1)] & LURD) || (nd[nr_offset(y + 1, x + 1)] & LURD))
: ((nd[nr_offset(y - 1, x + 1)] & RULD) || (nd[nr_offset(y + 1, x - 1)] & RULD));
nv /= LURD; // convert bit sums to neighbor counts
nh /= RULD;
if ((nd[nr_offset(y, x)] & LURD) && (nh > 4 && !codir))
{
ndir[nr_offset(y, x)] &= ~LURD;
ndir[nr_offset(y, x)] |= RULD;
}
if ((nd[nr_offset(y, x)] & RULD) && (nv > 4 && !codir))
{
ndir[nr_offset(y, x)] &= ~RULD;
ndir[nr_offset(y, x)] |= LURD;
}
}
}
/* Isolated-pixel diagonal cleanup for row i: flip a non-sharp pixel's
 * diagonal only when all 8 neighbors vote for the opposite one (count > 7).
 * Unlike refine_diag_dirs(), no co-direction escape hatch applies and every
 * column is visited. Reads from ndir_src (snapshot) when provided. */
void DHTNNF::refine_idiag_dirs(int i, const char *ndir_src)
{
const char *nd = ndir_src ? ndir_src : ndir;
int iwidth = libraw.imgdata.sizes.iwidth;
for (int j = 0; j < iwidth; j++)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
if (nd[nr_offset(y, x)] & DIASH) continue;
int nv = (nd[nr_offset(y - 1, x)] & LURD) + (nd[nr_offset(y + 1, x)] & LURD) + (nd[nr_offset(y, x - 1)] & LURD) + (nd[nr_offset(y, x + 1)] & LURD) +
(nd[nr_offset(y - 1, x - 1)] & LURD) + (nd[nr_offset(y - 1, x + 1)] & LURD) + (nd[nr_offset(y + 1, x - 1)] & LURD) +
(nd[nr_offset(y + 1, x + 1)] & LURD);
int nh = (nd[nr_offset(y - 1, x)] & RULD) + (nd[nr_offset(y + 1, x)] & RULD) + (nd[nr_offset(y, x - 1)] & RULD) + (nd[nr_offset(y, x + 1)] & RULD) +
(nd[nr_offset(y - 1, x - 1)] & RULD) + (nd[nr_offset(y - 1, x + 1)] & RULD) + (nd[nr_offset(y + 1, x - 1)] & RULD) +
(nd[nr_offset(y + 1, x + 1)] & RULD);
nv /= LURD; // convert bit sums to neighbor counts
nh /= RULD;
if ((nd[nr_offset(y, x)] & LURD) && nh > 7)
{
ndir[nr_offset(y, x)] &= ~LURD;
ndir[nr_offset(y, x)] |= RULD;
}
if ((nd[nr_offset(y, x)] & RULD) && nv > 7)
{
ndir[nr_offset(y, x)] &= ~RULD;
ndir[nr_offset(y, x)] |= LURD;
}
}
}
/*
 * Interpolate the missing green values at every red/blue site,
 * one image row at a time (rows are independent, so the loop is
 * parallelized when OpenMP is available).
 */
void DHTNNF::make_greens()
{
  int iheight = libraw.imgdata.sizes.iheight;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
  for (int row = 0; row < iheight; ++row)
  {
    make_gline(row);
  }
}
/* Interpolate the missing green at every non-green site of row i: blend the
 * two green neighbors along the direction chosen in ndir, weighting by the
 * inverse distance of the same-color samples, then clamp the result to the
 * local neighbor range (with a small log-domain margin) and to the global
 * green channel bounds. */
void DHTNNF::make_gline(int i)
{
int iwidth = libraw.imgdata.sizes.iwidth;
int js = libraw.COLOR(i, 0) & 1;
int kc = libraw.COLOR(i, js);
/*
 * js -- starting x parity that lands off the known green
 * kc -- the known (non-green) color at the interpolation point
 */
for (int j = js; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
/* Skip pixels where nn_blend_green already wrote the blended green */
if (nn_green_written && nn_green_written[i * iwidth + j]) continue;
int dx, dy, dx2, dy2;
float d1, d2;
if (ndir[nr_offset(y, x)] & VER)
{
dx = dx2 = 0;
dy = -1;
dy2 = 1;
// log-space: difference = green − geometric mean of colors
d1 = nraw[nr_offset(y - 1, x)][1] - (nraw[nr_offset(y - 2, x)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
d2 = nraw[nr_offset(y + 1, x)][1] - (nraw[nr_offset(y + 2, x)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
}
else
{
dy = dy2 = 0;
dx = 1;
dx2 = -1;
d1 = nraw[nr_offset(y, x + 1)][1] - (nraw[nr_offset(y, x + 2)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
d2 = nraw[nr_offset(y, x - 1)][1] - (nraw[nr_offset(y, x - 2)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
}
/* Squared inverse-distance weights from the same-color neighbors. */
float b1 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][kc], nraw[nr_offset(y + dy * 2, x + dx * 2)][kc]));
float b2 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][kc], nraw[nr_offset(y + dy2 * 2, x + dx2 * 2)][kc]));
b1 *= b1;
b2 *= b2;
// log-space: sum replaces product
float eg = nraw[nr_offset(y, x)][kc] + (b1 * d1 + b2 * d2) / (b1 + b2);
/* Clamp to the range of the two green neighbors, widened slightly. */
float min, max;
min = MIN(nraw[nr_offset(y + dy, x + dx)][1], nraw[nr_offset(y + dy2, x + dx2)][1]);
max = MAX(nraw[nr_offset(y + dy, x + dx)][1], nraw[nr_offset(y + dy2, x + dx2)][1]);
// log-space: ×1.2 → +log₂(1.2)/16, ÷1.2 → −log₂(1.2)/16
static const float margin = log2f(1.2f) / FloatChannelMaximum;
min -= margin;
max += margin;
if (eg < min) eg = min;
else if (eg > max) eg = max;
/* Final clamp to the observed global green bounds. */
if (eg > channel_maximum[1]) eg = channel_maximum[1];
else if (eg < channel_minimum[1]) eg = channel_minimum[1];
nraw[nr_offset(y, x)][1] = eg;
}
}
/*
 * Debug function: overwrite the working image with a visualization of the
 * per-pixel direction/HOT flags, one row at a time.
 */
void DHTNNF::illustrate_dirs()
{
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < libraw.imgdata.sizes.iheight; ++i)
{
illustrate_dline(i);
}
}
/* Debug visualization of one row: paint every pixel mid-gray, then tint HOT
 * pixels red and all others blue. Note: l is first computed from the DIASH
 * bit but then overridden to 1, so currently the tint intensity is constant
 * (deliberate debug shortcut; the original `l >>= 3` mapping is disabled). */
void DHTNNF::illustrate_dline(int i)
{
int iwidth = libraw.imgdata.sizes.iwidth;
for (int j = 0; j < iwidth; j++)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
nraw[nr_offset(y, x)][0] = nraw[nr_offset(y, x)][1] = nraw[nr_offset(y, x)][2] = 0.5;
int l = ndir[nr_offset(y, x)] & 8;
// l >>= 3; // disabled: would map the DIASH bit to 0/1
l = 1; // debug override: constant intensity, DIASH bit ignored
if (ndir[nr_offset(y, x)] & HOT) nraw[nr_offset(y, x)][0] = l * channel_maximum[0] / 4.f + channel_maximum[0] / 4.f;
else nraw[nr_offset(y, x)][2] = l * channel_maximum[2] / 4.f + channel_maximum[2] / 4.f;
}
}
/*
 * Red and blue interpolation.
 *
 * First, the missing non-green color is interpolated at sites whose known
 * same-color neighbors lie along the diagonals; after that the situation
 * reduces to the same case as the green interpolation (see make_rbhv).
 */
void DHTNNF::make_rbdiag(int i)
{
int iwidth = libraw.imgdata.sizes.iwidth;
int js = libraw.COLOR(i, 0) & 1;
int uc = libraw.COLOR(i, js);
int cl = uc ^ 2;
/*
 * js -- starting x parity that lands on an already interpolated green
 * uc -- the known color (other than green) at the interpolation point
 * cl -- the unknown color (uc ^ 2)
 */
for (int j = js; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
/* Pick the diagonal chosen by the direction map. */
int dx, dy, dx2, dy2;
if (ndir[nr_offset(y, x)] & LURD)
{
dx = -1;
dx2 = 1;
dy = -1;
dy2 = 1;
}
else
{
dx = -1;
dx2 = 1;
dy = 1;
dy2 = -1;
}
/* Cubed inverse-distance weights from the green channel. */
float g1 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][1], nraw[nr_offset(y + dy, x + dx)][1]));
float g2 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][1], nraw[nr_offset(y + dy2, x + dx2)][1]));
g1 *= g1 * g1;
g2 *= g2 * g2;
// log-space: C/G ratio → difference, G×ratio → sum
float d1 = nraw[nr_offset(y + dy, x + dx)][cl] - nraw[nr_offset(y + dy, x + dx)][1];
float d2 = nraw[nr_offset(y + dy2, x + dx2)][cl] - nraw[nr_offset(y + dy2, x + dx2)][1];
float eg = nraw[nr_offset(y, x)][1] + (g1 * d1 + g2 * d2) / (g1 + g2);
/* Clamp to the diagonal-neighbor range (widened), then global bounds. */
float min, max;
min = MIN(nraw[nr_offset(y + dy, x + dx)][cl], nraw[nr_offset(y + dy2, x + dx2)][cl]);
max = MAX(nraw[nr_offset(y + dy, x + dx)][cl], nraw[nr_offset(y + dy2, x + dx2)][cl]);
static const float margin = log2f(1.2f) / FloatChannelMaximum;
min -= margin;
max += margin;
if (eg < min) eg = min;
else if (eg > max) eg = max;
if (eg > channel_maximum[cl]) eg = channel_maximum[cl];
else if (eg < channel_minimum[cl]) eg = channel_minimum[cl];
eg = std::max(0.f, std::min(1.f, eg));
nraw[nr_offset(y, x)][cl] = eg;
}
}
/*
 * Interpolation of red and blue at sites where only green was known;
 * the directions here are horizontal or vertical.
 */
void DHTNNF::make_rbhv(int i)
{
int iwidth = libraw.imgdata.sizes.iwidth;
int js = (libraw.COLOR(i, 0) & 1) ^ 1;
for (int j = js; j < iwidth; j += 2)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
/*
 * Since all the required red and blue values above/below and left/right
 * already exist (filled by make_rbdiag), the best direction can be chosen
 * using information from both colors.
 */
int dx, dy, dx2, dy2;
if (ndir[nr_offset(y, x)] & VER)
{
dx = dx2 = 0;
dy = -1;
dy2 = 1;
}
else
{
dy = dy2 = 0;
dx = 1;
dx2 = -1;
}
/* Squared inverse-distance weights from the green channel. */
float g1 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][1], nraw[nr_offset(y + dy, x + dx)][1]));
float g2 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][1], nraw[nr_offset(y + dy2, x + dx2)][1]));
g1 *= g1;
g2 *= g2;
// log-space: R/G and B/G ratios → differences, G×ratio → sum
float d1_r = nraw[nr_offset(y + dy, x + dx)][0] - nraw[nr_offset(y + dy, x + dx)][1];
float d2_r = nraw[nr_offset(y + dy2, x + dx2)][0] - nraw[nr_offset(y + dy2, x + dx2)][1];
float d1_b = nraw[nr_offset(y + dy, x + dx)][2] - nraw[nr_offset(y + dy, x + dx)][1];
float d2_b = nraw[nr_offset(y + dy2, x + dx2)][2] - nraw[nr_offset(y + dy2, x + dx2)][1];
float eg_r = nraw[nr_offset(y, x)][1] + (g1 * d1_r + g2 * d2_r) / (g1 + g2);
float eg_b = nraw[nr_offset(y, x)][1] + (g1 * d1_b + g2 * d2_b) / (g1 + g2);
/* Clamp both estimates to the neighbor range (widened), then global bounds. */
float min_r, max_r;
min_r = MIN(nraw[nr_offset(y + dy, x + dx)][0], nraw[nr_offset(y + dy2, x + dx2)][0]);
max_r = MAX(nraw[nr_offset(y + dy, x + dx)][0], nraw[nr_offset(y + dy2, x + dx2)][0]);
float min_b, max_b;
min_b = MIN(nraw[nr_offset(y + dy, x + dx)][2], nraw[nr_offset(y + dy2, x + dx2)][2]);
max_b = MAX(nraw[nr_offset(y + dy, x + dx)][2], nraw[nr_offset(y + dy2, x + dx2)][2]);
static const float margin = log2f(1.2f) / FloatChannelMaximum;
min_r -= margin;
max_r += margin;
min_b -= margin;
max_b += margin;
if (eg_r < min_r) eg_r = min_r;
else if (eg_r > max_r) eg_r = max_r;
if (eg_b < min_b) eg_b = min_b;
else if (eg_b > max_b) eg_b = max_b;
if (eg_r > channel_maximum[0]) eg_r = channel_maximum[0];
else if (eg_r < channel_minimum[0]) eg_r = channel_minimum[0];
if (eg_b > channel_maximum[2]) eg_b = channel_maximum[2];
else if (eg_b < channel_minimum[2]) eg_b = channel_minimum[2];
eg_r = std::max(0.f, std::min(1.f, eg_r));
eg_b = std::max(0.f, std::min(1.f, eg_b));
nraw[nr_offset(y, x)][0] = eg_r;
nraw[nr_offset(y, x)][2] = eg_b;
}
}
/*
 * Interpolate red and blue over the whole image: first the diagonal pass
 * (R at B sites and vice versa), then the horizontal/vertical pass
 * (R/B at original green sites). Both passes are row-parallel.
 */
void DHTNNF::make_rb()
{
  const int rows = libraw.imgdata.sizes.iheight;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
  for (int row = 0; row < rows; ++row)
    make_rbdiag(row);
  /* The implicit barrier at the end of the first parallel-for guarantees
   * every diagonal result is in place before make_rbhv reads it. */
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
  for (int row = 0; row < rows; ++row)
    make_rbhv(row);
}
/*
 * Transfer the demosaiced image into the LibRaw output array.
 * Non-finite values are repaired (G falls back to 0, R/B fall back to G),
 * every channel is clamped to [0,1] and converted back to raw ushort.
 */
void DHTNNF::copy_to_image()
{
  int iwidth = libraw.imgdata.sizes.iwidth;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided) collapse(2)
#endif
  for (int row = 0; row < libraw.imgdata.sizes.iheight; ++row)
  {
    for (int col = 0; col < iwidth; ++col)
    {
      int src = nr_offset(row + nr_topmargin, col + nr_leftmargin);
      /* Repair NaN/Inf: green first, then red/blue inherit the fixed green */
      float g = nraw[src][1];
      if (!std::isfinite(g))
        g = 0.0f;
      float r = std::isfinite(nraw[src][0]) ? nraw[src][0] : g;
      float b = std::isfinite(nraw[src][2]) ? nraw[src][2] : g;
      /* Clamp to the legal perceptual-float domain */
      r = std::min(1.0f, std::max(0.0f, r));
      g = std::min(1.0f, std::max(0.0f, g));
      b = std::min(1.0f, std::max(0.0f, b));
      ushort *dst = libraw.imgdata.image[row * iwidth + col];
      dst[0] = float_to_raw(r);
      dst[2] = float_to_raw(b);
      /* Both green channels (G and G2) receive the same value */
      dst[1] = dst[3] = float_to_raw(g);
    }
  }
}
/* Release every buffer owned by the demosaic context.
 * free(NULL) is a no-op, so members that were never allocated
 * (e.g. nn_feat / nn_green_written on the rule-based path) are safe. */
DHTNNF::~DHTNNF()
{
free(nraw);
free(ndir);
free(nn_feat);
free(nn_green_written);
}
/*
 * Precompute per-pixel 16-float feature vectors for the entire image.
 * nn_feat[off][0..2] = nraw[off][0..2]
 * nn_feat[off][3..5] = one-hot Bayer channel (R, G, B)
 * nn_feat[off][6] = hue_h: horizontal hue at center G-mean(KC)
 * nn_feat[off][7] = hue_v: vertical hue at center G-mean(KC)
 * nn_feat[off][8] = hue_change_h: |hue(left)-hue(right)| horizontal hue consistency
 * nn_feat[off][9] = hue_change_v: |hue(top)-hue(bottom)| vertical hue consistency
 * nn_feat[off][10] = ld_ud: hue_aprox(g_u,g_d, c0,c_u2,c_d2) — cardinal vertical
 * nn_feat[off][11] = ld_lr: hue_aprox(g_l,g_r, c0,c_l2,c_r2) — cardinal horizontal
 * nn_feat[off][12] = ld_lu: hue_aprox(g_l,g_u, c0,c_l2,c_u2) — diagonal left-up
 * nn_feat[off][13] = ld_ld: hue_aprox(g_l,g_d, c0,c_l2,c_d2) — diagonal left-down
 * nn_feat[off][14] = ld_ru: hue_aprox(g_r,g_u, c0,c_r2,c_u2) — diagonal right-up
 * nn_feat[off][15] = ld_rd: hue_aprox(g_r,g_d, c0,c_r2,c_d2) — diagonal right-down
 *
 * Must be called (or re-called) whenever ndir changes before NN inference/training.
 */
void DHTNNF::build_nn_features()
{
int npix = nr_height * nr_width;
int iwidth = libraw.imgdata.sizes.iwidth;
int iheight = libraw.imgdata.sizes.iheight;
/* NOTE(review): malloc result is not checked; the memset below would
 * crash on OOM — consider a guard, matching the rest of the file. */
if (!nn_feat) nn_feat = (float (*)[16])malloc(npix * 16 * sizeof(float));
/* Zero out (border pixels keep zeros) */
memset(nn_feat, 0, npix * 16 * sizeof(float));
/* Fill the active image area */
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < iheight; i++)
{
/* Bayer pattern repeats every 2 rows and columns;
 * precompute a row-local cache of COLOR for this row.
 * 48 columns cover any period LibRaw's COLOR() can have. */
int col_cache[48];
for (int c = 0; c < 48; c++)
{
int ch = libraw.COLOR(i, c);
if (ch == 3) ch = 1; /* fold G2 into G */
col_cache[c] = ch;
}
for (int j = 0; j < iwidth; j++)
{
int off = nr_offset(i + nr_topmargin, j + nr_leftmargin);
float *f = nn_feat[off];
/* f[0..2]: current (possibly partially interpolated) RGB */
f[0] = nraw[off][0];
f[1] = nraw[off][1];
f[2] = nraw[off][2];
/* f[3..5]: one-hot encoding of the Bayer channel at this site */
int ch = col_cache[j % 48];
f[3] = (ch == 0) ? 1.0f : 0.0f;
f[4] = (ch == 1) ? 1.0f : 0.0f;
f[5] = (ch == 2) ? 1.0f : 0.0f;
/* Hue features in log-domain: G - mean(KC) along each direction.
 * In log space, subtraction = ratio, and (a+b)/2 ≈ log(geomean).
 * For green pixels: G_center - (ng_left + ng_right) / 2
 * For non-green: (G_left + G_right) / 2 - KC_center */
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
if (ch == 1) {
/* Green pixel: non-green neighbors at distance 1.
 * R and B slots are mutually exclusive on a Bayer neighbor,
 * so [0]+[2] picks up whichever channel is present. */
float ng_l = nraw[nr_offset(y, x - 1)][0] + nraw[nr_offset(y, x - 1)][2];
float ng_r = nraw[nr_offset(y, x + 1)][0] + nraw[nr_offset(y, x + 1)][2];
float ng_u = nraw[nr_offset(y - 1, x)][0] + nraw[nr_offset(y - 1, x)][2];
float ng_d = nraw[nr_offset(y + 1, x)][0] + nraw[nr_offset(y + 1, x)][2];
f[6] = nraw[off][1] - (ng_l + ng_r) * 0.5f;
f[7] = nraw[off][1] - (ng_u + ng_d) * 0.5f;
} else {
/* Non-green pixel: green neighbors at distance 1, known color at center */
f[6] = (nraw[nr_offset(y, x - 1)][1] + nraw[nr_offset(y, x + 1)][1]) * 0.5f
- nraw[off][ch];
f[7] = (nraw[nr_offset(y - 1, x)][1] + nraw[nr_offset(y + 1, x)][1]) * 0.5f
- nraw[off][ch];
}
/* Hue CHANGE: |hue(neighbor1) - hue(neighbor2)| along each axis.
 * Measures hue consistency in horizontal / vertical directions.
 * Same calculation as calc_dist(hh1,hh2) / calc_dist(hv1,hv2) in get_hv_grb/rbg. */
if (ch == 1) {
/* Green center: non-green at ±1, green at ±2 (same parity as center) */
float kc_l = nraw[nr_offset(y, x - 1)][0] + nraw[nr_offset(y, x - 1)][2];
float kc_r = nraw[nr_offset(y, x + 1)][0] + nraw[nr_offset(y, x + 1)][2];
float hh1 = kc_l - (nraw[nr_offset(y, x - 2)][1] + nraw[off][1]) * 0.5f;
float hh2 = kc_r - (nraw[off][1] + nraw[nr_offset(y, x + 2)][1]) * 0.5f;
f[8] = fabsf(hh1 - hh2);
float kc_u = nraw[nr_offset(y - 1, x)][0] + nraw[nr_offset(y - 1, x)][2];
float kc_d = nraw[nr_offset(y + 1, x)][0] + nraw[nr_offset(y + 1, x)][2];
float hv1 = kc_u - (nraw[nr_offset(y - 2, x)][1] + nraw[off][1]) * 0.5f;
float hv2 = kc_d - (nraw[off][1] + nraw[nr_offset(y + 2, x)][1]) * 0.5f;
f[9] = fabsf(hv1 - hv2);
} else {
/* Non-green center: green at ±1, same ch at ±2 */
float hh1 = nraw[nr_offset(y, x - 1)][1] - (nraw[nr_offset(y, x - 2)][ch] + nraw[off][ch]) * 0.5f;
float hh2 = nraw[nr_offset(y, x + 1)][1] - (nraw[off][ch] + nraw[nr_offset(y, x + 2)][ch]) * 0.5f;
f[8] = fabsf(hh1 - hh2);
float hv1 = nraw[nr_offset(y - 1, x)][1] - (nraw[nr_offset(y - 2, x)][ch] + nraw[off][ch]) * 0.5f;
float hv2 = nraw[nr_offset(y + 1, x)][1] - (nraw[off][ch] + nraw[nr_offset(y + 2, x)][ch]) * 0.5f;
f[9] = fabsf(hv1 - hv2);
}
/* Features f[10..15]: all 6 LD (lightness-directed) hue_aprox directions.
 * hue_aprox(G1,G2,C0,C1,C2) = (G1+G2)/2 - (C1+C2-2*C0)/4
 * G1,G2 = green at ±1 neighbors along the two chosen directions.
 * C1,C2 = same-channel values at ±2 in those directions.
 * Cardinal: UD, LR. Diagonal: LU, LD, RU, RD. */
{
float c0 = nraw[off][ch];
float g_u = nraw[nr_offset(y - 1, x)][1];
float g_d = nraw[nr_offset(y + 1, x)][1];
float g_l = nraw[nr_offset(y, x - 1)][1];
float g_r = nraw[nr_offset(y, x + 1)][1];
float c_u2 = nraw[nr_offset(y - 2, x)][ch];
float c_d2 = nraw[nr_offset(y + 2, x)][ch];
float c_l2 = nraw[nr_offset(y, x - 2)][ch];
float c_r2 = nraw[nr_offset(y, x + 2)][ch];
f[10] = (g_u + g_d) * 0.5f - (c_u2 + c_d2 - 2.0f * c0) * 0.25f; // ld_ud
f[11] = (g_l + g_r) * 0.5f - (c_l2 + c_r2 - 2.0f * c0) * 0.25f; // ld_lr
f[12] = (g_l + g_u) * 0.5f - (c_l2 + c_u2 - 2.0f * c0) * 0.25f; // ld_lu
f[13] = (g_l + g_d) * 0.5f - (c_l2 + c_d2 - 2.0f * c0) * 0.25f; // ld_ld
f[14] = (g_r + g_u) * 0.5f - (c_r2 + c_u2 - 2.0f * c0) * 0.25f; // ld_ru
f[15] = (g_r + g_d) * 0.5f - (c_r2 + c_d2 - 2.0f * c0) * 0.25f; // ld_rd
}
}
}
}
/*
 * Gather the precomputed feature vectors of a square neighbourhood
 * centred at image pixel (i, j) into a flat buffer.
 * patch_r: patch radius (e.g. 3 -> 7x7 pixels).
 * Output: patch[(2*patch_r+1)^2 * FEAT_PER_PX] floats, row-major.
 */
void DHTNNF::extract_patch(int i, int j, float *patch, int patch_r)
{
  float *dst = patch;
  const int cx = j + nr_leftmargin; /* center column in nraw coordinates */
  for (int dy = -patch_r; dy <= patch_r; dy++)
  {
    /* Offset of the patch-row center; neighbors are base + dx */
    const int base = ((i + dy) + nr_topmargin) * nr_width + cx;
    for (int dx = -patch_r; dx <= patch_r; dx++)
    {
      memcpy(dst, nn_feat[base + dx], DHTNNNetwork::FEAT_PER_PX * sizeof(float));
      dst += DHTNNNetwork::FEAT_PER_PX;
    }
  }
}
/*
 * Compute interpolated green at non-green pixel (i,j) with a forced direction.
 * Duplicates the make_gline formula but with explicit direction control.
 * kc: known non-green channel at this pixel (0=R, 2=B).
 * use_ver: true=interpolate vertically, false=horizontally.
 * Returns the clamped log-space green estimate (does not write nraw).
 */
float DHTNNF::compute_green_forced(int i, int j, int kc, bool use_ver)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
int dx, dy, dx2, dy2;
float d1, d2;
/* d1/d2: log-space hue (G - mean of same-channel values) towards each
 * of the two neighbors along the chosen axis */
if (use_ver)
{
dx = dx2 = 0;
dy = -1;
dy2 = 1;
d1 = nraw[nr_offset(y - 1, x)][1] - (nraw[nr_offset(y - 2, x)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
d2 = nraw[nr_offset(y + 1, x)][1] - (nraw[nr_offset(y + 2, x)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
}
else
{
dy = dy2 = 0;
dx = 1;
dx2 = -1;
d1 = nraw[nr_offset(y, x + 1)][1] - (nraw[nr_offset(y, x + 2)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
d2 = nraw[nr_offset(y, x - 1)][1] - (nraw[nr_offset(y, x - 2)][kc] + nraw[nr_offset(y, x)][kc]) * 0.5f;
}
/* Weights: inverse squared distance of the known channel at ±2 */
float b1 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][kc], nraw[nr_offset(y + dy * 2, x + dx * 2)][kc]));
float b2 = safe_inv_dist(calc_dist(nraw[nr_offset(y, x)][kc], nraw[nr_offset(y + dy2 * 2, x + dx2 * 2)][kc]));
b1 *= b1;
b2 *= b2;
float eg = nraw[nr_offset(y, x)][kc] + (b1 * d1 + b2 * d2) / (b1 + b2);
/* Limit to the neighbor green range plus a small log-space margin
 * (log2(1.2) ≈ 20% linear headroom) to suppress overshoot */
float min, max;
min = MIN(nraw[nr_offset(y + dy, x + dx)][1], nraw[nr_offset(y + dy2, x + dx2)][1]);
max = MAX(nraw[nr_offset(y + dy, x + dx)][1], nraw[nr_offset(y + dy2, x + dx2)][1]);
static const float margin = log2f(1.2f) / FloatChannelMaximum;
min -= margin;
max += margin;
if (eg < min) eg = min;
else if (eg > max) eg = max;
/* Final clamp to the observed green range of the whole image */
if (eg > channel_maximum[1]) eg = channel_maximum[1];
else if (eg < channel_minimum[1]) eg = channel_minimum[1];
return eg;
}
/*
 * Use neural network to refine HV directions (direction flags only, no green write).
 * GPU path via OpenCL when available; CPU fallback runs a per-pixel forward
 * pass. Full green inference is handled by nn_blend_green.
 */
void DHTNNF::nn_refine_hv_dirs(DHTNNNetwork &net)
{
  int iwidth = libraw.imgdata.sizes.iwidth;
  int iheight = libraw.imgdata.sizes.iheight;
  int PR = net.patch_r;
#ifdef USE_OPENCL
  {
    DHTNNCLContext &cl = get_cl_context();
    float *gpu_green = (float *)malloc((size_t)iwidth * iheight * sizeof(float));
    if (gpu_green && cl.run_inference(nraw, ndir, gpu_green, nr_height, nr_width, iwidth, iheight, nr_topmargin, nr_leftmargin, (unsigned int)libraw.imgdata.idata.filters, channel_minimum[1], channel_maximum[1], net))
    {
      free(gpu_green); /* nn_refine_hv_dirs only needs ndir, discard green */
      fprintf(stderr, "DHT-NN: GPU inference done (%dx%d)\n", iwidth, iheight);
      return;
    }
    free(gpu_green);
    fprintf(stderr, "DHT-NN: OpenCL failed, falling back to CPU\n");
  }
#endif
  /* Precompute per-pixel feature vectors for fast patch extraction */
  build_nn_features();
  /* CPU fallback: OpenMP-parallel per-pixel forward pass.
   * Predictions are written to a copy of ndir so that concurrent readers
   * of ndir inside the loop see only the original directions. */
  char *new_ndir = (char *)malloc(nr_height * nr_width);
  if (!new_ndir)
    return; /* OOM: keep the rule-based directions unchanged */
  memcpy(new_ndir, ndir, nr_height * nr_width);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
  for (int i = PR; i < iheight - PR; i++)
  {
    /* BUGFIX: the patch buffer used to be alloca()'d inside the inner
     * pixel loop. alloca storage is released only when the function
     * returns, so the stack grew by input_size floats per pixel and
     * could overflow on large images. One heap buffer per row
     * (thread-private under OpenMP) is allocated instead. */
    std::vector<float> patch(net.input_size);
    for (int j = PR; j < iwidth - PR; j++)
    {
      int x = j + nr_leftmargin;
      int y = i + nr_topmargin;
      if (ndir[nr_offset(y, x)] & HVSH) continue; /* already decisively sharp */
      extract_patch(i, j, patch.data(), net.patch_r);
      float output[DHTNNNetwork::OUTPUT_SIZE];
      int prediction = net.forward(patch.data(), output);
      new_ndir[nr_offset(y, x)] &= ~(HOR | VER);
      new_ndir[nr_offset(y, x)] |= (prediction == 0) ? HOR : VER;
    }
  }
  memcpy(ndir, new_ndir, nr_height * nr_width);
  free(new_ndir);
}
/*
 * NN inference with soft H/V blend:
 *
 *   float2 hv = softmax(out[0], out[1]);
 *   float H_weight = hv.x, V_weight = hv.y;
 *   float S = sigmoid(out[2]);
 *   float soft = H_weight * H_val + V_weight * V_val;
 *   float hard = H_weight > V_weight ? H_val : V_val;
 *   float green = (1 - S) * soft + S * hard;
 *
 * For GPU path the kernel computes blended green and writes to green_out buffer.
 * For CPU path, green is computed and written here; make_gline skips those pixels.
 */
void DHTNNF::nn_blend_green(DHTNNNetwork &net)
{
  int iwidth = libraw.imgdata.sizes.iwidth;
  int iheight = libraw.imgdata.sizes.iheight;
  int PR = net.patch_r;
#ifdef USE_OPENCL
  {
    /* Allocate (or reuse) per-pixel green-written flags */
    if (!nn_green_written)
      nn_green_written = (bool *)calloc(iwidth * iheight, sizeof(bool));
    else
      memset(nn_green_written, 0, iwidth * iheight * sizeof(bool));
    float *gpu_green = (float *)malloc((size_t)iwidth * iheight * sizeof(float));
    DHTNNCLContext &cl = get_cl_context();
    if (gpu_green && nn_green_written && cl.run_inference(nraw, ndir, gpu_green, nr_height, nr_width, iwidth, iheight, nr_topmargin, nr_leftmargin, (unsigned int)libraw.imgdata.idata.filters, channel_minimum[1], channel_maximum[1], net))
    {
      /* Apply GPU-computed green values to nraw; negative = "not computed" */
      for (int i = 0; i < iheight; i++)
        for (int j = 0; j < iwidth; j++)
        {
          float g = gpu_green[i * iwidth + j];
          if (g >= 0.0f)
          {
            nraw[nr_offset(i + nr_topmargin, j + nr_leftmargin)][1] = g;
            nn_green_written[i * iwidth + j] = true;
          }
        }
      free(gpu_green);
      fprintf(stderr, "DHT-NN: GPU blend done (%dx%d)\n", iwidth, iheight);
      return;
    }
    free(gpu_green);
    fprintf(stderr, "DHT-NN: OpenCL failed, falling back to CPU blend\n");
  }
#endif
  build_nn_features();
  /* Allocate (or reuse) per-pixel green-written flags */
  if (!nn_green_written)
    nn_green_written = (bool *)calloc(iwidth * iheight, sizeof(bool));
  char *new_ndir = (char *)malloc(nr_height * nr_width);
  if (!nn_green_written || !new_ndir)
  {
    /* OOM: skip the NN blend entirely; make_greens will fill all pixels */
    free(new_ndir);
    return;
  }
  memset(nn_green_written, 0, iwidth * iheight * sizeof(bool));
  memcpy(new_ndir, ndir, nr_height * nr_width);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
  for (int i = PR; i < iheight - PR; i++)
  {
    int js = libraw.COLOR(i, 0) & 1;
    int kc = libraw.COLOR(i, js);
    int j_start = PR + (((PR & 1) != js) ? 1 : 0); // first non-green j >= PR
    /* BUGFIX: the patch buffer used to be alloca()'d per pixel inside the
     * inner loop. alloca storage is released only on function return, so
     * the stack grew without bound over the image. One heap buffer per
     * row (thread-private under OpenMP) is allocated instead. */
    std::vector<float> patch(net.input_size);
    for (int j = j_start; j < iwidth - PR; j += 2)
    {
      int x = j + nr_leftmargin;
      int y = i + nr_topmargin;
      if (ndir[nr_offset(y, x)] & HVSH) continue; /* decisive pixels keep rule-based green */
      extract_patch(i, j, patch.data(), net.patch_r);
      float output[DHTNNNetwork::OUTPUT_SIZE];
      net.forward(patch.data(), output);
      /* Softmax over H/V logits (max-subtracted for numeric stability) */
      float max_hv = output[0] > output[1] ? output[0] : output[1];
      float eh = expf(output[0] - max_hv);
      float ev = expf(output[1] - max_hv);
      float sum_hv = eh + ev;
      float H_weight = eh / sum_hv;
      float V_weight = ev / sum_hv;
      /* Sigmoid sharpness */
      float S = 1.0f / (1.0f + expf(-output[2]));
      /* H and V green estimates */
      float H_val = compute_green_forced(i, j, kc, false);
      float V_val = compute_green_forced(i, j, kc, true);
      /* Blend: S=0 → weighted avg (ambiguous), S=1 → hard select (sharp edge) */
      float soft = H_weight * H_val + V_weight * V_val;
      float hard = H_weight > V_weight ? H_val : V_val;
      float green = (1.0f - S) * soft + S * hard;
      /* Clamp to the observed green range */
      if (green > channel_maximum[1]) green = channel_maximum[1];
      if (green < channel_minimum[1]) green = channel_minimum[1];
      nraw[nr_offset(y, x)][1] = green;
      nn_green_written[i * iwidth + j] = true;
      /* Keep ndir HOR/VER for make_rb */
      new_ndir[nr_offset(y, x)] &= ~(HOR | VER);
      new_ndir[nr_offset(y, x)] |= (H_weight > V_weight) ? HOR : VER;
    }
  }
  memcpy(ndir, new_ndir, nr_height * nr_width);
  free(new_ndir);
}
/*
 * Train the NN using objective ground truth from a downsampled AAHD demosaic.
 *
 * gt_rgb: half-resolution ground truth image in perceptual float space [gt_h * gt_w][3].
 *         All three channels are known at every pixel (from AAHD + 2x downsample).
 * gt_w, gt_h: dimensions of the ground truth image.
 *             NOTE(review): gt_h is currently unused in the body — the loop
 *             bounds come from imgdata.sizes; confirm callers always pass
 *             matching dimensions.
 *
 * v15 training (make_nnt_answers pattern):
 * Output is 3 values: logit(H), logit(V) [softmax], logit(sharpness) [sigmoid].
 * For every non-green pixel, compute all 6 LD hue_aprox estimates and compare
 * each to the known gt_green. Assign soft scores (per JND threshold) and derive:
 *   - target_h / target_v : H/V soft labels (normalized from per-direction scores)
 *   - target_s            : sharpness = f(bestDiff gap)
 * No ambiguous pixel is discarded; ALL non-green pixels get a training signal.
 */
void DHTNNF::train_nn(DHTNNNetwork &net, float lr, float (*gt_rgb)[3], int gt_w, int gt_h)
{
int iwidth = libraw.imgdata.sizes.iwidth;
int iheight = libraw.imgdata.sizes.iheight;
int PR = net.patch_r;
build_nn_features();
static constexpr float JND_ACCEPT = 0.005f; // per-estimate acceptance radius (how close to truth = "good")
static constexpr float JND_DECISIVE = 0.010f; // cardinal gap threshold for sharpness label
const int N_EPOCHS = 3;
/* ── Phase 1: Collect training samples (parallel) ── */
struct Sample {
int i, j;
float target_h; // soft label for HOR (softmax target)
float target_v; // soft label for VER (softmax target)
float target_s; // soft label for sharpness (sigmoid target)
float confidence; // |target_h - target_v|: sample weight for gradient
};
std::vector<Sample> samples;
samples.reserve(iwidth * iheight / 2);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel
{
#endif
/* Each thread collects into a local vector, merged under a critical
 * section below — avoids contention on the shared vector. */
std::vector<Sample> local_samples;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp for schedule(guided) nowait
#endif
for (int i = PR; i < iheight - PR; i++)
{
for (int j = PR; j < iwidth - PR; j++)
{
int x = j + nr_leftmargin;
int y = i + nr_topmargin;
if (ndir[nr_offset(y, x)] & HVSH) continue;
/* Only non-green Bayer sites produce samples */
int js = libraw.COLOR(i, 0) & 1;
if ((j & 1) != js) continue;
float gt_green = gt_rgb[i * gt_w + j][1];
/* All 6 LD green estimates from precomputed features:
 * f[10]=ld_ud(VER), f[11]=ld_lr(HOR), f[12..15]=diagonals */
const float *f = nn_feat[nr_offset(i + nr_topmargin, j + nr_leftmargin)];
const float aG[6] = { f[10], f[11], f[12], f[13], f[14], f[15] };
/* Per-estimate soft score: linear-in-gap within JND_ACCEPT, else 0.
 * Best estimate always gets full credit regardless. */
float score[6];
int bestI = 0;
float bestDiff = fabsf(gt_green - aG[0]);
for (int ai = 0; ai < 6; ai++)
{
float diff = fabsf(gt_green - aG[ai]);
if (diff < bestDiff) { bestI = ai; bestDiff = diff; }
score[ai] = (diff < JND_ACCEPT) ? 0.9f + 0.1f * (JND_ACCEPT - diff) / JND_ACCEPT : 0.0f;
}
score[bestI] = 1.0f; // best always gets full credit
/* H/V soft labels:
 * UD(0) → purely VER, LR(1) → purely HOR, diagonals(2-5) → 0.5 each */
float h_score = score[1] + 0.5f * (score[2] + score[3] + score[4] + score[5]);
float v_score = score[0] + 0.5f * (score[2] + score[3] + score[4] + score[5]);
float sum_hv = h_score + v_score;
float target_h = (sum_hv > 1e-6f) ? h_score / sum_hv : 0.5f;
float target_v = (sum_hv > 1e-6f) ? v_score / sum_hv : 0.5f;
/* Sharpness: how decisive the cardinal H/V choice is. */
float diff_H = fabsf(gt_green - aG[1]); // ld_lr (cardinal HOR)
float diff_V = fabsf(gt_green - aG[0]); // ld_ud (cardinal VER)
float target_s = tanhf(fabsf(diff_H - diff_V) / JND_DECISIVE);
/* Confidence weight: how decisive the H/V label is.
 * Ambiguous (50/50) samples get near-zero weight → less noisy gradient. */
float confidence = fabsf(target_h - target_v);
local_samples.push_back({i, j, target_h, target_v, target_s, confidence});
}
}
#if defined(LIBRAW_USE_OPENMP)
#pragma omp critical
#endif
{
samples.insert(samples.end(), local_samples.begin(), local_samples.end());
}
#if defined(LIBRAW_USE_OPENMP)
}
#endif
/* Shuffle samples to prevent catastrophic forgetting of previous rows */
std::shuffle(samples.begin(), samples.end(), std::minstd_rand(std::random_device{}()));
int total_samples = (int)samples.size();
/* Count confident samples (|target_h - target_v| > 0.2) for separate accuracy tracking */
int n_confident = 0;
for (int s = 0; s < total_samples; s++)
if (samples[s].confidence > 0.2f) n_confident++;
fprintf(stderr, "DHT-NN train: %d samples (%d confident, %d ambiguous), soft H/V+sharpness labels\n",
total_samples, n_confident, total_samples - n_confident);
/* ── Phase 2: Mini-batch parallel training ── */
int correct_total = 0;
int correct_confident = 0; // accuracy on confident samples only
long long adam_step = net.t;
int nw = net.num_weights();
const int BATCH_SIZE = 256;
if (!net.optimizer_allocated) net.alloc_optimizer();
int bp_max_act, bp_max_wt;
net.backprop_buf_sizes(bp_max_act, bp_max_wt);
int n_threads;
#if defined(LIBRAW_USE_OPENMP)
n_threads = omp_get_max_threads();
#else
n_threads = 1;
#endif
/* Per-thread buffers: forward activations, backprop temps, gradient accumulators.
 * NOTE(review): all malloc results below are unchecked, and both backprop
 * temp buffers are sized with bp_max_act while bp_max_wt is never used —
 * confirm against DHTNNNetwork::backprop_buf_sizes that this is intended. */
float **t_patch = (float **)malloc(n_threads * sizeof(float *));
float ***t_h = (float ***)malloc(n_threads * sizeof(float **));
float **t_bp_da = (float **)malloc(n_threads * sizeof(float *));
float **t_bp_db = (float **)malloc(n_threads * sizeof(float *));
float ***t_acc_gW = (float ***)malloc(n_threads * sizeof(float **));
float ***t_acc_gb = (float ***)malloc(n_threads * sizeof(float **));
/* Separate buffers for averaged gradients (reduction target) */
float **avg_gW = (float **)malloc(nw * sizeof(float *));
float **avg_gb = (float **)malloc(nw * sizeof(float *));
int *t_correct = (int *)calloc(n_threads, sizeof(int));
int *t_correct_conf = (int *)calloc(n_threads, sizeof(int));
for (int t = 0; t < n_threads; t++)
{
t_patch[t] = (float *)malloc(net.input_size * sizeof(float));
t_h[t] = (float **)malloc(net.n_hidden * sizeof(float *));
for (int l = 0; l < net.n_hidden; l++)
t_h[t][l] = (float *)malloc(net.hidden[l] * sizeof(float));
t_bp_da[t] = (float *)malloc(bp_max_act * sizeof(float));
t_bp_db[t] = (float *)malloc(bp_max_act * sizeof(float));
t_acc_gW[t] = (float **)malloc(nw * sizeof(float *));
t_acc_gb[t] = (float **)malloc(nw * sizeof(float *));
for (int l = 0; l < nw; l++)
{
t_acc_gW[t][l] = (float *)malloc(net.weight_out(l) * net.weight_in(l) * sizeof(float));
t_acc_gb[t][l] = (float *)malloc(net.weight_out(l) * sizeof(float));
}
}
/* Allocate averaged gradient buffers */
for (int l = 0; l < nw; l++)
{
avg_gW[l] = (float *)malloc(net.weight_out(l) * net.weight_in(l) * sizeof(float));
avg_gb[l] = (float *)malloc(net.weight_out(l) * sizeof(float));
}
for (int epoch = 0; epoch < N_EPOCHS; epoch++)
{
/* Re-shuffle between epochs (first epoch uses the shuffle above) */
if (epoch > 0)
std::shuffle(samples.begin(), samples.end(), std::minstd_rand(std::random_device{}()));
correct_total = 0;
correct_confident = 0;
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel
#endif
{
#if defined(LIBRAW_USE_OPENMP)
int tid = omp_get_thread_num();
int nt = omp_get_num_threads();
#else
int tid = 0;
int nt = 1;
#endif
/* All threads walk the same batch sequence; work inside each batch
 * is split by the omp-for loops below. */
for (int batch_start = 0; batch_start < total_samples; batch_start += BATCH_SIZE)
{
int batch_end = batch_start + BATCH_SIZE;
if (batch_end > total_samples) batch_end = total_samples;
int batch_sz = batch_end - batch_start;
/* Zero this thread's gradient accumulators */
for (int l = 0; l < nw; l++)
{
memset(t_acc_gW[tid][l], 0, net.weight_out(l) * net.weight_in(l) * sizeof(float));
memset(t_acc_gb[tid][l], 0, net.weight_out(l) * sizeof(float));
}
t_correct[tid] = 0;
t_correct_conf[tid] = 0;
/* Parallel forward + gradient accumulation */
#if defined(LIBRAW_USE_OPENMP)
#pragma omp for schedule(static)
#endif
for (int s = batch_start; s < batch_end; s++)
{
extract_patch(samples[s].i, samples[s].j, t_patch[tid], net.patch_r);
float output[DHTNNNetwork::OUTPUT_SIZE];
int pred = net.forward_train(t_patch[tid], t_h[tid], output);
/* accuracy: argmax(H,V) matches dominant soft label */
int expected = (samples[s].target_h >= samples[s].target_v) ? 0 : 1;
if (pred == expected) {
t_correct[tid]++;
if (samples[s].confidence > 0.2f) t_correct_conf[tid]++;
}
float samp_targets[3] = { samples[s].target_h, samples[s].target_v, samples[s].target_s };
/* Weight gradient by confidence: ambiguous 50/50 samples contribute less */
float sw = 0.1f + 0.9f * samples[s].confidence; // floor=0.1 to keep some signal
net.accumulate_grads(t_patch[tid], t_h[tid], output, samp_targets,
sw,
t_acc_gW[tid], t_acc_gb[tid],
t_bp_da[tid], t_bp_db[tid]);
}
/* implicit barrier after omp for */
/* Parallel gradient reduction + averaging into separate buffers */
{
float inv_batch = 1.0f / batch_sz;
for (int l = 0; l < nw; l++)
{
int wt_sz = net.weight_out(l) * net.weight_in(l);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp for schedule(static)
#endif
for (int i = 0; i < wt_sz; i++)
{
float s = 0.0f;
for (int t = 0; t < nt; t++)
s += t_acc_gW[t][l][i];
avg_gW[l][i] = s * inv_batch;
}
/* implicit barrier */
int b_sz = net.weight_out(l);
#if defined(LIBRAW_USE_OPENMP)
#pragma omp for schedule(static)
#endif
for (int i = 0; i < b_sz; i++)
{
float s = 0.0f;
for (int t = 0; t < nt; t++)
s += t_acc_gb[t][l][i];
avg_gb[l][i] = s * inv_batch;
}
/* implicit barrier */
}
}
/* Adam update + progress reporting (single thread) */
#if defined(LIBRAW_USE_OPENMP)
#pragma omp single
#endif
{
adam_step++;
net.adam_update_all(avg_gW, avg_gb, lr, adam_step);
for (int t = 0; t < nt; t++)
{
correct_total += t_correct[t];
correct_confident += t_correct_conf[t];
}
}
/* implicit barrier after omp single */
}
}
fprintf(stderr, "DHT-NN train: epoch %d/%d, %d samples, accuracy %.1f%% (confident: %.1f%%)\n",
epoch + 1, N_EPOCHS, total_samples,
100.0f * correct_total / (total_samples ? total_samples : 1),
100.0f * correct_confident / (n_confident ? n_confident : 1));
}
/* Cleanup per-thread buffers */
for (int t = 0; t < n_threads; t++)
{
free(t_patch[t]);
for (int l = 0; l < net.n_hidden; l++)
free(t_h[t][l]);
free(t_h[t]);
free(t_bp_da[t]); free(t_bp_db[t]);
for (int l = 0; l < nw; l++)
{
free(t_acc_gW[t][l]); free(t_acc_gb[t][l]);
}
free(t_acc_gW[t]); free(t_acc_gb[t]);
}
free(t_patch); free(t_h);
free(t_bp_da); free(t_bp_db);
free(t_acc_gW); free(t_acc_gb);
/* Cleanup averaged gradient buffers */
for (int l = 0; l < nw; l++)
{
free(avg_gW[l]); free(avg_gb[l]);
}
free(avg_gW); free(avg_gb);
free(t_correct);
free(t_correct_conf);
/* Persist the Adam step count into the network for checkpoint resume */
net.t = adam_step;
fprintf(stderr, "DHT-NN train: %d samples x %d epochs, soft labels, final accuracy %.1f%% (confident: %.1f%%)\n",
total_samples, N_EPOCHS,
100.0f * correct_total / (total_samples ? total_samples : 1),
100.0f * correct_confident / (n_confident ? n_confident : 1));
}
/*
 * q=13: Inference mode.
 * Loads trained weights (versioned filename from dht_nn_filename) and uses
 * the NN to blend green; falls back to the rule-based pipeline if the
 * weights file is not found, and to AHD for non-Bayer filter patterns.
 */
void LibRaw::dht_nn_interpolate()
{
  const unsigned filt = imgdata.idata.filters;
  const bool plain_bayer = filt == 0x16161616 || filt == 0x61616161 ||
                           filt == 0x49494949 || filt == 0x94949494;
  if (!plain_bayer)
  {
    ahd_interpolate();
    return;
  }
  DHTNNNetwork net;
  std::string weights_path = dht_nn_filename(net, "weights");
  const bool have_nn = net.load(weights_path.c_str());
  if (have_nn)
    fprintf(stderr, "DHT-NN: loaded weights from %s\n", weights_path.c_str());
  else
    fprintf(stderr, "DHT-NN: weights not found at %s, falling back to rule-based\n", weights_path.c_str());
  DHTNNF dht(*this);
  dht.hide_hots();
  if (have_nn)
  {
    /* NN path: initial detection + NN refinement */
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
    for (int row = 0; row < imgdata.sizes.iheight; ++row)
      dht.make_hv_dline(row);
    dht.nn_blend_green(net);
  }
  else
  {
    /* Fallback: full rule-based pipeline */
    dht.make_hv_dirs();
  }
  dht.make_greens(); /* handles HVSH + margin pixels nn_blend_green skipped */
  dht.make_diag_dirs();
  dht.make_rb();
  dht.restore_hots();
  dht.copy_to_image();
}
/*
 * q=14: Training mode.
 * 1. AAHD-demosaic the full-resolution raw → full RGB at every pixel.
 * 2. Downsample 2× (in linear space) → half-res ground truth with all channels known.
 * 3. Re-mosaic the half-res image → synthetic Bayer raw.
 * 4. Build DHTNNF from the synthetic half-res raw, run initial HV detection.
 * 5. For each non-green pixel: compare H/V green interpolation to known GT → label.
 * 6. Train the NN on half-res patches with objective ground truth.
 * 7. Restore original raw and demosaic with NN.
 */
void LibRaw::dht_nn_interpolate_train()
{
if (imgdata.idata.filters != 0x16161616 && imgdata.idata.filters != 0x61616161 && imgdata.idata.filters != 0x49494949 &&
imgdata.idata.filters != 0x94949494)
{
/* Only plain Bayer patterns are supported; defer to AHD otherwise */
ahd_interpolate();
return;
}
/* Lower learning rate when resuming so an existing model is not disturbed */
const float lr_fresh = 0.001f;
const float lr_resume = 0.0005f;
float lr = lr_fresh;
DHTNNNetwork net;
std::string ckpt_path = dht_nn_filename(net, "ckpt");
std::string weights_path = dht_nn_filename(net, "weights");
if (net.load_checkpoint(ckpt_path.c_str()))
{
lr = lr_resume;
fprintf(stderr, "DHT-NN: resumed from checkpoint %s (lr=%.6f)\n", ckpt_path.c_str(), lr);
}
else if (net.load(weights_path.c_str()))
{
fprintf(stderr, "DHT-NN: loaded weights from %s, starting fresh training state (lr=%.6f)\n", weights_path.c_str(), lr);
net.alloc_optimizer();
}
else
{
fprintf(stderr, "DHT-NN: no checkpoint/weights found, training from random init (lr=%.6f)\n", lr);
net.alloc_optimizer();
}
int iwidth = imgdata.sizes.iwidth;
int iheight = imgdata.sizes.iheight;
/* ── Step 1: Save original raw mosaic ──
 * NOTE(review): malloc results in this function are unchecked. */
size_t img_size = (size_t)iwidth * iheight;
ushort(*saved_image)[4] = (ushort(*)[4])malloc(img_size * sizeof(ushort[4]));
memcpy(saved_image, imgdata.image, img_size * sizeof(ushort[4]));
/* ── Step 2: AAHD demosaic → full RGB in imgdata.image ── */
fprintf(stderr, "DHT-NN train: running AAHD demosaic for ground truth...\n");
aahd_interpolate();
/* ── Step 3: Downsample 2× in linear space → half-res ground truth ──
 * Odd trailing row/column (if any) is simply dropped by the division. */
int half_w = iwidth / 2;
int half_h = iheight / 2;
float3 *gt_rgb = (float3 *)malloc(half_w * half_h * sizeof(float3));
for (int i = 0; i < half_h; i++)
{
for (int j = 0; j < half_w; j++)
{
int si = i * 2, sj = j * 2;
for (int c = 0; c < 3; c++)
{
/* Average 2×2 block in linear space, convert to perceptual float.
 * imgdata.image channels: [0]=R, [1]=G, [2]=B, [3]=G2; use [c] for c=0,1,2.
 * +0.5f rounds to nearest before the ushort truncation. */
float sum = 0;
sum += (float)imgdata.image[(si + 0) * iwidth + (sj + 0)][c];
sum += (float)imgdata.image[(si + 0) * iwidth + (sj + 1)][c];
sum += (float)imgdata.image[(si + 1) * iwidth + (sj + 0)][c];
sum += (float)imgdata.image[(si + 1) * iwidth + (sj + 1)][c];
gt_rgb[i * half_w + j][c] = raw_to_float((ushort)(sum * 0.25f + 0.5f));
}
}
}
fprintf(stderr, "DHT-NN train: ground truth %dx%d -> %dx%d\n", iwidth, iheight, half_w, half_h);
/* ── Step 4: Restore original raw and build half-res synthetic raw ── */
/* Re-mosaic: for each half-res pixel, the "known" channel comes from gt_rgb,
 * and the Bayer pattern is preserved (it's 2×2-periodic, same at half res). */
memcpy(imgdata.image, saved_image, img_size * sizeof(ushort[4]));
/* Temporarily set half-res dimensions so DHTNNF below sees the synthetic raw */
int saved_iwidth = imgdata.sizes.iwidth;
int saved_iheight = imgdata.sizes.iheight;
imgdata.sizes.iwidth = half_w;
imgdata.sizes.iheight = half_h;
/* Write synthetic half-res raw into imgdata.image.
 * For each pixel, only the Bayer-known channel has a value; others are 0.
 * We convert from perceptual float back to ushort for the known channel. */
memset(imgdata.image, 0, (size_t)half_w * half_h * sizeof(ushort[4]));
for (int i = 0; i < half_h; i++)
{
for (int j = 0; j < half_w; j++)
{
int ch = COLOR(i, j);
if (ch == 3) ch = 1; /* fold G2 into G */
imgdata.image[i * half_w + j][ch] = float_to_raw(gt_rgb[i * half_w + j][ch]);
}
}
/* ── Step 5: Build DHTNNF from half-res synthetic raw, detect initial HV dirs ── */
{
DHTNNF dht_half(*this);
/* Update channel limits from ground truth to avoid clipping errors during training.
 * The synthetic raw image has discarded pixels which might contain the true extrema.
 */
for (int c = 0; c < 3; ++c)
{
dht_half.channel_maximum[c] = -1e30f;
dht_half.channel_minimum[c] = 1e30f;
}
for (int i = 0; i < half_h; i++)
{
for (int j = 0; j < half_w; j++)
{
for (int c = 0; c < 3; c++)
{
float val = gt_rgb[i * half_w + j][c];
if (val > dht_half.channel_maximum[c]) dht_half.channel_maximum[c] = val;
if (val < dht_half.channel_minimum[c]) dht_half.channel_minimum[c] = val;
}
}
}
dht_half.hide_hots();
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < half_h; ++i)
dht_half.make_hv_dline(i);
/* ── Step 6: Train NN with ground truth labels ──
 * NOTE(review): passes float3* where train_nn takes float(*)[3];
 * assumes float3 is layout-compatible with float[3] — confirm typedef. */
dht_half.train_nn(net, lr, gt_rgb, half_w, half_h);
}
free(gt_rgb);
/* ── Step 7: Restore original dimensions and raw ── */
imgdata.sizes.iwidth = saved_iwidth;
imgdata.sizes.iheight = saved_iheight;
memcpy(imgdata.image, saved_image, img_size * sizeof(ushort[4]));
free(saved_image);
/* Save checkpoint and inference weights */
if (net.save_checkpoint(ckpt_path.c_str())) fprintf(stderr, "DHT-NN: saved checkpoint to %s\n", ckpt_path.c_str());
if (net.save(weights_path.c_str()))
fprintf(stderr, "DHT-NN: saved weights to %s (%d floats, %zu bytes)\n", weights_path.c_str(), net.total_weights(), (size_t)net.total_weights() * sizeof(float));
/* Now demosaic the original full-res raw using the trained NN */
DHTNNF dht(*this);
dht.hide_hots();
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < iheight; ++i)
dht.make_hv_dline(i);
dht.nn_blend_green(net);
dht.make_greens(); /* handles HVSH + margin pixels nn_blend_green skipped */
dht.make_diag_dirs();
dht.make_rb();
dht.restore_hots();
dht.copy_to_image();
}
/*
 * Benchmark: compare NN direction refinement against rule-based refinement.
 *
 * Starting from the current (unrefined) ndir state, runs the NN refinement,
 * snapshots its result, restores the unrefined state, runs the rule-based
 * refinement passes, then prints agreement metrics to stderr.
 * On return, ndir holds the rule-based directions.
 *
 * Fix vs. original: the two scratch direction buffers were raw malloc()s
 * with no null check and manual free() — replaced with std::vector<char>
 * (RAII: exception/early-return safe, no leak, no null-deref).
 */
void DHTNNF::benchmark_nn(DHTNNNetwork &net)
{
  int iwidth = libraw.imgdata.sizes.iwidth;
  int iheight = libraw.imgdata.sizes.iheight;
  const size_t ndir_size = (size_t)nr_height * nr_width;
  /* Save unrefined directions */
  std::vector<char> initial_ndir(ndir, ndir + ndir_size);
  /* Run NN refinement and snapshot its result */
  nn_refine_hv_dirs(net);
  std::vector<char> nn_ndir(ndir, ndir + ndir_size);
  /* Restore the unrefined state and run the rule-based refinement passes
   * (two alternating-parity HV passes, then the inverse-HV pass). */
  memcpy(ndir, initial_ndir.data(), ndir_size);
  for (int i = 0; i < iheight; i++)
    refine_hv_dirs(i, i & 1);
  for (int i = 0; i < iheight; i++)
    refine_hv_dirs(i, (i & 1) ^ 1);
  for (int i = 0; i < iheight; i++)
    refine_ihv_dirs(i);
  /* Compare NN vs rule-based (skip a 3-pixel border where context is partial) */
  long long total = 0, agree = 0;
  long long hor_agree = 0, ver_agree = 0;
  long long nn_hor = 0, nn_ver = 0, rb_hor = 0, rb_ver = 0;
  for (int i = 3; i < iheight - 3; ++i)
  {
    for (int j = 3; j < iwidth - 3; ++j)
    {
      int off = nr_offset(i + nr_topmargin, j + nr_leftmargin);
      char nn_d = nn_ndir[off];
      char rb_d = ndir[off];
      bool nn_is_hor = (nn_d & HOR) != 0;
      bool nn_is_ver = (nn_d & VER) != 0;
      bool rb_is_hor = (rb_d & HOR) != 0;
      bool rb_is_ver = (rb_d & VER) != 0;
      /* Skip pixels where either method is ambiguous (both or neither flag set) */
      if (nn_is_hor == nn_is_ver) continue;
      if (rb_is_hor == rb_is_ver) continue;
      total++;
      if (nn_is_hor) nn_hor++;
      else nn_ver++;
      if (rb_is_hor) rb_hor++;
      else rb_ver++;
      if (nn_is_hor == rb_is_hor)
      {
        agree++;
        if (nn_is_hor) hor_agree++;
        else ver_agree++;
      }
    }
  }
  fprintf(stderr, "DHT-NN test: %lld decisive pixels\n", total);
  if (total > 0)
  {
    fprintf(stderr, "DHT-NN test: agreement: %lld / %lld = %.2f%%\n", agree, total, 100.0 * agree / total);
    fprintf(stderr, "DHT-NN test: NN HOR=%lld VER=%lld (%.1f%% / %.1f%%)\n", nn_hor, nn_ver, 100.0 * nn_hor / total, 100.0 * nn_ver / total);
    fprintf(stderr, "DHT-NN test: RB HOR=%lld VER=%lld (%.1f%% / %.1f%%)\n", rb_hor, rb_ver, 100.0 * rb_hor / total, 100.0 * rb_ver / total);
    if (rb_hor > 0) fprintf(stderr, "DHT-NN test: HOR precision: %lld / %lld = %.2f%%\n", hor_agree, rb_hor, 100.0 * hor_agree / rb_hor);
    if (rb_ver > 0) fprintf(stderr, "DHT-NN test: VER precision: %lld / %lld = %.2f%%\n", ver_agree, rb_ver, 100.0 * ver_agree / rb_ver);
  }
  /* ndir now contains rule-based directions */
}
/*
* q=15: Test / benchmark mode.
* Loads trained weights, runs both NN and rule-based direction refinement,
* compares them and prints accuracy metrics, then demosaics using rule-based
* directions as the reference output.
*/
void LibRaw::dht_nn_interpolate_test()
{
if (imgdata.idata.filters != 0x16161616 && imgdata.idata.filters != 0x61616161 && imgdata.idata.filters != 0x49494949 &&
imgdata.idata.filters != 0x94949494)
{
ahd_interpolate();
return;
}
DHTNNNetwork net;
std::string weights_path = dht_nn_filename(net, "weights");
bool have_nn = net.load(weights_path.c_str());
if (!have_nn)
{
fprintf(stderr, "DHT-NN test: weights not found at %s, nothing to benchmark\n", weights_path.c_str());
DHTNNF dht(*this);
dht.hide_hots();
dht.make_hv_dirs();
dht.make_greens();
dht.make_diag_dirs();
dht.make_rb();
dht.restore_hots();
dht.copy_to_image();
return;
}
fprintf(stderr, "DHT-NN test: loaded weights from %s\n", weights_path.c_str());
DHTNNF dht(*this);
dht.hide_hots();
/* Initial HV detection (unrefined) */
#if defined(LIBRAW_USE_OPENMP)
#pragma omp parallel for schedule(guided)
#endif
for (int i = 0; i < imgdata.sizes.iheight; ++i)
dht.make_hv_dline(i);
/* Benchmark: NN vs rule-based, prints metrics, leaves ndir rule-based */
dht.benchmark_nn(net);
/* Demosaic using rule-based directions (reference output) */
dht.make_greens();
dht.make_diag_dirs();
dht.make_rb();
dht.restore_hots();
dht.copy_to_image();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment