December 6, 2014 08:39
diff --git a/clfflame.nt.d b/clfflame.nt.d
 module clfflame;

 import c.CL.cl;
 import sys, std.(file, string, util, random, math, time, thread, channel, hashmap, png, macros.(switchover, where));
 pragma(lib, "OpenCL");

 // Number of per-function weights: sets the length of the Weights tuple and the
 // number of weightN fields generated into the kernel's funobj struct.
 alias NUMFUNS = 10; // functions implemented
 // Number of function objects per set; also interpolated into the fflame kernel source.
 alias numfuns = 3; // function set size, must be constant because compiled into shader

 // Multiplier applied to the initial per-function fade rate in FunFade.init.
 alias SPEED = 1.0;

 // Write `s` as one line, prefixed by the current sec() value and a tab.
 void twriteln(string s) {
  writeln "$(sec())\t$s";
 }

 // One random float drawn from the default generator `deflt`.
 float frand() {
  return randf(deflt);
 }
 // Random colour: three independent frand() draws as (x, y, z).
 vec3f randcol() {
  return vec3f(frand(), frand(), frand());
 }
 // Small random value: sum of three frand() draws scaled by 1/300, 1/400 and 1/350.
 float smallrand() {
  return frand() / 300
    + frand() / 400
    + frand() / 350;
 }
 // Returns f multiplied by 1 when a frand() draw exceeds 0.5, by -1 otherwise.
 float resign(float f) {
  auto sign = [-1,1][frand() > 0.5];
  return f * sign;
 }
 /**
  * Random 2D vector close to one column of an identity-like affine transform.
  *   component 0: (1 + c2, c2)
  *   component 1: (c2, 1 + c2)
  *   component 2: (c2, c2)
  * Any other component is a caller error and aborts via `fail`; previously the
  * function ran off its end without returning a value.
  **/
 vec2f randvec(int component) {
  // NOTE(review): c2 is an alias for an expression, so presumably every mention
  // below draws fresh random numbers — confirm against the language semantics.
  alias c2 = resign pow(frand() * 1.1, 7); // usually small, potentially large
  if (component == 0) return vec2f(1 + c2, c2);
  if (component == 1) return vec2f(c2, 1 + c2);
  if (component == 2) return vec2f(c2, c2);
  // return vec2f(frand() * 2 - 1, frand() * 2 - 1);
  fail "randvec: invalid component $component";
 }

 // Adapts a delegate taking T into a plain C callback plus a user-data pointer.
 template dgwrapper(T) {
  // C-callable trampoline: `ptr` is read as a pointer to the
  // (void*, delegate) pair that dgwrapper below allocates.
  extern(C) void callHolder(T t, void* ptr) {
    auto trip = *(void*, void delegate(T) dg)*:ptr;
    // declares _threadlocal from the stored value before the delegate runs
    // NOTE(review): presumably this re-establishes the creating thread's
    // thread-local context for the callee — confirm.
    auto _threadlocal = trip[0];
    trip[1](t);
  }
  // Stores the current _threadlocal and `dg` in a newly allocated pair and
  // returns (address of callHolder, pointer to that pair as void*).
  // The pair is not freed anywhere inside this template.
  auto dgwrapper(void delegate(T) dg) {
    auto ptr = new (void*, void delegate(T));
    (*ptr) = (_threadlocal, dg);
    return (&callHolder, void*:ptr);
  }
 }

 // Check an OpenCL status code: zero is a no-op, anything else is printed and aborts via `fail`.
 void clCheckRes (int i) {
  if (i == 0) return;
  writeln "CL failed with $i! ";
  fail;
 }

 // Wraps a function A whose last parameter is an error out-pointer:
 // clCheckCall!A (args) calls A(args, &error) and passes the error code to
 // clCheckRes when the wrapper exits.
 template clCheckCall(alias A) {
  template clCheckCall(T) {
    // return type: whatever A yields when called with a T plus a trailing null
    type-of A(value-of!T, null) clCheckCall(T t) {
      int error;
      // registered before the call so the check runs as this function exits
      onExit clCheckRes (error);
      return A(t, &error);
    }
  }
 }

 /**
  * Create an OpenCL context via clCreateContextFromType.
  *   props:  property list without terminator; a 0 entry is appended here.
  *   type:   device type forwarded to OpenCL.
  *   notify: error callback, adapted to a C callback through dgwrapper.
  * A failing call is reported by clCheckCall / clCheckRes.
  * NOTE(review): `notify` is wrapped even when it is null, so the C trampoline
  * would invoke a null delegate if the implementation ever calls it — confirm
  * whether callers passing null rely on that never happening.
  **/
 cl_context createContextFromType(cl_context_properties[] props, cl_device_type type, void delegate(char* errinfo, void* private_info, size_t cb) notify) {
  // tup = (C trampoline, user-data pointer carrying the delegate)
  auto tup = dgwrapper!(char*, void*, size_t)(void delegate((char*,void*,size_t)):notify);
  props ~= cl_context_properties:0;
  return clCheckCall!clCreateContextFromType (props.ptr, type, (ParamTypes type-of &clCreateContextFromType)[2]: tup[0], tup[1]);
 }

 /**
  * Create an OpenCL context via clCreateContext for an explicit device list.
  *   props:  property list without terminator; a 0 entry is appended here.
  *   devs:   number of entries behind `devp`.
  *   devp:   pointer to the device ids.
  *   notify: error callback, adapted to a C callback through dgwrapper.
  * A failing call is reported by clCheckCall / clCheckRes.
  * NOTE(review): `notify` is wrapped even when it is null (as CLContext.init
  * passes it), so the C trampoline would invoke a null delegate if the
  * implementation ever calls it — confirm.
  **/
 cl_context createContext(cl_context_properties[] props, int devs, cl_device_id* devp, void delegate(char* errinfo, void* private_info, size_t cb) notify) {
  // tup = (C trampoline, user-data pointer carrying the delegate)
  auto tup = dgwrapper!(char*, void*, size_t)(void delegate((char*,void*,size_t)):notify);
  props ~= cl_context_properties:0;
  return clCheckCall!clCreateContext (props.ptr, devs, devp, (ParamTypes type-of &clCreateContext)[3]: tup[0], tup[1]);
 }

 import std.lib.glfw3, std.lib.opengl.(, window);

 /*
 shared ThreadPool tp;
 void init() { tp = new ThreadPool(2); }
 */
 /**
  * Enqueue a non-blocking read of `mem` into `target` that starts once `ev`
  * has completed.
  * Returns a delegate that blocks until the read has finished and then
  * releases the read's event. Call it at most once: after the release the
  * event handle is no longer valid.
  * `ev` stays owned by the caller and is not released here.
  **/
 void delegate() myAsyncRead(cl_command_queue queue, cl_mem mem, vec4f[] target, cl_event ev) {
  clCheckRes clEnqueueReadBuffer (queue, mem, CL_FALSE, 0, target.length * size-of vec4f, target.ptr, (1, [ev].dup.ptr), &cl_event readback);
  return new λ{
    clCheckRes clWaitForEvents (1, &readback);
    // the event was previously leaked; drop our reference once the wait is over
    clCheckRes clReleaseEvent (readback);
  }
  // no benefit from this as clEnqueueMapBuffer copies into main ram
  /*assert(!!ev);
  auto start = sec();
  auto ptr = vec4f*: clCheckCall!clEnqueueMapBuffer(queue, mem, true, CL_MAP_READ, 0, target.length * size-of vec4f, 1, &ev, null);
  writeln "map took $((sec() - start) * 1000)ms";
  auto sem = new Semaphore;
  tp.addTask new λ{
    target[] = ptr[0..target.length];
    sem.release;
  }
  return new λ{
    sem.acquire;
    clEnqueueUnmapMemObject(queue, mem, ptr, 0, null, &cl_event unmap);
    clCheckRes clWaitForEvents (1, &unmap);
  }*/
 }

 // Kind of request sent to the calculation thread: render into a frame buffer, or save a screenshot.
 enum DrawMode { Frame, Screenshot }
 // Message passed from the draw loop to the calculation thread.
 struct DrawMessage {
  DrawMode mode;
  // target buffer; only set by the vec4f[] initializer (Frame mode)
  vec4f[] array;
  // Message that carries only a mode and no buffer.
  void init(DrawMode mode) { this.mode = mode; }
  // Frame message wrapping the buffer `a`.
  void init(vec4f[] a) { mode = DrawMode.Frame; array = a; }
  // Buffer accessor, constrained by the `where` clause to Frame mode.
  vec4f[] getArray() where mode == DrawMode.Frame return array;
  // A Frame message is valid only with a non-null array; every other mode is always valid.
  bool isValid() { if (mode == DrawMode.Frame) return !!array; else return true; }
  // NOTE(review): presumably these let a DrawMessage be used directly as a
  // condition (`while (nmsg)` in main) and wherever a vec4f[] is expected — confirm.
  alias implicit-cast = isValid();
  alias implicit-cast-2 = getArray();
 }

 // Toggled by the Space key in main's update(); while set, FunFade.step advances no fade.
 shared bool pause;

 // Repeat!T!N: tuple type built recursively from N members of type T
 // (the empty tuple for N == 0).
 template Repeat(T) {
  template Repeat(alias A) {
    static if (A == 0) { alias Repeat = (); }
    else {
      // one T in front of the (A - 1)-element tuple, flattened into a single tuple type
      Repeat!(A - 1) below; T t;
      alias Repeat = type-of __flatten_tuple (t, below);
    }
  }
 }

 // One float weight per implemented function (NUMFUNS entries).
 alias Weights = Repeat!float!NUMFUNS;

 // Host-side function object. FunFade.upload copies an array of these byte-for-byte
 // into the buffer that the fflame kernel reads as `funobj`.
 // NOTE(review): the field layout presumably has to match the kernel's funobj struct
 // (which declares color as float4) — confirm size/alignment of vec3f.
 alias Tup = (vec3f color,
  (vec2f a, vec2f b, vec2f c) mat1,
  (vec2f a, vec2f b, vec2f c) mat2,
  Weights weights, (float weight, int fac) caleid);

 // Growable list of function objects with helpers to append, copy and randomize entries.
 struct FunSet {
  Tup[auto~] functions;
  // Append one slot and fill it with random parameters.
  void addfun() {
    type-of functions[0] foo;
    functions ~= foo;
    regenat(functions.length - 1);
  }
  // Overwrite slot `id` with the same slot of `other`.
  void copyfrom(FunSet* other, int id) {
    functions[id] = other.functions[id];
  }
  // Replace slot `id` with a new random function object.
  void regenat(int id) {
    Weights weights;
    // explicit start value for the accumulator rather than relying on default initialization
    float weightsum = 0;
    static for int i <- 0..NUMFUNS { ref w = weights[i]; w = frand(); weightsum += w; }
    // normalize so that the weights sum to 1
    auto scalefactor = 1 / weightsum;
    static for int i <- 0..NUMFUNS { weights[i] *= scalefactor; }
    functions[id] = (randcol(),
      (randvec(0), randvec(1), randvec(2)), // pretransform
      (randvec(0), randvec(1), randvec(2)), // posttransform
      // caleid weight is 1 when rand() % 3 == 0 and 0 otherwise; fac is the slot number, 1-based
      weights, (weight => [0, 1][std.random.rand() % 3 == 0], fac => id + 1));
  }
 }

 /**
  * Cross-fades between two function sets.
  * Slot i is interpolated from a.functions[i] to b.functions[i] with progress
  * transfers[i].f, which grows by transfers[i].d per step. When the progress
  * passes 1, b's function becomes the new a, and b gets a new random function.
  * `funvec` is the device buffer that receives the interpolated functions.
  **/
 class FunFade {
  FunSet a, b;
  (float f, float d)[auto~] transfers;
  cl_mem funvec;
  // Create `numfuns` slots in both sets and a read-only device buffer sized for them.
  void init(int numfuns, cl_context ctx) {
    for 0..numfuns {
      a.addfun;
      b.addfun;
      transfers ~= (0, smallrand() * SPEED);
    }
    funvec = clCheckCall!clCreateBuffer (ctx, CL_MEM_READ_ONLY,
      (numfuns * size-of Tup), null);
  }
  // Release the device buffer.
  void fini() {
    clReleaseMemObject funvec;
  }
  // Advance every slot's fade by one step (no advance while `pause` is set).
  void step() {
    // writeln "a: $(a.functions)";
    // writeln "b: $(b.functions)";
    // writeln "tf: $transfers";
    float step = 1;
    if (pause) step = 0;

    // iterate over the slots created by init rather than the global numfuns alias
    for int i <- 0..transfers.length {
      ref tf = transfers[i];
      tf.f += tf.d * step;
      if (tf.f > 1) {
        a.copyfrom(&b, i);
        b.regenat(i);
        tf.f -= 1;
        // same SPEED scaling as the initial rate chosen in init
        tf.d = smallrand() * SPEED;
      }
    }
  }
  // Interpolate every slot at its current progress and write the result to
  // `funvec` with a blocking write on `queue`.
  void upload(cl_command_queue queue) {
    Tup[auto~] funs;
    for int i <- 0..transfers.length {
      ref tf = transfers[i];
      ref af = a.functions[i], bf = b.functions[i];
      float f = tf.f;
      // cosine-eased blend: f = 0 yields a, f = 1 yields b
      float interp(float a, b, f) {
        // return a * (1 - f) + b * f;
        auto f2 = (1 - cos(f * PI)) / 2;
        return a * (1 - f2) + b * f2;
      }
      vec2f interp(vec2f a, b, float f) {
        return vec2f(interp(a.x, b.x, f), interp(a.y, b.y, f));
      }
      vec3f interp(vec3f a, b, float f) {
        return vec3f(interp(a.x, b.x, f), interp(a.y, b.y, f), interp(a.z, b.z, f));
      }
      alias binterp = interp;
      /*vec2f binterp(vec2f a, b, float f) {
        auto res = interp(a, b, f);
        auto lres = |res|;
        auto newlen = pow(lres, 0.1);
        res = res * newlen / lres;
        return res;
      }*/
      auto m1 = (
        binterp(af.mat1.a, bf.mat1.a, f),
        binterp(af.mat1.b, bf.mat1.b, f),
        binterp(af.mat1.c, bf.mat1.c, f));
      auto m2 = (
        binterp(af.mat2.a, bf.mat2.a, f),
        binterp(af.mat2.b, bf.mat2.b, f),
        binterp(af.mat2.c, bf.mat2.c, f));
      // only the weight is blended; fac is taken from a unchanged
      auto caleid_interp = (
        weight => interp(af.caleid.weight, bf.caleid.weight, f),
        fac => af.caleid.fac);
      Weights interps;
      // index named k so it no longer shadows the slot index i
      static for int k <- 0..NUMFUNS {
        interps[k] = interp(af.weights[k], bf.weights[k], f);
      }
      funs ~= (interp(af.color, bf.color, f),
        m1, m2, interps, caleid_interp);
    }
    // writeln "funs: $funs";
    // raw bytes of the Tup array are uploaded as-is
    auto funvec_data = ubyte[]:funs[];
    clCheckRes clEnqueueWriteBuffer (queue, funvec, CL_TRUE, 0, funvec_data.(length, ptr), 0, null, null);
  }
 }

 /**
  * start with random point, color black
  * have a set of functions (vec3f color, int index, matrix2x2, vec2)
  * have an output (vec3 sum, int count)
  * every step:
      select random function
      transform point by matrix and vec
      mix color with function object color
      write to output
 **/
 class CLContext {
  cl_context ctx;
  cl_command_queue queue;
  Hashmap!((int, vec4f*), cl_mem) bufcache;
  Hashmap!(int, cl_mem) zbufcache;
  cl_kernel fflameKernel, fixupKernel;
  cl_program fflame, fixup;
  // Release kernels, programs, every cached buffer, then the queue and the context.
  void fini() {
    clReleaseKernel(fflameKernel);
    clReleaseKernel(fixupKernel);
    clReleaseProgram(fflame);
    clReleaseProgram(fixup);
    // output buffers, keyed by (size, host pointer)
    bufcache .iterate λ((int, vec4f*), cl_mem buffer) { clReleaseMemObject(buffer); };
    // zero-filled source buffers, keyed by size
    zbufcache.iterate λ(int bytes, cl_mem buffer) { clReleaseMemObject(buffer); };
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
  }
  /**
   * Return the device buffer cached for the key (size, ptr), creating a
   * read-write buffer of `size` bytes on first use.
   * `ptr` only serves as part of the cache key; it is not handed to OpenCL.
   **/
  cl_mem cacheGetBufferSized(int size, vec4f* ptr) {
    // named `cached` so the lookup result no longer shadows the `ptr`
    // parameter inside its own initializer
    if (auto cached = bufcache.get(size, ptr)) return *cached;
    writeln "alloc buffer of $(size)";
    auto res = clCheckCall!clCreateBuffer (ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      size, null);
    bufcache.insert((size, ptr), res);
    return res;
  }
  // Return a cached read-only device buffer of `size` zero bytes, creating and
  // filling it (blocking write) the first time a given size is requested.
  cl_mem getZeroBuffer(int size) {
    if (auto hit = zbufcache.get(size)) return *hit;
    auto zbuf = clCheckCall!clCreateBuffer (ctx, CL_MEM_READ_ONLY, size, null);
    // host-side staging array, only needed for the duration of the write
    scope blank = [for 0..size: byte:0].eval;
    clCheckRes clEnqueueWriteBuffer (queue, zbuf, CL_TRUE, 0, size, blank.ptr, 0, null, null);
    zbufcache.insert(size, zbuf);
    return zbuf;
  }
  /**
   * Set up the OpenCL pipeline: pick the first platform and its first GPU
   * device, print device information, create the context and command queue,
   * then build the two programs and create their kernels.
   * Any OpenCL error aborts via clCheckRes; a failed program build prints the
   * build log and exits the process.
   **/
  void init() {
    // generates the NUMFUNS "float weightN;" members of the kernel-side funobj struct
    string weights() { return join [for i <- 0..NUMFUNS: "float weight$i; "]; }
    auto fflamekernel = "
      typedef struct _funobj {
        float4 color;
        float2 mat1a, mat1b, mat1c;
        float2 mat2a, mat2b, mat2c;
        $(weights())
        float caleid_weight; int caleid_fac;
      } funobj;
      uint MWC64X(uint2 *state)
      {
        enum { A=4294883355U};
        uint x=(*state).x, c=(*state).y;  // Unpack the state
        uint res=x^c;                     // Calculate the result
        uint hi=mul_hi(x,A);              // Step the RNG
        x=x*A+c;
        c=hi+(x<c);
        *state=(uint2)(x,c);              // Pack the state back up
        return res;                       // Return the next result
      }
      __constant float PI = 3.14159265358979323846264f;
      // __constant float coeff_1 = PI / 4.0f; // jesus christ nvidia
      __constant float coeff_1 = 0.785398163397f;
      // __constant float coeff_2 = 3.0f * coeff_1;
      __constant float coeff_2 = 2.35619449019f;
      float atan2f(float y, float x) {
        float abs_y = y * sign(y);
        float angle, r;
        /*if (x >= 0) {
          r = (x - abs_y) / (abs_y + x);
          angle = coeff_1 - coeff_1 * r;
        } else {
          r = (x + abs_y) / (abs_y - x);
          angle = coeff_2 - coeff_1 * r;
        }*/
        int s = sign(x);
        r = native_divide(x - s * abs_y, abs_y + s * x);
        angle = ((s == 1)?coeff_1:coeff_2) - coeff_1 * r;
        // return y < 0 ? -angle : angle;
        return angle * sign(y);
      }
      float sinf(float f) { return native_sin(f); }
      float cosf(float f) { return native_cos(f); }
      float2 apply(float2 pos, __constant funobj* fo) {
        float r2 = dot(pos, pos), s = sinf(r2), c = cosf(r2), a = atan2f(pos.y, pos.x), r = fast_length(pos);
        float abypi = native_divide(a, PI);
        float ar = a * r;
        float sar = sinf(ar), car = cosf(ar);
        // this is not actually any slower than a switch would have been.
        pos =
          // 0 linear
          fo->weight0 * pos +
          // 1 sinusoidal
          fo->weight1 * (float2)(sinf(pos.x), sinf(pos.y)) +
          // 2 spherical
          fo->weight2 * native_divide(pos, r2) +
          // 3 swirl
          fo->weight3 * (float2)(pos.x * s - pos.y * c, pos.x * c + pos.y * s) +
          // 4 horseshoe
          fo->weight4 * (float2)((pos.x - pos.y) * (pos.x + pos.y), 2 * pos.x * pos.y) +
          // 5 polar
          fo->weight5 * (float2)(abypi, r - 1.0f) +
          // 6 handkerchief
          fo->weight6 * r * (float2)(sinf(a + r), cosf(a - r)) +
          // 7 heart
          fo->weight7 * r * (float2)(sar,-car) +
          // 8 disc
          fo->weight8 * (abypi) * (float2)(sar, car) +
          // 9 spiral
          fo->weight9 * native_recip(r) * (float2)(cosf(a) + sinf(r), sinf(a) - cosf(r));
        return pos;
      }
      
      __kernel void fflame(__global float4* res, __constant funobj* funset,  const int2 size, const int iters) {
        uint2 rngstate = (uint2)(get_global_id(0), 0);
        float2 pos = (float2)(0, 0);
        float3 col = (float3)(0, 0, 0);
        
        for (int i = 0; i < iters; i++) {
          int selected = MWC64X(&rngstate) % $numfuns;
          __constant funobj *fo = &funset[selected];
          int caleid_rand = MWC64X(&rngstate);
          int randflags = MWC64X(&rngstate);
          
          float2 prevpos = pos;
          pos = pos.x * fo->mat1a + pos.y * fo->mat1b + 1 * fo->mat1c;
          pos = apply(pos, fo);
          {
            float2 cpos = pos - 0.5f;
            // transform pos into radial around origin
            float r = native_sqrt(dot(cpos, cpos)), angle = atan2f(cpos.y, cpos.x); /* -pi..pi */
            float b = PI / fo->caleid_fac;
            int fac = caleid_rand % fo->caleid_fac;
            float newangle = (angle + b * fac) * ((randflags & 1)?1:-1);
            cpos = r * (float2)(cosf(newangle), sinf(newangle)) + 0.5f;
            pos = pos * (1 - fo->caleid_weight) + cpos * fo->caleid_weight;
          }
          pos = pos.x * fo->mat2a + pos.y * fo->mat2b + 1 * fo->mat2c;
          col = (col + fo->color.xyz) * 0.5f;
          
          float2 scaledpos = native_divide(pos + 1.0f, 2.0f) * (float2)(size.x, size.y);
          int2 ipos = (int2)((int) scaledpos.x, (int) scaledpos.y);
          if ((ipos.x >= 0) & (ipos.x < size.x) & (ipos.y >= 0) & (ipos.y < size.y)) {
            int index = ipos.y * size.x + ipos.x;
            res[index] += (float4)(col.x, col.y, col.z, 1);
          }
        }
      }";
    // fixup: log-scales each accumulated pixel by its hit count (stored in w) and clamps to 0..1
    auto fixupkernel = "
      __kernel void fixup(__global float4* data, const int2 size, const float basefactor) {
        int index = get_global_id(0);
        float4 col = data[index];
        float count = col.w / basefactor;
        // pixels that were never hit have col.w == 0; skip the division to avoid 0/0
        if (col.w > 0.0f) col = col * native_log(count + 1) / col.w;
        col = clamp(col, 0.0f, 1.0f);
        col.w = 1;
        data[index] = col;
      }
    ";
    clCheckRes clGetPlatformIDs(0, null, &int ids);
    // platforms[0] is used below, so an empty platform list must not get that far
    if (ids == 0) fail "No OpenCL platforms found";
    auto platforms = new cl_platform_id[] ids;
    clCheckRes clGetPlatformIDs(ids, platforms.ptr, null);
    writeln "$ids platform(s). ";
    // all GPU devices of a platform (count query first, then the ids)
    cl_device_id[] getDevices(cl_platform_id platf) {
      int devs;
      clCheckRes clGetDeviceIDs (platf, CL_DEVICE_TYPE_GPU, 0, null, &devs);
      auto devlist = new cl_device_id[] devs;
      clCheckRes clGetDeviceIDs (platf, CL_DEVICE_TYPE_GPU, devs, devlist.ptr, null);
      return devlist;
    }
    auto platf = platforms[0];
    cl_device_id dev = getDevices(platf)[0];
    
    // print a few informational strings about the chosen device
    for (string devinfo, int enum2) <- [
      ("Extensions"[], CL_DEVICE_EXTENSIONS),
      ("Name"[], CL_DEVICE_NAME),
      ("Profile"[], CL_DEVICE_PROFILE),
      ("Vendor"[], CL_DEVICE_VENDOR),
      ("Version"[], CL_DEVICE_VERSION),
      ("DriverVersion"[], CL_DRIVER_VERSION)]
    {
      int size;
      clCheckRes clGetDeviceInfo (dev, enum2, 0, null, &size);
      scope devstore = new char[] size;
      clCheckRes clGetDeviceInfo (dev, enum2, size, devstore.ptr, int*:null);
      writeln "$devinfo = $devstore ($size)";
    }

    cl_context_properties[] props;
    props ~= CL_CONTEXT_PLATFORM;
    props ~= cl_context_properties: platf;
    
    ctx = createContext(props, 1, &dev, null);
    writeln "Context created. ";
    
    queue = clCheckCall!clCreateCommandQueue (ctx, dev, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
    writeln "Command queue created. ";
    
    writeln "Buffers created. ";
    writeln "Building. ";
    // Compile `source` for `dev`; prints the build log and exits the process on failure.
    cl_program build(string source) {
      // one zero-terminated string per source line, since no lengths array is passed
      scope sourcelines = [for line <- splitAt(once source, "\n"): line ~ "\n\x00"].eval[];
      // writeln "$(sourcelines.length) lines of source. ";
      scope ptrs = [for line <- sourcelines: line.ptr].eval[];
      // the creation error code used to be discarded; check it like every other CL call
      int createErr;
      auto prog = clCreateProgramWithSource(ctx, sourcelines.length,
        ptrs.ptr, null, &createErr);
      clCheckRes (createErr);
      auto err = clBuildProgram (prog, 0, null, "-cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -Werror -cl-nv-verbose", null x 2);
      // fetch the build log whether or not the build succeeded
      int len;
      clGetProgramBuildInfo (prog, dev, CL_PROGRAM_BUILD_LOG, 0, null, &len);
      auto str = new char[] len;
      clGetProgramBuildInfo (prog, dev, CL_PROGRAM_BUILD_LOG, len, str.ptr, null);
      if (err) {
        writeln "Failed to build: $str";
        exit(1);
      } else {
        if (len > 2) writeln "Build log: $str";
      }
      return prog;
    }
    fflame = build fflamekernel;
    fixup = build fixupkernel;
    
    writeln "Program built. ";
    fflameKernel = clCheckCall!clCreateKernel (fflame, "fflame".ptr);
    fixupKernel = clCheckCall!clCreateKernel (fixup, "fixup".ptr);
    
    writeln "Kernel created. ";
  }
  /**
   * Enqueue one complete render into `output`:
   *   1. zero the device buffer by copying from a cached zero buffer,
   *   2. run the fflame kernel with `threads` work-items of 512 iterations each,
   *   3. run the fixup kernel over all size.x * size.y pixels,
   *   4. start an asynchronous read-back into `output`.
   * Returns (threads * iterations, delegate that blocks until the read-back is done).
   * `funvec` is the device buffer holding the function set (see FunFade.upload).
   **/
  (int, void delegate() wait) calc(vec2i size, int threads, vec4f[] output, cl_mem funvec) {
    auto vec = cacheGetBufferSized(size.(x*y) * size-of vec4f, output.ptr);
    // not supported in my api version o.o
    // clCheckRes clEnqueueFillBuffer (queue, vec, &int zero, size-of int, 0, (ubyte[]:output).length, 0, null, null);
    cl_event zeroOut;
    {
      int len = (ubyte[]:output).length;
      auto zbuf = getZeroBuffer(len);
      clCheckRes clEnqueueCopyBuffer(queue, zbuf, vec, 0, 0, len, 0, null, &zeroOut);
    }
    
    auto iters = cl_int:512;
    clCheckRes clSetKernelArg (fflameKernel, 0, size-of type-of vec, void*:&vec);
    clCheckRes clSetKernelArg (fflameKernel, 1, size-of type-of funvec, void*:&funvec);
    clCheckRes clSetKernelArg (fflameKernel, 2, size-of type-of size, void*:&size);
    clCheckRes clSetKernelArg (fflameKernel, 3, size-of int, void*:&iters);
    
    // average number of iterations per pixel, used by fixup to normalize hit counts
    float basefactor = (threads * iters) * 1f / size.(x * y);
    clCheckRes clSetKernelArg (fixupKernel, 0, size-of type-of vec, void*:&vec);
    clCheckRes clSetKernelArg (fixupKernel, 1, size-of type-of size, void*:&size);
    clCheckRes clSetKernelArg (fixupKernel, 2, size-of float, void*:&basefactor);
    
    // the events chain the stages together: zero -> fflame -> fixup -> read-back
    clCheckRes clEnqueueNDRangeKernel (queue, fflameKernel, 1, null, [threads].dup.ptr, null, (1, [zeroOut].dup.ptr), &cl_event calcStep);
    clCheckRes clEnqueueNDRangeKernel (queue, fixupKernel , 1, null, [size.(x * y)].dup.ptr, null, (1, [calcStep].dup.ptr), &cl_event fixupStep);
    
    int workDone = threads * iters;
    // read-back
    auto waitForRead = myAsyncRead(queue, vec, output, fixupStep);
    // Every command that waits on these events is enqueued by now, so our
    // references can be dropped; they were previously leaked on every call.
    clCheckRes clReleaseEvent (zeroOut);
    clCheckRes clReleaseEvent (calcStep);
    clCheckRes clReleaseEvent (fixupStep);
    return (workDone, waitForRead);
  }
 }

 /**
  * Downsample `buf` by a factor of `aa` in both directions and write it as a PNG.
  *   buf:      pixels of an image of (size * aa) dimensions, row by row.
  *   filename: final path; the data is first written to a dot-prefixed file in
  *             the same directory and then renamed over it.
  *   size:     dimensions of the written image.
  *   aa:       number of source pixels per output pixel along each axis.
  * Each output pixel is the average of its aa*aa source pixels, computed after
  * srgb2lin and converted back with lin2srgb.
  **/
 void saveAsPng(vec4f[] buf, string filename, vec2i size, int aa) {
  auto largesize = size * aa;
  scope ubyte[auto~] pngdata;
  // the writer hands over encoded chunks as strings; collect them in pngdata
  using new PNGWriter λ(string s) { pngdata ~= ubyte[]:s; } {
    configure size;
    scope vec4f[auto~] line;
    // see http://excamera.com/sphinx/article-srgb.html
    alias a = 0.055, γ = 2.2;
    float lin2srgb(float f) {
      if (f <= 0.0031308) return f * 12.92;
      return (1 + a) * pow(f, 1 / γ) - a;
    }
    float srgb2lin(float f) {
      if (f <= 0.04045) return f * (1 / 12.92f);
      return pow((f + a) / (1 + a), γ);
    }
    vec3f lin2srgb(vec3f v) { return v.(vec3f(lin2srgb x, lin2srgb y, lin2srgb z)); }
    vec3f srgb2lin(vec3f v) { return v.(vec3f(srgb2lin x, srgb2lin y, srgb2lin z)); }
    for int y <- 0..size.y {
      for int x <- 0..size.x {
        // explicit start value for the accumulator rather than relying on default initialization
        vec3f sum = vec3f(0, 0, 0);
        for int y2 <- 0..aa for int x2 <- 0..aa {
          int lx = x * aa + x2, ly = y * aa + y2;
          sum += srgb2lin buf[ly * largesize.x + lx].xyz;
        }
        sum /= aa * aa;
        sum = lin2srgb sum;
        line ~= sum.(vec4f(x, y, z, 1));
      }
      // one finished output row
      writeLine ubyte[]: line[];
      line.clear;
    }
    end;
  }
  // work THAT out
  // (temporary name: the file name with a "." prefix, inside filename's directory)
  scope tmp_filename = filename.basedir().sub("." ~ filename.relativePathAt filename.basedir());
  writeAll(tmp_filename, pngdata[]);
  rename(tmp_filename, filename);
 }

 /**
  * Program entry point with two modes.
  *
  * Animation mode (the `if (true)` branch): a calculation thread renders frames
  * at 3x supersampling while this thread saves them as
  * clfflame_anim/frame_NNNNNN.png. Rendering resumes after the last frame file
  * that already exists. The saving loop has no exit, so while this branch is
  * enabled the interactive mode below is never reached.
  *
  * Interactive mode: a calculation thread renders into a rotating set of three
  * buffers that this thread displays in a GLFW/OpenGL window.
  *   Q quits, W halves and E doubles the work-item count, T requests a
  *   screenshot (out.png), Space toggles `pause`.
  **/
 int main() {
  auto size = vec2i(1920, 1080) / 1;
  auto screensize = vec2i(1600, 900);
  int threads = 8192;
  
  if (true) {
    auto size = vec2i(1920, 1080), aa = 3;
    auto largesize = size * aa;
    
    // buffers circulate: save2calc carries free buffers, calc2save carries rendered frames
    auto save2calc = new Channel!vec4f[];
    auto calc2save = new Channel!(int, vec4f[]);
    
    for 0..3 save2calc.put(new vec4f[] largesize.(x*y));
    
    string zeroprefix(string s, int i) { while (s.length < i) s = "0$s"; return s; }
    string fn(int i) { return "clfflame_anim/frame_"~zeroprefix("$i", 6)~".png"; }
    
    int firstMissing;
    startThread λ{
      deflt = getPRNG s => 5;
      auto ctx = new CLContext;
      onSuccess ctx.fini;
      auto fade = new FunFade(numfuns, ctx.ctx);
      onSuccess fade.fini;
      // skip frames that already exist, stepping the fade once per skipped frame
      while (fn(firstMissing).exists()) { fade.step; firstMissing ++; }
      auto buf = save2calc.take();
      int i = firstMissing; // the index that the current buf/wait belongs to
      twriteln "2: begin calculation $i";
      void delegate() stepcalc(vec4f[] buf) {
        fade.step; fade.upload(ctx.queue);
        return ctx.calc(largesize, 2^20, buf, fade.funvec).wait;
      }
      auto wait = stepcalc(buf);
      while (true) {
        // start the next frame before blocking on the current one
        twriteln "2: request buffer";
        auto nbuf = save2calc.take();
        auto ni = i + 1;
        twriteln "2: begin calculation $ni";
        auto nwait = stepcalc(nbuf);
        twriteln "2: block for $i";
        wait();
        twriteln "2: release buffer for $i";
        calc2save.put(i, buf);
        (i, wait, buf) = (ni, nwait, nbuf);
      }
    };
    
    auto start = sec();
    while true {
      twriteln "1: request buffer";
      (int i, vec4f[] buf) = calc2save.take();
      string filename = fn(i);
      twriteln "1: generate png data";
      saveAsPng(buf,
        filename,
        size => size, aa => aa);
      twriteln "1: release buffer";
      save2calc.put(buf);
      float fps = (i - firstMissing + 1) / float:(sec() - start);
      twriteln "1: saved $filename, $fps fps, $(fps * 3600) fph";
    }
  }
  
  auto
    draw2calc = new Channel!DrawMessage,
    calc2draw = new Channel!(vec4f[], double);
  for 0..3 draw2calc.put(new vec4f[] (size[0]*size[1])); // double^Wtriple buffer
  int fps;
  auto threadQuit = new Semaphore;
  
  startThread λ{
    onExit threadQuit.release;
    deflt = getPRNG s => 5;
    
    auto ctx = new CLContext;
    onSuccess ctx.fini;
    
    auto fade = new FunFade(numfuns, ctx.ctx);
    onSuccess fade.fini;
    fade.step; fade.upload(ctx.queue);
    
    auto msg = draw2calc.take();
    auto wait = ctx.calc(size, threads, msg, fade.funvec).wait;
    // the loop ends when the main thread sends a message that tests false (see the final put(null))
    do auto nmsg = draw2calc.take();
    while (nmsg) {
      // TODO
      /*case nmsg of {
        Frame x: */
      switch DrawMode mode over mode == nmsg.mode {
        case DrawMode.Frame:
          fade.step;
          fade.upload(ctx.queue);
          fps ++;
          (int totalIters, void delegate() nwait) = ctx.calc(size, threads, nmsg, fade.funvec);
          wait(); // wait for previous to complete
          calc2draw.put(msg, totalIters);
          (msg, wait) = (nmsg, nwait); // rotate over
        case DrawMode.Screenshot:
          auto size = vec2i(1920, 1080), aa = 4;
          auto largesize = size * aa;
          scope lbuf = new vec4f[] largesize.(x*y);
          twriteln "begin calculation";
          ctx.calc(largesize, 2^20, lbuf, fade.funvec).wait();
          saveAsPng(lbuf, "out.png", size, aa => 4);
          writeln "written to out.png";
        default: fail "$(nmsg.mode)";
      }
    }
  }
  glwindow = new GLFWWindow;
  // glwindow.fullscreen = true;
  glwindow.setup(screensize);
  // Poll the window and handle keys; returns true when the user asked to quit.
  bool update() {
    glwindow.update();
    if (key-pressed(Key.Q)) return true;
    if (key-pressed(Key.W)) {
      // never go below one work-item: a global work size of 0 makes the kernel enqueue fail
      if (threads > 1) threads = int:(threads / 2);
    }
    if (key-pressed(Key.E)) {
      // cap the doubling so that threads * 512 iterations (see CLContext.calc) still fits an int
      if (threads <= 1048576) threads = int:(threads * 2);
    }
    if (key-pressed(Key.T)) { draw2calc.put DrawMessage:DrawMode.Screenshot; }
    if (key-pressed(Key.Space)) pause = !pause;
    return false;
  }
  // Upload `output` as a texture and draw it on a quad covering the whole viewport.
  void draw(vec4f[] output, double iters) using mode GL {
    ClearColor (0, 0, 0, 0);
    ClearDepth 1;
    Enable TEXTURE_2D;
    Clear (COLOR_BUFFER_BIT | DEPTH_BUFFER_BIT);
    MatrixMode PROJECTION; LoadIdentity;
    glOrtho(0, 1, 1, 0, -1, 1);
    MatrixMode MODELVIEW; LoadIdentity;
    Color3f White;
    
    // a new texture is created for every frame and deleted when draw returns successfully
    GenTextures(1, &GLuint datatex);
    onSuccess DeleteTextures(1, &datatex);
    using TEXTURE_2D {
      BindTexture(datatex);
      TexParameteri (TEXTURE_MAX_LEVEL, 0);
      TexParameteri (TEXTURE_MIN_FILTER, NEAREST);
      TexImage2D (0, RGBA, size, 0, RGBA, FLOAT, output.ptr);
    }
    using Quads {
      TexCoord2f(0, 0); Vertex2f(0, 0);
      TexCoord2f(0, 1); Vertex2f(0, 1);
      TexCoord2f(1, 1); Vertex2f(1, 1);
      TexCoord2f(1, 0); Vertex2f(1, 0);
    }
    return;
  }
  auto lastsec = sec();
  auto start = sec();
  int targetfps = 30;
  while !update() {
    (vec4f[] buf, double iters) = calc2draw.take();
    draw(buf, iters);
    // hand the displayed buffer back as the next Frame request
    draw2calc.put(buf);
    if (sec() - lastsec > 1) {
      writeln "$fps fps - $(iters*fps) steps/s";
      // we took that much fps to do threads tasks
      // so threads*fps is the load that takes 1s
      /*auto oldthreads = threads;
      threads = (threads * fps) / targetfps;
      writeln "adjust to $threads from $oldthreads due to $fps <> $targetfps";*/
      lastsec = sec;
      fps = 0;
    }
    // if (sec() - start > 5) exit(0);
  }
  writeln "Cleaning up calc thread.";
  // a null buffer ends the calculation thread's loop; wait for it to finish
  draw2calc.put(null);
  threadQuit.acquire();
  writeln "Exiting.";
  return 0;
 }