ndevenish · February 24, 2023 10:40
diff --git a/final.cu b/final.cu

 #include <cooperative_groups.h>
 #include <fmt/core.h>
 #include <lodepng.h>

 #include <cmath>
 #include <cstdio>
 #include <limits>
 #include <memory>
 #include <random>
 #include <type_traits>

 #include "common.hpp"
 #include "curand_kernel.h"
 #include "float3.hpp"
 #include "float4.hpp"
 #include "rtweekend.hpp"

 #define CUDA_CALLABLE __forceinline__ __device__ __host__

 namespace cg = cooperative_groups;

 template <typename T, typename... Args>
 __global__ void device_make_object(T **dest_ptr, Args... args) {
    *dest_ptr = new T(args...);
 }
 template <typename T>
 __global__ void device_delete_object(T *obj) {
    delete obj;
 }

 /// Construct a new class object on the device
 template <typename T, typename... Args>
 auto make_device_object(Args... args) {
    // auto obj = make_cuda_managed_malloc<T **>(1);
    T **obj_ref = nullptr;
    cudaMalloc(&obj_ref, sizeof(T **));

    device_make_object<T><<<1, 1>>>(obj_ref, args...);
    auto deleter = [](T *ptr) { device_delete_object<<<1, 1>>>(ptr); };
    T *obj = nullptr;
    cudaMemcpy(&obj, obj_ref, sizeof(T *), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    return std::shared_ptr<T>(obj, deleter);

    // return std::unique_ptr<T, decltype(deleter)>{obj, deleter};
 }

 class Material;

 class MaterialLibrary {
  public:
    template <typename T, typename... Args>
    auto create(Args... args) -> std::shared_ptr<T> {
        auto mat = make_device_object<T>(args...);
        _materials.push_back(mat);
        return mat;
    }

  private:
    std::vector<std::shared_ptr<Material>> _materials;
 };

 class Ray {
  public:
    Ray() {}
    CUDA_CALLABLE Ray(const float3 &origin, const float3 &direction)
        : _origin(origin), _direction(direction) {}

    CUDA_CALLABLE float3 origin() const {
        return _origin;
    }
    CUDA_CALLABLE float3 direction() const {
        return _direction;
    }

    CUDA_CALLABLE float3 at(float t) const {
        return _origin + t * _direction;
    }

  private:
    float3 _origin;
    float3 _direction;
 };

 __device__ auto random_in_unit_disk(curandState *random_state) -> float3 {
    while (true) {
        float3 p = {2.0f * curand_uniform(random_state) - 1,
                    2.0f * curand_uniform(random_state) - 1,
                    0};
        if (length_squared(p) >= 1) continue;
        // printf("%.2f, %.2f\n", p.x, p.y);
        return p;
    }
 }

 __device__ auto random_in_unit_sphere(curandState *curand_state) -> float3 {
    while (true) {
        auto vec = float3{curand_uniform(curand_state),
                          curand_uniform(curand_state),
                          curand_uniform(curand_state)}
                   - 0.5f;
        if (length_squared(vec) > 1) {
            continue;
        }
        return vec;
    }
 }

 struct Camera {
    Camera(float3 lookfrom,
           float3 lookat,
           float3 up,
           float vfov,
           float aspect_ratio,
           float aperture,
           float focus_dist) {
        auto theta = rad(vfov);
        auto h = tan(theta / 2);
        auto viewport_height = 2.0f * h;
        auto viewport_width = aspect_ratio * viewport_height;

        w = unit_vector(lookfrom - lookat);
        u = unit_vector(cross(up, w));
        v = cross(w, u);

        origin = lookfrom;
        horizontal = focus_dist * viewport_width * u;
        vertical = focus_dist * viewport_height * v;
        lower_left_corner = origin - horizontal / 2 - vertical / 2 - focus_dist * w;
        lens_radius = aperture / 2;
    }

    float3 origin;
    float3 horizontal;
    float3 vertical;
    float3 lower_left_corner;
    float3 u, v, w;
    float lens_radius;

    __device__ auto get_ray(float s, float t, curandState *random_state) -> Ray {
        float3 rd = lens_radius * random_in_unit_disk(random_state);
        float3 offset = u * rd.x + v * rd.y;

        return Ray(origin + offset,
                   lower_left_corner + s * horizontal + t * vertical - origin - offset);
    }
 };

 struct Hit {
    float3 point;
    float3 normal;
    float t;
    bool front_face;
    const Material *material;

    CUDA_CALLABLE void set_face_normal(const Ray &ray, const float3 &outward_normal) {
        front_face = dot(ray.direction(), outward_normal) < 0;
        normal = front_face ? outward_normal : -outward_normal;
    }
 };

 static_assert(std::is_trivial_v<Hit>);

 union RayObjects;
 struct RayObject;

 using HitFunction =
  bool (*)(const void *hittable, const Ray &ray, float t_min, float t_max, Hit &rec);

 struct RayObject {
    HitFunction hit;
    float3 position;
    const Material *material;
 };

 struct RayObject_Sphere {
    HitFunction hit;
    float3 position;
    float radius;
    const Material *material;
 };

 union RayObjects {
    RayObject obj;
    RayObject_Sphere sphere;
 };

 static_assert(sizeof(RayObject_Sphere) == sizeof(RayObject));
 static_assert(std::is_trivial_v<RayObject_Sphere>);

 __device__ bool hit_sphere(const RayObjects *hittable,
                           const Ray &r,
                           float t_min,
                           float t_max,
                           Hit &rec) {
    const RayObject_Sphere *obj = &hittable->sphere;
    float radius = obj->radius;

    float3 oc = r.origin() - obj->position;
    auto a = length_squared(r.direction());
    auto half_b = dot(oc, r.direction());
    auto c = length_squared(oc) - radius * radius;

    auto discriminant = half_b * half_b - a * c;
    if (discriminant < 0) {
        return false;
    }
    float t = (-half_b - std::sqrt(discriminant)) / a;
    // Check the hit isn't behind us
    if (t < t_min || t > t_max) {
        return false;
    }
    rec.t = t;
    rec.point = r.at(rec.t);
    float3 normal = (rec.point - obj->position) / radius;
    rec.set_face_normal(r, normal);
    rec.material = obj->material;

    return true;
 }

 using ptrX = decltype(hit_sphere);

 __device__ HitFunction dev_pHitSphere = reinterpret_cast<HitFunction>(hit_sphere);

 auto make_sphere(float3 position, float radius, const Material *material) {
    HitFunction fun = nullptr;
    cudaMemcpyFromSymbol(&fun, dev_pHitSphere, sizeof(HitFunction *));
    cuda_throw_error();
    return RayObjects{.sphere = RayObject_Sphere(fun, position, radius, material)};
 }

 template <typename T, typename U>
 auto make_sphere(float3 position, float radius, std::unique_ptr<T, U> &material) {
    static_assert(std::is_base_of_v<Material, T>,
                  "make_sphere must be passed a material");

    return make_sphere(position, radius, material.get());
 }
 template <typename T>
 auto make_sphere(float3 position, float radius, std::shared_ptr<T> material) {
    static_assert(std::is_base_of_v<Material, T>,
                  "make_sphere must be passed a material");

    return make_sphere(position, radius, material.get());
 }
 /// Return the color of a ray that otherwise goes to infinity
 __device__ float3 world_color(const Ray &ray) {
    auto unit_direction = unit_vector(ray.direction());
    auto t = 0.5f * (unit_direction.y + 1.0f);
    return (1.0f - t) * float3{1.0f, 1.0f, 1.0f} + t * float3{0.5f, 0.7f, 1.0f};
 }
 __device__ auto next_hit(const Ray &ray,
                         const RayObjects objects[],
                         int num_objects,
                         Hit *hit) -> bool {
    Hit closest_hit{{}, {}, infinity};
    const RayObjects *closest_object = nullptr;

    for (int x = 0; x < num_objects; ++x) {
        Hit hit;
        const RayObjects *obj = &objects[x];
        bool result = objects[x].obj.hit(obj, ray, 0.001f, 1000.0f, hit);
        if (result) {
            if (hit.t < closest_hit.t) {
                closest_hit = hit;
                closest_object = obj;
            }
        }
    }
    if (closest_object == nullptr) {
        return false;
    }
    *hit = closest_hit;
    return true;
 }

 class Material {
  public:
    __device__ virtual bool scatter(const Ray &ray,
                                    const Hit &hit,
                                    float3 &attenuation,
                                    Ray &scattered,
                                    curandState *random_state) const = 0;
 };

 class Lambertian : public Material {
  public:
    __device__ Lambertian(const float3 &a) : albedo(a) {}

    __device__ virtual bool scatter(const Ray &ray,
                                    const Hit &hit,
                                    float3 &attenuation,
                                    Ray &scattered,
                                    curandState *random_state) const override {
        auto scatter_direction =
          hit.normal + unit_vector(random_in_unit_sphere(random_state));
        // Protect against very small vectors
        if (near_zero(scatter_direction)) {
            scatter_direction = hit.normal;
        }
        scattered = Ray{hit.point, scatter_direction};
        attenuation = albedo;
        return true;
    }

  private:
    float3 albedo;
 };

 __device__ auto reflect(const float3 &v, const float3 &n) -> float3 {
    return v - 2 * dot(v, n) * n;
 }
 __device__ auto refract(const float3 &v, const float3 &n, float n_ratio) -> float3 {
    auto cos_theta = min(dot(-v, n), 1.0);
    // float sin_theta = sqrt(1.0f - cos_theta * cos_theta);
    // if (n_ratio * sin_theta > 1.0f) {
    //     return reflect(v, n);
    // }
    float3 r_out_perp = n_ratio * (v + cos_theta * n);
    float3 r_out_para = -sqrt(fabs(1.0 - length_squared(r_out_perp))) * n;
    return r_out_perp + r_out_para;
 }

 class Metal : public Material {
  public:
    __device__ Metal(const float3 &a, float fuzz = 0.0f)
        : albedo(a), fuzz(fuzz < 1 ? fuzz : 1.0f) {}
    __device__ virtual bool scatter(const Ray &ray,
                                    const Hit &hit,
                                    float3 &attenuation,
                                    Ray &scattered,
                                    curandState *random_state) const override {
        auto reflected = reflect(unit_vector(ray.direction()), hit.normal);
        scattered =
          Ray{hit.point, reflected + fuzz * random_in_unit_sphere(random_state)};
        attenuation = albedo;
        return (dot(scattered.direction(), hit.normal) > 0);
    }

  private:
    float3 albedo;
    float fuzz;
 };

 class Dielectric : public Material {
  public:
    __device__ Dielectric(float index_of_refraction, float fuzz = 0.0f)
        : ir(index_of_refraction), fuzz(fuzz) {}
    __device__ virtual bool scatter(const Ray &ray,
                                    const Hit &hit,
                                    float3 &attenuation,
                                    Ray &scattered,
                                    curandState *random_state) const override {
        attenuation = {1, 1, 1};
        float r_ratio = hit.front_face ? (1.0 / ir) : ir;
        float3 unit_direction = unit_vector(ray.direction());

        double cos_theta = min(dot(-unit_direction, hit.normal), 1.0);
        double sin_theta = sqrt(1.0 - cos_theta * cos_theta);

        bool cannot_refract = r_ratio * sin_theta > 1.0;
        float3 direction;
        if (cannot_refract
            || reflectance(cos_theta, r_ratio) > curand_uniform(random_state)) {
            direction = reflect(unit_direction, hit.normal);
        } else {
            direction = refract(unit_direction, hit.normal, r_ratio);
        }
        scattered = Ray{hit.point, direction};
        if (fuzz > 1e-6) {
            scattered =
              Ray{ray.origin(),
                  ray.direction() + 0.05 * random_in_unit_sphere(random_state)};
        }

        return true;
    }

  private:
    __device__ static auto reflectance(float cosine, float ref_idx) -> float {
        auto r0 = (1 - ref_idx) / (1 + ref_idx);
        r0 = r0 * r0;
        return r0 + (1 - r0) * pow((1 - cosine), 5);
    }
    float ir;
    float fuzz;
 };
 __device__ float3 ray_color(const Ray &ray,
                            RayObjects objects[],
                            int num_objects,
                            curandState *random_state) {
    int depth = 50;

    // We accumulate the colour over multiple hits
    float3 albedo{1, 1, 1};
    // float contribution = 1.0f;
    Hit hit;
    Ray next_ray = ray;
    // Accumulate hits until we have enough bounces or hit nothing
    while (--depth > 0) {
        bool result = next_hit(next_ray, objects, num_objects, &hit);

        if (result) {
            // Assume 50% absorption.
            // contribution *= 0.5f;
            // Determine the next ray from this point
            float3 attenuation{};
            if (hit.material->scatter(
                  next_ray, hit, attenuation, next_ray, random_state)) {
                albedo *= attenuation;
            }

        } else {
            // We didn't hit any other objects,
            auto wc = world_color(next_ray);
            albedo *= {wc.x, wc.y, wc.z};
            break;
        }
    }
    return albedo;
 }

 __global__ void doShootRays(float4 data[],
                            int data_pitch,
                            int width,
                            int height,
                            Camera camera,
                            RayObjects objects[],
                            int num_objects,
                            curandState rand_state[]) {
    // Assumptions:
    //  - One thread per pixel
    auto grid = cg::this_grid();
    auto block = cg::this_thread_block();

    constexpr int SAMPLES_PER_PIXEL = 100;

    // Work out the position in pixel space
    int image_x =
      block.group_index().x * block.dim_threads().x + block.thread_index().x;
    int image_y =
      (block.group_index().y * block.dim_threads().y + block.thread_index().y);

    if (image_x > width || image_y > height) return;

    auto random_state = rand_state[image_x + image_y * data_pitch];
    float4 color{};

    for (int n = 0; n < SAMPLES_PER_PIXEL; ++n) {
        // Fractions across viewport
        auto u =
          (static_cast<float>(image_x) + curand_uniform(&random_state)) / (width - 1);
        auto v =
          (static_cast<float>(image_y) + curand_uniform(&random_state)) / (height - 1);

        auto ray = camera.get_ray(u, v, &random_state);
        auto pixel_color = ray_color(ray, objects, num_objects, &random_state);
        color = float4{color.x + pixel_color.x,
                       color.y + pixel_color.y,
                       color.z + pixel_color.z,
                       color.w + 1.0f};
    }

    data[image_x + image_y * data_pitch] = {
      color.x / color.w, color.y / color.w, color.z / color.w, 1.0};

    rand_state[image_x + image_y * data_pitch] = random_state;
 }

 __global__ void initRandomState(curandState rand_state[],
                                size_t width,
                                size_t height,
                                size_t pitch) {
    auto grid = cg::this_grid();
    auto block = cg::this_thread_block();
    int image_x =
      block.group_index().x * block.dim_threads().x + block.thread_index().x;
    int image_y =
      (block.group_index().y * block.dim_threads().y + block.thread_index().y);
    if (image_x >= width || image_y >= height) return;

    size_t pixel_id = image_y * pitch + image_x;
    curand_init(pixel_id, 0, 0, &rand_state[pixel_id]);
 }

 int main(void) {
    auto start_time = std::chrono::high_resolution_clock::now();

    const unsigned int image_width = 1600;
    const unsigned int image_height = 900;

    const float aspect_ratio = (float)image_width / (float)image_height;

    // Calculate the block size
    auto block_dims = dim3{32, 32};
    // And now the block size. One thread per pixel.
    auto grid_dims = dim3{
      static_cast<unsigned int>(
        std::ceil(static_cast<float>(image_width) / block_dims.x)),
      static_cast<unsigned int>(
        std::ceil(static_cast<float>(image_height) / block_dims.y)),
    };
    print("Image:   {:4d} x {:<4d}\n", image_width, image_height);
    print("Threads: {:4d} x {:<4d} = {}\n",
          block_dims.x,
          block_dims.y,
          block_dims.x * block_dims.y);
    print("Blocks:  {:4d} x {:<4d} = {}\n",
          grid_dims.x,
          grid_dims.y,
          grid_dims.x * grid_dims.y);

    auto [dev_image, device_pitch] =
      make_cuda_pitched_malloc<float4>(image_width, image_height);
    cudaMemset(dev_image.get(), 0, device_pitch * sizeof(float4));
    print(
      "Allocated device memory. Pitch = {} vs naive {}\n", device_pitch, image_width);
    cudaDeviceSynchronize();
    cuda_throw_error();

    // Build the dynamic object list
    auto mat_red = make_device_object<Lambertian>(float3{0.9, 0.2, 0.2});
    auto mat_green = make_device_object<Lambertian>(float3{0.5, 0.9, 0.5});
    auto mat_metal = make_device_object<Metal>(float3{0.8, 0.8, 0.8});
    auto mat_metal_col = make_device_object<Metal>(float3{0.8, 0.6, 0.2}, 0.2);
    auto mat_di = make_device_object<Dielectric>(1.5);

    float R = cos(pi / 4);
    auto world = std::vector<RayObjects>();
    // auto mats = std::vector<std::unique_ptr<Material, [](Material *ptr) -> void>> {};
    // auto mats = std::vector<std::unique_ptr<Material, void(Material*)>> {};
    std::random_device r;
    std::default_random_engine e1(r());
    std::uniform_real_distribution<float> uniform_float(0, 1);
    std::uniform_real_distribution<float> upper_float(0.5, 1);

    // Make random materials
    // using MaterialPtr = decltype(make_device_object<Material>);
    // using MP = void(Material * ptr);

    auto mats = MaterialLibrary{};

    // auto mats = std::vector<std::shared_ptr<Material>>{};

    auto mat_ground = mats.create<Lambertian>(float3{0.5, 0.5, 0.5});
    world.push_back(make_sphere({0, -1000, 0}, 1000, mat_ground));

    for (int a = -11; a < 11; ++a) {
        for (int b = -11; b < 11; ++b) {
            auto choose_mat = uniform_float(e1);
            float3 center{
              a + 0.9f * uniform_float(e1), 0.2f, b + 0.9f * uniform_float(e1)};
            if (length((center - float3{4, 0.2, 0})) > 0.9) {
                if (choose_mat < 0.8) {
                    float3 albedo = {
                      uniform_float(e1), uniform_float(e1), uniform_float(e1)};
                    albedo *= albedo;
                    world.push_back(
                      make_sphere(center, 0.2, mats.create<Lambertian>(albedo)));
                } else if (choose_mat < 0.95) {
                    float3 albedo = {upper_float(e1), upper_float(e1), upper_float(e1)};
                    auto fuzz = uniform_float(e1) / 2.0f;
                    world.push_back(
                      make_sphere(center, 0.2, mats.create<Metal>(albedo, fuzz)));
                } else {
                    world.push_back(
                      make_sphere(center, 0.2, mats.create<Dielectric>(1.5)));
                }
            }
        }
    }

    world.push_back(make_sphere({0, 1, 0}, 1.0, mats.create<Dielectric>(1.5)));
    world.push_back(
      make_sphere({-4, 1, 0}, 1.0, mats.create<Lambertian>(float3{0.4, 0.2, 0.1})));
    world.push_back(
      make_sphere({4, 1, 0}, 1.0, mats.create<Metal>(float3{0.7, 0.6, 0.5}, 0.0)));

    // constexpr unsigned int N_obj = 5;
    // auto ray_objects = make_cuda_managed_malloc<RayObjects[]>(N_obj);
    // world.push_back(make_sphere({0, 0, -1}, 0.5, mat_red));
    // world.push_back(make_sphere({0, -100.5, -1}, 100, mat_green));
    // world.push_back(make_sphere({-1, 0, -1}, 0.5, mat_di));
    // world.push_back(make_sphere({-1, 0, -1}, -0.4, mat_di));
    // world.push_back(make_sphere({1, 0, -1}, 0.5, mat_metal_col));

    // Copy our list of objects
    auto ray_objects = make_cuda_managed_malloc<RayObjects[]>(world.size());
    std::copy(world.begin(), world.end(), ray_objects.get());

    float3 lookfrom{13, 2, 3};
    float3 lookat{0, 0, 0};
    float3 up{0, 1, 0};
    float dist_to_focus = 10.0f;
    float aperture = 0.1;
    // float3 lookfrom{30, 5, 20};
    // float3 lookat{0, 0, -1};
    // float3 up{0, 1, 0};

    // float3 lookfrom{-2, 0.4, 0.3};
    // float3 lookat{0, 0, -1};
    // float3 up{0, 1, 0};
    // auto dist_to_focus = length(lookfrom - lookat);
    // float aperture = 4.0f;
    auto camera =
      Camera{lookfrom, lookat, up, 20, aspect_ratio, aperture, dist_to_focus};

    // Create and initialize the random states
    CudaEvent rand_start, rand_end;
    rand_start.record();
    auto random_state = make_cuda_malloc<curandState[]>(device_pitch * image_height);
    initRandomState<<<grid_dims, block_dims>>>(
      random_state.get(), image_width, image_height, device_pitch);
    rand_end.record();
    rand_end.synchronize();
    print("Random init time: {}\n", format_time(rand_end - rand_start));

    CudaEvent kernel_start;
    kernel_start.record();
    doShootRays<<<grid_dims, block_dims>>>(dev_image.get(),
                                           device_pitch,
                                           image_width,
                                           image_height,
                                           camera,
                                           ray_objects.get(),
                                           world.size(),
                                           random_state.get());
    cuda_throw_error();

    CudaEvent kernel_done;
    kernel_done.record();
    kernel_done.synchronize();
    print("Kernel time: {}\n", format_time(kernel_done - kernel_start));

    // Copy the memory data back
    auto host_image = std::make_unique<float4[]>(image_width * image_height);
    cudaMemcpy2D(host_image.get(),
                 image_width * sizeof(float4),
                 dev_image.get(),
                 device_pitch * sizeof(float4),
                 image_width * sizeof(float4),
                 image_height,
                 cudaMemcpyDeviceToHost);

    // Convert this to a linear 3-color int
    auto rgb_image = std::make_unique<uchar3[]>(image_width * image_height);
    for (int y = image_height - 1, k = 0; y >= 0; --y) {
        for (int x = 0; x < image_width; ++x, ++k) {
            auto pixel = sqrt(host_image[x + image_width * y]);
            uchar3 out_pixel{
              static_cast<uint8_t>(pixel.x * 255.999),
              static_cast<uint8_t>(pixel.y * 255.999),
              static_cast<uint8_t>(pixel.z * 255.999),
            };
            rgb_image[k] = out_pixel;
        }
    }

    print("Writing {}\n", bold("image.png"));
    auto err = lodepng_encode24_file("image.png",
                                     reinterpret_cast<uint8_t *>(rgb_image.get()),
                                     image_width,
                                     image_height);
    if (err) {
        print("Error: Failed to save png; {}: {}\n", err, lodepng_error_text(err));
        return 1;
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    print("\nAll done in: {}\n", blue(format_time(end_time - start_time)));
 }

	#include <cooperative_groups.h>
	#include <fmt/core.h>
	#include <lodepng.h>

	#include <cmath>
	#include <cstdio>
	#include <limits>
	#include <memory>
	#include <random>
	#include <type_traits>

	#include "common.hpp"
	#include "curand_kernel.h"
	#include "float3.hpp"
	#include "float4.hpp"
	#include "rtweekend.hpp"

	#define CUDA_CALLABLE __forceinline__ __device__ __host__

	namespace cg = cooperative_groups;

	template <typename T, typename... Args>
	__global__ void device_make_object(T **dest_ptr, Args... args) {
	*dest_ptr = new T(args...);
	}
	template <typename T>
	__global__ void device_delete_object(T *obj) {
	delete obj;
	}

	/// Construct a new class object on the device
	template <typename T, typename... Args>
	auto make_device_object(Args... args) {
	// auto obj = make_cuda_managed_malloc<T **>(1);
	T **obj_ref = nullptr;
	cudaMalloc(&obj_ref, sizeof(T **));

	device_make_object<T><<<1, 1>>>(obj_ref, args...);
	auto deleter = [](T *ptr) { device_delete_object<<<1, 1>>>(ptr); };
	T *obj = nullptr;
	cudaMemcpy(&obj, obj_ref, sizeof(T *), cudaMemcpyDeviceToHost);
	cudaDeviceSynchronize();
	return std::shared_ptr<T>(obj, deleter);

	// return std::unique_ptr<T, decltype(deleter)>{obj, deleter};
	}

	class Material;

	class MaterialLibrary {
	public:
	template <typename T, typename... Args>
	auto create(Args... args) -> std::shared_ptr<T> {
	auto mat = make_device_object<T>(args...);
	_materials.push_back(mat);
	return mat;
	}

	private:
	std::vector<std::shared_ptr<Material>> _materials;
	};

	class Ray {
	public:
	Ray() {}
	CUDA_CALLABLE Ray(const float3 &origin, const float3 &direction)
	: _origin(origin), _direction(direction) {}

	CUDA_CALLABLE float3 origin() const {
	return _origin;
	}
	CUDA_CALLABLE float3 direction() const {
	return _direction;
	}

	CUDA_CALLABLE float3 at(float t) const {
	return _origin + t * _direction;
	}

	private:
	float3 _origin;
	float3 _direction;
	};

	__device__ auto random_in_unit_disk(curandState *random_state) -> float3 {
	while (true) {
	float3 p = {2.0f * curand_uniform(random_state) - 1,
	2.0f * curand_uniform(random_state) - 1,
	0};
	if (length_squared(p) >= 1) continue;
	// printf("%.2f, %.2f\n", p.x, p.y);
	return p;
	}
	}

	__device__ auto random_in_unit_sphere(curandState *curand_state) -> float3 {
	while (true) {
	auto vec = float3{curand_uniform(curand_state),
	curand_uniform(curand_state),
	curand_uniform(curand_state)}
	- 0.5f;
	if (length_squared(vec) > 1) {
	continue;
	}
	return vec;
	}
	}

	struct Camera {
	Camera(float3 lookfrom,
	float3 lookat,
	float3 up,
	float vfov,
	float aspect_ratio,
	float aperture,
	float focus_dist) {
	auto theta = rad(vfov);
	auto h = tan(theta / 2);
	auto viewport_height = 2.0f * h;
	auto viewport_width = aspect_ratio * viewport_height;

	w = unit_vector(lookfrom - lookat);
	u = unit_vector(cross(up, w));
	v = cross(w, u);

	origin = lookfrom;
	horizontal = focus_dist * viewport_width * u;
	vertical = focus_dist * viewport_height * v;
	lower_left_corner = origin - horizontal / 2 - vertical / 2 - focus_dist * w;
	lens_radius = aperture / 2;
	}

	float3 origin;
	float3 horizontal;
	float3 vertical;
	float3 lower_left_corner;
	float3 u, v, w;
	float lens_radius;

	__device__ auto get_ray(float s, float t, curandState *random_state) -> Ray {
	float3 rd = lens_radius * random_in_unit_disk(random_state);
	float3 offset = u * rd.x + v * rd.y;

	return Ray(origin + offset,
	lower_left_corner + s * horizontal + t * vertical - origin - offset);
	}
	};

	struct Hit {
	float3 point;
	float3 normal;
	float t;
	bool front_face;
	const Material *material;

	CUDA_CALLABLE void set_face_normal(const Ray &ray, const float3 &outward_normal) {
	front_face = dot(ray.direction(), outward_normal) < 0;
	normal = front_face ? outward_normal : -outward_normal;
	}
	};

	static_assert(std::is_trivial_v<Hit>);

	union RayObjects;
	struct RayObject;

	using HitFunction =
	bool ()(const void hittable, const Ray &ray, float t_min, float t_max, Hit &rec);

	struct RayObject {
	HitFunction hit;
	float3 position;
	const Material *material;
	};

	struct RayObject_Sphere {
	HitFunction hit;
	float3 position;
	float radius;
	const Material *material;
	};

	union RayObjects {
	RayObject obj;
	RayObject_Sphere sphere;
	};

	static_assert(sizeof(RayObject_Sphere) == sizeof(RayObject));
	static_assert(std::is_trivial_v<RayObject_Sphere>);

	__device__ bool hit_sphere(const RayObjects *hittable,
	const Ray &r,
	float t_min,
	float t_max,
	Hit &rec) {
	const RayObject_Sphere *obj = &hittable->sphere;
	float radius = obj->radius;

	float3 oc = r.origin() - obj->position;
	auto a = length_squared(r.direction());
	auto half_b = dot(oc, r.direction());
	auto c = length_squared(oc) - radius * radius;

	auto discriminant = half_b * half_b - a * c;
	if (discriminant < 0) {
	return false;
	}
	float t = (-half_b - std::sqrt(discriminant)) / a;
	// Check the hit isn't behind us
	if (t < t_min \|\| t > t_max) {
	return false;
	}
	rec.t = t;
	rec.point = r.at(rec.t);
	float3 normal = (rec.point - obj->position) / radius;
	rec.set_face_normal(r, normal);
	rec.material = obj->material;

	return true;
	}

	using ptrX = decltype(hit_sphere);

	__device__ HitFunction dev_pHitSphere = reinterpret_cast<HitFunction>(hit_sphere);

	auto make_sphere(float3 position, float radius, const Material *material) {
	HitFunction fun = nullptr;
	cudaMemcpyFromSymbol(&fun, dev_pHitSphere, sizeof(HitFunction *));
	cuda_throw_error();
	return RayObjects{.sphere = RayObject_Sphere(fun, position, radius, material)};
	}

	template <typename T, typename U>
	auto make_sphere(float3 position, float radius, std::unique_ptr<T, U> &material) {
	static_assert(std::is_base_of_v<Material, T>,
	"make_sphere must be passed a material");

	return make_sphere(position, radius, material.get());
	}
	template <typename T>
	auto make_sphere(float3 position, float radius, std::shared_ptr<T> material) {
	static_assert(std::is_base_of_v<Material, T>,
	"make_sphere must be passed a material");

	return make_sphere(position, radius, material.get());
	}
	/// Return the color of a ray that otherwise goes to infinity
	__device__ float3 world_color(const Ray &ray) {
	auto unit_direction = unit_vector(ray.direction());
	auto t = 0.5f * (unit_direction.y + 1.0f);
	return (1.0f - t) * float3{1.0f, 1.0f, 1.0f} + t * float3{0.5f, 0.7f, 1.0f};
	}
	__device__ auto next_hit(const Ray &ray,
	const RayObjects objects[],
	int num_objects,
	Hit *hit) -> bool {
	Hit closest_hit{{}, {}, infinity};
	const RayObjects *closest_object = nullptr;

	for (int x = 0; x < num_objects; ++x) {
	Hit hit;
	const RayObjects *obj = &objects[x];
	bool result = objects[x].obj.hit(obj, ray, 0.001f, 1000.0f, hit);
	if (result) {
	if (hit.t < closest_hit.t) {
	closest_hit = hit;
	closest_object = obj;
	}
	}
	}
	if (closest_object == nullptr) {
	return false;
	}
	*hit = closest_hit;
	return true;
	}

	class Material {
	public:
	__device__ virtual bool scatter(const Ray &ray,
	const Hit &hit,
	float3 &attenuation,
	Ray &scattered,
	curandState *random_state) const = 0;
	};

	class Lambertian : public Material {
	public:
	__device__ Lambertian(const float3 &a) : albedo(a) {}

	__device__ virtual bool scatter(const Ray &ray,
	const Hit &hit,
	float3 &attenuation,
	Ray &scattered,
	curandState *random_state) const override {
	auto scatter_direction =
	hit.normal + unit_vector(random_in_unit_sphere(random_state));
	// Protect against very small vectors
	if (near_zero(scatter_direction)) {
	scatter_direction = hit.normal;
	}
	scattered = Ray{hit.point, scatter_direction};
	attenuation = albedo;
	return true;
	}

	private:
	float3 albedo;
	};

	__device__ auto reflect(const float3 &v, const float3 &n) -> float3 {
	return v - 2 * dot(v, n) * n;
	}
	__device__ auto refract(const float3 &v, const float3 &n, float n_ratio) -> float3 {
	auto cos_theta = min(dot(-v, n), 1.0);
	// float sin_theta = sqrt(1.0f - cos_theta * cos_theta);
	// if (n_ratio * sin_theta > 1.0f) {
	// return reflect(v, n);
	// }
	float3 r_out_perp = n_ratio * (v + cos_theta * n);
	float3 r_out_para = -sqrt(fabs(1.0 - length_squared(r_out_perp))) * n;
	return r_out_perp + r_out_para;
	}

	class Metal : public Material {
	public:
	__device__ Metal(const float3 &a, float fuzz = 0.0f)
	: albedo(a), fuzz(fuzz < 1 ? fuzz : 1.0f) {}
	__device__ virtual bool scatter(const Ray &ray,
	const Hit &hit,
	float3 &attenuation,
	Ray &scattered,
	curandState *random_state) const override {
	auto reflected = reflect(unit_vector(ray.direction()), hit.normal);
	scattered =
	Ray{hit.point, reflected + fuzz * random_in_unit_sphere(random_state)};
	attenuation = albedo;
	return (dot(scattered.direction(), hit.normal) > 0);
	}

	private:
	float3 albedo;
	float fuzz;
	};

	class Dielectric : public Material {
	public:
	__device__ Dielectric(float index_of_refraction, float fuzz = 0.0f)
	: ir(index_of_refraction), fuzz(fuzz) {}
	__device__ virtual bool scatter(const Ray &ray,
	const Hit &hit,
	float3 &attenuation,
	Ray &scattered,
	curandState *random_state) const override {
	attenuation = {1, 1, 1};
	float r_ratio = hit.front_face ? (1.0 / ir) : ir;
	float3 unit_direction = unit_vector(ray.direction());

	double cos_theta = min(dot(-unit_direction, hit.normal), 1.0);
	double sin_theta = sqrt(1.0 - cos_theta * cos_theta);

	bool cannot_refract = r_ratio * sin_theta > 1.0;
	float3 direction;
	if (cannot_refract
	\|\| reflectance(cos_theta, r_ratio) > curand_uniform(random_state)) {
	direction = reflect(unit_direction, hit.normal);
	} else {
	direction = refract(unit_direction, hit.normal, r_ratio);
	}
	scattered = Ray{hit.point, direction};
	if (fuzz > 1e-6) {
	scattered =
	Ray{ray.origin(),
	ray.direction() + 0.05 * random_in_unit_sphere(random_state)};
	}

	return true;
	}

	private:
	__device__ static auto reflectance(float cosine, float ref_idx) -> float {
	auto r0 = (1 - ref_idx) / (1 + ref_idx);
	r0 = r0 * r0;
	return r0 + (1 - r0) * pow((1 - cosine), 5);
	}
	float ir;
	float fuzz;
	};
	__device__ float3 ray_color(const Ray &ray,
	RayObjects objects[],
	int num_objects,
	curandState *random_state) {
	int depth = 50;

	// We accumulate the colour over multiple hits
	float3 albedo{1, 1, 1};
	// float contribution = 1.0f;
	Hit hit;
	Ray next_ray = ray;
	// Accumulate hits until we have enough bounces or hit nothing
	while (--depth > 0) {
	bool result = next_hit(next_ray, objects, num_objects, &hit);

	if (result) {
	// Assume 50% absorption.
	// contribution *= 0.5f;
	// Determine the next ray from this point
	float3 attenuation{};
	if (hit.material->scatter(
	next_ray, hit, attenuation, next_ray, random_state)) {
	albedo *= attenuation;
	}

	} else {
	// We didn't hit any other objects,
	auto wc = world_color(next_ray);
	albedo *= {wc.x, wc.y, wc.z};
	break;
	}
	}
	return albedo;
	}

	__global__ void doShootRays(float4 data[],
	int data_pitch,
	int width,
	int height,
	Camera camera,
	RayObjects objects[],
	int num_objects,
	curandState rand_state[]) {
	// Assumptions:
	// - One thread per pixel
	auto grid = cg::this_grid();
	auto block = cg::this_thread_block();

	constexpr int SAMPLES_PER_PIXEL = 100;

	// Work out the position in pixel space
	int image_x =
	block.group_index().x * block.dim_threads().x + block.thread_index().x;
	int image_y =
	(block.group_index().y * block.dim_threads().y + block.thread_index().y);

	if (image_x > width \|\| image_y > height) return;

	auto random_state = rand_state[image_x + image_y * data_pitch];
	float4 color{};

	for (int n = 0; n < SAMPLES_PER_PIXEL; ++n) {
	// Fractions across viewport
	auto u =
	(static_cast<float>(image_x) + curand_uniform(&random_state)) / (width - 1);
	auto v =
	(static_cast<float>(image_y) + curand_uniform(&random_state)) / (height - 1);

	auto ray = camera.get_ray(u, v, &random_state);
	auto pixel_color = ray_color(ray, objects, num_objects, &random_state);
	color = float4{color.x + pixel_color.x,
	color.y + pixel_color.y,
	color.z + pixel_color.z,
	color.w + 1.0f};
	}

	data[image_x + image_y * data_pitch] = {
	color.x / color.w, color.y / color.w, color.z / color.w, 1.0};

	rand_state[image_x + image_y * data_pitch] = random_state;
	}

	__global__ void initRandomState(curandState rand_state[],
	size_t width,
	size_t height,
	size_t pitch) {
	auto grid = cg::this_grid();
	auto block = cg::this_thread_block();
	int image_x =
	block.group_index().x * block.dim_threads().x + block.thread_index().x;
	int image_y =
	(block.group_index().y * block.dim_threads().y + block.thread_index().y);
	if (image_x >= width \|\| image_y >= height) return;

	size_t pixel_id = image_y * pitch + image_x;
	curand_init(pixel_id, 0, 0, &rand_state[pixel_id]);
	}

	int main(void) {
	auto start_time = std::chrono::high_resolution_clock::now();

	const unsigned int image_width = 1600;
	const unsigned int image_height = 900;

	const float aspect_ratio = (float)image_width / (float)image_height;

	// Calculate the block size
	auto block_dims = dim3{32, 32};
	// And now the block size. One thread per pixel.
	auto grid_dims = dim3{
	static_cast<unsigned int>(
	std::ceil(static_cast<float>(image_width) / block_dims.x)),
	static_cast<unsigned int>(
	std::ceil(static_cast<float>(image_height) / block_dims.y)),
	};
	print("Image: {:4d} x {:<4d}\n", image_width, image_height);
	print("Threads: {:4d} x {:<4d} = {}\n",
	block_dims.x,
	block_dims.y,
	block_dims.x * block_dims.y);
	print("Blocks: {:4d} x {:<4d} = {}\n",
	grid_dims.x,
	grid_dims.y,
	grid_dims.x * grid_dims.y);

	auto [dev_image, device_pitch] =
	make_cuda_pitched_malloc<float4>(image_width, image_height);
	cudaMemset(dev_image.get(), 0, device_pitch * sizeof(float4));
	print(
	"Allocated device memory. Pitch = {} vs naive {}\n", device_pitch, image_width);
	cudaDeviceSynchronize();
	cuda_throw_error();

	// Build the dynamic object list
	auto mat_red = make_device_object<Lambertian>(float3{0.9, 0.2, 0.2});
	auto mat_green = make_device_object<Lambertian>(float3{0.5, 0.9, 0.5});
	auto mat_metal = make_device_object<Metal>(float3{0.8, 0.8, 0.8});
	auto mat_metal_col = make_device_object<Metal>(float3{0.8, 0.6, 0.2}, 0.2);
	auto mat_di = make_device_object<Dielectric>(1.5);

	float R = cos(pi / 4);
	auto world = std::vector<RayObjects>();
	// auto mats = std::vector<std::unique_ptr<Material, [](Material *ptr) -> void>> {};
	// auto mats = std::vector<std::unique_ptr<Material, void(Material*)>> {};
	std::random_device r;
	std::default_random_engine e1(r());
	std::uniform_real_distribution<float> uniform_float(0, 1);
	std::uniform_real_distribution<float> upper_float(0.5, 1);

	// Make random materials
	// using MaterialPtr = decltype(make_device_object<Material>);
	// using MP = void(Material * ptr);

	auto mats = MaterialLibrary{};

	// auto mats = std::vector<std::shared_ptr<Material>>{};

	auto mat_ground = mats.create<Lambertian>(float3{0.5, 0.5, 0.5});
	world.push_back(make_sphere({0, -1000, 0}, 1000, mat_ground));

	for (int a = -11; a < 11; ++a) {
	for (int b = -11; b < 11; ++b) {
	auto choose_mat = uniform_float(e1);
	float3 center{
	a + 0.9f * uniform_float(e1), 0.2f, b + 0.9f * uniform_float(e1)};
	if (length((center - float3{4, 0.2, 0})) > 0.9) {
	if (choose_mat < 0.8) {
	float3 albedo = {
	uniform_float(e1), uniform_float(e1), uniform_float(e1)};
	albedo *= albedo;
	world.push_back(
	make_sphere(center, 0.2, mats.create<Lambertian>(albedo)));
	} else if (choose_mat < 0.95) {
	float3 albedo = {upper_float(e1), upper_float(e1), upper_float(e1)};
	auto fuzz = uniform_float(e1) / 2.0f;
	world.push_back(
	make_sphere(center, 0.2, mats.create<Metal>(albedo, fuzz)));
	} else {
	world.push_back(
	make_sphere(center, 0.2, mats.create<Dielectric>(1.5)));
	}
	}
	}
	}

	world.push_back(make_sphere({0, 1, 0}, 1.0, mats.create<Dielectric>(1.5)));
	world.push_back(
	make_sphere({-4, 1, 0}, 1.0, mats.create<Lambertian>(float3{0.4, 0.2, 0.1})));
	world.push_back(
	make_sphere({4, 1, 0}, 1.0, mats.create<Metal>(float3{0.7, 0.6, 0.5}, 0.0)));

	// constexpr unsigned int N_obj = 5;
	// auto ray_objects = make_cuda_managed_malloc<RayObjects[]>(N_obj);
	// world.push_back(make_sphere({0, 0, -1}, 0.5, mat_red));
	// world.push_back(make_sphere({0, -100.5, -1}, 100, mat_green));
	// world.push_back(make_sphere({-1, 0, -1}, 0.5, mat_di));
	// world.push_back(make_sphere({-1, 0, -1}, -0.4, mat_di));
	// world.push_back(make_sphere({1, 0, -1}, 0.5, mat_metal_col));

	// Copy our list of objects
	auto ray_objects = make_cuda_managed_malloc<RayObjects[]>(world.size());
	std::copy(world.begin(), world.end(), ray_objects.get());

	float3 lookfrom{13, 2, 3};
	float3 lookat{0, 0, 0};
	float3 up{0, 1, 0};
	float dist_to_focus = 10.0f;
	float aperture = 0.1;
	// float3 lookfrom{30, 5, 20};
	// float3 lookat{0, 0, -1};
	// float3 up{0, 1, 0};

	// float3 lookfrom{-2, 0.4, 0.3};
	// float3 lookat{0, 0, -1};
	// float3 up{0, 1, 0};
	// auto dist_to_focus = length(lookfrom - lookat);
	// float aperture = 4.0f;
	auto camera =
	Camera{lookfrom, lookat, up, 20, aspect_ratio, aperture, dist_to_focus};

	// Create and initialize the random states
	CudaEvent rand_start, rand_end;
	rand_start.record();
	auto random_state = make_cuda_malloc<curandState[]>(device_pitch * image_height);
	initRandomState<<<grid_dims, block_dims>>>(
	random_state.get(), image_width, image_height, device_pitch);
	rand_end.record();
	rand_end.synchronize();
	print("Random init time: {}\n", format_time(rand_end - rand_start));

	CudaEvent kernel_start;
	kernel_start.record();
	doShootRays<<<grid_dims, block_dims>>>(dev_image.get(),
	device_pitch,
	image_width,
	image_height,
	camera,
	ray_objects.get(),
	world.size(),
	random_state.get());
	cuda_throw_error();

	CudaEvent kernel_done;
	kernel_done.record();
	kernel_done.synchronize();
	print("Kernel time: {}\n", format_time(kernel_done - kernel_start));

	// Copy the memory data back
	auto host_image = std::make_unique<float4[]>(image_width * image_height);
	cudaMemcpy2D(host_image.get(),
	image_width * sizeof(float4),
	dev_image.get(),
	device_pitch * sizeof(float4),
	image_width * sizeof(float4),
	image_height,
	cudaMemcpyDeviceToHost);

	// Convert this to a linear 3-color int
	auto rgb_image = std::make_unique<uchar3[]>(image_width * image_height);
	for (int y = image_height - 1, k = 0; y >= 0; --y) {
	for (int x = 0; x < image_width; ++x, ++k) {
	auto pixel = sqrt(host_image[x + image_width * y]);
	uchar3 out_pixel{
	static_cast<uint8_t>(pixel.x * 255.999),
	static_cast<uint8_t>(pixel.y * 255.999),
	static_cast<uint8_t>(pixel.z * 255.999),
	};
	rgb_image[k] = out_pixel;
	}
	}

	print("Writing {}\n", bold("image.png"));
	auto err = lodepng_encode24_file("image.png",
	reinterpret_cast<uint8_t *>(rgb_image.get()),
	image_width,
	image_height);
	if (err) {
	print("Error: Failed to save png; {}: {}\n", err, lodepng_error_text(err));
	return 1;
	}

	auto end_time = std::chrono::high_resolution_clock::now();
	print("\nAll done in: {}\n", blue(format_time(end_time - start_time)));
	}