Created
February 24, 2023 10:40
-
-
Save ndevenish/c4567f43ff2805b72ab82d042acb4700 to your computer and use it in GitHub Desktop.
End result of "Ray tracing in a weekend"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cooperative_groups.h> | |
#include <fmt/core.h> | |
#include <lodepng.h> | |
#include <cmath> | |
#include <cstdio> | |
#include <limits> | |
#include <memory> | |
#include <random> | |
#include <type_traits> | |
#include "common.hpp" | |
#include "curand_kernel.h" | |
#include "float3.hpp" | |
#include "float4.hpp" | |
#include "rtweekend.hpp" | |
#define CUDA_CALLABLE __forceinline__ __device__ __host__ | |
namespace cg = cooperative_groups; | |
template <typename T, typename... Args> | |
__global__ void device_make_object(T **dest_ptr, Args... args) { | |
*dest_ptr = new T(args...); | |
} | |
template <typename T> | |
__global__ void device_delete_object(T *obj) { | |
delete obj; | |
} | |
/// Construct a new class object on the device | |
template <typename T, typename... Args> | |
auto make_device_object(Args... args) { | |
// auto obj = make_cuda_managed_malloc<T **>(1); | |
T **obj_ref = nullptr; | |
cudaMalloc(&obj_ref, sizeof(T **)); | |
device_make_object<T><<<1, 1>>>(obj_ref, args...); | |
auto deleter = [](T *ptr) { device_delete_object<<<1, 1>>>(ptr); }; | |
T *obj = nullptr; | |
cudaMemcpy(&obj, obj_ref, sizeof(T *), cudaMemcpyDeviceToHost); | |
cudaDeviceSynchronize(); | |
return std::shared_ptr<T>(obj, deleter); | |
// return std::unique_ptr<T, decltype(deleter)>{obj, deleter}; | |
} | |
class Material; | |
class MaterialLibrary { | |
public: | |
template <typename T, typename... Args> | |
auto create(Args... args) -> std::shared_ptr<T> { | |
auto mat = make_device_object<T>(args...); | |
_materials.push_back(mat); | |
return mat; | |
} | |
private: | |
std::vector<std::shared_ptr<Material>> _materials; | |
}; | |
class Ray { | |
public: | |
Ray() {} | |
CUDA_CALLABLE Ray(const float3 &origin, const float3 &direction) | |
: _origin(origin), _direction(direction) {} | |
CUDA_CALLABLE float3 origin() const { | |
return _origin; | |
} | |
CUDA_CALLABLE float3 direction() const { | |
return _direction; | |
} | |
CUDA_CALLABLE float3 at(float t) const { | |
return _origin + t * _direction; | |
} | |
private: | |
float3 _origin; | |
float3 _direction; | |
}; | |
__device__ auto random_in_unit_disk(curandState *random_state) -> float3 { | |
while (true) { | |
float3 p = {2.0f * curand_uniform(random_state) - 1, | |
2.0f * curand_uniform(random_state) - 1, | |
0}; | |
if (length_squared(p) >= 1) continue; | |
// printf("%.2f, %.2f\n", p.x, p.y); | |
return p; | |
} | |
} | |
__device__ auto random_in_unit_sphere(curandState *curand_state) -> float3 { | |
while (true) { | |
auto vec = float3{curand_uniform(curand_state), | |
curand_uniform(curand_state), | |
curand_uniform(curand_state)} | |
- 0.5f; | |
if (length_squared(vec) > 1) { | |
continue; | |
} | |
return vec; | |
} | |
} | |
struct Camera { | |
Camera(float3 lookfrom, | |
float3 lookat, | |
float3 up, | |
float vfov, | |
float aspect_ratio, | |
float aperture, | |
float focus_dist) { | |
auto theta = rad(vfov); | |
auto h = tan(theta / 2); | |
auto viewport_height = 2.0f * h; | |
auto viewport_width = aspect_ratio * viewport_height; | |
w = unit_vector(lookfrom - lookat); | |
u = unit_vector(cross(up, w)); | |
v = cross(w, u); | |
origin = lookfrom; | |
horizontal = focus_dist * viewport_width * u; | |
vertical = focus_dist * viewport_height * v; | |
lower_left_corner = origin - horizontal / 2 - vertical / 2 - focus_dist * w; | |
lens_radius = aperture / 2; | |
} | |
float3 origin; | |
float3 horizontal; | |
float3 vertical; | |
float3 lower_left_corner; | |
float3 u, v, w; | |
float lens_radius; | |
__device__ auto get_ray(float s, float t, curandState *random_state) -> Ray { | |
float3 rd = lens_radius * random_in_unit_disk(random_state); | |
float3 offset = u * rd.x + v * rd.y; | |
return Ray(origin + offset, | |
lower_left_corner + s * horizontal + t * vertical - origin - offset); | |
} | |
}; | |
struct Hit { | |
float3 point; | |
float3 normal; | |
float t; | |
bool front_face; | |
const Material *material; | |
CUDA_CALLABLE void set_face_normal(const Ray &ray, const float3 &outward_normal) { | |
front_face = dot(ray.direction(), outward_normal) < 0; | |
normal = front_face ? outward_normal : -outward_normal; | |
} | |
}; | |
static_assert(std::is_trivial_v<Hit>); | |
union RayObjects; | |
struct RayObject; | |
using HitFunction = | |
bool (*)(const void *hittable, const Ray &ray, float t_min, float t_max, Hit &rec); | |
struct RayObject { | |
HitFunction hit; | |
float3 position; | |
const Material *material; | |
}; | |
struct RayObject_Sphere { | |
HitFunction hit; | |
float3 position; | |
float radius; | |
const Material *material; | |
}; | |
union RayObjects { | |
RayObject obj; | |
RayObject_Sphere sphere; | |
}; | |
static_assert(sizeof(RayObject_Sphere) == sizeof(RayObject)); | |
static_assert(std::is_trivial_v<RayObject_Sphere>); | |
__device__ bool hit_sphere(const RayObjects *hittable, | |
const Ray &r, | |
float t_min, | |
float t_max, | |
Hit &rec) { | |
const RayObject_Sphere *obj = &hittable->sphere; | |
float radius = obj->radius; | |
float3 oc = r.origin() - obj->position; | |
auto a = length_squared(r.direction()); | |
auto half_b = dot(oc, r.direction()); | |
auto c = length_squared(oc) - radius * radius; | |
auto discriminant = half_b * half_b - a * c; | |
if (discriminant < 0) { | |
return false; | |
} | |
float t = (-half_b - std::sqrt(discriminant)) / a; | |
// Check the hit isn't behind us | |
if (t < t_min || t > t_max) { | |
return false; | |
} | |
rec.t = t; | |
rec.point = r.at(rec.t); | |
float3 normal = (rec.point - obj->position) / radius; | |
rec.set_face_normal(r, normal); | |
rec.material = obj->material; | |
return true; | |
} | |
using ptrX = decltype(hit_sphere); | |
__device__ HitFunction dev_pHitSphere = reinterpret_cast<HitFunction>(hit_sphere); | |
auto make_sphere(float3 position, float radius, const Material *material) { | |
HitFunction fun = nullptr; | |
cudaMemcpyFromSymbol(&fun, dev_pHitSphere, sizeof(HitFunction *)); | |
cuda_throw_error(); | |
return RayObjects{.sphere = RayObject_Sphere(fun, position, radius, material)}; | |
} | |
template <typename T, typename U> | |
auto make_sphere(float3 position, float radius, std::unique_ptr<T, U> &material) { | |
static_assert(std::is_base_of_v<Material, T>, | |
"make_sphere must be passed a material"); | |
return make_sphere(position, radius, material.get()); | |
} | |
template <typename T> | |
auto make_sphere(float3 position, float radius, std::shared_ptr<T> material) { | |
static_assert(std::is_base_of_v<Material, T>, | |
"make_sphere must be passed a material"); | |
return make_sphere(position, radius, material.get()); | |
} | |
/// Return the color of a ray that otherwise goes to infinity | |
__device__ float3 world_color(const Ray &ray) { | |
auto unit_direction = unit_vector(ray.direction()); | |
auto t = 0.5f * (unit_direction.y + 1.0f); | |
return (1.0f - t) * float3{1.0f, 1.0f, 1.0f} + t * float3{0.5f, 0.7f, 1.0f}; | |
} | |
__device__ auto next_hit(const Ray &ray, | |
const RayObjects objects[], | |
int num_objects, | |
Hit *hit) -> bool { | |
Hit closest_hit{{}, {}, infinity}; | |
const RayObjects *closest_object = nullptr; | |
for (int x = 0; x < num_objects; ++x) { | |
Hit hit; | |
const RayObjects *obj = &objects[x]; | |
bool result = objects[x].obj.hit(obj, ray, 0.001f, 1000.0f, hit); | |
if (result) { | |
if (hit.t < closest_hit.t) { | |
closest_hit = hit; | |
closest_object = obj; | |
} | |
} | |
} | |
if (closest_object == nullptr) { | |
return false; | |
} | |
*hit = closest_hit; | |
return true; | |
} | |
class Material { | |
public: | |
__device__ virtual bool scatter(const Ray &ray, | |
const Hit &hit, | |
float3 &attenuation, | |
Ray &scattered, | |
curandState *random_state) const = 0; | |
}; | |
class Lambertian : public Material { | |
public: | |
__device__ Lambertian(const float3 &a) : albedo(a) {} | |
__device__ virtual bool scatter(const Ray &ray, | |
const Hit &hit, | |
float3 &attenuation, | |
Ray &scattered, | |
curandState *random_state) const override { | |
auto scatter_direction = | |
hit.normal + unit_vector(random_in_unit_sphere(random_state)); | |
// Protect against very small vectors | |
if (near_zero(scatter_direction)) { | |
scatter_direction = hit.normal; | |
} | |
scattered = Ray{hit.point, scatter_direction}; | |
attenuation = albedo; | |
return true; | |
} | |
private: | |
float3 albedo; | |
}; | |
__device__ auto reflect(const float3 &v, const float3 &n) -> float3 { | |
return v - 2 * dot(v, n) * n; | |
} | |
__device__ auto refract(const float3 &v, const float3 &n, float n_ratio) -> float3 { | |
auto cos_theta = min(dot(-v, n), 1.0); | |
// float sin_theta = sqrt(1.0f - cos_theta * cos_theta); | |
// if (n_ratio * sin_theta > 1.0f) { | |
// return reflect(v, n); | |
// } | |
float3 r_out_perp = n_ratio * (v + cos_theta * n); | |
float3 r_out_para = -sqrt(fabs(1.0 - length_squared(r_out_perp))) * n; | |
return r_out_perp + r_out_para; | |
} | |
class Metal : public Material { | |
public: | |
__device__ Metal(const float3 &a, float fuzz = 0.0f) | |
: albedo(a), fuzz(fuzz < 1 ? fuzz : 1.0f) {} | |
__device__ virtual bool scatter(const Ray &ray, | |
const Hit &hit, | |
float3 &attenuation, | |
Ray &scattered, | |
curandState *random_state) const override { | |
auto reflected = reflect(unit_vector(ray.direction()), hit.normal); | |
scattered = | |
Ray{hit.point, reflected + fuzz * random_in_unit_sphere(random_state)}; | |
attenuation = albedo; | |
return (dot(scattered.direction(), hit.normal) > 0); | |
} | |
private: | |
float3 albedo; | |
float fuzz; | |
}; | |
class Dielectric : public Material { | |
public: | |
__device__ Dielectric(float index_of_refraction, float fuzz = 0.0f) | |
: ir(index_of_refraction), fuzz(fuzz) {} | |
__device__ virtual bool scatter(const Ray &ray, | |
const Hit &hit, | |
float3 &attenuation, | |
Ray &scattered, | |
curandState *random_state) const override { | |
attenuation = {1, 1, 1}; | |
float r_ratio = hit.front_face ? (1.0 / ir) : ir; | |
float3 unit_direction = unit_vector(ray.direction()); | |
double cos_theta = min(dot(-unit_direction, hit.normal), 1.0); | |
double sin_theta = sqrt(1.0 - cos_theta * cos_theta); | |
bool cannot_refract = r_ratio * sin_theta > 1.0; | |
float3 direction; | |
if (cannot_refract | |
|| reflectance(cos_theta, r_ratio) > curand_uniform(random_state)) { | |
direction = reflect(unit_direction, hit.normal); | |
} else { | |
direction = refract(unit_direction, hit.normal, r_ratio); | |
} | |
scattered = Ray{hit.point, direction}; | |
if (fuzz > 1e-6) { | |
scattered = | |
Ray{ray.origin(), | |
ray.direction() + 0.05 * random_in_unit_sphere(random_state)}; | |
} | |
return true; | |
} | |
private: | |
__device__ static auto reflectance(float cosine, float ref_idx) -> float { | |
auto r0 = (1 - ref_idx) / (1 + ref_idx); | |
r0 = r0 * r0; | |
return r0 + (1 - r0) * pow((1 - cosine), 5); | |
} | |
float ir; | |
float fuzz; | |
}; | |
__device__ float3 ray_color(const Ray &ray, | |
RayObjects objects[], | |
int num_objects, | |
curandState *random_state) { | |
int depth = 50; | |
// We accumulate the colour over multiple hits | |
float3 albedo{1, 1, 1}; | |
// float contribution = 1.0f; | |
Hit hit; | |
Ray next_ray = ray; | |
// Accumulate hits until we have enough bounces or hit nothing | |
while (--depth > 0) { | |
bool result = next_hit(next_ray, objects, num_objects, &hit); | |
if (result) { | |
// Assume 50% absorption. | |
// contribution *= 0.5f; | |
// Determine the next ray from this point | |
float3 attenuation{}; | |
if (hit.material->scatter( | |
next_ray, hit, attenuation, next_ray, random_state)) { | |
albedo *= attenuation; | |
} | |
} else { | |
// We didn't hit any other objects, | |
auto wc = world_color(next_ray); | |
albedo *= {wc.x, wc.y, wc.z}; | |
break; | |
} | |
} | |
return albedo; | |
} | |
__global__ void doShootRays(float4 data[], | |
int data_pitch, | |
int width, | |
int height, | |
Camera camera, | |
RayObjects objects[], | |
int num_objects, | |
curandState rand_state[]) { | |
// Assumptions: | |
// - One thread per pixel | |
auto grid = cg::this_grid(); | |
auto block = cg::this_thread_block(); | |
constexpr int SAMPLES_PER_PIXEL = 100; | |
// Work out the position in pixel space | |
int image_x = | |
block.group_index().x * block.dim_threads().x + block.thread_index().x; | |
int image_y = | |
(block.group_index().y * block.dim_threads().y + block.thread_index().y); | |
if (image_x > width || image_y > height) return; | |
auto random_state = rand_state[image_x + image_y * data_pitch]; | |
float4 color{}; | |
for (int n = 0; n < SAMPLES_PER_PIXEL; ++n) { | |
// Fractions across viewport | |
auto u = | |
(static_cast<float>(image_x) + curand_uniform(&random_state)) / (width - 1); | |
auto v = | |
(static_cast<float>(image_y) + curand_uniform(&random_state)) / (height - 1); | |
auto ray = camera.get_ray(u, v, &random_state); | |
auto pixel_color = ray_color(ray, objects, num_objects, &random_state); | |
color = float4{color.x + pixel_color.x, | |
color.y + pixel_color.y, | |
color.z + pixel_color.z, | |
color.w + 1.0f}; | |
} | |
data[image_x + image_y * data_pitch] = { | |
color.x / color.w, color.y / color.w, color.z / color.w, 1.0}; | |
rand_state[image_x + image_y * data_pitch] = random_state; | |
} | |
__global__ void initRandomState(curandState rand_state[], | |
size_t width, | |
size_t height, | |
size_t pitch) { | |
auto grid = cg::this_grid(); | |
auto block = cg::this_thread_block(); | |
int image_x = | |
block.group_index().x * block.dim_threads().x + block.thread_index().x; | |
int image_y = | |
(block.group_index().y * block.dim_threads().y + block.thread_index().y); | |
if (image_x >= width || image_y >= height) return; | |
size_t pixel_id = image_y * pitch + image_x; | |
curand_init(pixel_id, 0, 0, &rand_state[pixel_id]); | |
} | |
int main(void) { | |
auto start_time = std::chrono::high_resolution_clock::now(); | |
const unsigned int image_width = 1600; | |
const unsigned int image_height = 900; | |
const float aspect_ratio = (float)image_width / (float)image_height; | |
// Calculate the block size | |
auto block_dims = dim3{32, 32}; | |
// And now the block size. One thread per pixel. | |
auto grid_dims = dim3{ | |
static_cast<unsigned int>( | |
std::ceil(static_cast<float>(image_width) / block_dims.x)), | |
static_cast<unsigned int>( | |
std::ceil(static_cast<float>(image_height) / block_dims.y)), | |
}; | |
print("Image: {:4d} x {:<4d}\n", image_width, image_height); | |
print("Threads: {:4d} x {:<4d} = {}\n", | |
block_dims.x, | |
block_dims.y, | |
block_dims.x * block_dims.y); | |
print("Blocks: {:4d} x {:<4d} = {}\n", | |
grid_dims.x, | |
grid_dims.y, | |
grid_dims.x * grid_dims.y); | |
auto [dev_image, device_pitch] = | |
make_cuda_pitched_malloc<float4>(image_width, image_height); | |
cudaMemset(dev_image.get(), 0, device_pitch * sizeof(float4)); | |
print( | |
"Allocated device memory. Pitch = {} vs naive {}\n", device_pitch, image_width); | |
cudaDeviceSynchronize(); | |
cuda_throw_error(); | |
// Build the dynamic object list | |
auto mat_red = make_device_object<Lambertian>(float3{0.9, 0.2, 0.2}); | |
auto mat_green = make_device_object<Lambertian>(float3{0.5, 0.9, 0.5}); | |
auto mat_metal = make_device_object<Metal>(float3{0.8, 0.8, 0.8}); | |
auto mat_metal_col = make_device_object<Metal>(float3{0.8, 0.6, 0.2}, 0.2); | |
auto mat_di = make_device_object<Dielectric>(1.5); | |
float R = cos(pi / 4); | |
auto world = std::vector<RayObjects>(); | |
// auto mats = std::vector<std::unique_ptr<Material, [](Material *ptr) -> void>> {}; | |
// auto mats = std::vector<std::unique_ptr<Material, void(Material*)>> {}; | |
std::random_device r; | |
std::default_random_engine e1(r()); | |
std::uniform_real_distribution<float> uniform_float(0, 1); | |
std::uniform_real_distribution<float> upper_float(0.5, 1); | |
// Make random materials | |
// using MaterialPtr = decltype(make_device_object<Material>); | |
// using MP = void(Material * ptr); | |
auto mats = MaterialLibrary{}; | |
// auto mats = std::vector<std::shared_ptr<Material>>{}; | |
auto mat_ground = mats.create<Lambertian>(float3{0.5, 0.5, 0.5}); | |
world.push_back(make_sphere({0, -1000, 0}, 1000, mat_ground)); | |
for (int a = -11; a < 11; ++a) { | |
for (int b = -11; b < 11; ++b) { | |
auto choose_mat = uniform_float(e1); | |
float3 center{ | |
a + 0.9f * uniform_float(e1), 0.2f, b + 0.9f * uniform_float(e1)}; | |
if (length((center - float3{4, 0.2, 0})) > 0.9) { | |
if (choose_mat < 0.8) { | |
float3 albedo = { | |
uniform_float(e1), uniform_float(e1), uniform_float(e1)}; | |
albedo *= albedo; | |
world.push_back( | |
make_sphere(center, 0.2, mats.create<Lambertian>(albedo))); | |
} else if (choose_mat < 0.95) { | |
float3 albedo = {upper_float(e1), upper_float(e1), upper_float(e1)}; | |
auto fuzz = uniform_float(e1) / 2.0f; | |
world.push_back( | |
make_sphere(center, 0.2, mats.create<Metal>(albedo, fuzz))); | |
} else { | |
world.push_back( | |
make_sphere(center, 0.2, mats.create<Dielectric>(1.5))); | |
} | |
} | |
} | |
} | |
world.push_back(make_sphere({0, 1, 0}, 1.0, mats.create<Dielectric>(1.5))); | |
world.push_back( | |
make_sphere({-4, 1, 0}, 1.0, mats.create<Lambertian>(float3{0.4, 0.2, 0.1}))); | |
world.push_back( | |
make_sphere({4, 1, 0}, 1.0, mats.create<Metal>(float3{0.7, 0.6, 0.5}, 0.0))); | |
// constexpr unsigned int N_obj = 5; | |
// auto ray_objects = make_cuda_managed_malloc<RayObjects[]>(N_obj); | |
// world.push_back(make_sphere({0, 0, -1}, 0.5, mat_red)); | |
// world.push_back(make_sphere({0, -100.5, -1}, 100, mat_green)); | |
// world.push_back(make_sphere({-1, 0, -1}, 0.5, mat_di)); | |
// world.push_back(make_sphere({-1, 0, -1}, -0.4, mat_di)); | |
// world.push_back(make_sphere({1, 0, -1}, 0.5, mat_metal_col)); | |
// Copy our list of objects | |
auto ray_objects = make_cuda_managed_malloc<RayObjects[]>(world.size()); | |
std::copy(world.begin(), world.end(), ray_objects.get()); | |
float3 lookfrom{13, 2, 3}; | |
float3 lookat{0, 0, 0}; | |
float3 up{0, 1, 0}; | |
float dist_to_focus = 10.0f; | |
float aperture = 0.1; | |
// float3 lookfrom{30, 5, 20}; | |
// float3 lookat{0, 0, -1}; | |
// float3 up{0, 1, 0}; | |
// float3 lookfrom{-2, 0.4, 0.3}; | |
// float3 lookat{0, 0, -1}; | |
// float3 up{0, 1, 0}; | |
// auto dist_to_focus = length(lookfrom - lookat); | |
// float aperture = 4.0f; | |
auto camera = | |
Camera{lookfrom, lookat, up, 20, aspect_ratio, aperture, dist_to_focus}; | |
// Create and initialize the random states | |
CudaEvent rand_start, rand_end; | |
rand_start.record(); | |
auto random_state = make_cuda_malloc<curandState[]>(device_pitch * image_height); | |
initRandomState<<<grid_dims, block_dims>>>( | |
random_state.get(), image_width, image_height, device_pitch); | |
rand_end.record(); | |
rand_end.synchronize(); | |
print("Random init time: {}\n", format_time(rand_end - rand_start)); | |
CudaEvent kernel_start; | |
kernel_start.record(); | |
doShootRays<<<grid_dims, block_dims>>>(dev_image.get(), | |
device_pitch, | |
image_width, | |
image_height, | |
camera, | |
ray_objects.get(), | |
world.size(), | |
random_state.get()); | |
cuda_throw_error(); | |
CudaEvent kernel_done; | |
kernel_done.record(); | |
kernel_done.synchronize(); | |
print("Kernel time: {}\n", format_time(kernel_done - kernel_start)); | |
// Copy the memory data back | |
auto host_image = std::make_unique<float4[]>(image_width * image_height); | |
cudaMemcpy2D(host_image.get(), | |
image_width * sizeof(float4), | |
dev_image.get(), | |
device_pitch * sizeof(float4), | |
image_width * sizeof(float4), | |
image_height, | |
cudaMemcpyDeviceToHost); | |
// Convert this to a linear 3-color int | |
auto rgb_image = std::make_unique<uchar3[]>(image_width * image_height); | |
for (int y = image_height - 1, k = 0; y >= 0; --y) { | |
for (int x = 0; x < image_width; ++x, ++k) { | |
auto pixel = sqrt(host_image[x + image_width * y]); | |
uchar3 out_pixel{ | |
static_cast<uint8_t>(pixel.x * 255.999), | |
static_cast<uint8_t>(pixel.y * 255.999), | |
static_cast<uint8_t>(pixel.z * 255.999), | |
}; | |
rgb_image[k] = out_pixel; | |
} | |
} | |
print("Writing {}\n", bold("image.png")); | |
auto err = lodepng_encode24_file("image.png", | |
reinterpret_cast<uint8_t *>(rgb_image.get()), | |
image_width, | |
image_height); | |
if (err) { | |
print("Error: Failed to save png; {}: {}\n", err, lodepng_error_text(err)); | |
return 1; | |
} | |
auto end_time = std::chrono::high_resolution_clock::now(); | |
print("\nAll done in: {}\n", blue(format_time(end_time - start_time))); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment