day_6_1_gpu.rs
use std::borrow;
use wgpu::util::DeviceExt as _;

// ===========
// === Run ===
// ===========
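/// Runs the day 6 part 1 solution on the GPU and returns the product of the per-race win counts.
///
/// A minimal usage sketch (an assumption, not part of the original gist); `input.txt` is a
/// hypothetical file holding the raw puzzle input in the fixed-width layout `parse_input`
/// expects:
///
/// ```ignore
/// let input = std::fs::read_to_string("input.txt").expect("failed to read input");
/// println!("{}", day_6_1_gpu::run(&input));
/// ```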
pub fn run(s: &str) -> u64 {
    pollster::block_on(async {
        let numbers = parse_input(s);
        execute_gpu(&numbers).await.unwrap()
    })
}

// ===================
// === Parse Input ===
// ===================
#[inline(always)]
pub fn parse_input(s: &str) -> [u32; 8] {
    const PREFIX: &str = "Distance: ";
    const PREFIX_LEN: usize = PREFIX.len();
    const SPACE_LEN: usize = 3;
    const NL_LEN: usize = 1;
    const LINE_LEN: usize = PREFIX_LEN + 3 + SPACE_LEN + 4 + SPACE_LEN + 4 + SPACE_LEN + 4 + NL_LEN;
    const TOTAL_LEN: usize = LINE_LEN * 2;
    let s = s.as_bytes();
    let times = &s[PREFIX_LEN..=LINE_LEN];
    let times = [
        b_to_u16(times[1]) as u8 * 10 + b_to_u16(times[2]) as u8,
        b_to_u16(times[8]) as u8 * 10 + b_to_u16(times[9]) as u8,
        b_to_u16(times[15]) as u8 * 10 + b_to_u16(times[16]) as u8,
        b_to_u16(times[22]) as u8 * 10 + b_to_u16(times[23]) as u8,
    ];
    let distances = &s[(LINE_LEN + PREFIX_LEN)..TOTAL_LEN - NL_LEN];
    let distances = [
        b_to_u16(distances[0]) * 100 + b_to_u16(distances[1]) * 10 + b_to_u16(distances[2]),
        b_to_u16(distances[6]) * 1000 + b_to_u16(distances[7]) * 100 + b_to_u16(distances[8]) * 10 + b_to_u16(distances[9]),
        b_to_u16(distances[13]) * 1000 + b_to_u16(distances[14]) * 100 + b_to_u16(distances[15]) * 10 + b_to_u16(distances[16]),
        b_to_u16(distances[20]) * 1000 + b_to_u16(distances[21]) * 100 + b_to_u16(distances[22]) * 10 + b_to_u16(distances[23]),
    ];
    [
        times[0] as _,
        times[1] as _,
        times[2] as _,
        times[3] as _,
        distances[0] as _,
        distances[1] as _,
        distances[2] as _,
        distances[3] as _,
    ]
}
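
// A minimal sketch of a check for `parse_input` (not part of the original gist). The input
// below is hypothetical sample data laid out with the exact fixed column widths the constants
// above assume (two-digit times; one three-digit and three four-digit distances); real puzzle
// input may use different widths.
#[cfg(test)]
mod parse_input_tests {
    use super::parse_input;

    #[test]
    fn parses_fixed_width_layout() {
        let input = "Time:      35     69     68     87\n\
                     Distance: 213   1168   1086   1248\n";
        assert_eq!(parse_input(input), [35, 69, 68, 87, 213, 1168, 1086, 1248]);
    }
}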

// Converts a byte of ASCII `[0-9]` to the equivalent `u8`.
#[inline(always)]
const fn b_to_u8(b: u8) -> u8 {
    b - b'0'
}

// Converts a byte of ASCII `[0-9]` to the equivalent `u16`.
#[inline(always)]
const fn b_to_u16(b: u8) -> u16 {
    b_to_u8(b) as u16
}

// ===================
// === Execute GPU ===
// ===================
async fn execute_gpu(numbers: &[u32]) -> Option<u64> {
    // Instantiates instance of WebGPU.
    let instance = wgpu::Instance::default();
    // `request_adapter` instantiates the general connection to the GPU.
    let adapter = instance
        .request_adapter(&wgpu::RequestAdapterOptions::default())
        .await?;
    // `request_device` instantiates the feature-specific connection to the GPU, defining some
    // parameters, `features` being the available features.
    let (device, queue) = adapter
        .request_device(
            &wgpu::DeviceDescriptor {
                label: None,
                features: wgpu::Features::empty(),
                limits: wgpu::Limits::downlevel_defaults(),
            },
            None,
        )
        .await
        .unwrap();
    let wins = execute_gpu_inner(&device, &queue, numbers).await?;
    // Multiply the values together.
    Some(wins.iter().take(4).fold(1, |acc, n| acc * n) as _)
}

async fn execute_gpu_inner(
    device: &wgpu::Device,
    queue: &wgpu::Queue,
    numbers: &[u32],
) -> Option<Vec<u32>> {
    const SHADER: &str = r#"@group(0)
@binding(0)
var<storage, read_write> v_indices: array<u32>; // this is used as both input and output for convenience

// Let,
// a = acceleration
// v = velocity
// t_a = time for acceleration
// t_m = time for movement
// T = total time
// d = distance travelled
// D = race record distance + 1
//
// We have,
// d = (a * t_a) * t_m
//   = (a * t_a) * (T - t_a)
//   = a * t_a * T - a * t_a * t_a
//   = -a * t_a^2 + a * t_a * T
// T = t_a + t_m => t_m = T - t_a
//
// The roots of d(t_a) = D give us the values of t_a between which d will be greater than or
// equal to D.
//
// t_a = ( aT +- sqrt( (aT)^2 - 4aD ) ) / 2
//     = ( aT +- det ) / 2
//
// where det = sqrt( (aT)^2 - 4aD ), taking a = 1 as the code below does.
//
// => t1 = (aT - det) / 2
// and t2 = (aT + det) / 2
//
// The number of ways to win is the number of valid integer time values between the two roots =
// floor(t2) - ceil(t1) + 1
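//
// As a worked check (assuming the classic sample race with T = 7 and a record of 9, so the
// code below uses d = 9.5): det = sqrt(49 - 38) ~= 3.317, t1 = ceil(1.84) = 2,
// t2 = floor(5.16) = 5, giving n = 5 - 2 + 1 = 4 ways to win (holding for 2, 3, 4, or 5 ms
// all beat the record).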
fn simulate_race(t_base: u32, d_base: u32) -> u32 {
    //assert!(t <= 999);
    //assert!(d <= 9999);
    var t: f32 = f32(t_base);
    var d: f32 = f32(d_base) + 0.5;
    let at = t;
    // FIXME [NP]: use square?
    let det = sqrt(at * at - 4. * d);
    //assert!(det <= 50.);
    let t1 = ceil((at - det) / 2.);
    let t2 = floor((at + det) / 2.);
    let n = t2 - t1 + 1.;
    //assert!(n.is_sign_positive());
    return u32(n);
}

@compute
@workgroup_size(1)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    var y = global_id.x + 4u;
    v_indices[global_id.x] = simulate_race(v_indices[global_id.x], v_indices[y]);
}"#;

    // Loads the shader from WGSL.
    let cs_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: None,
        source: wgpu::ShaderSource::Wgsl(borrow::Cow::Borrowed(SHADER)),
    });
    // Gets the size in bytes of the buffer.
    let size = std::mem::size_of_val(numbers) as wgpu::BufferAddress;
    // Instantiates buffer without data.
    // `usage` of buffer specifies how it can be used:
    //   `BufferUsages::MAP_READ` allows it to be read (outside the shader).
    //   `BufferUsages::COPY_DST` allows it to be the destination of the copy.
    let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size,
        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    });
    // Instantiates buffer with data (`numbers`).
    // Usage allowing the buffer to be:
    //   A storage buffer (can be bound within a bind group and thus available to a shader).
    //   The destination of a copy.
    //   The source of a copy.
    let storage_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: Some("Storage Buffer"),
        contents: bytemuck::cast_slice(numbers),
        usage: wgpu::BufferUsages::STORAGE
            | wgpu::BufferUsages::COPY_DST
            | wgpu::BufferUsages::COPY_SRC,
    });
    // A bind group defines how buffers are accessed by shaders.
    // It is to WebGPU what a descriptor set is to Vulkan.
    // `binding` here refers to the `binding` of a buffer in the shader (`layout(set = 0, binding = 0) buffer`).
    // A pipeline specifies the operation of a shader.
    // Instantiates the pipeline.
    let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: None,
        layout: None,
        module: &cs_module,
        entry_point: "main",
    });
    // Instantiates the bind group, once again specifying the binding of buffers.
    let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
    let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: None,
        layout: &bind_group_layout,
        entries: &[wgpu::BindGroupEntry {
            binding: 0,
            resource: storage_buffer.as_entire_binding(),
        }],
    });
    // A command encoder executes one or many pipelines.
    // It is to WebGPU what a command buffer is to Vulkan.
    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    {
        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: None,
            timestamp_writes: None,
        });
        cpass.set_pipeline(&compute_pipeline);
        cpass.set_bind_group(0, &bind_group, &[]);
        cpass.insert_debug_marker("compute race iterations");
        // Number of cells to run, the (x, y, z) size of the item being processed.
        cpass.dispatch_workgroups(numbers.len() as u32 / 2, 1, 1);
    }
    // Adds a copy operation to the command encoder.
    // It will copy data from the storage buffer on the GPU to the staging buffer on the CPU.
    encoder.copy_buffer_to_buffer(&storage_buffer, 0, &staging_buffer, 0, size);
    // Submits command encoder for processing.
    queue.submit(Some(encoder.finish()));
    // Note that we're not calling `.await` here.
    let buffer_slice = staging_buffer.slice(..);
    // Sets the buffer up for mapping, sending the result of the mapping back to us when it is finished.
    let (sender, receiver) = flume::bounded(1);
    buffer_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
    // Poll the device in a blocking manner so that our future resolves.
    // In an actual application, `device.poll(...)` should
    // be called in an event loop or on another thread.
    device.poll(wgpu::Maintain::Wait);
    // Awaits until the mapped buffer can be read from.
    if let Ok(Ok(())) = receiver.recv_async().await {
        // Gets contents of buffer.
        let data = buffer_slice.get_mapped_range();
        // Since the contents come back as raw bytes, this converts those bytes back to u32.
        let result = bytemuck::cast_slice(&data).to_vec();
        // With the current interface, we have to make sure all mapped views are
        // dropped before we unmap the buffer.
        drop(data);
        staging_buffer.unmap(); // Unmaps buffer from memory
        // If you are familiar with C++ these 2 lines can be thought of similarly to:
        //   delete myPointer;
        //   myPointer = NULL;
        // It effectively frees the memory.
        // Returns data from buffer.
        Some(result)
    } else {
        panic!("failed to run compute on gpu!")
    }
}
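
// A minimal CPU sketch of the same closed-form win count, handy for sanity-checking the shader
// output on the host. `count_wins_cpu` is a hypothetical helper that is not part of the original
// gist; it assumes a = 1 and the same `+ 0.5` record adjustment that the shader uses.
#[allow(dead_code)]
fn count_wins_cpu(time: u32, record: u32) -> u32 {
    let t = time as f32;
    let d = record as f32 + 0.5;
    // Roots of t_a^2 - t * t_a + d = 0 bracket the hold times that beat the record.
    let det = (t * t - 4.0 * d).sqrt();
    let t1 = ((t - det) / 2.0).ceil();
    let t2 = ((t + det) / 2.0).floor();
    (t2 - t1 + 1.0) as u32
}
// For instance, count_wins_cpu(7, 9) == 4 for the classic sample race (holding 2..=5 ms wins).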