day_6_1_gpu.rs
use std::borrow;
use wgpu::util::DeviceExt as _;

// ===========
// === Run ===
// ===========
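/// Runs the day 6 part 1 solution on the GPU and returns the product of the per-race win counts.
///
/// A minimal usage sketch (an assumption, not part of the original gist); `input.txt` is a
/// hypothetical file holding the raw puzzle input in the fixed-width layout `parse_input`
/// expects:
///
/// ```ignore
/// let input = std::fs::read_to_string("input.txt").expect("failed to read input");
/// println!("{}", day_6_1_gpu::run(&input));
/// ```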
pub fn run(s: &str) -> u64 {
    pollster::block_on(async {
        let numbers = parse_input(s);
        execute_gpu(&numbers).await.unwrap()
    })
}

// ===================
// === Parse Input ===
// ===================
#[inline(always)]
pub fn parse_input(s: &str) -> [u32; 8] {
    const PREFIX: &str = "Distance: ";
    const PREFIX_LEN: usize = PREFIX.len();
    const SPACE_LEN: usize = 3;
    const NL_LEN: usize = 1;
    const LINE_LEN: usize = PREFIX_LEN + 3 + SPACE_LEN + 4 + SPACE_LEN + 4 + SPACE_LEN + 4 + NL_LEN;
    const TOTAL_LEN: usize = LINE_LEN * 2;
    let s = s.as_bytes();
    let times = &s[PREFIX_LEN..=LINE_LEN];
    let times = [
        b_to_u16(times[1]) as u8 * 10 + b_to_u16(times[2]) as u8,
        b_to_u16(times[8]) as u8 * 10 + b_to_u16(times[9]) as u8,
        b_to_u16(times[15]) as u8 * 10 + b_to_u16(times[16]) as u8,
        b_to_u16(times[22]) as u8 * 10 + b_to_u16(times[23]) as u8,
    ];
    let distances = &s[(LINE_LEN + PREFIX_LEN)..TOTAL_LEN - NL_LEN];
    let distances = [
        b_to_u16(distances[0]) * 100 + b_to_u16(distances[1]) * 10 + b_to_u16(distances[2]),
        b_to_u16(distances[6]) * 1000 + b_to_u16(distances[7]) * 100 + b_to_u16(distances[8]) * 10 + b_to_u16(distances[9]),
        b_to_u16(distances[13]) * 1000 + b_to_u16(distances[14]) * 100 + b_to_u16(distances[15]) * 10 + b_to_u16(distances[16]),
        b_to_u16(distances[20]) * 1000 + b_to_u16(distances[21]) * 100 + b_to_u16(distances[22]) * 10 + b_to_u16(distances[23]),
    ];
    [
        times[0] as _,
        times[1] as _,
        times[2] as _,
        times[3] as _,
        distances[0] as _,
        distances[1] as _,
        distances[2] as _,
        distances[3] as _,
    ]
}
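
// A minimal sketch of a check for `parse_input` (not part of the original gist). The input
// below is hypothetical sample data laid out with the exact fixed column widths the constants
// above assume (two-digit times; one three-digit and three four-digit distances); real puzzle
// input may use different widths.
#[cfg(test)]
mod parse_input_tests {
    use super::parse_input;

    #[test]
    fn parses_fixed_width_layout() {
        let input = "Time:      35     69     68     87\n\
                     Distance: 213   1168   1086   1248\n";
        assert_eq!(parse_input(input), [35, 69, 68, 87, 213, 1168, 1086, 1248]);
    }
}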

// Converts a byte of ASCII `[0-9]` to the equivalent `u8`.
#[inline(always)]
const fn b_to_u8(b: u8) -> u8 {
    b - b'0'
}

// Converts a byte of ASCII `[0-9]` to the equivalent `u16`.
#[inline(always)]
const fn b_to_u16(b: u8) -> u16 {
    b_to_u8(b) as u16
}

// ===================
// === Execute GPU ===
// ===================
async fn execute_gpu(numbers: &[u32]) -> Option<u64> {
    // Instantiates instance of WebGPU.
    let instance = wgpu::Instance::default();
    // `request_adapter` instantiates the general connection to the GPU.
    let adapter = instance
        .request_adapter(&wgpu::RequestAdapterOptions::default())
        .await?;
    // `request_device` instantiates the feature-specific connection to the GPU, defining some
    // parameters, `features` being the available features.
    let (device, queue) = adapter
        .request_device(
            &wgpu::DeviceDescriptor {
                label: None,
                features: wgpu::Features::empty(),
                limits: wgpu::Limits::downlevel_defaults(),
            },
            None,
        )
        .await
        .unwrap();
    let wins = execute_gpu_inner(&device, &queue, numbers).await?;
    // Multiply the values together.
    Some(wins.iter().take(4).fold(1, |acc, n| acc * n) as _)
}

async fn execute_gpu_inner(
    device: &wgpu::Device,
    queue: &wgpu::Queue,
    numbers: &[u32],
) -> Option<Vec<u32>> {
    const SHADER: &str = r#"@group(0)
@binding(0)
var<storage, read_write> v_indices: array<u32>; // this is used as both input and output for convenience

// Let,
// a = acceleration
// v = velocity
// t_a = time for acceleration
// t_m = time for movement
// T = total time
// d = distance travelled
// D = race record distance + 1
//
// We have,
// d = (a * t_a) * t_m
//   = (a * t_a) * (T - t_a)
//   = a * t_a * T - a * t_a * t_a
//   = -a * t_a^2 + a * t_a * T
// T = t_a + t_m => t_m = T - t_a
//
// The roots of d(t_a) = D give us the values of t_a between which d will be greater than or
// equal to D.
//
// t_a = ( aT +- sqrt( (aT)^2 - 4aD ) ) / 2
//     = ( aT +- det ) / 2
//
// where det = sqrt( (aT)^2 - 4aD ), taking a = 1 as the code below does.
//
// => t1 = (aT - det) / 2
// and t2 = (aT + det) / 2
//
// The number of ways to win is the number of valid integer time values between the two roots =
// floor(t2) - ceil(t1) + 1
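//
// As a worked check (assuming the classic sample race with T = 7 and a record of 9, so the
// code below uses d = 9.5): det = sqrt(49 - 38) ~= 3.317, t1 = ceil(1.84) = 2,
// t2 = floor(5.16) = 5, giving n = 5 - 2 + 1 = 4 ways to win (holding for 2, 3, 4, or 5 ms
// all beat the record).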
fn simulate_race(t_base: u32, d_base: u32) -> u32 {
    //assert!(t <= 999);
    //assert!(d <= 9999);
    var t: f32 = f32(t_base);
    var d: f32 = f32(d_base) + 0.5;
    let at = t;
    // FIXME [NP]: use square?
    let det = sqrt(at * at - 4. * d);
    //assert!(det <= 50.);
    let t1 = ceil((at - det) / 2.);
    let t2 = floor((at + det) / 2.);
    let n = t2 - t1 + 1.;
    //assert!(n.is_sign_positive());
    return u32(n);
}

@compute
@workgroup_size(1)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    var y = global_id.x + 4u;
    v_indices[global_id.x] = simulate_race(v_indices[global_id.x], v_indices[y]);
}"#;

    // Loads the shader from WGSL.
    let cs_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: None,
        source: wgpu::ShaderSource::Wgsl(borrow::Cow::Borrowed(SHADER)),
    });
    // Gets the size in bytes of the buffer.
    let size = std::mem::size_of_val(numbers) as wgpu::BufferAddress;
    // Instantiates buffer without data.
    // `usage` of buffer specifies how it can be used:
    //   `BufferUsages::MAP_READ` allows it to be read (outside the shader).
    //   `BufferUsages::COPY_DST` allows it to be the destination of the copy.
    let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size,
        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    });
    // Instantiates buffer with data (`numbers`).
    // Usage allowing the buffer to be:
    //   A storage buffer (can be bound within a bind group and thus available to a shader).
    //   The destination of a copy.
    //   The source of a copy.
    let storage_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
        label: Some("Storage Buffer"),
        contents: bytemuck::cast_slice(numbers),
        usage: wgpu::BufferUsages::STORAGE
            | wgpu::BufferUsages::COPY_DST
            | wgpu::BufferUsages::COPY_SRC,
    });
    // A bind group defines how buffers are accessed by shaders.
    // It is to WebGPU what a descriptor set is to Vulkan.
    // `binding` here refers to the `binding` of a buffer in the shader (`layout(set = 0, binding = 0) buffer`).
    // A pipeline specifies the operation of a shader.
    // Instantiates the pipeline.
    let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: None,
        layout: None,
        module: &cs_module,
        entry_point: "main",
    });
    // Instantiates the bind group, once again specifying the binding of buffers.
    let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
    let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: None,
        layout: &bind_group_layout,
        entries: &[wgpu::BindGroupEntry {
            binding: 0,
            resource: storage_buffer.as_entire_binding(),
        }],
    });
    // A command encoder executes one or many pipelines.
    // It is to WebGPU what a command buffer is to Vulkan.
    let mut encoder =
        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
    {
        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: None,
            timestamp_writes: None,
        });
        cpass.set_pipeline(&compute_pipeline);
        cpass.set_bind_group(0, &bind_group, &[]);
        cpass.insert_debug_marker("compute race iterations");
        // Number of cells to run, the (x, y, z) size of the item being processed.
        cpass.dispatch_workgroups(numbers.len() as u32 / 2, 1, 1);
    }
    // Adds a copy operation to the command encoder.
    // It will copy data from the storage buffer on the GPU to the staging buffer on the CPU.
    encoder.copy_buffer_to_buffer(&storage_buffer, 0, &staging_buffer, 0, size);
    // Submits command encoder for processing.
    queue.submit(Some(encoder.finish()));
    // Note that we're not calling `.await` here.
    let buffer_slice = staging_buffer.slice(..);
    // Sets the buffer up for mapping, sending the result of the mapping back to us when it is finished.
    let (sender, receiver) = flume::bounded(1);
    buffer_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
    // Poll the device in a blocking manner so that our future resolves.
    // In an actual application, `device.poll(...)` should
    // be called in an event loop or on another thread.
    device.poll(wgpu::Maintain::Wait);
    // Awaits until the mapped buffer can be read from.
    if let Ok(Ok(())) = receiver.recv_async().await {
        // Gets contents of buffer.
        let data = buffer_slice.get_mapped_range();
        // Since the contents come back as raw bytes, this converts those bytes back to u32.
        let result = bytemuck::cast_slice(&data).to_vec();
        // With the current interface, we have to make sure all mapped views are
        // dropped before we unmap the buffer.
        drop(data);
        staging_buffer.unmap(); // Unmaps buffer from memory
        // If you are familiar with C++ these 2 lines can be thought of similarly to:
        //   delete myPointer;
        //   myPointer = NULL;
        // It effectively frees the memory.
        // Returns data from buffer.
        Some(result)
    } else {
        panic!("failed to run compute on gpu!")
    }
}
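
// A minimal CPU sketch of the same closed-form win count, handy for sanity-checking the shader
// output on the host. `count_wins_cpu` is a hypothetical helper that is not part of the original
// gist; it assumes a = 1 and the same `+ 0.5` record adjustment that the shader uses.
#[allow(dead_code)]
fn count_wins_cpu(time: u32, record: u32) -> u32 {
    let t = time as f32;
    let d = record as f32 + 0.5;
    // Roots of t_a^2 - t * t_a + d = 0 bracket the hold times that beat the record.
    let det = (t * t - 4.0 * d).sqrt();
    let t1 = ((t - det) / 2.0).ceil();
    let t2 = ((t + det) / 2.0).floor();
    (t2 - t1 + 1.0) as u32
}
// For instance, count_wins_cpu(7, 9) == 4 for the classic sample race (holding 2..=5 ms wins).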