```rust
use rand::{Rng, SeedableRng, rngs::StdRng};
use rayon::prelude::*;
use std::{ops::Mul, sync::Arc, time::Instant};
use vulkano::{
    VulkanLibrary,
    buffer::{Buffer, BufferCreateInfo, BufferUsage},
    command_buffer::{
        AutoCommandBufferBuilder, CommandBufferUsage, CopyBufferInfo, PrimaryCommandBufferAbstract,
        allocator::StandardCommandBufferAllocator,
    },
    descriptor_set::{
        DescriptorSet, WriteDescriptorSet, allocator::StandardDescriptorSetAllocator,
    },
    device::{
        Device, DeviceCreateInfo, DeviceExtensions, QueueCreateInfo, QueueFlags,
        physical::PhysicalDeviceType,
    },
    instance::{Instance, InstanceCreateFlags, InstanceCreateInfo},
    memory::allocator::{AllocationCreateInfo, MemoryTypeFilter, StandardMemoryAllocator},
    pipeline::{
        ComputePipeline, Pipeline, PipelineBindPoint, PipelineLayout,
        PipelineShaderStageCreateInfo, compute::ComputePipelineCreateInfo,
        layout::PipelineDescriptorSetLayoutCreateInfo,
    },
    sync::{self, GpuFuture},
};

const MATRIX_A_NUM_ROWS: u32 = 1774;
const MATRIX_A_NUM_COLS: u32 = 1u32 << 11;
const MATRIX_B_NUM_ROWS: u32 = 1u32 << 11;
const MATRIX_B_NUM_COLS: u32 = 1u32 << 10;

#[derive(PartialEq, Debug)]
struct Matrix {
    rows: u32,
    cols: u32,
    elems: Vec<u32>,
}

impl Mul for Matrix {
    type Output = Matrix;

    fn mul(self, rhs: Self) -> Self::Output {
        assert_eq!(self.cols, rhs.rows);
        let mut elems = vec![0u32; (self.rows * rhs.cols) as usize];

        for i in 0..self.rows {
            for j in 0..rhs.cols {
                let mut sum = 0u32;
                for k in 0..self.cols {
                    sum = sum.wrapping_add(
                        self.elems[(i * self.cols + k) as usize]
                            .wrapping_mul(rhs.elems[(k * rhs.cols + j) as usize]),
                    );
                }
                elems[(i * rhs.cols + j) as usize] = sum;
            }
        }

        Matrix {
            rows: self.rows,
            cols: rhs.cols,
            elems,
        }
    }
}

impl Matrix {
    fn random(seed: u64, rows: u32, cols: u32) -> Matrix {
        let mut rng = StdRng::seed_from_u64(seed);
        Matrix {
            rows,
            cols,
            elems: (0..rows * cols).map(|_| rng.r#gen::<u32>()).collect(),
        }
    }

    fn par_mul(&self, rhs: &Matrix) -> Matrix {
        assert_eq!(self.cols, rhs.rows);
        let mut elems = vec![0u32; (self.rows * rhs.cols) as usize];

        elems.par_iter_mut().enumerate().for_each(|(lin_idx, v)| {
            let r_idx = lin_idx / rhs.cols as usize;
            let c_idx = lin_idx - r_idx * rhs.cols as usize;

            *v = (0..self.cols as usize).fold(0u32, |acc, k| {
                acc.wrapping_add(
                    self.elems[r_idx * self.cols as usize + k]
                        .wrapping_mul(rhs.elems[k * rhs.cols as usize + c_idx]),
                )
            });
        });

        Matrix {
            rows: self.rows,
            cols: rhs.cols,
            elems,
        }
    }

    fn to_bytes(&self) -> Vec<u8> {
        let offset0 = 0;
        let offset1 = offset0 + std::mem::size_of_val(&self.rows);
        let offset2 = offset1 + std::mem::size_of_val(&self.cols);
        let encoded_elems_byte_len = std::mem::size_of::<u32>() * (self.rows * self.cols) as usize;
        let encoded_mat_byte_len = offset2 + encoded_elems_byte_len;

        let elems_as_bytes = unsafe {
            let ptr_elems = self.elems.as_ptr();
            let ptr_elem_bytes: *const u8 = ptr_elems.cast();

            core::slice::from_raw_parts(ptr_elem_bytes, encoded_elems_byte_len)
        };

        let mut bytes = vec![0u8; encoded_mat_byte_len];

        bytes[offset0..offset1].copy_from_slice(&self.rows.to_le_bytes());
        bytes[offset1..offset2].copy_from_slice(&self.cols.to_le_bytes());
        bytes[offset2..].copy_from_slice(elems_as_bytes);

        bytes
    }

    fn from_bytes(bytes: &[u8]) -> Matrix {
        let offset0 = 0;
        let offset1 = offset0 + std::mem::size_of::<u32>();
        let offset2 = offset1 + std::mem::size_of::<u32>();

        let rows = u32::from_le_bytes(bytes[offset0..offset1].try_into().unwrap());
        let cols = u32::from_le_bytes(bytes[offset1..offset2].try_into().unwrap());
        let num_elems = (rows * cols) as usize;

        let encoded_elems_byte_len = std::mem::size_of::<u32>() * num_elems;
        let remaining_num_bytes = bytes.len() - offset2;

        assert_eq!(encoded_elems_byte_len, remaining_num_bytes);

        let elems = unsafe {
            let ptr_elems_bytes = bytes[offset2..].as_ptr();
            let ptr_elems: *const u32 = ptr_elems_bytes.cast();

            core::slice::from_raw_parts(ptr_elems, num_elems)
        }
        .to_vec();

        Matrix { rows, cols, elems }
    }
}

fn main() {
    let library = VulkanLibrary::new().unwrap();
    let instance = Instance::new(
        library,
        InstanceCreateInfo {
            flags: InstanceCreateFlags::ENUMERATE_PORTABILITY,
            ..Default::default()
        },
    )
    .unwrap();

    let device_extensions = DeviceExtensions {
        khr_storage_buffer_storage_class: true,
        ..DeviceExtensions::empty()
    };
    let (physical_device, queue_family_index) = instance
        .enumerate_physical_devices()
        .unwrap()
        .filter(|p| p.supported_extensions().contains(&device_extensions))
        .filter_map(|p| {
            p.queue_family_properties()
                .iter()
                .position(|q| {
                    q.queue_flags
                        .intersects(QueueFlags::COMPUTE | QueueFlags::TRANSFER)
                })
                .map(|i| (p, i as u32))
        })
        .min_by_key(|(p, _)| match p.properties().device_type {
            PhysicalDeviceType::DiscreteGpu => 0,
            PhysicalDeviceType::IntegratedGpu => 1,
            PhysicalDeviceType::VirtualGpu => 2,
            PhysicalDeviceType::Cpu => 3,
            PhysicalDeviceType::Other => 4,
            _ => 5,
        })
        .unwrap();

    println!(
        "Using device: {} (type: {:?}). API version: {}",
        physical_device.properties().device_name,
        physical_device.properties().device_type,
        physical_device.api_version(),
    );

    let (device, queue) = {
        let (device, mut queues) = Device::new(
            physical_device,
            DeviceCreateInfo {
                enabled_extensions: device_extensions,
                queue_create_infos: vec![QueueCreateInfo {
                    queue_family_index,
                    ..Default::default()
                }],
                ..Default::default()
            },
        )
        .unwrap();

        (device, queues.next().unwrap())
    };

    let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device.clone()));

    let matrix_a = Matrix::random(13, MATRIX_A_NUM_ROWS, MATRIX_A_NUM_COLS);
    let matrix_b = Matrix::random(17, MATRIX_B_NUM_ROWS, MATRIX_B_NUM_COLS);

    println!("Matrix A = ({} X {})", matrix_a.rows, matrix_a.cols);
    println!("Matrix B = ({} X {})", matrix_b.rows, matrix_b.cols);
    println!("Matrix C = A X B = ({} X {})", matrix_a.rows, matrix_b.cols);

    let matrix_a_as_bytes = matrix_a.to_bytes();
    let matrix_b_as_bytes = matrix_b.to_bytes();
    let matrix_a_byte_len = matrix_a_as_bytes.len() as u64;
    let matrix_b_byte_len = matrix_b_as_bytes.len() as u64;
    let matrix_c_byte_len = (std::mem::size_of_val(&matrix_a.rows)
        + std::mem::size_of_val(&matrix_b.cols)
        + std::mem::size_of::<u32>() * (matrix_a.rows * matrix_b.cols) as usize)
        as u64;

    let matrix_a_host_local = Buffer::from_iter(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::TRANSFER_SRC,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::HOST_SEQUENTIAL_WRITE
                | MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_a_as_bytes,
    )
    .unwrap();

    let matrix_b_host_local = Buffer::from_iter(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::TRANSFER_SRC,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::HOST_SEQUENTIAL_WRITE
                | MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_b_as_bytes,
    )
    .unwrap();

    let matrix_a_device_local = Buffer::new_slice::<u8>(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::STORAGE_BUFFER | BufferUsage::TRANSFER_DST,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_a_byte_len,
    )
    .unwrap();

    let matrix_b_device_local = Buffer::new_slice::<u8>(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::STORAGE_BUFFER | BufferUsage::TRANSFER_DST,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_b_byte_len,
    )
    .unwrap();

    let matrix_c_buf = Buffer::new_slice::<u8>(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::STORAGE_BUFFER,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::HOST_RANDOM_ACCESS
                | MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_c_byte_len,
    )
    .unwrap();

    let command_buffer_allocator = Arc::new(StandardCommandBufferAllocator::new(
        device.clone(),
        Default::default(),
    ));

    // Transfer data to the GPU
    let transfer_command_buffer = {
        let mut builder = AutoCommandBufferBuilder::primary(
            command_buffer_allocator.clone(),
            queue.queue_family_index(),
            CommandBufferUsage::OneTimeSubmit,
        )
        .unwrap();

        builder
            .copy_buffer(CopyBufferInfo::buffers(
                matrix_a_host_local,
                matrix_a_device_local.clone(),
            ))
            .unwrap()
            .copy_buffer(CopyBufferInfo::buffers(
                matrix_b_host_local,
                matrix_b_device_local.clone(),
            ))
            .unwrap();

        builder.build().unwrap()
    };

    transfer_command_buffer
        .execute(queue.clone())
        .unwrap()
        .then_signal_fence_and_flush()
        .unwrap()
        .wait(None)
        .unwrap();

    let pipeline = {
        let compute_shader = cs::load(device.clone()).unwrap();
        let compute_shader_entry_point = compute_shader.entry_point("main").unwrap();
        let compute_stage = PipelineShaderStageCreateInfo::new(compute_shader_entry_point);

        let layout = PipelineLayout::new(
            device.clone(),
            PipelineDescriptorSetLayoutCreateInfo::from_stages([&compute_stage])
                .into_pipeline_layout_create_info(device.clone())
                .unwrap(),
        )
        .unwrap();

        ComputePipeline::new(
            device.clone(),
            None,
            ComputePipelineCreateInfo::stage_layout(compute_stage, layout.clone()),
        )
        .unwrap()
    };

    let descriptor_set_allocator = Arc::new(StandardDescriptorSetAllocator::new(
        device.clone(),
        Default::default(),
    ));
    let descriptor_set_layout = pipeline.layout().set_layouts()[0].clone();
    let descriptor_set = DescriptorSet::new(
        descriptor_set_allocator,
        descriptor_set_layout,
        [
            WriteDescriptorSet::buffer(0, matrix_a_device_local.clone()),
            WriteDescriptorSet::buffer(1, matrix_b_device_local.clone()),
            WriteDescriptorSet::buffer(2, matrix_c_buf.clone()),
        ],
        [],
    )
    .unwrap();

    let command_buffer = {
        let mut command_buffer_builder = AutoCommandBufferBuilder::primary(
            command_buffer_allocator,
            queue.queue_family_index(),
            CommandBufferUsage::MultipleSubmit,
        )
        .unwrap();

        command_buffer_builder
            .bind_pipeline_compute(pipeline.clone())
            .unwrap()
            .bind_descriptor_sets(
                PipelineBindPoint::Compute,
                pipeline.layout().clone(),
                0,
                descriptor_set,
            )
            .unwrap();
        unsafe {
            command_buffer_builder
                .dispatch([
                    MATRIX_A_NUM_ROWS.div_ceil(8),
                    MATRIX_B_NUM_COLS.div_ceil(8),
                    1,
                ])
                .unwrap();
        }

        command_buffer_builder.build().unwrap()
    };

    // Computing Matrix Multiplication on GPU
    let start = Instant::now();
    sync::now(device.clone())
        .then_execute(queue.clone(), command_buffer.clone())
        .unwrap()
        .then_signal_fence_and_flush()
        .unwrap()
        .wait(None)
        .unwrap();
    let gpu_tm = start.elapsed();
    println!("(a) GPU matrix multiply: {:?}", gpu_tm);

    // reading GPU-computed matrix multiplication result
    let gpu_result_as_bytes = matrix_c_buf.read().unwrap();
    let gpu_result = Matrix::from_bytes(&gpu_result_as_bytes);

    // Computing Parallel Matrix Multiplication on CPU, using Rayon
    let start = Instant::now();
    let cpu_result_rayon = matrix_a.par_mul(&matrix_b);
    let cpu_tm_rayon = start.elapsed();
    println!("(b) CPU parallel matrix multiply: {:?}", cpu_tm_rayon);

    let start = Instant::now();
    let cpu_result_single_threaded = matrix_a * matrix_b;
    let cpu_tm_single_threaded = start.elapsed();
    println!(
        "(c) CPU single-threaded matrix multiply: {:?}",
        cpu_tm_single_threaded
    );

    assert_eq!(gpu_result, cpu_result_single_threaded);
    assert_eq!(cpu_result_single_threaded, cpu_result_rayon);

    println!(
        "Speed Up (b/a): {}x\nSpeed Up (c/a): {}x\nSpeed Up (c/b): {}x",
        cpu_tm_rayon.as_nanos() / gpu_tm.as_nanos(),
        cpu_tm_single_threaded.as_nanos() / gpu_tm.as_nanos(),
        cpu_tm_single_threaded.as_nanos() / cpu_tm_rayon.as_nanos(),
    );
}

mod cs {
    // Compiles shader
    vulkano_shaders::shader! {
        ty: "compute",
        path: "./matrix_multiply.glsl",
        vulkan_version: "1.2",
    }
}
```
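
The compute shader in `./matrix_multiply.glsl` is not reproduced above. For reference, here is a minimal sketch of a shader that would be compatible with this host code; it assumes an 8×8 local workgroup (which is what the `div_ceil(8)` dispatch implies) and the `rows`/`cols`/`elems` storage-buffer layout produced by `to_bytes`, and is not necessarily the author's actual shader:

```glsl
#version 450

// Hypothetical sketch: one invocation per element of C = A * B,
// with an 8x8 local workgroup to match the dispatch dimensions above.
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

// Each buffer mirrors Matrix::to_bytes(): two u32 header fields, then the elements.
layout(std430, set = 0, binding = 0) readonly buffer MatrixA {
    uint rows;
    uint cols;
    uint elems[];
} a;

layout(std430, set = 0, binding = 1) readonly buffer MatrixB {
    uint rows;
    uint cols;
    uint elems[];
} b;

layout(std430, set = 0, binding = 2) writeonly buffer MatrixC {
    uint rows;
    uint cols;
    uint elems[];
} c;

void main() {
    uint i = gl_GlobalInvocationID.x; // row index into C
    uint j = gl_GlobalInvocationID.y; // column index into C

    // The dispatch rounds up to multiples of 8, so discard out-of-range invocations.
    if (i >= a.rows || j >= b.cols) {
        return;
    }

    // One invocation writes the header so Matrix::from_bytes() can recover the dimensions.
    if (i == 0u && j == 0u) {
        c.rows = a.rows;
        c.cols = b.cols;
    }

    // Unsigned arithmetic in GLSL wraps, matching wrapping_add/wrapping_mul on the CPU side.
    uint sum = 0u;
    for (uint k = 0u; k < a.cols; k++) {
        sum += a.elems[i * a.cols + k] * b.elems[k * b.cols + j];
    }
    c.elems[i * b.cols + j] = sum;
}
```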
I don't think so. Previously, $A_{1024 \times 1024} \times B_{1024 \times 1024}$ was being benchmarked. The matrix dimensions have since been changed (A is now $1774 \times 2048$ and B is $2048 \times 1024$) to make sure the code does not rely on the operand matrices being square.