// GPU vs. CPU matrix multiplication benchmark: multiply two pseudo-random
// u32 matrices with a vulkano compute shader, with rayon across CPU cores,
// and single-threaded, then compare results and timings.
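//
// Assumed dependencies (not part of the original listing; versions are
// inferred from the APIs used below -- e.g. `DescriptorSet::new` and the
// unsafe `dispatch` match vulkano 0.35 -- so adjust to your setup):
//
// [dependencies]
// rand = "0.8"
// rayon = "1"
// vulkano = "0.35"
// vulkano-shaders = "0.35"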

use rand::{Rng, SeedableRng, rngs::StdRng};
use rayon::prelude::*;
use std::{ops::Mul, sync::Arc, time::Instant};
use vulkano::{
    VulkanLibrary,
    buffer::{Buffer, BufferCreateInfo, BufferUsage},
    command_buffer::{
        AutoCommandBufferBuilder, CommandBufferUsage, CopyBufferInfo, PrimaryCommandBufferAbstract,
        allocator::StandardCommandBufferAllocator,
    },
    descriptor_set::{
        DescriptorSet, WriteDescriptorSet, allocator::StandardDescriptorSetAllocator,
    },
    device::{
        Device, DeviceCreateInfo, DeviceExtensions, QueueCreateInfo, QueueFlags,
        physical::PhysicalDeviceType,
    },
    instance::{Instance, InstanceCreateFlags, InstanceCreateInfo},
    memory::allocator::{AllocationCreateInfo, MemoryTypeFilter, StandardMemoryAllocator},
    pipeline::{
        ComputePipeline, Pipeline, PipelineBindPoint, PipelineLayout,
        PipelineShaderStageCreateInfo, compute::ComputePipelineCreateInfo,
        layout::PipelineDescriptorSetLayoutCreateInfo,
    },
    sync::{self, GpuFuture},
};

// A is 1774 x 2048 and B is 2048 x 1024, so C = A * B is 1774 x 1024.
const MATRIX_A_NUM_ROWS: u32 = 1774;
const MATRIX_A_NUM_COLS: u32 = 1u32 << 11;
const MATRIX_B_NUM_ROWS: u32 = 1u32 << 11;
const MATRIX_B_NUM_COLS: u32 = 1u32 << 10;

/// A dense, row-major matrix of u32 values.
#[derive(PartialEq, Debug)]
struct Matrix {
    rows: u32,
    cols: u32,
    elems: Vec<u32>,
}

// Single-threaded reference multiplication. Arithmetic wraps on overflow so
// the result matches the GPU's modular u32 arithmetic.
impl Mul for Matrix {
    type Output = Matrix;

    fn mul(self, rhs: Self) -> Self::Output {
        assert_eq!(self.cols, rhs.rows);
        let mut elems = vec![0u32; (self.rows * rhs.cols) as usize];

        for i in 0..self.rows {
            for j in 0..rhs.cols {
                let mut sum = 0u32;
                for k in 0..self.cols {
                    sum = sum.wrapping_add(
                        self.elems[(i * self.cols + k) as usize]
                            .wrapping_mul(rhs.elems[(k * rhs.cols + j) as usize]),
                    );
                }
                elems[(i * rhs.cols + j) as usize] = sum;
            }
        }

        Matrix {
            rows: self.rows,
            cols: rhs.cols,
            elems,
        }
    }
}

impl Matrix {
    /// Builds a matrix of pseudo-random u32s from a fixed seed, so runs are
    /// reproducible. (`gen` is a reserved keyword in Rust 2024, hence the
    /// raw identifier.)
    fn random(seed: u64, rows: u32, cols: u32) -> Matrix {
        let mut rng = StdRng::seed_from_u64(seed);
        Matrix {
            rows,
            cols,
            elems: (0..rows * cols).map(|_| rng.r#gen::<u32>()).collect(),
        }
    }

    /// Rayon-parallel multiplication: each output element is computed
    /// independently on the global thread pool.
    fn par_mul(&self, rhs: &Matrix) -> Matrix {
        assert_eq!(self.cols, rhs.rows);
        let mut elems = vec![0u32; (self.rows * rhs.cols) as usize];

        elems.par_iter_mut().enumerate().for_each(|(lin_idx, v)| {
            let r_idx = lin_idx / rhs.cols as usize;
            let c_idx = lin_idx - r_idx * rhs.cols as usize;

            *v = (0..self.cols as usize).fold(0u32, |acc, k| {
                acc.wrapping_add(
                    self.elems[r_idx * self.cols as usize + k]
                        .wrapping_mul(rhs.elems[k * rhs.cols as usize + c_idx]),
                )
            });
        });

        Matrix {
            rows: self.rows,
            cols: rhs.cols,
            elems,
        }
    }

    /// Serializes as [rows: u32 LE][cols: u32 LE][elems: u32s], the layout
    /// the compute shader reads from its storage buffers. Note the header
    /// fields are explicitly little-endian while the element bytes use the
    /// host's native order; the two only agree on little-endian hosts.
    fn to_bytes(&self) -> Vec<u8> {
        let offset0 = 0;
        let offset1 = offset0 + std::mem::size_of_val(&self.rows);
        let offset2 = offset1 + std::mem::size_of_val(&self.cols);
        let encoded_elems_byte_len = std::mem::size_of::<u32>() * (self.rows * self.cols) as usize;
        let encoded_mat_byte_len = offset2 + encoded_elems_byte_len;

        // Viewing `&[u32]` as `&[u8]` is sound: u32 has no padding and every
        // byte pattern is valid.
        let elems_as_bytes = unsafe {
            let ptr_elems = self.elems.as_ptr();
            let ptr_elem_bytes: *const u8 = ptr_elems.cast();

            core::slice::from_raw_parts(ptr_elem_bytes, encoded_elems_byte_len)
        };

        let mut bytes = vec![0u8; encoded_mat_byte_len];

        bytes[offset0..offset1].copy_from_slice(&self.rows.to_le_bytes());
        bytes[offset1..offset2].copy_from_slice(&self.cols.to_le_bytes());
        bytes[offset2..].copy_from_slice(elems_as_bytes);

        bytes
    }

    /// Deserializes a `Matrix` from the layout produced by `to_bytes`.
    fn from_bytes(bytes: &[u8]) -> Matrix {
        let offset0 = 0;
        let offset1 = offset0 + std::mem::size_of::<u32>();
        let offset2 = offset1 + std::mem::size_of::<u32>();

        let rows = u32::from_le_bytes(bytes[offset0..offset1].try_into().unwrap());
        let cols = u32::from_le_bytes(bytes[offset1..offset2].try_into().unwrap());
        let num_elems = (rows * cols) as usize;

        let encoded_elems_byte_len = std::mem::size_of::<u32>() * num_elems;
        let remaining_num_bytes = bytes.len() - offset2;

        assert_eq!(encoded_elems_byte_len, remaining_num_bytes);

        // Decode element by element rather than pointer-casting the byte
        // slice: reinterpreting `&[u8]` as `&[u32]` is undefined behavior
        // when the slice start is not 4-byte aligned.
        let elems = bytes[offset2..]
            .chunks_exact(std::mem::size_of::<u32>())
            .map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap()))
            .collect();

        Matrix { rows, cols, elems }
    }
}
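
// A quick round-trip check for the (de)serialization pair above -- a minimal
// sketch that is not part of the original listing:
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matrix_byte_round_trip() {
        let m = Matrix::random(42, 7, 5);
        assert_eq!(Matrix::from_bytes(&m.to_bytes()), m);
    }
}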

fn main() {
    let library = VulkanLibrary::new().unwrap();
    let instance = Instance::new(
        library,
        InstanceCreateInfo {
            // Also enumerate portability-subset implementations, e.g.
            // MoltenVK on macOS.
            flags: InstanceCreateFlags::ENUMERATE_PORTABILITY,
            ..Default::default()
        },
    )
    .unwrap();

    let device_extensions = DeviceExtensions {
        khr_storage_buffer_storage_class: true,
        ..DeviceExtensions::empty()
    };

    // Pick the "most discrete" device that has a compute-capable queue
    // family. Requiring COMPUTE alone covers both uses below, since compute
    // queues implicitly support transfer; the original OR-of-flags test
    // could select a transfer-only queue that cannot dispatch.
    let (physical_device, queue_family_index) = instance
        .enumerate_physical_devices()
        .unwrap()
        .filter(|p| p.supported_extensions().contains(&device_extensions))
        .filter_map(|p| {
            p.queue_family_properties()
                .iter()
                .position(|q| q.queue_flags.contains(QueueFlags::COMPUTE))
                .map(|i| (p, i as u32))
        })
        .min_by_key(|(p, _)| match p.properties().device_type {
            PhysicalDeviceType::DiscreteGpu => 0,
            PhysicalDeviceType::IntegratedGpu => 1,
            PhysicalDeviceType::VirtualGpu => 2,
            PhysicalDeviceType::Cpu => 3,
            PhysicalDeviceType::Other => 4,
            _ => 5,
        })
        .unwrap();

    println!(
        "Using device: {} (type: {:?}). API version: {}",
        physical_device.properties().device_name,
        physical_device.properties().device_type,
        physical_device.api_version(),
    );

    let (device, queue) = {
        let (device, mut queues) = Device::new(
            physical_device,
            DeviceCreateInfo {
                enabled_extensions: device_extensions,
                queue_create_infos: vec![QueueCreateInfo {
                    queue_family_index,
                    ..Default::default()
                }],
                ..Default::default()
            },
        )
        .unwrap();

        // A single queue from the chosen family is enough here.
        (device, queues.next().unwrap())
    };

    let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device.clone()));

    let matrix_a = Matrix::random(13, MATRIX_A_NUM_ROWS, MATRIX_A_NUM_COLS);
    let matrix_b = Matrix::random(17, MATRIX_B_NUM_ROWS, MATRIX_B_NUM_COLS);

    println!("Matrix A = ({} X {})", matrix_a.rows, matrix_a.cols);
    println!("Matrix B = ({} X {})", matrix_b.rows, matrix_b.cols);
    println!("Matrix C = A X B = ({} X {})", matrix_a.rows, matrix_b.cols);

    let matrix_a_as_bytes = matrix_a.to_bytes();
    let matrix_b_as_bytes = matrix_b.to_bytes();
    let matrix_a_byte_len = matrix_a_as_bytes.len() as u64;
    let matrix_b_byte_len = matrix_b_as_bytes.len() as u64;
    // C uses the same [rows][cols][elems] layout that `to_bytes` produces.
    let matrix_c_byte_len = (std::mem::size_of_val(&matrix_a.rows)
        + std::mem::size_of_val(&matrix_b.cols)
        + std::mem::size_of::<u32>() * (matrix_a.rows * matrix_b.cols) as usize)
        as u64;

    // Host-visible staging buffers holding the serialized input matrices.
    let matrix_a_host_local = Buffer::from_iter(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::TRANSFER_SRC,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::HOST_SEQUENTIAL_WRITE
                | MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_a_as_bytes,
    )
    .unwrap();

    let matrix_b_host_local = Buffer::from_iter(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::TRANSFER_SRC,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::HOST_SEQUENTIAL_WRITE
                | MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_b_as_bytes,
    )
    .unwrap();

    // Device-local buffers that the shader actually reads from.
    let matrix_a_device_local = Buffer::new_slice::<u8>(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::STORAGE_BUFFER | BufferUsage::TRANSFER_DST,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_a_byte_len,
    )
    .unwrap();

    let matrix_b_device_local = Buffer::new_slice::<u8>(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::STORAGE_BUFFER | BufferUsage::TRANSFER_DST,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_b_byte_len,
    )
    .unwrap();

    // Output buffer: device-preferred but host-mappable, so the result can
    // be read back directly without another staging copy.
    let matrix_c_buf = Buffer::new_slice::<u8>(
        memory_allocator.clone(),
        BufferCreateInfo {
            usage: BufferUsage::STORAGE_BUFFER,
            ..Default::default()
        },
        AllocationCreateInfo {
            memory_type_filter: MemoryTypeFilter::HOST_RANDOM_ACCESS
                | MemoryTypeFilter::PREFER_DEVICE,
            ..Default::default()
        },
        matrix_c_byte_len,
    )
    .unwrap();

    let command_buffer_allocator = Arc::new(StandardCommandBufferAllocator::new(
        device.clone(),
        Default::default(),
    ));

    // Record and run a one-shot command buffer that copies both input
    // matrices from the staging buffers into device-local memory.
    let transfer_command_buffer = {
        let mut builder = AutoCommandBufferBuilder::primary(
            command_buffer_allocator.clone(),
            queue.queue_family_index(),
            CommandBufferUsage::OneTimeSubmit,
        )
        .unwrap();

        builder
            .copy_buffer(CopyBufferInfo::buffers(
                matrix_a_host_local,
                matrix_a_device_local.clone(),
            ))
            .unwrap()
            .copy_buffer(CopyBufferInfo::buffers(
                matrix_b_host_local,
                matrix_b_device_local.clone(),
            ))
            .unwrap();

        builder.build().unwrap()
    };

    transfer_command_buffer
        .execute(queue.clone())
        .unwrap()
        .then_signal_fence_and_flush()
        .unwrap()
        .wait(None)
        .unwrap();

    // Build the compute pipeline; the pipeline layout is derived from the
    // shader's own descriptor requirements.
    let pipeline = {
        let compute_shader = cs::load(device.clone()).unwrap();
        let compute_shader_entry_point = compute_shader.entry_point("main").unwrap();
        let compute_stage = PipelineShaderStageCreateInfo::new(compute_shader_entry_point);

        let layout = PipelineLayout::new(
            device.clone(),
            PipelineDescriptorSetLayoutCreateInfo::from_stages([&compute_stage])
                .into_pipeline_layout_create_info(device.clone())
                .unwrap(),
        )
        .unwrap();

        ComputePipeline::new(
            device.clone(),
            None,
            ComputePipelineCreateInfo::stage_layout(compute_stage, layout),
        )
        .unwrap()
    };

    let descriptor_set_allocator = Arc::new(StandardDescriptorSetAllocator::new(
        device.clone(),
        Default::default(),
    ));
    let descriptor_set_layout = pipeline.layout().set_layouts()[0].clone();
    // Bindings 0, 1, 2 correspond to matrices A, B, and C in the shader.
    let descriptor_set = DescriptorSet::new(
        descriptor_set_allocator,
        descriptor_set_layout,
        [
            WriteDescriptorSet::buffer(0, matrix_a_device_local.clone()),
            WriteDescriptorSet::buffer(1, matrix_b_device_local.clone()),
            WriteDescriptorSet::buffer(2, matrix_c_buf.clone()),
        ],
        [],
    )
    .unwrap();

    let command_buffer = {
        let mut command_buffer_builder = AutoCommandBufferBuilder::primary(
            command_buffer_allocator,
            queue.queue_family_index(),
            CommandBufferUsage::MultipleSubmit,
        )
        .unwrap();

        command_buffer_builder
            .bind_pipeline_compute(pipeline.clone())
            .unwrap()
            .bind_descriptor_sets(
                PipelineBindPoint::Compute,
                pipeline.layout().clone(),
                0,
                descriptor_set,
            )
            .unwrap();
        // One invocation per element of C, in 8x8 workgroups. `div_ceil`
        // rounds the workgroup count up, so the shader must bounds-check
        // invocations that fall outside the matrix.
        unsafe {
            command_buffer_builder
                .dispatch([
                    MATRIX_A_NUM_ROWS.div_ceil(8),
                    MATRIX_B_NUM_COLS.div_ceil(8),
                    1,
                ])
                .unwrap();
        }

        command_buffer_builder.build().unwrap()
    };

    // Run the matrix multiplication on the GPU and wait on the fence.
    let start = Instant::now();
    sync::now(device.clone())
        .then_execute(queue.clone(), command_buffer.clone())
        .unwrap()
        .then_signal_fence_and_flush()
        .unwrap()
        .wait(None)
        .unwrap();
    let gpu_tm = start.elapsed();
    println!("(a) GPU matrix multiply: {:?}", gpu_tm);

    // Read back the GPU-computed result.
    let gpu_result_as_bytes = matrix_c_buf.read().unwrap();
    let gpu_result = Matrix::from_bytes(&gpu_result_as_bytes);

    // Parallel CPU multiplication using rayon.
    let start = Instant::now();
    let cpu_result_rayon = matrix_a.par_mul(&matrix_b);
    let cpu_tm_rayon = start.elapsed();
    println!("(b) CPU parallel matrix multiply: {:?}", cpu_tm_rayon);

    // Single-threaded CPU multiplication (consumes both matrices).
    let start = Instant::now();
    let cpu_result_single_threaded = matrix_a * matrix_b;
    let cpu_tm_single_threaded = start.elapsed();
    println!(
        "(c) CPU single-threaded matrix multiply: {:?}",
        cpu_tm_single_threaded
    );

    // All three implementations must agree exactly.
    assert_eq!(gpu_result, cpu_result_single_threaded);
    assert_eq!(cpu_result_single_threaded, cpu_result_rayon);

    // Report speed-ups as floating-point ratios; integer nanosecond division
    // truncates (e.g. to 0x whenever the denominator is the larger time).
    println!(
        "Speed Up (b/a): {:.2}x\nSpeed Up (c/a): {:.2}x\nSpeed Up (c/b): {:.2}x",
        cpu_tm_rayon.as_secs_f64() / gpu_tm.as_secs_f64(),
        cpu_tm_single_threaded.as_secs_f64() / gpu_tm.as_secs_f64(),
        cpu_tm_single_threaded.as_secs_f64() / cpu_tm_rayon.as_secs_f64(),
    );
}

mod cs {
    // Compiles ./matrix_multiply.glsl to SPIR-V at build time via the
    // vulkano_shaders proc macro.
    vulkano_shaders::shader! {
        ty: "compute",
        path: "./matrix_multiply.glsl",
        vulkan_version: "1.2",
    }
}
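
// The GLSL source itself is not part of this listing. A minimal sketch of
// what ./matrix_multiply.glsl could look like, assuming an 8x8 workgroup (to
// match the div_ceil(8) dispatch above) and the [rows, cols, elems...] u32
// layout produced by Matrix::to_bytes -- the actual shader may differ:
//
//     #version 450
//     layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
//     layout(set = 0, binding = 0) readonly buffer MatrixA { uint a[]; };
//     layout(set = 0, binding = 1) readonly buffer MatrixB { uint b[]; };
//     layout(set = 0, binding = 2) buffer MatrixC { uint c[]; };
//
//     void main() {
//         uint a_rows = a[0];
//         uint a_cols = a[1];
//         uint b_cols = b[1];
//         uint row = gl_GlobalInvocationID.x;
//         uint col = gl_GlobalInvocationID.y;
//         // Extra invocations from the rounded-up dispatch do nothing.
//         if (row >= a_rows || col >= b_cols) { return; }
//         // One invocation writes C's [rows][cols] header.
//         if (row == 0 && col == 0) { c[0] = a_rows; c[1] = b_cols; }
//         uint sum = 0;
//         for (uint k = 0; k < a_cols; k++) {
//             sum += a[2 + row * a_cols + k] * b[2 + k * b_cols + col];
//         }
//         c[2 + row * b_cols + col] = sum;
//     }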