Host application source ("Parallelizing the Data Path" example):
/**********
Copyright (c) 2019-2020, Xilinx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********/
#include "event_timer.hpp" | |
#include <iostream> | |
#include <memory> | |
#include <string> | |
// Xilinx OpenCL and XRT includes | |
#include "xilinx_ocl.hpp" | |
void vmul_sw(float *a, float *b, float *c, uint32_t size) | |
{ | |
for (uint32_t i = 0; i < size; i++) { | |
c[i] = a[i] * b[i]; | |
} | |
} | |
int main(int argc, char *argv[])
{
    // Initialize an event timer we'll use for monitoring the application
    EventTimer et;

    // Check that the binary file is passed as the first argument
    // (an optional second argument overrides the buffer size)
    if (argc != 2 && argc != 3) {
        std::cout << "Usage: " << argv[0] << " <XCLBIN File> [buffer size]" << std::endl;
        return EXIT_FAILURE;
    }

    // Copy binary name
    char *binaryName = argv[1];

    // Get the target and make BUFSIZE 1024 times bigger for hardware runs.
    // XCL_EMULATION_MODE is unset for hardware runs, so guard against a
    // NULL return from getenv() before constructing the std::string.
    const char *emulation_mode = getenv("XCL_EMULATION_MODE");
    std::string target = (emulation_mode != NULL) ? emulation_mode : "hw";
    uint32_t BUFSIZE = (target.compare("hw") == 0) ? (1024 * 1024 * 32) : (1024 * 32);
    if (argc == 3) {
        BUFSIZE = std::stoi(argv[2]);
    }
std::cout << "-- Parallelizing the Data Path --" << std::endl << std::endl; | |
// Initialize the runtime (including a command queue) and load the | |
// FPGA image | |
std::cout << "Loading " << binaryName << " to program the board" << std::endl << std::endl; | |
et.add("OpenCL Initialization"); | |
// This application will use the first Xilinx device found in the system | |
swm::XilinxOcl xocl; | |
xocl.initialize(binaryName); | |
cl::CommandQueue q = xocl.get_command_queue(); | |
cl::Kernel krnl = xocl.get_kernel("wide_vmul"); | |
et.finish(); | |
/// New code for example 01 | |
std::cout << "Running kernel test XRT-allocated buffers and wide data path:" << std::endl | |
<< std::endl; | |
    // Allocate OpenCL buffers and let XRT manage the backing memory. Buffer
    // 'd' additionally requests a host-accessible allocation assigned to an
    // explicit memory bank via the Xilinx cl_mem_ext_ptr_t extension.
    et.add("Allocate contiguous OpenCL buffers");
    cl_mem_ext_ptr_t bank_ext;
    bank_ext.flags = 0 | XCL_MEM_TOPOLOGY; // memory bank index 0
    bank_ext.obj   = NULL;
    bank_ext.param = 0;
    cl::Buffer a_buf(xocl.get_context(),
                     static_cast<cl_mem_flags>(CL_MEM_READ_ONLY),
                     BUFSIZE * sizeof(uint32_t),
                     NULL,
                     NULL);
    cl::Buffer b_buf(xocl.get_context(),
                     static_cast<cl_mem_flags>(CL_MEM_READ_ONLY),
                     BUFSIZE * sizeof(uint32_t),
                     NULL,
                     NULL);
    cl::Buffer c_buf(xocl.get_context(),
                     static_cast<cl_mem_flags>(CL_MEM_READ_WRITE),
                     BUFSIZE * sizeof(uint32_t),
                     NULL,
                     NULL);
    cl::Buffer d_buf(xocl.get_context(),
                     static_cast<cl_mem_flags>(CL_MEM_READ_WRITE |
                                               CL_MEM_ALLOC_HOST_PTR |
                                               CL_MEM_EXT_PTR_XILINX),
                     BUFSIZE * sizeof(uint32_t),
                     &bank_ext,
                     NULL);
    et.finish();
    // Set vmul kernel arguments. We do this before mapping the buffers to allow XRT
    // to allocate the buffers in the appropriate memory banks for the selected
    // kernel. For buffer 'd' we explicitly set a bank above, but this buffer is
    // never migrated to the Alveo card, so that bank assignment never actually
    // takes effect.
    et.add("Set kernel arguments");
    krnl.setArg(0, a_buf);
    krnl.setArg(1, b_buf);
    krnl.setArg(2, c_buf);
    krnl.setArg(3, BUFSIZE);
et.add("Map buffers to user space pointers"); | |
float *a = (float *)q.enqueueMapBuffer(a_buf, | |
CL_TRUE, | |
CL_MAP_WRITE, | |
0, | |
BUFSIZE * sizeof(float)); | |
float *b = (float *)q.enqueueMapBuffer(b_buf, | |
CL_TRUE, | |
CL_MAP_WRITE, | |
0, | |
BUFSIZE * sizeof(float)); | |
float *d = (float *)q.enqueueMapBuffer(d_buf, | |
CL_TRUE, | |
CL_MAP_WRITE | CL_MAP_READ, | |
0, | |
BUFSIZE * sizeof(float)); | |
et.finish(); | |
et.add("Populating buffer inputs"); | |
for (uint32_t i = 0; i < BUFSIZE; i++) { | |
a[i] = i; | |
b[i] = 2 * i; | |
} | |
et.finish(); | |
// For comparison, let's have the CPU calculate the result | |
et.add("Software VADD run"); | |
vmul(a, b, d, BUFSIZE); | |
et.finish(); | |
    // Send the input buffers down to the Alveo card
    et.add("Memory object migration enqueue");
    cl::Event event_sp;
    q.enqueueMigrateMemObjects({a_buf, b_buf}, 0, NULL, &event_sp);
    clWaitForEvents(1, (const cl_event *)&event_sp);

    et.add("OCL Enqueue task");
    q.enqueueTask(krnl, NULL, &event_sp);
    et.add("Wait for kernel to complete");
    clWaitForEvents(1, (const cl_event *)&event_sp);

    // Migrate memory back from the device by mapping the output buffer
    et.add("Read back computation results");
    uint32_t *c = (uint32_t *)q.enqueueMapBuffer(c_buf,
                                                 CL_TRUE,
                                                 CL_MAP_READ,
                                                 0,
                                                 BUFSIZE * sizeof(uint32_t));
    et.finish();
    // Verify the results
    bool verified = true;
    for (uint32_t i = 0; i < BUFSIZE; i++) {
        if (c[i] != d[i]) {
            verified = false;
            std::cout << "ERROR: software and hardware vmul do not match: "
                      << c[i] << " != " << d[i] << " at position " << i << std::endl;
            break;
        }
    }

    if (verified) {
        std::cout
            << std::endl
            << "OCL-mapped contiguous buffer example completed successfully!"
            << std::endl
            << std::endl;
    }
    else {
        std::cout
            << std::endl
            << "OCL-mapped contiguous buffer example completed with errors!"
            << std::endl
            << std::endl;
    }

    std::cout << "--------------- Key execution times ---------------" << std::endl;

    q.enqueueUnmapMemObject(a_buf, a);
    q.enqueueUnmapMemObject(b_buf, b);
    q.enqueueUnmapMemObject(c_buf, c);
    q.enqueueUnmapMemObject(d_buf, d);
    q.finish();

    et.print();
}
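
The host passes BUFSIZE to the kernel as a count of 32-bit elements, while the kernel below operates on 512-bit words holding 16 elements each. The following minimal, standalone C++ sketch shows that size conversion, mirroring the kernel's ceiling division for size_in16; the helper name words_of_512 is purely illustrative and not part of either source file.

#include <cstdint>
#include <iostream>

// Number of 32-bit elements packed into one 512-bit word (512 / 32 = 16).
static const uint32_t ELEMENTS_PER_WORD = 16;

// Hypothetical helper: how many 512-bit words cover `size` 32-bit elements.
// This mirrors the kernel's `size_in16 = (size - 1) / VECTOR_SIZE + 1`.
uint32_t words_of_512(uint32_t size)
{
    return (size - 1) / ELEMENTS_PER_WORD + 1;
}

int main()
{
    std::cout << words_of_512(1024 * 32) << std::endl;        // emulation BUFSIZE -> 2048 words
    std::cout << words_of_512(1024 * 1024 * 32) << std::endl; // hardware BUFSIZE -> 2097152 words
    std::cout << words_of_512(100) << std::endl;              // non-multiple of 16 rounds up -> 7
    return 0;
}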
wide_vmul kernel source (HLS C++):
/**********
Copyright (c) 2018-2020, Xilinx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********/
/*******************************************************************************
Description:
    Wide Memory Access Example using the ap_uint<Width> datatype

    This is a vector multiplication example demonstrating wide memory access
    with a 512-bit data width using the ap_uint<> datatype, which is defined
    in the 'ap_int.h' header.
*******************************************************************************/

// Include to use the ap_uint<> datatype
#include <ap_int.h>
#include <stdio.h>
#include <string.h>

#define BUFFER_SIZE 64
#define DATAWIDTH 512
#define VECTOR_SIZE (DATAWIDTH / 32) // vector size is 16 (512 / 32 = 16)
typedef ap_uint<DATAWIDTH> uint512_dt;

// TRIPCOUNT identifiers
const unsigned int c_chunk_sz = BUFFER_SIZE;
const unsigned int c_size     = VECTOR_SIZE;
/*
    Vector Multiplication Kernel Implementation using the uint512_dt datatype
    Arguments:
        in1  (input)  --> Input Vector 1
        in2  (input)  --> Input Vector 2
        out  (output) --> Output Vector
        size (input)  --> Number of 32-bit elements in each vector
*/
extern "C"
{
void wide_vmul(
    const uint512_dt *in1, // Read-Only Vector 1
    const uint512_dt *in2, // Read-Only Vector 2
    uint512_dt *out,       // Output Result
    int size               // Number of 32-bit elements
)
{
    #pragma HLS INTERFACE m_axi port = in1 max_read_burst_length = 32 offset = slave bundle = gmem
    #pragma HLS INTERFACE m_axi port = in2 max_read_burst_length = 32 offset = slave bundle = gmem1
    #pragma HLS INTERFACE m_axi port = out max_write_burst_length = 32 offset = slave bundle = gmem2
    #pragma HLS INTERFACE s_axilite port = in1 bundle = control
    #pragma HLS INTERFACE s_axilite port = in2 bundle = control
    #pragma HLS INTERFACE s_axilite port = out bundle = control
    #pragma HLS INTERFACE s_axilite port = size bundle = control
    #pragma HLS INTERFACE s_axilite port = return bundle = control

    uint512_dt v1_local[BUFFER_SIZE]; // Local memory to store vector 1
    uint512_dt v2_local[BUFFER_SIZE]; // Local memory to store vector 2

    // 'size' counts 32-bit elements, but the kernel accesses 512-bit words
    // (16 elements at a time), so compute the number of 512-bit words to
    // read from global memory (ceiling division):
    int size_in16 = (size - 1) / VECTOR_SIZE + 1;
    // Each iteration of this loop performs up to BUFFER_SIZE wide (512-bit)
    // vector multiplications
    for (int i = 0; i < size_in16; i += BUFFER_SIZE) {
        //#pragma HLS PIPELINE
        #pragma HLS DATAFLOW
        #pragma HLS stream variable = v1_local depth = 64
        #pragma HLS stream variable = v2_local depth = 64

        int chunk_size = BUFFER_SIZE;

        // Boundary check for the final, possibly partial, chunk
        if ((i + BUFFER_SIZE) > size_in16)
            chunk_size = size_in16 - i;

        // Burst read both input vectors from global memory into local memory
        v1_rd:
        for (int j = 0; j < chunk_size; j++) {
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min = 1 max = 64
            v1_local[j] = in1[i + j];
            v2_local[j] = in2[i + j];
        }

        // Multiply the buffered 512-bit words lane by lane and burst write
        // the results back to global memory
        v2_mul:
        for (int j = 0; j < chunk_size; j++) {
            #pragma HLS pipeline
            #pragma HLS LOOP_TRIPCOUNT min = 1 max = 64
            uint512_dt tmpV1 = v1_local[j];
            uint512_dt tmpV2 = v2_local[j];
            uint512_dt tmpV3 = 0;

            // Multiply the 32-bit elements individually and compose the
            // output vector
            vec_mul:
            for (unsigned int s = 0; s < DATAWIDTH; s += 32) {
                #pragma HLS unroll
                tmpV3(s + 31, s) = tmpV1(s + 31, s) * tmpV2(s + 31, s);
            }
            out[i + j] = tmpV3;
        }
    }
}
}
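
For readers without the HLS headers on hand, here is a plain C++ sketch of what the vec_mul loop's range operations do: each 512-bit word is treated as sixteen independent 32-bit lanes, and each lane is multiplied with the result truncated to 32 bits (wrapping modulo 2^32), which is why the host's uint32_t reference multiply matches the hardware output. The array-of-lanes representation and the function name lane_multiply are a hypothetical stand-in for ap_uint<512>, not part of the kernel.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for one 512-bit word: sixteen 32-bit lanes.
const int LANES = 16;

// Emulates the kernel's vec_mul loop for a single 512-bit word: multiply
// each 32-bit lane independently; unsigned multiplication wraps modulo 2^32,
// just as the 32-bit range assignment in the kernel truncates the product.
void lane_multiply(const uint32_t v1[LANES], const uint32_t v2[LANES], uint32_t v3[LANES])
{
    for (int lane = 0; lane < LANES; lane++) {
        v3[lane] = v1[lane] * v2[lane];
    }
}

int main()
{
    uint32_t v1[LANES], v2[LANES], v3[LANES];
    for (int lane = 0; lane < LANES; lane++) {
        v1[lane] = lane;     // same pattern the host uses: a[i] = i
        v2[lane] = 2 * lane; //                             b[i] = 2 * i
    }
    lane_multiply(v1, v2, v3);
    for (int lane = 0; lane < LANES; lane++) {
        std::cout << v3[lane] << (lane + 1 < LANES ? ' ' : '\n');
    }
    return 0;
}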