Last active
June 6, 2018 14:08
-
-
Save buttercutter/a3af6c59906567c3df4179a501513a1b to your computer and use it in GitHub Desktop.
Verilog code for an HLS kernel interfacing with Xillybus
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Uncomment the next line to build the plain 128-bit loopback path
// (the `ifdef LOOPBACK branch further down) instead of the kernel path.
//`define LOOPBACK 1

// Top-level module: PCIe pins, four LEDs, and the user logic that sits behind
// the xillybus instance.
module xillydemo (
    input        PCIE_PERST_B_LS,
    input        PCIE_REFCLK_N,
    input        PCIE_REFCLK_P,
    input  [7:0] PCIE_RX_N,
    input  [7:0] PCIE_RX_P,
    output [3:0] GPIO_LED,
    output [7:0] PCIE_TX_N,
    output [7:0] PCIE_TX_P
);

    // Width in bits of the "_128" streams; the "_256" streams use twice this.
    localparam STREAM_WIDTH = 128;

    // Clock and quiesce, both connected to the xillybus instance
    wire bus_clk;
    wire quiesce;
// Memory array backing /dev/xillybus_mem_128.
// Every entry holds one full STREAM_WIDTH-bit word and there is one entry for
// each value of the $clog2(STREAM_WIDTH)-bit user_mem_128_addr bus, so neither
// the written data nor the address is truncated. (The previous 32 x 8-bit
// array kept only the low byte of each word and was smaller than the address
// range.)
// NOTE(review): the address width has to match the one the xillybus core was
// generated with -- confirm against the IP core configuration.
reg [STREAM_WIDTH-1:0] demoarray[0:(1 << $clog2(STREAM_WIDTH))-1];
// ---------------- /dev/xillybus_mem_128 ----------------
// Read side (FPGA to CPU)
wire user_r_mem_128_rden, user_r_mem_128_empty, user_r_mem_128_eof, user_r_mem_128_open;
reg  [STREAM_WIDTH-1:0] user_r_mem_128_data;   // assigned in the inferred-RAM always block
// Write side (CPU to FPGA)
wire user_w_mem_128_wren, user_w_mem_128_full, user_w_mem_128_open;
wire [STREAM_WIDTH-1:0] user_w_mem_128_data;
// Address shared by both sides
wire [$clog2(STREAM_WIDTH)-1:0] user_mem_128_addr;
wire user_mem_128_addr_update;

// ---------------- /dev/xillybus_read_128 ----------------
wire user_r_read_128_rden, user_r_read_128_empty, user_r_read_128_eof, user_r_read_128_open;
wire [STREAM_WIDTH-1:0] user_r_read_128_data;

// ---------------- /dev/xillybus_write_128 ----------------
wire user_w_write_128_wren, user_w_write_128_full, user_w_write_128_open;
wire [STREAM_WIDTH-1:0] user_w_write_128_data;

// ---------------- /dev/xillybus_read_256 ----------------
wire user_r_read_256_rden, user_r_read_256_empty, user_r_read_256_eof, user_r_read_256_open;
wire [2*STREAM_WIDTH-1:0] user_r_read_256_data;

// ---------------- /dev/xillybus_write_256 ----------------
wire user_w_write_256_wren, user_w_write_256_full, user_w_write_256_open;
wire [2*STREAM_WIDTH-1:0] user_w_write_256_data;
// Xillybus IP core instance. All connections are by name, so the order of the
// groups below carries no meaning.
xillybus xillybus_ins (
    // ---- Signals to top level ----
    .PCIE_PERST_B_LS          (PCIE_PERST_B_LS),
    .PCIE_REFCLK_N            (PCIE_REFCLK_N),
    .PCIE_REFCLK_P            (PCIE_REFCLK_P),
    .PCIE_RX_N                (PCIE_RX_N),
    .PCIE_RX_P                (PCIE_RX_P),
    .PCIE_TX_N                (PCIE_TX_N),
    .PCIE_TX_P                (PCIE_TX_P),
    .GPIO_LED                 (GPIO_LED),
    .bus_clk                  (bus_clk),
    .quiesce                  (quiesce),

    // ---- /dev/xillybus_mem_128 ----
    // FPGA to CPU
    .user_r_mem_128_rden      (user_r_mem_128_rden),
    .user_r_mem_128_empty     (user_r_mem_128_empty),
    .user_r_mem_128_data      (user_r_mem_128_data),
    .user_r_mem_128_eof       (user_r_mem_128_eof),
    .user_r_mem_128_open      (user_r_mem_128_open),
    // CPU to FPGA
    .user_w_mem_128_wren      (user_w_mem_128_wren),
    .user_w_mem_128_full      (user_w_mem_128_full),
    .user_w_mem_128_data      (user_w_mem_128_data),
    .user_w_mem_128_open      (user_w_mem_128_open),
    // Address
    .user_mem_128_addr        (user_mem_128_addr),
    .user_mem_128_addr_update (user_mem_128_addr_update),

    // ---- /dev/xillybus_read_128 (FPGA to CPU) ----
    .user_r_read_128_rden     (user_r_read_128_rden),
    .user_r_read_128_empty    (user_r_read_128_empty),
    .user_r_read_128_data     (user_r_read_128_data),
    .user_r_read_128_eof      (user_r_read_128_eof),
    .user_r_read_128_open     (user_r_read_128_open),

    // ---- /dev/xillybus_write_128 (CPU to FPGA) ----
    .user_w_write_128_wren    (user_w_write_128_wren),
    .user_w_write_128_full    (user_w_write_128_full),
    .user_w_write_128_data    (user_w_write_128_data),
    .user_w_write_128_open    (user_w_write_128_open),

    // ---- /dev/xillybus_read_256 (FPGA to CPU) ----
    .user_r_read_256_rden     (user_r_read_256_rden),
    .user_r_read_256_empty    (user_r_read_256_empty),
    .user_r_read_256_data     (user_r_read_256_data),
    .user_r_read_256_eof      (user_r_read_256_eof),
    .user_r_read_256_open     (user_r_read_256_open),

    // ---- /dev/xillybus_write_256 (CPU to FPGA) ----
    .user_w_write_256_wren    (user_w_write_256_wren),
    .user_w_write_256_full    (user_w_write_256_full),
    .user_w_write_256_data    (user_w_write_256_data),
    .user_w_write_256_open    (user_w_write_256_open)
);
// Simple inferred RAM behind /dev/xillybus_mem_128.
// Both accesses use user_mem_128_addr and non-blocking assignments, so a read
// issued in the same cycle as a write to the same address returns the old
// contents.
always @(posedge bus_clk) begin
    if (user_r_mem_128_rden) begin
        user_r_mem_128_data <= demoarray[user_mem_128_addr];
    end

    if (user_w_mem_128_wren) begin
        demoarray[user_mem_128_addr] <= user_w_mem_128_data;
    end
end
// The memory interface is always ready: it never reports empty, end-of-file
// or full.
assign user_r_mem_128_empty = 1'b0;
assign user_r_mem_128_eof   = 1'b0;
assign user_w_mem_128_full  = 1'b0;

// /dev/xillybus_read_256 and /dev/xillybus_write_256 are connected to the
// xillybus instance but no user logic in this module drives their
// core-facing signals. Tie them off explicitly instead of leaving the nets
// floating: the read stream never offers data and the write stream never
// accepts any.
// NOTE(review): the idle values (empty = 1, full = 1) are a design choice --
// confirm how the host is expected to see these two unused device files.
assign user_r_read_256_empty = 1'b1;
assign user_r_read_256_data  = {(STREAM_WIDTH << 1){1'b0}};
assign user_r_read_256_eof   = 1'b0;
assign user_w_write_256_full = 1'b1;
`ifdef LOOPBACK
// ---------------- 128-bit loopback ----------------
// Words written to /dev/xillybus_write_128 go through one FIFO and come back
// out on /dev/xillybus_read_128.
wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

// The FIFO is held in reset while neither device file is open.
wire loopback_fifo_reset = !(user_w_write_128_open || user_r_read_128_open);

fifo_128 fifo_128x128 (
    .clk        (bus_clk),
    .reset      (loopback_fifo_reset),
    .flush_en   (1'b0),
    // write side (from the host)
    .enqueue_en (user_w_write_128_wren),
    .value_i    (user_w_write_128_data),
    .full       (user_w_write_128_full),
    // read side (to the host)
    .dequeue_en (user_r_read_128_rden),
    .value_o    (user_r_read_128_data),
    .empty      (user_r_read_128_empty),
    // fill level
    .count      (data_count_of_loopback_fifo)
);

// The loopback read stream never signals end-of-file.
assign user_r_read_128_eof = 1'b0;
// Image geometry used to size the transfer counter.
localparam TOTAL_NUM_OF_PIXELS          = 512*512; // the image is sized as 3*512*512, width=512 and height=512
localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3;       // input: [R, G, B]  output: [Y, U, V]
localparam PIXEL_VALUE_RANGE            = 8;       // bits per component (8-bit unsigned), see
                                                   // https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions

// Number of STREAM_WIDTH-bit words needed to carry the whole image.
localparam TOTAL_NUM_OF_128_BIT_WORDS =
    TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH;

// Counts the words written into the loopback FIFO, to check whether xillybus
// has transmitted all pixel data. Starts at zero, is cleared while both
// device files are closed, and stops once the full image has been counted.
reg [$clog2(TOTAL_NUM_OF_128_BIT_WORDS)-1:0] number_of_128_bit_data_passed_through_loopback_fifo = 0;

always @(posedge bus_clk) begin
    if (!(user_w_write_128_open || user_r_read_128_open)) begin
        number_of_128_bit_data_passed_through_loopback_fifo <= 0;
    end
    else if (user_w_write_128_wren &&
             (number_of_128_bit_data_passed_through_loopback_fifo < TOTAL_NUM_OF_128_BIT_WORDS)) begin
        // one write strobe = one 128-bit word accepted from xillybus
        number_of_128_bit_data_passed_through_loopback_fifo <=
            number_of_128_bit_data_passed_through_loopback_fifo + 1;
    end
end
// Vivado integrated logic analyzer for the loopback build.
ila_1 ila (
    .clk    (bus_clk),
    // data buses
    .probe0 (user_w_write_128_data),
    .probe1 (user_r_read_128_data),
    // FIFO fill level and handshake
    .probe2 (data_count_of_loopback_fifo),
    .probe3 (user_w_write_128_full),
    .probe4 (user_w_write_128_wren),
    .probe5 (user_r_read_128_rden),
    .probe6 (user_r_read_128_empty),
    // device-file state
    .probe7 (user_w_write_128_open),
    .probe8 (user_r_read_128_open),
    // progress counter
    .probe9 (number_of_128_bit_data_passed_through_loopback_fifo)
);
`else
// ---------------- Kernel mode: parameters and signals ----------------
// An image pixel has three unsigned 8-bit components ([R, G, B] in,
// [Y, U, V] out), i.e. 3*8 = 24 bits. One 128-bit transaction therefore
// carries at most floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5
// pixels (120 bits): five pixels arrive on /dev/xillybus_write_128, five
// kernels process them, and five pixels leave on /dev/xillybus_read_128.
localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3;       // input: [R, G, B], output: [Y, U, V]
localparam PIXEL_VALUE_RANGE            = 8;       // bits per component (8-bit unsigned), see
                                                   // https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
localparam KERNEL_NUM                   = 5;       // kernel copies; each handles [R, G, B] of one pixel
localparam TOTAL_NUM_OF_PIXELS          = 512*512; // lena.tiff is sized as 3*512*512, width=512 and height=512

// Depth and fill-level signals of the two buffer FIFOs
localparam FIFO_DEPTH = 16;
wire [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo;  // used below to decide whether all five pixel slots are filled
wire [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;

//------------------------------------------- kernel ----------------------------------------//
// Input side: 5 pixels = KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits
// of read data; the most significant 8 bits indicate the last 128-bit transaction.
wire [STREAM_WIDTH-1:0]                            stream_i_V_V_dout;
wire                                               stream_i_V_V_empty;  // empty condition
wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read;   // per-component read enables, active high

// Output side: same 120-bit layout for the write data.
// NOTE(review): only the low 120 bits are connected to kernel outputs in this
// file; confirm what should drive the top 8 bits.
(* mark_debug = "true" *) wire [STREAM_WIDTH-1:0]  stream_o_V_V_din;
wire                                               stream_o_V_V_full;   // full condition
wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write;  // per-component write enables, active high

// Most significant 8 bits of the word currently on the host write bus.
wire [PIXEL_VALUE_RANGE-1:0] num_of_pixels_left_in_the_last_128_bits_transaction;
assign num_of_pixels_left_in_the_last_128_bits_transaction =
    user_w_write_128_data[KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE];

// The remaining pixels do not fill all five pixel slots of a 128-bit stream,
// AND software has flagged the last 128-bit transaction.
wire is_last_few_pixels;
assign is_last_few_pixels = (num_of_pixels_left_in_the_last_128_bits_transaction > 0) &&
                            (data_count_of_input_fifo < KERNEL_NUM);

// HLS block-level handshake, one bit per kernel
reg  [KERNEL_NUM-1:0] ap_start = 0;   // initially the HLS kernels are not started
wire [KERNEL_NUM-1:0] ap_done;
wire [KERNEL_NUM-1:0] ap_idle;
wire [KERNEL_NUM-1:0] ap_ready;

// Start pattern for the last transaction and the index used to build it
reg [KERNEL_NUM-1:0]         start_signal_during_last_transaction = 0;
reg [$clog2(KERNEL_NUM)-1:0] start_index = 0;
// Builds the start mask used for the last (partial) transaction: one bit of
// start_signal_during_last_transaction is set per clock, from bit 0 upwards,
// until as many bits are set as the host-supplied pixel count.
// Mask and index are cleared while both device files are closed, preparing
// for the transmission of the next image.
always @(posedge bus_clk) begin
    if (!user_w_write_128_open && !user_r_read_128_open) begin
        start_signal_during_last_transaction <= 0;
        start_index <= 0;
    end
    else begin
        // The extra (start_index < KERNEL_NUM) bound keeps the bit-select
        // inside the KERNEL_NUM-bit mask and stops start_index from wrapping
        // and cycling forever when the host byte is larger than KERNEL_NUM.
        // For counts up to KERNEL_NUM the behavior is unchanged.
        if ((start_index < KERNEL_NUM) &&
            (start_index < num_of_pixels_left_in_the_last_128_bits_transaction)) begin
            start_signal_during_last_transaction[start_index] <= 1'b1;
            start_index <= start_index + 1;
        end
    end
end
// Start signals depend on whether all five pixel slots are filled or not.
// Normal case: every kernel gets the same start bit, which is high when all
// read enables are asserted or the input stream is not empty.
wire all_kernels_may_start = (&stream_i_V_V_read) || !stream_i_V_V_empty;

always @(posedge bus_clk) begin
    // Last partial transaction: use the per-kernel mask instead.
    ap_start <= is_last_few_pixels ? start_signal_during_last_transaction
                                   : {KERNEL_NUM{all_kernels_may_start}};
end
// ----------------- input FIFO ----------------------------------//
// Buffers words from /dev/xillybus_write_128 for the kernels. A word is
// dequeued only when every per-component read enable is asserted. The FIFO is
// held in reset while neither device file is open.
fifo_fwft_128 #(
    .WIDTH (STREAM_WIDTH),
    .SIZE  (FIFO_DEPTH)
) input_pipe (
    .clk        (bus_clk),
    .reset      (!(user_w_write_128_open || user_r_read_128_open)),
    .flush_en   (1'b0),
    // host side
    .enqueue_en (user_w_write_128_wren),
    .value_i    (user_w_write_128_data),
    .full       (user_w_write_128_full),
    // kernel side
    .dequeue_en (&stream_i_V_V_read),
    .value_o    (stream_i_V_V_dout),
    .empty      (stream_i_V_V_empty),
    // fill level
    .count      (data_count_of_input_fifo)
);
// Counts the pixels written by xillybus into the input_pipe FIFO, to check
// whether all pixel data has been transmitted. Cleared while both device
// files are closed.
//
// The register is sized with $clog2(TOTAL_NUM_OF_PIXELS+1) so that it can hold
// the value TOTAL_NUM_OF_PIXELS itself; $clog2(TOTAL_NUM_OF_PIXELS) bits only
// reach TOTAL_NUM_OF_PIXELS-1.
// NOTE(review): this signal is connected to ila_0 probe21 -- confirm that the
// probe width in the ILA configuration matches the wider register.
reg [$clog2(TOTAL_NUM_OF_PIXELS+1)-1:0] number_of_pixels_received_by_input_fifo = 0; // initially nothing is received

always @(posedge bus_clk) begin
    if (!user_w_write_128_open && !user_r_read_128_open)
        number_of_pixels_received_by_input_fifo <= 0;
    else if (user_w_write_128_wren) begin
        if (!is_last_few_pixels) begin
            // Every transaction except the last carries KERNEL_NUM pixels;
            // only count it if the total would not be exceeded.
            if (number_of_pixels_received_by_input_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM))
                number_of_pixels_received_by_input_fifo <= number_of_pixels_received_by_input_fifo + KERNEL_NUM;
        end
        else if (number_of_pixels_received_by_input_fifo < TOTAL_NUM_OF_PIXELS) begin
            // Last, partially filled transaction. It used to be blocked by the
            // (<= TOTAL_NUM_OF_PIXELS-KERNEL_NUM) guard, so the counter stopped
            // short of the total whenever the image size is not a multiple of
            // KERNEL_NUM.
            number_of_pixels_received_by_input_fifo <= number_of_pixels_received_by_input_fifo + num_of_pixels_left_in_the_last_128_bits_transaction;
        end
    end
end
// Replicates KERNEL_NUM hardware copies of the RGB2YUV HLS kernel, one per
// pixel slot. Copy kn uses the three consecutive PIXEL_VALUE_RANGE-bit fields
// that start at bit kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE of the
// input/output words, and the three consecutive handshake bits that start at
// bit kn*NUM_OF_COMPONENTS_IN_A_PIXEL of the read/write enable vectors.
generate
    genvar kn; // to indicate which kernel
    for (kn = 0; kn < KERNEL_NUM; kn = kn + 1) begin
        localparam PIXEL_LSB = kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE; // first data bit of this pixel
        localparam FLAG_LSB  = kn*NUM_OF_COMPONENTS_IN_A_PIXEL;                   // first handshake bit of this pixel

        kernel RGB2YUV_kn (
            .ap_clk   (bus_clk),
            .ap_rst   (!(user_w_write_128_open || user_r_read_128_open)),
            .ap_start (ap_start[kn]), // need to confirm ?
            .ap_done  (ap_done[kn]),
            .ap_idle  (ap_idle[kn]),
            .ap_ready (ap_ready[kn]),

            // input component R with (PIXEL_VALUE_RANGE) bits
            .stream_i0_V_V_dout    (stream_i_V_V_dout[PIXEL_LSB +: PIXEL_VALUE_RANGE]),
            .stream_i0_V_V_empty_n (!stream_i_V_V_empty),
            .stream_i0_V_V_read    (stream_i_V_V_read[FLAG_LSB]),
            // input component G with (PIXEL_VALUE_RANGE) bits
            .stream_i1_V_V_dout    (stream_i_V_V_dout[PIXEL_LSB + PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
            .stream_i1_V_V_empty_n (!stream_i_V_V_empty),
            .stream_i1_V_V_read    (stream_i_V_V_read[FLAG_LSB + 1]),
            // input component B with (PIXEL_VALUE_RANGE) bits
            .stream_i2_V_V_dout    (stream_i_V_V_dout[PIXEL_LSB + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
            .stream_i2_V_V_empty_n (!stream_i_V_V_empty),
            .stream_i2_V_V_read    (stream_i_V_V_read[FLAG_LSB + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),

            // output component Y with (PIXEL_VALUE_RANGE) bits
            .stream_o0_V_V_din     (stream_o_V_V_din[PIXEL_LSB +: PIXEL_VALUE_RANGE]),
            .stream_o0_V_V_full_n  (!stream_o_V_V_full),
            .stream_o0_V_V_write   (stream_o_V_V_write[FLAG_LSB]),
            // output component U with (PIXEL_VALUE_RANGE) bits
            .stream_o1_V_V_din     (stream_o_V_V_din[PIXEL_LSB + PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
            .stream_o1_V_V_full_n  (!stream_o_V_V_full),
            .stream_o1_V_V_write   (stream_o_V_V_write[FLAG_LSB + 1]),
            // output component V with (PIXEL_VALUE_RANGE) bits
            .stream_o2_V_V_din     (stream_o_V_V_din[PIXEL_LSB + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
            .stream_o2_V_V_full_n  (!stream_o_V_V_full),
            .stream_o2_V_V_write   (stream_o_V_V_write[FLAG_LSB + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
        );
    end
endgenerate
//---------------------- output FIFO -----------------------------//
// Buffers kernel results for /dev/xillybus_read_128. A word is enqueued when
// every per-component write enable is asserted, or, during the last partial
// transaction, when ap_done equals ap_start.
// NOTE(review): (ap_done == ap_start) also holds while both vectors are all
// zero -- confirm that this cannot enqueue spurious words.
wire output_pipe_enqueue = (&stream_o_V_V_write) ||
                           (is_last_few_pixels && (ap_done == ap_start));

fifo_128 #(
    .WIDTH (STREAM_WIDTH),
    .SIZE  (FIFO_DEPTH)
) output_pipe (
    .clk        (bus_clk),
    .reset      (!(user_w_write_128_open || user_r_read_128_open)),
    .flush_en   (1'b0),
    // kernel side
    .enqueue_en (output_pipe_enqueue),
    .value_i    (stream_o_V_V_din),
    .full       (stream_o_V_V_full),
    // host side
    .dequeue_en (user_r_read_128_rden),
    .value_o    (user_r_read_128_data),
    .empty      (user_r_read_128_empty),
    // fill level
    .count      (data_count_of_output_fifo)
);
// Counts the pixels read by xillybus from the output_pipe FIFO, to check
// whether all pixel data has been transmitted. Cleared while both device
// files are closed.
//
// The register is sized with $clog2(TOTAL_NUM_OF_PIXELS+1) so that it can hold
// the value TOTAL_NUM_OF_PIXELS itself; with only $clog2(TOTAL_NUM_OF_PIXELS)
// bits the counter wrapped back to 0 at the moment the image was complete.
// NOTE(review): this signal is connected to ila_0 probe22 -- confirm that the
// probe width in the ILA configuration matches the wider register.
reg [$clog2(TOTAL_NUM_OF_PIXELS+1)-1:0] number_of_pixels_sent_by_output_fifo = 0; // initially nothing is sent

always @(posedge bus_clk) begin
    if (!user_w_write_128_open && !user_r_read_128_open)
        number_of_pixels_sent_by_output_fifo <= 0;
    // stop counting once the whole image has been accounted for
    else if (user_r_read_128_rden && (number_of_pixels_sent_by_output_fifo < TOTAL_NUM_OF_PIXELS)) begin
        if ((number_of_pixels_sent_by_output_fifo + KERNEL_NUM) <= TOTAL_NUM_OF_PIXELS)
            // every transaction except the last sends KERNEL_NUM pixels
            number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + KERNEL_NUM;
        else
            // NOTE(review): this count is taken from the host write bus at the
            // time of the read, not from the word being read out -- confirm
            // that it still holds the last transaction's value here.
            number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + num_of_pixels_left_in_the_last_128_bits_transaction;
    end
end

// The kernel read stream never signals end-of-file.
assign user_r_read_128_eof = 1'b0;
// Vivado integrated logic analyzer for the kernel build.
ila_0 ila (
    .clk     (bus_clk),
    // data path, in flow order: host -> input FIFO -> kernels -> output FIFO -> host
    .probe0  (user_w_write_128_data),
    .probe1  (stream_i_V_V_dout),
    .probe2  (stream_o_V_V_din),
    .probe3  (user_r_read_128_data),
    // kernel stream handshakes
    .probe4  (stream_i_V_V_read),
    .probe5  (stream_o_V_V_write),
    // FIFO fill levels and flags
    .probe6  (data_count_of_input_fifo),
    .probe7  (data_count_of_output_fifo),
    .probe8  (user_w_write_128_full),
    .probe9  (stream_i_V_V_empty),
    .probe10 (user_w_write_128_wren),
    .probe11 (user_r_read_128_rden),
    .probe12 (stream_o_V_V_full),
    .probe13 (user_r_read_128_empty),
    // device-file state
    .probe14 (user_w_write_128_open),
    .probe15 (user_r_read_128_open),
    // HLS block-level control
    .probe16 (ap_start),
    .probe17 (ap_done),
    .probe18 (ap_idle),
    .probe19 (ap_ready),
    // last-transaction flag and progress counters
    .probe20 (is_last_few_pixels),
    .probe21 (number_of_pixels_received_by_input_fifo),
    .probe22 (number_of_pixels_sent_by_output_fifo)
);
`endif | |
endmodule |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment