Skip to content

Instantly share code, notes, and snippets.

@buttercutter
Last active June 6, 2018 14:08
Show Gist options
  • Save buttercutter/a3af6c59906567c3df4179a501513a1b to your computer and use it in GitHub Desktop.
Save buttercutter/a3af6c59906567c3df4179a501513a1b to your computer and use it in GitHub Desktop.
Verilog code for an HLS kernel interfacing with Xillybus
//`define LOOPBACK 1
// Top-level demo module: instantiates the Xillybus core and the user logic attached to its streams.
module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P, GPIO_LED, PCIE_TX_N, PCIE_TX_P);
localparam STREAM_WIDTH = 128; // width in bits of the *_128 data buses

// Geometry of the demo RAM behind /dev/xillybus_mem_128. The depth follows the address bus so that every
// address the core can present selects a real entry.
// NOTE(review): the address width is derived from STREAM_WIDTH, as in the original wire declaration;
// confirm it matches the user_mem_128_addr port width of the generated xillybus core.
localparam MEM_ADDR_WIDTH = $clog2(STREAM_WIDTH);
localparam MEM_DEPTH = 1 << MEM_ADDR_WIDTH;

// Top-level pins, all passed straight through to the xillybus instance
input PCIE_PERST_B_LS;
input PCIE_REFCLK_N;
input PCIE_REFCLK_P;
input [7:0] PCIE_RX_N;
input [7:0] PCIE_RX_P;
output [3:0] GPIO_LED;
output [7:0] PCIE_TX_N;
output [7:0] PCIE_TX_P;

// Clock and quiesce
wire bus_clk;
wire quiesce;

// Memory array. Each entry is as wide as the mem_128 data bus, so a written word is stored in full
// (an 8-bit entry would keep only the low byte of every 128-bit write).
reg [STREAM_WIDTH-1:0] demoarray[0:MEM_DEPTH-1];

// Wires related to /dev/xillybus_mem_128
wire user_r_mem_128_rden;
wire user_r_mem_128_empty;
reg [STREAM_WIDTH-1:0] user_r_mem_128_data; // registered read data of demoarray
wire user_r_mem_128_eof;
wire user_r_mem_128_open;
wire user_w_mem_128_wren;
wire user_w_mem_128_full;
wire [STREAM_WIDTH-1:0] user_w_mem_128_data;
wire user_w_mem_128_open;
wire [MEM_ADDR_WIDTH-1:0] user_mem_128_addr;
wire user_mem_128_addr_update;

// Wires related to /dev/xillybus_read_128
wire user_r_read_128_rden;
wire user_r_read_128_empty;
wire [STREAM_WIDTH-1:0] user_r_read_128_data;
wire user_r_read_128_eof;
wire user_r_read_128_open;

// Wires related to /dev/xillybus_write_128
wire user_w_write_128_wren;
wire user_w_write_128_full;
wire [STREAM_WIDTH-1:0] user_w_write_128_data;
wire user_w_write_128_open;

// Wires related to /dev/xillybus_read_256 (twice the stream width)
wire user_r_read_256_rden;
wire user_r_read_256_empty;
wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
wire user_r_read_256_eof;
wire user_r_read_256_open;

// Wires related to /dev/xillybus_write_256 (twice the stream width)
wire user_w_write_256_wren;
wire user_w_write_256_full;
wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
wire user_w_write_256_open;
// Xillybus core instance. All connections are by name, so their order here is purely for readability.
// NOTE(review): no logic visible in this module drives user_r_read_256_empty/_data/_eof or
// user_w_write_256_full, and nothing consumes the other *_256 nets -- confirm the 256-bit streams are
// intentionally left unconnected.
xillybus xillybus_ins (
  // Clock and quiesce
  .bus_clk                  (bus_clk),
  .quiesce                  (quiesce),

  // Signals to top level
  .PCIE_PERST_B_LS          (PCIE_PERST_B_LS),
  .PCIE_REFCLK_N            (PCIE_REFCLK_N),
  .PCIE_REFCLK_P            (PCIE_REFCLK_P),
  .PCIE_RX_N                (PCIE_RX_N),
  .PCIE_RX_P                (PCIE_RX_P),
  .PCIE_TX_N                (PCIE_TX_N),
  .PCIE_TX_P                (PCIE_TX_P),
  .GPIO_LED                 (GPIO_LED),

  // /dev/xillybus_mem_128 -- FPGA to CPU
  .user_r_mem_128_rden      (user_r_mem_128_rden),
  .user_r_mem_128_empty     (user_r_mem_128_empty),
  .user_r_mem_128_data      (user_r_mem_128_data),
  .user_r_mem_128_eof       (user_r_mem_128_eof),
  .user_r_mem_128_open      (user_r_mem_128_open),
  // /dev/xillybus_mem_128 -- CPU to FPGA
  .user_w_mem_128_wren      (user_w_mem_128_wren),
  .user_w_mem_128_full      (user_w_mem_128_full),
  .user_w_mem_128_data      (user_w_mem_128_data),
  .user_w_mem_128_open      (user_w_mem_128_open),
  // /dev/xillybus_mem_128 -- address
  .user_mem_128_addr        (user_mem_128_addr),
  .user_mem_128_addr_update (user_mem_128_addr_update),

  // /dev/xillybus_read_128 -- FPGA to CPU
  .user_r_read_128_rden     (user_r_read_128_rden),
  .user_r_read_128_empty    (user_r_read_128_empty),
  .user_r_read_128_data     (user_r_read_128_data),
  .user_r_read_128_eof      (user_r_read_128_eof),
  .user_r_read_128_open     (user_r_read_128_open),

  // /dev/xillybus_write_128 -- CPU to FPGA
  .user_w_write_128_wren    (user_w_write_128_wren),
  .user_w_write_128_full    (user_w_write_128_full),
  .user_w_write_128_data    (user_w_write_128_data),
  .user_w_write_128_open    (user_w_write_128_open),

  // /dev/xillybus_read_256 -- FPGA to CPU
  .user_r_read_256_rden     (user_r_read_256_rden),
  .user_r_read_256_empty    (user_r_read_256_empty),
  .user_r_read_256_data     (user_r_read_256_data),
  .user_r_read_256_eof      (user_r_read_256_eof),
  .user_r_read_256_open     (user_r_read_256_open),

  // /dev/xillybus_write_256 -- CPU to FPGA
  .user_w_write_256_wren    (user_w_write_256_wren),
  .user_w_write_256_full    (user_w_write_256_full),
  .user_w_write_256_data    (user_w_write_256_data),
  .user_w_write_256_open    (user_w_write_256_open)
);
// Inferred RAM behind /dev/xillybus_mem_128, clocked by bus_clk.
// Write port: store the incoming word at the current address whenever the core asserts wren.
always @(posedge bus_clk)
  if (user_w_mem_128_wren)
    demoarray[user_mem_128_addr] <= user_w_mem_128_data;

// Read port: registered read of the addressed entry whenever the core asserts rden.
// Both ports use non-blocking assignments, so a read in the same cycle as a write to the
// same address returns the previously stored value.
always @(posedge bus_clk)
  if (user_r_mem_128_rden)
    user_r_mem_128_data <= demoarray[user_mem_128_addr];

// The memory interface never reports empty, end-of-file or full.
assign user_r_mem_128_empty = 1'b0;
assign user_r_mem_128_eof   = 1'b0;
assign user_w_mem_128_full  = 1'b0;
`ifdef LOOPBACK
// Fill level reported by the loopback FIFO
wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

// 128-bit loopback: words written to /dev/xillybus_write_128 are returned on /dev/xillybus_read_128.
// The FIFO is held in reset while neither device file is open.
fifo_128 fifo_128x128 (
  .clk        (bus_clk),
  .reset      (!(user_w_write_128_open || user_r_read_128_open)),
  .flush_en   (1'b0),
  // write side (CPU to FPGA)
  .enqueue_en (user_w_write_128_wren),
  .value_i    (user_w_write_128_data),
  .full       (user_w_write_128_full),
  // read side (FPGA to CPU)
  .dequeue_en (user_r_read_128_rden),
  .value_o    (user_r_read_128_data),
  .empty      (user_r_read_128_empty),
  // status
  .count      (data_count_of_loopback_fifo)
);

// The read stream never signals end-of-file
assign user_r_read_128_eof = 1'b0;
// Image geometry used to size the loopback word counter
localparam TOTAL_NUM_OF_PIXELS = 512*512; // the image is sized as 3*512*512 , width=512 and height=512
localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] output:[Y, U, V]
localparam PIXEL_VALUE_RANGE = 8; // bits per component (8-bit unsigned), https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
// Number of 128-bit words needed to carry every component bit of the whole image
localparam TOTAL_NUM_OF_128_BIT_WORDS = TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH;

// Counts the 128-bit words written into the loopback FIFO, to check whether xillybus has pushed the
// whole image through. Cleared while both device files are closed; stops counting at the word total.
reg [$clog2(TOTAL_NUM_OF_128_BIT_WORDS)-1:0] number_of_128_bit_data_passed_through_loopback_fifo = 0; // initially nothing is received
always @(posedge bus_clk) begin
  if (!(user_w_write_128_open || user_r_read_128_open)) begin
    number_of_128_bit_data_passed_through_loopback_fifo <= 0;
  end
  else if (user_w_write_128_wren && (number_of_128_bit_data_passed_through_loopback_fifo < TOTAL_NUM_OF_128_BIT_WORDS)) begin
    // one write strobe delivers one 128-bit word
    number_of_128_bit_data_passed_through_loopback_fifo <= number_of_128_bit_data_passed_through_loopback_fifo + 1;
  end
end
// Vivado integrated logic analyzer for the loopback path (connections are by name; grouped by role)
ila_1 ila (
  .clk    (bus_clk),
  // data buses on either side of the loopback FIFO
  .probe0 (user_w_write_128_data),
  .probe1 (user_r_read_128_data),
  // write-side handshake
  .probe4 (user_w_write_128_wren),
  .probe3 (user_w_write_128_full),
  .probe7 (user_w_write_128_open),
  // read-side handshake
  .probe5 (user_r_read_128_rden),
  .probe6 (user_r_read_128_empty),
  .probe8 (user_r_read_128_open),
  // occupancy and progress counters
  .probe2 (data_count_of_loopback_fifo),
  .probe9 (number_of_128_bit_data_passed_through_loopback_fifo)
);
`else
// ---------------------------------- kernel (non-loopback) datapath ----------------------------------
// A pixel is NUM_OF_COMPONENTS_IN_A_PIXEL components of PIXEL_VALUE_RANGE bits each, i.e. 3*8 = 24 bits, with
// every component in the range 0..255. A 128-bit word therefore carries at most floor(128/24) = 5 whole
// pixels (120 bits): each transaction on /dev/xillybus_write_128 delivers up to 5 pixels, one per kernel,
// and the 5 results are returned through /dev/xillybus_read_128.
localparam TOTAL_NUM_OF_PIXELS = 512*512; // lena.tiff is sized as 3*512*512 , width=512 and height=512
localparam KERNEL_NUM = 5; // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
localparam PIXEL_VALUE_RANGE = 8; // bits per component (8-bit unsigned), https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] , output:[Y, U, V]
localparam FIFO_DEPTH = 16; // SIZE parameter passed to both buffer FIFOs

// Fill levels reported by the two buffer FIFOs
wire [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo;
wire [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;

// Kernel input stream (output side of the input FIFO)
wire [STREAM_WIDTH-1:0] stream_i_V_V_dout; // 5 pixels = KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits; the top 8 bits carry the last-transaction marker
wire stream_i_V_V_empty; // empty flag of the input FIFO
wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read; // one active-high read enable per component of each of the 5 pixels

// Kernel output stream (input side of the output FIFO)
(* mark_debug = "true" *) wire [STREAM_WIDTH-1:0] stream_o_V_V_din; // 5 pixels = 120 bits of kernel results in the low bits
wire stream_o_V_V_full; // full flag of the output FIFO
wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write; // one active-high write enable per component of each of the 5 pixels

// Most significant 8 bits of the word currently on the write bus: bits [STREAM_WIDTH-1 : 120]
wire [PIXEL_VALUE_RANGE-1:0] num_of_pixels_left_in_the_last_128_bits_transaction;
assign num_of_pixels_left_in_the_last_128_bits_transaction = user_w_write_128_data[STREAM_WIDTH-1 : KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE];

// High when the marker field is non-zero (the software's last-transaction indication) AND the input
// FIFO fill level is below KERNEL_NUM.
wire is_last_few_pixels;
assign is_last_few_pixels = (num_of_pixels_left_in_the_last_128_bits_transaction != 0) && (data_count_of_input_fifo < KERNEL_NUM);

// Block-level handshake of the HLS kernels, one bit per kernel
reg [KERNEL_NUM-1:0] ap_start = 0; // initially the HLS kernels are not started
wire [KERNEL_NUM-1:0] ap_done;
wire [KERNEL_NUM-1:0] ap_idle;
wire [KERNEL_NUM-1:0] ap_ready;

// Start mask built up for the last (partial) transaction, and the index of the next bit to set
reg [KERNEL_NUM-1:0] start_signal_during_last_transaction = 0;
reg [$clog2(KERNEL_NUM)-1:0] start_index = 0;
// Builds the per-kernel start mask for the last (partial) transaction, one bit per clock: bit i is set
// once start_index has walked past it, so a marker value of N ends up as N low bits set.
// Cleared while both device files are closed, ready for the next image.
//
// start_index is additionally bounded by KERNEL_NUM. Without the bound, a marker field larger than
// KERNEL_NUM makes the index select bits that do not exist in the mask, and a field larger than the
// index register can hold makes the index wrap around and count forever.
//
// NOTE(review): the marker is read straight from user_w_write_128_data and is not qualified by
// user_w_write_128_wren -- confirm the bus value is meaningful on cycles without a write strobe.
always @(posedge bus_clk) begin
  if (!user_w_write_128_open && !user_r_read_128_open) begin
    start_signal_during_last_transaction <= 0; // reset all bits to zero, preparing for transmission of next image
    start_index <= 0;
  end
  else if ((start_index < KERNEL_NUM) && (start_index < num_of_pixels_left_in_the_last_128_bits_transaction)) begin
    start_signal_during_last_transaction[start_index] <= 1'b1;
    start_index <= start_index + 1; // stops at KERNEL_NUM, which still fits in the index register
  end
end
// Registered start request for the kernels.
// Full transactions: all kernels share one start bit, high when every component read enable is
// asserted or the input FIFO is not empty.
// Last (partial) transaction: only the kernels selected by the start mask are started.
wire kernel_input_available = (&stream_i_V_V_read) || !stream_i_V_V_empty;
always @(posedge bus_clk)
  ap_start <= is_last_few_pixels ? start_signal_during_last_transaction
                                 : {KERNEL_NUM{kernel_input_available}};
// ------------------------------------------ input FIFO ------------------------------------------
// Buffers 128-bit words from /dev/xillybus_write_128 for the kernels. Held in reset while neither
// device file is open. A word is dequeued only when all component read enables are high together.
fifo_fwft_128 #(
  .WIDTH (STREAM_WIDTH),
  .SIZE  (FIFO_DEPTH)
) input_pipe (
  .clk        (bus_clk),
  .reset      (!(user_w_write_128_open || user_r_read_128_open)),
  .flush_en   (1'b0),
  // write side: xillybus (CPU to FPGA)
  .enqueue_en (user_w_write_128_wren),
  .value_i    (user_w_write_128_data),
  .full       (user_w_write_128_full),
  // read side: kernels
  .dequeue_en (&stream_i_V_V_read),
  .value_o    (stream_i_V_V_dout),
  .empty      (stream_i_V_V_empty),
  // status
  .count      (data_count_of_input_fifo)
);
// Counts the pixels written into the input FIFO, to check whether xillybus has delivered the whole image.
// Every write strobe adds KERNEL_NUM pixels, except a last (partial) transaction, which adds the pixel
// count carried in the marker field. The counter saturates at TOTAL_NUM_OF_PIXELS and is cleared while
// both device files are closed.
//
// The register is sized to hold TOTAL_NUM_OF_PIXELS itself; $clog2(TOTAL_NUM_OF_PIXELS) bits only reach
// TOTAL_NUM_OF_PIXELS-1. The guard is "count < total" so the final partial transaction is counted:
// 512*512 is not a multiple of KERNEL_NUM, and a "count <= total-KERNEL_NUM" guard stops at 262140.
// NOTE(review): this register is one bit wider than before; if the ila_0 probe it feeds was generated
// for the narrower width, regenerate the ILA (otherwise expect a port-width warning).
reg [$clog2(TOTAL_NUM_OF_PIXELS+1)-1:0] number_of_pixels_received_by_input_fifo = 0; // initially nothing is received

// Pixels carried by the word being written this cycle
wire [PIXEL_VALUE_RANGE-1:0] pixels_in_current_write = is_last_few_pixels ? num_of_pixels_left_in_the_last_128_bits_transaction : KERNEL_NUM;
// Sum with one spare bit so the saturation compare below cannot overflow
wire [$clog2(TOTAL_NUM_OF_PIXELS+1):0] received_pixels_next = number_of_pixels_received_by_input_fifo + pixels_in_current_write;

always @(posedge bus_clk) begin
  if (!user_w_write_128_open && !user_r_read_128_open)
    number_of_pixels_received_by_input_fifo <= 0;
  else if (user_w_write_128_wren && (number_of_pixels_received_by_input_fifo < TOTAL_NUM_OF_PIXELS))
    number_of_pixels_received_by_input_fifo <= (received_pixels_next > TOTAL_NUM_OF_PIXELS) ? TOTAL_NUM_OF_PIXELS : received_pixels_next;
end
// KERNEL_NUM hardware copies of the RGB2YUV HLS kernel, one per pixel slot of the 128-bit word.
// Kernel kn owns bits [kn*PIXEL_WIDTH +: PIXEL_WIDTH] of both the input and the output word. Inside
// that slot, component 0 (R in / Y out) is the lowest PIXEL_VALUE_RANGE bits, component 1 (G / U) the
// next and component 2 (B / V) the highest. All kernels share the input FIFO's empty flag and the
// output FIFO's full flag; each component has its own read/write enable bit.
localparam PIXEL_WIDTH = NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE; // bits occupied by one pixel

generate
genvar kn; // to indicate which kernel
for(kn=0; kn<KERNEL_NUM; kn=kn+1) begin
  kernel RGB2YUV_kn (
    .ap_clk(bus_clk),
    .ap_rst(!user_w_write_128_open && !user_r_read_128_open), // held in reset while neither device file is open
    .ap_start(ap_start[kn]), // NOTE(review): original author flagged this connection as "need to confirm"
    .ap_done(ap_done[kn]),
    .ap_idle(ap_idle[kn]),
    .ap_ready(ap_ready[kn]),

    // input component R
    .stream_i0_V_V_dout(stream_i_V_V_dout[kn*PIXEL_WIDTH +: PIXEL_VALUE_RANGE]),
    .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
    .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
    // input component G
    .stream_i1_V_V_dout(stream_i_V_V_dout[kn*PIXEL_WIDTH + PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
    .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
    .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
    // input component B
    .stream_i2_V_V_dout(stream_i_V_V_dout[kn*PIXEL_WIDTH + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
    .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
    .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),

    // output component Y
    .stream_o0_V_V_din(stream_o_V_V_din[kn*PIXEL_WIDTH +: PIXEL_VALUE_RANGE]),
    .stream_o0_V_V_full_n(!stream_o_V_V_full),
    .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
    // output component U
    .stream_o1_V_V_din(stream_o_V_V_din[kn*PIXEL_WIDTH + PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
    .stream_o1_V_V_full_n(!stream_o_V_V_full),
    .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
    // output component V
    .stream_o2_V_V_din(stream_o_V_V_din[kn*PIXEL_WIDTH + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),
    .stream_o2_V_V_full_n(!stream_o_V_V_full),
    .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
  );
end
endgenerate

// The kernels drive only the low KERNEL_NUM*PIXEL_WIDTH bits of the output word. The remaining high bits
// have no other driver in this module yet are stored into the output FIFO, so tie them to 0 instead of
// leaving them floating.
// NOTE(review): if the host expects a last-transaction marker in these bits on the read side, drive
// that value here instead.
assign stream_o_V_V_din[STREAM_WIDTH-1 : KERNEL_NUM*PIXEL_WIDTH] = {(STREAM_WIDTH - KERNEL_NUM*PIXEL_WIDTH){1'b0}};
//----------------------output FIFO-----------------------------//
// Buffers 128-bit result words from the kernels for /dev/xillybus_read_128. Held in reset while
// neither device file is open.
//
// A word is enqueued when either
//   * every component write enable is high together (a full 5-pixel word), or
//   * during the last (partial) transaction, at least one kernel reports done and the set of done
//     kernels equals the set of started kernels.
// The "|ap_done" term is required: with no kernel started and none done, ap_done == ap_start holds
// trivially (both all-zero), which would enqueue a word on every clock while is_last_few_pixels is high.
// NOTE(review): ap_start is a registered request while ap_done comes straight from the kernels; confirm
// against the kernels' handshake timing that both are stable in the cycle this comparison is made.
fifo_128
#(
  .WIDTH(STREAM_WIDTH),
  .SIZE(FIFO_DEPTH)
)
output_pipe (
  .clk(bus_clk),
  .reset(!user_w_write_128_open && !user_r_read_128_open),
  .flush_en(1'b0),
  .value_i(stream_o_V_V_din),
  .enqueue_en((&stream_o_V_V_write) || (is_last_few_pixels && (|ap_done) && (ap_done == ap_start))),
  .dequeue_en(user_r_read_128_rden),
  .value_o(user_r_read_128_data),
  .full(stream_o_V_V_full),
  .empty(user_r_read_128_empty),
  .count(data_count_of_output_fifo)
);
// Counts the pixels read out of the output FIFO, to check whether xillybus has fetched the whole image.
// Every read strobe adds KERNEL_NUM pixels while a full group still fits below TOTAL_NUM_OF_PIXELS.
// The read that would exceed the total is the final partial word; whatever remains is exactly the
// distance to the total, so the counter saturates at TOTAL_NUM_OF_PIXELS. It deliberately does not add
// the marker field of user_w_write_128_data here: that is the write-side bus, whose current value is
// unrelated to the word being read. Cleared while both device files are closed.
//
// The register is sized to hold TOTAL_NUM_OF_PIXELS itself; $clog2(TOTAL_NUM_OF_PIXELS) bits only reach
// TOTAL_NUM_OF_PIXELS-1.
// NOTE(review): this register is one bit wider than before; if the ila_0 probe it feeds was generated
// for the narrower width, regenerate the ILA (otherwise expect a port-width warning).
reg [$clog2(TOTAL_NUM_OF_PIXELS+1)-1:0] number_of_pixels_sent_by_output_fifo = 0; // initially nothing is sent
always @(posedge bus_clk) begin
  if (!user_w_write_128_open && !user_r_read_128_open)
    number_of_pixels_sent_by_output_fifo <= 0;
  else if (user_r_read_128_rden) begin
    if ((number_of_pixels_sent_by_output_fifo + KERNEL_NUM) <= TOTAL_NUM_OF_PIXELS)
      number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + KERNEL_NUM; // a full group of 'KERNEL_NUM' pixels
    else
      number_of_pixels_sent_by_output_fifo <= TOTAL_NUM_OF_PIXELS; // final partial word: saturate at the total
  end
end

// The read stream never signals end-of-file
assign user_r_read_128_eof = 0;
// Vivado integrated logic analyzer for the kernel path (connections are by name; grouped by role)
ila_0 ila (
  .clk     (bus_clk),
  // data buses, in pipeline order: xillybus -> input FIFO -> kernels -> output FIFO -> xillybus
  .probe0  (user_w_write_128_data),
  .probe1  (stream_i_V_V_dout),
  .probe2  (stream_o_V_V_din),
  .probe3  (user_r_read_128_data),
  // xillybus write-side handshake
  .probe10 (user_w_write_128_wren),
  .probe8  (user_w_write_128_full),
  .probe14 (user_w_write_128_open),
  // xillybus read-side handshake
  .probe11 (user_r_read_128_rden),
  .probe13 (user_r_read_128_empty),
  .probe15 (user_r_read_128_open),
  // kernel stream handshake
  .probe4  (stream_i_V_V_read),
  .probe9  (stream_i_V_V_empty),
  .probe5  (stream_o_V_V_write),
  .probe12 (stream_o_V_V_full),
  // kernel block-level control
  .probe16 (ap_start),
  .probe17 (ap_done),
  .probe18 (ap_idle),
  .probe19 (ap_ready),
  .probe20 (is_last_few_pixels),
  // FIFO fill levels and pixel progress counters
  .probe6  (data_count_of_input_fifo),
  .probe7  (data_count_of_output_fifo),
  .probe21 (number_of_pixels_received_by_input_fifo),
  .probe22 (number_of_pixels_sent_by_output_fifo)
);
`endif
endmodule
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment