Commit 1db76b02 authored by Andrey Filippov

Comparing mclt Bayer with Java results, fixing bugs

parent 84e3107c
......@@ -41,7 +41,7 @@
module mclt16x16_bayer#(
parameter SHIFT_WIDTH = 7, // bits in shift (7 bits - fractional)
parameter PIX_ADDR_WIDTH = 9, // number of pixel address width
// parameter EXT_PIX_LATENCY = 2, // external pixel buffer a->d latency
parameter EXT_PIX_LATENCY = 2, // external pixel buffer a->d latency (may increase to 4 for gamma)
parameter COORD_WIDTH = 10, // bits in full coordinate 10 for 18K RAM
parameter PIXEL_WIDTH = 16, // input pixel width (unsigned)
parameter WND_WIDTH = 18, // input pixel width (unsigned)
......@@ -199,23 +199,20 @@ module mclt16x16_bayer#(
end
`ifdef DSP_ACCUM_FOLD
localparam ADDR_DLYL = 4 - EXT_PIX_LATENCY; // 4'h2; // 3 for mpy, 2 - for dsp
`else
localparam ADDR_DLYL = 5 - EXT_PIX_LATENCY; // 4'h3; // 3 for mpy, 2 - for dsp
`endif
mclt_bayer_fold #(
.SHIFT_WIDTH (SHIFT_WIDTH),
.PIX_ADDR_WIDTH (PIX_ADDR_WIDTH),
.ADDR_DLY (4'h1), // 2 for mpy, 1 - for dsp
.ADDR_DLY (ADDR_DLYL), // 3 for mpy, 2 - for dsp
.COORD_WIDTH (COORD_WIDTH),
// .PIXEL_WIDTH (PIXEL_WIDTH),
.WND_WIDTH (WND_WIDTH)
// .OUT_WIDTH (OUT_WIDTH),
// .DTT_IN_WIDTH (DTT_IN_WIDTH),
// .TRANSPOSE_WIDTH (TRANSPOSE_WIDTH),
// .OUT_RSHIFT (OUT_RSHIFT),
// .OUT_RSHIFT2 (OUT_RSHIFT2),
// .DSP_B_WIDTH (DSP_B_WIDTH),
// .DSP_A_WIDTH (DSP_A_WIDTH),
// .DSP_P_WIDTH (DSP_P_WIDTH),
// .DEAD_CYCLES (DEAD_CYCLES)
) mclt_bayer_fold_i (
.clk (clk), // input
.rst (rst), // input
......@@ -233,7 +230,6 @@ module mclt16x16_bayer#(
.signs (signs), // output[1:0]
.phases (phases), // output[7:0]
.var_pre2_first(var_pre2_first), // output
// .var_first (), // var_first), // output reg
.pre_last_in (pre_last_in_w)// output reg
);
......@@ -269,8 +265,10 @@ module mclt16x16_bayer#(
reg [4:0] dtt_out_ram_cntr;
reg [4:0] dtt_out_ram_wah;
reg [1:0] dtt_out_ram_wpage; // one of 4 pages (128 samples long) being written to
reg [1:0] dtt_out_ram_wpage2; // later by 1 DTT
wire dtt_start_fill; // some data available in DTT output buffer, OK to start consecutive readout
reg dtt_start_first_fill;
reg dtt_start_second_fill;
reg [1:0] dtt_start_out; // start read out to sin/cos rotator
......@@ -305,8 +303,12 @@ module mclt16x16_bayer#(
dtt_start_first_fill <= dtt_start_fill & dtt_first_quad_out;
dtt_start_second_fill<= dtt_start_fill & ~dtt_first_quad_out;
if (dtt_start_first_fill) dtt_out_ram_wpage <= dtt_out_ram_wah[4:3];
if (dtt_start_second_fill) dtt_out_ram_wpage2 <= dtt_out_ram_wpage;
if (rst) dtt_dly_cntr <= 0;
else if (dtt_start_first_fill) dtt_dly_cntr <= DTT_OUT_DELAY;
else if (|dtt_dly_cntr) dtt_dly_cntr <= dtt_dly_cntr - 1;
......@@ -320,21 +322,10 @@ module mclt16x16_bayer#(
if (rst) dtt_rd_regen_dv[3:1] <= 0;
else dtt_rd_regen_dv[3:1] <= dtt_rd_regen_dv[2:0];
if (dtt_start_out[0]) dtt_rd_cntr_pre <= {dtt_out_ram_wpage, 7'b0}; //copy page number
// if (dtt_start_out[0]) dtt_rd_cntr_pre <= {dtt_out_ram_wpage, 7'b0}; //copy page number
if (dtt_start_out[0]) dtt_rd_cntr_pre <= {dtt_out_ram_wpage2, 7'b0}; //copy page number
else if (dtt_rd_regen_dv[0]) dtt_rd_cntr_pre <= dtt_rd_cntr_pre + 1;
/*
dtt_rd_ra0 <= {dtt_rd_cntr_pre[8:7],
dtt_rd_cntr_pre[6] ^ dtt_rd_cntr_pre[5],
dtt_rd_cntr_pre[5]? (~dtt_rd_cntr_pre[4:0]) : dtt_rd_cntr_pre[4:0],
dtt_rd_cntr_pre[5]};
dtt_rd_ra1 <= {dtt_rd_cntr_pre[8:7],
dtt_rd_cntr_pre[6] ^ dtt_rd_cntr_pre[5],
dtt_rd_cntr_pre[5]? (~dtt_rd_cntr_pre[4:0]) : dtt_rd_cntr_pre[4:0],
~dtt_rd_cntr_pre[5]};
*/
dtt_rd_ra0 <= {dtt_rd_cntr_pre[8:7],
dtt_rd_cntr_pre[0] ^ dtt_rd_cntr_pre[1],
dtt_rd_cntr_pre[0]? (~dtt_rd_cntr_pre[6:2]) : dtt_rd_cntr_pre[6:2],
......
......@@ -37,7 +37,7 @@
* with at least one of the Free Software programs.
*/
`timescale 1ns/1ps
`define DSP_ACCUM_FOLD 1
//`define DSP_ACCUM_FOLD 1
module mclt_baeyer_fold_accum # (
parameter PIXEL_WIDTH = 16, // input pixel width (unsigned)
parameter WND_WIDTH = 18, // input pixel width (unsigned)
......@@ -49,15 +49,13 @@ module mclt_baeyer_fold_accum # (
)(
input clk,
input rst,
// input [5:0] phases,
input pre_phase,
input signed [PIXEL_WIDTH-1:0] pix_d, //!< pixel data
input signed [PIXEL_WIDTH-1:0] pix_d, //!< pixel data (should be 1 cycle later for `undef DSP_ACCUM_FOLD
input [1:0] pix_sgn, //!< bit 0: sign to add to dtt-cc input, bit 1: sign to add to dtt-cs input
input signed [WND_WIDTH-1:0] window,
input var_pre2_first,
output signed [DTT_IN_WIDTH-1:0] dtt_in,
output dtt_in_dv
);
reg var_pre_first;
reg var_first;
......@@ -65,43 +63,6 @@ module mclt_baeyer_fold_accum # (
reg [6:0] phases;
`ifdef DSP_ACCUM_FOLD
reg dtt_in_dv_dsp_r;
reg signed [DTT_IN_WIDTH-1:0] dtt_in_dsp;
`else
wire [ 1:0] pix_sgn_d;
reg [PIXEL_WIDTH-1:0] pix_dr; // only for mpy to match dsp
reg signed [WND_WIDTH-1:0] window_r;
reg signed [PIXEL_WIDTH-1:0] pix_d_r; // registered pixel data (to be absorbed by MPY)
reg [ 1:0] pix_sgn_r;
reg signed [PIXEL_WIDTH + WND_WIDTH - 1:0] pix_wnd_r; // MSB not used: positive[PIXEL_WIDTH]*positive[WND_WIDTH]->positive[PIXEL_WIDTH+WND_WIDTH-1]
reg signed [DTT_IN_WIDTH-1:0] pix_wnd_r2; // pixels (positive) multiplied by window(positive), two MSBs == 2'b0 to prevent overflow
// rounding
// wire signed [DTT_IN_WIDTH-3:0] pix_wnd_r2_w = pix_wnd_r[PIXEL_WIDTH + WND_WIDTH - 2 -: DTT_IN_WIDTH - 2]
wire signed [DTT_IN_WIDTH-2:0] pix_wnd_r2_w = pix_wnd_r[PIXEL_WIDTH + WND_WIDTH - 2 -: DTT_IN_WIDTH - 1]
`ifdef ROUND
// + pix_wnd_r[PIXEL_WIDTH + WND_WIDTH -DTT_IN_WIDTH]
+ pix_wnd_r[PIXEL_WIDTH + WND_WIDTH -DTT_IN_WIDTH -1]
`endif
;
reg signed [DTT_IN_WIDTH-1:0] data_cc_r;
reg signed [DTT_IN_WIDTH-1:0] data_sc_r;
reg signed [DTT_IN_WIDTH-1:0] data_sc_r2; // data_sc_r delayed by 1 cycle
reg mode_mux;
reg dtt_in_dv_r;
reg signed [DTT_IN_WIDTH-1:0] data_dtt_in; // multiplexed DTT input data
`endif
`ifdef DSP_ACCUM_FOLD
assign dtt_in = dtt_in_dsp;
assign dtt_in_dv = dtt_in_dv_dsp_r;
`else
assign dtt_in = data_dtt_in;
assign dtt_in_dv = dtt_in_dv_r;
`endif
always @ (posedge clk) begin
phases <= {phases[5:0], pre_phase};
......@@ -119,6 +80,12 @@ module mclt_baeyer_fold_accum # (
`ifdef DSP_ACCUM_FOLD
reg dtt_in_dv_dsp_r;
reg signed [DTT_IN_WIDTH-1:0] dtt_in_dsp;
assign dtt_in = dtt_in_dsp;
assign dtt_in_dv = dtt_in_dv_dsp_r;
always @ (posedge clk) begin
if (rst) dtt_in_dv_dsp_r <= 0;
else dtt_in_dv_dsp_r <= phases[5];
......@@ -223,13 +190,40 @@ module mclt_baeyer_fold_accum # (
);
`else
wire [ 1:0] pix_sgn_d;
/// reg [PIXEL_WIDTH-1:0] pix_dr; // only for mpy to match dsp
reg signed [WND_WIDTH-1:0] window_r;
reg signed [PIXEL_WIDTH-1:0] pix_d_r; // registered pixel data (to be absorbed by MPY)
reg [ 1:0] pix_sgn_r;
reg signed [PIXEL_WIDTH + WND_WIDTH - 1:0] pix_wnd_r; // MSB not used: positive[PIXEL_WIDTH]*positive[WND_WIDTH]->positive[PIXEL_WIDTH+WND_WIDTH-1]
reg signed [DTT_IN_WIDTH-1:0] pix_wnd_r2; // pixels (positive) multiplied by window(positive), two MSBs == 2'b0 to prevent overflow
// rounding
// wire signed [DTT_IN_WIDTH-3:0] pix_wnd_r2_w = pix_wnd_r[PIXEL_WIDTH + WND_WIDTH - 2 -: DTT_IN_WIDTH - 2]
wire signed [DTT_IN_WIDTH-2:0] pix_wnd_r2_w = pix_wnd_r[PIXEL_WIDTH + WND_WIDTH - 2 -: DTT_IN_WIDTH - 1]
`ifdef ROUND
// + pix_wnd_r[PIXEL_WIDTH + WND_WIDTH -DTT_IN_WIDTH]
+ pix_wnd_r[PIXEL_WIDTH + WND_WIDTH -DTT_IN_WIDTH -1]
`endif
;
reg signed [DTT_IN_WIDTH-1:0] data_cc_r;
reg signed [DTT_IN_WIDTH-1:0] data_sc_r;
reg signed [DTT_IN_WIDTH-1:0] data_sc_r2; // data_sc_r delayed by 1 cycle
reg mode_mux;
reg dtt_in_dv_r;
reg signed [DTT_IN_WIDTH-1:0] data_dtt_in; // multiplexed DTT input data
assign dtt_in = data_dtt_in;
assign dtt_in_dv = dtt_in_dv_r;
always @ (posedge clk) begin
if (rst) dtt_in_dv_r <= 0;
else dtt_in_dv_r <= phases[6];
pix_dr <= pix_d;
/// pix_dr <= pix_d;
if (phases[1]) begin
pix_d_r <= pix_dr;
/// pix_d_r <= pix_dr;
pix_d_r <= pix_d;
window_r <= window;
end
if (phases[2]) pix_wnd_r <= pix_d_r * window_r; // 1 MSB is extra
......
......@@ -41,20 +41,9 @@
module mclt_bayer_fold#(
parameter SHIFT_WIDTH = 7, // bits in shift (7 bits - fractional)
parameter PIX_ADDR_WIDTH = 9, // number of pixel address width
// parameter EXT_PIX_LATENCY = 2, // external pixel buffer a->d latency
parameter ADDR_DLY = 4'h2, // extra delay of pixel address to match window delay
parameter ADDR_DLY = 4'h3, // extra delay of pixel address to match window delay
parameter COORD_WIDTH = 10, // bits in full coordinate 10 for 18K RAM
// parameter PIXEL_WIDTH = 16, // input pixel width (unsigned)
parameter WND_WIDTH = 18 // input pixel width (unsigned)
// parameter OUT_WIDTH = 25, // bits in dtt output
// parameter DTT_IN_WIDTH = 25, // bits in DTT input
// parameter TRANSPOSE_WIDTH = 25, // width of the transpose memory (intermediate results)
// parameter OUT_RSHIFT = 2, // overall right shift of the result from input, aligned by MSB (>=3 will never cause saturation)
// parameter OUT_RSHIFT2 = 0, // overall right shift for the second (vertical) pass
// parameter DSP_B_WIDTH = 18, // signed, output from sin/cos ROM
// parameter DSP_A_WIDTH = 25,
// parameter DSP_P_WIDTH = 48,
// parameter DEAD_CYCLES = 14 // start next block immediately, or with longer pause
)(
input clk, //!< system clock, posedge
input rst, //!< sync reset
......@@ -71,14 +60,12 @@ module mclt_bayer_fold#(
output pix_page, //!< copy pixel page (should be externally combined with first color)
output signed [WND_WIDTH-1:0] window, //!< msb==0, always positive
output [1:0] signs, //!< bit 0: sign to add to dtt-cc input, bit 1: sign to add to dtt-cs input
// output [14:0] phases, //!< other signals
output [6:0] phases, //!< other signals
output var_pre2_first,//!< two ahead of first of 2 fold variants (4 for monochrome, 2 left for checker)
// output reg var_first, //!< first of 2 fold variants (4 for monochrome, 2 left for checker)
output reg pre_last_in //!< pre last data in
);
reg [6:0] in_cntr; // input phase counter
// reg [14:0] run_r; // run phase
reg [6:0] run_r; // run phase
reg [1:0] tile_size_r; // 0: 16x16, 1 - 18x18, 2 - 20x20, 3 - 22x22 (max for 9-bit addr)
......@@ -86,9 +73,11 @@ module mclt_bayer_fold#(
reg [7:0] top_left_r0; // index of the 16x16 top left corner
reg [7:0] top_left_r; // index of the 16x16 top left corner
reg [1:0] valid_rows_r0;// 3 for green, 1 or 2 for R/B - which of the even/odd checker rows contain pixels
// reg [1:0] valid_rows_r ;// correct latency for window rom
/// wire [ 9:0] fold_addr= {tile_size_r,inv_checker_r, in_cntr[0],in_cntr[6:1]};
wire [ 9:0] fold_addr= {tile_size_r,inv_checker_r, (valid_rows_r0==3)?in_cntr[0]:valid_rows_r0[0],in_cntr[6:1]};
wire [ 9:0] fold_addr= {tile_size_r,inv_checker_r, (valid_rows_r0==3)?
in_cntr[0]:
// valid_rows_r0[0],
~valid_rows_r0[0],
in_cntr[6:1]};
reg [SHIFT_WIDTH-1:0] x_shft_r0; // tile pixel X fractional shift (valid @ start)
reg [SHIFT_WIDTH-1:0] y_shft_r0; // tile pixel Y fractional shift (valid @ start)
reg [SHIFT_WIDTH-1:0] x_shft_r; // matching delay
......@@ -100,28 +89,16 @@ module mclt_bayer_fold#(
wire [PIX_ADDR_WIDTH-1:0] pix_a_w = {~fold_rom_out[15] & fold_rom_out[7],fold_rom_out[15:8]};
reg [PIX_ADDR_WIDTH-1:0] pix_a_r;
wire [ 1:0] sgn_w = fold_rom_out[16 +: 2];
// reg blank_r; // blank window (latency 1 from fold_rom_out)
// wire blank_d; // delayed to match window rom regrst
wire pre_page = in_cntr == 2; // valid 1 cycle before fold_rom_out
wire var_first_d; // adding subtracting first variant of 2 folds
// reg var_pre_first;
assign phases = run_r;
assign var_pre2_first = var_first_d;
// wire [ 3:0] bayer_1hot = { mpix_a_w[4] & mpix_a_w[0],
// mpix_a_w[4] & ~mpix_a_w[0],
// ~mpix_a_w[4] & mpix_a_w[0],
// ~mpix_a_w[4] & ~mpix_a_w[0]};
// wire mpix_use = |(bayer_d & bayer_1hot); //not disabled by bayer, valid with mpix_a_w
// wire mpix_use_d; // delayed
// reg mpix_use_r; // delayed
always @ (posedge clk) begin
if (rst) run_r <= 0;
// else run_r <= {run_r[13:0], start | (run_r[0] & ~(&in_cntr[6:0]))};
else run_r <= {run_r[5:0], start | (run_r[0] & ~(&in_cntr[6:0]))};
if (!run_r[0]) in_cntr <= 0;
......@@ -146,18 +123,6 @@ module mclt_bayer_fold#(
if (run_r[2]) pix_a_r <= pix_a_w + {1'b0, top_left_r};
/// if (in_cntr == 2) valid_rows_r <= valid_rows_r0;
/// blank_r <= ~(wnd_a_w[0] ? valid_rows_r[1]: valid_rows_r[0]);
/// if (run_r[9]) var_pre_first <= var_first_d;
/// if (run_r[10]) begin
// var_first <= var_first_d;
/// var_first <= var_pre_first;
/// end
pre_last_in <= in_cntr[6:0] == 7'h7d;
......@@ -192,18 +157,25 @@ module mclt_bayer_fold#(
);
// Matching window latency with pixel data latency
wire [3:0] addr_dly = ADDR_DLY;
// Pixel-buffer control alignment ("Matching window latency with pixel data latency").
// The bundle {pre_page, run_r[3], pix_a_r} becomes the outputs {pix_page, pix_re, pix_addr}.
// ADDR_DLY != 0 : the bundle goes through the variable delay line dly_var, programmed
// with ADDR_DLY - 1.
// ADDR_DLY == 0 : the delay line is not instantiated and the signals are wired through.
// NOTE(review): presumably dly_var delays by (dly + 1) clocks, which would explain both the
// "- 1" and the need for a separate zero-delay bypass - confirm against dly_var.
generate
if (ADDR_DLY !=0) begin
wire [3:0] addr_dly = ADDR_DLY - 1; // iverilog problem mitigation
dly_var #(
.WIDTH(11), // 1 (pre_page) + 1 (run_r[3]) + 9 (pix_a_r). NOTE(review): hard-coded, only correct for PIX_ADDR_WIDTH == 9; consider PIX_ADDR_WIDTH + 2
.DLY_WIDTH(4)
) dly_pixel_addr_i (
.clk (clk), // input
.rst (rst), // input
// .dly (4'h2), // input[3:0] Delay for external memory latency = 2, reduce for higher
.dly (addr_dly), // input[3:0] ADDR_DLY - 1; the parent chooses ADDR_DLY to compensate for external memory latency (smaller for higher latency)
.din ({pre_page, run_r[3], pix_a_r}), // input[10:0]: page copy strobe, read enable, pixel address
.dout ({pix_page, pix_re, pix_addr}) // output[10:0]: the same three fields, delayed
);
end else begin
// Zero extra delay requested: connect the sources directly to the outputs
assign pix_page = pre_page;
assign pix_re = run_r[3];
assign pix_addr = pix_a_r;
end
endgenerate
// Latency = 6
mclt_wnd_mul #(
......@@ -238,13 +210,10 @@ module mclt_bayer_fold#(
) dly_var_first_i (
.clk (clk), // input
.rst (rst), // input
// .dly (4'h9), // input[3:0]
.dly (4'h8), // input[3:0]
.din (run_r[0] && (in_cntr[0] == 0)), // input[0:0]
.dout (var_first_d) // output[0:0]
);
//
endmodule
......@@ -508,7 +508,9 @@ module mclt_test_03 ();
);
localparam PIX_ADDR_WIDTH = 9;
localparam ADDR_DLY = 2;
// localparam ADDR_DLY = 2;
localparam EXT_PIX_LATENCY = 2; // external pixel buffer a->d latency (may increase to 4 for gamma)
reg [1:0] TILE_SIZE = 3; // 22;
reg INV_CHECKER = 0;
reg [7:0] TOP_LEFT = 69; // center
......@@ -540,6 +542,7 @@ module mclt_test_03 ();
mclt16x16_bayer #(
.SHIFT_WIDTH (SHIFT_WIDTH),
.PIX_ADDR_WIDTH (PIX_ADDR_WIDTH),
.EXT_PIX_LATENCY (EXT_PIX_LATENCY), // 2), // external pixel buffer a->d latency (may increase to 4 for gamma)
.COORD_WIDTH (COORD_WIDTH),
.PIXEL_WIDTH (PIXEL_WIDTH),
.WND_WIDTH (WND_WIDTH),
......@@ -552,7 +555,7 @@ module mclt_test_03 ();
.DSP_A_WIDTH (DSP_A_WIDTH),
.DSP_P_WIDTH (DSP_P_WIDTH),
.DEAD_CYCLES (DEAD_CYCLES)
) mclt_bayer_fold_i (
) mclt16x16_bayer_i (
.clk (CLK), // input
.rst (RST), // input
.start (start), // input
......@@ -565,7 +568,16 @@ module mclt_test_03 ();
.pix_addr (PIX_ADDR9), // output[8:0]
.pix_re (PIX_RE), // output
.pix_page (PIX_PAGE), // output
.pix_d (PIX_D) // input[15:0]
.pix_d (PIX_D), // input[15:0]
.pre_busy (), // output
.pre_last_in (), // output
.pre_first_out (), // output
.pre_last_out (), // output
.out_addr (), // output[7:0]
.dv (), // output
.dout0 (), // output[24:0] signed
.dout1 () // output[24:0] signed
);
......
This diff is collapsed.
......@@ -43,6 +43,9 @@
`define DEBUG_HISTOGRAMS
// TODO: Later compare instantiated vs. inferred DSP48E1
`define INSTANTIATE_DSP48E1 // not yet tested/debugged otherwise
`define DSP_ACCUM_FOLD 1 // for MCLT: project-wide switch tested with `ifdef DSP_ACCUM_FOLD in the MCLT fold modules
// https://forums.xilinx.com/t5/Embedded-Processor-System-Design/AXI4-Bursts-4KB-Address-Boundary-Limitation/td-p/216413
// Interconnect does not have 4K limit, and compressed data can only go to interconnect (memory), so it is OK to violate AXI specs here
`define AXI_4K_LIMIT_DISABLE // Current x393 code (only simulation modules) does not have it implemented, defining it causes mismatch synth/sim
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment