Commit 2d4c00bf authored by Andrey Filippov

working on DSP-based 8x8 DCT implementing Chen algorithm

parent c4619853
......@@ -991,6 +991,39 @@ module jp_channel#(
.dv (), // not used: output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
.d_out (dct_out) // output[12:0]
);
/* New DCT, now in passive mode */
// The new Chen-algorithm 2-D DCT instance below receives the same clock, start and
// pixel inputs that are named in its port list, but every output is routed only to
// the *_debug wires declared here.
// NOTE(review): "passive" presumably means these outputs are observed in simulation
// only and do not feed the quantizer yet - confirm against the rest of jp_channel.
// TODO: enforce minimal pause (when not butted together)
wire dct_last_in_debug;        // last_in output of the new DCT
wire dct_pre_first_out_debug;  // pre_first_out output of the new DCT
wire dct_dv_debug;             // data valid output of the new DCT
wire [12:0] dct_dout_debug;    // 13-bit DCT output data of the new DCT
dct2d8x8_chen #(
.INPUT_WIDTH (10),             // matches the 10-bit signed xin connection below
.OUTPUT_WIDTH (13),            // matches the 13-bit dct_dout_debug wire
.STAGE1_SAFE_BITS (3),
.STAGE2_SAFE_BITS (3),
.TRANSPOSE_WIDTH (16),
.TRIM_STAGE_1 (0),
.TRIM_STAGE_2 (2),
.DSP_WIDTH (24),
.DSP_OUT_WIDTH (24),
.DSP_B_WIDTH (18),
.DSP_A_WIDTH (25),
.DSP_P_WIDTH (48),
.DSP_M_WIDTH (43)
) dct2d8x8_chen_i (
.clk (xclk), // input
.rst (!frame_en), // input, held in reset whenever frame_en is low
.start (dct_start), // input
.xin (yc_nodc), // input[9:0] signed
.last_in (dct_last_in_debug), // output reg
.pre_first_out (dct_pre_first_out_debug), // output
.dv (dct_dv_debug), // output
.d_out (dct_dout_debug) // output[12:0] signed
);
`endif
wire quant_start;
......
This diff is collapsed.
......@@ -48,22 +48,34 @@ module dct1d_chen_reorder_in#(
input start, // with first pixel
output [2*WIDTH -1:0] dout_10_32_76_54, // Concatenated/reordered output data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
output reg start_out,
output reg en_out // to be sampled when start_out is expected
output en_out // to be sampled when start_out is expected
);
reg last_r;
reg [2:0] cntr_in;
reg [1:0] raddr;
wire restart = !rst && en && (start || last_r);
wire [1:0] we = ((|cntr_in) || en)? {~cntr_in[0]^cntr_in[2],cntr_in[0]^cntr_in[2]}:2'b0;
// wire [1:0] we = ((|cntr_in) || en)? {~cntr_in[0]^cntr_in[2],cntr_in[0]^cntr_in[2]}:2'b0;
wire [1:0] we = ((|cntr_in) || en)? {cntr_in[0]^cntr_in[2], ~cntr_in[0]^cntr_in[2]}:2'b0;
wire [1:0] waddr = {cntr_in[2],cntr_in[2]^cntr_in[1]};
reg [WIDTH-1:0] bufl_ram[0:3];
reg [WIDTH-1:0] bufh_ram[0:3];
reg [2*WIDTH -1:0] dout_10_32_76_54_r;
reg first_period;
reg en_out_r;
reg last_out;
reg re_r;
assign dout_10_32_76_54 = dout_10_32_76_54_r;
assign en_out = en_out_r;
always @(posedge clk) begin
if (rst) last_r <= 0;
else last_r <= &cntr_in;
last_out <= raddr == 2;
if (rst) re_r <= 0;
else if (cntr_in == 5) re_r <= 1;
else if (last_out) re_r <= 0;
if (rst) cntr_in <= 0;
else if (restart || (|cntr_in)) cntr_in <= cntr_in + 1;
......@@ -75,10 +87,18 @@ module dct1d_chen_reorder_in#(
else if (cntr_in == 5) raddr <= 0;
else if (!(&raddr)) raddr <= raddr + 1;
dout_10_32_76_54_r <= {bufh_ram[raddr],bufl_ram[raddr]};
start_out <= (cntr_in == 5);
if (rst) first_period <= 0;
else if (start && en) first_period <= 1;
else if (last_r) first_period <= 0;
if (re_r) dout_10_32_76_54_r <= {bufh_ram[raddr],bufl_ram[raddr]};
en_out <= en || (|cntr_in) || last_r;
start_out <= first_period && (cntr_in == 5);
if (rst) en_out_r <= 0;
else if (cntr_in == 5) en_out_r <= 1;
else if ((raddr == 2) && !en) en_out_r <= 0;
end
endmodule
......@@ -43,33 +43,45 @@ module dct1d_chen_reorder_out#(
)(
input clk,
input rst,
input en, // sampled at timeslot of pre2_start
input [WIDTH -1:0] din, // pre2_start-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
input pre2_start, // Two cycles ahead of F4
output [WIDTH -1:0] dout, // data in natural order: F0-F1-F2-F3-F4-F5-F6-F7
output start_out, // 1 ahead of F0
output reg en_out // to be sampled when start_out is expected
output start_out, // 1 ahead of the first F0
output reg dv, // output data valid
output en_out // to be sampled when start_out is expected
);
reg [WIDTH -1:0] reord_buf_ram[0:15];
reg [WIDTH -1:0] dout_r;
reg [3:0] cntr_in;
wire start_8;
wire start_11;
reg start_12;
wire stop_in;
reg pre_we_r;
reg we_r;
reg [3:0] ina_rom;
wire [3:0] waddr = {ina_rom[3] ^ cntr_in[3], ina_rom[2:0]};
reg [3:0] raddr;
assign dout = dout_r;
assign start_out = start_12;
reg [2:0] per_type; // idle/last:0, first cycle - 1, 2-nd - 2, other - 3,... ~en->6 ->7 -> 0 (to generate pre2_start_out)
reg start_out_r;
reg en_out_r;
assign dout = dout_r;
assign start_out = start_out_r;
assign en_out = en_out_r;
always @(posedge clk) begin
if (rst) we_r <= 0;
else if (pre2_start) we_r <= 1;
else if (stop_in) we_r <= 0;
if (rst) per_type <= 0;
else if (pre2_start) per_type <= 3'h1;
else if (&cntr_in[2:0]) begin
if (!per_type[2] && !en) per_type <= 3'h6;
else if ((per_type != 0) && (per_type != 3)) per_type <= per_type + 1;
end
if (rst) pre_we_r <= 0;
else if (pre2_start) pre_we_r <= 1;
else if ((per_type == 0) || ((cntr_in==3) && per_type[2])) pre_we_r <= 0;
we_r <= pre_we_r;
if (rst) cntr_in <= 0;
else if (pre2_start) cntr_in <= {~cntr_in[3],3'b0};
else if (we_r) cntr_in <= cntr_in + 1;
else if (pre_we_r) cntr_in <= cntr_in + 1;
case (cntr_in[2:0])
3'h0: ina_rom <= {1'b0,3'h4};
3'h1: ina_rom <= {1'b1,3'h1};
......@@ -78,46 +90,24 @@ module dct1d_chen_reorder_out#(
3'h4: ina_rom <= {1'b0,3'h6};
3'h5: ina_rom <= {1'b0,3'h5};
3'h6: ina_rom <= {1'b0,3'h0};
3'h7: ina_rom <= {1'b0,3'h3};
3'h7: ina_rom <= {1'b1,3'h3};
endcase
if (we_r) reord_buf_ram[waddr] <= din;
if (start_11) raddr <= {~cntr_in[3], 3'b0};
else if ((raddr[2:0] != 0) || we_r) raddr <= raddr + 1;
if ((per_type == 2) && (cntr_in == 1)) raddr <= {~cntr_in[3], 3'b0};
else if ((raddr[2:0] != 0) || (per_type !=0)) raddr <= raddr + 1;
dout_r <= reord_buf_ram[raddr];
start_out_r <= (per_type == 2) && (cntr_in == 1);
start_12 <= start_11;
en_out <= start_12 || (raddr[2:0] != 0);
end
dly01_16 start_8__i (
.clk (clk), // input
.rst (rst), // input
.dly (4'h7), // input[3:0]
.din (pre2_start), // input
.dout (start_8) // output
);
dly01_16 start_11__i (
.clk (clk), // input
.rst (rst), // input
.dly (4'h1), // input[3:0]
.din (start_8), // input
.dout (start_11) // output
);
dly01_16 dly01_16_2_i (
.clk (clk), // input
.rst (rst), // input
.dly (4'h4), // input[3:0]
.din (start_8 && !pre2_start), // input
.dout (stop_in) // output
);
if (rst ||(per_type == 0) ) en_out_r <= 0;
else if (cntr_in == 1) en_out_r <= (per_type == 2) || !per_type[2];
if (rst) dv <= 0;
else if (start_out_r) dv <= 1;
else if ((raddr[2:0] == 0) && !en_out_r) dv <= 0;
end
endmodule
This diff is collapsed.
/*******************************************************************************
* <b>Module:</b>dct_chen_transpose
* @file dct_chen_transpose.v
* @date:2016-06-09
* @author: Andrey Filippov
*
* @brief: Reorder+transpose data between two 1-d DCT passes
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
* <b>License:</b>
*
*dct_chen_transpose.v is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dct_chen_transpose.v is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/> .
*
* Additional permission under GNU GPL version 3 section 7:
* If you modify this Program, or any covered work, by linking or combining it
* with independent modules provided by the FPGA vendor only (this permission
* does not extend to any 3-rd party modules, "soft cores" or macros) under
* different license terms solely for the purpose of generating binary "bitstream"
* files and/or simulating the code, the copyright holders of this Program give
* you the right to distribute the covered work without those independent modules
* as long as the source code for them is available from the FPGA vendor free of
* charge, and there is no dependence on any encrypted modules for simulating of
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
`timescale 1ns/1ps
// Reorder + transpose buffer placed between the two 1-D DCT passes.
//
// Write side: single words arrive on din in the interleaved order documented on the
// port; each word is stored into a 2-page x 64-word RAM at the address listed in the
// 'wseq' table at the end of this module. Columns 1 and 3 of every 8-word group
// belong to the previous row (and, for row 0, to the previous page).
// Read side: pairs of words (addresses 2*raddr and 2*raddr+1) are read in the order
// listed in the 'rseq' table, four pairs per 8 clocks, and presented on
// dout_10_32_76_54 after two register stages.
//
// Ports:
//   clk, rst          - clock and synchronous reset
//   din               - input data stream (see order below)
//   pre2_start        - start strobe, two cycles ahead of the first data word
//   dout_10_32_76_54  - paired output words
//   start_out         - single-cycle strobe generated when the read counter equals 1
//   en_out            - set together with start_out, cleared two cycles after the
//                       read counter has parked at its idle (all ones) value
module dct_chen_transpose#(
    parameter WIDTH = 24
)(
    input                     clk,
    input                     rst,
    input        [WIDTH -1:0] din,              // pre2_start-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
    input                     pre2_start,       // Two cycles ahead of F4. Next one should start either at exactly 64 cycles, or >=68 cycles from the previous one
    output     [2*WIDTH -1:0] dout_10_32_76_54, // Concatenated/reordered output data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
                                                // NOTE(review): address bit 0 is row[0]^row[2], so for rows 4..7 the odd/even halves
                                                // are swapped relative to this comment - confirm the intended pair order
    output reg                start_out,
    output reg                en_out            // to be sampled when start_out is expected
);
    reg               [6:0] wcntr;       // write counter, used to calculate write address (2 pages of 64 words), that will be valid next cycle
    wire              [2:0] wrow = wcntr[5:3];
    wire              [2:0] wcol = wcntr[2:0];
    wire                    wpage;
    reg                     wcol13;      // columns 1 and 3 (special): registered, so it is active while wcol is 1 or 3
    wire              [3:0] wrow_mod;    // effective row, including modifier for wpage (bit 3 = borrow into the previous page)
    // Helper for address bits 3:2. Adding the INVERTED wcol[2] reproduces the column
    // codes 4,1,2,7,6,5,0,3 of the wseq table (was "+ wcol[2]", which produced the
    // non-unique sequence 6,4,1,3,4,0,3,7 together with waddr[1] <= wcol[1]).
    // The 1'b0 padding keeps the inversion from being widened to two bits.
    wire              [1:0] wcol01_mod = wcol[1:0] + {1'b0, ~wcol[2]};
    reg               [6:0] waddr;
    wire                    pre2_stop;
    reg         [WIDTH-1:0] transpose_ram[0:127];
    reg                     pre_we_r;
    reg                     we_r;
    reg               [5:0] rcntr = 6'h3f; // read counter, 6'h3f is the idle (parked) state
    reg               [5:0] raddr;       // read address, addresses dual words
    reg                     re_r;
    reg                     regen_r;
    reg       [2*WIDTH-1:0] ram_reg;
    reg       [2*WIDTH-1:0] ram_reg2;
    wire                    pre_rstart_w = wcntr[5:0] == 61; // restart reading when writing reaches word 61 of a page
    reg               [1:0] rstop_r;     // shift register delaying en_out turn-off after rcntr parks

    assign wpage =    wcntr[6] ^ wrow_mod[3]; // previous page for row 0, col 1 & 3
    assign wrow_mod = {1'b0, wrow} - wcol13;
    assign dout_10_32_76_54 = ram_reg2;

    // TODO: prevent writing to previous page after pause!
    always @(posedge clk) begin
        wcol13 <= ~wcol[0] & ~wcol[2];

        // Write address, see wseq table below:
        //   {waddr[5], waddr[4], waddr[0]} encode the (modified) row,
        //   waddr[3:1] encode the frequency index carried by this column,
        //   waddr[6] is the page.
        waddr[0] <= wrow_mod[0] ^ wrow_mod[2];
        waddr[1] <= wcol[0];                          // was wcol[1]: wseq bit 1 alternates with the column LSB
        waddr[2] <= ~wcol01_mod[0] ^ wcol01_mod[1];
        waddr[3] <= ~wcol01_mod[1];
        waddr[4] <= wrow_mod[1] ^ wrow_mod[2];        // was a copy of waddr[0]; row bit 1 never reached the address
        waddr[5] <= wrow_mod[2];
        waddr[6] <= wpage;

        if      (rst)        pre_we_r <= 0;
        else if (pre2_start) pre_we_r <= 1;
        else if (pre2_stop)  pre_we_r <= 0;

        if      (rst)        wcntr <= 0;
        else if (pre_we_r)   wcntr <= wcntr + 1;         // including page, should be before 'if (pre2_start)'
        else if (pre2_start) wcntr <= {wcntr[6], 6'b0};  // if it happens during pre_we_r - will be ignored, otherwise (after pause) will zero the in-page address

        we_r <= pre_we_r;

        if (we_r) transpose_ram[waddr] <= din;

        // Read counter: restarted by the write side, then counts up and parks at 6'h3f.
        // The park test uses a reduction AND: the former "rcntr != ~0" compared a
        // zero-extended 6-bit value with a 32-bit all-ones constant, was therefore
        // always true, and let the counter wrap and re-issue start_out every 64 clocks.
        if      (rst)          rcntr <= 6'h3f;
        else if (pre_rstart_w) rcntr <= 0;
        else if (!(&rcntr))    rcntr <= rcntr + 1;

        re_r <=    ~rcntr[2];   // read during the first four counts of every eight
        regen_r <= re_r;

        if (rcntr == 0) raddr[5] <= wcntr[6]; // page
        raddr[4:0] <= {rcntr[1:0],rcntr[5:3]};

        if (re_r)    ram_reg <=  {transpose_ram[2*raddr+1],transpose_ram[2*raddr]}; // See if it will correctly infer
        if (regen_r) ram_reg2 <= ram_reg;

        if (rst || pre_rstart_w) rstop_r <= 0;
        else if (&rcntr)         rstop_r <= {rstop_r[0], 1'b1};

        start_out <= (rcntr == 1);

        if      (rst)         en_out <= 0;
        else if (rcntr == 1)  en_out <= 1;
        else if (rstop_r[1])  en_out <= 0;
    end

    // pre2_stop: end of the last word of a page (all ones in wcntr[5:0]) with no new
    // pre2_start butted to it, passed through the dly01_16 delay element with dly = 3
    dly01_16 dly01_16_stop_i (
        .clk  (clk),                         // input
        .rst  (rst),                         // input
        .dly  (4'h3),                        // input[3:0]
        .din  (&wcntr[5:0] && !pre2_start),  // input
        .dout (pre2_stop)                    // output
    );
/*
min latency == 60, // adding 1 for read after write in RAM
max latency = 83 (when using a 2-page buffer)
wseq=(0x08, 0x62, 0x04, 0x6e, 0x0c, 0x0a, 0x00, 0x06,
0x09, 0x02, 0x05, 0x0e, 0x0d, 0x0b, 0x01, 0x07,
0x18, 0x03, 0x14, 0x0f, 0x1c, 0x1a, 0x10, 0x16,
0x19, 0x12, 0x15, 0x1e, 0x1d, 0x1b, 0x11, 0x17,
0x39, 0x13, 0x35, 0x1f, 0x3d, 0x3b, 0x31, 0x37,
0x38, 0x33, 0x34, 0x3f, 0x3c, 0x3a, 0x30, 0x36,
0x29, 0x32, 0x25, 0x3e, 0x2d, 0x2b, 0x21, 0x27,
0x28, 0x23, 0x24, 0x2f, 0x2c, 0x2a, 0x20, 0x26)
rseq = (0x00,0x10,0x20,0x30,-1,-1,-1,-1,
0x02,0x12,0x22,0x32,-1,-1,-1,-1,
0x04,0x14,0x24,0x34,-1,-1,-1,-1,
0x06,0x16,0x26,0x36,-1,-1,-1,-1,
0x08,0x18,0x28,0x38,-1,-1,-1,-1,
0x0a,0x1a,0x2a,0x3a,-1,-1,-1,-1,
0x0c,0x1c,0x2c,0x3c,-1,-1,-1,-1,
0x0e,0x1e,0x2e,0x3e,-1,-1,-1,-1)
*/
endmodule
......@@ -125,8 +125,8 @@ module dsp_addsub_reg2_simd#(
.CEB1 (cea1), // input
.CEB2 (cea2), // input
.CEC (ceb), // input
.CECARRYIN (1'b0), // input
.CECTRL (1'b0), // input
.CECARRYIN (1'b1), // input
.CECTRL (1'b1), // input
.CED (1'b0), // input
.CEINMODE (1'b1), // input
.CEM (1'b1), // input
......
......@@ -58,13 +58,15 @@ module dsp_addsub_simd#(
1'b0, // seld,
1'b0, // seld, // ~en_a,
1'b1}; // ~sela};
wire [3:0] alumode = {2'b0, // Z + X + Y + CIN / -Z +( X + Y + CIN) -1
1'b0,
// No CIN in the middle of SIMD words!
// wire [3:0] alumode = {2'b0, // Z + X + Y + CIN / -Z +( X + Y + CIN) -1
wire [3:0] alumode = {2'b0, // Z + X + Y + CIN / Z -( X + Y + CIN)
subtract, // 1'b0,
subtract};
wire [6:0] opmode = {3'b011, // Z = C-input
2'b00, // Y = 0
2'b11}; // X = A:B
wire cryin = subtract;
// wire cryin = subtract;
DSP48E1 #(
.ACASCREG (1),
......@@ -109,24 +111,24 @@ module dsp_addsub_simd#(
.PATTERNDETECT (), // output
.PCOUT (), // output[47:0]
.UNDERFLOW (), // output
.A (ain[47:18]), // input[29:0]
.A (bin[47:18]), // input[29:0]
.ACIN (30'b0), // input[29:0]
.ALUMODE (alumode), // input[3:0]
.B (ain[17:0]), // input[17:0]
.B (bin[17:0]), // input[17:0]
.BCIN (18'b0), // input[17:0]
.C (bin), // input[47:0]
.C (ain), // input[47:0]
.CARRYCASCIN (1'b0), // input
.CARRYIN (cryin), // input
.CARRYIN (1'b0), // cryin), // input
.CARRYINSEL (3'h0), // input[2:0] // later modify?
.CEA1 (cea), // input
.CEA2 (1'b0), // input
.CEA1 (1'b0), // input
.CEA2 (ceb), // input
.CEAD (1'b0), // input
.CEALUMODE (1'b1), // input
.CEB1 (cea), // input
.CEB2 (1'b0), // input
.CEC (ceb), // input
.CECARRYIN (1'b0), // input
.CECTRL (1'b0), // input
.CEB1 (1'b0), // input
.CEB2 (ceb), // input
.CEC (cea), // input
.CECARRYIN (1'b1), // input
.CECTRL (1'b1), // input
.CED (1'b0), // input
.CEINMODE (1'b1), // input
.CEM (1'b1), // input
......
......@@ -78,11 +78,11 @@ module dsp_ma #(
.ACASCREG (1),
.ADREG (0), // (1),
.ALUMODEREG (1),
.AREG (2), // (1)
.AREG (1), // (2), // (1) - means number in series, so "2" always reads the second
.AUTORESET_PATDET ("NO_RESET"),
.A_INPUT ("DIRECT"),
.BCASCREG (1),
.BREG (2), // (1)
.BREG (1), // (2), // (1) - means number in series, so "2" always reads the second
.B_INPUT ("DIRECT"),
.CARRYINREG (1),
.CARRYINSELREG (1),
......@@ -134,7 +134,7 @@ module dsp_ma #(
.CEB2 (ceb2), // input
.CEC (1'b0), // input
.CECARRYIN (1'b0), // input
.CECTRL (1'b0), // input
.CECTRL (1'b1), // input
.CED (ced), // input
.CEINMODE (1'b1), // input
.CEM (1'b1), // input
......
......@@ -58,14 +58,14 @@ module dsp_ma_preadd #(
input sela, // 0 - select a1, 1 - select a2
input en_a, // 1 - enable a input (0 - zero) ~inmode[1]
input en_d, // 1 - enable d input (0 - zero) ~inmode[2]
input sub_d, // 0 - pre-add (A+D), 1 - pre-subtract (A-D)
input sub_a, // 0 - pre-add (D+A), 1 - pre-subtract (D-A)
input neg_m, // 1 - negate multiplier result
input accum, // 0 - use multiplier result, 1 add to accumulator
output signed [P_WIDTH-1:0] pout
);
`ifdef INSTANTIATE_DSP48E1
wire [4:0] inmode = {~selb,
sub_d,
sub_a,
en_d,
~en_a,
~sela};
......@@ -82,11 +82,11 @@ module dsp_ma_preadd #(
.ACASCREG (1),
.ADREG (1),
.ALUMODEREG (1),
.AREG (2), // (1)
.AREG (1), // 2), // (1) - means number in series, so "2" always reads the second
.AUTORESET_PATDET ("NO_RESET"),
.A_INPUT ("DIRECT"),
.BCASCREG (1),
.BREG (2), // (1)
.BREG (1), // (2), // (1) - means number in series, so "2" always reads the second
.B_INPUT ("DIRECT"),
.CARRYINREG (1),
.CARRYINSELREG (1),
......@@ -138,7 +138,7 @@ module dsp_ma_preadd #(
.CEB2 (ceb2), // input
.CEC (1'b0), // input
.CECARRYIN (1'b0), // input
.CECTRL (1'b0), // input
.CECTRL (1'b1), // input
.CED (ced), // input
.CEINMODE (1'b1), // input
.CEM (1'b1), // input
......@@ -179,7 +179,7 @@ module dsp_ma_preadd #(
reg sela_r;
reg en_a_r;
reg en_d_r;
reg sub_d_r;
reg sub_a_r;
reg neg_m_r;
reg accum_r;
wire signed [P_WIDTH-1:0] m_reg_pm;
......@@ -189,7 +189,8 @@ module dsp_ma_preadd #(
assign pout = p_reg;
assign b_wire = selb_r ? b2_reg : b1_reg;
assign a_wire = en_a_r ? (sela_r ? a2_reg : a1_reg) : {A_WIDTH{1'b0}};
assign d_wire = en_d_r ? (sub_d_r ? -d_reg : d_reg) : {A_WIDTH{1'b0}};
// assign d_wire = en_d_r ? (sub_a_r ? -d_reg : d_reg) : {A_WIDTH{1'b0}};
assign d_wire = en_d_r ? d_reg : {A_WIDTH{1'b0}};
assign m_wire = ad_reg * b_wire;
assign m_reg_pm = neg_m_r ? - m_reg : m_reg;
......@@ -212,7 +213,7 @@ module dsp_ma_preadd #(
else if (ced) d_reg <= din;
if (rst) ad_reg <= 0;
else if (cead) ad_reg <= a_wire + d_wire;
else if (cead) ad_reg <= sub_a_r? (d_wire - a_wire): (d_wire + a_wire);
neg_m_r <= neg_m;
accum_r <= accum;
......@@ -221,7 +222,7 @@ module dsp_ma_preadd #(
sela_r <= sela;
en_a_r <= en_a;
en_d_r <= en_d;
sub_d_r <= sub_d;
sub_a_r <= sub_a;
m_reg <= {{P_WIDTH - A_WIDTH - B_WIDTH{1'b0}}, m_wire};
......
......@@ -1404,6 +1404,11 @@ end
// protect from never end
initial begin
// after 1 frame compressed on all channels
// Debugging DCT Chen
#100000;
$finish;
`ifdef HISPI
#135000;
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment