Commit 530030f6 authored by Andrey Filippov's avatar Andrey Filippov

Switched to new implementation of 8x8 DCT, generated documentation

parent 0e866d77
......@@ -23,7 +23,7 @@ py393/dbg*
debug/*
html/*
man/*
x393_docs/*
includes/x393_cur_params_sim.vh
includes/x393_cur_params_target_*.vh
py393/exp_gpio.py
......
......@@ -40,7 +40,14 @@
*/
`timescale 1ns/1ps
module cmprs_macroblock_buf_iface (
module cmprs_macroblock_buf_iface #(
`ifdef USE_OLD_DCT
parameter DCT_PIPELINE_PAUSE = 0 // No need to delay
`else
parameter DCT_PIPELINE_PAUSE = 48 // TODO: find really required value (minimal), adjust counter bits (now 6)
// 48 seems to be OK (may be less)
`endif
)(
// input rst,
input xclk, // global clock input, compressor single clock rate
......@@ -98,6 +105,7 @@ module cmprs_macroblock_buf_iface (
wire frame_pre_start_w; // start sequence for a new frame
reg frame_pre_start_r;
reg [ 8:0] mb_pre_start; // 1-hot macroblock pre-start calculations - TODO: adjust width
reg mb_pre_start4_first; // first cycle after mb_pre_start[3]
wire [ 2:0] buf_diff; // difference between page needed and next valid - should be negative to have it ready
wire buf_ready_w; // External memory buffer has all the pages needed
......@@ -117,6 +125,8 @@ module cmprs_macroblock_buf_iface (
reg frame_pre_run;
reg [1:0] frame_may_start;
reg [5:0] dct_pipeline_delay_cntr;
`ifdef DEBUG_RING
assign dbg_add_invalid = add_invalid;
assign dbg_mb_release_buf = mb_release_buf;
......@@ -180,9 +190,17 @@ module cmprs_macroblock_buf_iface (
// calculate before starting each macroblock (will wait if buffer is not ready) (TODO: align mb_pre_start[0] to mb_pre_end[2] - same)
//mb_pre_start_w
if (!frame_en_r) mb_pre_start <= 0;
if (mb_pre_start_w) mb_pre_start <= 1;
else if (!mb_pre_start[4] || buf_ready_w) mb_pre_start <= mb_pre_start << 1;
// TODO: Here enforce a minimal pause (if DCT_PIPELINE_PAUSE is not zero) for the DCT pipeline to recover:
// will wait for buf_ready_w, but not less than DCT_PIPELINE_PAUSE cycles (or will not wait at all)
mb_pre_start4_first <=mb_pre_start[3];
if (xrst) dct_pipeline_delay_cntr <= 0;
else if (mb_pre_start4_first && !buf_ready_w) dct_pipeline_delay_cntr <= DCT_PIPELINE_PAUSE -1;
else if (|dct_pipeline_delay_cntr) dct_pipeline_delay_cntr <= dct_pipeline_delay_cntr -1;
if (!frame_en_r) mb_pre_start <= 0;
if (mb_pre_start_w) mb_pre_start <= 1;
else if (!mb_pre_start[4] || (buf_ready_w && !(|dct_pipeline_delay_cntr))) mb_pre_start <= mb_pre_start << 1;
if (mb_pre_start[1]) mbl_x_r[6:3] <= mb_first_in_row? {2'b0,left_marg[4:3]} : mbl_x_next_r[6:3];
if (mb_pre_start[2]) mbl_x_last_r[7:3] <= {1'b0,mbl_x_r[6:3]} + {2'b0,mb_w_m1[5:3]};
......
......@@ -965,39 +965,10 @@ module jp_channel#(
if (dct_last_in) first_block_dct <= first_block_color_after;
end
`ifdef USE_OLD_XDCT393
xdct393 xdct393_i (
.clk (xclk), // input
.en (frame_en), // input if zero will reset transpose memory page numbers
.start (dct_start), // input single-cycle start pulse that goes with the first pixel data. Other 63 should follow
.xin (yc_nodc), // input[9:0]
.last_in (dct_last_in), // output reg output high during input of the last of 64 pixels in a 8x8 block //
.pre_first_out (dct_pre_first_out), // outpu 1 cycle ahead of the first output in a 64 block
/// .dv (dct_dv), // output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
.dv (), // not used: output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
.d_out (dct_out) // output[12:0]
);
`else
xdct393r xdct393_i (
.clk (xclk), // input
.en (frame_en), // input if zero will reset transpose memory page numbers
.start (dct_start), // input single-cycle start pulse that goes with the first pixel data. Other 63 should follow
.xin (yc_nodc), // input[9:0]
.last_in (dct_last_in), // output reg output high during input of the last of 64 pixels in a 8x8 block //
.pre_first_out (dct_pre_first_out), // outpu 1 cycle ahead of the first output in a 64 block
/// .dv (dct_dv), // output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
.dv (), // not used: output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
.d_out (dct_out) // output[12:0]
);
/* New DCT, now in passive mode */
// TODO: enforce minimal pause (when not butted together)
wire dct_last_in_debug;
wire dct_pre_first_out_debug;
wire dct_dv_debug;
wire [12:0] dct_dout_debug;
// 8x8 DCT implementing Chen algorithm and 2 passes
// Each pass (1d) uses 5 DSP48E1 modules (2 multipliers and 3 SIMD (2x24) adders/subtracters).
// Needs a small (<48, but did not calculate yet) pause between blocks if they do not come
// immediately after each other. This pause is needed to restart the pipeline
dct2d8x8_chen #(
.INPUT_WIDTH (10),
......@@ -1005,27 +976,23 @@ module jp_channel#(
.STAGE1_SAFE_BITS (3),
.STAGE2_SAFE_BITS (3),
.TRANSPOSE_WIDTH (16),
.TRIM_STAGE_1 (0),
.TRIM_STAGE_2 (2),
.TRIM_STAGE_1 (1),
.TRIM_STAGE_2 (0),
.DSP_WIDTH (24),
.DSP_OUT_WIDTH (24),
.DSP_B_WIDTH (18),
.DSP_A_WIDTH (25),
.DSP_P_WIDTH (48),
.DSP_M_WIDTH (43)
.DSP_P_WIDTH (48)
) dct2d8x8_chen_i (
.clk (xclk), // input
.rst (!frame_en), // input
.start (dct_start), // input
.xin (yc_nodc), // input[9:0] signed
.last_in (dct_last_in_debug), // output reg
.pre_first_out (dct_pre_first_out_debug), // output
.dv (dct_dv_debug), // output
.d_out (dct_dout_debug) // output[12:0] signed
.clk (xclk), // input
.rst (!frame_en), // input
.start (dct_start), // input
.xin (yc_nodc), // input[9:0] signed
.last_in (dct_last_in), // output reg
.pre_first_out (dct_pre_first_out), // output
.dv (), // output
.d_out (dct_out) // output[12:0] signed
);
`endif
wire quant_start;
dly_16 #(.WIDTH(1)) i_quant_start (.clk(xclk),.rst(1'b0), .dly(4'd0), .din(dct_pre_first_out), .dout(quant_start)); // dly=0+1
......
This diff is collapsed.
/*******************************************************************************
/*!
* <b>Module:</b>dct1d_chen
* @file dct1d_chen.v
* @date:2016-06-05
* @author: Andrey Filippov
* @date 2016-06-05
* @author Andrey Filippov
*
* @brief: 1d 8-point DCT based on Chen algorithm
* @brief 1d 8-point DCT based on Chen algorithm
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,17 +35,19 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dct1d_chen#(
parameter WIDTH = 24,
parameter OUT_WIDTH = 24,
parameter OUT_WIDTH = 16,
parameter B_WIDTH = 18,
parameter A_WIDTH = 25,
parameter P_WIDTH = 48,
parameter M_WIDTH = 43, // actual multiplier width (== (A_WIDTH +B_WIDTH)
// parameter M_WIDTH = 43, // actual multiplier width (== (A_WIDTH +B_WIDTH)
parameter ROUND_OUT = 8, // cut these number of LSBs on the output, round result (in addition to COSINE_SHIFT)
parameter COSINE_SHIFT= 17,
parameter COS_1_16 = 128553, // (1<<17) * cos(1*pi/16)
parameter COS_2_16 = 121095, // (1<<17) * cos(2*pi/16)
parameter COS_3_16 = 108982, // (1<<17) * cos(3*pi/16)
......@@ -59,11 +61,13 @@ module dct1d_chen#(
input en,
input [2 * WIDTH -1:0] d10_32_76_54, // Concatenated input data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
input start, // {x[1],x[0]} available next after start, {x[3],x[2]} - second next, then {x[7],x[6]} and {x[5],x[4]}
output [WIDTH -1:0] dout,
output [OUT_WIDTH -1:0] dout,
output reg pre2_start_out, // 2 clock cycle before F4 output, full dout sequence
// start_out-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
output reg en_out // valid at the same time slot as pre2_start_out (goes active with pre2_start_out)
);
localparam TOTAL_RSHIFT= COSINE_SHIFT + ROUND_OUT;
localparam BEFORE_SAT_WIDTH = P_WIDTH - TOTAL_RSHIFT;
reg signed [B_WIDTH-1:0] dsp_ma_bin;
wire dsp_ma_ceb1_1; // load b1 register
wire dsp_ma_ceb2_1; // load b2 register
......@@ -94,6 +98,7 @@ module dct1d_chen#(
wire dsp_ma_neg_m_2; // 1 - negate multiplier result
wire dsp_ma_accum_2; // 0 - use multiplier result, 1 add to accumulator
wire signed [P_WIDTH-1:0] dsp_ma_p_2;
wire signed [P_WIDTH-1:0] dsp_ma_p_mux;
// Multiplier A/D inputs before shift
wire signed [WIDTH-1:0] dsp_ma_ain24_1;
......@@ -142,10 +147,25 @@ module dct1d_chen#(
reg [7:0] phase;
reg [2:0] phase_cnt;
reg [OUT_WIDTH -1:0] dout_r;
wire [OUT_WIDTH -1:0] dout1_w;
wire [OUT_WIDTH -1:0] dout2_w;
// wire [OUT_WIDTH -1:0] dout1_w;
// wire [OUT_WIDTH -1:0] dout2_w;
wire dout_round_c;
wire[BEFORE_SAT_WIDTH -1:0] dout_round_w; // after rounding, before (optional) saturation
reg [BEFORE_SAT_WIDTH -1:0] dout_round_r; // after rounding, before (optional) saturation
wire [OUT_WIDTH -1:0] dout_sat_w;
wire[BEFORE_SAT_WIDTH -1:0] dout_round; // after rounding, before (optional) saturation
reg [2:0] per_type; // idle/last:0, first cycle - 1, 2-nd - 2, other - 3,... ~en->6 ->7 -> 0 (to generate pre2_start_out)
// Temporarily adding 1 extra latency cycle for rounding/saturation. TODO: Remove when moved to DSP itself
reg pre3_start_out; // 3 clock cycles before F4 output (registered once more to produce pre2_start_out), full dout sequence
// start_out-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
reg pre_en_out; // valid at the same time slot as pre3_start_out (goes active with pre3_start_out); en_out is this signal delayed by one clock
// .ain ({simd_a1,simd_a0}), // input[47:0]
// .bin ({simd_b1,simd_b0}), // input[47:0]
// dsp_addsub_simd1_i input connections
......@@ -233,7 +253,7 @@ module dct1d_chen#(
assign dsp_ma_ced_2 = phase[1] | phase[6];
assign dsp_ma_sela_2 = phase[1] | phase[6];
assign dsp_ma_seld_2 = phase[0] | phase[2] | phase[5] | phase[7];
assign dsp_ma_neg_m_2 = phase[6];
assign dsp_ma_neg_m_2 = phase[1] | phase[6];
assign dsp_ma_accum_2 = phase[0] | phase[2] | phase[4] | phase[6];
// dsp_ma2_i data input connections
assign dsp_ma_ain24_2 = simd_p5;
......@@ -255,10 +275,37 @@ module dct1d_chen#(
// assign dout1_w = dsp_ma_p_1[M_WIDTH -: WIDTH]; // adding one bit for adder (two MPY outputs are added)
// assign dout2_w = dsp_ma_p_2[M_WIDTH -: WIDTH]; // adding one bit for adder (two MPY outputs are added)
assign dout1_w = dsp_ma_p_1[COSINE_SHIFT +: WIDTH]; // adding one bit for adder (two MPY outputs are added)
assign dout2_w = dsp_ma_p_2[COSINE_SHIFT +: WIDTH]; // adding one bit for adder (two MPY outputs are added)
assign dsp_ma_p_mux = phase_cnt[0] ? dsp_ma_p_1 : dsp_ma_p_2;
// assign dout1_w = dsp_ma_p_1[COSINE_SHIFT +: OUT_WIDTH]; // adding one bit for adder (two MPY outputs are added)
// assign dout2_w = dsp_ma_p_2[COSINE_SHIFT +: OUT_WIDTH]; // adding one bit for adder (two MPY outputs are added)
assign dout_round_c = dsp_ma_p_mux[TOTAL_RSHIFT-1];
assign dout_round_w = dsp_ma_p_mux[TOTAL_RSHIFT +: BEFORE_SAT_WIDTH] + dout_round_c;
// Saturation (only if BEFORE_SAT_WIDTH > OUT_WIDTH)
localparam TRIM_MSB = BEFORE_SAT_WIDTH - OUT_WIDTH;
generate
if (TRIM_MSB < 0) begin // should never happen
assign dout_sat_w = { {(-TRIM_MSB){dout_round[BEFORE_SAT_WIDTH-1]}},dout_round };
end else if (TRIM_MSB == 0) begin
assign dout_sat_w = dout_round[0 +: OUT_WIDTH];
end else begin //! saturate. TODO: Maybe (and also symmetric rounding) can be done in DSP itself using masks?
assign dout_sat_w = (dout_round[BEFORE_SAT_WIDTH-1 -: TRIM_MSB] == {TRIM_MSB{dout_round[BEFORE_SAT_WIDTH-1]}})?
dout_round[0 +: OUT_WIDTH]:
{dout_round[BEFORE_SAT_WIDTH-1], {OUT_WIDTH-1{~dout_round[BEFORE_SAT_WIDTH-1]}}};
end
endgenerate
// to possibly remove registers with generate
assign dout_round= dout_round_r;
//BEFORE_SAT_WIDTH
// wire dout_round_c;
// wire [OUT_WIDTH -1:0] dout_round_w;
//ROUND_OUT
//phase_cnt[0] ? dout1_w : dout2_w;
assign dout = dout_r;
always @ (posedge clk) begin
......@@ -284,16 +331,24 @@ module dct1d_chen#(
3'h6: dsp_ma_bin <= COS_4_16;
3'h7: dsp_ma_bin <= COS_6_16;
endcase
dout_r <= phase_cnt[0] ? dout1_w : dout2_w;
// dout_r <= phase_cnt[0] ? dout1_w : dout2_w;
dout_round_r <= dout_round_w;
dout_r <= dout_sat_w;
if (rst) pre2_start_out <= 0;
else pre2_start_out <= (per_type == 2) && phase[3];
if (rst) pre3_start_out <= 0;
else pre3_start_out <= (per_type == 2) && phase[3];
pre2_start_out <=pre3_start_out;
if (rst || !(en || (|phase))) en_out <= 0;
if (rst || !(en || (|phase))) pre_en_out <= 0;
else if (phase[3]) begin
if (per_type == 2) en_out <= 1;
else if (per_type[2]) en_out <= 0;
end
if (per_type == 2) pre_en_out <= 1;
else if (per_type[2]) pre_en_out <= 0;
end
en_out <= pre_en_out;
end
dsp_addsub_simd #(
......
/*******************************************************************************
/*!
* <b>Module:</b>dct1d_chen_reorder_in
* @file dct1d_chen_reorder_in.v
* @date:2016-06-08
* @author: Andrey Filippov
* @date 2016-06-08
* @author Andrey Filippov
*
* @brief: Reorder scan-line pixel stream for dct1d_chen module
* @brief Reorder scan-line pixel stream for dct1d_chen module
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dct1d_chen_reorder_in#(
......
/*******************************************************************************
/*!
* <b>Module:</b>dct1d_chen_reorder_out
* @file dct1d_chen_reorder_out.v
* @date:2016-06-08
* @author: Andrey Filippov
* @date 2016-06-08
* @author Andrey Filippov
*
* @brief: Reorder data from dct1d_chen output to natural sequence
* @brief Reorder data from dct1d_chen output to natural sequence
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dct1d_chen_reorder_out#(
......@@ -62,6 +62,7 @@ module dct1d_chen_reorder_out#(
reg [2:0] per_type; // idle/last:0, first cycle - 1, 2-nd - 2, other - 3,... ~en->6 ->7 -> 0 (to generate pre2_start_out)
reg start_out_r;
reg en_out_r;
wire stop_out; // qualify with en
assign dout = dout_r;
assign start_out = start_out_r;
assign en_out = en_out_r;
......@@ -98,16 +99,30 @@ module dct1d_chen_reorder_out#(
if ((per_type == 2) && (cntr_in == 1)) raddr <= {~cntr_in[3], 3'b0};
else if ((raddr[2:0] != 0) || (per_type !=0)) raddr <= raddr + 1;
dout_r <= reord_buf_ram[raddr];
if (en_out_r) dout_r <= reord_buf_ram[raddr];
start_out_r <= (per_type == 2) && (cntr_in == 1);
if (rst ||(per_type == 0) ) en_out_r <= 0;
else if (cntr_in == 1) en_out_r <= (per_type == 2) || !per_type[2];
if (rst) dv <= 0;
else if (start_out_r) dv <= 1;
else if ((raddr[2:0] == 0) && !en_out_r) dv <= 0;
if (rst ||(per_type == 0) ) en_out_r <= 0;
// else if (cntr_in == 1) en_out_r <= (per_type == 2) || !per_type[2];
else if ((cntr_in == 1) && (per_type == 2)) en_out_r <= 1;
else if (stop_out && !en) en_out_r <= 0;
//stop_out
dv <= en_out_r;
// if (rst) dv <= 0;
// else if (start_out_r) dv <= 1;
// else if ((raddr[2:0] == 0) && !en_out_r) dv <= 0;
end
dly01_16 dly01_16_i (
.clk (clk), // input
.rst (rst), // input
.dly (4'd8), // input[3:0]
.din ((&cntr_in[2:0]) && !en), // input
.dout (stop_out) // output
);
endmodule
/*******************************************************************************
/*!
* <b>Module:</b>dct2d8x8_chen
* @file dct2d8x8_chen.v
* @date:2016-06-10
* @author: Andrey Filippov
* @date 2016-06-10
* @author Andrey Filippov
*
* @brief: 2-d DCT implementation of Chen algorithm
* @brief 2-d DCT implementation of Chen algorithm
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dct2d8x8_chen#(
......@@ -45,13 +45,13 @@ module dct2d8x8_chen#(
parameter STAGE2_SAFE_BITS = 3, // leave this number of extra bits on DCT1D input to prevent output saturation
parameter TRANSPOSE_WIDTH = 16, // transpose memory width
parameter TRIM_STAGE_1 = 1, // Trim these MSBs from the stage1 results (1 - matches old DCT)
parameter TRIM_STAGE_2 = 2, // Trim these MSBs from the stage2 results TODO: put real value
parameter TRIM_STAGE_2 = 0, // Trim these MSBs from the stage2 results
parameter DSP_WIDTH = 24,
parameter DSP_OUT_WIDTH = 24,
// parameter DSP_OUT_WIDTH = 24,
parameter DSP_B_WIDTH = 18,
parameter DSP_A_WIDTH = 25,
parameter DSP_P_WIDTH = 48,
parameter DSP_M_WIDTH = 43 // actual multiplier width (== (A_WIDTH +B_WIDTH)
parameter DSP_P_WIDTH = 48
// parameter DSP_M_WIDTH = 43 // actual multiplier width (== (A_WIDTH +B_WIDTH)
) (
input clk, /// system clock, posedge
input rst, // sync reset
......@@ -68,6 +68,8 @@ module dct2d8x8_chen#(
localparam REPLICATE_IN_STAGE2 = STAGE2_SAFE_BITS;
localparam PAD_IN_STAGE2 = DSP_WIDTH - TRANSPOSE_WIDTH - STAGE2_SAFE_BITS ;
localparam ROUND_STAGE1 = DSP_WIDTH - TRANSPOSE_WIDTH - TRIM_STAGE_1;
localparam ROUND_STAGE2 = DSP_WIDTH - OUTPUT_WIDTH - TRIM_STAGE_2;
reg signed [INPUT_WIDTH-1:0] xin_r;
......@@ -82,7 +84,7 @@ module dct2d8x8_chen#(
wire signed [DSP_WIDTH-1:0] dct1in_pad_h;
wire signed [DSP_WIDTH-1:0] dct1in_pad_l;
wire signed [DSP_OUT_WIDTH-1:0] dct1_out;
wire signed [TRANSPOSE_WIDTH-1:0] dct1_out;
wire stage1_pre2_start_out;
// wire stage1_pre2_en_out;
......@@ -94,20 +96,43 @@ module dct2d8x8_chen#(
wire signed [DSP_WIDTH-1:0] dct2in_pad_h;
wire signed [DSP_WIDTH-1:0] dct2in_pad_l;
wire signed [DSP_OUT_WIDTH-1:0] dct2_out;
wire signed [OUTPUT_WIDTH-1:0] dct2_out;
wire stage2_pre2_start_out;
wire stage2_pre2_en_out;
wire signed [OUTPUT_WIDTH-1:0] dct2_trimmed;
// wire signed [OUTPUT_WIDTH-1:0] dct2_trimmed;
assign dct1in_pad_h = {{REPLICATE_IN_STAGE1{dct1in_h[INPUT_WIDTH-1]}}, dct1in_h, {PAD_IN_STAGE1{1'b0}}};
assign dct1in_pad_l = {{REPLICATE_IN_STAGE1{dct1in_l[INPUT_WIDTH-1]}}, dct1in_l, {PAD_IN_STAGE1{1'b0}}};
assign transpose_din = dct1_out[DSP_OUT_WIDTH-1-TRIM_STAGE_1 -:TRANSPOSE_WIDTH];
assign transpose_din = dct1_out;
/*
generate
if (TRIM_STAGE_1 == 0) begin
assign transpose_din = dct1_out[DSP_OUT_WIDTH-1 -:TRANSPOSE_WIDTH];
end else begin //! saturate. TODO: Maybe (and also symmetric rounding) can be done in DSP itself using masks?
assign transpose_din = (dct1_out[DSP_OUT_WIDTH-1 -: TRIM_STAGE_1] == {TRIM_STAGE_1{dct1_out[DSP_OUT_WIDTH-1]}})?
dct1_out[DSP_OUT_WIDTH-1-TRIM_STAGE_1 -: TRANSPOSE_WIDTH]:
{dct1_out[DSP_OUT_WIDTH-1], {TRANSPOSE_WIDTH-1{~dct1_out[DSP_OUT_WIDTH-1]}}};
end
endgenerate
*/
assign dct2in_pad_h = {{REPLICATE_IN_STAGE2{transpose_douth[TRANSPOSE_WIDTH-1]}}, transpose_douth, {PAD_IN_STAGE2{1'b0}}};
assign dct2in_pad_l = {{REPLICATE_IN_STAGE2{transpose_doutl[TRANSPOSE_WIDTH-1]}}, transpose_doutl, {PAD_IN_STAGE2{1'b0}}};
assign dct2_trimmed = dct2_out[DSP_OUT_WIDTH-1-TRIM_STAGE_2 -:OUTPUT_WIDTH];
// assign dct2_trimmed = dct2_out;
/*
generate
if (TRIM_STAGE_2 == 0) begin
assign dct2_trimmed = dct2_out[DSP_OUT_WIDTH-1 -: OUTPUT_WIDTH];
end else begin //! saturate. Maybe (and also symmetric rounding) can be done in DSP itself using masks?
assign dct2_trimmed = (dct2_out[DSP_OUT_WIDTH-1 -: TRIM_STAGE_2] == {TRIM_STAGE_2{dct2_out[DSP_OUT_WIDTH-1]}})?
dct2_out[DSP_OUT_WIDTH-1-TRIM_STAGE_2 -:OUTPUT_WIDTH]:
{dct2_out[DSP_OUT_WIDTH-1], {OUTPUT_WIDTH-1{~dct2_out[DSP_OUT_WIDTH-1]}}};
end
endgenerate
*/
always @(posedge clk) begin
start_in_r <= start;
......@@ -141,11 +166,11 @@ module dct2d8x8_chen#(
wire dbg_stage1_pre2_en_out;
dct1d_chen #(
.WIDTH (DSP_WIDTH),
.OUT_WIDTH (DSP_OUT_WIDTH),
.OUT_WIDTH (TRANSPOSE_WIDTH), // DSP_OUT_WIDTH),
.B_WIDTH (DSP_B_WIDTH),
.A_WIDTH (DSP_A_WIDTH),
.P_WIDTH (DSP_P_WIDTH),
.M_WIDTH (DSP_M_WIDTH)
.ROUND_OUT (ROUND_STAGE1) // cut these number of LSBs on the output, round result (in addition to COSINE_SHIFT)
) dct1d_chen_stage1_i (
.clk (clk), // input
.rst (rst), // input
......@@ -170,12 +195,12 @@ module dct2d8x8_chen#(
);
dct1d_chen #(
.WIDTH(DSP_WIDTH),
.OUT_WIDTH(DSP_OUT_WIDTH),
.B_WIDTH(DSP_B_WIDTH),
.A_WIDTH(DSP_A_WIDTH),
.P_WIDTH(DSP_P_WIDTH),
.M_WIDTH(DSP_M_WIDTH)
.WIDTH (DSP_WIDTH),
.OUT_WIDTH (OUTPUT_WIDTH),
.B_WIDTH (DSP_B_WIDTH),
.A_WIDTH (DSP_A_WIDTH),
.P_WIDTH (DSP_P_WIDTH),
.ROUND_OUT (ROUND_STAGE2) // cut these number of LSBs on the output, round result (in addition to COSINE_SHIFT)
) dct1d_chen_stage2_i (
.clk (clk), // input
.rst (rst), // input
......@@ -193,7 +218,7 @@ module dct2d8x8_chen#(
.clk (clk), // input
.rst (rst), // input
.en (stage2_pre2_en_out), // input
.din (dct2_trimmed), // input[23:0]
.din (dct2_out), // input[23:0]
.pre2_start (stage2_pre2_start_out), // input
.dout (d_out), // output[23:0]
.start_out (pre_first_out), // output reg
......@@ -202,13 +227,16 @@ module dct2d8x8_chen#(
);
// Just for debugging/comparing with old 1-d DCT:
wire [DSP_WIDTH-1:0] dbg_d_out;
`ifdef SIMULATION // no sense to synthesize it
`ifdef DEBUG_DCT1D
wire [TRANSPOSE_WIDTH-1:0] dbg_d_out;
//wire [15:0] dbg_d_out13=dbg_d_out[7 +: 16] ;
wire dbg_dv;
wire dbg_en_out;
wire dbg_pre_first_out;
dct1d_chen_reorder_out #(
.WIDTH (DSP_WIDTH)
.WIDTH (TRANSPOSE_WIDTH)
) dct1d_chen_reorder_out_dbg_i (
.clk (clk), // input
.rst (rst), // input
......@@ -220,5 +248,7 @@ wire dbg_pre_first_out;
.dv (dbg_dv), // output reg
.en_out (dbg_en_out) // output reg
);
`endif
`endif
endmodule
/*******************************************************************************
/*!
* <b>Module:</b>dct_chen_transpose
* @file dct_chen_transpose.v
* @date:2016-06-09
* @author: Andrey Filippov
* @date 2016-06-09
* @author Andrey Filippov
*
* @brief: Reorder+transpose data between two 1-d DCT passes
* @brief Reorder+transpose data between two 1-d DCT passes
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dct_chen_transpose#(
......@@ -70,6 +70,7 @@ module dct_chen_transpose#(
reg [2*WIDTH-1:0] ram_reg2;
wire pre_rstart_w = wcntr[5:0] == 61;
reg [1:0] rstop_r;
reg first_after_pause; // first block after pause - do not write 2 items to the "past"
assign wpage = wcntr[6] ^ wrow_mod[3]; // previous page for row 0, col 1 & 3
assign wrow_mod = {1'b0, wrow} - wcol13;
......@@ -93,7 +94,7 @@ module dct_chen_transpose#(
else if (pre_we_r) wcntr <= wcntr + 1; // including page, should be before 'if (pre2_start)'
else if (pre2_start) wcntr <= {wcntr[6], 6'b0}; // if it happens during pre_we_r it is ignored, otherwise (after a pause) it zeroes the in-page address
we_r <= pre_we_r;
we_r <= pre_we_r && (!first_after_pause || !wcol13 || (|wrow)); // do not write first after pause to the "past"
if (we_r) transpose_ram[waddr] <= din;
......@@ -118,6 +119,11 @@ module dct_chen_transpose#(
if (rst) en_out <= 0;
else if (rcntr == 1) en_out <= 1;
else if (rstop_r[1]) en_out <= 0;
if (rst) first_after_pause <= 0;
else if (pre2_start && !we_r) first_after_pause <= 1;
else if (&wcntr[5:0]) first_after_pause <= 0;
end
dly01_16 dly01_16_stop_i (
......
/*******************************************************************************
/*!
* <b>Module:</b>dsp_addsub_simd
* @file dsp_addsub_simd.v
* @date:2016-06-05
* @author: Andrey Filippov
* @date 2016-06-05
* @author Andrey Filippov
*
* @brief: SIMD adder/subtracter
* @brief SIMD adder/subtracter
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dsp_addsub_simd#(
......@@ -70,7 +70,7 @@ module dsp_addsub_simd#(
DSP48E1 #(
.ACASCREG (1),
.ADREG (0), // (1),
.ADREG (1),
.ALUMODEREG (1),
.AREG (1), // (1)
.AUTORESET_PATDET ("NO_RESET"),
......@@ -81,7 +81,7 @@ module dsp_addsub_simd#(
.CARRYINREG (1),
.CARRYINSELREG (1),
.CREG (1), //(1),
.DREG (0), //(1),
.DREG (1),
.INMODEREG (1),
.IS_ALUMODE_INVERTED (4'b0),
.IS_CARRYIN_INVERTED (1'b0),
......@@ -131,7 +131,7 @@ module dsp_addsub_simd#(
.CECTRL (1'b1), // input
.CED (1'b0), // input
.CEINMODE (1'b1), // input
.CEM (1'b1), // input
.CEM (1'b0), // input
.CEP (cep), // input
.CLK (clk), // input
.D (25'h1ffffff),// input[24:0]
......@@ -145,9 +145,9 @@ module dsp_addsub_simd#(
.RSTB (rst), // input
.RSTC (rst), // input
.RSTCTRL (rst), // input
.RSTD (rst), // input
.RSTD (1'b0), // input
.RSTINMODE (rst), // input
.RSTM (rst), // input
.RSTM (1'b0), // input
.RSTP (rst) // input
);
`else
......
/*******************************************************************************
/*!
* dsp_ma
* @file dsp_ma.v
* @date:2016-06-05
* @author: Andrey Filippov
* @date 2016-06-05
* @author Andrey Filippov
*
* @brief: DSP with multi-input multiplier and accumulator
* @brief DSP with multi-input multiplier and accumulator
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/
`timescale 1ns/1ps
module dsp_ma #(
......
/*******************************************************************************
/*!
* dsp_ma_preadd
* @file dsp_ma_preadd.v
* @date:2016-06-05
* @author: Andrey Filippov
* @date 2016-06-05
* @author Andrey Filippov
*
* @brief: DSP with multi-input multiplier and accumulator with pre-adder
* @brief DSP with multi-input multiplier and accumulator with pre-adder
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
......@@ -35,7 +35,7 @@
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*******************************************************************************/
*/