Switched to new implementation of 8x8 DCT, generated documentation

530030f6 · Andrey Filippov · 0e866d77 · 530030f6 · 530030f6 · 530030f6
Commit 530030f6 authored Jun 13, 2016 by Andrey Filippov
17 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -23,7 +23,7 @@ py393/dbg*
 debug/*
 html/*
 man/*
+x393_docs/*
 includes/x393_cur_params_sim.vh
 includes/x393_cur_params_target_*.vh
 py393/exp_gpio.py

--- a/compressor_jp/cmprs_macroblock_buf_iface.v
+++ b/compressor_jp/cmprs_macroblock_buf_iface.v
@@ -40,7 +40,14 @@
 */
 `timescale 1ns/1ps
-module  cmprs_macroblock_buf_iface (
+module  cmprs_macroblock_buf_iface #(
+`ifdef USE_OLD_DCT
+    parameter DCT_PIPELINE_PAUSE = 0 // No need to delay
+`else    
+    parameter DCT_PIPELINE_PAUSE = 48 // TODO: find really required value (minimal), adjust counter bits (now 6)
+                                      // 48 seems to be OK (may be less)
+`endif                                      
+)(
 //    input         rst,
    input         xclk,               // global clock input, compressor single clock rate
@@ -98,6 +105,7 @@ module  cmprs_macroblock_buf_iface (
    wire          frame_pre_start_w; // start sequence for a new frame
    reg           frame_pre_start_r; 
    reg    [ 8:0] mb_pre_start;   // 1-hot macroblock pre start calcualtions - TODO: adjust width
+    reg           mb_pre_start4_first; // first cycle after mb_pre_start[3]    
    wire   [ 2:0] buf_diff;       // difference between page needed and next valid - should be negative to have it ready
    wire          buf_ready_w;    // External memory buffer has all the pages needed
@@ -117,6 +125,8 @@ module  cmprs_macroblock_buf_iface (
    reg           frame_pre_run;
    reg     [1:0] frame_may_start;
+    reg     [5:0] dct_pipeline_delay_cntr;
 `ifdef DEBUG_RING
    assign  dbg_add_invalid = add_invalid;
    assign  dbg_mb_release_buf = mb_release_buf;
@@ -180,9 +190,17 @@ module  cmprs_macroblock_buf_iface (
        // calculate before starting each macroblock (will wait if buffer is not ready) (TODO: align mb_pre_start[0] to mb_pre_end[2] - same)
        //mb_pre_start_w
-        if      (!frame_en_r)                     mb_pre_start <= 0;
+        // TODO: Here enforce minimal pause (if not zero) for the DCT pipeline to recover
-        if      (mb_pre_start_w)                  mb_pre_start <= 1;
+        // will wait for buf_ready_w, but not less than DCT_PIPELINE_PAUSE (or no wait at all)
-        else if (!mb_pre_start[4] || buf_ready_w) mb_pre_start <= mb_pre_start << 1;
+        mb_pre_start4_first <= mb_pre_start[3]; // one-cycle delayed copy of mb_pre_start[3]
+        if      (xrst)                                 dct_pipeline_delay_cntr <= 0;
+        else if (mb_pre_start4_first && !buf_ready_w && (DCT_PIPELINE_PAUSE > 0)) dct_pipeline_delay_cntr <= DCT_PIPELINE_PAUSE -1; // guard: with PAUSE==0, (0-1) would truncate to 6'h3f and stall 63 cycles
+        else if (|dct_pipeline_delay_cntr)             dct_pipeline_delay_cntr <= dct_pipeline_delay_cntr -1; // count down to zero, then hold
+        if      (!frame_en_r)                                                      mb_pre_start <= 0;
+        if      (mb_pre_start_w)                                                   mb_pre_start <= 1;
+        else if (!mb_pre_start[4] || (buf_ready_w && !(|dct_pipeline_delay_cntr))) mb_pre_start <= mb_pre_start << 1; // hold at bit 4 until buffer is ready AND the pause counter has expired
        if (mb_pre_start[1]) mbl_x_r[6:3] <=      mb_first_in_row? {2'b0,left_marg[4:3]} : mbl_x_next_r[6:3];
        if (mb_pre_start[2]) mbl_x_last_r[7:3] <= {1'b0,mbl_x_r[6:3]} + {2'b0,mb_w_m1[5:3]};

--- a/compressor_jp/jp_channel.v
+++ b/compressor_jp/jp_channel.v
@@ -965,39 +965,10 @@ module  jp_channel#(
        if (dct_last_in) first_block_dct   <= first_block_color_after;
    end
+    // 8x8 DCT implementing Chen algorithm and 2 passes
-`ifdef USE_OLD_XDCT393    
+    // Each pass (1D) uses 5 DSP48E1 modules: 2 multipliers and 3 SIMD (2x24) adder/subtracters
+    // Needs a small (<48, exact value not calculated yet) pause between blocks if they do not come
-    xdct393 xdct393_i (
+    // immediately after each other. This pause is needed to restart pipeline
-        .clk                (xclk),                // input
-        .en                 (frame_en),            // input  if zero will reset transpose memory page numbers
-        .start              (dct_start),           // input  single-cycle start pulse that goes with the first pixel data. Other 63 should follow
-        .xin                (yc_nodc),             // input[9:0] 
-        .last_in            (dct_last_in),         // output reg  output high during input of the last of 64 pixels in a 8x8 block //
-        .pre_first_out      (dct_pre_first_out),   // outpu 1 cycle ahead of the first output in a 64 block
-///        .dv                 (dct_dv),           // output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
-        .dv                 (),  // not used: output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
-        .d_out              (dct_out)              // output[12:0] 
-    );
-`else
-    xdct393r xdct393_i (
-        .clk                (xclk),                // input
-        .en                 (frame_en),            // input  if zero will reset transpose memory page numbers
-        .start              (dct_start),           // input  single-cycle start pulse that goes with the first pixel data. Other 63 should follow
-        .xin                (yc_nodc),             // input[9:0] 
-        .last_in            (dct_last_in),         // output reg  output high during input of the last of 64 pixels in a 8x8 block //
-        .pre_first_out      (dct_pre_first_out),   // outpu 1 cycle ahead of the first output in a 64 block
-///        .dv                 (dct_dv),           // output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
-        .dv                 (),  // not used: output data output valid. Will go high on the 94-th cycle after the start (now - on 95-th?)
-        .d_out              (dct_out)              // output[12:0] 
-    );
-    /* New DCT, now in passive mode */
-    // TODO: enforce minimal pause (when not butted together)
-    wire        dct_last_in_debug;
-    wire        dct_pre_first_out_debug;
-    wire        dct_dv_debug;
-    wire [12:0] dct_dout_debug;
    dct2d8x8_chen #(
        .INPUT_WIDTH      (10),
@@ -1005,27 +976,23 @@ module  jp_channel#(
        .STAGE1_SAFE_BITS (3),
        .STAGE2_SAFE_BITS (3),
        .TRANSPOSE_WIDTH  (16),
-        .TRIM_STAGE_1     (0),
+        .TRIM_STAGE_1     (1),
-        .TRIM_STAGE_2     (2),
+        .TRIM_STAGE_2     (0),
        .DSP_WIDTH        (24),
-        .DSP_OUT_WIDTH    (24),
        .DSP_B_WIDTH      (18),
        .DSP_A_WIDTH      (25),
-        .DSP_P_WIDTH      (48),
+        .DSP_P_WIDTH      (48)
-        .DSP_M_WIDTH      (43)
    ) dct2d8x8_chen_i (
-        .clk           (xclk),                    // input
+        .clk           (xclk),              // input
-        .rst           (!frame_en),               // input
+        .rst           (!frame_en),         // input
-        .start         (dct_start),               // input
+        .start         (dct_start),         // input
-        .xin           (yc_nodc),                 // input[9:0] signed 
+        .xin           (yc_nodc),           // input[9:0] signed 
-        .last_in       (dct_last_in_debug),       // output reg 
+        .last_in       (dct_last_in),       // output reg 
-        .pre_first_out (dct_pre_first_out_debug), // output
+        .pre_first_out (dct_pre_first_out), // output
-        .dv            (dct_dv_debug),            // output
+        .dv            (),                  // output
-        .d_out         (dct_dout_debug)           // output[12:0] signed 
+        .d_out         (dct_out)           // output[12:0] signed 
    );
-`endif    
    wire          quant_start;
    dly_16 #(.WIDTH(1)) i_quant_start (.clk(xclk),.rst(1'b0), .dly(4'd0), .din(dct_pre_first_out), .dout(quant_start));    // dly=0+1

--- a/compressor_jp/xdct393r.v
+++ b/compressor_jp/xdct393r.v
-/**********************************************************************
-** -----------------------------------------------------------------------------**
-** xdct393r.v
-**
-** 8x8 discrete Cosine Transform
-** adding more registers to increase bandwidth
-**
-** Copyright (C) 2002-2015 Elphel, Inc
-**
-** -----------------------------------------------------------------------------**
-**  xdct393r is free software - hardware description language (HDL) code.
-** 
-**  This program is free software: you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation, either version 3 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-** -----------------------------------------------------------------------------**
-**
-** Modified by Andrey Filippov - goal to make it work in start/stop mode, using
-** "start" input (going together with the first data, no restriction on the gap between 64-pixel blocks (>=0)
-** Removed "RST" input ("en" is only used to reset ping-pong transpose memory address)
-** Split module in 2 stages
-** Also saved some area - original design compiled by XST to 865 slices (XC2S300e), this one - 780!
-**
-** It is based on the original design (Xilix app. note XAPP610) by:
-**                  Author: Latha Pillai
-**                  Senior Applications Engineer
-**
-**                  Video Applications
-**                  Advanced Products Group
-**                  Xilinx, Inc.
-**
-**                  Copyright (c) 2001 Xilinx, Inc.
-**                  All rights reserved
-**
-**                  Date:   Feb. 10, 2002
-**
-**                  RESTRICTED RIGHTS LEGEND
-**
-**      This software has not been published by the author, and 
-**      has been disclosed to others for the purpose of enhancing 
-**      and promoting design productivity in Xilinx products.
-**
-**      Therefore use, duplication or disclosure, now and in the 
-**      future should give consideration to the productivity 
-**      enhancements afforded the user of this code by the author's 
-**      efforts.  Thank you for using our products !
-**
-** Disclaimer:  THESE DESIGNS ARE PROVIDED "AS IS" WITH NO WARRANTY 
-**              WHATSOEVER AND XILINX SPECIFICALLY DISCLAIMS ANY 
-**              IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR
-**              A PARTICULAR PURPOSE, OR AGAINST INFRINGEMENT.
-***********************************************************************/
-/*
-after I added DC subtraction before DCT I got 9-bit (allthough not likely to go out of 8bit range) signed data.
-also increased transpose memory to 9 bits (anyway it is 16-bit wide) - see if it will help to prevent saturation
-without significant increase in gates
-Saturatuion is still visible on real pictures, but there was a bug - addsub<i>a_comp, addsub<i>b_comp where not using their
-MSB. I added 1 more bit to add_sub<i>a and add_sub<i>b and fixed that bug. Only 2 mofre slices were used
-*/
-`timescale 1ns/1ps
-// For xdct353 - increasing data in 9 bits -> 10 bits, out 12 bits ->13 bits
-module xdct393r   ( // increased latency by 3
-    input         clk,           // system clock, posedge
-    input         en,            // if zero will reset transpose memory page njumbers
-    input         start,         // single-cycle start pulse that goes with the first pixel data. Other 63 should follow
-    input   [9:0] xin,           // [7:0] - input data
-    output reg    last_in,       // output high during input of the last of 64 pixels in a 8x8 block
-    output        pre_first_out, // 1 cycle ahead of the first output in a 64 block
-    output        dv,            // data output valid. Will go high on the 94-th cycle after the start
-    output [12:0] d_out);        // [8:0]output data
-    wire          stage1_done;
-    wire          tm_page;
-    wire          tm_we;
-    wire    [6:0] tm_ra;
-    wire    [6:0] tm_wa;
-    wire   [15:0] tm_out;
-    wire   [15:0] tm_di;
-//    reg           stage1_done_r; // delay by one clock to use memory output register
-    wire          tm_re; // =1'b1; // TODO: generate, for now just 1'b1
-    wire           tm_regen;
-    always @ (posedge clk) begin
-        last_in <=       (tm_wa[5:0]== 6'h30);
-//        stage1_done_r <= stage1_done;
-//        tm_regen <=      tm_re;
-    end
-    dct393r_stage1 i_dct_stage1(
-        .clk       (clk),
-        .en        (en),
-        .start     (start),
-        .xin       (xin),      // [7:0]
-        .we        (tm_we),          // write to transpose memory
-        .wr_cntr   (tm_wa), // [6:0]    transpose memory write address
-        .z_out     (tm_di[15:0]),
-        .page      (tm_page),
-        .done      (stage1_done));
-    dct393r_stage2 i_dct_stage2(
-        .clk       (clk),
-        .en        (en),
-        .start     (stage1_done),    // stage 1 finished, data available in transpose memory (extra RAM latency)
-        .page      (tm_page),        // transpose memory page finished, valid at start
-        .rd_cntr   (tm_ra[6:0]),     // [6:0]    transpose memory read address
-        .ren       (tm_re),          // output
-        .regen     (tm_regen),       // output reg 
-        .tdin      (tm_out[15:0]),   // [7:0] - data from transpose memory
-        .endv      (pre_first_out),  // output
-        .dv        (dv),             // data output valid
-        .dct2_out  (d_out[12:0]));   // [10:0]output data
-    ram18_var_w_var_r #(
-        .REGISTERS     (1),
-        .LOG2WIDTH_WR  (4),
-        .LOG2WIDTH_RD  (4),
-        .DUMMY(0)
-    ) i_transpose_mem (
-        .rclk      (clk), // input
-        .raddr     ({3'b0,tm_ra[6:0]}), // input[9:0] 
-        .ren       (tm_re), // input
-        .regen     (tm_regen), // input
-        .data_out  (tm_out[15:0]), // output[15:0] 
-        .wclk      (clk), // input
-        .waddr     ({3'b0,tm_wa[6:0]}), // input[9:0] 
-        .we        (tm_we), // input
-        .web       (4'hf), // input[3:0] 
-        .data_in   (tm_di[15:0]) // input[15:0] 
-    );
-endmodule
-// 01/24/2004: Moved all clocks in stage 1 to "negedge" to reduce current pulses
-module dct393r_stage1 ( // increased latency by 1
-    input             clk,           // system clock, posedge
-    input             en,
-    input             start,      // single-cycle start pulse to replace RST
-    input      [ 9:0] xin,      // [7:0]
-    output            we,          // write to transpose memory
-    output     [ 6:0] wr_cntr, // [6:0]    transpose memory write address
-    output reg [15:0] z_out,      //data to transpose memory
-    output            page,    // transpose memory page just filled (valid @ done)
-    output            done);   // last cycle writing to transpose memory - may use after it (move it earlier?)
-/* constants */
-    localparam C3= 16'd54491;
-    localparam S3= 16'd36410;
-    localparam C4= 16'd46341;
-    localparam C6= 16'd25080;
-    localparam S6= 16'd60547;
-    localparam C7= 16'd12785;
-    localparam S7= 16'd64277;
-    reg    [16:0] memory1a, memory2a, memory3a, memory4a;
-/* 1D section */
-/* The max value of a pixel after processing (to make their expected mean to zero)
-is 127. If all the values in a row are 127, the max value of the product terms
-would be (127*2)*(23170/256) and that of z_out_int would be (127*8)*23170/256.
-This value divided by 2raised to 8 is equivalent to ignoring the 8 lsb bits of the value */
-    reg    [ 9:0] xa0_in, xa1_in, xa2_in, xa3_in, xa4_in, xa5_in, xa6_in, xa7_in;
-    reg    [ 9:0] xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg;
-    reg    [ 9:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp;
-//    reg    [10:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp; // AF2015: increasing width - was limiting
-    reg    [10:0] add_sub1a, add_sub2a, add_sub3a, add_sub4a;
-    reg           save_sign1a, save_sign2a, save_sign3a, save_sign4a;
-    reg    [17:0] p1a, p2a, p3a, p4a;
-    wire   [35:0] p1a_all, p2a_all, p3a_all, p4a_all;
-    reg           toggleA;
-    reg    [18:0] z_out_int1, z_out_int2;
-    reg    [18:0] z_out_int;
-    wire   [15:0] z_out_prelatch;
-    reg    [ 2:0] indexi;
-/* clks and counters */
-    reg    [ 6:0] wr_cntr_prelatch;
-/* memory section */
-    reg           done_prelatch;
-    reg           we_prelatch;
-    wire          enwe;
-    wire          pre_sxregs;
-    reg           sxregs;
-    reg           page_prelatch;
-    // TODO: See if negedge is needed
-    wire          nclk = ~clk; // seems that everything here is running at negedge (and delays too), but not the transpose memory
-// to conserve energy by disabling toggleA
-    wire          sxregs_d8;
-    reg           enable_toggle;
-//  SRL16_1 i_sxregs_d8   (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs));    // dly=7+1
-    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(nclk),.rst(1'b0), .dly(4'd7), .din(sxregs), .dout(sxregs_d8));   // dly=7+1
-// SRL16_1 i_pre_sxregs (.Q(pre_sxregs), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(start));    // dly=6+1
-    dly_16 #(.WIDTH(1)) i_pre_sxregs(.clk(nclk),.rst(1'b0), .dly(4'd6), .din(start), .dout(pre_sxregs));    // dly=6+1
-// SRL16_1 i_enwe       (.Q(enwe), .A0(1'b1), .A1(1'b0), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(pre_sxregs));    // dly=5+1
-    dly_16 #(.WIDTH(1)) i_enwe(.clk(nclk),.rst(1'b0), .dly(4'd5), .din(pre_sxregs), .dout(enwe));    // dly=5+1
-    always @ (posedge nclk) begin
-        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));
-        done_prelatch<= (wr_cntr_prelatch[5:0]==6'h3f);
-        if (wr_cntr_prelatch[5:0]==6'h3f) page_prelatch <= wr_cntr_prelatch[6];
-        we_prelatch<= enwe || (en && we_prelatch && (wr_cntr_prelatch[5:0]!=6'h3f));
-        if     (!en) wr_cntr_prelatch <= 7'b0;
-        else if (we_prelatch) wr_cntr_prelatch <= wr_cntr_prelatch + 1;
-        sxregs <= pre_sxregs || ((wr_cntr_prelatch[2:0]==3'h1) && (wr_cntr_prelatch[5:3]!=3'h7));
-        toggleA <= sxregs || (enable_toggle && (~toggleA));
-        if (sxregs) indexi <= 3'h7;
-        else if (enable_toggle) indexi<=indexi+1;
-    end
-/*  1D-DCT BEGIN */
-// store  1D-DCT constant coefficient values for multipliers */
-    always @ (posedge nclk) begin
-        case (indexi)
-         0 : begin memory1a <= {1'b0,C4}; //8'd91
-                   memory2a <= {1'b0,C4}; //8'd91
-                   memory3a <= {1'b0,C4}; //8'd91 
-                   memory4a <= {1'b0,C4}; //8'd91
-             end
-         1 : begin memory1a <= {1'b0,S7}; //8'd126; 
-                   memory2a <= {1'b0,C3}; //8'd106;  
-                   memory3a <= {1'b0,S3}; //8'd71;  
-                   memory4a <= {1'b0,C7}; //8'd25;
-             end
-         2 : begin memory1a <= {1'b0,S6}; //8'd118; 
-                   memory2a <= {1'b0,C6}; //8'd49;  
-                   memory3a <= {1'b1,C6}; //-8'd49; 
-                   memory4a <= {1'b1,S6}; //-8'd118
-             end
-         3 : begin memory1a <= {1'b0,C3}; // 8'd106; 
-                   memory2a <= {1'b1,C7}; //-8'd25;  
-                   memory3a <= {1'b1,S7}; //-8'd126; 
-                   memory4a <= {1'b1,S3}; //-8'd71;
-             end
-         4 : begin memory1a <= {1'b0,C4}; // 8'd91; 
-                   memory2a <= {1'b1,C4}; //-8'd91; 
-                   memory3a <= {1'b1,C4}; //-8'd91; 
-                   memory4a <= {1'b0,C4}; // 8'd91;
-             end
-         5 : begin memory1a <= {1'b0,S3}; // 8'd71; 
-                   memory2a <= {1'b1,S7}; //-8'd126; 
-                   memory3a <= {1'b0,C7}; // 8'd25;   
-                   memory4a <= {1'b0,C3}; // 8'd106;
-             end
-         6 : begin memory1a <= {1'b0,C6}; // 8'd49; 
-                   memory2a <= {1'b1,S6}; //-8'd118; 
-                   memory3a <= {1'b0,S6}; // 8'd118;  
-                   memory4a <= {1'b1,C6}; //-8'd49;
-             end
-         7 : begin memory1a <= {1'b0,C7}; // 8'd25;  
-                   memory2a <= {1'b1,S3}; //-8'd71; 
-                   memory3a <= {1'b0,C3}; // 8'd106;  
-                   memory4a <= {1'b1,S7}; //-8'd126;
-             end
-        endcase
-    end
-/* 8-bit input shifted 8 times through a shift register*/
-// xa0_in will see output registers from posedge, may be replaced by latches if needed - but currently delay is under 5ns
-    always @ (posedge nclk) begin
-        xa0_in <= xin;
-        xa1_in <= xa0_in;
-        xa2_in <= xa1_in;
-        xa3_in <= xa2_in;
-        xa4_in <= xa3_in;
-        xa5_in <= xa4_in;
-        xa6_in <= xa5_in;
-        xa7_in <= xa6_in;
-    end
-/* shifted inputs registered every 8th clk (using cntr8)*/
-    always @ (posedge nclk) if (sxregs) begin 
-        xa0_reg <= xa0_in;
-        xa1_reg <= xa1_in; 
-        xa2_reg <= xa2_in;
-        xa3_reg <= xa3_in;
-        xa4_reg <= xa4_in;
-        xa5_reg <= xa5_in; 
-        xa6_reg <= xa6_in;
-        xa7_reg <= xa7_in;
-    end
-/* adder / subtractor block */
-    always @ (negedge clk)
-        if (toggleA == 1'b1) begin
-            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} + {xa0_reg[9],xa0_reg[9:0]};
-            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} + {xa1_reg[9],xa1_reg[9:0]};
-            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} + {xa2_reg[9],xa2_reg[9:0]};
-            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} + {xa3_reg[9],xa3_reg[9:0]};
-        end else begin
-            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} - {xa0_reg[9],xa0_reg[9:0]};
-            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} - {xa1_reg[9],xa1_reg[9:0]};
-            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} - {xa2_reg[9],xa2_reg[9:0]};
-            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} - {xa3_reg[9],xa3_reg[9:0]};
-        end
-// First valid add_sub appears at the 10th clk (8 clks for shifting inputs,
-// 9th clk for registering shifted input and 10th clk for add_sub
-// to synchronize the i value to the add_sub value, i value is incremented
-// only after 10 clks
-// Adding these wires to get rid of the MSB that is always 0
-    wire [10:0] addsub1a_comp_w  = add_sub1a[10]? (-add_sub1a) : add_sub1a;
-    wire [10:0] addsub2a_comp_w  = add_sub2a[10]? (-add_sub2a) : add_sub2a;
-    wire [10:0] addsub3a_comp_w  = add_sub3a[10]? (-add_sub3a) : add_sub3a;
-    wire [10:0] addsub4a_comp_w  = add_sub4a[10]? (-add_sub4a) : add_sub4a;
-    always @ (posedge nclk) begin
-         save_sign1a     <= add_sub1a[10];
-         save_sign2a     <= add_sub2a[10];
-         save_sign3a     <= add_sub3a[10];
-         save_sign4a     <= add_sub4a[10];
-         addsub1a_comp	<= addsub1a_comp_w[9:0]; //add_sub1a[10]? (-add_sub1a) : add_sub1a;
-         addsub2a_comp	<= addsub2a_comp_w[9:0]; //add_sub2a[10]? (-add_sub2a) : add_sub2a;
-         addsub3a_comp	<= addsub3a_comp_w[9:0]; //add_sub3a[10]? (-add_sub3a) : add_sub3a;
-         addsub4a_comp	<= addsub4a_comp_w[9:0]; //add_sub4a[10]? (-add_sub4a) : add_sub4a;
-    end
-    assign p1a_all = addsub1a_comp * memory1a[15:0]; // [16] is sign!
-    assign p2a_all = addsub2a_comp * memory2a[15:0];
-    assign p3a_all = addsub3a_comp * memory3a[15:0];
-    assign p4a_all = addsub4a_comp * memory4a[15:0];
-    reg [17:0] p1a_all_r;
-    reg [17:0] p2a_all_r;
-    reg [17:0] p3a_all_r;
-    reg [17:0] p4a_all_r;
-    reg p1a_sig, p2a_sig, p3a_sig, p4a_sig;
-    always @ (posedge nclk) begin
-        p1a_all_r <= p1a_all[26:9];
-        p2a_all_r <= p2a_all[26:9];
-        p3a_all_r <= p3a_all[26:9];
-        p4a_all_r <= p4a_all[26:9];
-        p1a_sig <= (save_sign1a ^ memory1a[16]);
-        p2a_sig <= (save_sign2a ^ memory2a[16]);
-        p3a_sig <= (save_sign3a ^ memory3a[16]);
-        p4a_sig <= (save_sign4a ^ memory4a[16]);
-    end
-    always @ (posedge nclk) begin
-        p1a <= p1a_sig ? (-p1a_all_r) : p1a_all_r;
-        p2a <= p2a_sig ? (-p2a_all_r) : p2a_all_r;
-        p3a <= p3a_sig ? (-p3a_all_r) : p3a_all_r;
-        p4a <= p4a_sig ? (-p4a_all_r) : p4a_all_r;
-    end
-/* Final adder. Adding the ouputs of the 4 multipliers */
-    always @ (posedge nclk) begin
-        z_out_int1 <= ({p1a[17],p1a} + {p2a[17],p2a});
-        z_out_int2 <= ({p3a[17],p3a} + {p4a[17],p4a});
-        z_out_int <= (z_out_int1 + z_out_int2);
-    end
-// rounding of the value
-    assign z_out_prelatch[15:0] = z_out_int[18:3]+ z_out_int[2]; // correct rounding
-// outputs from output latches to cross clock edge boundary
-    always @ (posedge clk) begin
-        z_out[15:0]  <= z_out_prelatch[15:0];
-//        wr_cntr[6:0] <= wr_cntr_prelatch[6:0];  
-//        done         <= done_prelatch;  
-//        we           <= we_prelatch;  
-//        page         <= page_prelatch;  
-    end
-    dly_16 #(.WIDTH(10)) i_delayed_outs(
-        .clk(clk),
-        .rst(1'b0),
-        .dly(4'd1),
-        .din( {wr_cntr_prelatch[6:0], done_prelatch, we_prelatch, page_prelatch}),
-        .dout({wr_cntr[6:0],          done,          we,          page}));
-/* 1D-DCT END */
-endmodule
-module dct393r_stage2 ( // increased latency by 2 clocks
-    input             clk,           // system clock, posedge
-    input             en,
-    input             start,      // stage 1 finished, data available in transpose memory
-    input             page,      // transpose memory page finished, valid at start
-    output      [6:0] rd_cntr, // [6:0]    transpose memory read address
-    output            ren,     // read enable transpose memory
-    output reg        regen,   // register enable in transpose memory
-    input      [15:0] tdin,      // [15:0] - data from transpose memory, added 6 bit fractional part
-    output reg        endv,        // one cycle ahead of starting (continuing) dv
-    output reg        dv,          // data output valid
-    output reg [12:0] dct2_out);// [8:0]output data
-/* constants */
-    localparam C3= 16'd54491;
-    localparam S3= 16'd36410;
-    localparam C4= 16'd46341;
-    localparam C6= 16'd25080;
-    localparam S6= 16'd60547;
-    localparam C7= 16'd12785;
-    localparam S7= 16'd64277;
-    reg    [16:0] memory1a, memory2a, memory3a, memory4a;
-    reg     [2:0] indexi;
-/* 2D section */
-    reg    [15:0] xb0_in, xb1_in, xb2_in, xb3_in, xb4_in, xb5_in, xb6_in, xb7_in;
-    reg    [15:0] xb0_reg, xb1_reg, xb2_reg, xb3_reg, xb4_reg, xb5_reg, xb6_reg, xb7_reg;
-    reg    [16:0] add_sub1b, add_sub2b, add_sub3b, add_sub4b;
-    reg    [15:0] addsub1b_comp, addsub2b_comp, addsub3b_comp, addsub4b_comp;
-    reg           save_sign1b, save_sign2b, save_sign3b, save_sign4b;
-    reg    [18:0] p1b, p2b, p3b, p4b;
-    wire   [35:0] p1b_all, p2b_all, p3b_all, p4b_all;
-    reg           toggleB;
-    reg    [19:0] dct2d_int1, dct2d_int2;
-    reg    [20:0] dct_2d_int;
-    wire   [12:0] dct_2d_rnd;
-// transpose memory read address
-    reg    [ 5:0] rd_cntrs;
-    reg           rd_page;
-// start with the same as stage1
-    wire          sxregs;
-// to conserve energy by disabling toggleB
-    wire          sxregs_d8;
-    reg           enable_toggle;
-    reg           en_started;
-    wire          pre2_endv;
-    wire          pre2_disdv; // AF2015: was missing
-    reg           pre_endv;
-    reg           pre_disdv;
-    reg           pre_dv;
-// SRL16 i_endv       (.Q(endv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(start));    // dly=14+1
-//    dly_16 #(.WIDTH(1)) i_endv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(start), .dout(endv));    // dly=14+1
-    dly_16 #(.WIDTH(1)) i_pre2_endv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(start), .dout(pre2_endv));    // dly=15+1
-// SRL16 i_disdv      (.Q(disdv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(rd_cntrs[5:0]==6'h3f));    // dly=14+1
-//    dly_16 #(.WIDTH(1)) i_disdv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(rd_cntrs[5:0]==6'h3f), .dout(disdv));    // dly=14+1
-    dly_16 #(.WIDTH(1)) i_pre2_disdv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(rd_cntrs[5:0]==6'h3f), .dout(pre2_disdv));    // dly=15+1
-// SRL16 i_sxregs      (.Q(sxregs),    .A0(1'b0), .A1(1'b0), .A2(1'b0), .A3(1'b1), .CLK(clk),.D((rd_cntr[5:3]==3'h0) && en_started));    // dly=8+1
-//    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd8), .din((rd_cntr[5:3]==3'h0) && en_started), .dout(sxregs));    // dly=8+1
-    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd9), .din((rd_cntrs[2:0]==3'h0) && en_started), .dout(sxregs));    // dly=9+1
-// SRL16 i_sxregs_d8   (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs && en_started));    // dly=7+1
-    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(clk),.rst(1'b0), .dly(4'd7), .din(sxregs && en_started), .dout(sxregs_d8));    // dly=7+1
-    assign ren = en_started;
-    always @ (posedge clk) begin
-        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));
-//        en_started <= en && (start || en_started);
-        if      (!en)                   en_started <= 0;
-        else if (start)                 en_started <= 1;
-        else if (rd_cntrs[5:0] == 6'h3f) en_started <= 0; // should be after (start) as they happen simultaneously
-        regen <= en_started;
-        pre_endv <=pre2_endv;
-        endv <= pre_endv; // output reg
-        pre_disdv <= pre2_disdv;
-        pre_dv <=  en && (pre_endv || (pre_dv && ~pre_disdv));
-//        dv <= en && (endv || (dv && ~disdv));
-        dv <= en && pre_dv; // output reg
-        toggleB <= sxregs || (enable_toggle && (~toggleB));
-        if (sxregs) indexi <= 3'h7;
-        else if (enable_toggle) indexi<=indexi+1;
-        if (start) rd_page <= page;
-        if (start) rd_cntrs[5:0] <=6'b0;    // will always count, but that does not matter- What about saving energy ;-) ? Saved...
-        else if (rd_cntrs[5:0]!=6'h3f) rd_cntrs[5:0] <= rd_cntrs[5:0]+1;
-    end 
-    assign    rd_cntr[6:0]= {rd_page,rd_cntrs[2:0],rd_cntrs[5:3]}; // transposed counter
-// duplicate memory<i>a from stage 1
-// store  1D-DCT constant coeeficient values for multipliers */
-    always @ (posedge clk) begin
-        case (indexi)
-         0 : begin memory1a <= {1'b0,C4}; //8'd91
-                   memory2a <= {1'b0,C4}; //8'd91
-                   memory3a <= {1'b0,C4}; //8'd91 
-                   memory4a <= {1'b0,C4}; //8'd91
-             end
-         1 : begin memory1a <= {1'b0,S7}; //8'd126; 
-                   memory2a <= {1'b0,C3}; //8'd106;  
-                   memory3a <= {1'b0,S3}; //8'd71;  
-                   memory4a <= {1'b0,C7}; //8'd25;
-             end
-         2 : begin memory1a <= {1'b0,S6}; //8'd118; 
-                   memory2a <= {1'b0,C6}; //8'd49;  
-                   memory3a <= {1'b1,C6}; //-8'd49; 
-                   memory4a <= {1'b1,S6}; //-8'd118
-             end
-         3 : begin memory1a <= {1'b0,C3}; // 8'd106; 
-                   memory2a <= {1'b1,C7}; //-8'd25;  
-                   memory3a <= {1'b1,S7}; //-8'd126; 
-                   memory4a <= {1'b1,S3}; //-8'd71;
-             end
-         4 : begin memory1a <= {1'b0,C4}; // 8'd91; 
-                   memory2a <= {1'b1,C4}; //-8'd91; 
-                   memory3a <= {1'b1,C4}; //-8'd91; 
-                   memory4a <= {1'b0,C4}; // 8'd91;
-             end
-         5 : begin memory1a <= {1'b0,S3}; // 8'd71; 
-                   memory2a <= {1'b1,S7}; //-8'd126; 
-                   memory3a <= {1'b0,C7}; // 8'd25;   
-                   memory4a <= {1'b0,C3}; // 8'd106;
-             end
-         6 : begin memory1a <= {1'b0,C6}; // 8'd49; 
-                   memory2a <= {1'b1,S6}; //-8'd118; 
-                   memory3a <= {1'b0,S6}; // 8'd118;  
-                   memory4a <= {1'b1,C6}; //-8'd49;
-             end
-         7 : begin memory1a <= {1'b0,C7}; // 8'd25;  
-                   memory2a <= {1'b1,S3}; //-8'd71; 
-                   memory3a <= {1'b0,C3}; // 8'd106;  
-                   memory4a <= {1'b1,S7}; //-8'd126;
-             end
-        endcase
-    end
-    always @ (posedge clk) begin
-        xb0_in <= tdin;
-        xb1_in <= xb0_in;
-        xb2_in <= xb1_in;
-        xb3_in <= xb2_in;
-        xb4_in <= xb3_in;
-        xb5_in <= xb4_in;
-        xb6_in <= xb5_in;
-        xb7_in <= xb6_in;
-    end
-/* register inputs, inputs read in every eighth clk*/
-    always @ (posedge clk) if (sxregs) begin
-        xb0_reg <= xb0_in;
-        xb1_reg <= xb1_in; 
-        xb2_reg <= xb2_in;
-        xb3_reg <= xb3_in;
-        xb4_reg <= xb4_in;
-        xb5_reg <= xb5_in; 
-        xb6_reg <= xb6_in;
-        xb7_reg <= xb7_in;
-    end
-    always @ (posedge clk)
-        if (toggleB == 1'b1) begin
-            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} + {xb0_reg[15],xb0_reg[15:0]};
-            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} + {xb1_reg[15],xb1_reg[15:0]};
-            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} + {xb2_reg[15],xb2_reg[15:0]};
-            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} + {xb3_reg[15],xb3_reg[15:0]};
-        end else begin
-            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} - {xb0_reg[15],xb0_reg[15:0]};
-            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} - {xb1_reg[15],xb1_reg[15:0]};
-            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} - {xb2_reg[15],xb2_reg[15:0]};
-            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} - {xb3_reg[15],xb3_reg[15:0]};
-        end
-// Adding these wires to get rid of the MSB that is always 0
-    wire [16:0] addsub1b_comp_w  = add_sub1b[16]? (-add_sub1b) : add_sub1b;
-    wire [16:0] addsub2b_comp_w  = add_sub2b[16]? (-add_sub2b) : add_sub2b;
-    wire [16:0] addsub3b_comp_w  = add_sub3b[16]? (-add_sub3b) : add_sub3b;
-    wire [16:0] addsub4b_comp_w  = add_sub4b[16]? (-add_sub4b) : add_sub4b;
-    always @ (posedge clk) begin
-        save_sign1b    <= add_sub1b[16];
-        save_sign2b    <= add_sub2b[16];
-        save_sign3b    <= add_sub3b[16];
-        save_sign4b    <= add_sub4b[16];
-        addsub1b_comp	<= addsub1b_comp_w[15:0]; // add_sub1b[16]? (-add_sub1b) : add_sub1b;
-        addsub2b_comp	<= addsub2b_comp_w[15:0]; // add_sub2b[16]? (-add_sub2b) : add_sub2b;
-        addsub3b_comp	<= addsub3b_comp_w[15:0]; // add_sub3b[16]? (-add_sub3b) : add_sub3b;
-        addsub4b_comp	<= addsub4b_comp_w[15:0]; // add_sub4b[16]? (-add_sub4b) : add_sub4b;
-    end
-    assign p1b_all = addsub1b_comp * memory1a[15:0]; // MSB [16] is sign!
-    assign p2b_all = addsub2b_comp * memory2a[15:0];
-    assign p3b_all = addsub3b_comp * memory3a[15:0];
-    assign p4b_all = addsub4b_comp * memory4a[15:0];
-    reg [18:0] p1b_all_r;
-    reg [18:0] p2b_all_r;
-    reg [18:0] p3b_all_r;
-    reg [18:0] p4b_all_r;
-    reg p1b_sig, p2b_sig, p3b_sig, p4b_sig;
-    always @ (posedge clk) begin
-        p1b_all_r <= p1b_all[32:14];
-        p2b_all_r <= p2b_all[32:14];
-        p3b_all_r <= p3b_all[32:14];
-        p4b_all_r <= p4b_all[32:14];
-        p1b_sig <= (save_sign1b ^ memory1a[16]);
-        p2b_sig <= (save_sign2b ^ memory2a[16]);
-        p3b_sig <= (save_sign3b ^ memory3a[16]);
-        p4b_sig <= (save_sign4b ^ memory4a[16]);
-    end
-    always @ (posedge clk) begin
-        p1b[18:0] <= p1b_sig ? (-p1b_all_r) :(p1b_all_r);
-        p2b[18:0] <= p2b_sig ? (-p2b_all_r) :(p2b_all_r);
-        p3b[18:0] <= p3b_sig ? (-p3b_all_r) :(p3b_all_r);
-        p4b[18:0] <= p4b_sig ? (-p4b_all_r) :(p4b_all_r);
-    end
-/* multiply the outputs of the add/sub block with the 8 sets of stored coefficients */
-/* Final adder. Adding the ouputs of the 4 multipliers */
-    always @ (posedge clk) begin
-        dct2d_int1 <= ({p1b[18],p1b[18:0]} + {p2b[18],p2b[18:0]});
-        dct2d_int2 <= ({p3b[18],p3b[18:0]} + {p4b[18],p4b[18:0]});
-        dct_2d_int <= ({dct2d_int1[19],dct2d_int1[19:0]} + {dct2d_int2[19],dct2d_int2[19:0]});
-        if (pre_dv) dct2_out[12:0] <= dct_2d_rnd[12:0] + dct_2d_int[7];
-    end
-    assign dct_2d_rnd[12:0] = dct_2d_int[20:8];
-//    assign dct2_out[12:0] = dct_2d_rnd[12:0] + dct_2d_int[7];
-endmodule
--- a/dsp/dct1d_chen.v
+++ b/dsp/dct1d_chen.v
-/*******************************************************************************
+/*!
 * <b>Module:</b>dct1d_chen
 * @file dct1d_chen.v
- * @date:2016-06-05  
+ * @date 2016-06-05  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: 1d 8-point DCT based on Chen algorithm
+ * @brief 1d 8-point DCT based on Chen algorithm
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,17 +35,19 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dct1d_chen#(
    parameter WIDTH = 24,
-    parameter OUT_WIDTH =   24,
+    parameter OUT_WIDTH =   16,
    parameter B_WIDTH =     18,
    parameter A_WIDTH =     25,
    parameter P_WIDTH =     48,
-    parameter M_WIDTH =     43, // actual multiplier width (== (A_WIDTH +B_WIDTH)
+//    parameter M_WIDTH =     43, // actual multiplier width (== (A_WIDTH +B_WIDTH)
+    parameter ROUND_OUT =    8, // cut this number of LSBs on the output, round result (in addition to COSINE_SHIFT)
    parameter COSINE_SHIFT= 17,
    parameter COS_1_16 =    128553, // (1<<17) * cos(1*pi/16)
    parameter COS_2_16 =    121095, // (1<<17) * cos(2*pi/16)
    parameter COS_3_16 =    108982, // (1<<17) * cos(3*pi/16)
@@ -59,11 +61,13 @@ module  dct1d_chen#(
    input                          en,
    input  [2 * WIDTH -1:0]        d10_32_76_54, // Concatenated input data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
    input                          start,      // {x[1],x[0]} available next after start,  {x[3],x[2]} - second next, then {x[7],x[6]} and {x[5],x[4]} 
-    output [WIDTH -1:0]            dout,
+    output [OUT_WIDTH -1:0]        dout,
    output reg                     pre2_start_out, // 2 clock cycle before F4 output, full dout sequence
                                             // start_out-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
    output reg                     en_out    // valid at the same time slot as pre2_start_out (goes active with pre2_start_out)                                      
 );
+    localparam TOTAL_RSHIFT=    COSINE_SHIFT + ROUND_OUT;
+    localparam BEFORE_SAT_WIDTH = P_WIDTH - TOTAL_RSHIFT;  
    reg    signed [B_WIDTH-1:0] dsp_ma_bin;
    wire                        dsp_ma_ceb1_1;     // load b1 register
    wire                        dsp_ma_ceb2_1;     // load b2 register
@@ -94,6 +98,7 @@ module  dct1d_chen#(
    wire                        dsp_ma_neg_m_2;    // 1 - negate multiplier result
    wire                        dsp_ma_accum_2;    // 0 - use multiplier result, 1 add to accumulator
    wire   signed [P_WIDTH-1:0] dsp_ma_p_2;
+    wire   signed [P_WIDTH-1:0] dsp_ma_p_mux;
    // Multiplier A/D inputs before shift
    wire   signed [WIDTH-1:0] dsp_ma_ain24_1;
@@ -142,10 +147,25 @@ module  dct1d_chen#(
    reg                   [7:0] phase;
    reg                   [2:0] phase_cnt;
    reg        [OUT_WIDTH -1:0] dout_r;
-    wire       [OUT_WIDTH -1:0] dout1_w;
+//    wire       [OUT_WIDTH -1:0] dout1_w;
-    wire       [OUT_WIDTH -1:0] dout2_w;
+//    wire       [OUT_WIDTH -1:0] dout2_w;
+    wire                        dout_round_c;
+    wire[BEFORE_SAT_WIDTH -1:0] dout_round_w; // after rounding, before (optional) saturation
+    reg [BEFORE_SAT_WIDTH -1:0] dout_round_r; // after rounding, before (optional) saturation              
+    wire       [OUT_WIDTH -1:0] dout_sat_w;
+    wire[BEFORE_SAT_WIDTH -1:0] dout_round; // after rounding, before (optional) saturation
    reg                   [2:0] per_type; // idle/last:0, first cycle - 1, 2-nd - 2, other - 3,... ~en->6 ->7 -> 0  (to generate pre2_start_out)
+    // Temporarily adding 1 extra latency cycle for rounding/saturation. TODO: Remove when moved to DSP itself
+    reg                     pre3_start_out; // 3 clock cycle before F4 output, full dout sequence
+                                             // start_out-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
+    reg                     pre_en_out;    // valid at the same time slot as pre3_start_out (goes active with pre3_start_out), registered once more to produce en_out
 //        .ain      ({simd_a1,simd_a0}), // input[47:0] 
 //        .bin      ({simd_b1,simd_b0}), // input[47:0]
    // dsp_addsub_simd1_i input connections
@@ -233,7 +253,7 @@ module  dct1d_chen#(
    assign dsp_ma_ced_2 =  phase[1] | phase[6];
    assign dsp_ma_sela_2 =  phase[1] | phase[6];
    assign dsp_ma_seld_2 =  phase[0] | phase[2] | phase[5] | phase[7];
-    assign dsp_ma_neg_m_2 = phase[6];
+    assign dsp_ma_neg_m_2 = phase[1] | phase[6];
    assign dsp_ma_accum_2 = phase[0] | phase[2] | phase[4] | phase[6];
    // dsp_ma2_i data input connections
    assign dsp_ma_ain24_2 = simd_p5; 
@@ -255,10 +275,37 @@ module  dct1d_chen#(
 //    assign  dout1_w = dsp_ma_p_1[M_WIDTH -: WIDTH]; // adding one bit for adder (two MPY outputs are added)
 //    assign  dout2_w = dsp_ma_p_2[M_WIDTH -: WIDTH]; // adding one bit for adder (two MPY outputs are added)
-    assign  dout1_w = dsp_ma_p_1[COSINE_SHIFT +: WIDTH]; // adding one bit for adder (two MPY outputs are added)
+    assign dsp_ma_p_mux = phase_cnt[0] ? dsp_ma_p_1 : dsp_ma_p_2;
-    assign  dout2_w = dsp_ma_p_2[COSINE_SHIFT +: WIDTH]; // adding one bit for adder (two MPY outputs are added)
+//    assign  dout1_w = dsp_ma_p_1[COSINE_SHIFT +: OUT_WIDTH]; // adding one bit for adder (two MPY outputs are added)
+//    assign  dout2_w = dsp_ma_p_2[COSINE_SHIFT +: OUT_WIDTH]; // adding one bit for adder (two MPY outputs are added)
+    assign dout_round_c = dsp_ma_p_mux[TOTAL_RSHIFT-1];
+    assign dout_round_w = dsp_ma_p_mux[TOTAL_RSHIFT +: BEFORE_SAT_WIDTH] + dout_round_c;
+//  Saturation (only if  BEFORE_SAT_WIDTH > OUT_WIDTH)
+    localparam TRIM_MSB = BEFORE_SAT_WIDTH - OUT_WIDTH;
+    generate
+        // Convert the rounded BEFORE_SAT_WIDTH-bit signed value dout_round to the OUT_WIDTH-bit signed dout_sat_w
+        if (TRIM_MSB < 0) begin // should never happen: output is wider than the rounded data, just sign-extend
+            assign dout_sat_w =  { {(-TRIM_MSB){dout_round[BEFORE_SAT_WIDTH-1]}},dout_round };
+        end else if (TRIM_MSB == 0) begin // same width, nothing to trim
+            assign dout_sat_w =  dout_round[0 +: OUT_WIDTH];
+        end else begin //! saturate. TODO: Maybe (and also symmetric rounding) can be done in DSP itself using masks?
+            // The value fits into OUT_WIDTH signed bits only when all TRIM_MSB discarded bits AND the retained MSB
+            // (bit OUT_WIDTH-1, that becomes the new sign) are equal to the original sign bit, so TRIM_MSB+1 bits
+            // are compared. Comparing only the discarded bits would let +2^(OUT_WIDTH-1)...+2^OUT_WIDTH-1 wrap
+            // to negative output (and the mirrored negative range wrap to positive) without being saturated.
+            assign dout_sat_w = (dout_round[BEFORE_SAT_WIDTH-1 -: (TRIM_MSB+1)] == {(TRIM_MSB+1){dout_round[BEFORE_SAT_WIDTH-1]}})?
+                                   dout_round[0 +: OUT_WIDTH]:                                                     // in range - pass through
+                                   {dout_round[BEFORE_SAT_WIDTH-1], {(OUT_WIDTH-1){~dout_round[BEFORE_SAT_WIDTH-1]}}}; // clip to max positive/min negative
+        end
+    endgenerate
+    // to possibly remove registers with generate
+    assign dout_round= dout_round_r; 
+//BEFORE_SAT_WIDTH    
+//    wire                        dout_round_c;
+//    wire       [OUT_WIDTH -1:0] dout_round_w;
+//ROUND_OUT
+//phase_cnt[0] ? dout1_w : dout2_w;    
    assign dout = dout_r;
    always @ (posedge clk) begin
@@ -284,16 +331,24 @@ module  dct1d_chen#(
            3'h6: dsp_ma_bin <= COS_4_16;
            3'h7: dsp_ma_bin <= COS_6_16;
        endcase
-        dout_r <= phase_cnt[0] ? dout1_w : dout2_w;
+//        dout_r <= phase_cnt[0] ? dout1_w : dout2_w;
+        dout_round_r <= dout_round_w;         
+        dout_r <= dout_sat_w;
-        if (rst) pre2_start_out <= 0;
+        if (rst) pre3_start_out <= 0;
-        else     pre2_start_out <= (per_type == 2) && phase[3];
+        else     pre3_start_out <= (per_type == 2) && phase[3];
+        pre2_start_out <=pre3_start_out;
-        if (rst || !(en || (|phase))) en_out <= 0;
+        if (rst || !(en || (|phase))) pre_en_out <= 0;
        else if (phase[3]) begin
-            if      (per_type == 2)   en_out <= 1;
+            if      (per_type == 2)   pre_en_out <= 1;
-            else if (per_type[2])     en_out <= 0;
+            else if (per_type[2])     pre_en_out <= 0;
-        end         
+        end
+        en_out <= pre_en_out;
    end
    dsp_addsub_simd #(

--- a/dsp/dct1d_chen_reorder_in.v
+++ b/dsp/dct1d_chen_reorder_in.v
-/*******************************************************************************
+/*!
 * <b>Module:</b>dct1d_chen_reorder_in
 * @file dct1d_chen_reorder_in.v
- * @date:2016-06-08  
+ * @date 2016-06-08  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: Reorder scan-line pixel stream for dct1d_chen module
+ * @brief Reorder scan-line pixel stream for dct1d_chen module
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dct1d_chen_reorder_in#(

--- a/dsp/dct1d_chen_reorder_out.v
+++ b/dsp/dct1d_chen_reorder_out.v
-/*******************************************************************************
+/*!
 * <b>Module:</b>dct1d_chen_reorder_out
 * @file dct1d_chen_reorder_out.v
- * @date:2016-06-08  
+ * @date 2016-06-08  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: Reorder data from dct1d_chen output to natural sequence
+ * @brief Reorder data from dct1d_chen output to natural sequence
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dct1d_chen_reorder_out#(
@@ -62,6 +62,7 @@ module  dct1d_chen_reorder_out#(
    reg  [2:0] per_type; // idle/last:0, first cycle - 1, 2-nd - 2, other - 3,... ~en->6 ->7 -> 0  (to generate pre2_start_out)
    reg        start_out_r;
    reg        en_out_r;
+    wire       stop_out; // qualify with en
    assign dout = dout_r;
    assign start_out = start_out_r; 
    assign en_out = en_out_r;
@@ -98,16 +99,30 @@ module  dct1d_chen_reorder_out#(
        if      ((per_type == 2) && (cntr_in == 1))   raddr <= {~cntr_in[3], 3'b0};
        else if ((raddr[2:0] != 0) || (per_type !=0)) raddr <= raddr + 1;
-        dout_r <=  reord_buf_ram[raddr];
+        if (en_out_r) dout_r <=  reord_buf_ram[raddr];
        start_out_r <=  (per_type == 2) && (cntr_in == 1);
-        if (rst ||(per_type == 0) ) en_out_r <= 0;
+        if (rst ||(per_type == 0) )                 en_out_r <= 0;
-        else if (cntr_in == 1)      en_out_r <= (per_type == 2) || !per_type[2]; 
+//        else if (cntr_in == 1)      en_out_r <= (per_type == 2) || !per_type[2];
+        else if ((cntr_in == 1) && (per_type == 2)) en_out_r <= 1;
-        if      (rst)                            dv <= 0;
+        else if (stop_out && !en)                   en_out_r <= 0;
-        else if (start_out_r)                    dv <= 1;
+        //stop_out
-        else if ((raddr[2:0] == 0) && !en_out_r) dv <= 0;
+        dv <= en_out_r;
+//        if      (rst)                            dv <= 0;
+//        else if (start_out_r)                    dv <= 1;
+//        else if ((raddr[2:0] == 0) && !en_out_r) dv <= 0;
    end
+    dly01_16 dly01_16_i (
+        .clk      (clk),                    // input
+        .rst      (rst),                    // input
+        .dly      (4'd8),                   // input[3:0] 
+        .din      ((&cntr_in[2:0]) && !en), // input
+        .dout     (stop_out)                // output
+    );
 endmodule
--- a/dsp/dct2d8x8_chen.v
+++ b/dsp/dct2d8x8_chen.v
-/*******************************************************************************
+/*!
 * <b>Module:</b>dct2d8x8_chen
 * @file dct2d8x8_chen.v
- * @date:2016-06-10  
+ * @date 2016-06-10  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: 2-d DCT implementation of Chen algorithm
+ * @brief 2-d DCT implementation of Chen algorithm
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dct2d8x8_chen#(
@@ -45,13 +45,13 @@ module  dct2d8x8_chen#(
    parameter STAGE2_SAFE_BITS = 3, // leave this number of extra bits on DCT1D input to prevent output saturation
    parameter TRANSPOSE_WIDTH = 16, // transpose memory width
    parameter TRIM_STAGE_1 =     1, // Trim these MSBs from the stage1 results (1 - matches old DCT)
-    parameter TRIM_STAGE_2 =     2, // Trim these MSBs from the stage2 results TODO: put real value
+    parameter TRIM_STAGE_2 =     0, // Trim these MSBs from the stage2 results
    parameter DSP_WIDTH =       24,
-    parameter DSP_OUT_WIDTH =   24,
+//    parameter DSP_OUT_WIDTH =   24,
    parameter DSP_B_WIDTH =     18,
    parameter DSP_A_WIDTH =     25,
-    parameter DSP_P_WIDTH =     48,
+    parameter DSP_P_WIDTH =     48
-    parameter DSP_M_WIDTH =     43  // actual multiplier width (== (A_WIDTH +B_WIDTH)
+//    parameter DSP_M_WIDTH =     43  // actual multiplier width (== (A_WIDTH +B_WIDTH)
    ) (
    input                            clk,           /// system clock, posedge
    input                            rst,           // sync reset
@@ -68,6 +68,8 @@ module  dct2d8x8_chen#(
    localparam REPLICATE_IN_STAGE2 = STAGE2_SAFE_BITS;
    localparam PAD_IN_STAGE2 =       DSP_WIDTH - TRANSPOSE_WIDTH - STAGE2_SAFE_BITS ;
+    localparam ROUND_STAGE1 =        DSP_WIDTH - TRANSPOSE_WIDTH - TRIM_STAGE_1;  
+    localparam ROUND_STAGE2 =        DSP_WIDTH - OUTPUT_WIDTH -    TRIM_STAGE_2;  
    reg signed      [INPUT_WIDTH-1:0] xin_r;
@@ -82,7 +84,7 @@ module  dct2d8x8_chen#(
    wire signed       [DSP_WIDTH-1:0] dct1in_pad_h;                  
    wire signed       [DSP_WIDTH-1:0] dct1in_pad_l;
-    wire signed   [DSP_OUT_WIDTH-1:0] dct1_out;
+    wire signed [TRANSPOSE_WIDTH-1:0] dct1_out;
    wire                              stage1_pre2_start_out; 
 //    wire                              stage1_pre2_en_out; 
@@ -94,20 +96,43 @@ module  dct2d8x8_chen#(
    wire signed       [DSP_WIDTH-1:0] dct2in_pad_h;                  
    wire signed       [DSP_WIDTH-1:0] dct2in_pad_l;
-    wire signed   [DSP_OUT_WIDTH-1:0] dct2_out;
+    wire signed    [OUTPUT_WIDTH-1:0] dct2_out;
    wire                              stage2_pre2_start_out; 
    wire                              stage2_pre2_en_out; 
-    wire signed    [OUTPUT_WIDTH-1:0] dct2_trimmed;
+//    wire signed    [OUTPUT_WIDTH-1:0] dct2_trimmed;
    assign dct1in_pad_h = {{REPLICATE_IN_STAGE1{dct1in_h[INPUT_WIDTH-1]}}, dct1in_h, {PAD_IN_STAGE1{1'b0}}};                  
    assign dct1in_pad_l = {{REPLICATE_IN_STAGE1{dct1in_l[INPUT_WIDTH-1]}}, dct1in_l, {PAD_IN_STAGE1{1'b0}}};                  
-    assign transpose_din = dct1_out[DSP_OUT_WIDTH-1-TRIM_STAGE_1 -:TRANSPOSE_WIDTH];
+    assign transpose_din = dct1_out;
+    /*
+    generate
+        if (TRIM_STAGE_1 == 0) begin
+            assign transpose_din = dct1_out[DSP_OUT_WIDTH-1 -:TRANSPOSE_WIDTH];
+        end else begin //! saturate. TODO: Maybe (and also symmetric rounding) can be done in DSP itself using masks?
+            assign transpose_din = (dct1_out[DSP_OUT_WIDTH-1 -: TRIM_STAGE_1] == {TRIM_STAGE_1{dct1_out[DSP_OUT_WIDTH-1]}})?
+                                   dct1_out[DSP_OUT_WIDTH-1-TRIM_STAGE_1 -: TRANSPOSE_WIDTH]:
+                                   {dct1_out[DSP_OUT_WIDTH-1], {TRANSPOSE_WIDTH-1{~dct1_out[DSP_OUT_WIDTH-1]}}};
+        end                   
+    endgenerate                       
+    */
    assign dct2in_pad_h = {{REPLICATE_IN_STAGE2{transpose_douth[TRANSPOSE_WIDTH-1]}}, transpose_douth, {PAD_IN_STAGE2{1'b0}}};                  
    assign dct2in_pad_l = {{REPLICATE_IN_STAGE2{transpose_doutl[TRANSPOSE_WIDTH-1]}}, transpose_doutl, {PAD_IN_STAGE2{1'b0}}};                  
-    assign dct2_trimmed = dct2_out[DSP_OUT_WIDTH-1-TRIM_STAGE_2 -:OUTPUT_WIDTH];
+//    assign dct2_trimmed = dct2_out;
+    /*
+    generate
+        if (TRIM_STAGE_2 == 0) begin
+            assign dct2_trimmed = dct2_out[DSP_OUT_WIDTH-1 -: OUTPUT_WIDTH];
+        end else begin //! saturate. Maybe (and also symmetric rounding) can be done in DSP itself using masks?
+            assign dct2_trimmed = (dct2_out[DSP_OUT_WIDTH-1 -: TRIM_STAGE_2] == {TRIM_STAGE_2{dct2_out[DSP_OUT_WIDTH-1]}})?
+                                  dct2_out[DSP_OUT_WIDTH-1-TRIM_STAGE_2 -:OUTPUT_WIDTH]:
+                                  {dct2_out[DSP_OUT_WIDTH-1], {OUTPUT_WIDTH-1{~dct2_out[DSP_OUT_WIDTH-1]}}};
+        end
+    endgenerate
+    */
    always @(posedge clk) begin
        start_in_r <= start;
@@ -141,11 +166,11 @@ module  dct2d8x8_chen#(
    wire dbg_stage1_pre2_en_out;
    dct1d_chen #(
        .WIDTH           (DSP_WIDTH),
-        .OUT_WIDTH       (DSP_OUT_WIDTH),
+        .OUT_WIDTH       (TRANSPOSE_WIDTH), // DSP_OUT_WIDTH),
        .B_WIDTH         (DSP_B_WIDTH),
        .A_WIDTH         (DSP_A_WIDTH),
        .P_WIDTH         (DSP_P_WIDTH),
-        .M_WIDTH         (DSP_M_WIDTH)
+        .ROUND_OUT       (ROUND_STAGE1) // cut this number of LSBs on the output, round result (in addition to COSINE_SHIFT)
    ) dct1d_chen_stage1_i (
        .clk             (clk),                         // input
        .rst             (rst),                         // input
@@ -170,12 +195,12 @@ module  dct2d8x8_chen#(
    );
    dct1d_chen #(
-        .WIDTH(DSP_WIDTH),
+        .WIDTH           (DSP_WIDTH),
-        .OUT_WIDTH(DSP_OUT_WIDTH),
+        .OUT_WIDTH       (OUTPUT_WIDTH),
-        .B_WIDTH(DSP_B_WIDTH),
+        .B_WIDTH         (DSP_B_WIDTH),
-        .A_WIDTH(DSP_A_WIDTH),
+        .A_WIDTH         (DSP_A_WIDTH),
-        .P_WIDTH(DSP_P_WIDTH),
+        .P_WIDTH         (DSP_P_WIDTH),
-        .M_WIDTH(DSP_M_WIDTH)
+        .ROUND_OUT       (ROUND_STAGE2) // cut this number of LSBs on the output, round result (in addition to COSINE_SHIFT)
    ) dct1d_chen_stage2_i (
        .clk             (clk),                         // input
        .rst             (rst),                         // input
@@ -193,7 +218,7 @@ module  dct2d8x8_chen#(
        .clk         (clk),                   // input
        .rst         (rst),                   // input
        .en          (stage2_pre2_en_out),    // input
-        .din         (dct2_trimmed),          // input[23:0] 
+        .din         (dct2_out),              // input[23:0] 
        .pre2_start  (stage2_pre2_start_out), // input
        .dout        (d_out),                 // output[23:0] 
        .start_out   (pre_first_out),         // output reg 
@@ -202,13 +227,16 @@ module  dct2d8x8_chen#(
    );
 // Just for debugging/comparing with old 1-d DCT:
-wire [DSP_WIDTH-1:0] dbg_d_out;
+`ifdef SIMULATION // no sense to synthesize it
+`ifdef DEBUG_DCT1D
+wire [TRANSPOSE_WIDTH-1:0] dbg_d_out;
+//wire        [15:0]   dbg_d_out13=dbg_d_out[7 +: 16] ;
 wire                 dbg_dv;
 wire                 dbg_en_out;
 wire                 dbg_pre_first_out;
    dct1d_chen_reorder_out #(
-        .WIDTH       (DSP_WIDTH)
+        .WIDTH       (TRANSPOSE_WIDTH)
    ) dct1d_chen_reorder_out_dbg_i (
        .clk         (clk),                    // input
        .rst         (rst),                    // input
@@ -220,5 +248,7 @@ wire                 dbg_pre_first_out;
        .dv          (dbg_dv),                 // output reg 
        .en_out      (dbg_en_out)              // output reg 
    );
+`endif
+`endif    
 endmodule
--- a/dsp/dct_chen_transpose.v
+++ b/dsp/dct_chen_transpose.v
-/*******************************************************************************
+/*!
 * <b>Module:</b>dct_chen_transpose
 * @file dct_chen_transpose.v
- * @date:2016-06-09  
+ * @date 2016-06-09  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: Reorder+transpose data between two 1-d DCT passes
+ * @brief Reorder+transpose data between two 1-d DCT passes
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dct_chen_transpose#(
@@ -70,6 +70,7 @@ module  dct_chen_transpose#(
    reg [2*WIDTH-1:0] ram_reg2;
    wire              pre_rstart_w = wcntr[5:0] == 61;
    reg         [1:0] rstop_r;
+    reg               first_after_pause; // first block after pause - do not write 2 items to the "past"
    assign wpage = wcntr[6] ^ wrow_mod[3]; // previous page for row 0, col 1 & 3
    assign wrow_mod = {1'b0, wrow} - wcol13; 
@@ -93,7 +94,7 @@ module  dct_chen_transpose#(
        else if (pre_we_r)   wcntr <= wcntr + 1;        // including page, should be before 'if (pre2_start)'
        else if (pre2_start) wcntr <= {wcntr[6], 6'b0}; // if happens during pre_we_r - will be ignored, otherwise (after pause) will zero in-page address
-        we_r <= pre_we_r;
+        we_r <= pre_we_r && (!first_after_pause || !wcol13 || (|wrow)); // do not write first after pause to the "past"
        if (we_r) transpose_ram[waddr] <= din;
@@ -118,6 +119,11 @@ module  dct_chen_transpose#(
        if      (rst)        en_out <= 0;
        else if (rcntr == 1) en_out <= 1;
        else if (rstop_r[1]) en_out <= 0;
+        if      (rst)                 first_after_pause <= 0;
+        else if (pre2_start && !we_r) first_after_pause <= 1;
+        else if (&wcntr[5:0])         first_after_pause <= 0;
    end
    dly01_16 dly01_16_stop_i (

--- a/dsp/dsp_addsub_simd.v
+++ b/dsp/dsp_addsub_simd.v
-/*******************************************************************************
+/*!
 * <b>Module:</b>dsp_addsub_simd
 * @file dsp_addsub_simd.v
- * @date:2016-06-05  
+ * @date 2016-06-05  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: SIMD adder/subtracter
+ * @brief SIMD adder/subtracter
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dsp_addsub_simd#(
@@ -70,7 +70,7 @@ module  dsp_addsub_simd#(
    DSP48E1 #(
        .ACASCREG            (1),
-        .ADREG               (0), // (1),
+        .ADREG               (1),
        .ALUMODEREG          (1),
        .AREG                (1), // (1)
        .AUTORESET_PATDET    ("NO_RESET"),
@@ -81,7 +81,7 @@ module  dsp_addsub_simd#(
        .CARRYINREG          (1),
        .CARRYINSELREG       (1),
        .CREG                (1), //(1),
-        .DREG                (0), //(1),
+        .DREG                (1),
        .INMODEREG           (1),
        .IS_ALUMODE_INVERTED (4'b0),
        .IS_CARRYIN_INVERTED (1'b0),
@@ -131,7 +131,7 @@ module  dsp_addsub_simd#(
        .CECTRL         (1'b1),       // input
        .CED            (1'b0),       // input
        .CEINMODE       (1'b1),       // input
-        .CEM            (1'b1),       // input
+        .CEM            (1'b0),       // input
        .CEP            (cep),        // input
        .CLK            (clk),        // input
        .D              (25'h1ffffff),// input[24:0] 
@@ -145,9 +145,9 @@ module  dsp_addsub_simd#(
        .RSTB           (rst),        // input
        .RSTC           (rst),        // input
        .RSTCTRL        (rst),        // input
-        .RSTD           (rst),        // input
+        .RSTD           (1'b0),       // input
        .RSTINMODE      (rst),        // input
-        .RSTM           (rst),        // input
+        .RSTM           (1'b0),        // input
        .RSTP           (rst)         // input
    );
 `else

--- a/dsp/dsp_ma.v
+++ b/dsp/dsp_ma.v
-/*******************************************************************************
+/*!
 *  dsp_ma
 * @file dsp_ma.v
- * @date:2016-06-05  
+ * @date 2016-06-05  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: DSP with multi-input multiplier and accumulator
+ * @brief DSP with multi-input multiplier and accumulator
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dsp_ma #(

--- a/dsp/dsp_ma_preadd.v
+++ b/dsp/dsp_ma_preadd.v
-/*******************************************************************************
+/*!
 *  dsp_ma_preadd
 * @file dsp_ma_preadd.v
- * @date:2016-06-05  
+ * @date 2016-06-05  
- * @author: Andrey Filippov
+ * @author  Andrey Filippov
 *     
- * @brief: DSP with multi-input multiplier and accumulator with pre-adder
+ * @brief DSP with multi-input multiplier and accumulator with pre-adder
 *
 * @copyright Copyright (c) 2016 Elphel, Inc.
 *
@@ -35,7 +35,7 @@
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
- *******************************************************************************/
+ */
 `timescale 1ns/1ps
 module  dsp_ma_preadd #(

--- a/fpga_version.vh
+++ b/fpga_version.vh
@@ -35,21 +35,23 @@
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
 */
-          parameter FPGA_VERSION =          32'h03930096;    // serial, next
+        parameter FPGA_VERSION =          32'h03930098;    // serial, trying dct_chen - works, removing old completely
-//          parameter FPGA_VERSION =          32'h03930095;    // parallel  -0.068/-0.342/5 82.38%
+//      parameter FPGA_VERSION =          32'h03930097;    // serial, trying dct_chen - works
-//          parameter FPGA_VERSION =          32'h03930094;    // hispi, disabling debug  -0.187/-1.252/16 84.14%  
+//      parameter FPGA_VERSION =          32'h03930096;    // serial, next (before changing DCT)
-//        parameter FPGA_VERSION =          32'h03930093;    // hispi, masking sensor data to memory buffer, debug still on
+//      parameter FPGA_VERSION =          32'h03930095;    // parallel  -0.068/-0.342/5 82.38%
-//        parameter FPGA_VERSION =          32'h03930092;    // hispi, even more debugging memory pages sens-> memory
+//      parameter FPGA_VERSION =          32'h03930094;    // hispi, disabling debug  -0.187/-1.252/16 84.14%  
-//        parameter FPGA_VERSION =          32'h03930091;    // hispi, more debugging memory pages sens-> memory
+//      parameter FPGA_VERSION =          32'h03930093;    // hispi, masking sensor data to memory buffer, debug still on
-//        parameter FPGA_VERSION =          32'h03930090;    // hispi, debugging memory pages sens-> memory (not met)
+//      parameter FPGA_VERSION =          32'h03930092;    // hispi, even more debugging memory pages sens-> memory
-//        parameter FPGA_VERSION =          32'h0393008f;    // parallel, all the same
+//      parameter FPGA_VERSION =          32'h03930091;    // hispi, more debugging memory pages sens-> memory
-//        parameter FPGA_VERSION =          32'h0393008e;    // hispi, adding i2c fifo fill, all met,83.73%
+//      parameter FPGA_VERSION =          32'h03930090;    // hispi, debugging memory pages sens-> memory (not met)
-//        parameter FPGA_VERSION =          32'h0393008d;    // parallel, adding i2c fifo fill max err 0.128, 82.61%
+//      parameter FPGA_VERSION =          32'h0393008f;    // parallel, all the same
-//        parameter FPGA_VERSION =          32'h0393008c;      // hispi, all met, 83.55%
+//      parameter FPGA_VERSION =          32'h0393008e;    // hispi, adding i2c fifo fill, all met,83.73%
-//        parameter FPGA_VERSION =          32'h0393008b;    // parallel, all met, 82.06% . Reran 0.051ns error, 82.02%
+//      parameter FPGA_VERSION =          32'h0393008d;    // parallel, adding i2c fifo fill max err 0.128, 82.61%
-//        parameter FPGA_VERSION =          32'h0393008a;    // HiSPI sensor (14 MPix) no timing errors
+//      parameter FPGA_VERSION =          32'h0393008c;      // hispi, all met, 83.55%
-//        parameter FPGA_VERSION =          32'h03930089;    // Auto-synchronizing i2c sequencers with the command ones
+//      parameter FPGA_VERSION =          32'h0393008b;    // parallel, all met, 82.06% . Reran 0.051ns error, 82.02%
-//        parameter FPGA_VERSION =          32'h03930088;    // Fixing circbuf rollover pointers bug (only one path violated)
+//      parameter FPGA_VERSION =          32'h0393008a;    // HiSPI sensor (14 MPix) no timing errors
+//      parameter FPGA_VERSION =          32'h03930089;    // Auto-synchronizing i2c sequencers with the command ones
+//      parameter FPGA_VERSION =          32'h03930088;    // Fixing circbuf rollover pointers bug (only one path violated)
 //      parameter FPGA_VERSION =          32'h03930087;    // Fixed default 90% quantization table
 //      parameter FPGA_VERSION =          32'h03930087;    // Synchronizing i2c sequencer frame number with that of a command sequencer
 //      parameter FPGA_VERSION =          32'h03930086;    // Adding byte-wide JTAG read to speed-up 10359 load

--- a/py393/x393_jpeg.py
+++ b/py393/x393_jpeg.py
@@ -1104,6 +1104,10 @@ write_sensor_i2c  0 1 0 0x302e0010
 #Exposure 0x800 lines
 write_sensor_i2c  0 1 0 0x30120800
+#test - running 8, 8-bit
+write_sensor_i2c  0 1 0 0x30700101
 ################## Serial - chn3  ####################
 cd /usr/local/verilog/; test_mcntrl.py @hargs
 bitstream_set_path /usr/local/verilog/x393_hispi.bit
@@ -1124,7 +1128,7 @@ write_sensor_i2c  3 1 0 0x3028000a
 write_sensor_i2c  3 1 0 0x302c000d
 write_sensor_i2c  3 1 0 0x302e0010
 #exposure
-write_sensor_i2c  3 1 0 0x30120200
+write_sensor_i2c  3 1 0 0x30120800
 compressor_control 3 2

--- a/system_defines.vh
+++ b/system_defines.vh
@@ -42,6 +42,8 @@
  `define SYSTEM_DEFINES
  // TODO: Later compare instantiate/infer
  `define INSTANTIATE_DSP48E1
+  `define DEBUG_DCT1D // undefine after debugging is over
+//  `define USE_OLD_DCT
 // Parameters from x393_sata project
  `define USE_DRP

--- a/x393_1_7_0.Doxyfile
+++ b/x393_1_7_0.Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NUMBER = 1.0
 # If a relative path is entered, it will be relative to the location
 # where doxygen was started. If left blank the current directory will be used.
-OUTPUT_DIRECTORY = 
+OUTPUT_DIRECTORY = x393_docs
 # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
 # 4096 sub-directories (in 2 levels) under the output directory of each output
@@ -616,7 +616,7 @@ RECURSIVE = YES
 # excluded from the INPUT source files. This way you can easily exclude a
 # subdirectory from a directory tree whose root is specified with the INPUT tag.
-EXCLUDE = unisims ddr3 x353 debug helpers html py393 glbl.v IVERILOG_INCLUDE.v x393_sata/system_defines.vh x393_sata/top_tmp.v 
+EXCLUDE = unisims ddr3 x353 debug helpers html py393 glbl.v IVERILOG_INCLUDE.v x393_sata/system_defines.vh x393_sata/top_tmp.v
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix filesystem feature) are excluded

--- a/x393_testbench04.sav
+++ b/x393_testbench04.sav
 [*]
 [*] GTKWave Analyzer v3.3.66 (w)1999-2015 BSI
-[*] Sun Jun 12 10:04:58 2016
+[*] Mon Jun 13 02:28:45 2016
 [*]
-[dumpfile] "/home/andrey/git/x393/simulation/x393_testbench03-20160612033213998.fst"
+[dumpfile] "/home/andrey/git/x393/simulation/x393_testbench03-20160612183504062.fst"
-[dumpfile_mtime] "Sun Jun 12 09:48:19 2016"
+[dumpfile_mtime] "Mon Jun 13 00:51:06 2016"
-[dumpfile_size] 85326946
+[dumpfile_size] 85539825
 [savefile] "/home/andrey/git/x393/x393_testbench04.sav"
-[timestart] 90696800
+[timestart] 74900000
 [size] 1823 1180
 [pos] 0 0
-*-15.073349 90841667 209370000 209396667 209423333 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
+*-22.194141 94601000 209370000 209396667 209423333 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 [treeopen] x393_testbench03.
 [treeopen] x393_testbench03.read_compressor_frame_irq.
 [treeopen] x393_testbench03.read_contol_register_irq.
@@ -33,6 +33,11 @@
 [treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.
 [treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.xdct393_i.
 [treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].
+[treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.
+[treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.
+[treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.
+[treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.
+[treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.xdct393_i.
 [treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[2].
 [treeopen] x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[3].
 [treeopen] x393_testbench03.x393_i.compressor393_i.genblk3.
@@ -62,10 +67,10 @@
 [treeopen] x393_testbench03.x393_i.sensors393_i.sensor_channel_block[3].
 [treeopen] x393_testbench03.x393_i.sensors393_i.sensor_channel_block[3].sensor_channel_i.
 [treeopen] x393_testbench03.x393_i.sensors393_i.sensor_channel_block[3].sensor_channel_i.sensor_i2c_io_i.
-[sst_width] 238
+[sst_width] 395
-[signals_width] 260
+[signals_width] 338
 [sst_expanded] 1
-[sst_vpaned_height] 420
+[sst_vpaned_height] 421
 @820
 x393_testbench03.TEST_TITLE[639:0]
 @c00200
@@ -1600,7 +1605,164 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.xdct
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.xdct393_i.dv
 @420
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.xdct393_i.d_out[12:0]
+@200
+-dct_chen_out
+@420
+[color] 2
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.d_out[12:0]
+@800200
+-chn1
+@200
+-xdct
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.xdct393_i.start
+@420
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.xdct393_i.tm_di[15:0]
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.xdct393_i.d_out[12:0]
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.xdct_dout_debug[12:0]
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.xdct393_i.dv
+@200
+-dct_chen
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.start
+@420
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.d_out[12:0]
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dv
+@c08420
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(4)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(6)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(7)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(8)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(9)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(10)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(11)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+(12)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_dout_diff_debug[12:0]
+@1401200
+-group_end
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dbg_stage1_pre2_en_out
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.pre2_start_out
+@22
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1_out[15:0]
+@c00022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(4)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(6)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+(7)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase[7:0]
+@1401200
+-group_end
+@22
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.phase_cnt[2:0]
+@800200
+-transpose
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.pre2_start
+@c00022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+(4)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+(5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+(6)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
+@1401200
+-group_end
+@c00022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+(4)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+(5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+(6)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.waddr[6:0]
+@1401200
+-group_end
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.we_r
+@420
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.din[15:0]
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.dout_10_32_76_54[31:0]
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.en_out
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.rstop_r[1:0]
+@c00022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wrow_mod[3:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wrow_mod[3:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wrow_mod[3:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wrow_mod[3:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wrow_mod[3:0]
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcol13
+@22
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wrow[2:0]
+@1401200
+-group_end
+@c00022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+(4)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+(5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.raddr[5:0]
+@1401200
+-group_end
+@800200
+-debug_start_stop
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start4_first
+@29
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.buf_ready_w
+@8022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.dct_pipeline_delay_cntr[5:0]
+@800022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(4)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(6)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(7)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+(8)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.cmprs_macroblock_buf_iface_i.mb_pre_start[8:0]
+@1001200
+-group_end
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.mb_pre_end
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.mb_release_buf
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.mb_pre_start
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.mb_pre2_first_out
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.mb_pre_first_out
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.yc_pre_first_out
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[1].jp_channel_i.dct_start
+@200
+-
 @1000200
+-debug_start_stop
+-transpose
+-chn1
 -xdct
 @800200
 -dct_chen
@@ -2169,9 +2331,6 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 (23)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.simd_b4[23:0]
 @1401200
 -group_end
-@420
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.dout1_w[23:0]
-@1401200
 -ma1
 @c00200
 -ma2
@@ -2195,8 +2354,6 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.dsp_ma_seld_2
 @420
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.dsp_ma_p_2[47:0]
-[color] 2
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.dout2_w[23:0]
 @800200
 -dsp48e1
 @420
@@ -2243,6 +2400,25 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.start
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.en_in_r
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1_en
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.dv
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.pre2_start
+@22
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.cntr_in[3:0]
+@800022
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.raddr[3:0]
+@28
+(0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.raddr[3:0]
+(1)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.raddr[3:0]
+(2)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.raddr[3:0]
+(3)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.raddr[3:0]
+@1001200
+-group_end
+@28
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.start_out_r
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.en
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.en_out
+@22
+x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_i.per_type[2:0]
 @c00200
 -reorder_in
 @28
@@ -2270,15 +2446,9 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 -stage1_dbg
 @28
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.start
-@420
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.dout1_w[23:0]
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_stage1_i.dout2_w[23:0]
-@28
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_dbg_i.pre2_start
-@420
-[color] 3
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_dbg_i.din[23:0]
 @22
+[color] 3
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_dbg_i.cntr_in[3:0]
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1d_chen_reorder_out_dbg_i.waddr[3:0]
 @28
@@ -2289,13 +2459,8 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dbg_pre_first_out
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dbg_dv
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dbg_en_out
-@420
 [color] 6
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dbg_d_out[23:0]
-@28
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.stage1_pre2_start_out
-@22
-x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct1_out[23:0]
 @200
 -
 @1000200
@@ -2316,10 +2481,10 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.we_r
 @22
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.pre_rstart_w
-@800023
+@c00022
 [color] 2
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
-@29
+@28
 [color] 2
 (0)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
 [color] 2
@@ -2334,7 +2499,7 @@ x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2
 (5)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
 [color] 2
 (6)x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wcntr[6:0]
-@1001201
+@1401200
 -group_end
 @22
 x393_testbench03.x393_i.compressor393_i.cmprs_channel_block[0].jp_channel_i.dct2d8x8_chen_i.dct_chen_transpose_i.wpage