diff --git a/dct_tests_04.sav b/dct_tests_04.sav
new file mode 100644
index 0000000000000000000000000000000000000000..51c159e14e2e4ac41b23d33ad09d9e508d027083
--- /dev/null
+++ b/dct_tests_04.sav
@@ -0,0 +1,231 @@
+[*]
+[*] GTKWave Analyzer v3.3.78 (w)1999-2016 BSI
+[*] Wed Dec 13 06:00:34 2017
+[*]
+[dumpfile] "/home/eyesis/nc393/elphel393/fpga-elphel/x393_branch_dct/simulation/dct_tests_04-20171212192023266.fst"
+[dumpfile_mtime] "Wed Dec 13 02:20:23 2017"
+[dumpfile_size] 219185
+[savefile] "/home/eyesis/nc393/elphel393/fpga-elphel/x393_branch_dct/dct_tests_04.sav"
+[timestart] 1588000
+[size] 1814 1171
+[pos] 0 40
+*-15.492632 1795000 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
+[treeopen] dct_tests_03.
+[treeopen] dct_tests_03.dtt_iv_8x8_i.
+[treeopen] dct_tests_03.dtt_iv_8x8r_i.
+[sst_width] 318
+[signals_width] 284
+[sst_expanded] 1
+[sst_vpaned_height] 344
+@420
+dct_tests_03.i
+dct_tests_03.i1
+dct_tests_03.j
+@28
+dct_tests_03.CLK
+dct_tests_03.RST
+[color] 2
+dct_tests_03.start
+[color] 2
+dct_tests_03.start2
+@22
+dct_tests_03.mode_in[1:0]
+@8420
+dct_tests_03.x_in_2d[24:0]
+@22
+dct_tests_03.mode_out[1:0]
+@8420
+dct_tests_03.d_out_2dr[24:0]
+@800200
+-dtt_iv8x8_direct
+@28
+dct_tests_03.dtt_iv_8x8_i.rst
+dct_tests_03.dtt_iv_8x8_i.clk
+dct_tests_03.dtt_iv_8x8_i.start
+@22
+dct_tests_03.dtt_iv_8x8_i.mode[1:0]
+@28
+dct_tests_03.dtt_iv_8x8_i.pre_last_in
+dct_tests_03.dtt_iv_8x8_i.pre_busy
+@c00200
+-debug
+@28
+dct_tests_03.dtt_iv_8x8_i.transpose_start
+@22
+dct_tests_03.dtt_iv_8x8_i.transpose_debug_di[7:0]
+@8022
+dct_tests_03.dtt_iv_8x8_i.transpose_debug_di[7:0]
+@c00022
+dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(1)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(2)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(3)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(4)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(5)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(6)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(7)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+@1401200
+-group_end
+@c08022
+dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(1)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(2)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(3)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(4)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(5)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(6)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+(7)dct_tests_03.dtt_iv_8x8_i.transpose_wa[7:0]
+@1401200
+-group_end
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.transpose_we[1:0]
+dct_tests_03.dtt_iv_8x8_i.pre_dsth
+@8022
+dct_tests_03.dtt_iv_8x8_i.transpose_cntr[6:0]
+@22
+dct_tests_03.dtt_iv_8x8_i.transpose_ra[7:0]
+@28
+dct_tests_03.dtt_iv_8x8_i.dctv_start_0_r
+dct_tests_03.dtt_iv_8x8_i.dctv_start_1_r
+@800028
+dct_tests_03.dtt_iv_8x8_i.pre2_dstv[1:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.pre2_dstv[1:0]
+(1)dct_tests_03.dtt_iv_8x8_i.pre2_dstv[1:0]
+@1001200
+-group_end
+@c00028
+[color] 2
+dct_tests_03.dtt_iv_8x8_i.dctv_out_we[1:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.dctv_out_we[1:0]
+(1)dct_tests_03.dtt_iv_8x8_i.dctv_out_we[1:0]
+@1401200
+-group_end
+@28
+dct_tests_03.dtt_iv_8x8_i.pre_dstv
+@200
+-alt
+@28
+dct_tests_03.dtt_iv_8x8_i.dstv
+dct_tests_03.dtt_iv_8x8_i.out_sel
+dct_tests_03.dtt_iv_8x8_i.out_run
+@22
+dct_tests_03.dtt_iv_8x8_i.out_cntr[6:0]
+@28
+dct_tests_03.dtt_iv_8x8_i.out_sel
+@c00022
+dct_tests_03.dtt_iv_8x8_i.out_wa[3:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.out_wa[3:0]
+(1)dct_tests_03.dtt_iv_8x8_i.out_wa[3:0]
+(2)dct_tests_03.dtt_iv_8x8_i.out_wa[3:0]
+(3)dct_tests_03.dtt_iv_8x8_i.out_wa[3:0]
+@1401200
+-group_end
+@28
+dct_tests_03.dtt_iv_8x8_i.out_we
+dct_tests_03.dtt_iv_8x8_i.sub16
+dct_tests_03.dtt_iv_8x8_i.inc16
+dct_tests_03.dtt_iv_8x8_i.start_out
+@200
+-top
+@22
+dct_tests_03.out_ram_wa[4:0]
+@28
+dct_tests_03.out_ram_cntr
+dct_tests_03.out_ram_wah
+@22
+dct_tests_03.out_wa[3:0]
+@28
+dct_tests_03.out_we
+dct_tests_03.out_ram_ren
+dct_tests_03.out_ram_regen
+@22
+dct_tests_03.out_ram_ra[5:0]
+@28
+dct_tests_03.out_pre_first
+dct_tests_03.out_ram_dv
+@1401200
+-debug
+@22
+dct_tests_03.dtt_iv_8x8_i.mode_out[1:0]
+@28
+dct_tests_03.dtt_iv_8x8_i.pre_busy
+@c00200
+-direct_internal
+@28
+dct_tests_03.dtt_iv_8x8_i.dcth_en0
+dct_tests_03.dtt_iv_8x8_i.dcth_en1
+dct_tests_03.dtt_iv_8x8_i.dcth_start_0_r
+dct_tests_03.dtt_iv_8x8_i.dcth_start_1_r
+@22
+dct_tests_03.dtt_iv_8x8_i.mode[1:0]
+dct_tests_03.dtt_iv_8x8_i.mode_h[1:0]
+dct_tests_03.dtt_iv_8x8_i.mode_h_late[1:0]
+dct_tests_03.dtt_iv_8x8_i.mode_v[1:0]
+dct_tests_03.dtt_iv_8x8_i.mode_out[1:0]
+@28
+dct_tests_03.dtt_iv_8x8_i.dctv_start_0_w
+@22
+dct_tests_03.dtt_iv_8x8_i.dctv_start_1_w
+@800028
+dct_tests_03.dtt_iv_8x8_i.pre2_dsth[1:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.pre2_dsth[1:0]
+(1)dct_tests_03.dtt_iv_8x8_i.pre2_dsth[1:0]
+@1001200
+-group_end
+@200
+-
+@800028
+dct_tests_03.dtt_iv_8x8_i.pre2_dstv[1:0]
+@28
+(0)dct_tests_03.dtt_iv_8x8_i.pre2_dstv[1:0]
+(1)dct_tests_03.dtt_iv_8x8_i.pre2_dstv[1:0]
+@800200
+-g3
+@28
+dct_tests_03.dtt_iv_8x8_i.dct_iv8_1d_pass2_0_i.start
+dct_tests_03.dtt_iv_8x8_i.dct_iv8_1d_pass2_0_i.dst_in
+dct_tests_03.dtt_iv_8x8_i.dct_iv8_1d_pass2_0_i.dst_out
+@1000200
+-g3
+@28
+dct_tests_03.dtt_iv_8x8_i.dct_iv8_1d_pass2_1_i.start
+dct_tests_03.dtt_iv_8x8_i.dct_iv8_1d_pass2_1_i.dst_in
+dct_tests_03.dtt_iv_8x8_i.dct_iv8_1d_pass2_1_i.dst_out
+@200
+-
+@1001200
+-group_end
+@1401200
+-direct_internal
+@1000200
+-dtt_iv8x8_direct
+@800200
+-dtt_iv8x8_inv
+@29
+dct_tests_03.dtt_iv_8x8r_i.clk
+@28
+dct_tests_03.dtt_iv_8x8r_i.start
+dct_tests_03.dtt_iv_8x8r_i.mode[1:0]
+dct_tests_03.dtt_iv_8x8r_i.mode_out[1:0]
+@800200
+-inv_internals
+@200
+-
+@1000200
+-inv_internals
+@200
+-
+@1000200
+-dtt_iv8x8_inv
+@200
+-dbg
+[pattern_trace] 1
+[pattern_trace] 0
diff --git a/dsp/dct_tests_04.tf b/dsp/dct_tests_04.tf
new file mode 100644
index 0000000000000000000000000000000000000000..5ca21aaab80e7fb35ec7a20a2562a001d28d251a
--- /dev/null
+++ b/dsp/dct_tests_04.tf
@@ -0,0 +1,449 @@
+/*!
+ * Module:dct_tests_03
+ * @file dct_tests_04.tf
+ * @date 2016-12-02
+ * @author Andrey Filippov
+ *
+ * @brief 1d 8-point DCT type IV for lapped mdct 16->8, operates in 16 clock cycles
+ * Uses 2 DSP blocks
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * License:
+ *
+ *dct_tests_03.tf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dct_tests_03.tf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ */
+`timescale 1ns/1ps
+// No saturation here, and no rounding as we do not need to match decoder (be bit-precise), skipping rounding adder
+// will reduce needed resources
+//`define DCT_INPUT_UNITY
+module dct_tests_03 ();
+// parameter fstname="dct_tests_03.fst";
+`ifdef IVERILOG
+ `ifdef NON_VDT_ENVIROMENT
+ parameter fstname="dct_tests_03.fst";
+ `else
+ `include "IVERILOG_INCLUDE.v"
+ `endif // NON_VDT_ENVIROMENT
+`else // IVERILOG
+ `ifdef CVC
+ `ifdef NON_VDT_ENVIROMENT
+ parameter fstname = "x393.fst";
+ `else // NON_VDT_ENVIROMENT
+ `include "IVERILOG_INCLUDE.v"
+ `endif // NON_VDT_ENVIROMENT
+ `else
+ parameter fstname = "dct_tests_03.fst";
+ `endif // CVC
+`endif // IVERILOG
+
+ parameter CLK_PERIOD = 10; // ns
+ parameter WIDTH = 25; //4; // input data width
+ parameter OUT_WIDTH = 25; //4; // output data width
+ parameter TRANSPOSE_WIDTH = 25; //4; // width of the transpose memory (intermediate results)
+ parameter OUT_RSHIFT = 2; // overall right shift of the result from input, aligned by MSB (>=3 will never cause saturation)
+ parameter OUT_RSHIFT2 = 0; // overall right shift for the second (vertical) pass
+
+ parameter DCT_GAP = 16; // between runs
+
+ parameter SAME_BITS=4; // (3) to match 24-bit widths
+
+ reg RST = 1'b1;
+ reg CLK = 1'b0;
+ reg [3:0] phase_in;
+ reg [3:0] phase_out;
+ reg run_in;
+ reg run_out;
+ reg run_out_d;
+
+ reg en_x = 0;
+// reg end_x = 0;
+ reg [2:0] x_ra;
+ wire [2:0] x_wa = phase_in[2:0];
+
+
+ wire x_we = !phase_in[3] && run_in;
+ reg [WIDTH-1:0] x_in;
+ reg [WIDTH-1:0] x_in_2d;
+ reg [WIDTH-1:0] x_out;
+ reg [WIDTH-1:0] x_ram[0:7];
+ wire [WIDTH-1:0] x_out_w = x_ram[x_ra];
+
+ reg start = 0;
+ reg start2 = 0; // second start for 2d
+ reg [1:0] mode_in= 0; // 3; // [0] - vertical pass 0: dct, 1 - dst, [1] - horizontal pass
+ wire [1:0] mode_out; // [0] - vertical pass 0: dct, 1 - dst, [1] - horizontal pass
+
+ wire [OUT_WIDTH-1:0] y_dct;
+ wire pre2_start_out;
+ wire en_out;
+
+ reg y_pre_we;
+ reg y_we;
+ reg [3:0] phase_y=8;
+ reg [2:0] y_wa;
+ reg [2:0] y_ra;
+ reg y_dv=0;
+ reg signed [OUT_WIDTH-1:0] y_ram[0:7];
+ wire signed [OUT_WIDTH-1:0] y_out = y_ram[y_ra]; // SuppressThisWarning VEditor - simulation only
+ reg signed [WIDTH-1:0] data_in[0:63];
+ reg signed [OUT_WIDTH-1:0] data_out[0:63];
+
+ wire pre_last_in_2d; // SuppressThisWarning VEditor - simulation only
+ wire pre_first_out_2d; // SuppressThisWarning VEditor - simulation only
+ wire pre_busy_2d; // SuppressThisWarning VEditor - simulation only
+ wire dv_2d; // SuppressThisWarning VEditor - simulation only
+// wire signed [OUT_WIDTH-1:0] d_out_2d;
+
+ wire pre_last_in_2dr; // SuppressThisWarning VEditor - simulation only
+ wire pre_first_out_2dr; // SuppressThisWarning VEditor - simulation only
+ wire pre_busy_2dr; // SuppressThisWarning VEditor - simulation only
+ wire dv_2dr; // SuppressThisWarning VEditor - simulation only
+ wire signed [OUT_WIDTH-1:0] d_out_2dr; // SuppressThisWarning VEditor - simulation only
+
+
+ integer i,j, i1, ir;
+ initial begin // fill data_in[] (64 samples) and print it in line-scan and transposed order
+ for (i=0; i<64; i=i+1) begin
+ `ifdef DCT_INPUT_UNITY
+ data_in[i] = (i[2:0] == (i[5:3] ^ 3'h0)) ? {2'b1,{WIDTH-2{1'b0}}} : 0; // overwritten two lines below by ir
+ ir= (i[2:0] == (i[5:3] ^ 3'h1)) ? {2'b1,{WIDTH-2{1'b0}}} : 0;
+ data_in[i] = ir;
+ `else
+ ir = $random;
+ data_in[i] = ((i[5:3] == 0) || (i[5:3] == 7) || (i[2:0] == 0) || (i[2:0] == 7))? 0: // zero where either 3-bit index is 0 or 7
+ {{SAME_BITS{ir[WIDTH -SAME_BITS - 1]}},ir[WIDTH -SAME_BITS-1:0]}; // low random bits, top SAME_BITS replicate the sign bit
+ `endif
+ end
+ $display("Input data in line-scan order:");
+ for (i=0; i<64; i=i+8) begin // one line per group of 8 consecutive entries
+ $display ("%d, %d, %d, %d, %d, %d, %d, %d",data_in[i+0],data_in[i+1],data_in[i+2],data_in[i+3],
+ data_in[i+4],data_in[i+5],data_in[i+6],data_in[i+7]);
+ end
+ $display("");
+ $display("Input data - transposed:");
+ j=0;
+ for (i=0; i < 8; i=i+1) begin // one line per entries i, i+8, ..., i+56
+ $display ("%d, %d, %d, %d, %d, %d, %d, %d",data_in[i+ 0],data_in[i+ 8],data_in[i+16],data_in[i+24],
+ data_in[i+32],data_in[i+40],data_in[i+48],data_in[i+56]);
+ end
+ $display("");
+
+ end
+
+ always #(CLK_PERIOD/2) CLK = ~CLK;
+ initial begin // 1-D stimulus: opens the dump, releases RST, feeds data_in[] to x_in, then prints data_out[]
+ $dumpfile(fstname);
+ $dumpvars(0,dct_tests_03); // SuppressThisWarning VEditor
+ #100;
+ RST = 0; // release reset after 100 time units
+ #100;
+ repeat (10) @(posedge CLK);
+#1 en_x = 1;
+ for (i = 0; i < 64; i = i+1) begin
+ @(posedge CLK);
+ #1;
+ x_in = data_in[i]; // >>x_wa;
+ if (i==63) begin
+ en_x = 0; // last sample has been presented
+ end
+ if (&i[2:0]) repeat (8) @(posedge CLK); // after every 8th sample wait 8 more clocks
+ end
+ #1 x_in = 0;
+ repeat (64) @(posedge CLK);
+ // print data_out[], 8 consecutive entries per line
+ $display("");
+ $display("output data - transposed:");
+ for (i=0; i<64; i=i+8) begin
+ $display ("%d, %d, %d, %d, %d, %d, %d, %d",data_out[i+0],data_out[i+1],data_out[i+2],data_out[i+3],
+ data_out[i+4],data_out[i+5],data_out[i+6],data_out[i+7]);
+ end
+
+// repeat (64) @(posedge CLK);
+// $finish;
+ end
+
+ initial begin // 2-D stimulus: drives x_in_2d, mode_in and start2, then ends the simulation
+ wait (!RST);
+ while (!start) begin // wait until start is seen high (sampled 1 time unit after a clock edge)
+ @(posedge CLK);
+ #1;
+ end
+ for (i1 = 0; i1 < 192; i1 = i1+1) begin // three back-to-back passes over the 64 entries of data_in[]
+ @(posedge CLK);
+ #1;
+ x_in_2d = data_in[i1 & 63];
+ if ((i1 & 63) == 0) mode_in = mode_in+1; // increment mode_in at the first sample of each pass
+ start2 = (i1 & 63) == 63; // high only while the last sample of a pass is presented
+ end
+ for (i1 = 0; i1 < 64; i1 = i1+1) begin // fourth pass, data_in[] in the same order, mode_in unchanged
+ @(posedge CLK);
+ #1;
+ start2 = 0;
+ x_in_2d = data_in[i1];
+ end
+
+ repeat (DCT_GAP) @(posedge CLK); // pause of DCT_GAP clocks before the next pass
+ #1;
+ start2 = 1;
+ for (i1 = 0; i1 < 64; i1 = i1+1) begin // final pass reads data_in[] in reverse order
+ @(posedge CLK);
+ #1;
+ start2 = 0;
+ x_in_2d = data_in[63-i1];
+ end
+
+ repeat (300) @(posedge CLK); // presumably lets the remaining outputs drain - consumers are not visible here
+ $finish;
+
+ end
+
+
+ initial j = 0;
+ always @ (posedge CLK) begin // simulation only: store y_out into data_out[] while y_dv is high
+ if (y_dv) begin
+//$display (" y[0x%x] => 0x%x %d, j=%d @%t",y_ra,y_out,y_out,j,$time);
+ data_out[{j[2:0],j[5:3]}] = y_out; // transpose array
+ #1 j = j+1; // count stored samples; the two 3-bit fields of j are swapped to form the index above
+ end
+ end
+
+
+
+ always @ (posedge CLK) begin // testbench sequencer for the dtt_iv8_1d instance (simulation only)
+ if (RST) run_in <= 0;
+ else if (en_x) run_in <= 1;
+ else if (phase_in == 15) run_in <= 0;
+ // run_out takes the value of run_in when phase_in is 5 or phase_out is 15
+ if (RST) run_out <= 0;
+ else if ((phase_in == 5) || (phase_out==15)) run_out <= run_in;
+ // each phase counter is held at 0 while its run flag is low
+ if (!run_in) phase_in <= 0;
+ else phase_in <= phase_in + 1;
+
+ if (!run_out) phase_out <= 0;
+ else phase_out <= phase_out + 1;
+
+ run_out_d <= run_out;
+ // start is a one-clock pulse on the rising edge of run_out
+ if (RST) start <= 0;
+ else start <= run_out & !run_out_d;
+ // y_pre_we and y_we are en_out delayed by one and two clocks
+ {y_we,y_pre_we} <= {y_pre_we, en_out};
+ // phase_y resets to 8, is zeroed by pre2_start_out and increments while y_pre_we is high
+ if (RST) phase_y <= 8;
+ else if (pre2_start_out) phase_y <= 0;
+ else if (y_pre_we) phase_y <= phase_y + 1;
+ // y_dv is set on a write with phase_y == 6 and cleared once y_ra reaches 7
+ if (RST) y_dv <= 0;
+ else if ((phase_y == 6) && y_we) y_dv <= 1;
+ else if (y_ra == 7) y_dv <= 0;
+
+ if (!y_dv) y_ra <= 0;
+ else y_ra <= y_ra + 1;
+
+ if (y_we) y_ram[y_wa] <= y_dct;
+
+
+ if (x_we) x_ram[x_wa] <= x_in;
+
+ x_out <= x_out_w;
+//X2-X7-X3-X4-X5-X6-X0-X1-*-X3-X5-X4-*-X1-X7-*
+ case (phase_out)
+ 4'h0: x_ra <= 2;
+ 4'h1: x_ra <= 7;
+ 4'h2: x_ra <= 3;
+ 4'h3: x_ra <= 4;
+ 4'h4: x_ra <= 5;
+ 4'h5: x_ra <= 6;
+ 4'h6: x_ra <= 0;
+ 4'h7: x_ra <= 1;
+ 4'h8: x_ra <= 'bx;
+ 4'h9: x_ra <= 3;
+ 4'ha: x_ra <= 5;
+ 4'hb: x_ra <= 4;
+ 4'hc: x_ra <= 'bx;
+ 4'hd: x_ra <= 1; // X1, as listed in the read-order comment above (was 6, which contradicted it)
+ 4'he: x_ra <= 7;
+ 4'hf: x_ra <= 'bx;
+ endcase
+ // y_ram write address selected by phase_y[2:0]
+ case (phase_y[2:0])
+ 3'h0: y_wa <= 0;
+ 3'h1: y_wa <= 7;
+ 3'h2: y_wa <= 4;
+ 3'h3: y_wa <= 3;
+ 3'h4: y_wa <= 1;
+ 3'h5: y_wa <= 6;
+ 3'h6: y_wa <= 2;
+ 3'h7: y_wa <= 5;
+ endcase
+
+ end
+
+ dtt_iv8_1d #( // 1-D transform instance driven by the sequencer signals of this testbench
+ .WIDTH (WIDTH),
+ .OUT_WIDTH (OUT_WIDTH),
+ .OUT_RSHIFT (OUT_RSHIFT),
+ .B_WIDTH (18),
+ .A_WIDTH (25),
+ .P_WIDTH (48),
+ .COSINE_SHIFT (17), // the COS_nn_32 values below equal round(cos(nn*pi/32) * 2**17)
+ .COS_01_32 (130441),
+ .COS_03_32 (125428),
+ .COS_04_32 (121095),
+ .COS_05_32 (115595),
+ .COS_07_32 (101320),
+ .COS_08_32 (92682),
+ .COS_09_32 (83151),
+ .COS_11_32 (61787),
+ .COS_12_32 (50159),
+ .COS_13_32 (38048),
+ .COS_15_32 (12847)
+ ) dtt_iv8_1d_i (
+ .clk (CLK), // input
+ .rst (RST), // input
+ .en (run_in), // input
+ .dst_in (mode_in[1]), // input
+ .d_in (x_out), // input, x_out is declared [WIDTH-1:0] (25 bits with the defaults here)
+ .start (start), // input
+ .dout (y_dct), // output, y_dct is declared [OUT_WIDTH-1:0] (25 bits with the defaults here)
+ .pre2_start_out (pre2_start_out), // output reg
+ .en_out (en_out), // output reg
+ .dst_out (), // output, left unconnected
+ .y_index () // output[2:0] reg, left unconnected
+ );
+ parameter ODEPTH = 5;
+ reg signed [OUT_WIDTH-1:0] out_ram[0: ((1<