Added modified xdct393 module to increase clock frequency

ba35eb7a · Andrey Filippov · 30fae557 · ba35eb7a
Commit ba35eb7a authored Nov 01, 2015 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 660 additions and 0 deletions

xdct393r.v compressor_jp/xdct393r.v +660 -0

No files found.
--- a/compressor_jp/xdct393r.v
+++ b/compressor_jp/xdct393r.v
+/**********************************************************************
+** -----------------------------------------------------------------------------**
+** xdct393r.v
+**
+** 8x8 discrete Cosine Transform
+** adding more registers to increase bandwidth
+**
+** Copyright (C) 2002-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  xdct393r is free software - hardware description language (HDL) code.
+** 
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+** Modified by Andrey Filippov - goal to make it work in start/stop mode, using
+** "start" input (going together with the first data), no restriction on the gap between 64-pixel blocks (>=0)
+** Removed "RST" input ("en" is only used to reset ping-pong transpose memory address)
+** Split module in 2 stages
+** Also saved some area - original design compiled by XST to 865 slices (XC2S300e), this one - 780!
+**
+** It is based on the original design (Xilinx app. note XAPP610) by:
+**                  Author: Latha Pillai
+**                  Senior Applications Engineer
+**
+**                  Video Applications
+**                  Advanced Products Group
+**                  Xilinx, Inc.
+**
+**                  Copyright (c) 2001 Xilinx, Inc.
+**                  All rights reserved
+**
+**                  Date:   Feb. 10, 2002
+**
+**                  RESTRICTED RIGHTS LEGEND
+**
+**      This software has not been published by the author, and 
+**      has been disclosed to others for the purpose of enhancing 
+**      and promoting design productivity in Xilinx products.
+**
+**      Therefore use, duplication or disclosure, now and in the 
+**      future should give consideration to the productivity 
+**      enhancements afforded the user of this code by the author's 
+**      efforts.  Thank you for using our products !
+**
+** Disclaimer:  THESE DESIGNS ARE PROVIDED "AS IS" WITH NO WARRANTY 
+**              WHATSOEVER AND XILINX SPECIFICALLY DISCLAIMS ANY 
+**              IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR
+**              A PARTICULAR PURPOSE, OR AGAINST INFRINGEMENT.
+***********************************************************************/
+
+/*
+after I added DC subtraction before DCT I got 9-bit (although not likely to go out of 8-bit range) signed data.
+also increased transpose memory to 9 bits (anyway it is 16-bit wide) - see if it will help to prevent saturation
+without significant increase in gates
+
+Saturation is still visible on real pictures, but there was a bug - addsub<i>a_comp, addsub<i>b_comp were not using their
+MSB. I added 1 more bit to add_sub<i>a and add_sub<i>b and fixed that bug. Only 2 more slices were used
+*/
+
+
+
+`timescale 1ns/1ps
+
+// For xdct353 - increasing data in 9 bits -> 10 bits, out 12 bits ->13 bits
+
+// xdct393r - top level of the 8x8 DCT: dct393r_stage1 writes its results into a transpose
+// memory (ram18_var_w_var_r), dct393r_stage2 reads them back in transposed order.
+// Original note for this variant: "increased latency by 3".
+module xdct393r   (
+    input         clk,           // system clock, posedge
+    input         en,            // if zero will reset transpose memory page numbers
+    input         start,         // single-cycle pulse that goes with the first pixel; the other 63 should follow
+    input   [9:0] xin,           // [9:0] input data
+    output reg    last_in,       // registered (tm_wa[5:0] == 6'h30); original note: high during input of the
+                                 // last of the 64 pixels of an 8x8 block
+    output        pre_first_out, // 1 cycle ahead of the first output in a 64 block (stage 2 "endv")
+    output        dv,            // data output valid (original note said 94-th cycle after start;
+                                 // NOTE(review): this variant adds latency - confirm the exact count)
+    output [12:0] d_out);        // [12:0] output data
+
+    // ---- transpose memory, write side (driven by stage 1) ----
+    wire   [15:0] tm_di;         // data to be written
+    wire    [6:0] tm_wa;         // write address
+    wire          tm_we;         // write enable
+
+    // ---- transpose memory, read side (driven by stage 2) ----
+    wire   [15:0] tm_out;        // data read back
+    wire    [6:0] tm_ra;         // read address
+    wire          tm_re;         // read enable
+    wire          tm_regen;      // output register enable
+
+    // ---- stage 1 -> stage 2 hand-over ----
+    wire          stage1_done;   // starts stage 2
+    wire          tm_page;       // page that stage 1 has just filled
+
+    // Flag derived from the low 6 bits of the memory write address
+    always @ (posedge clk) last_in <= (tm_wa[5:0] == 6'h30);
+
+    // First pass of the transform
+    dct393r_stage1 i_dct_stage1(
+        .clk       (clk),
+        .en        (en),
+        .start     (start),
+        .xin       (xin),
+        .z_out     (tm_di),
+        .wr_cntr   (tm_wa),
+        .we        (tm_we),
+        .page      (tm_page),
+        .done      (stage1_done));
+
+    // Second pass of the transform, fed from the transpose memory
+    dct393r_stage2 i_dct_stage2(
+        .clk       (clk),
+        .en        (en),
+        .start     (stage1_done),
+        .page      (tm_page),
+        .tdin      (tm_out),
+        .rd_cntr   (tm_ra),
+        .ren       (tm_re),
+        .regen     (tm_regen),
+        .endv      (pre_first_out),
+        .dv        (dv),
+        .dct2_out  (d_out));
+
+    // Transpose memory: 16-bit wide on both ports, 7-bit addresses zero-extended to 10 bits,
+    // both ports clocked by clk
+    ram18_var_w_var_r #(
+        .REGISTERS     (1),
+        .LOG2WIDTH_WR  (4),
+        .LOG2WIDTH_RD  (4),
+        .DUMMY         (0)
+    ) i_transpose_mem (
+        // write port
+        .wclk      (clk),
+        .waddr     ({3'b000, tm_wa}),
+        .we        (tm_we),
+        .web       (4'b1111),
+        .data_in   (tm_di),
+        // read port
+        .rclk      (clk),
+        .raddr     ({3'b000, tm_ra}),
+        .ren       (tm_re),
+        .regen     (tm_regen),
+        .data_out  (tm_out)
+    );
+
+endmodule
+
+// 01/24/2004: Moved all clocks in stage 1 to "negedge" to reduce current pulses
+
+// dct393r_stage1 - first pass of the 8x8 DCT.
+// Samples are shifted through an 8-deep register chain, captured 8 at a time (sxregs), combined
+// into sums/differences of mirrored pairs (x7+-x0, x6+-x1, x5+-x2, x4+-x3), multiplied by the
+// coefficient set selected by indexi, and the four signed products are added. The rounded
+// 16-bit result goes to the transpose memory together with we / wr_cntr.
+// Internal registers run on the falling edge of clk (nclk = ~clk); z_out is re-registered on
+// posedge clk and the control outputs pass through a dly_16 instance clocked by clk.
+module dct393r_stage1 ( // increased latency by 1
+    input             clk,           // system clock, posedge
+    input             en,            // low: clears the write counter and releases the we/toggle hold terms
+    input             start,      // single-cycle start pulse to replace RST
+    input      [ 9:0] xin,      // [9:0] input samples
+    output            we,          // write to transpose memory
+    output     [ 6:0] wr_cntr, // [6:0]    transpose memory write address
+    output reg [15:0] z_out,      //data to transpose memory
+    output            page,    // transpose memory page just filled (valid @ done)
+    output            done);   // last cycle writing to transpose memory - may use after it (move it earlier?)
+/* constants */
+// 16-bit unsigned magnitudes; the values correspond to round(65536*cos(k*pi/16)) for Ck and
+// round(65536*sin(k*pi/16)) for Sk. The sign is carried separately in bit [16] of memory<i>a.
+
+    localparam C3= 16'd54491;
+    localparam S3= 16'd36410;
+    localparam C4= 16'd46341;
+    localparam C6= 16'd25080;
+    localparam S6= 16'd60547;
+    localparam C7= 16'd12785;
+    localparam S7= 16'd64277;
+
+    // currently selected coefficients: {sign, magnitude[15:0]}
+    reg    [16:0] memory1a, memory2a, memory3a, memory4a;
+
+
+/* 1D section */
+/* The max value of a pixel after processing (to make their expected mean to zero)
+is 127. If all the values in a row are 127, the max value of the product terms
+would be (127*2)*(23170/256) and that of z_out_int would be (127*8)*23170/256.
+This value divided by 2raised to 8 is equivalent to ignoring the 8 lsb bits of the value */
+// NOTE(review): the paragraph above is kept from the original 8-bit design; the numbers do not
+// describe the 10-bit input / 16-bit coefficient widths used below.
+
+    reg    [ 9:0] xa0_in, xa1_in, xa2_in, xa3_in, xa4_in, xa5_in, xa6_in, xa7_in;         // input shift chain
+    reg    [ 9:0] xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg; // snapshot taken on sxregs
+
+    reg    [ 9:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp; // magnitude of add_sub<i>a (low 10 bits)
+//    reg    [10:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp; // AF2015: increasing width - was limiting
+
+    reg    [10:0] add_sub1a, add_sub2a, add_sub3a, add_sub4a; // signed sum/difference of sign-extended inputs
+    reg           save_sign1a, save_sign2a, save_sign3a, save_sign4a; // sign of add_sub<i>a
+    reg    [17:0] p1a, p2a, p3a, p4a;                         // signed products
+    wire   [35:0] p1a_all, p2a_all, p3a_all, p4a_all;         // unsigned magnitude products
+    reg           toggleA;                                    // 1 - add, 0 - subtract
+
+    reg    [18:0] z_out_int1, z_out_int2;
+    reg    [18:0] z_out_int;
+    wire   [15:0] z_out_prelatch;
+    reg    [ 2:0] indexi;                                     // coefficient set selector
+
+/* clks and counters */
+    reg    [ 6:0] wr_cntr_prelatch;
+
+/* memory section */
+    reg           done_prelatch;
+    reg           we_prelatch;
+    wire          enwe;
+    wire          pre_sxregs;
+    reg           sxregs;
+    reg           page_prelatch;
+    // TODO: See if negedge is needed
+    wire          nclk = ~clk; // seems that everything here is running at negedge (and delays too), but not the transpose memory
+
+// to conserve energy by disabling toggleA
+
+    wire          sxregs_d8;
+    reg           enable_toggle;
+// The dly_16 instances replace the SRL16 primitives shown in the commented-out lines; per the
+// inline notes a .dly(N) setting gives N+1 clocks of delay (dly_16 itself is defined elsewhere).
+//  SRL16_1 i_sxregs_d8   (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs));    // dly=7+1
+    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(nclk),.rst(1'b0), .dly(4'd7), .din(sxregs), .dout(sxregs_d8));   // dly=7+1
+    
+    
+// SRL16_1 i_pre_sxregs (.Q(pre_sxregs), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(start));    // dly=6+1
+    dly_16 #(.WIDTH(1)) i_pre_sxregs(.clk(nclk),.rst(1'b0), .dly(4'd6), .din(start), .dout(pre_sxregs));    // dly=6+1
+    
+// SRL16_1 i_enwe       (.Q(enwe), .A0(1'b1), .A1(1'b0), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(pre_sxregs));    // dly=5+1
+    dly_16 #(.WIDTH(1)) i_enwe(.clk(nclk),.rst(1'b0), .dly(4'd5), .din(pre_sxregs), .dout(enwe));    // dly=5+1
+  
+    // Control sequencing (falling edge of clk)
+    always @ (posedge nclk) begin
+        // set by sxregs, held until the delayed sxregs_d8 arrives, forced low when en is low
+        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));
+        
+        // follows the cycle in which the low 6 address bits are all ones
+        done_prelatch<= (wr_cntr_prelatch[5:0]==6'h3f);
+        
+        // remember the page (address bit 6) of the block whose last word is being written
+        if (wr_cntr_prelatch[5:0]==6'h3f) page_prelatch <= wr_cntr_prelatch[6];
+        
+        // starts on enwe, stays on (while en) until the low 6 address bits reach 0x3f
+        we_prelatch<= enwe || (en && we_prelatch && (wr_cntr_prelatch[5:0]!=6'h3f));
+        
+        // 7-bit write address: cleared by !en, otherwise advances while writing
+        if     (!en) wr_cntr_prelatch <= 7'b0;
+        else if (we_prelatch) wr_cntr_prelatch <= wr_cntr_prelatch + 1;
+        
+        // capture strobe: delayed start, then whenever address[2:0]==1 except when address[5:3]==7
+        sxregs <= pre_sxregs || ((wr_cntr_prelatch[2:0]==3'h1) && (wr_cntr_prelatch[5:3]!=3'h7));
+        
+        // forced to 1 on sxregs, then alternates every clock while enable_toggle is set
+        toggleA <= sxregs || (enable_toggle && (~toggleA));
+        
+        // preset to 7 on sxregs, then counts (3-bit wrap) while enable_toggle is set
+        if (sxregs) indexi <= 3'h7;
+        else if (enable_toggle) indexi<=indexi+1;
+    end
+
+
+/*  1D-DCT BEGIN */
+
+// store  1D-DCT constant coefficient values for multipliers */
+// Bit [16] is the sign (1 - negative), [15:0] the magnitude. The trailing "8'd..." remarks are
+// the coefficient values of the earlier 8-bit version.
+
+    always @ (posedge nclk) begin
+        case (indexi)
+         0 : begin memory1a <= {1'b0,C4}; //8'd91
+                   memory2a <= {1'b0,C4}; //8'd91
+                   memory3a <= {1'b0,C4}; //8'd91 
+                   memory4a <= {1'b0,C4}; //8'd91
+             end
+         1 : begin memory1a <= {1'b0,S7}; //8'd126; 
+                   memory2a <= {1'b0,C3}; //8'd106;  
+                   memory3a <= {1'b0,S3}; //8'd71;  
+                   memory4a <= {1'b0,C7}; //8'd25;
+             end
+         2 : begin memory1a <= {1'b0,S6}; //8'd118; 
+                   memory2a <= {1'b0,C6}; //8'd49;  
+                   memory3a <= {1'b1,C6}; //-8'd49; 
+                   memory4a <= {1'b1,S6}; //-8'd118
+             end
+         3 : begin memory1a <= {1'b0,C3}; // 8'd106; 
+                   memory2a <= {1'b1,C7}; //-8'd25;  
+                   memory3a <= {1'b1,S7}; //-8'd126; 
+                   memory4a <= {1'b1,S3}; //-8'd71;
+             end
+         4 : begin memory1a <= {1'b0,C4}; // 8'd91; 
+                   memory2a <= {1'b1,C4}; //-8'd91; 
+                   memory3a <= {1'b1,C4}; //-8'd91; 
+                   memory4a <= {1'b0,C4}; // 8'd91;
+             end
+         5 : begin memory1a <= {1'b0,S3}; // 8'd71; 
+                   memory2a <= {1'b1,S7}; //-8'd126; 
+                   memory3a <= {1'b0,C7}; // 8'd25;   
+                   memory4a <= {1'b0,C3}; // 8'd106;
+             end
+         6 : begin memory1a <= {1'b0,C6}; // 8'd49; 
+                   memory2a <= {1'b1,S6}; //-8'd118; 
+                   memory3a <= {1'b0,S6}; // 8'd118;  
+                   memory4a <= {1'b1,C6}; //-8'd49;
+             end
+         7 : begin memory1a <= {1'b0,C7}; // 8'd25;  
+                   memory2a <= {1'b1,S3}; //-8'd71; 
+                   memory3a <= {1'b0,C3}; // 8'd106;  
+                   memory4a <= {1'b1,S7}; //-8'd126;
+             end
+        endcase
+    end
+
+/* 10-bit input shifted 8 times through a shift register*/
+// xa0_in will see output registers from posedge, may be replaced by latches if needed - but currently delay is under 5ns
+    always @ (posedge nclk) begin
+        xa0_in <= xin;
+        xa1_in <= xa0_in;
+        xa2_in <= xa1_in;
+        xa3_in <= xa2_in;
+        xa4_in <= xa3_in;
+        xa5_in <= xa4_in;
+        xa6_in <= xa5_in;
+        xa7_in <= xa6_in;
+    end
+
+/* shifted inputs registered when sxregs is high (original note: every 8th clk, using cntr8)*/
+    always @ (posedge nclk) if (sxregs) begin 
+        xa0_reg <= xa0_in;
+        xa1_reg <= xa1_in; 
+        xa2_reg <= xa2_in;
+        xa3_reg <= xa3_in;
+        xa4_reg <= xa4_in;
+        xa5_reg <= xa5_in; 
+        xa6_reg <= xa6_in;
+        xa7_reg <= xa7_in;
+    end
+
+/* adder / subtractor block */
+// Inputs are sign-extended from 10 to 11 bits. toggleA==1 selects the sums, otherwise the
+// differences (x7-x0, x6-x1, x5-x2, x4-x3).
+// NOTE(review): this block is written as "negedge clk" while its neighbours use
+// "posedge nclk" (nclk = ~clk) - the same clock edge spelled differently.
+    always @ (negedge clk)
+        if (toggleA == 1'b1) begin
+            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} + {xa0_reg[9],xa0_reg[9:0]};
+            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} + {xa1_reg[9],xa1_reg[9:0]};
+            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} + {xa2_reg[9],xa2_reg[9:0]};
+            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} + {xa3_reg[9],xa3_reg[9:0]};
+        end else begin
+            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} - {xa0_reg[9],xa0_reg[9:0]};
+            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} - {xa1_reg[9],xa1_reg[9:0]};
+            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} - {xa2_reg[9],xa2_reg[9:0]};
+            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} - {xa3_reg[9],xa3_reg[9:0]};
+        end
+
+// First valid add_sub appears at the 10th clk (8 clks for shifting inputs,
+// 9th clk for registering shifted input and 10th clk for add_sub
+// to synchronize the i value to the add_sub value, i value is incremented
+// only after 10 clks
+
+// Adding these wires to get rid of the MSB that is always 0
+// (absolute value of add_sub<i>a; only bits [9:0] are registered below, so a magnitude of
+// exactly 1024 - which needs bit [10] - would be stored as 0)
+    wire [10:0] addsub1a_comp_w  = add_sub1a[10]? (-add_sub1a) : add_sub1a;
+    wire [10:0] addsub2a_comp_w  = add_sub2a[10]? (-add_sub2a) : add_sub2a;
+    wire [10:0] addsub3a_comp_w  = add_sub3a[10]? (-add_sub3a) : add_sub3a;
+    wire [10:0] addsub4a_comp_w  = add_sub4a[10]? (-add_sub4a) : add_sub4a;
+    
+    // split each add_sub<i>a into sign and magnitude
+    always @ (posedge nclk) begin
+         save_sign1a     <= add_sub1a[10];
+         save_sign2a     <= add_sub2a[10];
+         save_sign3a     <= add_sub3a[10];
+         save_sign4a     <= add_sub4a[10];
+         addsub1a_comp	<= addsub1a_comp_w[9:0]; //add_sub1a[10]? (-add_sub1a) : add_sub1a;
+         addsub2a_comp	<= addsub2a_comp_w[9:0]; //add_sub2a[10]? (-add_sub2a) : add_sub2a;
+         addsub3a_comp	<= addsub3a_comp_w[9:0]; //add_sub3a[10]? (-add_sub3a) : add_sub3a;
+         addsub4a_comp	<= addsub4a_comp_w[9:0]; //add_sub4a[10]? (-add_sub4a) : add_sub4a;
+    end
+
+    // unsigned magnitude * magnitude multiplies (10 bit x 16 bit)
+    assign p1a_all = addsub1a_comp * memory1a[15:0]; // [16] is sign!
+    assign p2a_all = addsub2a_comp * memory2a[15:0];
+    assign p3a_all = addsub3a_comp * memory3a[15:0];
+    assign p4a_all = addsub4a_comp * memory4a[15:0];
+    reg [17:0] p1a_all_r;
+    reg [17:0] p2a_all_r;
+    reg [17:0] p3a_all_r;
+    reg [17:0] p4a_all_r;
+    reg p1a_sig, p2a_sig, p3a_sig, p4a_sig;
+    // pipeline register after the multipliers: keep product bits [26:9]; the product sign is
+    // the XOR of the data sign and the coefficient sign
+    always @ (posedge nclk) begin
+        p1a_all_r <= p1a_all[26:9];
+        p2a_all_r <= p2a_all[26:9];
+        p3a_all_r <= p3a_all[26:9];
+        p4a_all_r <= p4a_all[26:9];
+        p1a_sig <= (save_sign1a ^ memory1a[16]);
+        p2a_sig <= (save_sign2a ^ memory2a[16]);
+        p3a_sig <= (save_sign3a ^ memory3a[16]);
+        p4a_sig <= (save_sign4a ^ memory4a[16]);
+
+    end
+    // convert sign/magnitude back to two's complement
+    always @ (posedge nclk) begin
+        p1a <= p1a_sig ? (-p1a_all_r) : p1a_all_r;
+        p2a <= p2a_sig ? (-p2a_all_r) : p2a_all_r;
+        p3a <= p3a_sig ? (-p3a_all_r) : p3a_all_r;
+        p4a <= p4a_sig ? (-p4a_all_r) : p4a_all_r;
+    end
+
+/* Final adder. Adding the outputs of the 4 multipliers */
+// two registered levels: pairwise sums of sign-extended products, then the total
+    always @ (posedge nclk) begin
+        z_out_int1 <= ({p1a[17],p1a} + {p2a[17],p2a});
+        z_out_int2 <= ({p3a[17],p3a} + {p4a[17],p4a});
+        z_out_int <= (z_out_int1 + z_out_int2);
+    end
+
+// rounding of the value
+// drop the 3 LSBs, adding the highest dropped bit
+    assign z_out_prelatch[15:0] = z_out_int[18:3]+ z_out_int[2]; // correct rounding
+
+// outputs from output latches to cross clock edge boundary
+    always @ (posedge clk) begin
+        z_out[15:0]  <= z_out_prelatch[15:0];
+//        wr_cntr[6:0] <= wr_cntr_prelatch[6:0];  
+//        done         <= done_prelatch;  
+//        we           <= we_prelatch;  
+//        page         <= page_prelatch;  
+    end
+    // The control outputs (address, done, we, page) go through a dly_16 clocked by clk with
+    // .dly(1) instead of the single register stage commented out above.
+    dly_16 #(.WIDTH(10)) i_delayed_outs(
+        .clk(clk),
+        .rst(1'b0),
+        .dly(4'd1),
+        .din( {wr_cntr_prelatch[6:0], done_prelatch, we_prelatch, page_prelatch}),
+        .dout({wr_cntr[6:0],          done,          we,          page}));
+
+
+/* 1D-DCT END */
+endmodule
+
+
+// dct393r_stage2 - second pass of the 8x8 DCT.
+// After "start" it reads 64 words from the transpose memory using a transposed address
+// ({page, cnt[2:0], cnt[5:3]}), runs them through the same shift / capture / add-sub /
+// multiply / sum pipeline as stage 1 (16-bit data, everything on posedge clk) and outputs
+// rounded 13-bit results together with the endv / dv qualifiers.
+module dct393r_stage2 ( // increased latency by 2 clocks
+    input             clk,           // system clock, posedge
+    input             en,            // low: stops reading and forces enable_toggle / pre_dv / dv low
+    input             start,      // stage 1 finished, data available in transpose memory
+    input             page,      // transpose memory page finished, valid at start
+    output      [6:0] rd_cntr, // [6:0]    transpose memory read address
+    output            ren,     // read enable transpose memory
+    output reg        regen,   // register enable in transpose memory
+    input      [15:0] tdin,      // [15:0] - data from transpose memory, added 6 bit fractional part
+    output reg        endv,        // one cycle ahead of starting (continuing) dv
+    output reg        dv,          // data output valid
+    output reg [12:0] dct2_out);// [12:0] output data
+/* constants */
+// 16-bit unsigned magnitudes; the values correspond to round(65536*cos(k*pi/16)) for Ck and
+// round(65536*sin(k*pi/16)) for Sk. The sign is carried separately in bit [16] of memory<i>a.
+    localparam C3= 16'd54491;
+    localparam S3= 16'd36410;
+    localparam C4= 16'd46341;
+    localparam C6= 16'd25080;
+    localparam S6= 16'd60547;
+    localparam C7= 16'd12785;
+    localparam S7= 16'd64277;
+
+    // currently selected coefficients: {sign, magnitude[15:0]}
+    reg    [16:0] memory1a, memory2a, memory3a, memory4a;
+
+    reg     [2:0] indexi;   // coefficient set selector
+/* 2D section */
+    reg    [15:0] xb0_in, xb1_in, xb2_in, xb3_in, xb4_in, xb5_in, xb6_in, xb7_in;         // input shift chain
+    reg    [15:0] xb0_reg, xb1_reg, xb2_reg, xb3_reg, xb4_reg, xb5_reg, xb6_reg, xb7_reg; // snapshot taken on sxregs
+    reg    [16:0] add_sub1b, add_sub2b, add_sub3b, add_sub4b;                             // signed sum/difference
+    reg    [15:0] addsub1b_comp, addsub2b_comp, addsub3b_comp, addsub4b_comp;             // magnitude (low 16 bits)
+    reg           save_sign1b, save_sign2b, save_sign3b, save_sign4b;                     // sign of add_sub<i>b
+    reg    [18:0] p1b, p2b, p3b, p4b;                                                     // signed products
+    wire   [35:0] p1b_all, p2b_all, p3b_all, p4b_all;                                     // unsigned magnitude products
+    reg           toggleB;                                                                // 1 - add, 0 - subtract
+    reg    [19:0] dct2d_int1, dct2d_int2;
+    reg    [20:0] dct_2d_int;
+    wire   [12:0] dct_2d_rnd;
+
+// transpose memory read address
+    reg    [ 5:0] rd_cntrs;  // linear read counter (transposed when forming rd_cntr)
+    reg           rd_page;   // page latched at start
+
+// start with the same as stage1
+    wire          sxregs;
+// to conserve energy by disabling toggleB
+
+    wire          sxregs_d8;
+    reg           enable_toggle;
+    reg           en_started;   // high while the 64 words of a block are being read
+    wire          pre2_endv;
+    wire          pre2_disdv; // AF2015: was missing
+    reg           pre_endv;
+    reg           pre_disdv;
+    reg           pre_dv;
+// The dly_16 instances replace the SRL16 primitives shown in the commented-out lines; per the
+// inline notes a .dly(N) setting gives N+1 clocks of delay (dly_16 itself is defined elsewhere).
+// SRL16 i_endv       (.Q(endv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(start));    // dly=14+1
+//    dly_16 #(.WIDTH(1)) i_endv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(start), .dout(endv));    // dly=14+1
+    dly_16 #(.WIDTH(1)) i_pre2_endv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(start), .dout(pre2_endv));    // dly=15+1
+ 
+// SRL16 i_disdv      (.Q(disdv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(rd_cntrs[5:0]==6'h3f));    // dly=14+1
+//    dly_16 #(.WIDTH(1)) i_disdv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(rd_cntrs[5:0]==6'h3f), .dout(disdv));    // dly=14+1
+    dly_16 #(.WIDTH(1)) i_pre2_disdv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(rd_cntrs[5:0]==6'h3f), .dout(pre2_disdv));    // dly=15+1
+
+// SRL16 i_sxregs      (.Q(sxregs),    .A0(1'b0), .A1(1'b0), .A2(1'b0), .A3(1'b1), .CLK(clk),.D((rd_cntr[5:3]==3'h0) && en_started));    // dly=8+1
+//    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd8), .din((rd_cntr[5:3]==3'h0) && en_started), .dout(sxregs));    // dly=8+1
+    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd9), .din((rd_cntrs[2:0]==3'h0) && en_started), .dout(sxregs));    // dly=9+1
+
+// SRL16 i_sxregs_d8   (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs && en_started));    // dly=7+1
+    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(clk),.rst(1'b0), .dly(4'd7), .din(sxregs && en_started), .dout(sxregs_d8));    // dly=7+1
+
+    // memory read enable is active for the whole read burst
+    assign ren = en_started;
+    
+    // Control sequencing
+    always @ (posedge clk) begin
+        // set by sxregs, held until sxregs_d8 arrives, forced low when en is low
+        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));
+        
+//        en_started <= en && (start || en_started);
+        // read burst: begins on start, ends when the read counter reaches 0x3f, cleared by !en
+        if      (!en)                   en_started <= 0;
+        else if (start)                 en_started <= 1;
+        else if (rd_cntrs[5:0] == 6'h3f) en_started <= 0; // should be after (start) as they happen simultaneously
+        
+        // memory output register enable follows the read enable one clock later
+        regen <= en_started;
+        
+        pre_endv <=pre2_endv;
+        
+        endv <= pre_endv; // output reg
+        
+        pre_disdv <= pre2_disdv;
+        
+        // set by pre_endv, cleared by pre_disdv (unless pre_endv is also high) or by !en
+        pre_dv <=  en && (pre_endv || (pre_dv && ~pre_disdv));
+        
+//        dv <= en && (endv || (dv && ~disdv));
+        dv <= en && pre_dv; // output reg
+
+        // forced to 1 on sxregs, then alternates every clock while enable_toggle is set
+        toggleB <= sxregs || (enable_toggle && (~toggleB));
+        
+        // preset to 7 on sxregs, then counts (3-bit wrap) while enable_toggle is set
+        if (sxregs) indexi <= 3'h7;
+        else if (enable_toggle) indexi<=indexi+1;
+        
+        // remember which memory page to read
+        if (start) rd_page <= page;
+        
+        // restart on start, count up and stop at 0x3f
+        if (start) rd_cntrs[5:0] <=6'b0;    // will always count, but that does not matter- What about saving energy ;-) ? Saved...
+        else if (rd_cntrs[5:0]!=6'h3f) rd_cntrs[5:0] <= rd_cntrs[5:0]+1;
+    end 
+  
+    // swapping the two 3-bit halves of the counter produces the transposed read order
+    assign    rd_cntr[6:0]= {rd_page,rd_cntrs[2:0],rd_cntrs[5:3]}; // transposed counter
+// duplicate memory<i>a from stage 1
+// store  1D-DCT constant coefficient values for multipliers */
+// Bit [16] is the sign (1 - negative), [15:0] the magnitude. The trailing "8'd..." remarks are
+// the coefficient values of the earlier 8-bit version.
+
+    always @ (posedge clk) begin
+        case (indexi)
+         0 : begin memory1a <= {1'b0,C4}; //8'd91
+                   memory2a <= {1'b0,C4}; //8'd91
+                   memory3a <= {1'b0,C4}; //8'd91 
+                   memory4a <= {1'b0,C4}; //8'd91
+             end
+         1 : begin memory1a <= {1'b0,S7}; //8'd126; 
+                   memory2a <= {1'b0,C3}; //8'd106;  
+                   memory3a <= {1'b0,S3}; //8'd71;  
+                   memory4a <= {1'b0,C7}; //8'd25;
+             end
+         2 : begin memory1a <= {1'b0,S6}; //8'd118; 
+                   memory2a <= {1'b0,C6}; //8'd49;  
+                   memory3a <= {1'b1,C6}; //-8'd49; 
+                   memory4a <= {1'b1,S6}; //-8'd118
+             end
+         3 : begin memory1a <= {1'b0,C3}; // 8'd106; 
+                   memory2a <= {1'b1,C7}; //-8'd25;  
+                   memory3a <= {1'b1,S7}; //-8'd126; 
+                   memory4a <= {1'b1,S3}; //-8'd71;
+             end
+         4 : begin memory1a <= {1'b0,C4}; // 8'd91; 
+                   memory2a <= {1'b1,C4}; //-8'd91; 
+                   memory3a <= {1'b1,C4}; //-8'd91; 
+                   memory4a <= {1'b0,C4}; // 8'd91;
+             end
+         5 : begin memory1a <= {1'b0,S3}; // 8'd71; 
+                   memory2a <= {1'b1,S7}; //-8'd126; 
+                   memory3a <= {1'b0,C7}; // 8'd25;   
+                   memory4a <= {1'b0,C3}; // 8'd106;
+             end
+         6 : begin memory1a <= {1'b0,C6}; // 8'd49; 
+                   memory2a <= {1'b1,S6}; //-8'd118; 
+                   memory3a <= {1'b0,S6}; // 8'd118;  
+                   memory4a <= {1'b1,C6}; //-8'd49;
+             end
+         7 : begin memory1a <= {1'b0,C7}; // 8'd25;  
+                   memory2a <= {1'b1,S3}; //-8'd71; 
+                   memory3a <= {1'b0,C3}; // 8'd106;  
+                   memory4a <= {1'b1,S7}; //-8'd126;
+             end
+        endcase
+    end
+
+    // 8-deep shift chain for the words read from the transpose memory
+    always @ (posedge clk) begin
+        xb0_in <= tdin;
+        xb1_in <= xb0_in;
+        xb2_in <= xb1_in;
+        xb3_in <= xb2_in;
+        xb4_in <= xb3_in;
+        xb5_in <= xb4_in;
+        xb6_in <= xb5_in;
+        xb7_in <= xb6_in;
+    end
+
+/* register inputs when sxregs is high (original note: inputs read in every eighth clk)*/
+
+    always @ (posedge clk) if (sxregs) begin
+        xb0_reg <= xb0_in;
+        xb1_reg <= xb1_in; 
+        xb2_reg <= xb2_in;
+        xb3_reg <= xb3_in;
+        xb4_reg <= xb4_in;
+        xb5_reg <= xb5_in; 
+        xb6_reg <= xb6_in;
+        xb7_reg <= xb7_in;
+    end
+
+    // adder / subtractor: inputs sign-extended from 16 to 17 bits; toggleB==1 selects the
+    // sums, otherwise the differences (x7-x0, x6-x1, x5-x2, x4-x3)
+    always @ (posedge clk)
+        if (toggleB == 1'b1) begin
+            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} + {xb0_reg[15],xb0_reg[15:0]};
+            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} + {xb1_reg[15],xb1_reg[15:0]};
+            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} + {xb2_reg[15],xb2_reg[15:0]};
+            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} + {xb3_reg[15],xb3_reg[15:0]};
+        end else begin
+            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} - {xb0_reg[15],xb0_reg[15:0]};
+            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} - {xb1_reg[15],xb1_reg[15:0]};
+            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} - {xb2_reg[15],xb2_reg[15:0]};
+            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} - {xb3_reg[15],xb3_reg[15:0]};
+        end
+// Adding these wires to get rid of the MSB that is always 0
+// (absolute value of add_sub<i>b; only bits [15:0] are registered below)
+    wire [16:0] addsub1b_comp_w  = add_sub1b[16]? (-add_sub1b) : add_sub1b;
+    wire [16:0] addsub2b_comp_w  = add_sub2b[16]? (-add_sub2b) : add_sub2b;
+    wire [16:0] addsub3b_comp_w  = add_sub3b[16]? (-add_sub3b) : add_sub3b;
+    wire [16:0] addsub4b_comp_w  = add_sub4b[16]? (-add_sub4b) : add_sub4b;
+
+    // split each add_sub<i>b into sign and magnitude
+    always @ (posedge clk) begin
+        save_sign1b    <= add_sub1b[16];
+        save_sign2b    <= add_sub2b[16];
+        save_sign3b    <= add_sub3b[16];
+        save_sign4b    <= add_sub4b[16];
+        addsub1b_comp	<= addsub1b_comp_w[15:0]; // add_sub1b[16]? (-add_sub1b) : add_sub1b;
+        addsub2b_comp	<= addsub2b_comp_w[15:0]; // add_sub2b[16]? (-add_sub2b) : add_sub2b;
+        addsub3b_comp	<= addsub3b_comp_w[15:0]; // add_sub3b[16]? (-add_sub3b) : add_sub3b;
+        addsub4b_comp	<= addsub4b_comp_w[15:0]; // add_sub4b[16]? (-add_sub4b) : add_sub4b;
+    end
+
+    // unsigned magnitude * magnitude multiplies (16 bit x 16 bit)
+    assign p1b_all = addsub1b_comp * memory1a[15:0]; // MSB [16] is sign!
+    assign p2b_all = addsub2b_comp * memory2a[15:0];
+    assign p3b_all = addsub3b_comp * memory3a[15:0];
+    assign p4b_all = addsub4b_comp * memory4a[15:0];
+    reg [18:0] p1b_all_r;
+    reg [18:0] p2b_all_r;
+    reg [18:0] p3b_all_r;
+    reg [18:0] p4b_all_r;
+    reg p1b_sig, p2b_sig, p3b_sig, p4b_sig;
+
+    // pipeline register after the multipliers: keep product bits [32:14]; the product sign is
+    // the XOR of the data sign and the coefficient sign
+    always @ (posedge clk) begin
+        p1b_all_r <= p1b_all[32:14];
+        p2b_all_r <= p2b_all[32:14];
+        p3b_all_r <= p3b_all[32:14];
+        p4b_all_r <= p4b_all[32:14];
+        p1b_sig <= (save_sign1b ^ memory1a[16]);
+        p2b_sig <= (save_sign2b ^ memory2a[16]);
+        p3b_sig <= (save_sign3b ^ memory3a[16]);
+        p4b_sig <= (save_sign4b ^ memory4a[16]);
+
+    end
+
+
+    // convert sign/magnitude back to two's complement
+    always @ (posedge clk) begin
+        p1b[18:0] <= p1b_sig ? (-p1b_all_r) :(p1b_all_r);
+        p2b[18:0] <= p2b_sig ? (-p2b_all_r) :(p2b_all_r);
+        p3b[18:0] <= p3b_sig ? (-p3b_all_r) :(p3b_all_r);
+        p4b[18:0] <= p4b_sig ? (-p4b_all_r) :(p4b_all_r);
+    end
+
+/* multiply the outputs of the add/sub block with the 8 sets of stored coefficients */
+
+/* Final adder. Adding the outputs of the 4 multipliers */
+// two registered levels of sign-extended additions; the output register is loaded only
+// while pre_dv is high and rounds by adding the highest dropped bit ([7])
+    always @ (posedge clk) begin
+        dct2d_int1 <= ({p1b[18],p1b[18:0]} + {p2b[18],p2b[18:0]});
+        dct2d_int2 <= ({p3b[18],p3b[18:0]} + {p4b[18],p4b[18:0]});
+        dct_2d_int <= ({dct2d_int1[19],dct2d_int1[19:0]} + {dct2d_int2[19],dct2d_int2[19:0]});
+        
+        if (pre_dv) dct2_out[12:0] <= dct_2d_rnd[12:0] + dct_2d_int[7];
+    end
+
+    // truncated result (8 LSBs dropped); rounding is applied where dct2_out is registered
+    assign dct_2d_rnd[12:0] = dct_2d_int[20:8];
+//    assign dct2_out[12:0] = dct_2d_rnd[12:0] + dct_2d_int[7];
+endmodule