removed unused file

116c9ce2 · Andrey Filippov · de62190a · de62190a · 116c9ce2
Commit 116c9ce2 authored Jun 09, 2016 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 643 deletions

xdct393.v compressor_jp/xdct393.v +0 -643

system_defines.vh system_defines.vh +3 -0

No files found.
--- a/compressor_jp/xdct393.v
+++ b/compressor_jp/xdct393.v
-/**********************************************************************
-** -----------------------------------------------------------------------------**
-** xdct393.v
-**
-** 8x8 discrete Cosine Transform
-**
-** Copyright (C) 2002-2015 Elphel, Inc
-**
-** -----------------------------------------------------------------------------**
-**  xdct393 is free software - hardware description language (HDL) code.
-** 
-**  This program is free software: you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation, either version 3 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-** -----------------------------------------------------------------------------**
-**
-** Modified by Andrey Filippov - goal to make it work in start/stop mode, using
-** "start" input (going together with the first data, no restriction on the gap between 64-pixel blocks (>=0)
-** Removed "RST" input ("en" is only used to reset ping-pong transpose memory address)
-** Split module in 2 stages
-** Also saved some area - original design compiled by XST to 865 slices (XC2S300e), this one - 780!
-**
-** It is based on the original design (Xilix app. note XAPP610) by:
-**                  Author: Latha Pillai
-**                  Senior Applications Engineer
-**
-**                  Video Applications
-**                  Advanced Products Group
-**                  Xilinx, Inc.
-**
-**                  Copyright (c) 2001 Xilinx, Inc.
-**                  All rights reserved
-**
-**                  Date:   Feb. 10, 2002
-**
-**                  RESTRICTED RIGHTS LEGEND
-**
-**      This software has not been published by the author, and 
-**      has been disclosed to others for the purpose of enhancing 
-**      and promoting design productivity in Xilinx products.
-**
-**      Therefore use, duplication or disclosure, now and in the 
-**      future should give consideration to the productivity 
-**      enhancements afforded the user of this code by the author's 
-**      efforts.  Thank you for using our products !
-**
-** Disclaimer:  THESE DESIGNS ARE PROVIDED "AS IS" WITH NO WARRANTY 
-**              WHATSOEVER AND XILINX SPECIFICALLY DISCLAIMS ANY 
-**              IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR
-**              A PARTICULAR PURPOSE, OR AGAINST INFRINGEMENT.
-***********************************************************************/
-/*
-after I added DC subtraction before DCT I got 9-bit (allthough not likely to go out of 8bit range) signed data.
-also increased transpose memory to 9 bits (anyway it is 16-bit wide) - see if it will help to prevent saturation
-without significant increase in gates
-Saturatuion is still visible on real pictures, but there was a bug - addsub<i>a_comp, addsub<i>b_comp where not using their
-MSB. I added 1 more bit to add_sub<i>a and add_sub<i>b and fixed that bug. Only 2 mofre slices were used
-Device utilization summary:
-   Number of External GCLKIOBs         1 out of 4      25%
-   Number of External IOBs            23 out of 178    12%
-      Number of LOCed External IOBs    0 out of 23      0%
-   Number of BLOCKRAMs                 1 out of 16      6%
-   Number of SLICEs                  855 out of 3072   27%
-   Number of GCLKs                     1 out of 4      25%
-*/
-// still not enough - maybe one tiny bit more??
-/*
-Device utilization summary:
-   Number of External GCLKIOBs         1 out of 4      25%
-   Number of External IOBs            26 out of 178    14%
-      Number of LOCed External IOBs    0 out of 26      0%
-   Number of BLOCKRAMs                 1 out of 16      6%
-   Number of SLICEs                  837 out of 3072   27%
-   Number of GCLKs                     1 out of 4      25%
-*/
-`timescale 1ns/1ps
-// For xdct353 - increasing data in 9 bits -> 10 bits, out 12 bits ->13 bits
-module xdct393   (
-    input         clk,           // system clock, posedge
-    input         en,            // if zero will reset transpose memory page njumbers
-    input         start,         // single-cycle start pulse that goes with the first pixel data. Other 63 should follow
-    input   [9:0] xin,           // [7:0] - input data
-    output reg    last_in,       // output high during input of the last of 64 pixels in a 8x8 block
-    output        pre_first_out, // 1 cycle ahead of the first output in a 64 block
-    output        dv,            // data output valid. Will go high on the 94-th cycle after the start
-    output [12:0] d_out);        // [8:0]output data
-    wire          stage1_done;
-    wire          tm_page;
-    wire          tm_we;
-    wire    [6:0] tm_ra;
-    wire    [6:0] tm_wa;
-    wire   [15:0] tm_out;
-    wire   [15:0] tm_di;
-//    reg           stage1_done_r; // delay by one clock to use memory output register
-    wire          tm_re; // =1'b1; // TODO: generate, for now just 1'b1
-    wire           tm_regen;
-    always @ (posedge clk) begin
-        last_in <=       (tm_wa[5:0]== 6'h30);
-//        stage1_done_r <= stage1_done;
-//        tm_regen <=      tm_re;
-    end
-    dct393_stage1 i_dct_stage1(
-        .clk       (clk),
-        .en        (en),
-        .start     (start),
-        .xin       (xin),      // [7:0]
-        .we        (tm_we),          // write to transpose memory
-        .wr_cntr   (tm_wa), // [6:0]    transpose memory write address
-        .z_out     (tm_di[15:0]),
-        .page      (tm_page),
-        .done      (stage1_done));
-    dct393_stage2 i_dct_stage2(
-        .clk       (clk),
-        .en        (en),
-        .start     (stage1_done),      // stage 1 finished, data available in transpose memory (extra RAM latency)
-        .page      (tm_page),      // transpose memory page finished, valid at start
-        .rd_cntr   (tm_ra[6:0]), // [6:0]    transpose memory read address
-        .ren       (tm_re), // output
-        .regen     (tm_regen), // output reg 
-        .tdin      (tm_out[15:0]),      // [7:0] - data from transpose memory
-        .endv      (pre_first_out),
-        .dv        (dv),          // data output valid
-        .dct2_out  (d_out[12:0]));// [10:0]output data
-    ram18_var_w_var_r #(
-        .REGISTERS     (1),
-        .LOG2WIDTH_WR  (4),
-        .LOG2WIDTH_RD  (4),
-        .DUMMY(0)
-    ) i_transpose_mem (
-        .rclk      (clk), // input
-        .raddr     ({3'b0,tm_ra[6:0]}), // input[9:0] 
-        .ren       (tm_re), // input
-        .regen     (tm_regen), // input
-        .data_out  (tm_out[15:0]), // output[15:0] 
-        .wclk      (clk), // input
-        .waddr     ({3'b0,tm_wa[6:0]}), // input[9:0] 
-        .we        (tm_we), // input
-        .web       (4'hf), // input[3:0] 
-        .data_in   (tm_di[15:0]) // input[15:0] 
-    );
-endmodule
-// 01/24/2004: Moved all clocks in stage 1 to "negedge" to reduce current pulses
-module dct393_stage1 (
-    input             clk,           // system clock, posedge
-    input             en,
-    input             start,      // single-cycle start pulse to replace RST
-    input      [ 9:0] xin,      // [7:0]
-    output reg        we,          // write to transpose memory
-    output reg [ 6:0] wr_cntr, // [6:0]    transpose memory write address
-    output reg [15:0] z_out,      //data to transpose memory
-    output reg        page,    // transpose memory page just filled (valid @ done)
-    output reg        done);   // last cycle writing to transpose memory - may use after it (move it earlier?)
-/* constants */
-    localparam C3= 16'd54491;
-    localparam S3= 16'd36410;
-    localparam C4= 16'd46341;
-    localparam C6= 16'd25080;
-    localparam S6= 16'd60547;
-    localparam C7= 16'd12785;
-    localparam S7= 16'd64277;
-    reg    [16:0] memory1a, memory2a, memory3a, memory4a;
-/* 1D section */
-/* The max value of a pixel after processing (to make their expected mean to zero)
-is 127. If all the values in a row are 127, the max value of the product terms
-would be (127*2)*(23170/256) and that of z_out_int would be (127*8)*23170/256.
-This value divided by 2raised to 8 is equivalent to ignoring the 8 lsb bits of the value */
-    reg    [ 9:0] xa0_in, xa1_in, xa2_in, xa3_in, xa4_in, xa5_in, xa6_in, xa7_in;
-    reg    [ 9:0] xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg;
-    reg    [ 9:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp;
-//    reg    [10:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp; // AF2015: increasing width - was limiting
-    reg    [10:0] add_sub1a, add_sub2a, add_sub3a, add_sub4a;
-    reg           save_sign1a, save_sign2a, save_sign3a, save_sign4a;
-    reg    [17:0] p1a, p2a, p3a, p4a;
-    wire   [35:0] p1a_all, p2a_all, p3a_all, p4a_all;
-    reg           toggleA;
-    reg    [18:0] z_out_int1, z_out_int2;
-    reg    [18:0] z_out_int;
-    wire   [15:0] z_out_prelatch;
-    reg    [ 2:0] indexi;
-/* clks and counters */
-    reg    [ 6:0] wr_cntr_prelatch;
-/* memory section */
-    reg           done_prelatch;
-    reg           we_prelatch;
-    wire          enwe;
-    wire          pre_sxregs;
-    reg           sxregs;
-    reg           page_prelatch;
-    // TODO: See if negedge is needed
-    wire          nclk = ~clk; // seems that everything here is running at negedge (and delays too), but not the transpose memory
-// to conserve energy by disabling toggleA
-    wire          sxregs_d8;
-    reg           enable_toggle;
-//  SRL16_1 i_sxregs_d8   (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs));    // dly=7+1
-    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(nclk),.rst(1'b0), .dly(4'd7), .din(sxregs), .dout(sxregs_d8));   // dly=7+1
-// SRL16_1 i_pre_sxregs (.Q(pre_sxregs), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(start));    // dly=6+1
-    dly_16 #(.WIDTH(1)) i_pre_sxregs(.clk(nclk),.rst(1'b0), .dly(4'd6), .din(start), .dout(pre_sxregs));    // dly=6+1
-// SRL16_1 i_enwe       (.Q(enwe), .A0(1'b1), .A1(1'b0), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(pre_sxregs));    // dly=5+1
-    dly_16 #(.WIDTH(1)) i_enwe(.clk(nclk),.rst(1'b0), .dly(4'd5), .din(pre_sxregs), .dout(enwe));    // dly=5+1
-    always @ (posedge nclk) begin
-        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));
-        done_prelatch<= (wr_cntr_prelatch[5:0]==6'h3f);
-        if (wr_cntr_prelatch[5:0]==6'h3f) page_prelatch <= wr_cntr_prelatch[6];
-        we_prelatch<= enwe || (en && we_prelatch && (wr_cntr_prelatch[5:0]!=6'h3f));
-        if     (!en) wr_cntr_prelatch <= 7'b0;
-        else if (we_prelatch) wr_cntr_prelatch <= wr_cntr_prelatch + 1;
-        sxregs <= pre_sxregs || ((wr_cntr_prelatch[2:0]==3'h1) && (wr_cntr_prelatch[5:3]!=3'h7));
-        toggleA <= sxregs || (enable_toggle && (~toggleA));
-        if (sxregs) indexi <= 3'h7;
-        else if (enable_toggle) indexi<=indexi+1;
-    end
-/*  1D-DCT BEGIN */
-// store  1D-DCT constant coeeficient values for multipliers */
-    always @ (posedge nclk) begin
-        case (indexi)
-         0 : begin memory1a <= {1'b0,C4}; //8'd91
-                   memory2a <= {1'b0,C4}; //8'd91
-                   memory3a <= {1'b0,C4}; //8'd91 
-                   memory4a <= {1'b0,C4}; //8'd91
-             end
-         1 : begin memory1a <= {1'b0,S7}; //8'd126; 
-                   memory2a <= {1'b0,C3}; //8'd106;  
-                   memory3a <= {1'b0,S3}; //8'd71;  
-                   memory4a <= {1'b0,C7}; //8'd25;
-             end
-         2 : begin memory1a <= {1'b0,S6}; //8'd118; 
-                   memory2a <= {1'b0,C6}; //8'd49;  
-                   memory3a <= {1'b1,C6}; //-8'd49; 
-                   memory4a <= {1'b1,S6}; //-8'd118
-             end
-         3 : begin memory1a <= {1'b0,C3}; // 8'd106; 
-                   memory2a <= {1'b1,C7}; //-8'd25;  
-                   memory3a <= {1'b1,S7}; //-8'd126; 
-                   memory4a <= {1'b1,S3}; //-8'd71;
-             end
-         4 : begin memory1a <= {1'b0,C4}; // 8'd91; 
-                   memory2a <= {1'b1,C4}; //-8'd91; 
-                   memory3a <= {1'b1,C4}; //-8'd91; 
-                   memory4a <= {1'b0,C4}; // 8'd91;
-             end
-         5 : begin memory1a <= {1'b0,S3}; // 8'd71; 
-                   memory2a <= {1'b1,S7}; //-8'd126; 
-                   memory3a <= {1'b0,C7}; // 8'd25;   
-                   memory4a <= {1'b0,C3}; // 8'd106;
-             end
-         6 : begin memory1a <= {1'b0,C6}; // 8'd49; 
-                   memory2a <= {1'b1,S6}; //-8'd118; 
-                   memory3a <= {1'b0,S6}; // 8'd118;  
-                   memory4a <= {1'b1,C6}; //-8'd49;
-             end
-         7 : begin memory1a <= {1'b0,C7}; // 8'd25;  
-                   memory2a <= {1'b1,S3}; //-8'd71; 
-                   memory3a <= {1'b0,C3}; // 8'd106;  
-                   memory4a <= {1'b1,S7}; //-8'd126;
-             end
-        endcase
-    end
-/* 8-bit input shifted 8 times through a shift register*/
-// xa0_in will see output registers from posedge, may be replaced by latches if needed - but currently delay is under 5ns
-    always @ (posedge nclk) begin
-        xa0_in <= xin;
-        xa1_in <= xa0_in;
-        xa2_in <= xa1_in;
-        xa3_in <= xa2_in;
-        xa4_in <= xa3_in;
-        xa5_in <= xa4_in;
-        xa6_in <= xa5_in;
-        xa7_in <= xa6_in;
-    end
-/* shifted inputs registered every 8th clk (using cntr8)*/
-    always @ (posedge nclk) if (sxregs) begin 
-        xa0_reg <= xa0_in;
-        xa1_reg <= xa1_in; 
-        xa2_reg <= xa2_in;
-        xa3_reg <= xa3_in;
-        xa4_reg <= xa4_in;
-        xa5_reg <= xa5_in; 
-        xa6_reg <= xa6_in;
-        xa7_reg <= xa7_in;
-    end
-/* adder / subtractor block */
-    always @ (negedge clk)
-        if (toggleA == 1'b1) begin
-            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} + {xa0_reg[9],xa0_reg[9:0]};
-            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} + {xa1_reg[9],xa1_reg[9:0]};
-            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} + {xa2_reg[9],xa2_reg[9:0]};
-            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} + {xa3_reg[9],xa3_reg[9:0]};
-        end else begin
-            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} - {xa0_reg[9],xa0_reg[9:0]};
-            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} - {xa1_reg[9],xa1_reg[9:0]};
-            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} - {xa2_reg[9],xa2_reg[9:0]};
-            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} - {xa3_reg[9],xa3_reg[9:0]};
-        end
-// First valid add_sub appears at the 10th clk (8 clks for shifting inputs,
-// 9th clk for registering shifted input and 10th clk for add_sub
-// to synchronize the i value to the add_sub value, i value is incremented
-// only after 10 clks
-// Adding these wires to get rid of the MSB that is always 0
-    wire [10:0] addsub1a_comp_w  = add_sub1a[10]? (-add_sub1a) : add_sub1a;
-    wire [10:0] addsub2a_comp_w  = add_sub2a[10]? (-add_sub2a) : add_sub2a;
-    wire [10:0] addsub3a_comp_w  = add_sub3a[10]? (-add_sub3a) : add_sub3a;
-    wire [10:0] addsub4a_comp_w  = add_sub4a[10]? (-add_sub4a) : add_sub4a;
-    always @ (posedge nclk) begin
-         save_sign1a     <= add_sub1a[10];
-         save_sign2a     <= add_sub2a[10];
-         save_sign3a     <= add_sub3a[10];
-         save_sign4a     <= add_sub4a[10];
-         addsub1a_comp	<= addsub1a_comp_w[9:0]; //add_sub1a[10]? (-add_sub1a) : add_sub1a;
-         addsub2a_comp	<= addsub2a_comp_w[9:0]; //add_sub2a[10]? (-add_sub2a) : add_sub2a;
-         addsub3a_comp	<= addsub3a_comp_w[9:0]; //add_sub3a[10]? (-add_sub3a) : add_sub3a;
-         addsub4a_comp	<= addsub4a_comp_w[9:0]; //add_sub4a[10]? (-add_sub4a) : add_sub4a;
-    end
-    assign p1a_all = addsub1a_comp * memory1a[15:0]; // [16] is sign!
-    assign p2a_all = addsub2a_comp * memory2a[15:0];
-    assign p3a_all = addsub3a_comp * memory3a[15:0];
-    assign p4a_all = addsub4a_comp * memory4a[15:0];
-    always @ (posedge nclk)
-      begin
-        p1a <= (save_sign1a ^ memory1a[16]) ? (-p1a_all[26:9]) :(p1a_all[26:9]);
-        p2a <= (save_sign2a ^ memory2a[16]) ? (-p2a_all[26:9]) :(p2a_all[26:9]);
-        p3a <= (save_sign3a ^ memory3a[16]) ? (-p3a_all[26:9]) :(p3a_all[26:9]);
-        p4a <= (save_sign4a ^ memory4a[16]) ? (-p4a_all[26:9]) :(p4a_all[26:9]);
-      end
-/* Final adder. Adding the ouputs of the 4 multipliers */
-    always @ (posedge nclk) begin
-        z_out_int1 <= ({p1a[17],p1a} + {p2a[17],p2a});
-        z_out_int2 <= ({p3a[17],p3a} + {p4a[17],p4a});
-        z_out_int <= (z_out_int1 + z_out_int2);
-    end
-// rounding of the value
-    assign z_out_prelatch[15:0] = z_out_int[18:3]+ z_out_int[2]; // correct rounding
-// outputs from output latches to cross clock edge boundary
-    always @ (posedge clk) begin
-        z_out[15:0]  <= z_out_prelatch[15:0];
-        wr_cntr[6:0] <= wr_cntr_prelatch[6:0];  
-        done         <= done_prelatch;  
-        we           <= we_prelatch;  
-        page         <= page_prelatch;  
-    end
-/* 1D-DCT END */
-endmodule
-module dct393_stage2 (
-    input             clk,           // system clock, posedge
-    input             en,
-    input             start,      // stage 1 finished, data available in transpose memory
-    input             page,      // transpose memory page finished, valid at start
-    output      [6:0] rd_cntr, // [6:0]    transpose memory read address
-    output            ren,     // read enable transpose memory
-    output reg        regen,   // register enable in transpose memory
-    input      [15:0] tdin,      // [15:0] - data from transpose memory, added 6 bit fractional part
-    output            endv,        // one cycle ahead of starting (continuing) dv
-    output reg        dv,          // data output valid
-    output    [12:0] dct2_out);// [8:0]output data
-/* constants */
-    localparam C3= 16'd54491;
-    localparam S3= 16'd36410;
-    localparam C4= 16'd46341;
-    localparam C6= 16'd25080;
-    localparam S6= 16'd60547;
-    localparam C7= 16'd12785;
-    localparam S7= 16'd64277;
-    reg    [16:0] memory1a, memory2a, memory3a, memory4a;
-    reg     [2:0] indexi;
-/* 2D section */
-    reg    [15:0] xb0_in, xb1_in, xb2_in, xb3_in, xb4_in, xb5_in, xb6_in, xb7_in;
-    reg    [15:0] xb0_reg, xb1_reg, xb2_reg, xb3_reg, xb4_reg, xb5_reg, xb6_reg, xb7_reg;
-    reg    [16:0] add_sub1b, add_sub2b, add_sub3b, add_sub4b;
-    reg    [15:0] addsub1b_comp, addsub2b_comp, addsub3b_comp, addsub4b_comp;
-    reg           save_sign1b, save_sign2b, save_sign3b, save_sign4b;
-    reg    [18:0] p1b, p2b, p3b, p4b;
-    wire   [35:0] p1b_all, p2b_all, p3b_all, p4b_all;
-    reg           toggleB;
-    reg    [19:0] dct2d_int1, dct2d_int2;
-    reg    [20:0] dct_2d_int;
-    wire   [12:0] dct_2d_rnd;
-// transpose memory read address
-    reg    [ 5:0] rd_cntrs;
-    reg           rd_page;
-// start with the same as stage1
-    wire          sxregs;
-// to conserve energy by disabling toggleB
-    wire          sxregs_d8;
-    reg           enable_toggle;
-    reg           en_started;
-    wire          disdv; // AF2015: was missing
-// SRL16 i_endv       (.Q(endv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(start));    // dly=14+1
-//    dly_16 #(.WIDTH(1)) i_endv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(start), .dout(endv));    // dly=14+1
-    dly_16 #(.WIDTH(1)) i_endv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(start), .dout(endv));    // dly=15+1
-// SRL16 i_disdv      (.Q(disdv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(rd_cntrs[5:0]==6'h3f));    // dly=14+1
-//    dly_16 #(.WIDTH(1)) i_disdv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(rd_cntrs[5:0]==6'h3f), .dout(disdv));    // dly=14+1
-    dly_16 #(.WIDTH(1)) i_disdv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(rd_cntrs[5:0]==6'h3f), .dout(disdv));    // dly=15+1
-// SRL16 i_sxregs      (.Q(sxregs),    .A0(1'b0), .A1(1'b0), .A2(1'b0), .A3(1'b1), .CLK(clk),.D((rd_cntr[5:3]==3'h0) && en_started));    // dly=8+1
-//    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd8), .din((rd_cntr[5:3]==3'h0) && en_started), .dout(sxregs));    // dly=8+1
-    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd9), .din((rd_cntrs[2:0]==3'h0) && en_started), .dout(sxregs));    // dly=9+1
-// SRL16 i_sxregs_d8   (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs && en_started));    // dly=7+1
-    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(clk),.rst(1'b0), .dly(4'd7), .din(sxregs && en_started), .dout(sxregs_d8));    // dly=7+1
-    assign ren = en_started;
-    always @ (posedge clk) begin
-        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));
-//        en_started <= en && (start || en_started);
-        if      (!en)                   en_started <= 0;
-        else if (start)                 en_started <= 1;
-        else if (rd_cntrs[5:0] == 6'h3f) en_started <= 0; // should be after (start) as they happen simultaneously
-        regen <= en_started;
-        dv <= en && (endv || (dv && ~disdv));
-        toggleB <= sxregs || (enable_toggle && (~toggleB));
-        if (sxregs) indexi <= 3'h7;
-        else if (enable_toggle) indexi<=indexi+1;
-        if (start) rd_page <= page;
-        if (start) rd_cntrs[5:0] <=6'b0;    // will always count, but that does not matter- What about saving energy ;-) ? Saved...
-        else if (rd_cntrs[5:0]!=6'h3f) rd_cntrs[5:0] <= rd_cntrs[5:0]+1;
-    end 
-    assign    rd_cntr[6:0]= {rd_page,rd_cntrs[2:0],rd_cntrs[5:3]}; // transposed counter
-// duplicate memory<i>a from stage 1
-// store  1D-DCT constant coeeficient values for multipliers */
-    always @ (posedge clk) begin
-        case (indexi)
-         0 : begin memory1a <= {1'b0,C4}; //8'd91
-                   memory2a <= {1'b0,C4}; //8'd91
-                   memory3a <= {1'b0,C4}; //8'd91 
-                   memory4a <= {1'b0,C4}; //8'd91
-             end
-         1 : begin memory1a <= {1'b0,S7}; //8'd126; 
-                   memory2a <= {1'b0,C3}; //8'd106;  
-                   memory3a <= {1'b0,S3}; //8'd71;  
-                   memory4a <= {1'b0,C7}; //8'd25;
-             end
-         2 : begin memory1a <= {1'b0,S6}; //8'd118; 
-                   memory2a <= {1'b0,C6}; //8'd49;  
-                   memory3a <= {1'b1,C6}; //-8'd49; 
-                   memory4a <= {1'b1,S6}; //-8'd118
-             end
-         3 : begin memory1a <= {1'b0,C3}; // 8'd106; 
-                   memory2a <= {1'b1,C7}; //-8'd25;  
-                   memory3a <= {1'b1,S7}; //-8'd126; 
-                   memory4a <= {1'b1,S3}; //-8'd71;
-             end
-         4 : begin memory1a <= {1'b0,C4}; // 8'd91; 
-                   memory2a <= {1'b1,C4}; //-8'd91; 
-                   memory3a <= {1'b1,C4}; //-8'd91; 
-                   memory4a <= {1'b0,C4}; // 8'd91;
-             end
-         5 : begin memory1a <= {1'b0,S3}; // 8'd71; 
-                   memory2a <= {1'b1,S7}; //-8'd126; 
-                   memory3a <= {1'b0,C7}; // 8'd25;   
-                   memory4a <= {1'b0,C3}; // 8'd106;
-             end
-         6 : begin memory1a <= {1'b0,C6}; // 8'd49; 
-                   memory2a <= {1'b1,S6}; //-8'd118; 
-                   memory3a <= {1'b0,S6}; // 8'd118;  
-                   memory4a <= {1'b1,C6}; //-8'd49;
-             end
-         7 : begin memory1a <= {1'b0,C7}; // 8'd25;  
-                   memory2a <= {1'b1,S3}; //-8'd71; 
-                   memory3a <= {1'b0,C3}; // 8'd106;  
-                   memory4a <= {1'b1,S7}; //-8'd126;
-             end
-        endcase
-    end
-    always @ (posedge clk) begin
-        xb0_in <= tdin;
-        xb1_in <= xb0_in;
-        xb2_in <= xb1_in;
-        xb3_in <= xb2_in;
-        xb4_in <= xb3_in;
-        xb5_in <= xb4_in;
-        xb6_in <= xb5_in;
-        xb7_in <= xb6_in;
-    end
-/* register inputs, inputs read in every eighth clk*/
-    always @ (posedge clk) if (sxregs) begin
-        xb0_reg <= xb0_in;
-        xb1_reg <= xb1_in; 
-        xb2_reg <= xb2_in;
-        xb3_reg <= xb3_in;
-        xb4_reg <= xb4_in;
-        xb5_reg <= xb5_in; 
-        xb6_reg <= xb6_in;
-        xb7_reg <= xb7_in;
-    end
-    always @ (posedge clk)
-        if (toggleB == 1'b1) begin
-            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} + {xb0_reg[15],xb0_reg[15:0]};
-            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} + {xb1_reg[15],xb1_reg[15:0]};
-            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} + {xb2_reg[15],xb2_reg[15:0]};
-            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} + {xb3_reg[15],xb3_reg[15:0]};
-        end else begin
-            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} - {xb0_reg[15],xb0_reg[15:0]};
-            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} - {xb1_reg[15],xb1_reg[15:0]};
-            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} - {xb2_reg[15],xb2_reg[15:0]};
-            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} - {xb3_reg[15],xb3_reg[15:0]};
-        end
-// Adding these wires to get rid of the MSB that is always 0
-    wire [16:0] addsub1b_comp_w  = add_sub1b[16]? (-add_sub1b) : add_sub1b;
-    wire [16:0] addsub2b_comp_w  = add_sub2b[16]? (-add_sub2b) : add_sub2b;
-    wire [16:0] addsub3b_comp_w  = add_sub3b[16]? (-add_sub3b) : add_sub3b;
-    wire [16:0] addsub4b_comp_w  = add_sub4b[16]? (-add_sub4b) : add_sub4b;
-    always @ (posedge clk) begin
-        save_sign1b    <= add_sub1b[16];
-        save_sign2b    <= add_sub2b[16];
-        save_sign3b    <= add_sub3b[16];
-        save_sign4b    <= add_sub4b[16];
-        addsub1b_comp	<= addsub1b_comp_w[15:0]; // add_sub1b[16]? (-add_sub1b) : add_sub1b;
-        addsub2b_comp	<= addsub2b_comp_w[15:0]; // add_sub2b[16]? (-add_sub2b) : add_sub2b;
-        addsub3b_comp	<= addsub3b_comp_w[15:0]; // add_sub3b[16]? (-add_sub3b) : add_sub3b;
-        addsub4b_comp	<= addsub4b_comp_w[15:0]; // add_sub4b[16]? (-add_sub4b) : add_sub4b;
-    end
-//    assign p1b_all = addsub1b_comp[15:0] * memory1a[15:0]; // TODO: Check - memory is 16:0
-//    assign p2b_all = addsub2b_comp[15:0] * memory2a[15:0];
-//    assign p3b_all = addsub3b_comp[15:0] * memory3a[15:0];
-//   assign p4b_all = addsub4b_comp[15:0] * memory4a[15:0];
-    assign p1b_all = addsub1b_comp * memory1a[15:0]; // MSB [16] is sign!
-    assign p2b_all = addsub2b_comp * memory2a[15:0];
-    assign p3b_all = addsub3b_comp * memory3a[15:0];
-    assign p4b_all = addsub4b_comp * memory4a[15:0];
-    always @ (posedge clk) begin
-/// Next line was simulated differently in Icarus 0.9 (wrong?) than in Icarus 0.8 (right?)
-/// Xilinx probably did as 0.8
-/// p1b_all[31:14] - 18-bit number, p1b - 19-bit. in 0.9 (-p1b_all[31:14]) was also 18, not expand to 19 bits, 0.8 - did
-///        p1b <= (save_sign1b ^ memory1a[16]) ? (-p1b_all[31:14]) :(p1b_all[31:14]);
-        p1b[18:0] <= (save_sign1b ^ memory1a[16]) ? (-p1b_all[32:14]) :(p1b_all[32:14]);
-        p2b[18:0] <= (save_sign2b ^ memory2a[16]) ? (-p2b_all[32:14]) :(p2b_all[32:14]);
-        p3b[18:0] <= (save_sign3b ^ memory3a[16]) ? (-p3b_all[32:14]) :(p3b_all[32:14]);
-        p4b[18:0] <= (save_sign4b ^ memory4a[16]) ? (-p4b_all[32:14]) :(p4b_all[32:14]);
-    end
-/* multiply the outputs of the add/sub block with the 8 sets of stored coefficients */
-/* Final adder. Adding the ouputs of the 4 multipliers */
-    always @ (posedge clk) begin
-        dct2d_int1 <= ({p1b[18],p1b[18:0]} + {p2b[18],p2b[18:0]});
-        dct2d_int2 <= ({p3b[18],p3b[18:0]} + {p4b[18],p4b[18:0]});
-        dct_2d_int <= ({dct2d_int1[19],dct2d_int1[19:0]} + {dct2d_int2[19],dct2d_int2[19:0]});
-    end
-    assign dct_2d_rnd[12:0] = dct_2d_int[20:8];
-    assign dct2_out[12:0] = dct_2d_rnd[12:0] + dct_2d_int[7];
-endmodule
--- a/system_defines.vh
+++ b/system_defines.vh
@@ -40,6 +40,9 @@
  // This file may be used to define same pre-processor macros to be included into each parsed file
 `ifndef SYSTEM_DEFINES
  `define SYSTEM_DEFINES
+  // TODO: Later compare instantiate/infer
+  `define INSTANTIATE_DSP48E1
 // Parameters from x393_sata project
  `define USE_DRP
  `define ALIGN_CLOCKS