// Source file: xdct393.v
/**********************************************************************
** -----------------------------------------------------------------------------**
** xdct393.v
**
** 8x8 discrete Cosine Transform
**
** Copyright (C) 2002-2015 Elphel, Inc
**
** -----------------------------------------------------------------------------**
**  xdct393 is free software - hardware description language (HDL) code.
** 
**  This program is free software: you can redistribute it and/or modify
**  it under the terms of the GNU General Public License as published by
**  the Free Software Foundation, either version 3 of the License, or
**  (at your option) any later version.
**
**  This program is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**  GNU General Public License for more details.
**
**  You should have received a copy of the GNU General Public License
**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
**
** Modified by Andrey Filippov - goal to make it work in start/stop mode, using
** "start" input (going together with the first data), no restriction on the gap between 64-pixel blocks (>=0)
** Removed "RST" input ("en" is only used to reset ping-pong transpose memory address)
** Split module in 2 stages
** Also saved some area - original design compiled by XST to 865 slices (XC2S300e), this one - 780!
**
** It is based on the original design (Xilinx app. note XAPP610) by:
**                  Author: Latha Pillai
**                  Senior Applications Engineer
**
**                  Video Applications
**                  Advanced Products Group
**                  Xilinx, Inc.
**
**                  Copyright (c) 2001 Xilinx, Inc.
**                  All rights reserved
**
**                  Date:   Feb. 10, 2002
**
**                  RESTRICTED RIGHTS LEGEND
**
**      This software has not been published by the author, and 
**      has been disclosed to others for the purpose of enhancing 
**      and promoting design productivity in Xilinx products.
**
**      Therefore use, duplication or disclosure, now and in the 
**      future should give consideration to the productivity 
**      enhancements afforded the user of this code by the author's 
**      efforts.  Thank you for using our products !
**
** Disclaimer:  THESE DESIGNS ARE PROVIDED "AS IS" WITH NO WARRANTY 
**              WHATSOEVER AND XILINX SPECIFICALLY DISCLAIMS ANY 
**              IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR
**              A PARTICULAR PURPOSE, OR AGAINST INFRINGEMENT.
***********************************************************************/

/*
After I added DC subtraction before the DCT I got 9-bit signed data (although it is not likely to go out of the 8-bit range).
Also increased the transpose memory to 9 bits (it is 16 bits wide anyway) - see if it will help to prevent saturation
without a significant increase in gates.

Saturation is still visible on real pictures, but there was a bug - addsub<i>a_comp, addsub<i>b_comp were not using their
MSB. I added 1 more bit to add_sub<i>a and add_sub<i>b and fixed that bug. Only 2 more slices were used.

Device utilization summary:

   Number of External GCLKIOBs         1 out of 4      25%
   Number of External IOBs            23 out of 178    12%
      Number of LOCed External IOBs    0 out of 23      0%

   Number of BLOCKRAMs                 1 out of 16      6%
   Number of SLICEs                  855 out of 3072   27%

   Number of GCLKs                     1 out of 4      25%


*/

// still not enough - maybe one tiny bit more??
/*
Device utilization summary:

   Number of External GCLKIOBs         1 out of 4      25%
   Number of External IOBs            26 out of 178    14%
      Number of LOCed External IOBs    0 out of 26      0%

   Number of BLOCKRAMs                 1 out of 16      6%
   Number of SLICEs                  837 out of 3072   27%

   Number of GCLKs                     1 out of 4      25%
*/



`timescale 1ns/1ps

// For xdct353 - increasing data in 9 bits -> 10 bits, out 12 bits ->13 bits

// xdct393 - top level of the 8x8 DCT.
// Connects three sub-blocks:
//   i_dct_stage1    - first 1-D pass, writes 16-bit results into the transpose memory
//   i_transpose_mem - transpose buffer; address bit 6 selects one of two pages
//   i_dct_stage2    - second 1-D pass, reads the buffer in transposed order and
//                     produces the 13-bit output stream
module xdct393   (
    input         clk,           // system clock, posedge
    input         en,            // if zero will reset transpose memory page numbers
    input         start,         // single-cycle start pulse that goes with the first pixel data. Other 63 should follow
    input   [9:0] xin,           // [9:0] - input data (treated as two's complement by stage 1)
    output reg    last_in,       // output high during input of the last of 64 pixels in a 8x8 block
    output        pre_first_out, // 1 cycle ahead of the first output in a 64 block
    output        dv,            // data output valid. Will go high on the 94-th cycle after the start
    output [12:0] d_out);        // [12:0] - output data


    wire          stage1_done;   // stage 1 wrote the last word of a page
    wire          tm_page;       // page that stage 1 has just filled (valid with stage1_done)
    wire          tm_we;         // transpose memory write enable (from stage 1)
    wire    [6:0] tm_ra;         // transpose memory read address  (from stage 2)
    wire    [6:0] tm_wa;         // transpose memory write address (from stage 1)
    wire   [15:0] tm_out;        // transpose memory read data
    wire   [15:0] tm_di;         // transpose memory write data

    wire          tm_re;         // transpose memory read enable, driven by stage 2 (.ren)
    wire          tm_regen;      // transpose memory output register enable, driven by stage 2 (.regen)

    // last_in is a registered decode of the low 6 bits of the stage 1 write address
    always @ (posedge clk) begin
        last_in <=       (tm_wa[5:0]== 6'h30);
    end

    dct393_stage1 i_dct_stage1(
        .clk       (clk),
        .en        (en),
        .start     (start),
        .xin       (xin),            // [9:0]
        .we        (tm_we),          // write to transpose memory
        .wr_cntr   (tm_wa),          // [6:0] transpose memory write address
        .z_out     (tm_di[15:0]),    // [15:0] data to transpose memory
        .page      (tm_page),
        .done      (stage1_done));

    dct393_stage2 i_dct_stage2(
        .clk       (clk),
        .en        (en),
        .start     (stage1_done),    // stage 1 finished, data available in transpose memory (extra RAM latency)
        .page      (tm_page),        // transpose memory page finished, valid at start
        .rd_cntr   (tm_ra[6:0]),     // [6:0] transpose memory read address
        .ren       (tm_re),          // output
        .regen     (tm_regen),       // output reg
        .tdin      (tm_out[15:0]),   // [15:0] - data from transpose memory
        .endv      (pre_first_out),
        .dv        (dv),             // data output valid
        .dct2_out  (d_out[12:0]));   // [12:0] output data

    // Transpose buffer. Only 7 address bits are used; the upper 3 address bits are tied to 0.
    // NOTE(review): LOG2WIDTH_* = 4 presumably selects a 16-bit port width and REGISTERS = 1
    // presumably enables the RAM output register - confirm against ram18_var_w_var_r.
    ram18_var_w_var_r #(
        .REGISTERS     (1),
        .LOG2WIDTH_WR  (4),
        .LOG2WIDTH_RD  (4),
        .DUMMY(0)
    ) i_transpose_mem (
        .rclk      (clk), // input
        .raddr     ({3'b0,tm_ra[6:0]}), // input[9:0]
        .ren       (tm_re), // input
        .regen     (tm_regen), // input
        .data_out  (tm_out[15:0]), // output[15:0]
        .wclk      (clk), // input
        .waddr     ({3'b0,tm_wa[6:0]}), // input[9:0]
        .we        (tm_we), // input
        .web       (4'hf), // input[3:0]
        .data_in   (tm_di[15:0]) // input[15:0]
    );

endmodule

// 01/24/2004: Moved all clocks in stage 1 to "negedge" to reduce current pulses

// dct393_stage1 - first 1-D DCT pass.
// Shifts in 8 input samples, forms the four sums/differences of mirrored samples,
// multiplies their magnitudes by unsigned 16-bit coefficients (sign is carried
// separately), adds the four products and writes the rounded 16-bit result to the
// transpose memory. All internal registers are clocked on posedge nclk (= negedge clk);
// the outputs are re-registered on posedge clk at the end of the module.
module dct393_stage1 (
    input             clk,       // system clock, posedge
    input             en,        // when low: clears the write counter and stops toggling/writing
    input             start,     // single-cycle start pulse to replace RST
    input      [ 9:0] xin,       // [9:0] input sample, sign-extended from bit 9 in the adders
    output reg        we,        // write to transpose memory
    output reg [ 6:0] wr_cntr,   // [6:0] transpose memory write address
    output reg [15:0] z_out,     // data to transpose memory
    output reg        page,      // transpose memory page just filled (valid @ done)
    output reg        done);     // last cycle writing to transpose memory - may use after it (move it earlier?)

/* constants: C<k>/S<k> ~= cos/sin(k*pi/16) * 65536, unsigned 16-bit */
    localparam C3= 16'd54491;
    localparam S3= 16'd36410;
    localparam C4= 16'd46341;
    localparam C6= 16'd25080;
    localparam S6= 16'd60547;
    localparam C7= 16'd12785;
    localparam S7= 16'd64277;

    // Current coefficients: bit [16] is the sign (1 = negative), [15:0] is the magnitude
    reg    [16:0] memory1a, memory2a, memory3a, memory4a;

/* 1D section */
/* Note kept from the original (narrower) design; data widths have grown since:
The max value of a pixel after processing (to make their expected mean to zero)
is 127. If all the values in a row are 127, the max value of the product terms
would be (127*2)*(23170/256) and that of z_out_int would be (127*8)*23170/256.
This value divided by 2raised to 8 is equivalent to ignoring the 8 lsb bits of the value */

    reg    [ 9:0] xa0_in, xa1_in, xa2_in, xa3_in, xa4_in, xa5_in, xa6_in, xa7_in;         // input shift register
    reg    [ 9:0] xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg; // snapshot taken on sxregs

    // Magnitude of add_sub<i>a. An 11-bit version was tried (AF2015: "increasing width - was limiting");
    // the current code keeps 10 bits, see the note at addsub<i>a_comp_w below.
    reg    [ 9:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp;

    reg    [10:0] add_sub1a, add_sub2a, add_sub3a, add_sub4a;       // signed sum/difference of mirrored samples
    reg           save_sign1a, save_sign2a, save_sign3a, save_sign4a; // sign of add_sub<i>a, aligned with the magnitude
    reg    [17:0] p1a, p2a, p3a, p4a;                               // signed, scaled products
    wire   [35:0] p1a_all, p2a_all, p3a_all, p4a_all;               // unsigned magnitude * coefficient
    reg           toggleA;                                          // 1 - add, 0 - subtract

    reg    [18:0] z_out_int1, z_out_int2;
    reg    [18:0] z_out_int;
    wire   [15:0] z_out_prelatch;
    reg    [ 2:0] indexi;                                           // selects the coefficient set

/* clks and counters */
    reg    [ 6:0] wr_cntr_prelatch;

/* memory section */
    reg           done_prelatch;
    reg           we_prelatch;
    wire          enwe;
    wire          pre_sxregs;
    reg           sxregs;
    reg           page_prelatch;
    // TODO: See if negedge is needed
    wire          nclk = ~clk; // seems that everything here is running at negedge (and delays too), but not the transpose memory

// to conserve energy by disabling toggleA

    wire          sxregs_d8;
    reg           enable_toggle;

    // The three delay lines below replace SRL16_1 primitives of the earlier design.
    dly_16 #(.WIDTH(1)) i_sxregs_d8 (.clk(nclk),.rst(1'b0), .dly(4'd7), .din(sxregs),     .dout(sxregs_d8));  // dly=7+1
    dly_16 #(.WIDTH(1)) i_pre_sxregs(.clk(nclk),.rst(1'b0), .dly(4'd6), .din(start),      .dout(pre_sxregs)); // dly=6+1
    dly_16 #(.WIDTH(1)) i_enwe      (.clk(nclk),.rst(1'b0), .dly(4'd5), .din(pre_sxregs), .dout(enwe));       // dly=5+1

    // Sequencing: write counter, write enable, sample-capture strobe, add/sub toggle, coefficient index
    always @ (posedge nclk) begin
        // toggling is enabled from sxregs until the delayed sxregs (sxregs_d8) arrives without a new sxregs
        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));

        done_prelatch<= (wr_cntr_prelatch[5:0]==6'h3f);

        // remember which page (counter bit 6) is being completed
        if (wr_cntr_prelatch[5:0]==6'h3f) page_prelatch <= wr_cntr_prelatch[6];

        // writing starts on enwe and continues until the last word of the 64-word page
        we_prelatch<= enwe || (en && we_prelatch && (wr_cntr_prelatch[5:0]!=6'h3f));

        if     (!en) wr_cntr_prelatch <= 7'b0;
        else if (we_prelatch) wr_cntr_prelatch <= wr_cntr_prelatch + 1;

        // first strobe comes from the delayed start, the next ones once per 8 writes (not after the last row)
        sxregs <= pre_sxregs || ((wr_cntr_prelatch[2:0]==3'h1) && (wr_cntr_prelatch[5:3]!=3'h7));

        toggleA <= sxregs || (enable_toggle && (~toggleA));

        if (sxregs) indexi <= 3'h7;
        else if (enable_toggle) indexi<=indexi+1;
    end


/*  1D-DCT BEGIN */

// store  1D-DCT constant coefficient values for multipliers
// (the trailing 8'd... comments are the 8-bit coefficients of the original design)

    always @ (posedge nclk) begin
        case (indexi)
         0 : begin memory1a <= {1'b0,C4}; //8'd91
                   memory2a <= {1'b0,C4}; //8'd91
                   memory3a <= {1'b0,C4}; //8'd91
                   memory4a <= {1'b0,C4}; //8'd91
             end
         1 : begin memory1a <= {1'b0,S7}; //8'd126;
                   memory2a <= {1'b0,C3}; //8'd106;
                   memory3a <= {1'b0,S3}; //8'd71;
                   memory4a <= {1'b0,C7}; //8'd25;
             end
         2 : begin memory1a <= {1'b0,S6}; //8'd118;
                   memory2a <= {1'b0,C6}; //8'd49;
                   memory3a <= {1'b1,C6}; //-8'd49;
                   memory4a <= {1'b1,S6}; //-8'd118
             end
         3 : begin memory1a <= {1'b0,C3}; // 8'd106;
                   memory2a <= {1'b1,C7}; //-8'd25;
                   memory3a <= {1'b1,S7}; //-8'd126;
                   memory4a <= {1'b1,S3}; //-8'd71;
             end
         4 : begin memory1a <= {1'b0,C4}; // 8'd91;
                   memory2a <= {1'b1,C4}; //-8'd91;
                   memory3a <= {1'b1,C4}; //-8'd91;
                   memory4a <= {1'b0,C4}; // 8'd91;
             end
         5 : begin memory1a <= {1'b0,S3}; // 8'd71;
                   memory2a <= {1'b1,S7}; //-8'd126;
                   memory3a <= {1'b0,C7}; // 8'd25;
                   memory4a <= {1'b0,C3}; // 8'd106;
             end
         6 : begin memory1a <= {1'b0,C6}; // 8'd49;
                   memory2a <= {1'b1,S6}; //-8'd118;
                   memory3a <= {1'b0,S6}; // 8'd118;
                   memory4a <= {1'b1,C6}; //-8'd49;
             end
         7 : begin memory1a <= {1'b0,C7}; // 8'd25;
                   memory2a <= {1'b1,S3}; //-8'd71;
                   memory3a <= {1'b0,C3}; // 8'd106;
                   memory4a <= {1'b1,S7}; //-8'd126;
             end
        endcase
    end

/* input shifted 8 times through a shift register */
// xa0_in will see output registers from posedge, may be replaced by latches if needed - but currently delay is under 5ns
    always @ (posedge nclk) begin
        xa0_in <= xin;
        xa1_in <= xa0_in;
        xa2_in <= xa1_in;
        xa3_in <= xa2_in;
        xa4_in <= xa3_in;
        xa5_in <= xa4_in;
        xa6_in <= xa5_in;
        xa7_in <= xa6_in;
    end

/* shifted inputs registered every 8th clk (on sxregs) */
    always @ (posedge nclk) if (sxregs) begin
        xa0_reg <= xa0_in;
        xa1_reg <= xa1_in;
        xa2_reg <= xa2_in;
        xa3_reg <= xa3_in;
        xa4_reg <= xa4_in;
        xa5_reg <= xa5_in;
        xa6_reg <= xa6_in;
        xa7_reg <= xa7_in;
    end

/* adder / subtractor block: operands are sign-extended by one bit so the result cannot overflow */
// Clocked by posedge nclk like every other register in this module (was written as negedge clk).
    always @ (posedge nclk)
        if (toggleA == 1'b1) begin
            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} + {xa0_reg[9],xa0_reg[9:0]};
            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} + {xa1_reg[9],xa1_reg[9:0]};
            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} + {xa2_reg[9],xa2_reg[9:0]};
            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} + {xa3_reg[9],xa3_reg[9:0]};
        end else begin
            add_sub1a <= {xa7_reg[9],xa7_reg[9:0]} - {xa0_reg[9],xa0_reg[9:0]};
            add_sub2a <= {xa6_reg[9],xa6_reg[9:0]} - {xa1_reg[9],xa1_reg[9:0]};
            add_sub3a <= {xa5_reg[9],xa5_reg[9:0]} - {xa2_reg[9],xa2_reg[9:0]};
            add_sub4a <= {xa4_reg[9],xa4_reg[9:0]} - {xa3_reg[9],xa3_reg[9:0]};
        end

// First valid add_sub appears at the 10th clk (8 clks for shifting inputs,
// 9th clk for registering shifted input and 10th clk for add_sub
// to synchronize the i value to the add_sub value, i value is incremented
// only after 10 clks

// Absolute value of add_sub<i>a. Only bits [9:0] are registered below: bit [10] of the
// magnitude is 0 for every value except 1024, which occurs only when add_sub<i>a == -1024
// (both operands equal to -512). In that single case the stored magnitude wraps to 0.
    wire [10:0] addsub1a_comp_w  = add_sub1a[10]? (-add_sub1a) : add_sub1a;
    wire [10:0] addsub2a_comp_w  = add_sub2a[10]? (-add_sub2a) : add_sub2a;
    wire [10:0] addsub3a_comp_w  = add_sub3a[10]? (-add_sub3a) : add_sub3a;
    wire [10:0] addsub4a_comp_w  = add_sub4a[10]? (-add_sub4a) : add_sub4a;

    // sign and magnitude are registered together so they stay aligned at the multiplier
    always @ (posedge nclk) begin
         save_sign1a     <= add_sub1a[10];
         save_sign2a     <= add_sub2a[10];
         save_sign3a     <= add_sub3a[10];
         save_sign4a     <= add_sub4a[10];
         addsub1a_comp   <= addsub1a_comp_w[9:0];
         addsub2a_comp   <= addsub2a_comp_w[9:0];
         addsub3a_comp   <= addsub3a_comp_w[9:0];
         addsub4a_comp   <= addsub4a_comp_w[9:0];
    end

    // unsigned magnitude * unsigned coefficient magnitude
    assign p1a_all = addsub1a_comp * memory1a[15:0]; // [16] is sign!
    assign p2a_all = addsub2a_comp * memory2a[15:0];
    assign p3a_all = addsub3a_comp * memory3a[15:0];
    assign p4a_all = addsub4a_comp * memory4a[15:0];

    // re-apply the sign: negative when exactly one of (data sign, coefficient sign) is set
    always @ (posedge nclk)
      begin
        p1a <= (save_sign1a ^ memory1a[16]) ? (-p1a_all[26:9]) :(p1a_all[26:9]);
        p2a <= (save_sign2a ^ memory2a[16]) ? (-p2a_all[26:9]) :(p2a_all[26:9]);
        p3a <= (save_sign3a ^ memory3a[16]) ? (-p3a_all[26:9]) :(p3a_all[26:9]);
        p4a <= (save_sign4a ^ memory4a[16]) ? (-p4a_all[26:9]) :(p4a_all[26:9]);
      end

/* Final adder. Adding the outputs of the 4 multipliers (two pipeline steps) */
    always @ (posedge nclk) begin
        z_out_int1 <= ({p1a[17],p1a} + {p2a[17],p2a});
        z_out_int2 <= ({p3a[17],p3a} + {p4a[17],p4a});
        z_out_int <= (z_out_int1 + z_out_int2);
    end

// rounding of the value: drop 3 LSBs, add the highest dropped bit
    assign z_out_prelatch[15:0] = z_out_int[18:3]+ z_out_int[2]; // correct rounding

// outputs from output latches to cross clock edge boundary
    always @ (posedge clk) begin
        z_out[15:0]  <= z_out_prelatch[15:0];
        wr_cntr[6:0] <= wr_cntr_prelatch[6:0];
        done         <= done_prelatch;
        we           <= we_prelatch;
        page         <= page_prelatch;
    end

/* 1D-DCT END */
endmodule


// dct393_stage2 - second 1-D DCT pass.
// Generates the transposed read address for the transpose memory, shifts in 8 words,
// forms the four sums/differences of mirrored words, multiplies their magnitudes by
// unsigned 16-bit coefficients (sign is carried separately), adds the four products and
// outputs the rounded 13-bit result. Everything here is clocked on posedge clk.
module dct393_stage2 (
    input             clk,       // system clock, posedge
    input             en,        // when low: clears en_started, enable_toggle and dv
    input             start,     // stage 1 finished, data available in transpose memory
    input             page,      // transpose memory page finished, valid at start
    output      [6:0] rd_cntr,   // [6:0] transpose memory read address
    output            ren,       // read enable transpose memory
    output reg        regen,     // register enable in transpose memory
    input      [15:0] tdin,      // [15:0] - data from transpose memory, added 6 bit fractional part
    output            endv,      // one cycle ahead of starting (continuing) dv
    output reg        dv,        // data output valid
    output    [12:0]  dct2_out); // [12:0] output data

/* constants: C<k>/S<k> ~= cos/sin(k*pi/16) * 65536, unsigned 16-bit */
    localparam C3= 16'd54491;
    localparam S3= 16'd36410;
    localparam C4= 16'd46341;
    localparam C6= 16'd25080;
    localparam S6= 16'd60547;
    localparam C7= 16'd12785;
    localparam S7= 16'd64277;

    // Current coefficients: bit [16] is the sign (1 = negative), [15:0] is the magnitude
    reg    [16:0] memory1a, memory2a, memory3a, memory4a;

    reg     [2:0] indexi;                                           // selects the coefficient set
/* 2D section */
    reg    [15:0] xb0_in, xb1_in, xb2_in, xb3_in, xb4_in, xb5_in, xb6_in, xb7_in;         // input shift register
    reg    [15:0] xb0_reg, xb1_reg, xb2_reg, xb3_reg, xb4_reg, xb5_reg, xb6_reg, xb7_reg; // snapshot taken on sxregs
    reg    [16:0] add_sub1b, add_sub2b, add_sub3b, add_sub4b;       // signed sum/difference of mirrored words
    reg    [15:0] addsub1b_comp, addsub2b_comp, addsub3b_comp, addsub4b_comp; // magnitude, see addsub<i>b_comp_w
    reg           save_sign1b, save_sign2b, save_sign3b, save_sign4b;
    reg    [18:0] p1b, p2b, p3b, p4b;                               // signed, scaled products
    wire   [35:0] p1b_all, p2b_all, p3b_all, p4b_all;               // unsigned magnitude * coefficient
    reg           toggleB;                                          // 1 - add, 0 - subtract
    reg    [19:0] dct2d_int1, dct2d_int2;
    reg    [20:0] dct_2d_int;
    wire   [12:0] dct_2d_rnd;

// transpose memory read address
    reg    [ 5:0] rd_cntrs;
    reg           rd_page;

// start with the same as stage1
    wire          sxregs;
// to conserve energy by disabling toggleB

    wire          sxregs_d8;
    reg           enable_toggle;
    reg           en_started;
    wire          disdv; // AF2015: was missing

    // The delay lines below replace SRL16 primitives of the earlier design. Three of them were
    // lengthened by one (endv, disdv: 14 -> 15; sxregs: 8 -> 9) relative to the earlier dly_16 version.
    // NOTE(review): presumably this compensates for the transpose memory output register - confirm.
    dly_16 #(.WIDTH(1)) i_endv (.clk(clk),.rst(1'b0), .dly(4'd15), .din(start), .dout(endv));                    // dly=15+1
    dly_16 #(.WIDTH(1)) i_disdv(.clk(clk),.rst(1'b0), .dly(4'd15), .din(rd_cntrs[5:0]==6'h3f), .dout(disdv));    // dly=15+1

    // The earlier version used (rd_cntr[5:3]==3'h0) here, which is the same bits of the
    // transposed address as rd_cntrs[2:0].
    dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd9), .din((rd_cntrs[2:0]==3'h0) && en_started), .dout(sxregs));    // dly=9+1

    dly_16 #(.WIDTH(1)) i_sxregs_d8(.clk(clk),.rst(1'b0), .dly(4'd7), .din(sxregs && en_started), .dout(sxregs_d8));    // dly=7+1

    assign ren = en_started;

    // Sequencing: read window, output valid, add/sub toggle, coefficient index, read counter
    always @ (posedge clk) begin
        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));

        // en_started is high while the 64 words of a page are being read
        if      (!en)                   en_started <= 0;
        else if (start)                 en_started <= 1;
        else if (rd_cntrs[5:0] == 6'h3f) en_started <= 0; // should be after (start) as they happen simultaneously

        regen <= en_started; // memory output register enable follows the read enable by one clock

        // dv is set by endv and cleared by disdv (endv has priority, so back-to-back blocks keep dv high)
        dv <= en && (endv || (dv && ~disdv));

        toggleB <= sxregs || (enable_toggle && (~toggleB));

        if (sxregs) indexi <= 3'h7;
        else if (enable_toggle) indexi<=indexi+1;

        if (start) rd_page <= page;

        if (start) rd_cntrs[5:0] <=6'b0;    // will always count, but that does not matter- What about saving energy ;-) ? Saved...
        else if (rd_cntrs[5:0]!=6'h3f) rd_cntrs[5:0] <= rd_cntrs[5:0]+1;
    end

    // row and column fields of the counter are swapped to read the page transposed
    assign    rd_cntr[6:0]= {rd_page,rd_cntrs[2:0],rd_cntrs[5:3]}; // transposed counter
// duplicate memory<i>a from stage 1
// store  1D-DCT constant coefficient values for multipliers
// (the trailing 8'd... comments are the 8-bit coefficients of the original design)

    always @ (posedge clk) begin
        case (indexi)
         0 : begin memory1a <= {1'b0,C4}; //8'd91
                   memory2a <= {1'b0,C4}; //8'd91
                   memory3a <= {1'b0,C4}; //8'd91
                   memory4a <= {1'b0,C4}; //8'd91
             end
         1 : begin memory1a <= {1'b0,S7}; //8'd126;
                   memory2a <= {1'b0,C3}; //8'd106;
                   memory3a <= {1'b0,S3}; //8'd71;
                   memory4a <= {1'b0,C7}; //8'd25;
             end
         2 : begin memory1a <= {1'b0,S6}; //8'd118;
                   memory2a <= {1'b0,C6}; //8'd49;
                   memory3a <= {1'b1,C6}; //-8'd49;
                   memory4a <= {1'b1,S6}; //-8'd118
             end
         3 : begin memory1a <= {1'b0,C3}; // 8'd106;
                   memory2a <= {1'b1,C7}; //-8'd25;
                   memory3a <= {1'b1,S7}; //-8'd126;
                   memory4a <= {1'b1,S3}; //-8'd71;
             end
         4 : begin memory1a <= {1'b0,C4}; // 8'd91;
                   memory2a <= {1'b1,C4}; //-8'd91;
                   memory3a <= {1'b1,C4}; //-8'd91;
                   memory4a <= {1'b0,C4}; // 8'd91;
             end
         5 : begin memory1a <= {1'b0,S3}; // 8'd71;
                   memory2a <= {1'b1,S7}; //-8'd126;
                   memory3a <= {1'b0,C7}; // 8'd25;
                   memory4a <= {1'b0,C3}; // 8'd106;
             end
         6 : begin memory1a <= {1'b0,C6}; // 8'd49;
                   memory2a <= {1'b1,S6}; //-8'd118;
                   memory3a <= {1'b0,S6}; // 8'd118;
                   memory4a <= {1'b1,C6}; //-8'd49;
             end
         7 : begin memory1a <= {1'b0,C7}; // 8'd25;
                   memory2a <= {1'b1,S3}; //-8'd71;
                   memory3a <= {1'b0,C3}; // 8'd106;
                   memory4a <= {1'b1,S7}; //-8'd126;
             end
        endcase
    end

/* transpose memory data shifted 8 times through a shift register */
    always @ (posedge clk) begin
        xb0_in <= tdin;
        xb1_in <= xb0_in;
        xb2_in <= xb1_in;
        xb3_in <= xb2_in;
        xb4_in <= xb3_in;
        xb5_in <= xb4_in;
        xb6_in <= xb5_in;
        xb7_in <= xb6_in;
    end

/* register inputs, inputs read in every eighth clk*/

    always @ (posedge clk) if (sxregs) begin
        xb0_reg <= xb0_in;
        xb1_reg <= xb1_in;
        xb2_reg <= xb2_in;
        xb3_reg <= xb3_in;
        xb4_reg <= xb4_in;
        xb5_reg <= xb5_in;
        xb6_reg <= xb6_in;
        xb7_reg <= xb7_in;
    end

/* adder / subtractor block: operands are sign-extended by one bit so the result cannot overflow */
    always @ (posedge clk)
        if (toggleB == 1'b1) begin
            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} + {xb0_reg[15],xb0_reg[15:0]};
            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} + {xb1_reg[15],xb1_reg[15:0]};
            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} + {xb2_reg[15],xb2_reg[15:0]};
            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} + {xb3_reg[15],xb3_reg[15:0]};
        end else begin
            add_sub1b <= {xb7_reg[15],xb7_reg[15:0]} - {xb0_reg[15],xb0_reg[15:0]};
            add_sub2b <= {xb6_reg[15],xb6_reg[15:0]} - {xb1_reg[15],xb1_reg[15:0]};
            add_sub3b <= {xb5_reg[15],xb5_reg[15:0]} - {xb2_reg[15],xb2_reg[15:0]};
            add_sub4b <= {xb4_reg[15],xb4_reg[15:0]} - {xb3_reg[15],xb3_reg[15:0]};
        end

// Absolute value of add_sub<i>b. Only bits [15:0] are registered below: bit [16] of the
// magnitude is 0 for every value except 65536, which occurs only when add_sub<i>b == -65536
// (both operands equal to -32768). In that single case the stored magnitude wraps to 0.
    wire [16:0] addsub1b_comp_w  = add_sub1b[16]? (-add_sub1b) : add_sub1b;
    wire [16:0] addsub2b_comp_w  = add_sub2b[16]? (-add_sub2b) : add_sub2b;
    wire [16:0] addsub3b_comp_w  = add_sub3b[16]? (-add_sub3b) : add_sub3b;
    wire [16:0] addsub4b_comp_w  = add_sub4b[16]? (-add_sub4b) : add_sub4b;

    // sign and magnitude are registered together so they stay aligned at the multiplier
    always @ (posedge clk) begin
        save_sign1b    <= add_sub1b[16];
        save_sign2b    <= add_sub2b[16];
        save_sign3b    <= add_sub3b[16];
        save_sign4b    <= add_sub4b[16];
        addsub1b_comp  <= addsub1b_comp_w[15:0];
        addsub2b_comp  <= addsub2b_comp_w[15:0];
        addsub3b_comp  <= addsub3b_comp_w[15:0];
        addsub4b_comp  <= addsub4b_comp_w[15:0];
    end

/* multiply the outputs of the add/sub block with the 8 sets of stored coefficients */
    assign p1b_all = addsub1b_comp * memory1a[15:0]; // MSB [16] is sign!
    assign p2b_all = addsub2b_comp * memory2a[15:0];
    assign p3b_all = addsub3b_comp * memory3a[15:0];
    assign p4b_all = addsub4b_comp * memory4a[15:0];

    // re-apply the sign: negative when exactly one of (data sign, coefficient sign) is set
    always @ (posedge clk) begin
/// Next line was simulated differently in Icarus 0.9 (wrong?) than in Icarus 0.8 (right?)
/// Xilinx probably did as 0.8
/// p1b_all[31:14] - 18-bit number, p1b - 19-bit. in 0.9 (-p1b_all[31:14]) was also 18, not expand to 19 bits, 0.8 - did
///        p1b <= (save_sign1b ^ memory1a[16]) ? (-p1b_all[31:14]) :(p1b_all[31:14]);
/// The 19-bit slice [32:14] below makes the width of the negated operand explicit.
        p1b[18:0] <= (save_sign1b ^ memory1a[16]) ? (-p1b_all[32:14]) :(p1b_all[32:14]);
        p2b[18:0] <= (save_sign2b ^ memory2a[16]) ? (-p2b_all[32:14]) :(p2b_all[32:14]);
        p3b[18:0] <= (save_sign3b ^ memory3a[16]) ? (-p3b_all[32:14]) :(p3b_all[32:14]);
        p4b[18:0] <= (save_sign4b ^ memory4a[16]) ? (-p4b_all[32:14]) :(p4b_all[32:14]);
    end

/* Final adder. Adding the outputs of the 4 multipliers (two pipeline steps, sign-extended by one bit each) */
    always @ (posedge clk) begin
        dct2d_int1 <= ({p1b[18],p1b[18:0]} + {p2b[18],p2b[18:0]});
        dct2d_int2 <= ({p3b[18],p3b[18:0]} + {p4b[18],p4b[18:0]});
        dct_2d_int <= ({dct2d_int1[19],dct2d_int1[19:0]} + {dct2d_int2[19],dct2d_int2[19:0]});
    end

    // rounding: drop 8 LSBs, add the highest dropped bit
    assign dct_2d_rnd[12:0] = dct_2d_int[20:8];
    assign dct2_out[12:0] = dct_2d_rnd[12:0] + dct_2d_int[7];
endmodule