focus_sharp393.v 16.5 KB
/*
** -----------------------------------------------------------------------------**
** focus_sharp393.v
**
** Module to determine focus sharpness on  by integrating
** DCT coefficient, multiplied my 8x8 array and squared
**
** Copyright (C) 2008-2015 Elphel, Inc
**
** -----------------------------------------------------------------------------**
**  ocus_sharp393.v is free software - hardware description language (HDL) code.
** 
**  This program is free software: you can redistribute it and/or modify
**  it under the terms of the GNU General Public License as published by
**  the Free Software Foundation, either version 3 of the License, or
**  (at your option) any later version.
**
**  This program is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**  GNU General Public License for more details.
**
**  You should have received a copy of the GNU General Public License
**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
**
*/
`include "system_defines.vh" 
`timescale 1ns/1ps
//TODO: Modify to work with other modes (now only on color)
module focus_sharp393(
    input             clk,          // pixel clock, posedge
    input             clk2x,        // 2x pixel clock
    input             en,           // enable (0 resets)
    input             mclk,         // system clock to write tables
    input             tser_we,      // enable write to a  table
    input             tser_a_not_d, // address/not data distributed to submodules
    input      [ 7:0] tser_d,       // byte-wide serialized tables address/data to submodules
    input      [ 1:0] mode,         // focus mode (combine image with focus info) - 0 - none, 1 - replace, 2 - combine all,  3 - combine woi
    input             firsti,       // first macroblock
    input             lasti,        // last macroblock
    input      [ 2:0] tni,          // block number in a macronblock - 0..3 - Y, >=4 - color (sync to stb)
    input             stb,          // strobe that writes ctypei, dci
    input             start,        // marks first input pixel (needs 1 cycle delay from previous DCT stage)
    input      [12:0] di,           // [11:0] pixel data in (signed)
    input             quant_ds,     // quantizator ds
    input      [12:0] quant_d,      // [11:0]quantizator data output
    input      [15:0] quant_dc_tdo, // [15:0], MSB aligned coefficient for the DC component (used in focus module)
    output reg [12:0] do,           // [11:0] pixel data out, make timing ignore (valid 1.5 clk earlier that Quantizer output)
    output reg        ds,           // data out strobe (one ahead of the start of dv)
    output reg [31:0] hifreq);      //[31:0])  //  accumulated high frequency components in a frame sub-window

    wire   [15:0] tdo;
    reg    [ 5:0] tba;
    reg    [11:0] wnd_reg; // intermediate register
    reg           wnd_wr;  // writing window
    reg    [ 2:0] wnd_a;   // window register address
     
 // next measured in 8x8 blocks, totalwidth - write one less than needed (i.e. 511 fro the 512-wide window)
 // blocks on the border are included
    reg    [ 8:0] wnd_left;
    reg    [ 8:0] wnd_right;
    reg    [ 8:0] wnd_top;
    reg    [ 8:0] wnd_bottom;
    reg    [ 8:1] wnd_totalwidth;
    reg    [ 3:0] filt_sel0; // select filter number, 0..14 (15 used for window parameters)
    reg    [ 3:0] filt_sel;  // select filter number, 0..14 (15 used for window parameters)
    reg           stren; // strength (visualization)
    reg    [ 2:0] ic;
    reg    [ 2:0] oc;
    wire          first,last; //valid at start (with first di word), switches immediately after
    wire   [ 2:0] tn;
    reg    [39:0] acc_frame;
    reg    [12:0] pre_do;
    reg           pre_ds;
    reg           need_corr_max; // limit output by quant_dc_tdo
    reg    [11:0] fdo; // focus data output
    reg           start_d; //start delayed by 1
    reg    [ 2:0] tn_d; //tn delayed by 1

    wire          out_mono;
    wire          out_window;
    wire   [12:0] combined_qf; 
    wire   [12:0] next_do;
    wire   [12:0] fdo_minus_max;
    reg    [11:0] di_d;
    reg    [11:0] d1; 
    reg     [8:0] start2;
//    reg     [7:0] finish2;
    reg     [6:0] finish2; // bit[7] never used
    reg     [5:0] use_k_dly;
    reg    [23:0] acc_blk; // accumulator for the sum ((a[i]*d[i])^2)
    reg    [22:0] sum_blk; // accumulator for the sum ((a[i]*d[i])^2), copied at block end
    reg           acc_ldval; // value to load to acc_blk: 0 - 24'h0, 1 - 24'h7fffff
    wire          acc_clear=start2[8];
    wire          acc_add=use_k_dly[4];
    wire          acc_corr=use_k_dly[5];
    wire          acc_to_out=finish2[6];      
    wire   [17:0] mult_a;
    wire   [17:0] mult_b;
    wire   [35:0] mult_p;

    reg    [17:0] mult_s; //truncated and saturated (always positive) multiplier result (before calculating squared)
    reg           next_ac; // next will be AC component
    reg           use_coef; // use multiplier for the first operation - DCT coeff. by table elements
    reg           started_luma;// started Luma block 
    reg           luma_dc_out; // 1 cycle ahead of the luma DC component out (optionally combined with the WOI (mode=3))
    reg           luma_dc_acc; // 1 cycle ahead of the luma DC component out (always combined with the WOI)
    reg           was_last_luma;
    reg           copy_acc_frame;
    
    wire          twe;
    wire  [15:0]  tdi;
    wire  [22:0]  ta;
    
    assign        fdo_minus_max[12:0]= {1'b0,fdo[11:0]}-{1'b0,quant_dc_tdo[15:5]};
    assign        combined_qf[12:0]=stren?({quant_d[12:0]}+{1'b0,fdo[11:0]}): //original image plus positive
                                          ({quant_d[12],quant_d[12:1]}+ // half original 
                                           {fdo_minus_max[12],fdo_minus_max[12:1]}); // plus half signed
    assign        next_do[12:0] =  (mode[1:0]==2'h1)?(luma_dc_out?fdo_minus_max[12:0]:13'h0):
                                    ((mode[1] && luma_dc_out )? combined_qf[12:0]: {quant_d[12:0]} );

    always @ (posedge clk) begin
        if (!en) ic[2:0] <= 3'b0;
        else if (stb) ic[2:0] <= ic[2:0]+1;
        if (!en) oc[2:0] <= 3'b0;
        else if (start) oc[2:0] <= oc[2:0]+1;
    end

// writing window parameters in the last bank of a table     
    always @ (posedge mclk) begin
      if (twe) begin
          wnd_reg[11:0] <= tdi[11:0] ;
          wnd_a  <= ta[2:0];
        end
        wnd_wr <= twe && (ta[9:3]==7'h78) ; // first 8 location in the last 64-word bank
        if (wnd_wr) begin
            case (wnd_a[2:0])
              3'h0: wnd_left[8:0]       <= wnd_reg[11:3] ;
              3'h1: wnd_right[8:0]      <= wnd_reg[11:3] ;
              3'h2: wnd_top[8:0]        <= wnd_reg[11:3] ;
              3'h3: wnd_bottom[8:0]     <= wnd_reg[11:3] ;
              3'h4: wnd_totalwidth[8:1] <= wnd_reg[11:4] ;
              3'h5: filt_sel0[3:0]      <= wnd_reg[3:0] ;
              3'h6: stren               <= wnd_reg[0] ;
              default: begin end
            endcase
        end
     end
     
// determine if this block needs to be processed (Y, inside WOI)
     reg  [ 7:0]  mblk_hor; //horizontal macroblock (2x2 blocks) counter
     reg  [ 7:0]  mblk_vert; //vertical macroblock (2x2 blocks) counter
     wire         start_of_line= (first || (mblk_hor[7:0] == wnd_totalwidth[8:1]));
     wire         first_in_macro= (tn[2:0]==3'h0);
     reg          in_woi; // maybe specified as slow

     always @(posedge clk) begin
       if (first_in_macro && start) mblk_hor[7:0] <= start_of_line? 8'h0:(mblk_hor[7:0]+1);
       if (first_in_macro && start && start_of_line) mblk_vert[7:0] <= first? 8'h0:(mblk_vert[7:0]+1);
        start_d <= start;
        tn_d[2:0] <= tn[2:0];
        if (start_d) in_woi <= !tn_d[2] && 
                                               ({mblk_hor [7:0],tn_d[0]} >= wnd_left[8:0]) &&
                                               ({mblk_hor [7:0],tn_d[0]} <= wnd_right[8:0]) &&
                                               ({mblk_vert[7:0],tn_d[1]} >= wnd_top[8:0]) &&
                                               ({mblk_vert[7:0],tn_d[1]} <= wnd_bottom[8:0]);
     end
 
//Will use posedge sclk to balance huffman and system

//    wire clkdiv2;
//    FD i_clkdiv2(.C(clk), .D(!clkdiv2), .Q(clkdiv2));
    
    reg  clkdiv2=0;
    always @ (posedge clk) begin
        clkdiv2 <= ~clkdiv2;
    end
    
    
    reg [2:0] clksync;
    wire      csync=clksync[2];
    always @ (posedge clk2x) begin
       clksync[2:0] <= {(clksync[1]==clksync[0]),clksync[0],clkdiv2};
    end

    always @ (posedge clk) begin
        if (di[11]==di[12]) di_d[11:0] <=di[11:0];
        else di_d[11:0] <= {~di[11],{11{di[11]}}}; //saturate
    end
 
    assign       mult_a[17:0] = use_coef ? {1'b0,tdo[15:0],1'b0}: mult_s[17:0];
    assign      mult_b[17:0] = use_coef ? {d1[10:0],{7{d1[0]}}}: mult_s[17:0];

    always @ (posedge clk2x) begin
        filt_sel[3:0] <= filt_sel0[3:0];
        if (clksync[2]) d1[11:0]<=di_d[11:0];
        start2[8:0] <= {start2[7:0], start && csync};
//        finish2[7:0]<= {finish2[6:0],use_coef && !next_ac};
        finish2[6:0]<= {finish2[5:0],use_coef && !next_ac}; // finish2[7] was never used
        if      (!en || start2[0]) tba[5:0] <= 6'h0;
        else if (!csync && (tba[5:0] != 6'h3f))   tba[5:0] <= tba[5:0] + 1;
//        mult_s[17:0] <= (&mult_p[35:31] || !(&mult_p[35:31]))?mult_p[31:14]:18'h1ffff;
        mult_s[17:0] <= (&mult_p[35:31] || !(|mult_p[35:31]))?mult_p[31:14]:18'h1ffff;
        next_ac <= en && (start2[3] || (next_ac && ((tba[5:0] != 6'h3f) || csync )));
        use_coef <= next_ac && !csync;
        use_k_dly[5:0] <= {use_k_dly[4:0],use_coef};
        acc_ldval <= !(|start2[7:6]);
        if      (acc_clear || (acc_corr && acc_blk[23])) acc_blk[23:0] <= {1'b0,{23{acc_ldval}}};
        else if (acc_add)                                acc_blk[23:0] <= acc_blk[23:0] + mult_p[31:8]; // mult_p[35:8];
        if (acc_to_out) fdo[11:0] <= (|acc_blk[23:20])?12'hfff:acc_blk[19:8]; // positive, 0..0xfff
        if (acc_to_out) sum_blk[22:0] <= acc_blk[22:0]; // accumulator for the sum ((a[i]*d[i])^2), copied at block end
   end

//    acc_blk will (after corr) be always with MSB=0 - max 24'h7fffff
// for image output - max 24'h0fffff->12 bit signed, shifted
// combining output
//assign        combined_qf[12:0]={quant_d[11],quant_d[11:0]}+{fdo[11],fdo[11:0]};

//    SRL16 i_out_mono   (.Q(out_mono),   .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(started_luma)); // timing not critical
//    SRL16 i_out_window (.Q(out_window), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(in_woi)); // timing not critical
    dly_16 #(.WIDTH(1)) i_out_mono(.clk(clk),  .rst(1'b0), .dly(4'd15), .din(started_luma), .dout(out_mono));    // timing not critical
    dly_16 #(.WIDTH(1)) i_out_window(.clk(clk),.rst(1'b0), .dly(4'd15), .din(in_woi),       .dout(out_window));    // timing not critical
    
    always @ (posedge clk) begin
        if (start) started_luma <= !tn[2];
        luma_dc_out <= quant_ds && out_mono && ((mode[1:0]!=3) || out_window);
        luma_dc_acc <= quant_ds && out_mono && out_window;
        was_last_luma <= en && last && out_mono;
        copy_acc_frame <= was_last_luma && !out_mono;
        if (first && first_in_macro) acc_frame[39:0] <= 40'h0;
        else if (luma_dc_acc)        acc_frame[39:0] <= acc_frame[39:0] + sum_blk[22:0];
        if (copy_acc_frame) hifreq[31:0] <= acc_frame[39:8];
        pre_ds <= quant_ds;
        ds <= pre_ds;
        pre_do[12:0] <= next_do[12:0];
        need_corr_max <=luma_dc_out && (mode[1:0]!=2'h0);
        do[12:0] <= (need_corr_max && !pre_do[12] && (pre_do[11] || (pre_do[10:0]>quant_dc_tdo[15:5])) )?
                    {2'b0,quant_dc_tdo[15:5]} :
                    pre_do[12:0];
     end
     
     table_ad_receive #(
        .MODE_16_BITS (1),
        .NUM_CHN      (1)
    ) table_ad_receive_i (
        .clk       (mclk),              // input
        .a_not_d   (tser_a_not_d),      // input
        .ser_d     (tser_d),            // input[7:0] 
        .dv        (tser_we),           // input
        .ta        (ta),                // output[22:0] 
        .td        (tdi),               // output[15:0] 
        .twe       (twe)               // output
    );
     
     
/*   
   MULT18X18SIO #(
      .AREG(1), // Enable the input registers on the A port (1=on, 0=off)
      .BREG(1), // Enable the input registers on the B port (1=on, 0=off)
      .B_INPUT("DIRECT"), // B cascade input "DIRECT" or "CASCADE" 
      .PREG(1)  // Enable the input registers on the P port (1=on, 0=off)
   ) i_focus_mult (
      .BCOUT(), // 18-bit cascade output
      .P(mult_p),    // 36-bit multiplier output
      .A(mult_a),    // 18-bit multiplier input
      .B(mult_b),    // 18-bit multiplier input
      .BCIN(18'h0), // 18-bit cascade input
      .CEA(en), // Clock enable input for the A port
      .CEB(en), // Clock enable input for the B port
      .CEP(en), // Clock enable input for the P port
      .CLK(sclk), // Clock input
      .RSTA(1'b0), // Synchronous reset input for the A port
      .RSTB(1'b0), // Synchronous reset input for the B port
      .RSTP(1'b0)  // Synchronous reset input for the P port
   );
*/
    reg      [35:0] mult_p_r;
    reg      [17:0] mult_a_r;
    reg      [17:0] mult_b_r;
    assign mult_p = mult_p_r;
    always @(posedge clk2x) begin
        mult_a_r <= mult_a;
        mult_b_r <= mult_b;
        mult_p_r <= mult_a_r * mult_b_r;
    end

/*     
     RAM16X1D i_tn0    (.D(tni[0]),.DPO(tn[0]),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
     RAM16X1D i_tn1    (.D(tni[1]),.DPO(tn[1]),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
     RAM16X1D i_tn2    (.D(tni[2]),.DPO(tn[2]),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
     RAM16X1D i_first  (.D(firsti),.DPO(first),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
     RAM16X1D i_last   (.D(lasti), .DPO(last), .A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
*/
    reg      [ 4:0] ram4[0:3];
    always @ (posedge   clk) begin
        ram4[ic[1:0]] <= {lasti,firsti,tni[2:0]};
    end
    assign {last,first,tn[2:0]} =  ram4[oc[1:0]];
// is it correct posedge sclk on rd, negedge on wr and no xclk?
/*
   RAMB16_S18_S18 i_focus_dct_tab (
      .DOA(tdo[15:0]),       // Port A 16-bit Data Output
      .DOPA(),     // Port A 2-bit Parity Output
      .ADDRA({filt_sel[3:0],tba[2:0],tba[5:3]}),   // Port A 10-bit Address Input
      .CLKA(sclk),     // Port A Clock
      .DIA(16'b0),       // Port A 16-bit Data Input
      .DIPA(2'b0),     // Port A 2-bit parity Input
      .ENA(1'b1),       // Port A RAM Enable Input
      .SSRA(1'b0),     // Port A Synchronous Set/Reset Input
      .WEA(1'b0),       // Port A Write Enable Input

      .DOB(), // Port B 16-bit Data Output
      .DOPB(),     // Port B 4-bit Parity Output
      .ADDRB({ta[9:0]}),   // Port B 2-bit Address Input
      .CLKB(!sclk),     // Port B Clock
      .DIB(tdi[15:0]),       // Port B 16-bit Data Input
      .DIPB(2'b0),     // Port-B 2-bit parity Input
      .ENB(1'b1),       // PortB RAM Enable Input
      .SSRB(1'b0),     // Port B Synchronous Set/Reset Input
      .WEB(twe)        // Port B Write Enable Input
   );
*/
    ram18_var_w_var_r #(
        .REGISTERS    (0),
        .LOG2WIDTH_WR (4),
        .LOG2WIDTH_RD (4),
        .DUMMY        (0)
`ifdef PRELOAD_BRAMS
    `include "includes/focus_filt.dat.vh"
`endif
    ) i_focus_dct_tab (
        .rclk         (clk), // input
        .raddr        ({filt_sel[3:0],tba[2:0],tba[5:3]}), // input[9:0] 
        .ren          (1'b1), // input
        .regen        (1'b1), // input
        .data_out     (tdo[15:0]), // output[31:0] 
        .wclk         (mclk), // input
        .waddr        (ta[9:0]), // input[8:0] 
        .we           (twe), // input
        .web          (4'hf), // input[3:0] 
        .data_in      (tdi[15:0]) // input[31:0] 
    );

endmodule