/*
** -----------------------------------------------------------------------------**
** stuffer393.v
**
** Bit stuffer for JPEG encoder
**
** Copyright (C) 2002-2015 Elphel, Inc
**
** -----------------------------------------------------------------------------**
**  stuffer393.v is free software - hardware description language (HDL) code.
** 
**  This program is free software: you can redistribute it and/or modify
**  it under the terms of the GNU General Public License as published by
**  the Free Software Foundation, either version 3 of the License, or
**  (at your option) any later version.
**
**  This program is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**  GNU General Public License for more details.
**
**  You should have received a copy of the GNU General Public License
**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
**
*/
`define debug_compressor
// 08.27.2005 - modified "rdy" - moved register to make it faster.
// 01.22.2004 - fixed bug if flush comes with !rdy (required mod of huffman.v to extend "flush" until ready)
// 02.05.2004 - modified data length output. It is 24 bits now, in bytes, and is output as the last 4 bytes of the
//              data block that is 32-byte DMA page aligned

// running on v8.2i - does not meet constraints with enabled global USE_SYNC_SET yes/auto because set input is slower. Trying to selectively disable it

// s ynthesis attribute use_sync_set of stuffer is no; 
// s ynthesis attribute use_sync_reset of stuffer is no; 
// s ynthesis attribute use_clock_enable of stuffer is no; 
// TODO:
// 1: Add FIFO buffer - with hclk on the read side
// 2: Get rid of imgptr - read addresses from the AFI module
// 3  Add multi-word status transmitter or just status transmit module for each compressor channel (29 bits are OK to read in multiple of 32-byte blocks
// Or make FIFO outside of the stuffer?

// stuffer393 - bit stuffer for the JPEG encoder.
//
// Accepts variable-length codes (d, dl) qualified by stb, packs them MSB-first into
// 16-bit output words (q, qv) and inserts a zero byte after every all-ones (0xff) byte.
// All registers in this module are clocked on the NEGATIVE edge of clk.
//
// Data path:
//   stage1 - 23-bit register holding the input code right-shifted by 0..7 bits so that
//            it lines up with the bits already collected
//   stage2 - 31-bit accumulator; stage2_bits[4] set means at least 16 bits are collected
//            and a word is being output (see qv)
//
// End of frame: flush (or a rising edge on abort) starts "flushing", which pads the
// accumulator to a word boundary. After flush_end the "trailer" phase keeps qv active,
// outputs zeros, the registered timestamp and finally {8'hff, size_count[23:0]},
// with positions selected by the 4-bit output word counter etrax_dma.
// done is asserted after a fixed delay (dly_16 instance below).
//
// Handshake: rdy is a combination of four registers that are pre-calculated one cycle
// ahead; stb is accepted only when rdy is high.
module stuffer393 (
    input              clk,         // 2x pixel clock
    input              en_in,       // enable, 0- reset (other clock domain, needs re-sync)
///    input              reset_data_counters, // reset data transfer counters (only when DMA and compressor are disabled)
    input              flush,       // flush output data (fill byte with 0, long word with 0
    input              abort,       // @ any, extracts 0->1 and flushes
    input              stb,         // input data strobe
    input        [3:0] dl,          // [3:0] number of bits to send, 0 is treated as 16 (see stage1_length)
    input       [15:0] d,           // [15:0] input data to shift (only lower bits are valid)
// time stamping - will copy time at the end of color_first (later than the first hact after vact in the current frame, but before the next one
// and before the data is needed for output 
    input              color_first, // (different clock) only used for timestamp
    input       [31:0] sec,         // [31:0] number of seconds
    input       [19:0] usec,        // [19:0] number of microseconds
    output             rdy,         // enable huffman encoder to proceed. Used as CE for many huffman encoder registers
    // outputs @ negedge clk
    output reg  [15:0] q,           // [15:0] output data
    output reg         qv,          // output data valid
    output             done,        // reset by !en, goes high after some delay after flushing
///    output reg  [23:0] imgptr,      // [23:0]image pointer in 32-byte chunks 
    output reg         flushing,
    output reg         running      // from registering timestamp until done
`ifdef debug_stuffer
,      output reg   [3:0] etrax_dma_r, // [3:0] just for testing
       output reg   [3:0] test_cntr,
       output reg   [7:0] test_cntr1
`endif
);

`ifdef debug_stuffer
    reg           en_d;
`endif
    reg           en; // re-clock en_in to match this clock
    reg     [2:0] abort_r;        // [0] - re-clocked abort, [1] - [0] delayed by one clock, [2] - single-cycle rising edge pulse
    reg           force_flush;    // flush requested by abort, held until flush_end or !en
    
    
    
    reg    [23:1] stage1;         //    stage 1 register (after right-shifting input data by 0..7 - actually left by 7..0)
    wire    [2:0] shift1;        // shift amount for stage 1
    reg     [4:0] stage1_bits;    // number of topmost invalid bits in stage1 register - 2 MSBs, use lower 3  stage2_bits
    reg     [4:0] stage1_length;  // number of bits (1..16) in stage 1 register

    wire          flush_end;      // flushing is active and both stages are empty
    reg           stage1_full;    // stage1 holds data that was not yet moved to stage2
    wire    [7:0] byteMask;       // thermometer mask built from stage2_bits[2:0]
    wire   [31:1] longMask;       // selects, per bit, new data (st2_d) over kept data (dflt_stage2)
    wire   [31:1] dflt_stage2;    // stage2 contents, moved up by 16 when a word is being sent
    wire   [ 2:0] sel;
    wire   [ 1:0] st2m;           // byte lane selector for stage1 -> stage2 transfer
    wire   [31:1] st2_d;          // stage1 bytes rearranged to stage2 byte lanes
    reg    [31:1] stage2;
    reg    [ 4:0] stage2_bits;    // number of valid bits in stage2, [4] - a full 16-bit word is available
    wire          send8h;         // upper output byte is all ones: only 8 bits are consumed, lower output byte is 0
    wire          send8l;         // lower output byte is all ones: zero byte is inserted as the next upper byte
    wire          send8;          // send8h or send8l
    reg           flush_end_delayed;   // update: fixed delay some delay after flush_end to ensure combining with output FIFO empty
    wire          pre_flush_end_delayed;    // some delay after flush_end to ensure combining with output FIFO empty
    reg    [23:0] size_count; //(now will be byte count)
     
// to make it faster - split in parts
    reg           inc_size_count2316; // registered carry from size_count[15:1] to size_count[23:16]
    reg    [ 2:0] size_out;     // start_sizeout delay line
    reg           size_out_over;// only needed with extra 32 bytes of zeroes added.
    reg           busy_eob;     // flushing and sending length
    reg           trailer;      // sending out data length and 32 bytes for ETRAX
    reg           was_trailer;  // sending out data length and 32 bytes for ETRAX

    reg    [ 3:0] etrax_dma;    // count words to make total size multiple of 32 bytes.
                                // Last 4 bytes of data will have actual length in bytes
                                // There will always be at least 4 more bytes (0-es) before length - needed for software
    reg           will_flush;   // next dv will be flushing byte/word
    wire          flush_now;
    wire          start_sizeout; //delay by 2 cycles

    reg           send8h_r;
    reg           send8l_r;

    wire          pre_stage2_bits_3;    // what will be registered to stage2_bits[3];
    wire    [4:3] willbe_stage1_bits;   // what will be registered to stage1_bits[4:3]
    wire    [3:0] sum_lengths;
    reg     [1:0] st2m_r;
    
    reg     [2:0] stb_time;     // color_first re-clocking and edge detection
    reg    [31:0] sec_r;        // output shift register: seconds, then {usec, sec} tail, then 8'hff and size_count
    reg    [19:0] usec_r;
    reg           time_out;
    reg           time_size_out; // q is sourced from sec_r instead of stage2
    wire          start_time_out;
    
// stb_time[2] - single-cycle pulse after color_first goes low 
    reg    [19:0] imgsz32; // current image size in multiples of 32-bytes
    reg           inc_imgsz32;
    // re-clock enable to this clock
    always @ (negedge clk) begin
        en <= en_in;
        // re-clock abort, extract leading edge
        abort_r <= {abort_r[0] & ~abort_r[1], abort_r[0], abort};
        // Only the edge pulse abort_r[2] may request a flush. Testing the whole vector
        // made the request level-sensitive: force_flush could not be cleared by flush_end
        // while abort stayed high, and the first re-clocking register was used in logic.
        if      (!en)        force_flush <= 0;
        else if (abort_r[2]) force_flush <= 1;
        else if (flush_end)  force_flush <= 0;
        
        if      (!en)         running <= 0;
        else if (stb_time[2]) running <= 1;
        else if (flush_end)   running <= 0;
        
    end

    // flushing is started by flush (or force_flush) accepted with rdy and is held until flush_end or !en
    always @ (negedge clk)  begin
        flushing <= en && !flush_end && (((flush || force_flush) && rdy) || flushing);
    end
    
    // bit position after adding the new code: 3 LSBs of the previous position plus the code length (dl==0 means 16)
    wire    [4:0]    pre_stage1_bits;
    assign pre_stage1_bits[4:0]={2'b00,stage1_bits[2:0]} +  {(dl[3:0]==4'b0),dl[3:0]};
    
    always @ (negedge clk)    begin 
        if (!en || flush_end) stage1_bits[4:0] <= 5'b0;
        else if (stb && rdy) stage1_bits <= {(2'b10-pre_stage1_bits[4:3]),pre_stage1_bits[2:0]};
    end

    // stage 1: place the input word at one of 8 bit offsets inside the 23-bit register
    assign shift1[2:0]= stage1_bits[2:0] + dl[2:0];
    always @ (negedge clk) if (stb && rdy)    begin
        case (shift1[2:0])
            0: stage1[23:1]    <= {     d[15:0],7'b0};
            1: stage1[23:1]    <= {1'b0,d[15:0],6'b0};
            2: stage1[23:1]    <= {2'b0,d[15:0],5'b0};
            3: stage1[23:1]    <= {3'b0,d[15:0],4'b0};
            4: stage1[23:1]    <= {4'b0,d[15:0],3'b0};
            5: stage1[23:1]    <= {5'b0,d[15:0],2'b0};
            6: stage1[23:1]    <= {6'b0,d[15:0],1'b0};
            7: stage1[23:1]    <= {7'b0,d[15:0]     };
         endcase
        stage1_length[4:0]    <= {(dl[3:0]==4'b0),dl[3:0]};
    end


//*****************************
    // stage 2 bit counter, in priority order:
    //   send8            - only 8 bits are consumed by the output
    //   flushing padding - partial word is rounded up to a full word (16 bits)
    //   otherwise        - bit 4 is dropped (a word is sent) and stage1_length is added when stage1 is transferred
    always @ (negedge clk) begin
        if (!en) stage2_bits    <= 5'b0;
        else if (send8) stage2_bits[4:0] <= stage2_bits[4:0] - 8;
        else if (flushing && !stage1_full && !stage2_bits[4] && (stage2_bits[3:0]!=4'b0)) stage2_bits[4:0]<=5'h10;    // actual flushing to word size
        else        stage2_bits[4:0]    <= (rdy && stage1_full)? {1'b0,stage2_bits[3:0]}+stage1_length[4:0]:{1'b0,stage2_bits[3:0]};
    end

    // look-ahead of stage2_bits[3] and stage1_bits[4:3], needed one cycle early to register st2m_r
    assign        sum_lengths=stage2_bits[3:0]+stage1_length[3:0];
    assign pre_stage2_bits_3= en &&
                          (send8? (~stage2_bits[3]): (
                                  !(flushing && !stage1_full && !stage2_bits[4] && (stage2_bits[3:0]!=4'b0)) && // not flushing
                                  ((rdy && stage1_full)?sum_lengths[3]:    stage2_bits[3] )
                                  ));
    assign willbe_stage1_bits[4:3]={2{en && !flush_end}} & ((stb && rdy)?(2'b10-pre_stage1_bits[4:3]):stage1_bits[4:3]);
    

// accelerating rdy calculation - making it a register
    // each pre_* wire below is the next-cycle value of the corresponding register or counter bit
    wire       pre_busy_eob=en && !flush_end_delayed && (busy_eob || (flush && rdy));
    wire [4:3] pre_stage2_bits_4_interm1=stage2_bits[4:3]-2'h1;
    wire [4:0] pre_stage2_bits_4_interm2={1'b0,stage2_bits[3:0]}+stage1_length[4:0];
    // next value of stage2_bits[4], same priority order as in the stage2_bits register above
    wire       pre_stage2_bits_4=en && (send8?
                                     (pre_stage2_bits_4_interm1[4]):
                                     ((flushing && !stage1_full && !stage2_bits[4] && (stage2_bits[3:0]!=4'b0))?
                                       (1'b1):
                                       (((rdy && stage1_full))?
                                         (pre_stage2_bits_4_interm2[4]):
                                         (1'b0)
                                       )
                                     )
                              );
    // all-ones test of the byte that is about to be loaded into stage2[31:24]
    wire pre_send8h_r= (( send8h_r  &&  stage2_bits[4])?
                           (&stage2[23:16]):
                           ((!send8l_r  || !stage2_bits[4])?
                             (&((longMask[31:24] & st2_d[31:24]) | (~longMask[31:24] & dflt_stage2[31:24]))):
                             (send8h_r)
                           )
                         );

    // all-ones test of the byte that is about to be loaded into stage2[23:16]
    wire pre_send8l_r= ((( send8h_r || send8l_r) &&  stage2_bits[4] )?
                        (&stage2[15:8]):
                        (&((longMask[23:16] & st2_d[23:16]) | (~longMask[23:16] & dflt_stage2[23:16])))
                       );

//Trying to delay rdy to make more room before it
    reg           rdy_rega;     // stage2 will not hold a full word
    reg           rdy_regb;     // upper output byte will not be all ones
    reg           rdy_regc;     // lower output byte will not be all ones
    reg           rdy_regd;     // not busy with end of block (flush/trailer)
// s ynthesis attribute use_sync_set of {module_name|signal_name|instance_name} [is] no; 
 
   always @ (negedge clk) begin
        rdy_rega <= !pre_stage2_bits_4;
        rdy_regb <= !pre_send8h_r;
        rdy_regc <= !pre_send8l_r;
        rdy_regd <= !pre_busy_eob;
        busy_eob <= pre_busy_eob;
//**********************************
        send8h_r<=pre_send8h_r;
        send8l_r<=pre_send8l_r;
    end
    // ready when not in end-of-block processing and either no word is being output or that word needs no byte stuffing
    assign rdy = (rdy_rega || (rdy_regb && rdy_regc)) && rdy_regd;
    
    assign send8h= send8h_r && stage2_bits[4];
    assign send8l= send8l_r && stage2_bits[4];
    assign send8=stage2_bits[4] && (send8h_r || send8l_r);

    always    @ (negedge clk) begin
        if (!en) stage1_full <= 1'b0;
/* TODO: Make sure it is OK !! 05/12/2010 */
        else if (flushing) stage1_full <= 1'b0; //force flush does not turn off stb, in normal operation flushing is after last stb
        else if (rdy) stage1_full <=stb; //force flush does not turn off stb, in normal operation flushing is after last stb

    end
    // byteMask bit k (7 is the MSB) is set when stage2_bits[2:0] <= k, so bit 0 is always set
    assign    sel[2:0]=stage2_bits[2:0];
    assign    byteMask[7:0]=    {!sel[2] && !sel[1] && !sel[0],
                                 !sel[2] && !sel[1],
                                 !sel[2] && (!sel[1] || !sel[0]),
                                 !sel[2],
                                 !sel[2] || (!sel[1] && !sel[0]),
                                 !sel[2] || !sel[1],
                                 !sel[2] || !sel[1] || !sel[0],
                                 1'b1
                                 };

//TODO: Try to move stage1_full up here, this is the time-limiting path 05.26.2010
    // Upper two bytes are also opened while flushing; st2_d is forced to 0 there (see below),
    // so the unused bits are padded with zeros
    assign    longMask[31:1]={{8{(flushing || stage1_full) && !stage2_bits[3]}} & byteMask[7:0],
                              {8{flushing || stage1_full}} & ({8{!stage2_bits[3]}} | byteMask[7:0]),
                              {8{stage1_full}},
                              {7{stage1_full}}};

    always @ (negedge clk) st2m_r[1:0]<=willbe_stage1_bits[4:3]-{1'b0,pre_stage2_bits_3};
    
    assign    st2m[1:0]=st2m_r[1:0];
    // stage1 bytes rotated to stage2 byte lanes according to st2m; bit 0 of the lowest stage1 byte does not exist and reads as 0
    assign    st2_d[31:1]=    {{8{!flushing || stage1_full}} & (st2m[1]?{stage1[7:1],1'b0}:(st2m[0]? stage1[15:8]:     stage1[23:16])),
                               {8{!flushing || stage1_full}} & (st2m[1]? stage1[23:16]:    (st2m[0]?{stage1[7:1],1'b0}:stage1[15: 8])),
                               st2m[1]? stage1[15: 8]:    {stage1[7:1],1'b0},
                               {stage1[7:1]}};
    assign    dflt_stage2=stage2_bits[4]?{stage2[15:1],16'b0}:{stage2[31:1]};


// stage 2 accumulator
//   send8h - only the upper byte leaves, the rest moves up by 8 bits
//   send8l - both bytes leave and a zero byte is placed in front of the remaining data
//   else   - bitwise merge of new data (st2_d) and kept data (dflt_stage2) under longMask
always @ (negedge clk) begin
    if          (send8h) stage2[31:24] <= stage2[23:16];
    else if (send8l) stage2[31:24] <= 8'h00;
    else                  stage2[31:24] <= (longMask[31:24] & st2_d[31:24]) | (~longMask[31:24] & dflt_stage2[31:24]);
    if          (send8)  stage2[23:16] <= stage2[15:8];
    else                  stage2[23:16] <= (longMask[23:16] & st2_d[23:16]) | (~longMask[23:16] & dflt_stage2[23:16]);

    if          (send8)  stage2[15: 8] <= {stage2[7:1],1'b0};
    else                  stage2[15: 8] <= (longMask[15: 8] & st2_d[15: 8]) | (~longMask[15: 8] & dflt_stage2[15: 8]);

    if          (send8)  stage2[7:  1] <= 7'b0;
    else                  stage2[7:  1] <= (longMask[7: 1] & st2_d[7: 1]) | (~longMask[7: 1] & dflt_stage2[7: 1]);
end

// output stage
    assign   flush_end= !stage2_bits[4] && flushing && !stage1_full && (stage2_bits[3:0]==4'b0);
    assign flush_now= en && (!send8) && (flushing && !stage1_full && !stage2_bits[4]) && !will_flush;
`ifdef debug_stuffer
    reg [3:0] tst_done_dly;
`endif

    always @ (negedge clk) begin
        // re-clock color_first, stb_time[2] is a single-cycle pulse that latches the timestamp
        stb_time[2:0] <= {stb_time[1] & ~stb_time[0], stb_time[0],color_first};
      
        // sec_r doubles as the output shift register: 16 bits per cycle are shifted out during time_size_out,
        // usec_r is shifted in from the top; at start_sizeout it is reloaded with 8'hff and the byte count
        if        (stb_time[2]) sec_r[31:0] <= sec[31:0];
        else if (start_sizeout) sec_r[31:0] <= {8'hff, size_count[23:0]};
        else if (time_size_out) sec_r[31:0] <= {usec_r[15:0],sec_r[31:16]};
        if   (stb_time[2]) usec_r[19:0] <= usec[19:0];
        else if (time_out) usec_r[19:0] <= {16'h0,usec_r[19:16]};
  
 //reset_data_counters; // reset data transfer counters (only when DMA and compressor are disabled)
 
//        if (reset_data_counters ) etrax_dma[3:0] <= 0; // not needed to be reset after frame, and that was wrong (to early)
        if (!en ) etrax_dma[3:0] <= 0; // Now en here waits for flushing to end, so it should not be too early
        else if (qv) etrax_dma[3:0] <= etrax_dma[3:0] + 1;

// just for testing
`ifdef debug_stuffer
        en_d<= en;
        if (en) etrax_dma_r[3:0] <= etrax_dma[3:0];
        if    (done) test_cntr1[7:0] <= 0;
        else if (qv) test_cntr1[7:0] <= test_cntr1[7:0] +1 ; // normally should be one (done 1 ahead of end of qv)
        tst_done_dly[3:0] <= {tst_done_dly[2:0],done};
        if (tst_done_dly[1]) test_cntr[3:0] <= 0;
        else if (qv)         test_cntr[3:0] <= test_cntr[3:0] +1 ;
`endif
 

        // set by size_out[0], held until done - blocks start_time_out from firing again in the same trailer
        size_out_over <= en && (size_out_over?(!done):size_out[0]);
  
        size_out[2:0]<={size_out[1:0],start_sizeout};
        // time_out is active for output words 9..12, time_size_out for words 9..14 of the 16-word block
        time_out <= en && (start_time_out || (time_out && !(etrax_dma[3:2]== 2'h3)));
        time_size_out <= en && (start_time_out || (time_size_out && !(etrax_dma[3:1]== 3'h7)));
  
        // trailer lasts from flush_end until flush_end_delayed
        trailer <= en && (trailer?(!flush_end_delayed):(flush_end));
        was_trailer<=trailer; 
        will_flush <= en && (will_flush?(!qv):(flush_now && (stage2_bits[3:0]!=4'b0)));
        if (flush_now) size_count[0] <= stage2_bits[3] ^ (|stage2_bits[2:0]); // odd number of bytes
        // byte counter, incremented in words (bit 0 is set above); the padded flush word is not counted
        // as a full word when the number of bytes is odd
        if (!en || size_out[2]) size_count[15:1] <= 0;
        else if (!trailer && !was_trailer && qv && (!will_flush || !size_count[0]))  size_count[15:1] <= size_count[15:1]+1;
        inc_size_count2316 <= (!trailer && !was_trailer && qv && (!will_flush || !size_count[0])) && (&size_count[15:1]);
//reset_data_counters instead of !en here?
        if      (!en || size_out[2]) size_count[23:16] <= 0;
        else if (inc_size_count2316) size_count[23:16] <= size_count[23:16]+1;

        qv <= en && (stage2_bits[4] || trailer);
// to make it faster (if needed) use a single register as a source for  q[15:0] in two following lines
        // timestamp/size words are output with bytes swapped; data bytes are zeroed when not valid,
        // lower data byte is the inserted zero when the upper one is all ones (send8h)
        if      (time_size_out)  q[15:0] <= {sec_r[7:0],sec_r[15:8]};
        else                     q[15:0] <= {(stage2_bits[4]?stage2[31:24]:8'b0),
                                             ((stage2_bits[4] && !send8h)? stage2[23:16]:8'b0)};
        inc_imgsz32 <= (etrax_dma[3:0]== 4'h0) && qv;
//reset_data_counters instead of !en here?
//        if (reset_data_counters || done) imgsz32[19:0] <= 0;
        if (!en || done) imgsz32[19:0] <= 0; // now en is just for stuffer, waits for flushing to end
        else if (inc_imgsz32) imgsz32[19:0]<=imgsz32[19:0]+1;

///        if (reset_data_counters) imgptr[23:0] <= 0;
///        else if (done) imgptr[23:0] <= imgptr[23:0]+ imgsz32[19:0];
        
        flush_end_delayed <= en & pre_flush_end_delayed; // en just to prevent optimizing pre_flush_end_delayed+flush_end_delayed into a single SRL16
    end
//start_sizeout
    // timestamp output starts after output word 8 of the trailer block, size output is armed at word 12
    assign start_time_out= qv && trailer && (etrax_dma[3:0]== 4'h8) && !size_out_over;
    assign start_sizeout= time_out && (etrax_dma[3:0]== 4'hc);
// SRL16_1 i_pre_flush_end_delayed (.D(size_out[1]),.Q(pre_flush_end_delayed), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk)); // dly=3+1    // rather arbitrary?
    // dly_16 is defined outside of this file; it is clocked with inverted clk - presumably to match the negedge registers here (confirm against dly_16)
    dly_16 #(.WIDTH(1)) i_pre_flush_end_delayed(.clk(~clk),.rst(1'b0), .dly(14), .din(size_out[1]), .dout(pre_flush_end_delayed));    // dly=14+1 // rather arbitrary?
    assign done = flush_end_delayed;

endmodule