ported all other submodules for the JPEG/JP4 compressor

43f67702 · Andrey Filippov · b040c02d · 43f67702 · 43f67702 · 43f67702
Commit 43f67702 authored Jun 19, 2015 by Andrey Filippov
9 changed files
--- a/compressor_jp/dcc_sync393.v
+++ b/compressor_jp/dcc_sync393.v
+/*******************************************************************************
+ * Module: dcc_sync393
+ * Date:2015-06-17  
+ * Author: andrey     
+ * Description: Synchronises output of DC components
+ * Synchronizes dcc data with dma1 output, adds 16..31 16-bit zero words for Axis DMA
+ * Was not used in late NC353 camera (DMA channel used for IMU logger)
+ *
+ * Copyright (c) 2015 <set up in Preferences-Verilog/VHDL Editor-Templates> .
+ * dcc_sync393.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ *  dcc_sync393.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *******************************************************************************/
+`timescale 1ns/1ps
+
+module  dcc_sync393(
+    input             sclk,          // clock: every register in this module is updated @posedge sclk
+    input             dcc_en,        // run enable, registered into dcc_run (originally: clk rising, sync with start of the frame)
+    input             finish_dcc,    // starts the zero-padding phase (sclk rising)
+    input             dcc_vld,       // dcc_data valid (originally: clk rising)
+    input      [15:0] dcc_data,      // [15:0] DC component data (originally: clk rising)
+    output reg        statistics_dv, // output word valid (sclk)
+    output reg [15:0] statistics_do);// [15:0] output word: dcc_data or 16'h0 padding (sclk)
+
+// Forwards DC component words to the statistics output and, after finish_dcc,
+// keeps statistics_dv active while outputting zero words (the file header states
+// 16..31 zero words are appended for the DMA).
+
+    reg           statistics_we;  // accepted input word; can not be active in two consecutive sclk cycles
+    reg           dcc_run;        // dcc_en delayed by one sclk
+    reg           dcc_finishing;  // zero-padding phase is active
+    reg           skip16;         // output just 16 zero words (data was multiple of 16 words)
+    reg    [ 4:0] dcc_cntr;       // [3:0] - output word counter, [4] - padding phase flag
+
+    // decoded counter states, shared by the registers below
+    wire          low_cnt_is_15 = (dcc_cntr[3:0] == 4'hf);
+    wire          low_cnt_is_0  = (dcc_cntr[3:0] == 4'h0);
+    // counter position that is tested when finish_dcc arrives (depends on a word being output right now)
+    wire          on_16_boundary = statistics_dv ? low_cnt_is_15 : low_cnt_is_0;
+    // toggle request for dcc_cntr[4] during padding
+    wire          flip_msb       = dcc_finishing && (low_cnt_is_15 ^ dcc_cntr[4]);
+    // padding continues until dcc_cntr[4:1] are all ones
+    wire          keep_padding   = dcc_finishing && (dcc_cntr[4:1] != 4'hf);
+
+    // input registration and output word/strobe
+    always @ (posedge sclk) begin
+        dcc_run             <= dcc_en;
+        statistics_we       <= dcc_run && dcc_vld && !statistics_we;
+        statistics_dv       <= dcc_finishing || statistics_we;
+        statistics_do[15:0] <= statistics_we ? dcc_data[15:0] : 16'h0;
+    end
+
+    // word counter and padding control
+    always @ (posedge sclk) begin
+        skip16 <= finish_dcc && on_16_boundary;
+
+        if (!dcc_run)           dcc_cntr[3:0] <= 4'h0;
+        else if (statistics_dv) dcc_cntr[3:0] <= dcc_cntr[3:0] + 1;
+
+        dcc_cntr[4]   <= dcc_run && (flip_msb   || skip16);
+        dcc_finishing <= dcc_run && (finish_dcc || keep_padding);
+    end
+
+endmodule
+
--- a/compressor_jp/encoderDCAC393.v
+++ b/compressor_jp/encoderDCAC393.v
+/*
+** -----------------------------------------------------------------------------**
+** encoderDCAC393.v
+**
+** RLL encoder for JPEG compressor
+**
+** Copyright (C) 2002-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  encoderDCAC393.v is free software - hardware description language (HDL) code.
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+*/
+
+
+// Accepts 13-bit signed data (only 12 bits can be encoded), so the DC difference (to be encoded) is limited (saturated) to 12 bits, not the value itself
+// AC - always limited to 800 .. 7ff
+module encoderDCAC393(
+    input             clk,            // pixel clock, posedge
+    input             en,             // enable (0 resets)
+    input             lasti,          // was "last MCU in a frame" (@ stb)
+    input             first_blocki,   // first block in frame - save fifo write address (@ stb) 
+    input      [ 2:0] comp_numberi,   // [2:0] component number 0..2 in color, 0..3 - in jp4diff, >= 4 - don't use (@ stb) 
+    input             comp_firsti,    // first this component in a frame (reset DC) (@ stb) 
+    input             comp_colori,    // use color - huffman? (@ stb) 
+    input             comp_lastinmbi, // last component in a macroblock (@ stb) is it needed?
+    input             stb,            // strobe that writes lasti, comp_lastinmbi, comp_colori, comp_firsti, comp_numberi to block_mem
+    input      [12:0] zdi,            // [12:0] zigzag-reordered data input (13-bit signed; AC is saturated to 12 bits below)
+    input             first_blockz,   // first block input (@zds)
+    input             zds,            // strobe - one ahead of the DC component output
+    output reg        last,           // copy of lasto, captured when the DC word is being sent (DC_tosend)
+    output reg [15:0] do,             // output word, format is described at the end of the module
+    output reg        dv);            // do valid (pre_dv delayed by one clk)
+
+
+// 8x13  DC storage memory
+    reg    [12:0] dc_mem[7:0];   // previous (restored) DC value, addressed by component number
+    reg    [12:0] dc_diff0, dc_diff; // dc_diff0 - previous DC (or 0 for the first block of a component), dc_diff - current minus previous
+    wire   [11:0] dc_diff_limited=  (dc_diff[12]==dc_diff[11])?
+                                     dc_diff[11:0] :
+                                     {~dc_diff[11],{11{dc_diff[11]}}}; // difference (to be encoded) limited to fit 12 bits
+    reg    [12:0] dc_restored; // corrected DC value of the current block, compensated to fit difference to 12 bits
+    reg    [ 5:0] rll_cntr;    // number of consecutive zero AC coefficients
+    reg    [5:0]  cntr;        // coefficient counter, runs while DCACen is active
+    reg    [11:0] ac_in;       // input data delayed by 2 clk and saturated to 12 bits
+
+    wire          izero=(ac_in[11:0]==12'b0);
+
+    reg    [14:0] val_r;    // DC diff/AC values to be sent out, registered
+
+    reg           DCACen;    // enable DC/AC (2 cycles ahead of do)
+    wire          rll_out;   // output a zero-run (or end of block) word instead of a value
+    wire          pre_dv;    // one cycle ahead of dv
+    reg           was_nonzero_AC; // previous ac_in was non-zero while DCACen was active
+    reg    [12:0] zdi_d;     // zdi delayed by 1 clk
+    reg     [3:0] zds_d;     // zds delayed by 1..4 clk, sequences the DC pipeline
+    wire          DC_tosend=  zds_d[2];
+    wire          pre_DCACen= zds_d[1];
+
+    wire    [2:0] comp_numbero;   // [2:0] component number 0..2 in color, 0..3 - in jp4diff, >= 4 - don't use
+    wire          comp_firsto;    // first this component in a frame (reset DC)
+    wire          comp_coloro;    // use color - huffman?
+    wire          comp_lastinmbo; // last component in a macroblock
+    wire          lasto;          // last macroblock in a frame
+    reg     [2:0] block_mem_ra;      // block_mem read address
+    reg     [2:0] block_mem_wa;      // block_mem write address
+    reg     [2:0] block_mem_wa_save; // write address of the first block in a frame (saved @ stb && first_blocki)
+    reg     [6:0] block_mem[0:7];    // 8x7 memory of per-block attributes, written @stb, read asynchronously
+    wire    [6:0] block_mem_o=block_mem[block_mem_ra[2:0]];
+    
+    // unpack the per-block attributes in the same order they are packed at stb
+    assign comp_numbero[2:0]= block_mem_o[2:0];
+    assign comp_firsto=       block_mem_o[3];
+    assign comp_coloro=       block_mem_o[4];
+    assign comp_lastinmbo=    block_mem_o[5];
+    assign lasto=             block_mem_o[6];
+    // per-block attributes memory: write side advances on stb, read side advances on zds
+    always @ (posedge clk) begin
+        if (stb) block_mem[block_mem_wa[2:0]] <= {lasti, comp_lastinmbi, comp_colori,comp_firsti,comp_numberi[2:0]};
+        if      (!en) block_mem_wa[2:0] <= 3'h0;
+        else if (stb) block_mem_wa[2:0] <= block_mem_wa[2:0] +1;
+
+        if (stb && first_blocki) block_mem_wa_save[2:0] <= block_mem_wa[2:0];
+
+        // for the first block the read address is re-loaded from the saved write address
+        if      (!en) block_mem_ra[2:0] <= 3'h0;
+        else if (zds) block_mem_ra[2:0] <= first_blockz?block_mem_wa_save[2:0]:(block_mem_ra[2:0] +1);
+    end
+
+    // a run word is needed when the run counter is non-zero and either the registered value is an AC
+    // with val_r[12] set (last coefficient) or the incoming coefficient is non-zero
+    assign rll_out= ((val_r[12] && !val_r[14]) || (ac_in[11:0]!=12'b0)) && (rll_cntr[5:0]!=6'b0);
+    assign     pre_dv=rll_out || val_r[14] || was_nonzero_AC;
+
+    // output stage: select DC or AC value, generate output word and strobe
+    always @ (posedge clk) begin
+        val_r[14:0] <={ DC_tosend?
+                            {en,
+                             comp_coloro,
+                             comp_lastinmbo && lasto, // last component's  in a frame DC coefficient
+                             dc_diff_limited[11:0]}:
+                            {2'b0,
+                              (cntr[5:0]==6'h3f),
+                              ac_in[11:0]}}; 
+        was_nonzero_AC <= en && (ac_in[11:0]!=12'b0) && DCACen;
+        if (pre_dv) do <= rll_out? {3'b0,val_r[12],6'b0,rll_cntr[5:0]}:{1'b1,val_r[14:0]};
+        dv    <= pre_dv;
+        // DCACen is started by pre_DCACen and stays active until cntr reaches 6'h3f
+        DCACen    <= en && (pre_DCACen || (DCACen && (cntr[5:0]!=6'h3f)));    // adjust
+        if (!DCACen) cntr[5:0] <=6'b0;
+        else              cntr[5:0] <=cntr[5:0]+1;
+    end
+
+    // input delay/saturation and the zero-run counter
+    always @ (posedge clk) begin
+        zdi_d[12:0] <= zdi[12:0];
+        ac_in[11:0] <= (zdi_d[12]==zdi_d[11])? zdi_d[11:0]:{~zdi_d[11],{11{zdi_d[11]}}};  // always // delay + saturation
+        
+        // the second condition is always true when reached (the first branch already covers !DCACen)
+        if (DC_tosend || !izero || !DCACen) rll_cntr[5:0]    <= 6'h0;
+        else if (DCACen) rll_cntr[5:0]    <= rll_cntr[5:0] +1 ;
+        if (DC_tosend) last <= lasto;
+    end
+
+// DC components
+// zds_d[0]: read previous DC of this component (0 for the first block of the component)
+// zds_d[1]: difference between the current (delayed) input and the previous DC
+// zds_d[2]: restore the DC value from the previous DC and the 12-bit limited difference
+// zds_d[3]: store the restored DC value for the next block of the same component
+    always @ (posedge clk) begin
+        zds_d[3:0]           <= {zds_d[2:0], zds};
+        if (zds_d[0])   dc_diff0[12:0] <= comp_firsto?13'b0:dc_mem[comp_numbero[2:0]];
+        if (zds_d[1])   dc_diff [12:0] <= zdi_d[12:0]-dc_diff0[12:0];
+        if (zds_d[2])   dc_restored[12:0] <=  dc_diff0[12:0] + {dc_diff_limited[11],dc_diff_limited[11:0]};
+        if (zds_d[3])   dc_mem[comp_numbero[2:0]]   <= dc_restored[12:0];
+    end
+
+// Generate output stream to facilitate huffman encoding. The data will go to FIFO (16x) to compensate for possible long Huffman codes
+// and/or zero-byte insertions
+// format:
+// {2'b11, color,last block,      dc[11:0]} - DC data
+// {2'b10, 1'b0, last coeff,      ac[11:0]} - AC data (last coeff is set if it is last- 63-rd AC coefficient)
+// {2'h00, 2'b00,      6'b0,rll[ 5:0]} - RLL zeroes.
+// {2'h00, 2'b01,      6'b0,rll[ 5:0]} - end of block. lower 6 bits will have length that should be ignored
+
+endmodule
--- a/compressor_jp/focus_sharp393.v
+++ b/compressor_jp/focus_sharp393.v
+/*
+** -----------------------------------------------------------------------------**
+** focus_sharp393.v
+**
+** Module to determine focus sharpness by integrating
+** DCT coefficients, multiplied by 8x8 array and squared
+**
+** Copyright (C) 2008-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  focus_sharp393.v is free software - hardware description language (HDL) code.
+** 
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+*/
+`timescale 1ns/1ps
+//TODO: Modify to work with other modes (now only on color)
+module focus_sharp393(
+    input             clk,          // pixel clock, posedge
+    input             en,           // enable (0 resets)
+    input             sclk,         // system clock:  twe, ta,tdi - valid @negedge (ra, tdi - 2 cycles ahead)
+    input             twe,          // enable write to a table
+    input      [ 9:0] ta,           // [9:0]  table address
+    input      [15:0] tdi,          // [15:0] table data in (8 LSBs - quantization data)
+    input      [ 1:0] mode,         // focus mode (combine image with focus info) - 0 - none, 1 - replace, 2 - combine all,  3 - combine woi
+    input             firsti,       // first macroblock
+    input             lasti,        // last macroblock
+    input      [ 2:0] tni,          // block number in a macronblock - 0..3 - Y, >=4 - color (sync to stb)
+    input             stb,          // strobe that writes lasti, firsti, tni to the 4-entry attribute memory
+    input             start,        // marks first input pixel (needs 1 cycle delay from previous DCT stage)
+    input      [12:0] di,           // [12:0] pixel data in (signed), saturated to 12 bits inside
+    input             quant_ds,     // quantizator ds
+    input      [12:0] quant_d,      // [12:0] quantizator data output
+    input      [15:0] quant_dc_tdo, // [15:0], MSB aligned coefficient for the DC component (used in focus module)
+    output reg [12:0] do,           // [12:0] pixel data out, make timing ignore (valid 1.5 clk earlier that Quantizer output)
+    output reg        ds,           // data out strobe (one ahead of the start of dv)
+    output reg [31:0] hifreq);        //[31:0])  //  accumulated high frequency components in a frame sub-window
+
+// Overview (as implemented below):
+//  - each DCT coefficient (di) is multiplied by a table element (tdo), the product is truncated/saturated
+//    (mult_s), squared by the same multiplier and accumulated per block (acc_blk -> sum_blk, fdo);
+//  - sum_blk of the luma blocks inside the window is accumulated per frame (acc_frame -> hifreq);
+//  - depending on mode, the DC output of luma blocks is replaced by/combined with fdo.
+// Differences from the straight port of the primitives (see the commented-out Xilinx instances kept below):
+//  - the attribute memory is written only at stb (RAM16X1D had .WE(stb));
+//  - the table memory is read @posedge sclk and written with twe (RAMB16 had .CLKA(sclk), .WEB(twe));
+//  - the multiplier is signed and uses en as a clock enable (MULT18X18SIO is a signed multiplier with CEA/CEB/CEP=en);
+//  - mult_s is really saturated (the overflow test used to be always true).
+
+    wire   [15:0] tdo;
+    reg    [ 5:0] tba;
+    reg    [11:0] wnd_reg; // intermediate register
+    reg           wnd_wr;  // writing window
+    reg    [ 2:0] wnd_a;   // window register address
+     
+ // next measured in 8x8 blocks, totalwidth - write one less than needed (i.e. 511 for the 512-wide window)
+ // blocks on the border are included
+    reg    [ 8:0] wnd_left;
+    reg    [ 8:0] wnd_right;
+    reg    [ 8:0] wnd_top;
+    reg    [ 8:0] wnd_bottom;
+    reg    [ 8:1] wnd_totalwidth;
+    reg    [ 3:0] filt_sel0; // select filter number, 0..14 (15 used for window parameters)
+    reg    [ 3:0] filt_sel;  // select filter number, 0..14 (15 used for window parameters)
+    reg           stren; // strength (visualization)
+    reg    [ 2:0] ic;    // attribute memory write counter (increments @stb)
+    reg    [ 2:0] oc;    // attribute memory read counter (increments @start)
+    wire          first,last; //valid at start (with first di word), switches immediately after
+    wire   [ 2:0] tn;
+    reg    [39:0] acc_frame;
+    reg    [12:0] pre_do;
+    reg           pre_ds;
+    reg           need_corr_max; // limit output by quant_dc_tdo
+    reg    [11:0] fdo; // focus data output
+    reg           start_d; //start delayed by 1
+    reg    [ 2:0] tn_d; //tn delayed by 1
+
+    wire          out_mono;
+    wire          out_window;
+    wire   [12:0] combined_qf; 
+    wire   [12:0] next_do;
+    wire   [12:0] fdo_minus_max;
+    reg    [11:0] di_d;
+    reg    [11:0] d1; 
+    reg     [8:0] start2;
+    reg     [7:0] finish2;
+    reg     [5:0] use_k_dly;
+    reg    [23:0] acc_blk; // accumulator for the sum ((a[i]*d[i])^2)
+    reg    [22:0] sum_blk; // accumulator for the sum ((a[i]*d[i])^2), copied at block end
+    reg           acc_ldval; // value to load to acc_blk: 0 - 24'h0, 1 - 24'h7fffff
+    wire          acc_clear=start2[8];
+    wire          acc_add=use_k_dly[4];
+    wire          acc_corr=use_k_dly[5];
+    wire          acc_to_out=finish2[6];      
+    wire   [17:0] mult_a;
+    wire   [17:0] mult_b;
+    wire   [35:0] mult_p;
+
+    reg    [17:0] mult_s; //truncated and saturated (always positive) multiplier result (before calculating squared)
+    reg           next_ac; // next will be AC component
+    reg           use_coef; // use multiplier for the first operation - DCT coeff. by table elements
+    reg           started_luma;// started Luma block 
+    reg           luma_dc_out; // 1 cycle ahead of the luma DC component out (optionally combined with the WOI (mode=3))
+    reg           luma_dc_acc; // 1 cycle ahead of the luma DC component out (always combined with the WOI)
+    reg           was_last_luma;
+    reg           copy_acc_frame;
+    assign        fdo_minus_max[12:0]= {1'b0,fdo[11:0]}-{1'b0,quant_dc_tdo[15:5]};
+    assign        combined_qf[12:0]=stren?({quant_d[12:0]}+{1'b0,fdo[11:0]}): //original image plus positive
+                                          ({quant_d[12],quant_d[12:1]}+ // half original 
+                                           {fdo_minus_max[12],fdo_minus_max[12:1]}); // plus half signed
+    assign        next_do[12:0] =  (mode[1:0]==2'h1)?(luma_dc_out?fdo_minus_max[12:0]:13'h0):
+                                    ((mode[1] && luma_dc_out )? combined_qf[12:0]: {quant_d[12:0]} );
+
+    // write (ic) and read (oc) counters of the block attribute memory
+    always @ (posedge clk) begin
+        if (!en) ic[2:0] <= 3'b0;
+        else if (stb) ic[2:0] <= ic[2:0]+1;
+        if (!en) oc[2:0] <= 3'b0;
+        else if (start) oc[2:0] <= oc[2:0]+1;
+    end
+
+// writing window parameters in the last bank of a table     
+    always @ (negedge sclk) begin
+      if (twe) begin
+          wnd_reg[11:0] <= tdi[11:0] ;
+          wnd_a  <= ta[2:0];
+        end
+        wnd_wr <= twe && (ta[9:3]==7'h78) ; // first 8 location in the last 64-word bank
+        if (wnd_wr) begin
+        case (wnd_a[2:0])
+          3'h0: wnd_left[8:0]       <= wnd_reg[11:3] ;
+          3'h1: wnd_right[8:0]      <= wnd_reg[11:3] ;
+          3'h2: wnd_top[8:0]        <= wnd_reg[11:3] ;
+          3'h3: wnd_bottom[8:0]     <= wnd_reg[11:3] ;
+          3'h4: wnd_totalwidth[8:1] <= wnd_reg[11:4] ;
+          3'h5: filt_sel0[3:0]      <= wnd_reg[3:0] ;
+          3'h6: stren               <= wnd_reg[0] ;
+        endcase
+        end
+     end
+     
+// determine if this block needs to be processed (Y, inside WOI)
+     reg  [ 7:0]  mblk_hor; //horizontal macroblock (2x2 blocks) counter
+     reg  [ 7:0]  mblk_vert; //vertical macroblock (2x2 blocks) counter
+     wire         start_of_line= (first || (mblk_hor[7:0] == wnd_totalwidth[8:1]));
+     wire         first_in_macro= (tn[2:0]==3'h0);
+     reg          in_woi; // maybe specified as slow
+
+     always @(posedge clk) begin
+       if (first_in_macro && start) mblk_hor[7:0] <= start_of_line? 8'h0:(mblk_hor[7:0]+1);
+       if (first_in_macro && start && start_of_line) mblk_vert[7:0] <= first? 8'h0:(mblk_vert[7:0]+1);
+        start_d <= start;
+        tn_d[2:0] <= tn[2:0];
+        // block coordinates are {macroblock counter, bit of the block number}; color blocks (tn_d[2]) are excluded
+        if (start_d) in_woi <= !tn_d[2] && 
+                                               ({mblk_hor [7:0],tn_d[0]} >= wnd_left[8:0]) &&
+                                               ({mblk_hor [7:0],tn_d[0]} <= wnd_right[8:0]) &&
+                                               ({mblk_vert[7:0],tn_d[1]} >= wnd_top[8:0]) &&
+                                               ({mblk_vert[7:0],tn_d[1]} <= wnd_bottom[8:0]);
+     end
+ 
+//Will use posedge sclk to balance huffman and system
+
+//    wire clkdiv2;
+//    FD i_clkdiv2(.C(clk), .D(!clkdiv2), .Q(clkdiv2));
+    
+    reg  clkdiv2=0;
+    always @ (posedge clk) begin
+        clkdiv2 <= ~clkdiv2;
+    end
+    
+    
+    // clkdiv2 re-sampled by sclk; csync marks the sclk cycles when two consecutive samples are equal
+    reg [2:0] clksync;
+    wire      csync=clksync[2];
+    always @ (posedge sclk) begin
+       clksync[2:0] <= {(clksync[1]==clksync[0]),clksync[0],clkdiv2};
+    end
+
+    always @ (posedge clk) begin
+        if (di[11]==di[12]) di_d[11:0] <=di[11:0];
+        else di_d[11:0] <= {~di[11],{11{di[11]}}}; //saturate
+    end
+ 
+    // multiplier is time-shared: coefficient * data when use_coef, otherwise mult_s squared
+    assign       mult_a[17:0] = use_coef ? {1'b0,tdo[15:0],1'b0}: mult_s[17:0];
+    assign      mult_b[17:0] = use_coef ? {d1[10:0],{7{d1[0]}}}: mult_s[17:0];
+
+    always @ (posedge sclk) begin
+        filt_sel[3:0] <= filt_sel0[3:0];
+        if (clksync[2]) d1[11:0]<=di_d[11:0];
+        start2[8:0] <= {start2[7:0], start && csync};
+        finish2[7:0]<= {finish2[6:0],use_coef && !next_ac};
+        if      (!en || start2[0]) tba[5:0] <= 6'h0;
+        else if (!csync && (tba[5:0] != 6'h3f))   tba[5:0] <= tba[5:0] + 1;
+        // mult_p[31:14] is valid only when mult_p[35:31] are all equal (all ones or all zeros),
+        // otherwise the result is saturated. The previous test "&x || !(&x)" was always true,
+        // so the saturation value could never be selected.
+        mult_s[17:0] <= (&mult_p[35:31] || !(|mult_p[35:31]))?mult_p[31:14]:18'h1ffff;
+        next_ac <= en && (start2[3] || (next_ac && ((tba[5:0] != 6'h3f) || csync )));
+        use_coef <= next_ac && !csync;
+        use_k_dly[5:0] <= {use_k_dly[4:0],use_coef};
+        acc_ldval <= !(|start2[7:6]);
+        if      (acc_clear || (acc_corr && acc_blk[23])) acc_blk[23:0] <= {1'b0,{23{acc_ldval}}};
+        else if (acc_add)                                acc_blk[23:0] <= acc_blk[23:0] + mult_p[31:8]; // mult_p[35:8];
+        if (acc_to_out) fdo[11:0] <= (|acc_blk[23:20])?12'hfff:acc_blk[19:8]; // positive, 0..0xfff
+        if (acc_to_out) sum_blk[22:0] <= acc_blk[22:0]; // accumulator for the sum ((a[i]*d[i])^2), copied at block end
+   end
+
+//    acc_blk will (after corr) be always with MSB=0 - max 24'h7fffff
+// for image output - max 24'h0fffff->12 bit signed, shifted
+// combining output
+//assign        combined_qf[12:0]={quant_d[11],quant_d[11:0]}+{fdo[11],fdo[11:0]};
+
+//    SRL16 i_out_mono   (.Q(out_mono),   .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(started_luma)); // timing not critical
+//    SRL16 i_out_window (.Q(out_window), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(in_woi)); // timing not critical
+    dly_16 #(.WIDTH(1)) i_out_mono(.clk(clk),  .rst(1'b0), .dly(15), .din(started_luma), .dout(out_mono));    // timing not critical
+    dly_16 #(.WIDTH(1)) i_out_window(.clk(clk),.rst(1'b0), .dly(15), .din(in_woi),       .dout(out_window));    // timing not critical
+    
+    // output stage (pixel clock): frame accumulator and combining focus data with the quantizer output
+    always @ (posedge clk) begin
+        if (start) started_luma <= !tn[2];
+        luma_dc_out <= quant_ds && out_mono && ((mode[1:0]!=3) || out_window);
+        luma_dc_acc <= quant_ds && out_mono && out_window;
+        was_last_luma <= en && last && out_mono;
+        copy_acc_frame <= was_last_luma && !out_mono;
+        if (first && first_in_macro) acc_frame[39:0] <= 40'h0;
+        else if (luma_dc_acc)        acc_frame[39:0] <= acc_frame[39:0] + sum_blk[22:0];
+        if (copy_acc_frame) hifreq[31:0] <= acc_frame[39:8];
+        pre_ds <= quant_ds;
+        ds <= pre_ds;
+        pre_do[12:0] <= next_do[12:0];
+        need_corr_max <=luma_dc_out && (mode[1:0]!=2'h0);
+        // positive values above quant_dc_tdo[15:5] are limited to it when need_corr_max
+        do[12:0] <= (need_corr_max && !pre_do[12] && (pre_do[11] || (pre_do[10:0]>quant_dc_tdo[15:5])) )?
+                    {2'b0,quant_dc_tdo[15:5]} :
+                    pre_do[12:0];
+     end
+/*   
+   MULT18X18SIO #(
+      .AREG(1), // Enable the input registers on the A port (1=on, 0=off)
+      .BREG(1), // Enable the input registers on the B port (1=on, 0=off)
+      .B_INPUT("DIRECT"), // B cascade input "DIRECT" or "CASCADE" 
+      .PREG(1)  // Enable the input registers on the P port (1=on, 0=off)
+   ) i_focus_mult (
+      .BCOUT(), // 18-bit cascade output
+      .P(mult_p),    // 36-bit multiplier output
+      .A(mult_a),    // 18-bit multiplier input
+      .B(mult_b),    // 18-bit multiplier input
+      .BCIN(18'h0), // 18-bit cascade input
+      .CEA(en), // Clock enable input for the A port
+      .CEB(en), // Clock enable input for the B port
+      .CEP(en), // Clock enable input for the P port
+      .CLK(sclk), // Clock input
+      .RSTA(1'b0), // Synchronous reset input for the A port
+      .RSTB(1'b0), // Synchronous reset input for the B port
+      .RSTP(1'b0)  // Synchronous reset input for the P port
+   );
+*/
+    // Behavioral replacement of the multiplier above: registered inputs and output, clock enable = en.
+    // Operands are declared signed so the 36-bit product is a signed (two's complement) one -
+    // mult_b carries signed data and mult_s may be negative when it is squared.
+    reg signed [35:0] mult_p_r;
+    reg signed [17:0] mult_a_r;
+    reg signed [17:0] mult_b_r;
+    assign mult_p = mult_p_r;
+    always @(posedge sclk) if (en) begin
+        mult_a_r <= mult_a;
+        mult_b_r <= mult_b;
+        mult_p_r <= mult_a_r * mult_b_r;
+    end
+
+/*     
+     RAM16X1D i_tn0    (.D(tni[0]),.DPO(tn[0]),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
+     RAM16X1D i_tn1    (.D(tni[1]),.DPO(tn[1]),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
+     RAM16X1D i_tn2    (.D(tni[2]),.DPO(tn[2]),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
+     RAM16X1D i_first  (.D(firsti),.DPO(first),.A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
+     RAM16X1D i_last   (.D(lasti), .DPO(last), .A0(ic[0]),.A1(ic[1]),.A2(1'b0),.A3(1'b0),.DPRA0(oc[0]),.DPRA1(oc[1]),.DPRA2(1'b0),.DPRA3(1'b0),.WCLK(clk),.WE(stb));
+*/
+    // 4-entry memory of block attributes: written at stb (address ic), read asynchronously (address oc).
+    // The write has to be qualified by stb, otherwise the entry at ic is overwritten on every clk
+    // and an entry that is still waiting to be read can be lost.
+    reg      [ 4:0] ram4[0:3];
+    always @ (posedge   clk) begin
+        if (stb) ram4[ic[1:0]] <= {lasti,firsti,tni[2:0]};
+    end
+    assign {last,first,tn[2:0]} =  ram4[oc[1:0]];
+// is it correct posedge sclk on rd, negedge on wr and no xclk?
+/*
+   RAMB16_S18_S18 i_focus_dct_tab (
+      .DOA(tdo[15:0]),       // Port A 16-bit Data Output
+      .DOPA(),     // Port A 2-bit Parity Output
+      .ADDRA({filt_sel[3:0],tba[2:0],tba[5:3]}),   // Port A 10-bit Address Input
+      .CLKA(sclk),     // Port A Clock
+      .DIA(16'b0),       // Port A 16-bit Data Input
+      .DIPA(2'b0),     // Port A 2-bit parity Input
+      .ENA(1'b1),       // Port A RAM Enable Input
+      .SSRA(1'b0),     // Port A Synchronous Set/Reset Input
+      .WEA(1'b0),       // Port A Write Enable Input
+
+      .DOB(), // Port B 16-bit Data Output
+      .DOPB(),     // Port B 4-bit Parity Output
+      .ADDRB({ta[9:0]}),   // Port B 2-bit Address Input
+      .CLKB(!sclk),     // Port B Clock
+      .DIB(tdi[15:0]),       // Port B 16-bit Data Input
+      .DIPB(2'b0),     // Port-B 2-bit parity Input
+      .ENB(1'b1),       // PortB RAM Enable Input
+      .SSRB(1'b0),     // Port B Synchronous Set/Reset Input
+      .WEB(twe)        // Port B Write Enable Input
+   );
+*/
+    // Table memory, same connections as the RAMB16 above:
+    //  read  - posedge sclk (the address and the consumer of tdo are both in the sclk domain),
+    //  write - negedge sclk (inverted clock), enabled by twe
+    ram18_var_w_var_r #(
+        .REGISTERS    (0),
+        .LOG2WIDTH_WR (4),
+        .LOG2WIDTH_RD (4),
+        .DUMMY        (0)
+    ) i_focus_dct_tab (
+        .rclk         (sclk), // input
+        .raddr        ({filt_sel[3:0],tba[2:0],tba[5:3]}), // input[9:0] 
+        .ren          (1'b1), // input
+        .regen        (1'b1), // input
+        .data_out     (tdo[15:0]), // output[31:0] 
+        .wclk         (!sclk), // input
+        .waddr        ({ta[9:0]}), // input[8:0] 
+        .we           (twe), // input
+        .web          (4'hf), // input[3:0] 
+        .data_in      (tdi[15:0]) // input[31:0] 
+    );
+
+endmodule
+
--- a/compressor_jp/huff_fifo393.v
+++ b/compressor_jp/huff_fifo393.v
+/*
+** -----------------------------------------------------------------------------**
+** huff_fifo393.v
+**
+** Part of Huffman encoder for JPEG compressor - FIFO for Huffman encoder
+**
+** Copyright (C) 2002-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  huff_fifo393.v is free software - hardware description language (HDL) code.
+** 
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+*/
+//used the other edge of the clk2x
+module huff_fifo393 (
+    input             xclk,            // pixel clock, posedge
+    input             xclk2x,          // twice frequency - uses negedge inside
+    input             en,              // will reset if ==0 (sync to xclk)
+    input      [15:0] di,              // data in (sync to xclk)
+    input             ds,              // din valid (sync to xclk)
+    input             want_read,
+    input             want_read_early, 
+    output reg        dav,             // FIFO output latch has data (fifo_or_full)
+    output reg [15:0] q_latch);        // output data
+
+    reg     [9:0] wa;
+    reg     [9:0] sync_wa;    // delayed wa, re_latch-calculated at output clock
+    reg     [9:0] ra_r;
+    reg     [9:0] ra_latch;
+    reg           load_q;
+    wire   [15:0] fifo_o;
+    reg           ds1;    // ds delayed by one xclk to give time to block ram to write data. Not needed likely.
+    reg           synci;
+    reg     [1:0] synco;
+    reg           sync_we; // single xclk2x period pulse for each ds@xclk
+    reg           en2x; // en sync to xclk2x;
+
+    reg           re_r;
+    reg           re_latch;
+    reg           fifo_dav; // RAM output reg has data
+    reg           dav_and_fifo_dav;
+    wire          ram_dav;  // RAM has data inside
+    reg     [9:0] diff_a;
+    wire          next_re;
+
+
+    always @ (posedge xclk) begin // input stage, no overrun detection
+        if (!en)       wa[9:0] <= 10'b0;
+        else if (ds)  wa[9:0] <= wa[9:0]+1;
+        ds1                    <= ds && en;
+        if (!en)      synci   <= 1'b0;
+        else if (ds1) synci   <= ~synci;
+    end
+    always @ (negedge xclk2x) begin
+        en2x <= en;
+        synco[1:0]   <= {synco[0],synci};
+        sync_we      <= en2x && (synco[0] != synco[1]);
+    end
+
+    assign ram_dav= sync_we || (diff_a[9:0] != 10'b0);
+    assign next_re= ram_dav && (!dav_and_fifo_dav || want_read);
+  
+    always @ (negedge xclk2x) begin
+        dav              <= en2x && (fifo_dav || (dav && !want_read));
+        fifo_dav         <= en2x && (ram_dav ||(dav && fifo_dav && !want_read));
+        dav_and_fifo_dav <= en2x && (fifo_dav || (dav && !want_read)) && (ram_dav ||(dav && fifo_dav && !want_read)); // will optimize auto
+        re_r    <= en2x &&  next_re;
+        
+        if (!en2x)                   sync_wa[9:0] <= 10'b0;
+        else if (sync_we)            sync_wa[9:0] <= sync_wa[9:0]+1;
+        
+        if        (!en2x)             ra_r  [9:0] <= 10'b0;
+        else if (next_re)             ra_r  [9:0] <= ra_r[9:0]+1;
+        
+        if (!en2x)                    diff_a[9:0] <= 10'b0;
+        else if (sync_we && !next_re) diff_a[9:0] <= diff_a[9:0]+1;
+        else if (!sync_we && next_re) diff_a[9:0] <= diff_a[9:0]-1; 
+        
+    end
+/*  
+  LD i_re  (.Q(re_latch),.G(xclk2x),.D(next_re));  
+
+  LD i_ra9 (.Q(ra_latch[9]),.G(xclk2x),.D(ra_r[9]));  
+  LD i_ra8 (.Q(ra_latch[8]),.G(xclk2x),.D(ra_r[8]));  
+  LD i_ra7 (.Q(ra_latch[7]),.G(xclk2x),.D(ra_r[7]));  
+  LD i_ra6 (.Q(ra_latch[6]),.G(xclk2x),.D(ra_r[6]));  
+  LD i_ra5 (.Q(ra_latch[5]),.G(xclk2x),.D(ra_r[5]));  
+  LD i_ra4 (.Q(ra_latch[4]),.G(xclk2x),.D(ra_r[4]));  
+  LD i_ra3 (.Q(ra_latch[3]),.G(xclk2x),.D(ra_r[3]));  
+  LD i_ra2 (.Q(ra_latch[2]),.G(xclk2x),.D(ra_r[2]));  
+  LD i_ra1 (.Q(ra_latch[1]),.G(xclk2x),.D(ra_r[1]));  
+  LD i_ra0 (.Q(ra_latch[0]),.G(xclk2x),.D(ra_r[0]));  
+*/  
+    always @* if (xclk2x) re_latch <= next_re;
+    always @* if (xclk2x) ra_latch <= ra_r;
+  
+  
+    always @ (posedge xclk2x) begin
+        load_q <= dav?want_read_early:re_r;
+    end
+/*  
+  LD_1 i_q15 (.Q( q_latch[15]),.G(xclk2x),.D(load_q?fifo_o[15]:q_latch[15]));  
+  LD_1 i_q14 (.Q( q_latch[14]),.G(xclk2x),.D(load_q?fifo_o[14]:q_latch[14]));  
+  LD_1 i_q13 (.Q( q_latch[13]),.G(xclk2x),.D(load_q?fifo_o[13]:q_latch[13]));  
+  LD_1 i_q12 (.Q( q_latch[12]),.G(xclk2x),.D(load_q?fifo_o[12]:q_latch[12]));  
+  LD_1 i_q11 (.Q( q_latch[11]),.G(xclk2x),.D(load_q?fifo_o[11]:q_latch[11]));  
+  LD_1 i_q10 (.Q( q_latch[10]),.G(xclk2x),.D(load_q?fifo_o[10]:q_latch[10]));  
+  LD_1 i_q9  (.Q( q_latch[ 9]),.G(xclk2x),.D(load_q?fifo_o[ 9]:q_latch[ 9]));  
+  LD_1 i_q8  (.Q( q_latch[ 8]),.G(xclk2x),.D(load_q?fifo_o[ 8]:q_latch[ 8]));  
+  LD_1 i_q7  (.Q( q_latch[ 7]),.G(xclk2x),.D(load_q?fifo_o[ 7]:q_latch[ 7]));  
+  LD_1 i_q6  (.Q( q_latch[ 6]),.G(xclk2x),.D(load_q?fifo_o[ 6]:q_latch[ 6]));  
+  LD_1 i_q5  (.Q( q_latch[ 5]),.G(xclk2x),.D(load_q?fifo_o[ 5]:q_latch[ 5]));  
+  LD_1 i_q4  (.Q( q_latch[ 4]),.G(xclk2x),.D(load_q?fifo_o[ 4]:q_latch[ 4]));  
+  LD_1 i_q3  (.Q( q_latch[ 3]),.G(xclk2x),.D(load_q?fifo_o[ 3]:q_latch[ 3]));  
+  LD_1 i_q2  (.Q( q_latch[ 2]),.G(xclk2x),.D(load_q?fifo_o[ 2]:q_latch[ 2]));  
+  LD_1 i_q1  (.Q( q_latch[ 1]),.G(xclk2x),.D(load_q?fifo_o[ 1]:q_latch[ 1]));  
+  LD_1 i_q0  (.Q( q_latch[ 0]),.G(xclk2x),.D(load_q?fifo_o[ 0]:q_latch[ 0]));  
+*/
+    always @* if (~xclk2x) begin
+        if (load_q) q_latch <= fifo_o;
+    end
+/*
+   RAMB16_S18_S18 i_fifo (
+                          .DOA(),            // Port A 16-bit Data Output
+                          .DOPA(),           // Port A 2-bit Parity Output
+                          .ADDRA(wa[9:0]),   // Port A 10-bit Address Input
+                          .CLKA(xclk),       // Port A Clock
+                          .DIA(di[15:0]),    // Port A 16-bit Data Input
+                          .DIPA(2'b0),       // Port A 2-bit parity Input
+                          .ENA(ds),          // Port A RAM Enable Input
+                          .SSRA(1'b0),       // Port A Synchronous Set/Reset Input
+                          .WEA(1'b1),        // Port A Write Enable Input
+
+                          .DOB(fifo_o[15:0]),// Port B 16-bit Data Output
+                          .DOPB(),           // Port B 2-bit Parity Output
+                          .ADDRB(ra_latch[9:0]),   // Port B 10-bit Address Input
+                          .CLKB(xclk2x),        // Port B Clock
+                          .DIB(16'b0),       // Port B 16-bit Data Input
+                          .DIPB(2'b0),       // Port-B 2-bit parity Input
+                          .ENB(re_latch),          // PortB RAM Enable Input
+                          .SSRB(1'b0),       // Port B Synchronous Set/Reset Input
+                          .WEB(1'b0)         // Port B Write Enable Input
+                          );
+*/
+
+    ram18_var_w_var_r #(
+        .REGISTERS    (0),
+        .LOG2WIDTH_WR (4),
+        .LOG2WIDTH_RD (4),
+        .DUMMY        (0)
+    ) i_fifo (
+        .rclk         (xclk2x),        // input
+        .raddr        (ra_latch[9:0]), // input[9:0] 
+        .ren          (re_latch),      // input
+        .regen        (1'b1),          // input
+        .data_out     (fifo_o[15:0]),  // output[15:0] 
+        .wclk         (xclk),          // input
+        .waddr        (wa[9:0]),       // input[9:0] 
+        .we           (ds),            // input
+        .web          (4'hf),          // input[3:0] 
+        .data_in      (di[15:0])       // input[15:0] 
+    );
+ 
+endmodule
--- a/compressor_jp/huffman393.v
+++ b/compressor_jp/huffman393.v
+/*
+** -----------------------------------------------------------------------------**
+** huffman393.v
+**
+** Huffman encoder for JPEG compressor
+**
+** Copyright (C) 2002-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  huffman393 is free software - hardware description language (HDL) code.
+** 
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+*/
+// 01/22/2004 - extended flush until ready (modified stuffer.v too)
+module huffman393    ( // Huffman encoder: reads RLL-prepared words through a FIFO and outputs code bits with their length
+    input             xclk,            // pixel clock, sync to incoming data
+    input             xclk2x,          // twice frequency - uses negedge inside
+    input             en,              // will reset if ==0 (sync to xclk)
+    input             sclk,            // clock to write tables (NOW posedge) AF2015
+    input             twe,             // enable write to a table - now the following will be valid at negedge sclk
+    input       [8:0] ta,              // [8:0]  table address
+    input      [15:0] tdi,             // [15:0] table data in
+    input      [15:0] di,              // [15:0]    specially RLL prepared 16-bit data (to FIFO) (sync to xclk)
+    input             ds,              // di valid strobe  (sync to xclk)
+    input             rdy,             // receiver (bit stuffer) is ready to accept data
+    output reg [15:0] do,              // [15:0]    output data
+    output reg [ 3:0] dl,              // [3:0] data length (4'h0 means 16)
+    output reg        dv,              // output data valid
+    output reg        flush,           // last block done - flush the rest bits
+    output reg        last_block,      // set when a word with gotLastBlock is read, held until flush (cleared when !en2x)
+    output reg        test_lbw,        // registered (en2x && last_block && gotLastWord)
+    output            gotLastBlock);   // combinatorial: FIFO output word has bits 15, 14 and 12 all set
+/*
+  Module overview (derived from the code below):
+    - input words (di/ds, xclk domain) pass through huff_fifo393; its output word is fifo_o
+    - all encoder registers are clocked at negedge xclk2x, most of them are enabled by stuffer_was_rdy
+    - "steps" is a shift register that sequences processing of each word read from the FIFO
+    - the code/length table RAM (i_htab) is written from sclk (twe, ta, tdi) and read at address haddr
+    - at steps[4] the table output (hcode_latch/hlen_latch) is sent, at steps[5] - the output of
+      varlen_encode393 (var_do/var_dl_late), the latter only if its length is non-zero
+
+  Instantiation example of the previous (pre-393) version of the module, kept for reference:
+
+  huffman i_huffman  (.pclk(clk),      // pixel clock
+                      .clk(clk2x),     // twice frequency - uses negedge inside
+                      .en(cmprs_en),      // enable (0 resets counter) sync to .pclk(clk)
+                      .twe(twhe),      // enable write to a table
+                      .ta(ta[8:0]),      // [8:0]  table address
+                      .tdi(di[15:0]),      // [23:0] table data in (8 LSBs - quantization data, [13:9] zigzag address
+                      .di(enc_do[15:0]),      // [15:0]   specially RLL prepared 16-bit data (to FIFO)
+                      .ds(enc_dv),      // di valid strobe
+                      .rdy(stuffer_rdy),      // receiver (bit stuffer) is ready to accept data
+                      .do(huff_do),      // [15:0]   output data
+                      .dl(huff_dl),      // [3:0]   output width (0==16)
+                      .dv(huff_dv),      // output data valid
+                     .flush(flush),
+                     .last_block(last_block),
+                     .test_lbw(),
+                     .gotLastBlock(test_lbw));   // last block done - flush the rest bits
+
+                     
+*/
+
+    wire   [31:0] tables_out; // Only [19:0] are used
+    reg    [15:0] hcode_latch;    // table output huffman code (1..16 bits)
+    reg    [ 3:0] hlen_latch;        // table - code length only 4 LSBs are used
+    reg    [ 7:0] haddr_r;    // index in huffman table    
+    wire   [ 7:0] haddr_next; // combinatorial next value of the table index, selected by code_typ2
+
+    reg    [ 7:0] haddr70_latch; // table index bits [7:0] after the latch (transparent while xclk2x is high)
+    reg           haddr8_latch;  // table index bit 8 after the latch - selects table by tbsel_YC2/tbsel_YC3
+    wire   [ 8:0] haddr = {haddr8_latch,haddr70_latch};    // index in huffman table     (after latches)
+     
+    wire   [15:0] fifo_o;          // FIFO output word (q_latch of huff_fifo393)
+    reg           stuffer_was_rdy; // registered (!en2x || rdy), enables most of the registers below
+    reg           tables_re_latch; // latched (en2x && rdy), read enable of the table RAM
+    wire          read_next;    // assigned depending on steps (each other cycle for normal codes, each for special 00/F0)
+
+    reg     [5:0] steps;  // shift register sequencing the processing of a word, cleared when !en2x
+// first stage registers 
+    reg     [5:0] rll;    // 2 MSBs - counter to send "f0" codes
+
+// replacing SRL16 with FD as SRL has longer output delay from clock 
+    reg     [3:0] rll1;   // rll[3:0] delayed by 1 enabled cycle
+    reg     [3:0] rll2;   // rll[3:0] delayed by 2 enabled cycles, used in haddr_next
+    reg           typeDC; // gotDC captured when the word was read (will_read)
+    reg           typeAC; // gotAC captured when the word was read (will_read)
+    reg    [11:0] sval;    // signed input value
+
+    wire    [1:0] code_typ0;    // valid at steps[0]
+    reg           tbsel_YC0;    // valid at steps[0] - 0 -Y table, 1 - CbCr
+    reg     [1:0] code_typ1;    // code_typ0 delayed by 1 enabled cycle
+    reg     [1:0] code_typ2;    // code_typ0 delayed by 2 enabled cycles, selects haddr_next
+    reg           code_typ3;    // code_typ2[1] delayed by 1 enabled cycle
+    reg           code_typ4;    // code_typ3 delayed by 1 enabled cycle, gates steps[5]
+    reg           tbsel_YC1;    // tbsel_YC0 delayed by 1 enabled cycle
+    reg           tbsel_YC2;    // tbsel_YC0 delayed by 2 enabled cycles
+    reg           tbsel_YC3;    // tbsel_YC0 delayed by 3 enabled cycles
+
+    reg    [15:0] out_bits;    // bits to send
+    reg     [3:0] out_len;        // length of bits to send (4'h0 means 16)
+    wire          fifo_or_full;    // fifo output register full (dav output of huff_fifo393), used in read_next
+    wire          will_read;       // read_next qualified with stuffer_was_rdy
+    wire   [10:0] var_do;          // code bits from varlen_encode393 (q)
+    wire    [3:0] var_dl;          // code length from varlen_encode393 (l)
+    wire    [3:0] var_dl_late;     // code length from varlen_encode393 (l_late)
+
+    reg           dv0;             // pre_dv captured while stuffer_was_rdy (companion of out_bits/out_len)
+
+    reg           eob;             // registered (read_next && gotEOB)
+    wire          gotDC;           // decoded from fifo_o, see assigns below
+    wire          gotAC;
+    wire          gotRLL;
+    wire          gotEOB;
+    wire          gotLastWord;
+    wire          gotColor;
+
+    wire          want_read; // as will_read, but w/o fifo status
+    reg           ready_to_flush;    // read the last data from fifo
+    reg           en2x; // en sync to xclk2x;
+
+
+    wire          pre_dv;    // output valid before the output registers
+    wire   [15:0] pre_bits;  // output bits before the output registers
+    wire   [ 3:0] pre_len;   // output length before the output registers
+
+    reg           twe_d; // table write enable (twe) delayed by 1 clock
+
+    always @ (negedge xclk2x) en2x <= en; // re-register enable at negedge xclk2x
+    
+    assign gotDC=         fifo_o[15] &&  fifo_o[14];   // fifo_o[15:14]==2'b11
+    assign gotAC=         fifo_o[15] && !fifo_o[14];   // fifo_o[15:14]==2'b10
+    assign gotRLL=        !fifo_o[15] && !fifo_o[12];  // bit 15 clear, bit 12 clear
+    assign gotEOB=        !fifo_o[15] &&  fifo_o[12];  // bit 15 clear, bit 12 set
+    assign gotLastBlock=  fifo_o[15] &&  fifo_o[14] && fifo_o[12]; // same bits as gotDC plus bit 12
+    assign gotLastWord=  !fifo_o[14] &&  fifo_o[12];    // (AC or RLL) and last bit set
+    assign gotColor= fifo_o[13]; // captured to tbsel_YC0 when a DC word is read
+
+    always @(negedge xclk2x) stuffer_was_rdy <= !en2x || rdy; // stuffer ready should be on if !en (move to register?)for now]
+    reg           stuffer_was_rdy_early_latch; // latched (!en2x || rdy), used for want_read_early
+    wire          want_read_early;             // as want_read, but from the latched (earlier) ready
+/*    
+  Previous implementation with latch primitives (same Q/G/D connections as the always blocks below):
+  LD  i_stuffer_was_rdy_early (.Q(stuffer_was_rdy_early_latch),.G(xclk2x),.D(!en2x || rdy));
+  LD  i_tables_re (.Q(tables_re_latch),.G(xclk2x),.D(en2x && rdy));
+*/  
+    always @* if (xclk2x) stuffer_was_rdy_early_latch <= !en2x || rdy; // latch, transparent while xclk2x is high
+    always @* if (xclk2x) tables_re_latch <= en2x && rdy;              // latch, transparent while xclk2x is high
+   
+
+
+    assign read_next= en2x && ((!steps[0] && !rll[5]) || eob ) && fifo_or_full; // fifo will never have data after the last block...
+    assign will_read= stuffer_was_rdy && fifo_or_full && en2x && ((!steps[0] && !rll[5]) || eob ); // fifo will never have data after the last block...
+    assign want_read= stuffer_was_rdy && ((!steps[0] && !rll[5]) || eob ); // for FIFO
+    assign want_read_early= stuffer_was_rdy_early_latch && ((!steps[0] && !rll[5]) || eob ); // for FIFO
+
+    always @ (negedge xclk2x) if (stuffer_was_rdy) begin // frozen while stuffer_was_rdy is low
+        eob <= read_next && gotEOB;// will be 1 only during step[0]
+
+        if (!en2x) steps[5:0]    <= 'b0;
+        else     steps[5:0]    <= {steps[4] && code_typ4, // will be skipped for codes 00/F0
+                                   steps[3:0],
+                                   (read_next && !(gotRLL && (fifo_o[5:4]==2'b00))) || rll[5] }; // will not start if it was <16, waiting for AC
+    end
+    always @ (negedge xclk2x)    begin // not gated by stuffer_was_rdy - it is a term of the expressions instead
+        last_block <= en2x && (last_block?(!flush):(stuffer_was_rdy && will_read && gotLastBlock)); // set on read, held until flush
+        ready_to_flush <= en2x && (ready_to_flush?(!flush):(stuffer_was_rdy && last_block &&  will_read && gotLastWord)); // set on read of the last word while last_block, held until flush
+        test_lbw <= en2x && last_block &&  gotLastWord;
+// did not work if flush was just after not ready?
+        flush    <= en2x &&( flush?(!rdy):(rdy && stuffer_was_rdy && ready_to_flush && !(|steps)) ); // set when ready_to_flush and all steps bits are 0, then held while rdy is low
+    end
+
+
+    always @ (negedge xclk2x) if (will_read) begin // capture fields of the word being read from the FIFO
+        typeDC               <= gotDC;
+        typeAC               <= gotAC;
+        sval[11:0]           <= fifo_o[11:0];
+        if (gotDC) tbsel_YC0 <= gotColor; // table select is updated only by DC words
+    end
+  
+
+    always @ (negedge xclk2x) if (stuffer_was_rdy) begin // run length registers
+        if (!en2x || (read_next && gotAC) || (steps[0] && typeAC))             rll[5:4] <= 2'b0;
+        else if (read_next && gotRLL)                                          rll[5:4] <= fifo_o[5:4];   // load from RLL word
+        else if (rll[5:4]!=2'b00)                                              rll[5:4] <= rll[5:4]-1;    // count down to 0
+        
+        if (!en2x || (read_next && !gotAC && !gotRLL) || (steps[0] && typeAC)) rll[3:0] <= 4'b0;
+        else if (read_next && gotRLL)                                          rll[3:0] <= fifo_o[3:0];   // load from RLL word
+    end
+
+    assign code_typ0={typeDC || (!eob && (rll[5:4]==2'b0)),  // as decoded in haddr_next: 2'b11 - DC, 2'b10 - AC normal code,
+                      typeDC || (!eob && (rll[5:4]!=2'b0))}; // 2'b01 - skip 16 zeros code (F0), 2'b00 - skip to end of block code (00)
+
+    assign haddr_next[7:0] = code_typ2[1]?
+                                        (code_typ2[0]?{var_dl[3:0],4'hf}:       // DC (reusing the spare cells of the AC table)
+                                                      {rll2[3:0],var_dl[3:0]}): // AC normal code
+                                        (code_typ2[0]?8'hf0:                    //skip 16 zeros code
+                                                      8'h00);                   //skip to end of block code
+
+    always @ (negedge xclk2x) if (stuffer_was_rdy && steps[2]) begin    // may be just if (stuffer_was_rdy)
+        haddr_r[7:0]    <= haddr_next[7:0];
+    end
+/*  
+  Previous implementation with latch primitives (same gate and data as the always blocks below):
+  LD  i_haddr_8 (.Q(haddr[8]),.G(xclk2x),.D(stuffer_was_rdy?tbsel_YC2:tbsel_YC3));
+  LD  i_haddr_7 (.Q(haddr[7]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[7]:haddr_r[7]));
+  LD  i_haddr_6 (.Q(haddr[6]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[6]:haddr_r[6]));
+  LD  i_haddr_5 (.Q(haddr[5]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[5]:haddr_r[5]));
+  LD  i_haddr_4 (.Q(haddr[4]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[4]:haddr_r[4]));
+  LD  i_haddr_3 (.Q(haddr[3]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[3]:haddr_r[3]));
+  LD  i_haddr_2 (.Q(haddr[2]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[2]:haddr_r[2]));
+  LD  i_haddr_1 (.Q(haddr[1]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[1]:haddr_r[1]));
+  LD  i_haddr_0 (.Q(haddr[0]),.G(xclk2x),.D((stuffer_was_rdy && steps[2])?haddr_next[0]:haddr_r[0]));
+*/
+//     wire   [ 8:0] haddr = {haddr8_latch,haddr70_latch};    // index in huffman table     (after latches)
+    always @* if (xclk2x) begin // latch, transparent while xclk2x is high
+        if (stuffer_was_rdy) haddr8_latch <= tbsel_YC2;
+        else                 haddr8_latch <= tbsel_YC3; // stalled: tbsel_YC3 holds the previous tbsel_YC2
+    end
+
+    always @* if (xclk2x) begin // latch, transparent while xclk2x is high
+        if (stuffer_was_rdy && steps[2]) haddr70_latch <= haddr_next; // bypass: same condition that loads haddr_r
+        else                             haddr70_latch <= haddr_r;    // otherwise use the registered index
+    end
+
+
+    assign pre_dv =         steps[4] || (steps[5] && (var_dl_late[3:0]!=4'b0)); // steps[5] output is skipped for zero length
+    assign pre_bits[15:0]    = steps[5]?{5'b0,var_do[10:0]}:     hcode_latch[15:0]; // steps[5] - varlen_encode393 bits, otherwise - table code
+    assign pre_len [ 3:0]    = steps[5]?      var_dl_late[ 3:0]: hlen_latch  [3:0]; // steps[5] - varlen_encode393 length, otherwise - table length
+
+    always @ (negedge xclk2x) if (stuffer_was_rdy) begin // keep a copy of pre_* that is not updated while stalled
+        dv0            <= pre_dv;
+        out_bits[15:0] <= pre_bits[15:0];
+        out_len [ 3:0] <= pre_len [ 3:0];
+    end
+    
+    always @ (negedge xclk2x) if (!en2x || rdy) begin // output registers, updated when disabled or when rdy is high
+        dv       <= stuffer_was_rdy? pre_dv:dv0;                     // after a stall use the kept copy (dv0/out_bits/out_len)
+        do[15:0] <= stuffer_was_rdy? pre_bits[15:0]:out_bits[15:0];
+        dl[ 3:0] <= stuffer_was_rdy? pre_len [ 3:0]:out_len [ 3:0];
+    end
+
+
+
+// "Extract shift registers" in synthesis should be off! FD has lower output delay than SRL16
+    always @ (negedge xclk2x) if (stuffer_was_rdy) begin // delay lines, advance only while stuffer_was_rdy
+        code_typ1[1:0] <= code_typ0[1:0];
+        code_typ2[1:0] <= code_typ1[1:0];
+        code_typ3      <= code_typ2[1];
+        code_typ4      <= code_typ3;
+        rll1[3:0]      <= rll[3:0];
+        rll2[3:0]      <= rll1[3:0];
+        tbsel_YC1      <= tbsel_YC0;
+        tbsel_YC2      <= tbsel_YC1;
+        tbsel_YC3      <= tbsel_YC2;
+    end
+  /*
+  Previous implementation with latch primitives (same data as the always blocks below, which are open while xclk2x is low):
+  LD_1 i_hlen3  (.Q( hlen_latch[ 3]),.G(xclk2x),.D(tables_out[19]));  
+  LD_1 i_hlen2  (.Q( hlen_latch[ 2]),.G(xclk2x),.D(tables_out[18]));  
+  LD_1 i_hlen1  (.Q( hlen_latch[ 1]),.G(xclk2x),.D(tables_out[17]));  
+  LD_1 i_hlen0  (.Q( hlen_latch[ 0]),.G(xclk2x),.D(tables_out[16]));  
+  LD_1 i_hcode15(.Q(hcode_latch[15]),.G(xclk2x),.D(tables_out[15]));  
+  LD_1 i_hcode14(.Q(hcode_latch[14]),.G(xclk2x),.D(tables_out[14]));  
+  LD_1 i_hcode13(.Q(hcode_latch[13]),.G(xclk2x),.D(tables_out[13]));  
+  LD_1 i_hcode12(.Q(hcode_latch[12]),.G(xclk2x),.D(tables_out[12]));  
+  LD_1 i_hcode11(.Q(hcode_latch[11]),.G(xclk2x),.D(tables_out[11]));  
+  LD_1 i_hcode10(.Q(hcode_latch[10]),.G(xclk2x),.D(tables_out[10]));  
+  LD_1 i_hcode9 (.Q(hcode_latch[ 9]),.G(xclk2x),.D(tables_out[ 9]));  
+  LD_1 i_hcode8 (.Q(hcode_latch[ 8]),.G(xclk2x),.D(tables_out[ 8]));  
+  LD_1 i_hcode7 (.Q(hcode_latch[ 7]),.G(xclk2x),.D(tables_out[ 7]));  
+  LD_1 i_hcode6 (.Q(hcode_latch[ 6]),.G(xclk2x),.D(tables_out[ 6]));  
+  LD_1 i_hcode5 (.Q(hcode_latch[ 5]),.G(xclk2x),.D(tables_out[ 5]));  
+  LD_1 i_hcode4 (.Q(hcode_latch[ 4]),.G(xclk2x),.D(tables_out[ 4]));  
+  LD_1 i_hcode3 (.Q(hcode_latch[ 3]),.G(xclk2x),.D(tables_out[ 3]));  
+  LD_1 i_hcode2 (.Q(hcode_latch[ 2]),.G(xclk2x),.D(tables_out[ 2]));  
+  LD_1 i_hcode1 (.Q(hcode_latch[ 1]),.G(xclk2x),.D(tables_out[ 1]));  
+  LD_1 i_hcode0 (.Q(hcode_latch[ 0]),.G(xclk2x),.D(tables_out[ 0]));  
+*/  
+    always @* if (~xclk2x) hlen_latch <=  tables_out[19:16]; // latch, transparent while xclk2x is low: code length
+    always @* if (~xclk2x) hcode_latch <= tables_out[15:0];  // latch, transparent while xclk2x is low: code bits
+  
+
+    huff_fifo393 i_huff_fifo (
+        .xclk(xclk), // input
+        .xclk2x(xclk2x), // input
+        .en(en), // input
+        .di(di[15:0]), // input[15:0] data in (sync to xclk)
+        .ds(ds), // input din valid (sync to xclk)
+        .want_read(want_read), // input
+        .want_read_early(want_read_early), // input
+        .dav(fifo_or_full), // output reg FIFO output register has data 
+        .q_latch(fifo_o[15:0])); // output[15:0] reg data (will add extra buffering if needed)
+
+    varlen_encode393 i_varlen_encode(
+        .clk      (xclk2x),           // input
+        .en       (stuffer_was_rdy),  // input  will enable registers. 0 - freeze
+        .start    (steps[0]),         // input
+        .d        (sval[11:0]),       // input[11:0] 12-bit signed
+        .l        (var_dl[ 3:0]),     // output[3:0] reg code length
+        .l_late   (var_dl_late[3:0]), // output[3:0] reg
+        .q        (var_do[10:0]));    // output[10:0] reg code
+                                        
+                                        
+//   always @ (negedge xclk2x) twe_d <= twe;
+    always @ (posedge   sclk) twe_d <= twe; // now in the sclk domain (was negedge xclk2x, see the commented-out line above)
+/*   
+   Previous implementation with the block RAM primitive (write port was clocked by !xclk2x):
+   RAMB16_S18_S36 i_htab (
+                          .DOA(),           // Port A 16-bit Data Output
+                          .DOPA(),          // Port A 2-bit Parity Output
+                          .ADDRA({ta[8:0],twe_d}),  // Port A 10-bit Address Input
+                          .CLKA(!xclk2x),      // Port A Clock
+                          .DIA(tdi[15:0]),  // Port A 16-bit Data Input
+                          .DIPA(2'b0),      // Port A 2-bit parity Input
+                          .ENA(1'b1),       // Port A RAM Enable Input
+                          .SSRA(1'b0),      // Port A Synchronous Set/Reset Input
+                          .WEA(twe | twe_d),// Port A Write Enable Input
+
+                          .DOB({unused[11:0],tables_out[19:0]}),      // Port B 32-bit Data Output
+                          .DOPB(),          // Port B 4-bit Parity Output
+                          .ADDRB(haddr[8:0]),  // Port B 9-bit Address Input
+                          .CLKB(xclk2x),       // Port B Clock
+                          .DIB(32'b0),      // Port B 32-bit Data Input
+                          .DIPB(4'b0),      // Port-B 4-bit parity Input
+                          .ENB(tables_re_latch),  // PortB RAM Enable Input
+                          .SSRB(1'b0),      // Port B Synchronous Set/Reset Input
+                          .WEB(1'b0)        // Port B Write Enable Input
+   );
+*/
+
+    ram18_var_w_var_r #( // code/length table: 16-bit data written from sclk, 32-bit word read at xclk2x
+        .REGISTERS(0),
+        .LOG2WIDTH_WR(4), // presumably log2 of the write data width (16 bits) - confirm in ram18_var_w_var_r
+        .LOG2WIDTH_RD(5), // presumably log2 of the read data width (32 bits) - confirm in ram18_var_w_var_r
+        .DUMMY(0)
+    ) i_htab (
+        .rclk(xclk2x), // input
+        .raddr(haddr[8:0]), // input[8:0] 
+        .ren(tables_re_latch), // input
+        .regen(1'b1), // input
+//        .data_out({unused[11:0],tables_out[19:0]}), // output[31:0] 
+        .data_out(tables_out), // output[31:0] 
+        .wclk(sclk), // input
+        .waddr({ta[8:0],twe_d}), // input[9:0] twe_d is the address LSB
+        .we(twe | twe_d), // input write is enabled during twe and during the next sclk cycle (twe_d)
+        .web(4'hf), // input[3:0] 
+        .data_in(tdi[15:0]) // input[15:0] 
+    );
+endmodule
+
--- a/compressor_jp/jp_channel.v
+++ b/compressor_jp/jp_channel.v
@@ -46,7 +46,10 @@ module  jp_channel#(
    input  [63:0] buf_wdata, // input[63:0] 
    
    input         page_ready_chn,     // single mclk (posedge)
-    output        next_page_chn      // single mclk (posedge): Done with the page in the  buffer, memory controller may read more data 
+    output        next_page_chn,      // single mclk (posedge): Done with the page in the  buffer, memory controller may read more data 
+// statistics data was not used in late nc353    
+    output        statistics_dv,
+    output [15:0] statistics_do
    

 );
@@ -70,7 +73,7 @@ module  jp_channel#(
    wire   [ 9:0] m_cb;               // [9:0] scale for CB - default 0.564 (10'h90)
    wire   [ 9:0] m_cr;               // [9:0] scale for CB - default 0.713 (10'hb6)

-
+    reg    [ 1:0] cmprs_fmode_this;   // focusing/overlay mode

    //TODO: assign next 5 values from converter_type[2:0]
    wire   [ 5:0] mb_w_m1;            // macroblock width minus 1 // 3 LSB not used, SHOULD BE SET to 3'b111
@@ -130,10 +133,11 @@ module  jp_channel#(
    wire          color_first;      // sending first_r MCU (valid @ ds)
    wire          color_last;       // sending last_r MCU (valid @ ds)
 // below signals valid at ds ( 1 later than tn, first_r, last_r)
-    wire    [2:0] yc_nodc_component_num;    //[2:0] - component number (YCbCr: 0 - Y, 1 - Cb, 2 - Cr, JP4: 0-1-2-3 in sequence (depends on shift) 4 - don't use
-    wire          yc_nodc_component_color;  // use color quantization table (YCbCR, jp4diff)
-    wire          color_first;   // first_r this component in a frame (DC absolute, otherwise - difference to previous)
-    wire          yc_nodc_component_lastinmb; // last_r component in a macroblock;
+    wire    [2:0] component_num;    //[2:0] - component number (YCbCr: 0 - Y, 1 - Cb, 2 - Cr, JP4: 0-1-2-3 in sequence (depends on shift) 4 - don't use
+    wire          component_color;  // use color quantization table (YCbCR, jp4diff)
+    wire          component_first;  // first this component in a frame (DC absolute, otherwise - difference to previous)
+    
+    wire          component_lastinmb; // last_r component in a macroblock;



@@ -297,10 +301,10 @@ module  jp_channel#(
        .tn                 (color_tn),         // output[2:0] 
        .first              (color_first),      // output reg 
        .last               (color_last),       // output reg 
-        .component_num      (yc_nodc_component_num), // output[2:0] 
-        .component_color    (yc_nodc_component_color), // output
-        .component_first    (color_first),      // output
-        .component_lastinmb (yc_nodc_component_lastinmb) // output reg 
+        .component_num      (component_num),    // output[2:0] 
+        .component_color    (component_color),  // output
+        .component_first    (component_first),  // output
+        .component_lastinmb (component_lastinmb)// output reg 
    );
 //  wire   [ 9:0] yc_nodc;         // [9:0] data out (4:2:0) (signed, average=0)

@@ -316,7 +320,7 @@ module  jp_channel#(
    reg           first_block_color_after;  // after color conversion,
    reg           first_block_dct;     // after DCT
    wire          first_block_quant;   // after quantizer
-    always @ (posedge clk) begin
+    always @ (posedge xclk) begin
        if (dct_start)   first_block_color_after <= first_block_color;
        if (dct_last_in) first_block_dct   <= first_block_color_after;
    end
@@ -337,10 +341,15 @@ module  jp_channel#(
    wire          quant_start;
    dly_16 #(.WIDTH(1)) i_quant_start (.clk(xclk),.rst(1'b0), .dly(0), .din(dct_pre_first_out), .dout(quant_start));    // dly=0+1
 
-    // TODO: Change interface
+    // TODO: Change interface (first are negedge, twhe - @posedge mclk)
    wire          twqe;
    wire          twce;
-    wire    [8:0] ta; 
+    wire          twfe; // focusing table write enable
+    
+    wire          twhe; // now @posedge mclk
+    
+    
+    wire    [9:0] ta; // some use [8:0]
    wire   [15:0] tdi; 
    
    reg    [ 2:0] cmprs_qpage_this;
@@ -352,10 +361,11 @@ module  jp_channel#(
    reg           dcc_en;
    wire          dccout;
    wire   [ 2:0] hfc_sel;
-    wire          dccvld;

+    wire   [15:0] dccdata; // was not used in late nc353
+    wire          dccvld;  // was not used in late nc353
    
-    always @ (posedge clk) begin
+    always @ (posedge xclk) begin
        if (!dccout) dcc_en <=1'b0;
        else if (dct_start && color_first && (color_tn[2:0]==3'b001)) dcc_en <=1'b1; // 3'b001 - closer to the first "start" in quantizator
    end
@@ -366,9 +376,9 @@ module  jp_channel#(
        .sclk               (mclk),                   // input system clock, twqe, twce, ta,tdi - valid @posedge (ra, tdi - 2 cycles ahead (was negedge)
        .twqe               (twqe),                   // input enable write to a quantization table
        .twce               (twce),                   // input enable write to a coring table
-        .ta                 (ta),                     // input[8:0] table address
+        .ta                 (ta[8:0]),                // input[8:0] table address
        .tdi                (tdi),                    // input[15:0] data in (8 LSBs - quantization data - obsolete?)
-        .ctypei             (yc_nodc_component_color),// input component type input (Y/C)
+        .ctypei             (component_color),        // input component type input (Y/C)
        .dci                (yc_avr),                 // input[8:0] - average value in a block - subtracted before DCT. now normal signed number
        .first_stb          (first_block_color),      // input - this is first stb pulse in a frame
        .stb                (dct_start),              // input - strobe that writes ctypei, dci
@@ -386,12 +396,230 @@ module  jp_channel#(
        .color_first        (color_first),            // input - first MCU in a frame
        .coring_num         (coring_num),             // input[2:0] - coring table pair number (0..7)
        .dcc_vld            (dccvld),                 // output reg  - single cycle when dcc_data is valid
-        .dcc_data           (), // output[15:0] - dc component data out (for reading by software) 
+        .dcc_data           (dccdata[15:0]),          // output[15:0] - dc component data out (for reading by software) 
        .n000               (n000),                   // input[7:0] - number of zero pixels (255 if 256) - to be multiplexed with dcc
        .n255               (n255)                    // input[7:0] - number of 0xff pixels (255 if 256) - to be multiplexed with dcc
    );

+    // focus sharp module calculates amount of high-frequency components and optionally overlays/replaces actual image
+    wire   [12:0] focus_do;     // output[12:0] reg  pixel data out, make timing ignore (valid 1.5 clk earlier that Quantizer output)
+    wire          focus_ds;     // output reg data out strobe (one ahead of the start of dv)
+    wire   [31:0] hifreq;       // output[31:0] reg accumulated high frequency components in a frame sub-window
+    
+    focus_sharp393 focus_sharp393_i (
+        .clk                (xclk),                   // input
+        .en                 (frame_en),               // input 
+        .sclk               (mclk),                   // input system clock:  twe, ta,tdi - valid @negedge (ra, tdi - 2 cycles ahead)
+        .twe                (twfe),                   // input enable write to a table
+        .ta                 (ta[9:0]),                // input[9:0]  table address
+        .tdi                (tdi),                    // input[15:0]  table data in (8 LSBs - quantization data)
+        .mode               (cmprs_fmode_this[1:0]),  // input[1:0] focus mode (combine image with focus info) - 0 - none, 1 - replace, 2 - combine all,  3 - combine woi
+        .firsti             (color_first),            // input first macroblock
+        .lasti              (color_last),             // input last macroblock
+        .tni                (color_tn[2:0]),          // input[2:0] block number in a macronblock - 0..3 - Y, >=4 - color (sync to stb)
+        .stb                (dct_start),              // input strobe that writes ctypei, dci
+        .start              (quant_start),            // input marks first input pixel (needs 1 cycle delay from previous DCT stage)
+        .di                 (dct_out),                // input[12:0] pixel data in (signed)
+        .quant_ds           (quant_ds),               // input quantizator ds
+        .quant_d            (quant_do[12:0]),         // input[12:0] quantizator data output
+        .quant_dc_tdo       (quant_dc_tdo),           // input[15:0] MSB aligned coefficient for the DC component (used in focus module)
+        .do                 (focus_do[12:0]),         // output[12:0] reg  pixel data out, make timing ignore (valid 1.5 clk earlier that Quantizer output)
+        .ds                 (focus_ds),               // output reg data out strobe (one ahead of the start of dv)
+        .hifreq             (hifreq[31:0])            // output[31:0] reg accumulated high frequency components in a frame sub-window
+    );
+
+    // Format DC components to be output as a mini-frame. Was not used in the late NC353 as the dma1 channel was used for IMU instead of dcc
+    reg           pre_finish_dcc;
+    reg           finish_dcc;
+    
+    dcc_sync393 dcc_sync393_i (
+        .sclk               (xclk2x),                // input
+        .dcc_en             (dcc_en),                // input xclk rising, sync with start of the frame
+        .finish_dcc         (finish_dcc),            // input @ sclk rising
+        .dcc_vld            (dccvld),                // input xclk rising
+        .dcc_data           (dccdata[15:0]),         // input[15:0] @clk rising
+        .statistics_dv      (statistics_dv),         // output reg 
+        .statistics_do      (statistics_do[15:0])    // output[15:0] reg @ sclk
+    );
+    
+    wire          enc_last; 
+    wire   [15:0] enc_do; 
+    wire          enc_dv; 
+
+// generate DC data/strobe for the direct output (re) using sdram channel3 buffering
+// encoderDCAC is updated to handle 13-bit signed data instead of the 12-bit. It will limit the values on its own
+    encoderDCAC393 encoderDCAC393_i (
+        .clk                (xclk),                   // input
+        .en                 (frame_en),               // input 
+        .lasti              (color_last),             // input - was "last MCU in a frame" (@ stb)
+        .first_blocki       (first_block_color),      // input - first block in frame - save fifo write address (@ stb)
+        .comp_numberi       (component_num[2:0]),     // input[2:0] - component number 0..2 in color, 0..3 - in jp4diff, >= 4 - don't use (@ stb)
+        .comp_firsti        (component_first),        // input - first this component in a frame (reset DC) (@ stb)
+        .comp_colori        (component_color),        // input - use color - huffman? (@ stb)
+        .comp_lastinmbi     (component_lastinmb),     // input - last component in a macroblock (@ stb) is it needed?
+        .stb                (dct_start),              // input - strobe that writes firsti, lasti, tni,average
+        .zdi                (focus_do[12:0]),         // input[12:0] - zigzag-reordered data input
+        .first_blockz       (first_block_quant),      // input - first block input (@zds)
+        .zds                (focus_ds),               // input - strobe - one ahead of the DC component output
+        .last               (enc_last),               // output reg 
+        .do                 (enc_do[15:0]),           // output[15:0] reg 
+        .dv                 (enc_dv)                  // output reg 
+    );
+
+    wire          last_block;
+    wire          test_lbw;
+    wire          stuffer_rdy; // receiver (bit stuffer) is ready to accept data;
+    wire   [15:0] huff_do;     // output[15:0] reg 
+    wire    [3:0] huff_dl;     // output[3:0] reg 
+    wire          huff_dv;     // output reg 
+    wire          flush;       // output reg 
+
+    huffman393 i_huffman (
+        .xclk               (xclk),                   // input
+        .xclk2x             (xclk2x),                 // input
+        .en                 (frame_en),               // input
+        .sclk               (mclk),                   // input - for writing tables - now @posedge
+        .twe                (twhe),                   // input - for writing tables - now @posedge mclk
+        .ta                 (ta[8:0]),                // input[8:0] - table write address @posedge mclk
+        .tdi                (tdi),                    // input[15:0] - table data in @posedge mclk
+        .di                 (enc_do[15:0]),           // input[15:0] - specially RLL prepared 16-bit data (to FIFO)
+        .ds                 (enc_dv),                 // input -  di valid strobe
+        .rdy                (stuffer_rdy),            // input - receiver (bit stuffer) is ready to accept data
+        .do(huff_do[15:0]), // output[15:0] reg 
+        .dl(huff_dl[3:0]), // output[3:0] reg 
+        .dv(huff_dv), // output reg 
+        .flush(flush), // output reg 
+        .last_block(last_block), // output reg 
+        .test_lbw(), // output reg ??
+        .gotLastBlock(test_lbw) // output ??
+    );
+    
+  /*
+ wire last_block, test_lbw;
+ huffman i_huffman  (.pclk(clk),      // pixel clock
+                      .clk(clk2x),   // twice frequency - uses negedge inside
+                      .en(cmprs_en),      // enable (0 resets counter) sync to .pclk(clk)
+//                      .cwr(cwr),      // CPU WR global clock
+                      .twe(twhe),      // enable write to a table
+                      .ta(ta[8:0]),      // [8:0]  table address
+                      .tdi(di[15:0]),      // [23:0] table data in (8 LSBs - quantization data, [13:9] zigzag address
+                      .di(enc_do[15:0]),      // [15:0]   specially RLL prepared 16-bit data (to FIFO)
+                      .ds(enc_dv),      // di valid strobe
+                      .rdy(stuffer_rdy),      // receiver (bit stuffer) is ready to accept data
+                      .do(huff_do),      // [15:0]   output data
+                      .dl(huff_dl),      // [3:0]   output width (0==16)
+                      .dv(huff_dv),      // output data bvalid
+                     .flush(flush),
+                     .last_block(last_block),
+                     .test_lbw(),
+                     .gotLastBlock(test_lbw));   // last block done - flush the rest bits
+  
+  */  
+    
+    
+    wire   [15:0] stuffer_do;
+    wire          stuffer_dv;
+    wire          stuffer_done;
+    reg           stuffer_done_persist;
+    wire          stuffer_flushing;
+    wire   [23:0] imgptr;
+    wire   [31:0] sec;
+    wire   [19:0] usec;
+    
+    always @ (negedge xclk2x) pre_finish_dcc <= stuffer_done; // sample stuffer_done on the falling edge of xclk2x
+    always @ (posedge xclk2x) finish_dcc     <= pre_finish_dcc; // re-register on the rising edge (dcc_sync393 takes finish_dcc @ sclk rising); stuffer_done - @negedge xclk2x
+
+    stuffer393 stuffer393_i (
+        .clk                 (xclk2x),                 // input clock - uses negedge inside
+        .en                  (cmprs_en_2x_n),          // input
+        .reset_data_counters (reset_data_counters[1]), // input reset data transfer counters (only when DMA and compressor are disabled)
+        .flush               (flush || force_flush),   // input - flush output data (fill byte with 0, long word with FFs
+        .stb                 (huff_dv),                // input
+        .dl                  (huff_dl),                // input[3:0] number of bits to send (0 - 16) (0-16??)
+        .d                   (huff_do),                // input[15:0] data to shift (only lower huff_dl bits are valid)
+// time stamping - will copy time at the end of color_first (later than the first hact after vact in the current frame, but before the next one
+// and before the data is needed for output)
+        .color_first(color_first), // input
+        .sec(sec[31:0]), // input[31:0] 
+        .usec(usec[19:0]), // input[19:0] 
+        .rdy(stuffer_rdy), // output - enable huffman encoder to proceed. Used as CE for many huffman encoder registers
+        .q(stuffer_do), // output[15:0] reg - output data
+        .qv(stuffer_dv), // output reg - output data valid
+        .done(stuffer_done), // output
+        .imgptr(imgptr[23:0]), // output[23:0] reg - image pointer in 32-byte chunks
+        .flushing(stuffer_flushing) // output reg 
+`ifdef debug_stuffer
+       ,.etrax_dma_r(tst_stuf_etrax[3:0]) // [3:0] just for testing
+       ,.test_cntr(test_cntr[3:0])
+       ,.test_cntr1(test_cntr1[7:0])
+`endif
+    );
+    
 /*
+ stuffer   i_stuffer  (.clk(clk2x),         //clock - uses negedge inside
+                     .en(cmprs_en_2x_n),         // enable, 0- reset
+                     .reset_data_counters(reset_data_counters[1]), // reset data transfer counters (only when DMA and compressor are disabled)
+                     .flush(flush || force_flush),      // flush output data (fill byte with 0, long word with FFs
+                     .stb(huff_dv),       // input data strobe
+                     .dl(huff_dl),         // [3:0] number of bits to send (0 - 16)
+                     .d(huff_do),          // [15:0] input data to shift (only lower bits are valid)
+// time stamping - will copy time at the end of color_first (later than the first hact after vact in the current froma, but before the next one
+// and before the data is needed for output 
+                     .color_first(color_first), //
+                     .sec(sec[31:0]),
+                     .usec(usec[19:0]),
+                     .rdy(stuffer_rdy),      // enable huffman encoder to proceed. Used as CE for many huffman encoder registers
+                     .q(stuffer_do),         // [15:0] output data
+                     .qv(stuffer_dv),      // output data valid
+                     .done(stuffer_done),
+                     .imgptr (imgptr[23:0]), // [23:0]image pointer in 32-byte chunks
+                     .flushing(stuffer_flushing)
+`ifdef debug_stuffer
+                     ,.etrax_dma_r(tst_stuf_etrax[3:0]) // [3:0] just for testing
+                     ,.test_cntr(test_cntr[3:0])
+                     ,.test_cntr1(test_cntr1[7:0])
+`endif
+                     );
+
+
+
+dcc_sync i_dcc_sync(//.clk(clk),
+                    .sclk(clk2x),
+                    .dcc_en(dcc_en),                   // clk rising, sync with start of the frame
+                    .finish_dcc(finish_dcc),           // sclk rising
+                    .dcc_vld(dccvld),                 // clk rising
+                    .dcc_data(dccdata[15:0]),         //[15:0] clk risimg
+                    .statistics_dv(statistics_dv),     //sclk
+                    .statistics_do(statistics_do[15:0])//[15:0] sclk
+                 );
+
+
+
+//TODO: compact table                     
+focus_sharp i_focus_sharp(.clk(clk),   // pixel clock
+                   .en(cmprs_en),   // enable (0 resets counter)
+                   .sclk(clk2x), // system clock, twe, ta,tdi - valid @negedge (ra, tdi - 2 cycles ahead
+                   .twe(twfe), // enable write to a table
+                   .ta(ta[9:0]),  // [9:0]  table address
+                   .tdi(di[15:0]),  // [15:0] table data in (8 LSBs - quantization data)
+                   .mode(cmprs_fmode_this[1:0]), // focus mode (combine image with focus info) - 0 - none, 1 - replace, 2 - combine all,  3 - combine woi
+//                   .stren(focus_strength),
+                   .firsti(color_first),  // first macroblock
+                   .lasti(color_last),    // last macroblock
+                   .tni(color_tn[2:0]),   // block number in a macronblock - 0..3 - Y, >=4 - color (sync to stb)
+                   .stb(dct_start),      // strobe that writes ctypei, dci
+                   .start(quant_start),// marks first input pixel (needs 1 cycle delay from previous DCT stage)
+                   .di(dct_out[12:0]),    // [11:0] pixel data in (signed)
+                   .quant_ds(quant_ds), // quantizator data strobe (1 before DC)
+                   .quant_d(quant_do[12:0]), // quantizator data output
+                   .quant_dc_tdo(quant_dc_tdo[15:0]), //[15:0], MSB aligned coefficient for the DC component (used in focus module)
+//                   .quant_dc_tdo_stb(quant_dc_tdo_stb),
+                   .do(focus_do[12:0]),    // [11:0] pixel data out (AC is only 9 bits long?) - changed to 10
+                   .ds(focus_ds),  // data out strobe (one ahead of the start of dv)
+                   .hifreq(hifreq[31:0])  //[31:0])  //  accumulated high frequency components in a frame sub-window
+                   );
+
+
 xdct       i_xdct ( .clk(clk),             // top level module
                     .en(cmprs_en),       // if zero will reset transpose memory page numbers
                     .start(dct_start),    // single-cycle start pulse that goes with the first pixel data. Other 63 should follow

--- a/compressor_jp/quantizer393.v
+++ b/compressor_jp/quantizer393.v
@@ -258,7 +258,7 @@ module quantizer393(
        .rclk         (clk),                          // input
        .raddr        ({tba[9:6],tba[2:0],tba[5:3]}), // input[8:0] 
        .ren          (1'b1),                         // input
-        .regen        (1'b0),                         // input
+        .regen        (1'b1),                         // input
        .data_out     (tdo[15:0]),                    // output[15:0] 
        .wclk         (sclk),                         // input
        .waddr        ({ta[8:0],twqe_d}),             // input[8:0] 
@@ -276,7 +276,7 @@ module quantizer393(
        .rclk         (clk), // input
        .raddr        ({tbac[3:0],qmulr[11:4]}), // input[10:0] 
        .ren          (1'b1), // input
-        .regen        (1'b0), // input
+        .regen        (1'b1), // input
        .data_out     (tdco[3:0]), // output[3:0] 
        .wclk         (sclk), // input
        .waddr        ({ta[8:0],twce_d}), // input[9:0] 
@@ -294,7 +294,7 @@ module quantizer393(
        .rclk         (clk), // input
        .raddr        ({3'b0,rpage,zra[5:0]}), // input[8:0] 
        .ren          (next_dv), // input
-        .regen        (1'b0), // input
+        .regen        (1'b1), // input
        .data_out     (zigzag_q[15:0]), // output[31:0] 
        .wclk         (clk), // input
        .waddr        ({3'b0,wpage,zwa[5:0]}), // input[8:0] 

--- a/compressor_jp/stuffer393.v
+++ b/compressor_jp/stuffer393.v
+/*
+** -----------------------------------------------------------------------------**
+** stuffer393.v
+**etrax_dma
+** Bit stuffer for JPEG encoder
+**
+** Copyright (C) 2002-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  stuffer393.v is free software - hardware description language (HDL) code.
+** 
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+*/
+`define debug_compressor
+// 08.27.2005 - modified "rdy" - moved register to make it faster.
+// 01.22.2004 - fixed bug if flush comes with !rdy (required mod of huffman.v to extend "flush" until ready)
+// 02.05.2004 - modified data length output. It is 24 bit now, in bytes and is output as last 4 bytes in the
+//              data block that is 32-byte DMA page aligned
+
+// running on v8.2i - does not meet constraints with enabled global USE_SYNC_SET yes/auto because set input is slower. Trying to selectively disable it
+
+// s ynthesis attribute use_sync_set of stuffer is no; 
+// s ynthesis attribute use_sync_reset of stuffer is no; 
+// s ynthesis attribute use_clock_enable of stuffer is no; 
+
+
+module stuffer393 (
+    input              clk,            // 2x pixel clock
+    input              en,            // enable, 0- reset
+    input              reset_data_counters, // reset data transfer counters (only when DMA and compressor are disabled)
+    input              flush,        // flush output data (fill byte with 0, long word with 0
+    input              stb,        // input data strobe
+    input        [3:0] dl,            // [3:0] number of bits to send (0 - 16) ??
+    input       [15:0] d,            // [15:0] input data to shift (only lower bits are valid)
+// time stamping - will copy time at the end of color_first (later than the first hact after vact in the current frame, but before the next one
+// and before the data is needed for output)
+    input              color_first, //
+    input       [31:0] sec,    // [31:0] number of seconds
+    input       [19:0] usec,    // [19:0] number of microseconds
+    output             rdy,        // enable huffman encoder to proceed. Used as CE for many huffman encoder registers
+    output reg  [15:0] q,            // [15:0] output data
+    output reg         qv,        // output data valid
+    output             done,// reset by !en, goes high after some delay after flushing
+    output reg  [23:0] imgptr, // [23:0]image pointer in 32-byte chunks 
+    output reg         flushing
+`ifdef debug_stuffer
+,      output reg   [3:0] etrax_dma_r, // [3:0] just for testing
+       output reg   [3:0] test_cntr,
+       output reg   [7:0] test_cntr1
+`endif
+);
+
+`ifdef debug_stuffer
+    reg           en_d;
+`endif
+
+    reg    [23:1] stage1;         //    stage 1 register (after right-shifting input data by 0..7 - actually left by 7..0)
+    wire    [2:0]  shift1;        // shift amount for stage 1
+    reg     [4:0] stage1_bits;    // number of topmost invalid bits in stage1 register - 2 MSBs, use lower 3  stage2_bits
+    reg     [4:0] stage1_length;  // number of bits (1..16) in stage 1 register
+
+    wire          flush_end;
+    reg           stage1_full;
+    wire    [7:0] byteMask;
+    wire   [31:1] longMask;    
+    wire   [31:1] dflt_stage2;
+    wire   [ 2:0] sel;
+    wire   [ 1:0] st2m;
+    wire   [31:1] st2_d;
+    reg    [31:1] stage2;
+    reg    [ 4:0] stage2_bits;
+    wire          send8h;
+    wire          send8l;
+    wire          send8;
+    reg           flush_end_delayed;   // update: fixed delay some delay after flush_end to ensure combining with output FIFO empty
+    wire          pre_flush_end_delayed;    // some delay after flush_end to ensure combining with output FIFO empty
+    reg    [23:0] size_count; //(now will be byte count)
+     
+// to make it faster - split in parts
+    reg           inc_size_count2316;
+    reg    [ 2:0] size_out;
+    reg           size_out_over;// only needed with extra 32 bytes of zeroes added.
+    reg           busy_eob;     // flushing and sending length
+    reg           trailer;      // sending out data length and 32 bytes for ETRAX
+    reg           was_trailer;  // sending out data length and 32 bytes for ETRAX
+
+    reg    [ 3:0] etrax_dma;    // count words to make total size multiple of 32 bytes.
+                                // Last 4 bytes of data will have actual length in bytes
+                                // There will always be at least 4 more bytes (0-es) before length - needed for software
+    reg           will_flush;   // next dv will be flushing byte/word
+    wire          flush_now;
+    wire          start_sizeout; //delay by 2 cycles
+
+    reg           send8h_r;
+    reg           send8l_r;
+
+    wire          pre_stage2_bits_3;    // what will be registered to stage2_bits[3];
+    wire    [4:3] willbe_stage1_bits;
+    wire    [3:0] sum_lengths;
+    reg     [1:0] st2m_r;
+    
+    reg     [2:0] stb_time;
+    reg    [31:0] sec_r;
+    reg    [19:0] usec_r;
+    reg           time_out;
+    reg           time_size_out;
+    wire          start_time_out;
+    
+// stb_time[2] - single-cycle pulse after color_first goes low 
+    reg    [19:0] imgsz32; // current image size in multiples of 32-bytes
+    reg           inc_imgsz32;
+
+    always @ (negedge clk)  flushing <= en && !flush_end && ((flush && rdy) || flushing);
+    
+    wire    [4:0]    pre_stage1_bits;
+    assign pre_stage1_bits[4:0]={2'b00,stage1_bits[2:0]} +  {(dl[3:0]==4'b0),dl[3:0]};
+    
+    always @ (negedge clk)    begin // stage1_bits bookkeeping, updated on the falling clock edge
+        if (!en || flush_end) stage1_bits[4:0] <= 5'b0; // cleared while disabled or when flush_end is asserted
+        else if (stb && rdy) stage1_bits <= {(2'b10-pre_stage1_bits[4:3]),pre_stage1_bits[2:0]}; // on each accepted item: low 3 bits = (stage1_bits[2:0] + dl) mod 8, high 2 bits = 2'b10 - pre_stage1_bits[4:3]
+    end
+
+    assign shift1[2:0]= stage1_bits[2:0] + dl[2:0];
+    always @ (negedge clk) if (stb && rdy)    begin // stage 1 is loaded only when a strobed item is accepted (stb && rdy)
+        case (shift1[2:0]) // shift1 zero bits are placed above d[15:0], (7 - shift1) zero bits below it
+            0: stage1[23:1]    <= {     d[15:0],7'b0};
+            1: stage1[23:1]    <= {1'b0,d[15:0],6'b0};
+            2: stage1[23:1]    <= {2'b0,d[15:0],5'b0};
+            3: stage1[23:1]    <= {3'b0,d[15:0],4'b0};
+            4: stage1[23:1]    <= {4'b0,d[15:0],3'b0};
+            5: stage1[23:1]    <= {5'b0,d[15:0],2'b0};
+            6: stage1[23:1]    <= {6'b0,d[15:0],1'b0};
+            7: stage1[23:1]    <= {7'b0,d[15:0]     };
+         endcase
+        stage1_length[4:0]    <= {(dl[3:0]==4'b0),dl[3:0]}; // 5-bit length: dl==0 is stored as 5'h10 (16 bits)
+    end
+
+
+//*****************************
+    always @ (negedge clk) begin
+        if (!en) stage2_bits    <= 5'b0;
+        else if (send8) stage2_bits[4:0] <= stage2_bits[4:0] - 8;
+        else if (flushing && !stage1_full && !stage2_bits[4] && (stage2_bits[3:0]!=4'b0)) stage2_bits[4:0]<=5'h10;    // actual flushing to word size
+        else        stage2_bits[4:0]    <= (rdy && stage1_full)? {1'b0,stage2_bits[3:0]}+stage1_length[4:0]:{1'b0,stage2_bits[3:0]};
+    end
+
+    assign        sum_lengths=stage2_bits[3:0]+stage1_length[3:0];
+    assign pre_stage2_bits_3= en &&
+                          (send8? (~stage2_bits[3]): (
+                                  !(flushing && !stage1_full && !stage2_bits[4] && (stage2_bits[3:0]!=4'b0)) && // not flushing
+                                  ((rdy && stage1_full)?sum_lengths[3]:    stage2_bits[3] )
+                                  ));
+    assign willbe_stage1_bits[4:3]={2{en && !flush_end}} & ((stb && rdy)?(2'b10-pre_stage1_bits[4:3]):stage1_bits[4:3]);
+    
+
+// accelerating rdy calculation - making it a register
+    wire       pre_busy_eob=en && !flush_end_delayed && (busy_eob || (flush && rdy));
+    wire [4:3] pre_stage2_bits_4_interm1=stage2_bits[4:3]-2'h1;
+    wire [4:0] pre_stage2_bits_4_interm2={1'b0,stage2_bits[3:0]}+stage1_length[4:0];
+    wire       pre_stage2_bits_4=en && (send8?
+                                     (pre_stage2_bits_4_interm1[4]):
+                                     ((flushing && !stage1_full && !stage2_bits[4] && (stage2_bits[3:0]!=4'b0))?
+                                       (1'b1):
+                                       (((rdy && stage1_full))?
+                                         (pre_stage2_bits_4_interm2[4]):
+                                         (1'b0)
+                                       )
+                                     )
+                              );
+    wire pre_send8h_r= (( send8h_r  &&  stage2_bits[4])?
+                           (&stage2[23:16]):
+                           ((!send8l_r  || !stage2_bits[4])?
+                             (&((longMask[31:24] & st2_d[31:24]) | (~longMask[31:24] & dflt_stage2[31:24]))):
+                             (send8h_r)
+                           )
+                         );
+
+    wire pre_send8l_r= ((( send8h_r || send8l_r) &&  stage2_bits[4] )?
+                        (&stage2[15:8]):
+                        (&((longMask[23:16] & st2_d[23:16]) | (~longMask[23:16] & dflt_stage2[23:16])))
+                       );
+
+//Trying to delay rdy to make more room before it
+    reg           rdy_rega;
+    reg           rdy_regb;
+    reg           rdy_regc;
+    reg           rdy_regd;
+// s ynthesis attribute use_sync_set of {module_name|signal_name|instance_name} [is] no; 
+ 
+   always @ (negedge clk) begin
+        rdy_rega <= !pre_stage2_bits_4;
+        rdy_regb <= !pre_send8h_r;
+        rdy_regc <= !pre_send8l_r;
+        rdy_regd <= !pre_busy_eob;
+        busy_eob <= pre_busy_eob;
+//**********************************
+        send8h_r<=pre_send8h_r;
+        send8l_r<=pre_send8l_r;
+    end
+    assign rdy = (rdy_rega || (rdy_regb && rdy_regc)) && rdy_regd;
+    
+    assign send8h= send8h_r && stage2_bits[4];
+    assign send8l= send8l_r && stage2_bits[4];
+    assign send8=stage2_bits[4] && (send8h_r || send8l_r);
+
+    always    @ (negedge clk) begin // stage1_full flag, updated on the falling clock edge
+        if (!en) stage1_full <= 1'b0; // cleared while disabled
+/* TODO: Make sure it is OK !! 05/12/2010 */
+        else if (flushing) stage1_full <= 1'b0; // force flush does not turn off stb; in normal operation flushing comes after the last stb
+        else if (rdy) stage1_full <=stb; // when rdy, stage1_full follows stb; when !rdy it holds its value
+
+    end
+    assign    sel[2:0]=stage2_bits[2:0];
+    assign    byteMask[7:0]=    {!sel[2] && !sel[1] && !sel[0],
+                                 !sel[2] && !sel[1],
+                                 !sel[2] && (!sel[1] || !sel[0]),
+                                 !sel[2],
+                                 !sel[2] || (!sel[1] && !sel[0]),
+                                 !sel[2] || !sel[1],
+                                 !sel[2] || !sel[1] || !sel[0],
+                                 1'b1
+                                 };
+
+//TODO: Try to move stage1_full up here, this is the time-limiting path 05.26.2010
+    assign    longMask[31:1]={{8{(flushing || stage1_full) && !stage2_bits[3]}} & byteMask[7:0],
+                              {8{flushing || stage1_full}} & ({8{!stage2_bits[3]}} | byteMask[7:0]),
+                              {8{stage1_full}},
+                              {7{stage1_full}}};
+
+    always @ (negedge clk) st2m_r[1:0]<=willbe_stage1_bits[4:3]-{1'b0,pre_stage2_bits_3};
+    
+    assign    st2m[1:0]=st2m_r[1:0];
+    assign    st2_d[31:1]=    {{8{!flushing || stage1_full}} & (st2m[1]?{stage1[7:1],1'b0}:(st2m[0]? stage1[15:8]:     stage1[23:16])),
+                               {8{!flushing || stage1_full}} & (st2m[1]? stage1[23:16]:    (st2m[0]?{stage1[7:1],1'b0}:stage1[15: 8])),
+                               st2m[1]? stage1[15: 8]:    {stage1[7:1],1'b0},
+                               {stage1[7:1]}};
+    assign    dflt_stage2=stage2_bits[4]?{stage2[15:1],16'b0}:{stage2[31:1]};
+
+
+always @ (negedge clk) begin
+    if          (send8h) stage2[31:24] <= stage2[23:16];
+    else if (send8l) stage2[31:24] <= 8'h00;
+    else                  stage2[31:24] <= (longMask[31:24] & st2_d[31:24]) | (~longMask[31:24] & dflt_stage2[31:24]);
+    if          (send8)  stage2[23:16] <= stage2[15:8];
+    else                  stage2[23:16] <= (longMask[23:16] & st2_d[23:16]) | (~longMask[23:16] & dflt_stage2[23:16]);
+
+    if          (send8)  stage2[15: 8] <= {stage2[7:1],1'b0};
+    else                  stage2[15: 8] <= (longMask[15: 8] & st2_d[15: 8]) | (~longMask[15: 8] & dflt_stage2[15: 8]);
+
+    if          (send8)  stage2[7:  1] <= 7'b0;
+    else                  stage2[7:  1] <= (longMask[7: 1] & st2_d[7: 1]) | (~longMask[7: 1] & dflt_stage2[7: 1]);
+end
+
+// output stage
+    assign   flush_end= !stage2_bits[4] && flushing && !stage1_full && (stage2_bits[3:0]==4'b0);
+    assign flush_now= en && (!send8) && (flushing && !stage1_full && !stage2_bits[4]) && !will_flush;
+`ifdef debug_stuffer
+    reg [3:0] tst_done_dly;
+`endif
+
+    always @ (negedge clk) begin
+        stb_time[2:0] <= {stb_time[1] & ~stb_time[0], stb_time[0],color_first};
+      
+        if        (stb_time[2]) sec_r[31:0] <= sec[31:0];
+        else if (start_sizeout) sec_r[31:0] <= {8'hff, size_count[23:0]};
+        else if (time_size_out) sec_r[31:0] <= {usec_r[15:0],sec_r[31:16]};
+        if   (stb_time[2]) usec_r[19:0] <= usec[19:0];
+        else if (time_out) usec_r[19:0] <= {16'h0,usec_r[19:16]};
+  
+ //reset_data_counters; // reset data transfer counters (only when DMA and compressor are disabled)
+ 
+        if (reset_data_counters ) etrax_dma[3:0] <= 0; // not needed to be reset after frame, and that was wrong (to early)
+        else if (qv) etrax_dma[3:0] <= etrax_dma[3:0] + 1;
+
+// just for testing
+`ifdef debug_stuffer
+        en_d<= en;
+        if (en) etrax_dma_r[3:0] <= etrax_dma[3:0];
+        if    (done) test_cntr1[7:0] <= 0;
+        else if (qv) test_cntr1[7:0] <= test_cntr1[7:0] +1 ; // normally should be one (done 1 ahead of end of qv)
+        tst_done_dly[3:0] <= {tst_done_dly[2:0],done};
+        if (tst_done_dly[1]) test_cntr[3:0] <= 0;
+        else if (qv)         test_cntr[3:0] <= test_cntr[3:0] +1 ;
+`endif
+ 
+
+        size_out_over <= en && (size_out_over?(!done):size_out[0]);
+  
+        size_out[2:0]<={size_out[1:0],start_sizeout};
+        time_out <= en && (start_time_out || (time_out && !(etrax_dma[3:2]== 2'h3)));
+        time_size_out <= en && (start_time_out || (time_size_out && !(etrax_dma[3:1]== 3'h7)));
+  
+        trailer <= en && (trailer?(!flush_end_delayed):(flush_end));
+        was_trailer<=trailer; 
+        will_flush <= en && (will_flush?(!qv):(flush_now && (stage2_bits[3:0]!=4'b0)));
+        if (flush_now) size_count[0] <= stage2_bits[3] ^ (|stage2_bits[2:0]); // odd number of bytes
+        if (!en || size_out[2]) size_count[15:1] <= 0;
+        else if (!trailer && !was_trailer && qv && (!will_flush || !size_count[0]))  size_count[15:1] <= size_count[15:1]+1;
+        inc_size_count2316 <= (!trailer && !was_trailer && qv && (!will_flush || !size_count[0])) && (&size_count[15:1]);
+//reset_data_counters instead of !en here?
+        if      (!en || size_out[2]) size_count[23:16] <= 0;
+        else if (inc_size_count2316) size_count[23:16] <= size_count[23:16]+1;
+
+        qv <= en && (stage2_bits[4] || trailer);
+// to make it faster (if needed) use a single register as a source for  q[15:0] in two following lines
+        if      (time_size_out)  q[15:0] <= {sec_r[7:0],sec_r[15:8]};
+        else                     q[15:0] <= {(stage2_bits[4]?stage2[31:24]:8'b0),
+                                             ((stage2_bits[4] && !send8h)? stage2[23:16]:8'b0)};
+        inc_imgsz32 <= (etrax_dma[3:0]== 4'h0) && qv;
+//reset_data_counters instead of !en here?
+        if (reset_data_counters || done) imgsz32[19:0] <= 0;
+        else if (inc_imgsz32) imgsz32[19:0]<=imgsz32[19:0]+1;
+
+        if (reset_data_counters) imgptr[23:0] <= 0;
+        else if (done) imgptr[23:0] <= imgptr[23:0]+ imgsz32[19:0];
+        
+        flush_end_delayed <= en & pre_flush_end_delayed; // en just to prevent optimizing pre_flush_end_delayed+flush_end_delayed into a single SRL16
+    end
+//start_sizeout
+    assign start_time_out= qv && trailer && (etrax_dma[3:0]== 4'h8) && !size_out_over;
+    assign start_sizeout= time_out && (etrax_dma[3:0]== 4'hc);
+// SRL16_1 i_pre_flush_end_delayed (.D(size_out[1]),.Q(pre_flush_end_delayed), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk)); // dly=3+1    // rather arbitrary?
+    dly_16 #(.WIDTH(1)) i_pre_flush_end_delayed(.clk(~clk),.rst(1'b0), .dly(14), .din(size_out[1]), .dout(pre_flush_end_delayed));    // dly=14+1 // rather arbitrary?
+    assign done = flush_end_delayed;
+
+endmodule
--- a/compressor_jp/varlen_encode393.v
+++ b/compressor_jp/varlen_encode393.v
+/*
+** -----------------------------------------------------------------------------**
+** varlen_encode393.v
+**
+** Part of the Huffman encoder for JPEG compressor - variable length encoder
+**
+** Copyright (C) 2002-2015 Elphel, Inc
+**
+** -----------------------------------------------------------------------------**
+**  varlen_encode393.v is free software - hardware description language (HDL) code.
+** 
+**  This program is free software: you can redistribute it and/or modify
+**  it under the terms of the GNU General Public License as published by
+**  the Free Software Foundation, either version 3 of the License, or
+**  (at your option) any later version.
+**
+**  This program is distributed in the hope that it will be useful,
+**  but WITHOUT ANY WARRANTY; without even the implied warranty of
+**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**  GNU General Public License for more details.
+**
+**  You should have received a copy of the GNU General Public License
+**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+** -----------------------------------------------------------------------------**
+**
+*/
+//used the other edge of the clk2x
+
+// Encoder will work 2 cycles/"normal" word, 1 cycle for codes "00" and "f0";
+// only the magnitude output is needed ASAP (2 cycles), the value out should be
+// valid on the 5-th cycle - it will have a latency of 4 cycles, running every other cycle.
+// I'll make a shortcut - all codes are processed in 2 cycles.
+
+module    varlen_encode393 (
+    input             clk,       // twice frequency - all registers here update on the falling edge
+    input              en,       // register enable: when 0 every register holds its state
+    input              start, // loads a new input word (not faster than every other cycle)
+    input       [11:0] d,        // 12-bit signed (two's complement) input value
+    output reg   [3:0] l,        // code length: significant bits in the 11-bit magnitude (0 when it is zero)
+    output reg   [3:0] l_late,// copy of l, registered together with q
+    output reg  [10:0] q);    // code bits: magnitude if d[11]==0, bitwise-inverted magnitude if d[11]==1
+/*
+    Instantiation template:
+
+    varlen_encode393 i_varlen_encode(.clk(clk),
+                                        .en(stuffer_was_rdy),      // enables registers, 0 - freeze
+                                        .start(steps[0]),
+                                        .d(sval[11:0]),            // 12-bit signed
+                                        .l(var_dl[ 3:0]),          // [3:0] code length
+                                        .l_late(var_dl_late[3:0]),
+                                        .q(var_do[10:0]));         // [10:0] code
+
+    Timing (falling edges of clk, en high):
+      edge 0 - start is sampled, sign and magnitude of d are captured;
+      edge 1 - l and the pre-output code register are loaded;
+      edge 3 - q and l_late are loaded.
+*/
+    reg    [11:0] sign_mag;   // {sign, 11-bit magnitude} captured at start
+    reg    [10:0] code_pre;   // code bits waiting to be transferred to q
+    reg     [2:0] phase;      // start pulse delayed by 1, 2 and 3 falling edges
+
+    // Magnitude of the input: low 11 bits negated (modulo 2**11) when the sign bit is set
+    wire   [10:0] d_abs = d[11] ? (~d[10:0] + 11'd1) : d[10:0];
+
+    // Index of the most significant set bit of the magnitude; 4'hf when the
+    // magnitude is zero, so that adding 1 below wraps the length to 0
+    wire    [3:0] msb_pos = sign_mag[10] ? 4'd10 :
+                            sign_mag[ 9] ? 4'd9  :
+                            sign_mag[ 8] ? 4'd8  :
+                            sign_mag[ 7] ? 4'd7  :
+                            sign_mag[ 6] ? 4'd6  :
+                            sign_mag[ 5] ? 4'd5  :
+                            sign_mag[ 4] ? 4'd4  :
+                            sign_mag[ 3] ? 4'd3  :
+                            sign_mag[ 2] ? 4'd2  :
+                            sign_mag[ 1] ? 4'd1  :
+                            sign_mag[ 0] ? 4'd0  : 4'hf;
+
+    // All state lives in one clocked block; the nonblocking assignments keep the
+    // stages independent of the order in which they are written here
+    always @ (negedge clk) if (en) begin
+        // shift register tracking the progress of the word being encoded
+        phase <= {phase[1:0], start};
+
+        // stage 0: capture sign and magnitude of the new word
+        if (start) begin
+            sign_mag <= {d[11], d_abs};
+        end
+
+        // stage 1: length is needed as early as possible; code bits are the
+        // magnitude XOR-ed with the sign (inverted for negative inputs)
+        if (phase[0]) begin
+            code_pre <= sign_mag[10:0] ^ {11{sign_mag[11]}};
+            l        <= msb_pos + 4'd1;
+        end
+
+        // stage 3: present the code together with a matching copy of its length
+        if (phase[2]) begin
+            q      <= code_pre;
+            l_late <= l;
+        end
+    end
+
+endmodule