working on replacement for 8x8 DCT

4e94f6ca · Andrey Filippov · 116c9ce2 · 4e94f6ca · 4e94f6ca · 4e94f6ca
Commit 4e94f6ca authored Jun 09, 2016 by Andrey Filippov
7 changed files
--- a/dsp/dct1d_chen.v
+++ b/dsp/dct1d_chen.v
+/*******************************************************************************
+ * <b>Module:</b>dct1d_chen
+ * @file dct1d_chen.v
+ * @date:2016-06-05  
+ * @author: Andrey Filippov
+ *     
+ * @brief: 1d 8-point DCT based on Chen algorithm
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ *dct1d_chen.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ *  dct1d_chen.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// dct1d_chen: 1-D 8-point DCT (Chen algorithm) built from three SIMD adder/subtracter
+// blocks (dsp_addsub_simd x2, dsp_addsub_reg2_simd) and two multiply-accumulate blocks
+// (dsp_ma_preadd, dsp_ma). An 8-bit shift register 'phase' sequences the data path:
+// 'start' injects a bit that is shifted one position per clock and recirculated from
+// phase[7] while 'en' is high.
+//
+// Inputs arrive as four consecutive pairs on d10_32_76_54; results leave on 'dout' in the
+// scrambled order listed next to pre2_start_out.
+module  dct1d_chen#(
+    parameter WIDTH = 24,
+    parameter OUT_WIDTH = 24,
+    parameter B_WIDTH = 18,
+    parameter A_WIDTH = 25,
+    parameter P_WIDTH = 48,
+    parameter M_WIDTH = 43, // actual multiplier width (== A_WIDTH + B_WIDTH)
+    // Cosine coefficients scaled by 2^17 so that they fit the positive range of the
+    // 18-bit signed B-operand
+    parameter COS_1_16 = 128553, // (1<<17) * cos(1*pi/16)
+    parameter COS_2_16 = 121095, // (1<<17) * cos(2*pi/16)
+    parameter COS_3_16 = 108982, // (1<<17) * cos(3*pi/16)
+    parameter COS_4_16 =  92682, // (1<<17) * cos(4*pi/16)
+    parameter COS_5_16 =  72820, // (1<<17) * cos(5*pi/16)
+    parameter COS_6_16 =  50159, // (1<<17) * cos(6*pi/16)
+    parameter COS_7_16 =  25570  // (1<<17) * cos(7*pi/16)
+)(
+    input                          clk,
+    input                          rst,
+    input                          en,
+    input  [2 * WIDTH -1:0]        d10_32_76_54, // Concatenated input data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
+    input                          start,      // {x[1],x[0]} available next after start,  {x[3],x[2]} - second next, then {x[7],x[6]} and {x[5],x[4]}
+    // NOTE(review): port is WIDTH wide while the register driving it (dout_r) is OUT_WIDTH
+    // wide - identical with default parameters, confirm intent before overriding either one.
+    output [WIDTH -1:0]            dout,
+    output                         pre2_start_out // 2 clock cycle before F4 output, full dout sequence
+                                             // start_out-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
+);
+    // B-operand (cosine coefficient) shared by both multiplier blocks
+    reg    signed [B_WIDTH-1:0] dsp_ma_bin;
+    // ---- dsp_ma1_i (multiplier with pre-adder) controls/data ----
+    wire                        dsp_ma_ceb1_1;     // load b1 register
+    wire                        dsp_ma_ceb2_1;     // load b2 register
+    wire                        dsp_ma_selb_1;     // 0 - select b1, 1 - select b2
+    wire   signed [A_WIDTH-1:0] dsp_ma_ain_1;
+    wire                        dsp_ma_cea1_1;
+    wire                        dsp_ma_cea2_1;
+    wire   signed [A_WIDTH-1:0] dsp_ma_din_1;
+    wire                        dsp_ma_ced_1;
+    wire                        dsp_ma_sela_1;
+    wire                        dsp_ma_en_a_1;      // 0: +/- D, 1: A or A +/- D
+    wire                        dsp_ma_en_d_1;      // 0: A, 1: D  or A +/- D
+    wire                        dsp_ma_sub_d_1;     // 1 when  - D, 0 - all other
+    wire                        dsp_ma_neg_m_1;    // 1 - negate multiplier result
+    wire                        dsp_ma_accum_1;    // 0 - use multiplier result, 1 add to accumulator
+    wire   signed [P_WIDTH-1:0] dsp_ma_p_1;
+    // ---- dsp_ma2_i (multiplier, A or D selectable) controls/data ----
+    wire                        dsp_ma_ceb1_2;     // load b1 register
+    wire                        dsp_ma_ceb2_2;     // load b2 register
+    wire                        dsp_ma_selb_2;     // 0 - select b1, 1 - select b2
+    wire   signed [A_WIDTH-1:0] dsp_ma_ain_2;
+    wire                        dsp_ma_cea1_2;
+    wire                        dsp_ma_cea2_2;
+    wire   signed [A_WIDTH-1:0] dsp_ma_din_2;
+    wire                        dsp_ma_ced_2;
+    wire                        dsp_ma_sela_2;     // 0 - select a1, 1 - select a2
+    wire                        dsp_ma_seld_2;     // 0 - select a1/a2, 1 - select d
+    wire                        dsp_ma_neg_m_2;    // 1 - negate multiplier result
+    wire                        dsp_ma_accum_2;    // 0 - use multiplier result, 1 add to accumulator
+    wire   signed [P_WIDTH-1:0] dsp_ma_p_2;
+    // Multiplier A/D inputs before shift
+    wire   signed [WIDTH-1:0] dsp_ma_ain24_1;
+    wire   signed [WIDTH-1:0] dsp_ma_din24_1;
+    wire   signed [WIDTH-1:0] dsp_ma_ain24_2;
+    wire   signed [WIDTH-1:0] dsp_ma_din24_2;
+    // SIMD adder operands (a*, b*) and registered results (p*); lanes 0/1 belong to
+    // dsp_addsub_simd1_i, 2/3 to dsp_addsub_simd2_i, 4/5 to dsp_addsub_reg2_simd_i
+    wire   signed   [WIDTH-1:0] simd_a0;
+    wire   signed   [WIDTH-1:0] simd_a1;
+    wire   signed   [WIDTH-1:0] simd_a2;
+    wire   signed   [WIDTH-1:0] simd_a3;
+    wire   signed   [WIDTH-1:0] simd_a4;
+    wire   signed   [WIDTH-1:0] simd_a5;
+    wire   signed   [WIDTH-1:0] simd_b0;
+    wire   signed   [WIDTH-1:0] simd_b1;
+    wire   signed   [WIDTH-1:0] simd_b2;
+    wire   signed   [WIDTH-1:0] simd_b3;
+    wire   signed   [WIDTH-1:0] simd_b4;
+    wire   signed   [WIDTH-1:0] simd_b5;
+    wire   signed   [WIDTH-1:0] simd_p0;
+    wire   signed   [WIDTH-1:0] simd_p1;
+    wire   signed   [WIDTH-1:0] simd_p2;
+    wire   signed   [WIDTH-1:0] simd_p3;
+    wire   signed   [WIDTH-1:0] simd_p4;
+    wire   signed   [WIDTH-1:0] simd_p5;
+    wire                        simd_cea01;
+    wire                        simd_cea23;
+    wire                        simd_ceaf45; // first stage A registers CE
+    wire                        simd_ceas45; // second stage A registers CE
+    wire                        simd_ceb01;
+    wire                        simd_ceb23;
+    wire                        simd_ceb45;  // B registers CE
+    wire                        simd_sub01;
+    wire                        simd_sub23;
+    wire                        simd_sub45;
+    wire                        simd_cep01;
+    wire                        simd_cep23;
+    wire                        simd_cep45;
+    reg                   [7:0] phase;     // one-hot style sequencer, shifted left every clock
+    // NOTE(review): phase_cnt is 4 bits wide but the coefficient table below only covers 0..7;
+    // for values 8..15 dsp_ma_bin keeps its previous value. Confirm whether 3 bits were intended.
+    reg                   [3:0] phase_cnt;
+    reg        [OUT_WIDTH -1:0] dout_r;
+    wire       [OUT_WIDTH -1:0] dout1_w;
+    wire       [OUT_WIDTH -1:0] dout2_w;
+//        .ain      ({simd_a1,simd_a0}), // input[47:0]
+//        .bin      ({simd_b1,simd_b0}), // input[47:0]
+    // dsp_addsub_simd1_i input connections
+    assign  simd_a0 = phase[0]? d10_32_76_54[0 * WIDTH +: WIDTH] : simd_p0; // only phase[0] & phase[4], other phases - don't care
+    assign  simd_a1 = phase[0]? d10_32_76_54[1 * WIDTH +: WIDTH] : simd_p1; // only phase[0] & phase[4], other phases - don't care
+    assign  simd_b0 = phase[2]? d10_32_76_54[0 * WIDTH +: WIDTH] : simd_p3; // only phase[2] & phase[5], other phases - don't care
+    assign  simd_b1 = phase[2]? d10_32_76_54[1 * WIDTH +: WIDTH] : simd_p2; // only phase[2] & phase[5], other phases - don't care
+    assign simd_cea01 =  phase[0] | phase[4];
+    assign simd_ceb01 =  phase[2] | phase[5];
+    assign simd_sub01 = phase[3] | phase[6];
+    assign simd_cep01 = phase[2] | phase[3] | phase[5] | phase[6];
+    // dsp_addsub_simd2_i input connections
+    assign  simd_a2 = phase[1]? d10_32_76_54[0 * WIDTH +: WIDTH] : simd_p0; // only phase[1] & phase[7], other phases - don't care
+    assign  simd_a3 =           d10_32_76_54[1 * WIDTH +: WIDTH];           // only phase[1],            other phases - don't care
+    assign  simd_b2 = phase[3]? d10_32_76_54[0 * WIDTH +: WIDTH] : simd_p1; // only phase[3] & phase[7], other phases - don't care
+    assign  simd_b3 =           d10_32_76_54[1 * WIDTH +: WIDTH];           // only phase[3],            other phases - don't care
+    assign simd_cea23 =  phase[1] | phase[7];
+    assign simd_ceb23 =  phase[3] | phase[7];
+    assign simd_sub23 = phase[4] | phase[7];
+    assign simd_cep23 = phase[0] | phase[3] | phase[4] | phase[7];
+    // dsp_addsub_reg2_simd_i input connections
+    assign  simd_a4 = simd_p3; // only at phase[6], other phases - don't care
+    assign  simd_a5 = simd_p0; // only at phase[6], other phases - don't care
+    // Both B lanes take the same slice of the first multiplier output
+    assign  simd_b4 = dsp_ma_p_1[M_WIDTH-1 -: WIDTH]; // only at phase[6], other phases - don't care. TODO: add symmetric rounding here?
+    assign  simd_b5 = dsp_ma_p_1[M_WIDTH-1 -: WIDTH]; // only at phase[2], other phases - don't care. TODO: add symmetric rounding here?
+    assign simd_ceaf45 = phase[6];
+    assign simd_ceas45 = phase[2];
+    assign simd_ceb45 =  phase[2] | phase[4];
+    assign simd_sub45 = phase[2] | phase[4];
+    assign simd_cep45 = phase[2] | phase[3] | phase[4] | phase[5];
+    // dsp_ma1_i control connections
+    assign dsp_ma_ceb1_1 =  phase[3] | phase[7];
+    assign dsp_ma_ceb2_1 =  phase[0];
+    assign dsp_ma_selb_1 =  phase[3] | phase[6];
+    assign dsp_ma_cea1_1 =  phase[2] | phase[6];
+    assign dsp_ma_cea2_1 =  phase[1] | phase[3];
+    assign dsp_ma_ced_1 =   phase[2] | phase[6];
+    assign dsp_ma_sela_1 =  phase[1] | phase[7];
+    assign dsp_ma_en_a_1 =  !(phase[2] | phase[4]);
+    assign dsp_ma_en_d_1 =  phase[0] | phase[2] | phase[4] | phase[6];
+    assign dsp_ma_sub_d_1 = phase[0];
+    assign dsp_ma_neg_m_1 = phase[6];
+    assign dsp_ma_accum_1 = phase[5] | phase[7];
+    // dsp_ma1_i data input connections
+/*  assign dsp_ma_ain24_1 = ({WIDTH{phase[6]}} & simd_p1) |
+                            ({WIDTH{phase[1]}} & simd_p2) |
+                            ({WIDTH{phase[2]}} & simd_p0) |
+                            ({WIDTH{phase[3]}} & simd_p2) ; // Other - don't care */
+    assign dsp_ma_ain24_1 = phase[6] ? simd_p1 : (phase[2] ? simd_p0 : simd_p2);
+    assign dsp_ma_din24_1 = phase[6] ? simd_p2 :  simd_p1;
+    // dsp_ma2_i control connections
+    assign dsp_ma_ceb1_2 = phase[1] | phase[6];
+    assign dsp_ma_ceb2_2 = phase[2] | phase[5];
+    assign dsp_ma_selb_2 = phase[1] | phase[3] | phase[5] | phase[7];
+    assign dsp_ma_cea1_2 = phase[5];
+    assign dsp_ma_cea2_2 = phase[4];
+    assign dsp_ma_ced_2 =  phase[1] | phase[6];
+    assign dsp_ma_sela_2 =  phase[1] | phase[6];
+    assign dsp_ma_seld_2 =  phase[0] | phase[3] | phase[4] | phase[7];
+    assign dsp_ma_neg_m_2 = phase[6];
+    assign dsp_ma_accum_2 = phase[0] | phase[2] | phase[4] | phase[6];
+    // dsp_ma2_i data input connections
+    assign dsp_ma_ain24_2 = simd_p5;
+    assign dsp_ma_din24_2 = simd_p4;
+    // (dsp_ma_din24_1 is driven once, in the dsp_ma1_i section - the second, identical
+    //  continuous assignment that used to be here was a redundant driver and is removed)
+// Shift adder outputs to the MSB of the multiplier inputs
+    assign dsp_ma_ain_1 = {dsp_ma_ain24_1, {A_WIDTH-WIDTH{1'b0}}};
+    assign dsp_ma_din_1 = {dsp_ma_din24_1, {A_WIDTH-WIDTH{1'b0}}};
+    assign dsp_ma_ain_2 = {dsp_ma_ain24_2, {A_WIDTH-WIDTH{1'b0}}};
+    assign dsp_ma_din_2 = {dsp_ma_din24_2, {A_WIDTH-WIDTH{1'b0}}};
+// Shift DSP outputs to match output results
+    assign  dout1_w = dsp_ma_p_1[M_WIDTH -: WIDTH]; // adding one bit for adder (two MPY outputs are added)
+    assign  dout2_w = dsp_ma_p_2[M_WIDTH -: WIDTH]; // adding one bit for adder (two MPY outputs are added)
+    assign dout = dout_r;
+    always @ (posedge clk) begin
+        // Sequencer: cleared by reset so that no unknown value can circulate through the
+        // phase[7] feedback; otherwise shifted, re-armed by 'start' or by its own tail.
+        if (rst) phase <= 0;
+        else     phase <= {phase[6:0], en & (start | phase[7])};
+        // Coefficient/output-select counter: reset is active high like everywhere else
+        // in this design (was '!rst', which held the counter at zero during operation).
+        if      (rst || start)           phase_cnt <= 0;
+        else if (en || (phase_cnt != 7)) phase_cnt <= phase_cnt + 1;
+        // Cosine table, defined to fit into 17 bits for 18-bit signed DSP B-operand
+        case (phase_cnt)
+            4'h0: dsp_ma_bin <= COS_1_16;
+            4'h1: dsp_ma_bin <= COS_7_16;
+            4'h2: dsp_ma_bin <= COS_2_16;
+            4'h3: dsp_ma_bin <= COS_2_16;
+            4'h4: dsp_ma_bin <= COS_3_16;
+            4'h5: dsp_ma_bin <= COS_5_16;
+            4'h6: dsp_ma_bin <= COS_4_16;
+            4'h7: dsp_ma_bin <= COS_6_16;
+        endcase
+        // Outputs of the two multiplier blocks are interleaved on alternate clocks
+        dout_r <= phase_cnt[0] ? dout1_w : dout2_w;
+    end
+    dsp_addsub_simd #(
+        .NUM_DATA (2),
+        .WIDTH    (WIDTH)
+    ) dsp_addsub_simd1_i (
+        .clk      (clk),               // input
+        .rst      (rst),               // input
+        .ain      ({simd_a1,simd_a0}), // input[47:0]
+        .bin      ({simd_b1,simd_b0}), // input[47:0]
+        .cea      (simd_cea01),        // input
+        .ceb      (simd_ceb01),        // input
+        .subtract (simd_sub01),        // input
+        .cep      (simd_cep01),        // input
+        .pout     ({simd_p1,simd_p0})  // output[47:0]
+    );
+    dsp_addsub_simd #(
+        .NUM_DATA (2),
+        .WIDTH    (WIDTH)
+    ) dsp_addsub_simd2_i (
+        .clk      (clk),               // input
+        .rst      (rst),               // input
+        .ain      ({simd_a3,simd_a2}), // input[47:0]
+        .bin      ({simd_b3,simd_b2}), // input[47:0]
+        .cea      (simd_cea23),        // input
+        .ceb      (simd_ceb23),        // input
+        .subtract (simd_sub23),        // input
+        .cep      (simd_cep23),        // input
+        .pout     ({simd_p3,simd_p2})  // output[47:0]
+    );
+    // Width follows the module parameter (was hard-coded 24) so all three adders agree
+    dsp_addsub_reg2_simd #(
+        .NUM_DATA (2),
+        .WIDTH    (WIDTH)
+    ) dsp_addsub_reg2_simd_i (
+        .clk      (clk),               // input
+        .rst      (rst),               // input
+        .ain      ({simd_a5,simd_a4}), // input[47:0]
+        .bin      ({simd_b5,simd_b4}), // input[47:0]
+        .cea1     (simd_ceaf45),       // input
+        .cea2     (simd_ceas45),       // input
+        .ceb      (simd_ceb45),        // input
+        .subtract (simd_sub45),        // input
+        .cep      (simd_cep45),        // input
+        .pout     ({simd_p5,simd_p4})  // output[47:0]
+    );
+    // Operand widths follow the module parameters (were hard-coded 18/25/48), same as dsp_ma2_i
+    dsp_ma_preadd #(
+        .B_WIDTH(B_WIDTH),
+        .A_WIDTH(A_WIDTH),
+        .P_WIDTH(P_WIDTH)
+    ) dsp_ma1_i (
+        .clk   (clk),            // input
+        .rst   (rst),            // input
+        .bin   (dsp_ma_bin),     // input[17:0] signed
+        .ceb1  (dsp_ma_ceb1_1),  // input
+        .ceb2  (dsp_ma_ceb2_1),  // input
+        .selb  (dsp_ma_selb_1),  // input
+        .ain   (dsp_ma_ain_1),   // input[24:0] signed
+        .cea1  (dsp_ma_cea1_1),  // input
+        .cea2  (dsp_ma_cea2_1),  // input
+        .din   (dsp_ma_din_1),   // input[24:0] signed
+        .ced   (dsp_ma_ced_1),   // input
+        .cead  (1'b1),           // input
+        .sela  (dsp_ma_sela_1),  // input
+        .en_a  (dsp_ma_en_a_1),  // input
+        .en_d  (dsp_ma_en_d_1),  // input
+        .sub_d (dsp_ma_sub_d_1), // input
+        .neg_m (dsp_ma_neg_m_1), // input
+        .accum (dsp_ma_accum_1), // input
+        .pout  (dsp_ma_p_1)      // output[47:0] signed
+    );
+    dsp_ma #(
+        .B_WIDTH(B_WIDTH),
+        .A_WIDTH(A_WIDTH),
+        .P_WIDTH(P_WIDTH)
+    ) dsp_ma2_i (
+        .clk   (clk),            // input
+        .rst   (rst),            // input
+        .bin   (dsp_ma_bin),     // input[17:0] signed
+        .ceb1  (dsp_ma_ceb1_2),  // input
+        .ceb2  (dsp_ma_ceb2_2),  // input
+        .selb  (dsp_ma_selb_2),  // input
+        .ain   (dsp_ma_ain_2),   // input[24:0] signed
+        .cea1  (dsp_ma_cea1_2),  // input
+        .cea2  (dsp_ma_cea2_2),  // input
+        .din   (dsp_ma_din_2),   // input[24:0] signed
+        .ced   (dsp_ma_ced_2),   // input
+        .sela  (dsp_ma_sela_2),  // input
+        .seld  (dsp_ma_seld_2),  // input
+        .neg_m (dsp_ma_neg_m_2), // input
+        .accum (dsp_ma_accum_2), // input
+        .pout  (dsp_ma_p_2)      // output[47:0] signed
+    );
+    // Output strobe: phase[7] passed through the dly01_16 delay line with dly = 4
+    dly01_16 dly01_16_i (
+        .clk   (clk),           // input
+        .rst   (rst),           // input
+        .dly   (4'h4),          // input[3:0]
+        .din   (phase[7]),      // input
+        .dout  (pre2_start_out) // output
+    );
+endmodule
--- a/dsp/dct1d_chen_reorder_in.v
+++ b/dsp/dct1d_chen_reorder_in.v
+/*******************************************************************************
+ * <b>Module:</b>dct1d_chen_reorder_in
+ * @file dct1d_chen_reorder_in.v
+ * @date:2016-06-08  
+ * @author: Andrey Filippov
+ *     
+ * @brief: Reorder scan-line pixel stream for dct1d_chen module
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ *dct1d_chen_reorder_in.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ *  dct1d_chen_reorder_in.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// dct1d_chen_reorder_in: collects a stream of 8 samples (one per clock, first one marked
+// by 'start') into two 4-entry buffers and reads them back as four {high,low} pairs.
+// Each register has its own always block; all updates remain synchronous to 'clk'.
+module  dct1d_chen_reorder_in#(
+    parameter WIDTH = 24
+ )(
+    input                  clk,
+    input                  rst,
+    input                  en,  // to be sampled when start is expected (start time slot)
+    input  [WIDTH -1:0]    din,
+    input                  start, // with first pixel
+    output [2*WIDTH -1:0]  dout_10_32_76_54, // Concatenated/reordered output data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
+    output reg             start_out,
+    output reg             en_out // to be sampled when start_out is expected
+);
+    reg                    last_r;              // cntr_in was 7 on the previous clock
+    reg              [2:0] cntr_in;             // index of the incoming sample
+    reg              [1:0] raddr;               // read address, parks at 3
+    reg        [WIDTH-1:0] bufl_ram[0:3];       // low halves of the output pairs
+    reg        [WIDTH-1:0] bufh_ram[0:3];       // high halves of the output pairs
+    reg     [2*WIDTH -1:0] dout_10_32_76_54_r;
+
+    wire                   cntr_nz;             // input counter is running
+    wire                   sel_low;             // incoming sample goes to the low buffer
+    wire                   rd_restart;          // time to start reading a new block
+    wire                   restart;
+    wire             [1:0] we;
+    wire             [1:0] waddr;
+
+    assign cntr_nz    = |cntr_in;
+    assign sel_low    = cntr_in[0] ^ cntr_in[2];
+    assign rd_restart = (cntr_in == 5);
+    // a new block begins on an enabled 'start', or back-to-back right after the previous one
+    assign restart    = !rst && en && (start || last_r);
+    // exactly one of the two buffers is written for every accepted sample
+    assign we         = (cntr_nz || en) ? {~sel_low, sel_low} : 2'b0;
+    assign waddr      = {cntr_in[2], cntr_in[2] ^ cntr_in[1]};
+    assign dout_10_32_76_54 = dout_10_32_76_54_r;
+
+    // end-of-block marker
+    always @(posedge clk) begin
+        if (rst) begin
+            last_r <= 0;
+        end else begin
+            last_r <= &cntr_in;
+        end
+    end
+
+    // input sample counter: leaves zero on restart, then free-runs until it wraps
+    always @(posedge clk) begin
+        if (rst) begin
+            cntr_in <= 0;
+        end else if (restart || cntr_nz) begin
+            cntr_in <= cntr_in + 1;
+        end
+    end
+
+    // buffer writes
+    always @(posedge clk) begin
+        if (we[0]) begin
+            bufl_ram[waddr] <= din;
+        end
+    end
+
+    always @(posedge clk) begin
+        if (we[1]) begin
+            bufh_ram[waddr] <= din;
+        end
+    end
+
+    // read address: rewound when the input counter is 5, then counts up and stops at 3
+    always @(posedge clk) begin
+        if (rst) begin
+            raddr <= ~0;
+        end else if (rd_restart) begin
+            raddr <= 0;
+        end else if (~&raddr) begin
+            raddr <= raddr + 1;
+        end
+    end
+
+    // registered outputs
+    always @(posedge clk) begin
+        dout_10_32_76_54_r <= {bufh_ram[raddr], bufl_ram[raddr]};
+        start_out          <= rd_restart;
+        en_out             <= en || cntr_nz || last_r;
+    end
+endmodule
--- a/dsp/dct1d_chen_reorder_out.v
+++ b/dsp/dct1d_chen_reorder_out.v
+/*******************************************************************************
+ * <b>Module:</b>dct1d_chen_reorder_out
+ * @file dct1d_chen_reorder_out.v
+ * @date:2016-06-08  
+ * @author: Andrey Filippov
+ *     
+ * @brief: Reorder data from dct1d_chen output to natural sequence
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ *dct1d_chen_reorder_out.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ *  dct1d_chen_reorder_out.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// dct1d_chen_reorder_out: stores the scrambled dct1d_chen result stream in a 16-entry
+// buffer (two 8-entry pages selected by address bit 3) using a small address table,
+// then reads it back sequentially.
+// Each register has its own always block; all updates remain synchronous to 'clk'.
+module  dct1d_chen_reorder_out#(
+    parameter WIDTH = 24
+ )(
+    input                  clk,
+    input                  rst,
+    input  [WIDTH -1:0]    din,       // pre2_start-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
+    input                  pre2_start,     // Two cycles ahead of F4
+    output   [WIDTH -1:0]  dout,      // data in natural order: F0-F1-F2-F3-F4-F5-F6-F7
+    output                 start_out, // 1 ahead of F0
+    output reg             en_out // to be sampled when start_out is expected
+);
+    reg [WIDTH -1:0] reord_buf_ram[0:15];
+    reg [WIDTH -1:0] dout_r;
+    reg        [3:0] cntr_in;   // bit 3 - page, bits 2:0 - position in the input sequence
+    reg        [3:0] ina_rom;   // table output: {page toggle, destination index}
+    reg        [3:0] raddr;
+    reg              we_r;      // buffer write window
+    reg              start_12;
+    wire             start_8;   // pre2_start through dly01_16, dly = 7
+    wire             start_11;  // start_8 through dly01_16, dly = 1
+    wire             stop_in;   // closes the write window when no new block follows
+    wire       [3:0] waddr;
+    wire             rd_busy;   // read pointer is inside a block
+
+    assign waddr     = {ina_rom[3] ^ cntr_in[3], ina_rom[2:0]};
+    assign rd_busy   = |raddr[2:0];
+    assign dout      = dout_r;
+    assign start_out = start_12;
+
+    // write window: opened by pre2_start, closed by stop_in
+    always @(posedge clk) begin
+        if (rst) begin
+            we_r <= 0;
+        end else if (pre2_start) begin
+            we_r <= 1;
+        end else if (stop_in) begin
+            we_r <= 0;
+        end
+    end
+
+    // input counter: pre2_start flips the page bit and clears the position
+    always @(posedge clk) begin
+        if (rst) begin
+            cntr_in <= 0;
+        end else if (pre2_start) begin
+            cntr_in <= {~cntr_in[3], 3'b0};
+        end else if (we_r) begin
+            cntr_in <= cntr_in + 1;
+        end
+    end
+
+    // destination table, registered: MSB set means "write to the other page"
+    always @(posedge clk) begin
+        case (cntr_in[2:0])
+            3'd0: ina_rom <= 4'b0_100;
+            3'd1: ina_rom <= 4'b1_001;
+            3'd2: ina_rom <= 4'b0_010;
+            3'd3: ina_rom <= 4'b1_111;
+            3'd4: ina_rom <= 4'b0_110;
+            3'd5: ina_rom <= 4'b0_101;
+            3'd6: ina_rom <= 4'b0_000;
+            3'd7: ina_rom <= 4'b0_011;
+        endcase
+    end
+
+    // buffer write
+    always @(posedge clk) begin
+        if (we_r) begin
+            reord_buf_ram[waddr] <= din;
+        end
+    end
+
+    // read pointer: loaded by start_11, then advances while inside a block or while writing
+    always @(posedge clk) begin
+        if (start_11) begin
+            raddr <= {~cntr_in[3], 3'b0};
+        end else if (rd_busy || we_r) begin
+            raddr <= raddr + 1;
+        end
+    end
+
+    // registered outputs
+    always @(posedge clk) begin
+        dout_r   <= reord_buf_ram[raddr];
+        start_12 <= start_11;
+        en_out   <= start_12 || rd_busy;
+    end
+
+    dly01_16 start_8__i (
+        .clk   (clk),        // input
+        .rst   (rst),        // input
+        .dly   (4'h7),       // input[3:0]
+        .din   (pre2_start), // input
+        .dout  (start_8)     // output
+    );
+
+    dly01_16 start_11__i (
+        .clk   (clk),      // input
+        .rst   (rst),      // input
+        .dly   (4'h1),     // input[3:0]
+        .din   (start_8),  // input
+        .dout  (start_11)  // output
+    );
+
+    dly01_16 dly01_16_2_i (
+        .clk   (clk),                    // input
+        .rst   (rst),                    // input
+        .dly   (4'h4),                   // input[3:0]
+        .din   (start_8 && !pre2_start), // input
+        .dout  (stop_in)                 // output
+    );
+endmodule
--- a/dsp/dsp_addsub_reg2_simd.v
+++ b/dsp/dsp_addsub_reg2_simd.v
+/*******************************************************************************
+ * <b>Module:</b>dsp_addsub_reg2_simd
+ * @file dsp_addsub_reg2_simd.v
+ * @date:2016-06-05  
+ * @author: Andrey Filippov
+ *     
+ * @brief: SIMD adder/subtracter with dual registers on the A-inputs
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ *dsp_addsub_reg2_simd.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ *  dsp_addsub_reg2_simd.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// dsp_addsub_reg2_simd: NUM_DATA independent WIDTH-bit adders/subtracters packed side by
+// side ("SIMD"). The A operand passes through two register stages (cea1, then cea2), the
+// B operand through one (ceb); 'subtract' is registered once and the result is stored in
+// the output register when 'cep' is high:  pout <= subtract ? (a - b) : (a + b), per lane.
+// With INSTANTIATE_DSP48E1 defined the function is mapped onto a DSP48E1 primitive,
+// otherwise a behavioral model is used.
+module  dsp_addsub_reg2_simd#(
+    parameter NUM_DATA =  2,
+    parameter WIDTH =    24
+)(
+    input                          clk,
+    input                          rst,
+    input  [NUM_DATA * WIDTH -1:0] ain,
+    input  [NUM_DATA * WIDTH -1:0] bin,
+    input                          cea1,      // load first a registers
+    input                          cea2,      // load second a registers
+    input                          ceb,       // load first b registers
+    input                          subtract,  // 0 - add, 1 - subtract
+    input                          cep,       // load output registers
+    output [NUM_DATA * WIDTH -1:0] pout);
+`ifdef INSTANTIATE_DSP48E1
+    // NOTE(review): this branch slices ain as [47:18]/[17:0] and uses USE_SIMD="TWO24",
+    // i.e. it presumably only fits NUM_DATA*WIDTH == 48 - confirm before changing parameters.
+    wire [4:0] inmode = { 1'b1,  // ~selb,
+                          1'b0,  // sub_d,
+                          1'b0,  // seld,
+                          1'b0,  // seld, // ~en_a,
+                          1'b1}; // ~sela};
+    wire [3:0] alumode = {2'b0,        // Z + X + Y + CIN  / -Z +( X + Y + CIN) -1
+                          1'b0,
+                          subtract};
+    wire [6:0] opmode =  {3'b011, // Z = C-input
+                          2'b00,  // Y = 0
+                          2'b11}; // X = A:B
+    // NOTE(review): carry-in is used to complete the two's complement for subtraction;
+    // verify against the DSP48E1 user guide how CARRYIN is applied to each lane in SIMD mode.
+    wire cryin = subtract;
+    // NOTE(review): OPMODEREG=1 while CECTRL is tied to 1'b0, and CARRYINREG=1 while
+    // CECARRYIN is tied to 1'b0 - confirm that the OPMODE and carry-in registers can ever
+    // load the values driven here.
+    DSP48E1 #(
+        .ACASCREG            (2), // (1),
+        .ADREG               (0), // (1),
+        .ALUMODEREG          (1),
+        .AREG                (2), // (1)
+        .AUTORESET_PATDET    ("NO_RESET"),
+        .A_INPUT             ("DIRECT"),
+        .BCASCREG            (2), // (1),
+        .BREG                (2), // (1)
+        .B_INPUT             ("DIRECT"),
+        .CARRYINREG          (1),
+        .CARRYINSELREG       (1),
+        .CREG                (1), //(1),
+        .DREG                (0), //(1),
+        .INMODEREG           (1),
+        .IS_ALUMODE_INVERTED (4'b0),
+        .IS_CARRYIN_INVERTED (1'b0),
+        .IS_CLK_INVERTED     (1'b0),
+        .IS_INMODE_INVERTED  (5'b0),
+        .IS_OPMODE_INVERTED  (7'b0),
+        .MASK                (48'hffffffffffff),
+        .MREG                (0),
+        .OPMODEREG           (1),
+        .PATTERN             (48'h000000000000),
+        .PREG                (1),
+        .SEL_MASK            ("MASK"),
+        .SEL_PATTERN         ("PATTERN"),
+        .USE_DPORT           ("TRUE"), //("FALSE"),
+        .USE_MULT            ("NONE"), //("MULTIPLY"),
+        .USE_PATTERN_DETECT  ("NO_PATDET"),
+        .USE_SIMD            ("TWO24") // ("ONE48")
+    ) DSP48E1_i (
+        .ACOUT          (),           // output[29:0]
+        .BCOUT          (),           // output[17:0]
+        .CARRYCASCOUT   (),           // output
+        .CARRYOUT       (),           // output[3:0]
+        .MULTSIGNOUT    (),           // output
+        .OVERFLOW       (),           // output
+        .P              (pout),       // output[47:0]
+        .PATTERNBDETECT (),           // output
+        .PATTERNDETECT  (),           // output
+        .PCOUT          (),           // output[47:0]
+        .UNDERFLOW      (),           // output
+        .A              (ain[47:18]), // input[29:0]
+        .ACIN           (30'b0),      // input[29:0]
+        .ALUMODE        (alumode),    // input[3:0]
+        .B              (ain[17:0]),  // input[17:0]
+        .BCIN           (18'b0),      // input[17:0]
+        .C              (bin),        // input[47:0]
+        .CARRYCASCIN    (1'b0),       // input
+        .CARRYIN        (cryin),      // input
+        .CARRYINSEL     (3'h0),       // input[2:0] // later modify?
+        .CEA1           (cea1),       // input
+        .CEA2           (cea2),       // input
+        .CEAD           (1'b0),       // input
+        .CEALUMODE      (1'b1),       // input
+        .CEB1           (cea1),       // input
+        .CEB2           (cea2),       // input
+        .CEC            (ceb),        // input
+        .CECARRYIN      (1'b0),       // input
+        .CECTRL         (1'b0),       // input
+        .CED            (1'b0),       // input
+        .CEINMODE       (1'b1),       // input
+        .CEM            (1'b1),       // input
+        .CEP            (cep),        // input
+        .CLK            (clk),        // input
+        .D              (25'h1ffffff),// input[24:0]
+        .INMODE         (inmode),     // input[4:0]
+        .MULTSIGNIN     (1'b0),       // input
+        .OPMODE         (opmode),     // input[6:0]
+        .PCIN           (48'b0),      // input[47:0]
+        .RSTA           (rst),        // input
+        .RSTALLCARRYIN  (rst),        // input
+        .RSTALUMODE     (rst),        // input
+        .RSTB           (rst),        // input
+        .RSTC           (rst),        // input
+        .RSTCTRL        (rst),        // input
+        .RSTD           (rst),        // input
+        .RSTINMODE      (rst),        // input
+        .RSTM           (rst),        // input
+        .RSTP           (rst)         // input
+    );
+`else
+    // Behavioral model
+    reg    [NUM_DATA * WIDTH -1:0] a1_reg;   // first A register stage
+    reg    [NUM_DATA * WIDTH -1:0] a2_reg;   // second A register stage (adder operand)
+    reg    [NUM_DATA * WIDTH -1:0] b_reg;    // B register (adder operand)
+    reg    [NUM_DATA * WIDTH -1:0] p_reg;    // output register
+    reg                            sub_r;    // registered 'subtract'
+    wire   [NUM_DATA * WIDTH -1:0] p_w;      // combinatorial per-lane result
+    assign pout = p_reg;
+    generate
+        genvar i;
+        // One independent WIDTH-bit adder/subtracter per lane; the loop is bounded by
+        // NUM_DATA (a literal 4 addressed bits beyond the operand vectors).
+        for (i = 0; i < NUM_DATA; i = i+1) begin: addsub_lane_block
+            // Explicit parentheses: '+' binds tighter than '?:', so the unparenthesized
+            // form selected between -b and b and never added the A operand.
+            assign p_w[WIDTH*i +: WIDTH] = sub_r ?
+                       (a2_reg[WIDTH*i +: WIDTH] - b_reg[WIDTH*i +: WIDTH]) :
+                       (a2_reg[WIDTH*i +: WIDTH] + b_reg[WIDTH*i +: WIDTH]);
+        end
+    endgenerate
+    always @ (posedge clk) begin
+        if      (rst)  a1_reg <= 0;
+        else if (cea1) a1_reg <= ain;
+        if      (rst)  a2_reg <= 0;
+        else if (cea2) a2_reg <= a1_reg;
+        if      (rst)  b_reg <= 0;
+        else if (ceb)  b_reg <= bin;
+        sub_r <= subtract;
+        // reset has priority over the clock enable, as for the other registers
+        if      (rst) p_reg <= 0;
+        else if (cep) p_reg <= p_w;
+    end
+`endif
+endmodule
--- a/dsp/dsp_addsub_simd.v
+++ b/dsp/dsp_addsub_simd.v
+/*******************************************************************************
+ * <b>Module:</b>dsp_addsub_simd
+ * @file dsp_addsub_simd.v
+ * @date:2016-06-05  
+ * @author: Andrey Filippov
+ *     
+ * @brief: SIMD adder/subtracter
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ *dsp_addsub_simd.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ *  dsp_addsub_simd.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// SIMD adder/subtracter: NUM_DATA independent lanes of WIDTH bits each.
+// Per lane: pout = ain + bin (subtract == 0) or pout = ain - bin (subtract == 1),
+// modulo 2**WIDTH, with no carry/borrow propagating between lanes.
+// Behavioral model latency: operands and 'subtract' are registered on one clock,
+// the result is registered into pout on the next clock for which cep is high.
+module  dsp_addsub_simd#(
+    parameter NUM_DATA =  2,   // number of lanes. Currently with INSTANTIATE_DSP48E1 should be 2
+    parameter WIDTH =    24    // bits per lane.   Currently with INSTANTIATE_DSP48E1 should be 24
+)(
+    input                          clk,
+    input                          rst,
+    input  [NUM_DATA * WIDTH -1:0] ain,
+    input  [NUM_DATA * WIDTH -1:0] bin,
+    input                          cea,      // load a registers
+    input                          ceb,      // load b registers
+    input                          subtract, // 0 - add, 1 - subtract
+    input                          cep,      // load output registers
+    output [NUM_DATA * WIDTH -1:0] pout);
+`ifdef INSTANTIATE_DSP48E1
+    // INMODE only steers the multiplier/pre-adder path, which is not used here
+    // (USE_MULT = "NONE"); the value is kept constant.
+    wire [4:0] inmode = { 1'b1,  // INMODE[4]
+                          1'b0,  // INMODE[3]
+                          1'b0,  // INMODE[2]
+                          1'b0,  // INMODE[1]
+                          1'b1}; // INMODE[0]
+    wire [3:0] alumode = {2'b0,        // Z + X + Y + CIN  / -Z +( X + Y + CIN) -1
+                          1'b0,
+                          subtract};
+    wire [6:0] opmode =  {3'b011, // Z = C-input
+                          2'b00,  // Y = 0
+                          2'b11}; // X = A:B
+    wire cryin = subtract;        // supplies the "+1" of the two's complement subtraction
+    // NOTE(review): OPMODEREG/CARRYINREG are 1 while CECTRL/CECARRYIN are tied to 0,
+    // and AREG/BREG are 1 while only CEA1/CEB1 are driven (CEA2/CEB2 = 0).
+    // Per UG479 these registers may then never load - confirm against the user guide
+    // and in simulation with the vendor primitive.
+    // NOTE(review): presumably a single CARRYIN reaches only the lowest SIMD lane - confirm
+    // that the upper lane subtracts correctly in "TWO24" mode.
+    DSP48E1 #(
+        .ACASCREG            (1),
+        .ADREG               (0), // (1),
+        .ALUMODEREG          (1),
+        .AREG                (1), // (1)
+        .AUTORESET_PATDET    ("NO_RESET"),
+        .A_INPUT             ("DIRECT"),
+        .BCASCREG            (1),
+        .BREG                (1), // (1)
+        .B_INPUT             ("DIRECT"),
+        .CARRYINREG          (1),
+        .CARRYINSELREG       (1),
+        .CREG                (1), //(1),
+        .DREG                (0), //(1),
+        .INMODEREG           (1),
+        .IS_ALUMODE_INVERTED (4'b0),
+        .IS_CARRYIN_INVERTED (1'b0),
+        .IS_CLK_INVERTED     (1'b0),
+        .IS_INMODE_INVERTED  (5'b0),
+        .IS_OPMODE_INVERTED  (7'b0),
+        .MASK                (48'hffffffffffff),
+        .MREG                (0),
+        .OPMODEREG           (1),
+        .PATTERN             (48'h000000000000),
+        .PREG                (1),
+        .SEL_MASK            ("MASK"),
+        .SEL_PATTERN         ("PATTERN"),
+        .USE_DPORT           ("TRUE"), //("FALSE"),
+        .USE_MULT            ("NONE"), //("MULTIPLY"),
+        .USE_PATTERN_DETECT  ("NO_PATDET"),
+        .USE_SIMD            ("TWO24") // ("ONE48")
+    ) DSP48E1_i (
+        .ACOUT          (),           // output[29:0]
+        .BCOUT          (),           // output[17:0]
+        .CARRYCASCOUT   (),           // output
+        .CARRYOUT       (),           // output[3:0]
+        .MULTSIGNOUT    (),           // output
+        .OVERFLOW       (),           // output
+        .P              (pout),       // output[47:0]
+        .PATTERNBDETECT (),           // output
+        .PATTERNDETECT  (),           // output
+        .PCOUT          (),           // output[47:0]
+        .UNDERFLOW      (),           // output
+        .A              (ain[47:18]), // input[29:0]  upper part of the 48-bit A:B operand
+        .ACIN           (30'b0),      // input[29:0]
+        .ALUMODE        (alumode),    // input[3:0]
+        .B              (ain[17:0]),  // input[17:0]  lower part of the 48-bit A:B operand
+        .BCIN           (18'b0),      // input[17:0]
+        .C              (bin),        // input[47:0]
+        .CARRYCASCIN    (1'b0),       // input
+        .CARRYIN        (cryin),      // input
+        .CARRYINSEL     (3'h0),       // input[2:0] // later modify?
+        .CEA1           (cea),        // input
+        .CEA2           (1'b0),       // input
+        .CEAD           (1'b0),       // input
+        .CEALUMODE      (1'b1),       // input
+        .CEB1           (cea),        // input
+        .CEB2           (1'b0),       // input
+        .CEC            (ceb),        // input
+        .CECARRYIN      (1'b0),       // input
+        .CECTRL         (1'b0),       // input
+        .CED            (1'b0),       // input
+        .CEINMODE       (1'b1),       // input
+        .CEM            (1'b1),       // input
+        .CEP            (cep),        // input
+        .CLK            (clk),        // input
+        .D              (25'h1ffffff),// input[24:0]
+        .INMODE         (inmode),     // input[4:0]
+        .MULTSIGNIN     (1'b0),       // input
+        .OPMODE         (opmode),     // input[6:0]
+        .PCIN           (48'b0),      // input[47:0]
+        .RSTA           (rst),        // input
+        .RSTALLCARRYIN  (rst),        // input
+        .RSTALUMODE     (rst),        // input
+        .RSTB           (rst),        // input
+        .RSTC           (rst),        // input
+        .RSTCTRL        (rst),        // input
+        .RSTD           (rst),        // input
+        .RSTINMODE      (rst),        // input
+        .RSTM           (rst),        // input
+        .RSTP           (rst)         // input
+    );
+`else
+    // Behavioral model used when the DSP48E1 primitive is not instantiated
+    reg    [NUM_DATA * WIDTH -1:0] a_reg;  // registered ain
+    reg    [NUM_DATA * WIDTH -1:0] b_reg;  // registered bin
+    reg    [NUM_DATA * WIDTH -1:0] p_reg;  // registered result
+    reg                            sub_r;  // registered 'subtract'
+    wire   [NUM_DATA * WIDTH -1:0] p_w;    // combinatorial per-lane result
+    assign pout = p_reg;
+    generate
+        genvar i;
+        // One adder/subtracter per lane. The loop bound follows NUM_DATA so no
+        // part-select ever reaches beyond the NUM_DATA*WIDTH wide vectors.
+        // (block label kept as is to preserve existing hierarchical names)
+        for (i = 0; i < NUM_DATA; i = i+1) begin: byte_fifo_block
+            // Explicit parentheses: '+' binds tighter than '?:', so the operation
+            // select must wrap the complete sum/difference.
+            assign p_w[WIDTH*i +: WIDTH] = sub_r ?
+                                           (a_reg[WIDTH*i +: WIDTH] - b_reg[WIDTH*i +: WIDTH]) :
+                                           (a_reg[WIDTH*i +: WIDTH] + b_reg[WIDTH*i +: WIDTH]);
+        end
+    endgenerate
+    always @ (posedge clk) begin
+        if      (rst) a_reg <= 0;
+        else if (cea) a_reg <= ain;
+        if      (rst) b_reg <= 0;
+        else if (ceb) b_reg <= bin;
+        sub_r <= subtract;
+        // reset has priority over the clock enable (same as for a_reg/b_reg)
+        if      (rst) p_reg <= 0;
+        else if (cep) p_reg <= p_w;
+    end
+`endif
+endmodule
--- a/dsp/dsp_ma.v
+++ b/dsp/dsp_ma.v
+/*******************************************************************************
+ *  dsp_ma
+ * @file dsp_ma.v
+ * @date:2016-06-05  
+ * @author: Andrey Filippov
+ *     
+ * @brief: DSP with multi-input multiplier and accumulator
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ * dsp_ma.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dsp_ma.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// Signed multiplier with selectable operand registers and an optional accumulator.
+// Multiplier operands: B is b1 or b2 (selb); A is a1 or a2 (sela), or d when seld == 1.
+// The registered product is optionally negated (neg_m) and either replaces the output
+// (accum == 0) or is added to the previous output (accum == 1).
+// Behavioral model pipeline: operand/select registers -> product register -> output register.
+module  dsp_ma #(
+    parameter B_WIDTH = 18,
+    parameter A_WIDTH = 25,
+    parameter P_WIDTH = 48)
+(
+    input                       clk,
+    input                       rst,
+    input  signed [B_WIDTH-1:0] bin,
+    input                       ceb1,     // load b1 register
+    input                       ceb2,     // load b2 register
+    input                       selb,     // 0 - select b1, 1 - select b2
+    input  signed [A_WIDTH-1:0] ain,
+    input                       cea1,     // load a1 register
+    input                       cea2,     // load a2 register
+    input  signed [A_WIDTH-1:0] din,
+    input                       ced,      // load d register
+    input                       sela,     // 0 - select a1, 1 - select a2
+    input                       seld,     // 0 - select a1/a2, 1 - select d
+    input                       neg_m,    // 1 - negate multiplier result
+    input                       accum,    // 0 - use multiplier result, 1 add to accumulator
+    output signed [P_WIDTH-1:0] pout
+);
+`ifdef INSTANTIATE_DSP48E1
+    wire [4:0] inmode = {~selb,
+                          1'b0, // sub_d,
+                          seld,
+                          seld, // ~en_a,
+                         ~sela};
+    wire [3:0] alumode = {2'b0,
+                          neg_m,
+                          neg_m};
+    wire [6:0] opmode =  {1'b0,   // Z mux: 3'b010 when accum == 1, 3'b000 otherwise
+                          accum,
+                          1'b0,
+                          2'b01,  // Y mux
+                          2'b01}; // X mux
+    // NOTE(review): OPMODEREG is 1 while CECTRL is tied to 0 - per UG479 the OPMODE
+    // register may then never load; confirm in simulation with the vendor primitive.
+    DSP48E1 #(
+        .ACASCREG            (1),
+        .ADREG               (0), // (1),
+        .ALUMODEREG          (1),
+        .AREG                (2), // (1)
+        .AUTORESET_PATDET    ("NO_RESET"),
+        .A_INPUT             ("DIRECT"),
+        .BCASCREG            (1),
+        .BREG                (2), // (1)
+        .B_INPUT             ("DIRECT"),
+        .CARRYINREG          (1),
+        .CARRYINSELREG       (1),
+        .CREG                (0), //(1),
+        .DREG                (1),
+        .INMODEREG           (1),
+        .IS_ALUMODE_INVERTED (4'b0),
+        .IS_CARRYIN_INVERTED (1'b0),
+        .IS_CLK_INVERTED     (1'b0),
+        .IS_INMODE_INVERTED  (5'b0),
+        .IS_OPMODE_INVERTED  (7'b0),
+        .MASK                (48'hffffffffffff),
+        .MREG                (1),
+        .OPMODEREG           (1),
+        .PATTERN             (48'h000000000000),
+        .PREG                (1),
+        .SEL_MASK            ("MASK"),
+        .SEL_PATTERN         ("PATTERN"),
+        .USE_DPORT           ("TRUE"), //("FALSE"),
+        .USE_MULT            ("MULTIPLY"),
+        .USE_PATTERN_DETECT  ("NO_PATDET"),
+        .USE_SIMD            ("ONE48")
+    ) DSP48E1_i (
+        .ACOUT          (),           // output[29:0]
+        .BCOUT          (),           // output[17:0]
+        .CARRYCASCOUT   (),           // output
+        .CARRYOUT       (),           // output[3:0]
+        .MULTSIGNOUT    (),           // output
+        .OVERFLOW       (),           // output
+        .P              (pout),       // output[47:0]
+        .PATTERNBDETECT (),           // output
+        .PATTERNDETECT  (),           // output
+        .PCOUT          (),           // output[47:0]
+        .UNDERFLOW      (),           // output
+        .A              ({{30-A_WIDTH{ain[A_WIDTH-1]}}, ain}), // input[29:0]  sign-extended to the port width
+        .ACIN           (30'b0),      // input[29:0]
+        .ALUMODE        (alumode),    // input[3:0]
+        .B              (bin),        // input[17:0]
+        .BCIN           (18'b0),      // input[17:0]
+        .C              (48'hffffffffffff), // input[47:0]
+        .CARRYCASCIN    (1'b0),       // input
+        .CARRYIN        (1'b0),       // input
+        .CARRYINSEL     (3'h0),       // input[2:0] // later modify?
+        .CEA1           (cea1),       // input
+        .CEA2           (cea2),       // input
+        .CEAD           (1'b0),       // input
+        .CEALUMODE      (1'b1),       // input
+        .CEB1           (ceb1),       // input
+        .CEB2           (ceb2),       // input
+        .CEC            (1'b0),       // input
+        .CECARRYIN      (1'b0),       // input
+        .CECTRL         (1'b0),       // input
+        .CED            (ced),        // input
+        .CEINMODE       (1'b1),       // input
+        .CEM            (1'b1),       // input
+        .CEP            (1'b1),       // input
+        .CLK            (clk),        // input
+        .D              (din),        // input[24:0]
+        .INMODE         (inmode),     // input[4:0]
+        .MULTSIGNIN     (1'b0),       // input
+        .OPMODE         (opmode),     // input[6:0]
+        .PCIN           (48'b0),      // input[47:0]
+        .RSTA           (rst),        // input
+        .RSTALLCARRYIN  (rst),        // input
+        .RSTALUMODE     (rst),        // input
+        .RSTB           (rst),        // input
+        .RSTC           (rst),        // input
+        .RSTCTRL        (rst),        // input
+        .RSTD           (rst),        // input
+        .RSTINMODE      (rst),        // input
+        .RSTM           (rst),        // input
+        .RSTP           (rst)        // input
+    );
+`else
+// Will try to make it infer DSP48e1
+    reg  signed [B_WIDTH-1:0] b1_reg;
+    reg  signed [B_WIDTH-1:0] b2_reg;
+    reg  signed [A_WIDTH-1:0] a1_reg;
+    reg  signed [A_WIDTH-1:0] a2_reg;
+    reg  signed [A_WIDTH-1:0] d_reg;
+    reg  signed [P_WIDTH-1:0] m_reg;   // registered product, sign-extended to P_WIDTH
+    reg  signed [P_WIDTH-1:0] p_reg;   // output/accumulator register
+    wire signed [A_WIDTH+B_WIDTH-1:0] m_wire; // full-width signed product
+    wire signed [B_WIDTH-1:0] b_wire;  // selected B operand
+    wire signed [A_WIDTH-1:0] a_wire;  // selected A (or D) operand
+    reg                       selb_r;
+    reg                       sela_r;
+    reg                       seld_r;
+    reg                       neg_m_r;
+    reg                       accum_r;
+    wire signed [P_WIDTH-1:0] m_reg_pm;    // product, optionally negated
+    wire signed [P_WIDTH-1:0] p_reg_cond;  // previous output when accumulating, else 0
+    assign pout = p_reg;
+    assign b_wire = selb_r ? b2_reg : b1_reg;
+    assign a_wire = seld_r ? d_reg : (sela_r ? a2_reg : a1_reg);
+    assign m_wire = a_wire * b_wire;
+    assign m_reg_pm =   neg_m_r ? - m_reg : m_reg;
+    assign p_reg_cond = accum_r ? p_reg : 0;
+    always @ (posedge clk) begin
+        if      (rst)  b1_reg <= 0;
+        else if (ceb1) b1_reg <= bin;
+        if      (rst)  b2_reg <= 0;
+        else if (ceb2) b2_reg <= bin;
+        if      (rst)  a1_reg <= 0;
+        else if (cea1) a1_reg <= ain;
+        if      (rst)  a2_reg <= 0;
+        else if (cea2) a2_reg <= ain;
+        if      (rst)  d_reg <= 0;
+        else if (ced)  d_reg <= din;
+        selb_r <= selb;
+        sela_r <= sela;
+        seld_r <= seld;
+        neg_m_r <= neg_m;
+        accum_r <= accum;
+        // Signed assignment sign-extends the product to P_WIDTH. Padding with zeros
+        // would turn every negative product into a large positive number.
+        // Reset mirrors RSTM/RSTP of the primitive branch.
+        if      (rst)  m_reg <= 0;
+        else           m_reg <= m_wire;
+        if      (rst)  p_reg <= 0;
+        else           p_reg <= p_reg_cond + m_reg_pm;
+    end
+`endif
+endmodule
--- a/dsp/dsp_ma_preadd.v
+++ b/dsp/dsp_ma_preadd.v
+/*******************************************************************************
+ *  dsp_ma_preadd
+ * @file dsp_ma_preadd.v
+ * @date:2016-06-05  
+ * @author: Andrey Filippov
+ *     
+ * @brief: DSP with multi-input multiplier and accumulator with pre-adder
+ *
+ * @copyright Copyright (c) 2016 Elphel, Inc.
+ *
+ * <b>License:</b>
+ *
+ * dsp_ma_preadd.v is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dsp_ma_preadd.v is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/> .
+ *
+ * Additional permission under GNU GPL version 3 section 7:
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with independent modules provided by the FPGA vendor only (this permission
+ * does not extend to any 3-rd party modules, "soft cores" or macros) under
+ * different license terms solely for the purpose of generating binary "bitstream"
+ * files and/or simulating the code, the copyright holders of this Program give
+ * you the right to distribute the covered work without those independent modules
+ * as long as the source code for them is available from the FPGA vendor free of
+ * charge, and there is no dependence on any encrypted modules for simulating of
+ * the combined code. This permission applies to you if the distributed code
+ * contains all the components and scripts required to completely simulate it
+ * with at least one of the Free Software programs.
+ *******************************************************************************/
+`timescale 1ns/1ps
+// Signed multiplier with pre-adder and optional accumulator.
+// The A operand (a1 or a2, gated by en_a) is combined with the D operand (gated by en_d)
+// in the pre-adder; the registered pre-adder result is multiplied by b1 or b2 (selb).
+// The registered product is optionally negated (neg_m) and either replaces the output
+// (accum == 0) or is added to the previous output (accum == 1).
+// Behavioral model pipeline: operand/control registers -> pre-adder register (cead) ->
+// product register -> output register.
+module  dsp_ma_preadd #(
+    parameter B_WIDTH = 18,
+    parameter A_WIDTH = 25,
+    parameter P_WIDTH = 48)
+(
+    input                       clk,
+    input                       rst,
+    input  signed [B_WIDTH-1:0] bin,
+    input                       ceb1,     // load b1 register
+    input                       ceb2,     // load b2 register
+    input                       selb,     // 0 - select b1, 1 - select b2
+    input  signed [A_WIDTH-1:0] ain,
+    input                       cea1,     // clock enable a1 reg
+    input                       cea2,     // clock enable a2 reg
+    input  signed [A_WIDTH-1:0] din,
+    input                       ced,      // enable d-reg
+    input                       cead,     // enable ad register (after pre-adder)
+    input                       sela,     // 0 - select a1, 1 - select a2
+    input                       en_a,     // 1 - enable a input (0 - zero) ~inmode[1]
+    input                       en_d,     // 1 - enable d input (0 - zero) ~inmode[2]
+    input                       sub_d,    // 0 - pre-add (A+D), 1 - pre-subtract (A-D)
+    input                       neg_m,    // 1 - negate multiplier result
+    input                       accum,    // 0 - use multiplier result, 1 add to accumulator
+    output signed [P_WIDTH-1:0] pout
+);
+`ifdef INSTANTIATE_DSP48E1
+    // NOTE(review): the port comment states ~inmode[2] for en_d, but en_d is applied
+    // here without inversion - the code looks intentional, confirm and fix the comment.
+    // NOTE(review): UG479 appears to list INMODE[3] = 1 as "D - A" (A is negated), while
+    // the behavioral model below computes A - D for sub_d == 1 - confirm which is intended.
+    wire [4:0] inmode = {~selb,
+                          sub_d,
+                          en_d,
+                         ~en_a,
+                         ~sela};
+    wire [3:0] alumode = {2'b0,
+                          neg_m,
+                          neg_m};
+    wire [6:0] opmode =  {1'b0,   // Z mux: 3'b010 when accum == 1, 3'b000 otherwise
+                          accum,
+                          1'b0,
+                          2'b01,  // Y mux
+                          2'b01}; // X mux
+    // NOTE(review): OPMODEREG is 1 while CECTRL is tied to 0 - per UG479 the OPMODE
+    // register may then never load; confirm in simulation with the vendor primitive.
+    DSP48E1 #(
+        .ACASCREG            (1),
+        .ADREG               (1),
+        .ALUMODEREG          (1),
+        .AREG                (2), // (1)
+        .AUTORESET_PATDET    ("NO_RESET"),
+        .A_INPUT             ("DIRECT"),
+        .BCASCREG            (1),
+        .BREG                (2), // (1)
+        .B_INPUT             ("DIRECT"),
+        .CARRYINREG          (1),
+        .CARRYINSELREG       (1),
+        .CREG                (0), //(1),
+        .DREG                (1),
+        .INMODEREG           (1),
+        .IS_ALUMODE_INVERTED (4'b0),
+        .IS_CARRYIN_INVERTED (1'b0),
+        .IS_CLK_INVERTED     (1'b0),
+        .IS_INMODE_INVERTED  (5'b0),
+        .IS_OPMODE_INVERTED  (7'b0),
+        .MASK                (48'hffffffffffff),
+        .MREG                (1),
+        .OPMODEREG           (1),
+        .PATTERN             (48'h000000000000),
+        .PREG                (1),
+        .SEL_MASK            ("MASK"),
+        .SEL_PATTERN         ("PATTERN"),
+        .USE_DPORT           ("TRUE"), //("FALSE"),
+        .USE_MULT            ("MULTIPLY"),
+        .USE_PATTERN_DETECT  ("NO_PATDET"),
+        .USE_SIMD            ("ONE48")
+    ) DSP48E1_i (
+        .ACOUT          (),           // output[29:0]
+        .BCOUT          (),           // output[17:0]
+        .CARRYCASCOUT   (),           // output
+        .CARRYOUT       (),           // output[3:0]
+        .MULTSIGNOUT    (),           // output
+        .OVERFLOW       (),           // output
+        .P              (pout),       // output[47:0]
+        .PATTERNBDETECT (),           // output
+        .PATTERNDETECT  (),           // output
+        .PCOUT          (),           // output[47:0]
+        .UNDERFLOW      (),           // output
+        .A              ({{30-A_WIDTH{ain[A_WIDTH-1]}}, ain}), // input[29:0]  sign-extended to the port width
+        .ACIN           (30'b0),      // input[29:0]
+        .ALUMODE        (alumode),    // input[3:0]
+        .B              (bin),        // input[17:0]
+        .BCIN           (18'b0),      // input[17:0]
+        .C              (48'hffffffffffff), // input[47:0]
+        .CARRYCASCIN    (1'b0),       // input
+        .CARRYIN        (1'b0),       // input
+        .CARRYINSEL     (3'h0),       // input[2:0] // later modify?
+        .CEA1           (cea1),       // input
+        .CEA2           (cea2),       // input
+        .CEAD           (cead),       // input
+        .CEALUMODE      (1'b1),       // input
+        .CEB1           (ceb1),       // input
+        .CEB2           (ceb2),       // input
+        .CEC            (1'b0),       // input
+        .CECARRYIN      (1'b0),       // input
+        .CECTRL         (1'b0),       // input
+        .CED            (ced),        // input
+        .CEINMODE       (1'b1),       // input
+        .CEM            (1'b1),       // input
+        .CEP            (1'b1),       // input
+        .CLK            (clk),        // input
+        .D              (din),        // input[24:0]
+        .INMODE         (inmode),     // input[4:0]
+        .MULTSIGNIN     (1'b0),       // input
+        .OPMODE         (opmode),     // input[6:0]
+        .PCIN           (48'b0),      // input[47:0]
+        .RSTA           (rst),        // input
+        .RSTALLCARRYIN  (rst),        // input
+        .RSTALUMODE     (rst),        // input
+        .RSTB           (rst),        // input
+        .RSTC           (rst),        // input
+        .RSTCTRL        (rst),        // input
+        .RSTD           (rst),        // input
+        .RSTINMODE      (rst),        // input
+        .RSTM           (rst),        // input
+        .RSTP           (rst)        // input
+    );
+`else
+// Will try to make it infer DSP48e1
+    reg  signed [B_WIDTH-1:0] b1_reg;
+    reg  signed [B_WIDTH-1:0] b2_reg;
+    reg  signed [A_WIDTH-1:0] a1_reg;
+    reg  signed [A_WIDTH-1:0] a2_reg;
+    reg  signed [A_WIDTH-1:0] d_reg;
+    reg  signed [A_WIDTH-1:0] ad_reg;  // registered pre-adder result
+    reg  signed [P_WIDTH-1:0] m_reg;   // registered product, sign-extended to P_WIDTH
+    reg  signed [P_WIDTH-1:0] p_reg;   // output/accumulator register
+    wire signed [A_WIDTH+B_WIDTH-1:0] m_wire; // full-width signed product
+    wire signed [B_WIDTH-1:0] b_wire;  // selected B operand
+    wire signed [A_WIDTH-1:0] a_wire;  // selected A operand, or 0 when disabled
+    wire signed [A_WIDTH-1:0] d_wire;  // D operand (negated for sub_d), or 0 when disabled
+    reg                       selb_r;
+    reg                       sela_r;
+    reg                       en_a_r;
+    reg                       en_d_r;
+    reg                       sub_d_r;
+    reg                       neg_m_r;
+    reg                       accum_r;
+    wire signed [P_WIDTH-1:0] m_reg_pm;    // product, optionally negated
+    wire signed [P_WIDTH-1:0] p_reg_cond;  // previous output when accumulating, else 0
+    assign pout = p_reg;
+    assign b_wire = selb_r ? b2_reg : b1_reg;
+    assign a_wire = en_a_r ? (sela_r ? a2_reg : a1_reg) : {A_WIDTH{1'b0}};
+    assign d_wire = en_d_r ? (sub_d_r ? -d_reg : d_reg) : {A_WIDTH{1'b0}};
+    assign m_wire = ad_reg * b_wire;
+    assign m_reg_pm =   neg_m_r ? - m_reg : m_reg;
+    assign p_reg_cond = accum_r ? p_reg : 0;
+    always @ (posedge clk) begin
+        if      (rst)  b1_reg <= 0;
+        else if (ceb1) b1_reg <= bin;
+        if      (rst)  b2_reg <= 0;
+        else if (ceb2) b2_reg <= bin;
+        if      (rst)  a1_reg <= 0;
+        else if (cea1) a1_reg <= ain;
+        if      (rst)  a2_reg <= 0;
+        else if (cea2) a2_reg <= ain;
+        if      (rst)  d_reg <= 0;
+        else if (ced)  d_reg <= din;
+        if      (rst)   ad_reg <= 0;
+        else if (cead)  ad_reg <= a_wire + d_wire;
+        neg_m_r <= neg_m;
+        accum_r <= accum;
+        selb_r <=  selb;
+        sela_r <=  sela;
+        en_a_r <=  en_a;
+        en_d_r <=  en_d;
+        sub_d_r <= sub_d;
+        // Signed assignment sign-extends the product to P_WIDTH. Padding with zeros
+        // would turn every negative product into a large positive number.
+        // Reset mirrors RSTM/RSTP of the primitive branch.
+        if      (rst)  m_reg <= 0;
+        else           m_reg <= m_wire;
+        if      (rst)  p_reg <= 0;
+        else           p_reg <= p_reg_cond + m_reg_pm;
+    end
+`endif
+endmodule