/*!
* Module:dct_chen_transpose
* @file dct_chen_transpose.v
* @date 2016-06-09
* @author Andrey Filippov
*
* @brief Reorder+transpose data between two 1-d DCT passes
*
* @copyright Copyright (c) 2016 Elphel, Inc.
*
* License:
*
*dct_chen_transpose.v is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dct_chen_transpose.v is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*
* Additional permission under GNU GPL version 3 section 7:
* If you modify this Program, or any covered work, by linking or combining it
* with independent modules provided by the FPGA vendor only (this permission
* does not extend to any 3-rd party modules, "soft cores" or macros) under
* different license terms solely for the purpose of generating binary "bitstream"
* files and/or simulating the code, the copyright holders of this Program give
* you the right to distribute the covered work without those independent modules
* as long as the source code for them is available from the FPGA vendor free of
* charge, and there is no dependence on any encrypted modules for simulating of
* the combined code. This permission applies to you if the distributed code
* contains all the components and scripts required to completely simulate it
* with at least one of the Free Software programs.
*/
`timescale 1ns/1ps
module dct_chen_transpose#(
parameter WIDTH = 24
)(
input clk,
input rst,
input [WIDTH -1:0] din, // pre2_start-X-F4-X-F2-X-F6-F5-F0-F3-X-F1-X-F7
input pre2_start, // Two cycles ahead of F4. Next one should start either at exactly 64 cycles, or >=68 cycles from the previous one
output [2*WIDTH -1:0] dout_10_32_76_54, // Concatenated/reordered output data {x[1],x[0]}/{x[3],x[2]}/ {x[7],x[6]}/{x[5],x[4]}
output reg start_out,
output reg en_out // to be sampled when start_out is expected
);
reg [6:0] wcntr; // write counter, used to calculate write address (2 pages of 64 words), that will be valid next cycle
wire [2:0] wrow = wcntr[5:3];
wire [2:0] wcol = wcntr[2:0];
wire wpage;
reg wcol13; // columns 1 and 3 (special)
wire [3:0] wrow_mod; // effective row, including modifier for wpage
wire [1:0] wcol01_mod = wcol[1:0] - wcol[2];
reg [6:0] waddr;
wire pre2_stop;
reg [WIDTH-1:0] transpose_ram[0:127];
reg pre_we_r;
reg we_r;
reg [5:0] rcntr = 6'h3f; // read counter
reg [5:0] raddr; // read counter, addresses dual words
reg re_r;
reg regen_r;
reg [2*WIDTH-1:0] ram_reg;
reg [2*WIDTH-1:0] ram_reg2;
wire pre_rstart_w = wcntr[5:0] == 61;
reg [1:0] rstop_r;
reg first_after_pause; // first block after pause - do not write 2 items to the "past"
assign wpage = wcntr[6] ^ wrow_mod[3]; // previous page for row 0, col 1 & 3
assign wrow_mod = {1'b0, wrow} - wcol13;
assign dout_10_32_76_54 = ram_reg2;
// TODO: prevent writing to previous page after pause!
always @(posedge clk) begin
wcol13 <= pre_we_r & ~wcol[0] & ~wcol[2];
waddr[0] <= wrow_mod[0] ^ wrow_mod[2];
waddr[1] <= wcol[0];
waddr[2] <= wcol01_mod[1];
waddr[3] <= ~wcol01_mod[0] ^ wcol01_mod[1];
waddr[4] <= wrow_mod[1] ^ wrow_mod[2];
waddr[5] <= wrow_mod[2];
waddr[6] <= wpage;
if (rst) pre_we_r <= 0;
else if (pre2_start) pre_we_r <= 1;
else if (pre2_stop) pre_we_r <= 0;
if (rst) wcntr <= 0;
else if (pre_we_r) wcntr <= wcntr + 1; // including page, should be before 'if (pre2_start)'
else if (pre2_start) wcntr <= {wcntr[6], 6'b0}; // if happens during pre_we_r - will be ignored, otherwise (after pause) will zero in-page adderss
we_r <= pre_we_r && (!first_after_pause || !wcol13 || (|wrow)); // do not write first after pause to the "past"
if (we_r) transpose_ram[waddr] <= din;
if (rst) rcntr <= ~0;
else if (pre_rstart_w) rcntr <= 0;
else if (!(&rcntr)) rcntr <= rcntr + 1;
re_r <= ~rcntr[2];
regen_r <= re_r;
if (rcntr == 0) raddr[5] <= wcntr[6]; // page
raddr[4:0] <= {rcntr[1:0],rcntr[5:3]};
if (re_r) ram_reg <= {transpose_ram[2*raddr+1],transpose_ram[2*raddr]}; // See if it will correctly infer
if (regen_r) ram_reg2 <= ram_reg;
if (rst || pre_rstart_w) rstop_r <= 0;
else if (&rcntr) rstop_r <= {rstop_r[0], 1'b1};
start_out <= (rcntr == 1);
if (rst) en_out <= 0;
else if (rcntr == 1) en_out <= 1;
else if (rstop_r[1]) en_out <= 0;
if (rst) first_after_pause <= 0;
else if (pre2_start && !we_r) first_after_pause <= 1;
else if (&wcntr[5:0]) first_after_pause <= 0;
end
dly01_16 dly01_16_stop_i (
.clk (clk), // input
.rst (rst), // input
.dly (4'h3), // input[3:0]
.din (&wcntr[5:0] && !pre2_start), // input
.dout (pre2_stop) // output
);
/*
min latency == 60, // adding 1 for read after write in RAM
max latency = 83 (when using a 2-page buffer)
wseq=(0x08, 0x62, 0x04, 0x6e, 0x0c, 0x0a, 0x00, 0x06,
0x09, 0x02, 0x05, 0x0e, 0x0d, 0x0b, 0x01, 0x07,
0x18, 0x03, 0x14, 0x0f, 0x1c, 0x1a, 0x10, 0x16,
0x19, 0x12, 0x15, 0x1e, 0x1d, 0x1b, 0x11, 0x17,
0x39, 0x13, 0x35, 0x1f, 0x3d, 0x3b, 0x31, 0x37,
0x38, 0x33, 0x34, 0x3f, 0x3c, 0x3a, 0x30, 0x36,
0x29, 0x32, 0x25, 0x3e, 0x2d, 0x2b, 0x21, 0x27,
0x28, 0x23, 0x24, 0x2f, 0x2c, 0x2a, 0x20, 0x26)
rseq = (0x00,0x10,0x20,0x30,-1,-1,-1,-1,
0x02,0x12,0x22,0x32,-1,-1,-1,-1,
0x04,0x14,0x24,0x34,-1,-1,-1,-1,
0x06,0x16,0x26,0x36,-1,-1,-1,-1,
0x08,0x18,0x28,0x38,-1,-1,-1,-1,
0x0a,0x1a,0x2a,0x3a,-1,-1,-1,-1,
0x0c,0x1c,0x2c,0x3c,-1,-1,-1,-1,
0x0e,0x1e,0x2e,0x3e,-1,-1,-1,-1)
*/
endmodule