/*******************************************************************************
 * Module: ahci_dma
 * Date:2016-01-01  
 * Author: Andrey Filippov     
 * Description: DMA R/W over 64-AXI channel for AHCI implementation
 *
 * Copyright (c) 2016 Elphel, Inc .
 * ahci_dma.v is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 *  ahci_dma.v is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/> .
 *
 * Additional permission under GNU GPL version 3 section 7:
 * If you modify this Program, or any covered work, by linking or combining it
 * with independent modules provided by the FPGA vendor only (this permission
 * does not extend to any 3-rd party modules, "soft cores" or macros) under
 * different license terms solely for the purpose of generating binary "bitstream"
 * files and/or simulating the code, the copyright holders of this Program give
 * you the right to distribute the covered work without those independent modules
 * as long as the source code for them is available from the FPGA vendor free of
 * charge, and there is no dependence on any encrypted modules for simulating of
 * the combined code. This permission applies to you if the distributed code
 * contains all the components and scripts required to completely simulate it
 * with at least one of the Free Software programs.
 *******************************************************************************/
`timescale 1ns/1ps

module  ahci_dma (
//    input         rst,
    input                         mrst, // @posedge mclk - sync reset
    input                         hrst, // @posedge hclk - sync reset
    
    input                         mclk, // for command/status
    input                         hclk,   // global clock to run axi_hp @ 150MHz
    // Control interface  (@mclk)
    // Documentation insists 6 LSBs should be 0, but AHCI driver seems to ignore it. Will align to just 128 bits.
//    input                  [31:7] ctba,         // command table base address
    input                  [31:4] ctba,         // command table base address
    input                         ctba_ld,      // load command table base address
    input                  [15:0] prdtl,        // number of entries in PRD table (valid at cmd_start)
    input                         dev_wr,       // write to device (valid at start)
    input                         cmd_start,     // start processing command table, reset prdbc
    input                         prd_start,     // at or after cmd_start - enable reading PRD/data (if any)
    input                         cmd_abort,     // try to abort a command: Will keep afi_rready on until RD FIFO is empty and
                                                 // afi_awvalid (slowly one by one?) until afi_wacount is empty, keeping afi_wlast on

// Optional control of the AXI cache mode, default will be set to 4'h3, 4'h3 at mrst
    input                  [3:0]  axi_wr_cache_mode, 
    input                  [3:0]  axi_rd_cache_mode,
    input                         set_axi_wr_cache_mode,
    input                         set_axi_rd_cache_mode,
    
    // Some data from the command table will be used internally, data will be available on the general
    // sys_out[31:0] port and should be consumed
    output reg                    ct_busy,      // cleared after 0x20 DWORDs are read out
    // reading out command table data
    input                  [ 4:0] ct_addr,     // DWORD address
    input                  [ 1:0] ct_re,       // [0] - re, [1]-regen  
    output reg             [31:0] ct_data,     // 
    
    // After the first 0x80 bytes of the Command Table are read out, this module will read/process PRDs,
    // not forwarding them to the output 
    output                        prd_done,     // @mclk prd done (regardless of the interrupt) - data transfer of one PRD is finished (any direction)
    input                         prd_irq_clear, // reset pending prd_irq
    output reg                    prd_irq_pend,  // prd interrupt pending. This is just a condition for irq - actual will be generated after FIS OK
    output reg                    cmd_busy,     // all commands
    output                        cmd_done,     // @ mclk
    output                        abort_busy,
    output                        abort_done,
    output                        axi_mismatch, // axi hp counters where empty when calculated counters were not (handled, but seems to be a bug - catch it)
    
    // Data System memory -> HBA interface @ mclk
    output                 [31:0] sys_out,      // 32-bit data from the system memory to HBA (dma data)
    output                        sys_dav,      // at least one dword is ready to be read
//    output                        sys_dav_many, // several DWORDs are in the FIFO (TODO: decide how many)
    input                         sys_re,       // sys_out data read, advance internal FIFO
    output                        last_h2d_data,// when active and no new data for 2 clocks - that was the last one
    
    // Data HBA -> System memory  interface @ mclk
    input                  [31:0] sys_in,       // HBA -> system memory
    output                        sys_nfull,    // internal FIFO has room for more data (will decide - how big reserved space to keep)
    input                         sys_we,    
    
    output                        extra_din,    // all DRDs are transferred to memory, but FIFO has some data. Valid when transfer is stopped
    
    // axi_hp signals write channel
    // write address
    output  [31:0] afi_awaddr,
    output         afi_awvalid,
    input          afi_awready, // @SuppressThisWarning VEditor unused - used FIF0 level
    output  [ 5:0] afi_awid,
    output  [ 1:0] afi_awlock,
    output reg [ 3:0] afi_awcache,
    output  [ 2:0] afi_awprot,
    output  [ 3:0] afi_awlen,
    output  [ 1:0] afi_awsize,
    output  [ 1:0] afi_awburst,
    output  [ 3:0] afi_awqos,
    // write data
    output  [63:0] afi_wdata,
    output         afi_wvalid,
    input          afi_wready,  // @SuppressThisWarning VEditor unused - used FIF0 level
    output  [ 5:0] afi_wid,
    output         afi_wlast,
    output  [ 7:0] afi_wstrb,
    // write response
    input          afi_bvalid,   // @SuppressThisWarning VEditor unused
    output         afi_bready,
    input   [ 5:0] afi_bid,      // @SuppressThisWarning VEditor unused
    input   [ 1:0] afi_bresp,    // @SuppressThisWarning VEditor unused
    // PL extra (non-AXI) signals
    input   [ 7:0] afi_wcount,
    input   [ 5:0] afi_wacount,
    output         afi_wrissuecap1en,
    // AXI_HP signals - read channel
    // read address
    output  [31:0] afi_araddr,
    output            afi_arvalid,
    input             afi_arready,  // @SuppressThisWarning VEditor unused - used FIF0 level
    output  [ 5:0] afi_arid,
    output  [ 1:0] afi_arlock,
    output reg [ 3:0] afi_arcache,
    output  [ 2:0] afi_arprot,
    output  [ 3:0] afi_arlen,
    output  [ 1:0] afi_arsize,
    output  [ 1:0] afi_arburst,
    output  [ 3:0] afi_arqos,
    // read data
    input   [63:0] afi_rdata,
    input          afi_rvalid,
    output         afi_rready,
    input   [ 5:0] afi_rid,     // @SuppressThisWarning VEditor unused
    input          afi_rlast,   // @SuppressThisWarning VEditor unused
    input   [ 1:0] afi_rresp,   // @SuppressThisWarning VEditor unused
    // PL extra (non-AXI) signals
    input   [ 7:0] afi_rcount,
    input   [ 2:0] afi_racount,
    output         afi_rdissuecap1en,
    
    output  [31:0] debug_out,
    output  [31:0] debug_out1
    
    ,output [31:0] debug_dma_h2d
    
);


// Read command table
//    localparam AFI_FIFO_LAT = 2; // >=2
   localparam SAFE_RD_BITS =   3; //2; // 3;

    reg     [31:0] ct_data_ram [0:31];
    reg      [3:0] int_data_addr;    // internal (ct,prd) data address
//    reg     [31:7] ctba_r;
    reg     [31:4] ctba_r; // Seems that AHCI driver ignores requirement to have 6 LSB==0
    reg     [15:0] prdtl_mclk;
    wire           cmd_start_hclk;
    reg            prd_start_r;
    wire           prd_start_hclk;
    reg            prd_start_hclk_r; // to make sure it is with/after prd_start_hclk if in mclk they are in the same cycle
    wire           cmd_abort_hclk; // TODO: Implement as graceful as possible command abort
    reg            prd_enabled;
    reg      [1:0] ct_over_prd_enabled; // prd read and data r/w enabled, command table fetch done
    
    reg     [31:4] ct_maddr; // granularity matches PRDT entry - 4xDWORD, 2xQWORD
    wire           ct_done;
    wire           first_prd_fetch; // CT read done, prd enabled
    reg     [31:0] afi_addr; // common for afi_araddr and afi_awaddr
    // Address strobes: registered copies of axi_set_raddr_w / axi_set_waddr_w that drive afi_arvalid / afi_awvalid.
    // Declared ahead of axi_set_raddr_ready because that net reads axi_set_raddr_r (declare-before-use).
    reg            axi_set_raddr_r; // read address strobe  (drives afi_arvalid)
    reg            axi_set_waddr_r; // write address strobe (drives afi_awvalid)

    // A read address may be posted when afi_racount[2:1] are both zero and either no read address is being
    // posted in the current cycle or afi_racount[0] is zero as well.
    wire           axi_set_raddr_ready = !(|afi_racount[2:1]) && (!axi_set_raddr_r || !afi_racount[0]); // What is the size of ra fifo - just 4? Latency?
//    wire           axi_set_raddr_ready = !(|afi_racount) && !axi_set_raddr_r; // Most pessimistic
    // A write address may be posted while afi_wacount is below 16 (bits [5:4] both zero)
    wire           axi_set_waddr_ready = !afi_wacount[5] && !afi_wacount[4]; // What is the size of wa fifo - just 32? Using just half - safe
    wire           axi_set_raddr_w;     // post a read address (CT, PRD or data) on the next hclk
    wire           axi_set_waddr_w;     // post a write address (data) on the next hclk
    wire           axi_set_addr_data_w; // a data (not CT/PRD) read or write address is being posted
    reg            is_ct_addr;    // current address is ct address
    reg            is_prd_addr;   // current address is prd address
    reg            is_data_addr;  // current address is data address (r or w)
    
    reg     [31:1] data_addr; // 2 lower addresses will be used in in/out fifo modules
    reg      [3:0] data_len; //
    reg            data_irq; // interrupt at the end of this PRD
    reg     [21:1] wcount;  // Word count
    reg            wcount_set; 
    reg     [22:1] qwcount; // only [21:3] are used ([22] - carry from subtraction )
    reg            qwcount_done;
        
    reg     [21:3] qw_datawr_left;
    reg     [ 3:0] qw_datawr_burst;
    reg            qw_datawr_last; 
    
    wire           data_afi_re;
    
    reg     [15:0] prds_left;
    reg            last_prd;
    
    reg     [1:0]  afi_rd_ctl; // read non-data (CT or PRD)
    reg     [1:0]  ct_busy_r;
    reg            prd_rd_busy; // reading PRD
    
    reg            dev_wr_mclk;
    reg            dev_wr_hclk;
    reg            prd_wr;    // write PRD data to memory
    reg            prd_rd;    // read  PRD data from memory
    wire     [3:0] afi_wstb4;

    wire           done_dev_wr; // finished PRD mem -> device
    wire           done_dev_rd; // finished PRD device -> mem
    wire           prd_done_hclk = done_dev_wr || done_dev_rd;
    wire           done_flush;  // done flushing last partial dword
    wire           cmd_done_hclk;
    wire           ct_done_mclk;
    reg      [3:0] afi_alen;
    wire           afi_wcount_many = !afi_wcount[7] && !(&afi_wcount[6:4]);
    
    reg            data_next_burst;
    
//    wire           raddr_prd_rq = (|prds_left) && (ct_done || prd_done);
    wire           raddr_prd_rq = (|prds_left) && (first_prd_fetch || prd_done_hclk);
    
    reg            raddr_prd_pend;
            
    wire           raddr_ct_rq = cmd_start_hclk;
    reg            raddr_ct_pend;

/*
    wire           addr_data_rq = (wcount_set || data_next_burst);
     
    wire           waddr_data_rq =  !dev_wr_hclk && addr_data_rq;
    wire           raddr_data_rq =   dev_wr_hclk && addr_data_rq;

*/     
    wire           addr_data_rq_w = (wcount_set || data_next_burst);
    reg            addr_data_rq_r;
     
    wire           waddr_data_rq =  !dev_wr_hclk && addr_data_rq_r;
    wire           raddr_data_rq =   dev_wr_hclk && addr_data_rq_r;

    reg            waddr_data_pend;
    reg            raddr_data_pend;
    // count different types of AXI ID separately - just for debugging
    reg      [3:0] ct_id;
    reg      [3:0] prd_id;
    reg      [3:0] dev_wr_id;
    reg      [3:0] dev_rd_id;
    reg      [5:0] afi_id; // common for 3 channels
    
    wire           fifo_nempty_mclk;
    reg            en_extra_din_r;
    reg     [31:0] ct_data_reg;
//    reg            abort_busy_hclk;
    reg            hrst_r;
    wire           abort_or_reset = cmd_abort_hclk || (hrst_r && !hrst);
//    reg            axi_dirty_r; // afi_wacount of afi_rcount are non-zero (assuming afi_wcount should be zero as addresses are posted first
    
    wire           afi_dirty;
    reg            afi_dirty_mclk;
    wire           abort_done_hclk;
    wire           abort_done_mclk;
    reg            abort_done_unneeded;
    wire           aborting;
    wire           afi_wvalid_data;
    wire           afi_wvalid_abort;
    wire     [5:0] afi_wid_abort;
    wire           afi_rready_abort;
    wire           afi_wlast_abort;
//    wire           abort_done;
    reg            abort_rq_mclk;         
    reg            abort_busy_mclk;
    wire    [21:0] abort_debug;
    reg            rwaddr_rq_r; // next cycle after requesting waddr_data_rq, raddr_data_rq, raddr_ct_rq and raddr_prd_rq (*-pend is valid)
      
    // ---- Outputs shared between normal operation and the AXI HP abort/flush logic ----
    // While 'aborting' is active the write data/read ready handshake is driven by axi_hp_abort_i
    assign afi_wvalid = aborting ? afi_wvalid_abort : afi_wvalid_data;
    assign afi_wlast  = aborting ? afi_wlast_abort  : qw_datawr_last;
    assign afi_wid    = aborting ? afi_wid_abort    : afi_id;
    assign afi_rready = aborting ? afi_rready_abort : (afi_rd_ctl[0] || data_afi_re);

    // ---- Abort status (@mclk) ----
    assign abort_busy = abort_busy_mclk;
    assign abort_done = abort_done_mclk || abort_done_unneeded; // real flush finished or nothing to flush

    // ---- Command table / PRD sequencing (@hclk) ----
    assign ct_done         = (ct_busy_r == 2'b10);           // single cycle after command table read is over
    assign first_prd_fetch = ct_over_prd_enabled == 2'b01;   // 0->1 transition: CT read done and PRD enabled
    // Command is over either right after the command table (no PRDs) or after the last PRD is processed
    assign cmd_done_hclk   = (ct_done && (prdtl_mclk == 0)) ||
                             ((done_flush || done_dev_rd) && last_prd);

    // ---- Address channel requests ----
    assign axi_set_waddr_w     = axi_set_waddr_ready && waddr_data_pend;
    assign axi_set_raddr_w     = axi_set_raddr_ready && (raddr_ct_pend || raddr_prd_pend || raddr_data_pend);
    assign axi_set_addr_data_w = (axi_set_raddr_ready && raddr_data_pend) || axi_set_waddr_w;

    // Read and write address channels share address, length and ID registers
    assign afi_araddr  = afi_addr;
    assign afi_awaddr  = afi_addr;
    assign afi_arlen   = afi_alen;
    assign afi_awlen   = afi_alen;
    assign afi_arid    = afi_id;
    assign afi_awid    = afi_id;
    assign afi_arvalid = axi_set_raddr_r;
    assign afi_awvalid = axi_set_waddr_r;

    // Each of the 4 strobe bits from the write FIFO is duplicated to cover a pair of byte lanes
    assign afi_wstrb = {{2{afi_wstb4[3]}},
                        {2{afi_wstb4[2]}},
                        {2{afi_wstb4[1]}},
                        {2{afi_wstb4[0]}}};

    // ---- Static AXI outputs (afi_awcache/afi_arcache are registers set @mclk) ----
    assign afi_bready        = 1'b1;
    // write address channel
    assign afi_awlock        = 2'h0;
    assign afi_awprot        = 3'h0;
    assign afi_awsize        = 2'h3;
    assign afi_awburst       = 2'h1;
    assign afi_awqos         = 4'h0;
    assign afi_wrissuecap1en = 1'b0;
    // read address channel
    assign afi_arlock        = 2'h0;
    assign afi_arprot        = 3'h0;
    assign afi_arsize        = 2'h3;
    assign afi_arburst       = 2'h1;
    assign afi_arqos         = 4'h0;
    assign afi_rdissuecap1en = 1'b0;

    // Device -> memory FIFO still holds data after the command was reported done
    assign extra_din = en_extra_din_r && fifo_nempty_mclk;
//    reg             [31:0] ct_data_reg;
    // mclk-domain control and status registers: abort request/status, command table read-back,
    // latching of per-command parameters, busy flags, AXI cache mode and PRD interrupt pending.
    always @ (posedge mclk) begin

        // mclk copy of afi_dirty (output of axi_hp_abort_i). The reset branch previously had no effect
        // because the unconditional assignment that followed it always won; 'else' makes it effective.
        if (mrst) afi_dirty_mclk <= 0;
        else      afi_dirty_mclk <= afi_dirty;

        // Abort is only forwarded to the hclk domain (through cmd_abort_hclk_i) when AXI is dirty ...
        abort_rq_mclk <= cmd_abort && afi_dirty_mclk;

        // ... otherwise abort_done is reported on the next cycle without any flushing
        abort_done_unneeded <= cmd_abort && !afi_dirty_mclk;
        if (mrst || abort_done) abort_busy_mclk <= 0;
        else if (cmd_abort)     abort_busy_mclk <= 1;

        // Two-stage registered read of the command table copy: ct_re[0] - read RAM, ct_re[1] - output register
        if (ct_re[0]) ct_data_reg <=  ct_data_ram[ct_addr];
        if (ct_re[1]) ct_data <=      ct_data_reg;

//        if (ctba_ld) ctba_r <=        ctba[31:7];
        if (ctba_ld) ctba_r <=        ctba[31:4]; // 16-byte aligned command table base address

        // Per-command parameters are captured at cmd_start
        if (cmd_start) prdtl_mclk <=  prdtl;

        if (cmd_start) dev_wr_mclk <= dev_wr;

        // Busy for the whole command: from cmd_start until cmd_done (or reset/abort)
        if      (mrst || cmd_abort) cmd_busy <= 0;
        else if (cmd_start)         cmd_busy <= 1;
        else if (cmd_done)          cmd_busy <= 0;

        // Busy for the command table read: from cmd_start until ct_done_mclk (or reset/abort)
        if      (mrst || cmd_abort) ct_busy <= 0;
        else if (cmd_start)         ct_busy <= 1;
        else if (ct_done_mclk)      ct_busy <= 0;

        // AXI cache modes: 4'h3 after mrst, may be overwritten through the set_axi_*_cache_mode strobes
        if      (mrst)                  afi_arcache <= 4'h3;
        else if (set_axi_rd_cache_mode) afi_arcache <= axi_rd_cache_mode;

        if      (mrst)                  afi_awcache <= 4'h3;
        else if (set_axi_wr_cache_mode) afi_awcache <= axi_wr_cache_mode;

        // prd_start delayed by one mclk cycle before it crosses to hclk (through prd_start_hclk_i)
        prd_start_r <= prd_start;

        // PRD interrupt pending: set when a PRD with the interrupt flag is done.
        // NOTE(review): data_irq is written in the hclk always block and sampled here in mclk -
        // presumably stable by the time prd_done arrives; confirm.
        if (mrst || prd_irq_clear ||cmd_start || cmd_abort) prd_irq_pend <= 0;
        else if (data_irq && prd_done)                      prd_irq_pend <= 1;

        // Enables the extra_din output after the command is done (until next start/abort)
        if (mrst || cmd_start || cmd_abort) en_extra_din_r <= 0;
        else if (cmd_done)                  en_extra_din_r <= 1;


    end
       
//        afi_rd_ctl <= { afi_rd_ctl[0],(ct_busy_r[0] || prd_rd_busy) && ((|afi_rcount[7:SAFE_RD_BITS]) || (afi_rvalid && !(|afi_rd_ctl)))};
    // Separate nets for the three terms that make up the afi_rd_ctl[0] update expression
    wire        debug_01; // CT or PRD read is in progress
    wire        debug_02; // read data FIFO level is at or above (1 << SAFE_RD_BITS)
    wire        debug_03; // read data valid and no CT/PRD read issued in the last two cycles
    assign debug_01 = prd_rd_busy || ct_busy_r[0];
    assign debug_02 = |afi_rcount[7:SAFE_RD_BITS];
    assign debug_03 = !(|afi_rd_ctl) && afi_rvalid;

    // PRD word count combined with the word offset of the data address inside a QWORD
    wire [21:1] wcount_plus_data_addr;
    assign wcount_plus_data_addr = wcount[21:1] + data_addr[2:1];
    
    // hclk-domain sequencer: requests AXI read/write addresses for the command table (CT), PRD entries
    // and data bursts, parses PRD entries coming from afi_rdata, stores the command table copy and
    // tracks burst/last-beat state for the write data channel.
    always @ (posedge hclk) begin
        // delayed hrst, used by abort_or_reset to detect the trailing edge of hrst
        hrst_r <= hrst;
    
        // next cycle after any address request - one of the *_pend registers is valid then
        if (hrst) rwaddr_rq_r <= 0;
        else      rwaddr_rq_r <= raddr_ct_rq || raddr_prd_rq || raddr_data_rq || waddr_data_rq;
        
        // data address request is delayed by one cycle from wcount_set/data_next_burst
        addr_data_rq_r <= addr_data_rq_w;
        
        prd_start_hclk_r <= prd_start_hclk;
        
        // PRD processing enable: set by (delayed) prd_start, cleared by a new command, reset or abort
        if      (hrst || abort_or_reset) prd_enabled <= 0;
        else if (prd_start_hclk_r)       prd_enabled <= 1; // precedence over  cmd_start_hclk
        else if (cmd_start_hclk)         prd_enabled <= 0;
    
    
        // Memory address (in 16-byte units) of the command table, later - of the current PRD entry:
        // loaded at command start, advanced by 8 units (128 bytes) after the command table is read,
        // then by one unit (16 bytes) after each PRD entry is received
//        if (cmd_start_hclk)  ct_maddr[31:4] <= {ctba_r[31:7],3'b0};
        if (cmd_start_hclk)  ct_maddr[31:4] <= ctba_r[31:4];
        else if (ct_done)    ct_maddr[31:4] <= ct_maddr[31:4] + 8; // 16;
        else if (wcount_set) ct_maddr[31:4] <= ct_maddr[31:4] + 1;
        
        // overall sequencing makes sure that there will be no new requests until older served
        // additionally they are mutually exclusive - only one may be pending at a time
        // Each *_pend is set by its request and cleared when the matching address channel is ready
        if      (hrst || cmd_abort_hclk) raddr_ct_pend <= 0;
        else if (raddr_ct_rq)            raddr_ct_pend <= 1;
        else if (axi_set_raddr_ready)    raddr_ct_pend <= 0;
        
        if      (hrst || cmd_abort_hclk) raddr_prd_pend <= 0;
        else if (raddr_prd_rq)           raddr_prd_pend <= 1;
        else if (axi_set_raddr_ready)    raddr_prd_pend <= 0;
        
        if      (hrst || cmd_abort_hclk) raddr_data_pend <= 0;
        else if (raddr_data_rq)          raddr_data_pend <= 1;
        else if (axi_set_raddr_ready)    raddr_data_pend <= 0;
        
        if      (hrst || cmd_abort_hclk) waddr_data_pend <= 0;
        else if (waddr_data_rq)          waddr_data_pend <= 1;
        else if (axi_set_waddr_ready)    waddr_data_pend <= 0;
        
        // Remember which kind of transfer was requested last (CT, PRD or data)
        if (hrst)                                           {is_ct_addr, is_prd_addr, is_data_addr} <= 0;
        else if (raddr_ct_rq || raddr_prd_rq || wcount_set) {is_ct_addr, is_prd_addr, is_data_addr} <= {raddr_ct_rq, raddr_prd_rq, wcount_set};
        
        // Prepare common address/length/ID for the AXI address channels
///        if (axi_set_raddr_w || axi_set_waddr_w) begin
        if (rwaddr_rq_r) begin // first cycle one of the *_pend is set
        
            // data address is QWORD-aligned, CT/PRD address is 16-byte aligned
            if (raddr_data_pend || waddr_data_pend)  afi_addr <= {data_addr[31:3], 3'b0};
            else                                     afi_addr <= {ct_maddr[31:4],  4'b0};

            if (raddr_data_pend || waddr_data_pend)  afi_alen <= data_len;
            else if (raddr_ct_pend)                  afi_alen <= 4'hf; // 16 QWORDS (128 bytes)
            else                                     afi_alen <= 4'h1; // 2 QWORDS
            
            // 2 MSBs of the ID encode the transfer type, 4 LSBs - a per-type counter
            // NOTE(review): memory reads (raddr_data_pend) are tagged with dev_rd_id, which is incremented by
            // done_dev_rd (device -> memory), and memory writes with dev_wr_id - the counters look swapped; confirm
            if (raddr_data_pend || waddr_data_pend)  afi_id <= raddr_data_pend ? {2'h2, dev_rd_id} : {2'h3, dev_wr_id};
            else                                     afi_id <= raddr_ct_pend   ? {2'h0, ct_id} :     {2'h1, prd_id};
        end    
        
        
        // Registered address strobes (drive afi_arvalid/afi_awvalid)
        if (hrst) axi_set_raddr_r <= 0;
        else      axi_set_raddr_r <= axi_set_raddr_w;

        if (hrst) axi_set_waddr_r <= 0;
        else      axi_set_waddr_r <= axi_set_waddr_w;
        
        // Burst length (minus 1) of the next data burst: full 16 QWORDs unless fewer are left
///     if (addr_data_rq)   data_len <= ((|qwcount[21:7]) || (&qwcount[6:3]))? 4'hf: qwcount[6:3];       // early calculate
        if (addr_data_rq_r) data_len <= ((|qwcount[21:7]) || (&qwcount[6:3]))? 4'hf: qwcount[6:3];       // early calculate


        // qwcount[22:7] counts remaining full (16 QWORD) bursts, decremented as each data address is posted;
        // qwcount[6:1] keeps the remainder and is only loaded for a new PRD
        if      (wcount_set)          qwcount[22:7] <= {1'b0,wcount_plus_data_addr[21:7]}; // wcount[21:1] + data_addr[2:1]; //minus 1
        else if (axi_set_addr_data_w) qwcount[22:7] <= qwcount[22:7] - 1; // may get negative

        if      (wcount_set)          qwcount[ 6:1] <= wcount_plus_data_addr[6:1]; // wcount[21:1] + data_addr[2:1]; //minus 1
        
        // set when the address of the last burst of the PRD is posted
        if      (wcount_set)                                qwcount_done <= 0;
        else if (axi_set_addr_data_w && (qwcount[21:7]==0)) qwcount_done <= 1;
        
        
        
//wcount_plus_data_addr        
        
        // request next data burst address if more bursts are left in this PRD
///        data_next_burst <= !qwcount[22] && axi_set_addr_data_w && (|qwcount[21:7]); // same time as afi_awvalid || afi_arvalid
        data_next_burst <= !qwcount_done && axi_set_addr_data_w && (|qwcount[21:7]); // same time as afi_awvalid || afi_arvalid

// Get PRD data
        // store data address from PRD, increment when needed
        // first QWORD of the PRD entry: bits [31:1] are the data address;
        // the address is advanced by 128 bytes each time a data burst address is posted
        if (afi_rd_ctl[0] && is_prd_addr && (!int_data_addr[0])) data_addr[31:1] <= afi_rdata[31:1];
        if (axi_set_addr_data_w) data_addr[31:7] <= data_addr[31:7] + 1;

        // second QWORD of the PRD entry: bit 63 - interrupt flag, bits [53:33] - word count
        if (afi_rd_ctl[0] && is_prd_addr && (int_data_addr[0])) data_irq <=     afi_rdata[63];

        if (afi_rd_ctl[0] && is_prd_addr && (int_data_addr[0])) wcount[21:1] <= afi_rdata[53:33];

        // single-cycle pulse: wcount, data_irq and data_addr of the new PRD are valid in the next cycle
        wcount_set <= afi_rd_ctl[0] && is_prd_addr && (int_data_addr[0]);

        // number of PRD entries not yet requested
        if      (cmd_start_hclk)  prds_left  <= prdtl_mclk;
        else if (raddr_prd_rq)    prds_left  <= prds_left  - 1;

        // the PRD being requested is the last one (prds_left is 1 - request requires it to be non-zero)
        if      (raddr_prd_rq)    last_prd  <= prds_left[15:1] == 0;
        
        // Set/increment address to store (internally) CT and PRD data 
        if      (axi_set_raddr_r)                int_data_addr <= 0;
        else if (afi_rd_ctl[0] && !is_data_addr) int_data_addr <= int_data_addr + 1;
        
        // each QWORD read is stored as a pair of DWORDs of the command table copy
        if (afi_rd_ctl[0] && is_ct_addr) {ct_data_ram[{int_data_addr,1'b1}],ct_data_ram[{int_data_addr,1'b0}]} <= afi_rdata; // make sure it is synthesized correctly
        
        // generate busy for command table (CT) read
        if      (hrst)                                            ct_busy_r[0] <= 0;
        else if (cmd_start_hclk)                                  ct_busy_r[0] <= 1;
        else if (afi_rd_ctl[0] && is_ct_addr && (&int_data_addr)) ct_busy_r[0] <= 0;
        ct_busy_r[1] <= ct_busy_r[0]; // delayed version to detect end of command
        
        // [0] is set when PRD is enabled and the command table is not being read
        if (hrst || ct_busy_r[0])                   ct_over_prd_enabled[0] <= 0;
        else if (prd_enabled)                       ct_over_prd_enabled[0] <= 1;
        ct_over_prd_enabled[1] <= ct_over_prd_enabled[0];  // detecting 0->1 transition
        
        // generate busy for PRD table entry read
        // NOTE(review): busy is set only if axi_set_raddr_ready is high in the very cycle raddr_prd_rq is active;
        // otherwise the request still becomes pending (raddr_prd_pend) but prd_rd_busy stays low - confirm that
        // the read address channel is always ready when a PRD is requested
        if      (hrst)                                prd_rd_busy <= 0;
//        else if (prd_rd_busy) prd_rd_busy <= 1;
        else if (raddr_prd_rq && axi_set_raddr_ready) prd_rd_busy <= 1;
        else if (wcount_set)                          prd_rd_busy <= 0;
        
        if (cmd_start_hclk) dev_wr_hclk <= dev_wr_mclk; // 1: memory -> device, 0: device -> memory
        
        // start pulses for the data FIFOs, one cycle after wcount_set
        prd_wr <= wcount_set && !dev_wr_hclk;
        prd_rd <= wcount_set &&  dev_wr_hclk;
        
        // [0] - read one QWORD of CT/PRD data (drives afi_rready), [1] - delayed [0].
        // Reads back-to-back while the read FIFO level is high enough, otherwise at most once in 3 cycles
        afi_rd_ctl <= { afi_rd_ctl[0],(ct_busy_r[0] || prd_rd_busy) && ((|afi_rcount[7:SAFE_RD_BITS]) || (afi_rvalid && !(|afi_rd_ctl)))};
        
        // calculate afi_wlast - it is (qw_datawr_burst == 0), just use register qw_datawr_last
        
        if      (prd_wr)          qw_datawr_last <= (qwcount[21:3] == 0);
        else if (afi_wvalid_data) qw_datawr_last <= (qw_datawr_burst == 1) || (qw_datawr_last && (qw_datawr_left[21:3] == 16)); // last case - n*16 + 1 (last burst single)       
        
        // beats left (minus 1) in the current write burst
        if      (prd_wr)                                                           qw_datawr_burst <= (|qwcount[21:7])? 4'hf: qwcount[6:3];
        else if (afi_wvalid_data && qw_datawr_last && (qw_datawr_left[21:7] == 1)) qw_datawr_burst <= qw_datawr_left[6:3]; // if not last roll over to 'hf
        else if (afi_wvalid_data)                                                  qw_datawr_burst <= qw_datawr_burst - 1;
        
        // QWORDs left to write for this PRD; upper bits are decremented once per completed burst
        if      (prd_wr)                            qw_datawr_left[21:3] <= qwcount[21:3];
        else if (afi_wvalid_data && qw_datawr_last) qw_datawr_left[21:7] <= qw_datawr_left[21:7] - 1; // can go negative - OK?
        
        // Count AXI IDs
        if      (hrst)             ct_id <= 0;
        else if (ct_busy_r==2'b10) ct_id <= ct_id + 1;

        if      (hrst)             prd_id <= 0;
        else if (wcount_set)       prd_id <= prd_id + 1;

        if      (hrst)             dev_wr_id <= 0;
        else if (done_dev_wr)      dev_wr_id <= dev_wr_id + 1;

        if      (hrst)             dev_rd_id <= 0;
        else if (done_dev_rd)      dev_rd_id <= dev_rd_id + 1;
        
        
    end

    // Flushing AXI HP - there is no easy way to reset it, so if there was an error in SATA communication we need to read any data
    // that was already requested (over the AXI read address channel) and send junk data (with the appropriate afi_wlast bit) to the
    // write channel. This module is not reset, and even a bitstream reload will not work, so the hrst input is used just to disable
    // paying attention to other inputs; it does not reset anything inside.
    // FPGA should not be reset/reloaded if there are any outstanding transactions that were not aborted.
    // Current implementation counts all transactions and relies on that count - not on afi_*count. TODO: Implement recovering from mismatch
    
    // AXI HP flush/abort helper: takes over write data and read ready handshakes while 'aborting' is active
    axi_hp_abort axi_hp_abort_i (
        // clock, reset and abort control
        .hclk          (hclk),              // input
        .hrst          (hrst),              // input
        .abort         (abort_or_reset),    // input
        .busy          (aborting),          // output
        .done          (abort_done_hclk),   // output reg
        .dirty         (afi_dirty),         // output reg
        .axi_mismatch  (axi_mismatch),      // output reg
        .debug         (abort_debug),       // output[21:0]
        // write address channel (monitored)
        .afi_awvalid   (afi_awvalid),       // input
        .afi_awready   (afi_awready),       // input
        .afi_awid      (afi_awid),          // input[5:0]
        .afi_awlen     (afi_awlen),         // input[3:0]
        // write data channel: normal-mode valid in, abort-mode valid/id/last out
        .afi_wvalid_in (afi_wvalid),        // input
        .afi_wready    (afi_wready),        // input
        .afi_wvalid    (afi_wvalid_abort),  // output
        .afi_wid       (afi_wid_abort),     // output[5:0] reg
        .afi_wlast     (afi_wlast_abort),   // output
        // read address channel (monitored)
        .afi_arvalid   (afi_arvalid),       // input
        .afi_arready   (afi_arready),       // input
        .afi_arlen     (afi_arlen),         // input[3:0]
        // read data channel: normal-mode ready in, abort-mode ready out
        .afi_rready_in (afi_rready),        // input
        .afi_rvalid    (afi_rvalid),        // input
        .afi_rready    (afi_rready_abort),  // output
        // AXI HP FIFO levels
        .afi_racount   (afi_racount),       // input[2:0]
        .afi_rcount    (afi_rcount),        // input[7:0]
        .afi_wacount   (afi_wacount),       // input[5:0]
        .afi_wcount    (afi_wcount)         // input[7:0]
    );

    
    // System memory -> device data path: 64-bit AXI read data (@hclk) to 32-bit sys_out (@mclk)
    ahci_dma_rd_fifo #(
        .WCNT_BITS    (21),
        .ADDRESS_BITS (3)
    ) ahci_dma_rd_fifo_i (
        // clocks and resets (both domains are also held in reset during abort)
        .mclk         (mclk),                        // input
        .hclk         (hclk),                        // input
        .mrst         (mrst || abort_busy_mclk),     // input
        .hrst         (hrst || cmd_abort_hclk),      // input
        // per-PRD parameters and start (@hclk)
        .start        (prd_rd),                      // input
        .wcnt         (wcount[21:1]),                // input[20:0]
        .woffs        (data_addr[2:1]),              // input[1:0]
        .last_prd     (last_prd),                    // input
        // AXI read data side (@hclk)
        .din          (afi_rdata),                   // input[63:0]
        .din_av       (afi_rvalid),                  // input
        .din_av_many  (|afi_rcount[7:SAFE_RD_BITS]), // input
        .din_re       (data_afi_re),                 // output
        .done         (done_dev_wr),                 // output reg // @ hclk
        .done_flush   (done_flush),                  // output     // @ hclk
        // HBA side (@mclk)
        .dout         (sys_out),                     // output[31:0]
        .dout_vld     (sys_dav),                     // output
        .dout_re      (sys_re),                      // input
        .last_DW      (last_h2d_data),               // output
        .debug_dma_h2d(debug_dma_h2d)                // output[31:0]
    );
    
    // Device -> system memory data path: 32-bit sys_in (@mclk) to 64-bit AXI write data (@hclk)
    ahci_dma_wr_fifo #(
        .WCNT_BITS    (21),
        .ADDRESS_BITS (3)
    ) ahci_dma_wr_fifo_i (
        // clocks and resets (both domains are also held in reset during abort)
        .mclk              (mclk),                   // input
        .hclk              (hclk),                   // input
        .mrst              (mrst || abort_busy_mclk),// input
        .hrst              (hrst ||cmd_abort_hclk),  // input
        // per-command init, per-PRD parameters and start (@hclk)
        .init              (cmd_start_hclk),         // input
        .start             (prd_wr),                 // input
        .wcnt              (wcount[21:1]),           // input[20:0]
        .woffs             (data_addr[2:1]),         // input[1:0]
        .last_prd          (last_prd),               // input
        // AXI write data side (@hclk)
        .dout              (afi_wdata),              // output[63:0] reg
        .dout_av_many      (afi_wcount_many),        // input
        .dout_we           (afi_wvalid_data),        // output
        .dout_wstb         (afi_wstb4),              // output[3:0] reg
        .done              (done_dev_rd),            // output reg
        .busy              (),                       // output, not used
        // HBA side (@mclk)
        .fifo_nempty_mclk  (fifo_nempty_mclk),       // output reg
        .din               (sys_in),                 // input[31:0]
        .din_rdy           (sys_nfull),              // output
        .din_avail         (sys_we)                  // input
    );
    // mclk -> hclk cross-clock synchronization
    // cmd_start (mclk) -> cmd_start_hclk (hclk); reset comes from the source (mclk) domain.
    pulse_cross_clock #(.EXTRA_DLY (0)) cmd_start_hclk_i (
        .src_clk   (mclk),
        .dst_clk   (hclk),
        .rst       (mrst),
        .in_pulse  (cmd_start),
        .out_pulse (cmd_start_hclk),
        .busy      ()                 // not used
    );
    // abort_rq_mclk (mclk) -> cmd_abort_hclk (hclk); reset comes from the source (mclk) domain.
    pulse_cross_clock #(.EXTRA_DLY (0)) cmd_abort_hclk_i (
        .src_clk   (mclk),
        .dst_clk   (hclk),
        .rst       (mrst),
        .in_pulse  (abort_rq_mclk),
        .out_pulse (cmd_abort_hclk),
        .busy      ()                 // not used
    );
    // prd_start_r (mclk) -> prd_start_hclk (hclk); reset comes from the source (mclk) domain.
    pulse_cross_clock #(.EXTRA_DLY (0)) prd_start_hclk_i (
        .src_clk   (mclk),
        .dst_clk   (hclk),
        .rst       (mrst),
        .in_pulse  (prd_start_r),
        .out_pulse (prd_start_hclk),
        .busy      ()                 // not used
    );


    
    // hclk -> mclk cross-clock synchronization
    // cmd_done_hclk (hclk) -> cmd_done (mclk); reset uses hrst, alongside the hclk source clock.
    pulse_cross_clock #(.EXTRA_DLY (0)) cmd_done_i (
        .src_clk   (hclk),
        .dst_clk   (mclk),
        .rst       (hrst),
        .in_pulse  (cmd_done_hclk),
        .out_pulse (cmd_done),
        .busy      ()                 // not used
    );

    // ct_done (hclk) -> ct_done_mclk (mclk); reset uses hrst, alongside the hclk source clock.
    pulse_cross_clock #(.EXTRA_DLY (0)) ct_done_mclk_i (
        .src_clk   (hclk),
        .dst_clk   (mclk),
        .rst       (hrst),
        .in_pulse  (ct_done),
        .out_pulse (ct_done_mclk),
        .busy      ()                 // not used
    );

    // prd_done_hclk (hclk) -> prd_done (mclk); reset uses hrst, alongside the hclk source clock.
    pulse_cross_clock #(.EXTRA_DLY (0)) prd_done_mclk_i (
        .src_clk   (hclk),
        .dst_clk   (mclk),
        .rst       (hrst),
        .in_pulse  (prd_done_hclk),
        .out_pulse (prd_done),
        .busy      ()                 // not used
    );

    // abort_done_hclk (hclk) -> abort_done_mclk (mclk); reset uses hrst, alongside the hclk source clock.
    pulse_cross_clock #(.EXTRA_DLY (0)) abort_done_i (
        .src_clk   (hclk),
        .dst_clk   (mclk),
        .rst       (hrst),
        .in_pulse  (abort_done_hclk),
        .out_pulse (abort_done_mclk),
        .busy      ()                 // not used
    );
    
//abort_done_hclk    
// Debug-only registers, all clocked by hclk and cleared by hrst.
// The 8-bit event counters wrap silently after 255.
reg [7:0] dbg_afi_awvalid_cntr; // number of axi_set_waddr_r events
reg [7:0] dbg_qwcount;          // wcount_plus_data_addr[14:7] captured at the last wcount_set
reg [7:0] dbg_qwcount_cntr;     // number of axi_set_addr_data_w events with qwcount[22]==0 and qwcount[21:7]!=0
reg [7:0] dbg_set_raddr_count;  // number of cycles with axi_set_raddr_ready && raddr_data_pend
reg [7:0] dbg_set_waddr_count;  // number of addr_data_rq_w events
reg       dbg_was_mismatch;     // sticky: set by axi_mismatch, cleared only by hrst

always @ (posedge hclk) begin
    if      (hrst)            dbg_afi_awvalid_cntr <= 0;
    else if (axi_set_waddr_r) dbg_afi_awvalid_cntr <= dbg_afi_awvalid_cntr + 1;

    // Reset added so this register is defined before the first wcount_set,
    // consistent with every other debug register in this block.
    // (The matching functional assignment, per the original note, is
    //  qwcount[22:7] <= {1'b0, wcount_plus_data_addr[21:7]}.)
    if      (hrst)       dbg_qwcount <= 0;
    else if (wcount_set) dbg_qwcount <= wcount_plus_data_addr[14:7];

    // Triggers tried earlier for this counter: wcount_set, data_next_burst.
    if      (hrst)                                                    dbg_qwcount_cntr <= 0;
    else if (!qwcount[22] && axi_set_addr_data_w && (|qwcount[21:7])) dbg_qwcount_cntr <= dbg_qwcount_cntr + 1;

    // Trigger tried earlier for this counter: axi_set_raddr_w.
    if      (hrst)                                   dbg_set_raddr_count <= 0;
    else if (axi_set_raddr_ready && raddr_data_pend) dbg_set_raddr_count <= dbg_set_raddr_count + 1;

    // Triggers tried earlier for this counter: axi_set_waddr_w,
    // axi_set_waddr_ready && waddr_data_pend.
    if      (hrst)           dbg_set_waddr_count <= 0;
    else if (addr_data_rq_w) dbg_set_waddr_count <= dbg_set_waddr_count + 1; // original note: 0x14

    if      (hrst)         dbg_was_mismatch <= 0;
    else if (axi_mismatch) dbg_was_mismatch <= 1;
end

// Debug status word. Fields are listed MSB first; each row below is one byte,
// assuming qwcount_done, ct_busy and cmd_busy are single-bit (declarations are
// not in this part of the file).
assign debug_out = {
    int_data_addr[3:0], qwcount_done /* was prd_rd_busy */, afi_racount[2:0],
    afi_rcount[7:0],
    ct_busy, cmd_busy, afi_wacount[5:0],
    afi_wcount[7:0]
};
/*
assign debug_out =  {
                     qwcount[22:7],
                     dev_rd_id,
                     dev_wr_id,
                     prd_id,
                     ct_id
};

assign debug_out =  {qwcount_done,
                     2'b0,
                     dev_wr_id,
                     prd_id,
                     wcount[21:1]
};

assign debug_out1 = { //dbg_set_raddr_count[7:0],
                     qwcount_done,
                     afi_rcount[6:0],
                      //{qwcount[22], qwcount[13:7]},
                     dbg_set_waddr_count[7:0],
                     dbg_qwcount[3:0],
                     afi_alen[3:0],
                     dbg_qwcount_cntr[7:0]}; 
*/                     
// Second debug word, MSB first: 8 zero bits (slot formerly considered for
// dbg_set_raddr_count[7:0]), the sticky mismatch flag, one zero bit, then
// abort_debug[21:0]. Original note on abort_debug contents:
// {aw_count[5:0], w_count[7:0], r_count[7:0]}.
assign debug_out1 = {
    8'b0,
    dbg_was_mismatch,
    1'b0,
    abort_debug[21:0]
};
                     
//
endmodule