/*******************************************************************************
 * Module: phy_top
 * Date:2014-04-30  
 * Author: Andrey Filippov
 * Description: Top module of the DDR3 phy
 *
 * Copyright (c) 2014 Elphel, Inc.
 * phy_top.v is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 *  phy_top.v is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/> .
 *******************************************************************************/
`timescale 1ns/1ps

module  phy_top #(
    parameter IOSTANDARD_DQ =   "SSTL15_T_DCI",
    parameter IOSTANDARD_DQS =  "DIFF_SSTL15_T_DCI",
    parameter IOSTANDARD_CMDA = "SSTL15",
    parameter IOSTANDARD_CLK = "DIFF_SSTL15",
    parameter SLEW_DQ =         "SLOW",
    parameter SLEW_DQS =        "SLOW",
    parameter SLEW_CMDA =       "SLOW",
    parameter SLEW_CLK =       "SLOW",
    parameter IBUF_LOW_PWR =    "TRUE",
    parameter IODELAY_GRP =     "IODELAY_MEMORY",
    parameter real REFCLK_FREQUENCY = 300.0,
    parameter HIGH_PERFORMANCE_MODE = "FALSE",
    parameter integer ADDRESS_NUMBER= 15,
    parameter PHASE_WIDTH           = 8,
    parameter BANDWIDTH =     "OPTIMIZED",
    // Assuming 100MHz input clock, 800MHz Fvco, 400MHz clk, 200MHz clk_div, 200MHz mclk 
    parameter CLKIN_PERIOD          = 10, //ns >1.25, 600<Fvco<1200
    parameter CLKFBOUT_MULT =       8, // Fvco=Fclkin*CLKFBOUT_MULT_F/DIVCLK_DIVIDE, Fout=Fvco/CLKOUT#_DIVIDE
    parameter CLKFBOUT_MULT_REF =   9, // Fvco=Fclkin*CLKFBOUT_MULT_F/DIVCLK_DIVIDE, Fout=Fvco/CLKOUT#_DIVIDE
    parameter CLKFBOUT_DIV_REF =    3, // To get 300MHz for the reference clock
    parameter DIVCLK_DIVIDE=        1,
    parameter CLKFBOUT_PHASE =      0.000,
    parameter SDCLK_PHASE =          0.000,
    parameter CLK_PHASE =           0.000,
    parameter CLK_DIV_PHASE =       0.000,
    parameter MCLK_PHASE =          90.000,  
    parameter REF_JITTER1 =         0.010,
    parameter SS_EN =              "FALSE",
    parameter SS_MODE =      "CENTER_HIGH",
    parameter SS_MOD_PERIOD =       10000
)(
    output                       ddr3_nrst, // output NRST port
    output                       ddr3_clk, // DDR3 clock differential output, positive
    output                       ddr3_nclk,// DDR3 clock differential output, negative
    output  [ADDRESS_NUMBER-1:0] ddr3_a,   // output address ports (14:0) for 4Gb device
    output                 [2:0]ddr3_ba,  // output bank address ports
    output                       ddr3_we,  // output WE port
    output                       ddr3_ras, // output RAS port
    output                       ddr3_cas, // output CAS port
    output                       ddr3_cke, // output Clock Enable port
    output                       ddr3_odt, // output ODT port

    inout                 [15:0] dq,       // DQ  I/O pads
    output                       dml,      // LDM  I/O pad (actually only output)
    inout                        dqsl,     // LDQS I/O pad
    inout                        ndqsl,    // ~LDQS I/O pad
    output                       dmu,      // UDM  I/O pad (actually only output)
    inout                        dqsu,     // UDQS I/O pad
    inout                        ndqsu,    // ~UDQS I/O pad
    
    input                        clk_in,   // master input clock, initially assuming 100MHz
    output                       clk,      // free-running system clock, same frequency as iclk (shared for R/W),     BUFR output
    output                       clk_div,  // free-running half clk frequency, front aligned to clk (shared for R/W), BUFR output
    output                       mclk,     // same as clk_div, through separate BUFG and static phase adjust
    input                        rst_in,   // reset delays/serdes
    input                        ddr_rst,  // active high - generate NRST to memory
    input                        dci_rst,  // active high - reset DCI circuitry
    input                        dly_rst,  // active high - delay calibration circuitry
     
    input [2*ADDRESS_NUMBER-1:0] in_a,     // input address, 2 bits per signal (first, second) (29:0) for 4Gb device
    input                  [5:0] in_ba,    // input bank address, 2 bits per signal (first, second)
    input                  [1:0] in_we,    // input WE, 2 bits (first, second)
    input                  [1:0] in_ras,   // input RAS, 2 bits (first, second)
    input                  [1:0] in_cas,   // input CAS, 2 bits (first, second)
    input                  [1:0] in_cke,   // input CKE, 2 bits (first, second)
    input                  [1:0] in_odt,   // input ODT, 2 bits (first, second)
//    input                  [1:0] in_tri,   // tristate command/address outputs - same timing, but no odelay
    input                        in_tri,   // tristate command/address outputs - same timing, but no odelay
    
    input                 [63:0] din,             // parallel data to be sent out (4 bits per DG I/))
    input                  [7:0] din_dm,          // parallel data to be sent out over DM
    input                  [7:0] tin_dq,          // tristate for data out (sent out earlier than data!) and dm 
    input                  [7:0] din_dqs,         // parallel data to be sent out over DQS
    input                  [7:0] tin_dqs,         // tristate for DQS out (sent out earlier than data!) 
    output                [63:0] dout,            // parallel data received from DDR3 memory, 4 bits per DQ I/O
    
    
    input                        inv_clk_div,     // invert clk_div for R channels (clk_div is shared between R and W)
    input                        dci_disable_dqs, // disable DCI termination during writes and idle for dqs
    input                        dci_disable_dq,  // disable DCI termination during writes and idle for dq and dm signals
    
    input                  [7:0] dly_data, // delay value (3 LSB - fine delay)
    input                  [6:0] dly_addr, // select which delay to program
    input                        ld_delay, // load delay data to selected iodelayl (clk_div synchronous)
    input                        set,       // clk_div synchronous set all delays from previously loaded values
//    output                       locked,
    output                       locked_mmcm,
    output                       locked_pll,
    output                       dly_ready,
    output                       dci_ready,
    output               [7:0]   tmp_debug,
    output                       ps_rdy,
    output     [PHASE_WIDTH-1:0] ps_out 
);
  reg rst= 1'b1;
  always @(negedge clk_div or posedge rst_in) begin
    if (rst_in) rst <= 1'b1;
    else        rst <= 1'b0;
  end

  wire  ld_data_l = (dly_addr[6:5] == 2'h0) && ld_delay ;             
  wire  ld_data_h = (dly_addr[6:5] == 2'h1) && ld_delay ;             
  wire  ld_cmda =   (dly_addr[6:5] == 2'h2) && ld_delay ;             
  wire  ld_mmcm=    (dly_addr[6:0] == 7'h60) && ld_delay ;
  wire  clkfb_ref, clk_ref_pre; 
  wire  clk_ref; // 200MHz/300Mhz to calibrate I/O delays            
//  wire locked_mmcm,locked_pll, dly_ready, dci_ready;
//  assign locked=locked_mmcm && locked_pll && dly_ready && dci_ready; // both PLL ready, I/O delay calibrated
  wire clkin_stopped_mmcm;
  wire clkfb_stopped_mmcm;
  
  
  reg dbg1=0;
  reg dbg2=0;
  always @ (posedge rst_in or posedge mclk) begin
    if (rst_in) dbg1 <= 0;
    else dbg1 <= ~dbg1;
  end

  always @ (posedge rst_in or posedge clk_div) begin
    if (rst_in) dbg2 <= 0;
    else dbg2 <= ~dbg2;
  end
  

  assign tmp_debug ={
    dbg2, //dly_addr[1],
    dbg1, //dly_addr[0],
    clkin_stopped_mmcm,
    clkfb_stopped_mmcm,
    ddr_rst,
    rst_in,
    dci_rst,
    dly_rst
  };

/* memory reset */
    obuf #(
        .CAPACITANCE("DONT_CARE"),
        .DRIVE(12),
        .IOSTANDARD(IOSTANDARD_CMDA),
        .SLEW("SLOW")
    ) obuf_i (
        .O(ddr3_nrst), // output
        .I(~ddr_rst)   // input
    );
  
  cmd_addr #(
    .IODELAY_GRP(IODELAY_GRP),
    .IOSTANDARD(IOSTANDARD_CMDA),
    .SLEW(SLEW_CMDA),
    .REFCLK_FREQUENCY(REFCLK_FREQUENCY),
    .HIGH_PERFORMANCE_MODE(HIGH_PERFORMANCE_MODE),
    .ADDRESS_NUMBER(ADDRESS_NUMBER)
  ) cmd_addr_i(
    .ddr3_a   (ddr3_a[ADDRESS_NUMBER-1:0]), // output address ports (14:0) for 4Gb device
    .ddr3_ba  (ddr3_ba[2:0]),             // output bank address ports
    .ddr3_we  (ddr3_we),                 // output WE port
    .ddr3_ras (ddr3_ras),                // output RAS port
    .ddr3_cas (ddr3_cas),                // output CAS port
    .ddr3_cke (ddr3_cke),                // output Clock Enable port
    .ddr3_odt (ddr3_odt),                // output ODT port
    .clk      (clk),                     // free-running system clock, same frequency as iclk (shared for R/W)
    .clk_div  (clk_div),                 // free-running half clk frequency, front aligned to clk (shared for R/W)
    .rst      (rst),                     // reset delays/serdes
    .in_a     (in_a[2*ADDRESS_NUMBER-1:0]), // input address, 2 bits per signal (first, second) (29:0) for 4Gb device
    .in_ba    (in_ba[5:0]),              // input bank address, 2 bits per signal (first, second)
    .in_we    (in_we[1:0]),              // input WE, 2 bits (first, second)
    .in_ras   (in_ras[1:0]),             // input RAS, 2 bits (first, second)
    .in_cas   (in_cas[1:0]),             // input CAS, 2 bits (first, second)
    .in_cke   (in_cke[1:0]),             // input CKE, 2 bits (first, second)
    .in_odt   (in_odt[1:0]),             // input ODT, 2 bits (first, second)
//    .in_tri   (in_tri[1:0]),             // tristate command/address outputs - same timing, but no odelay
    .in_tri   (in_tri),             // tristate command/address outputs - same timing, but no odelay
    .dly_data (dly_data[7:0]),           // delay value (3 LSB - fine delay)
    .dly_addr (dly_addr[4:0]),           // select which delay to program
    .ld_delay (ld_cmda),               // load delay data to selected iodelayl (clk_div synchronous)
    .set      (set)                      // clk_div synchronous set all delays from previously loaded values
);

  byte_lane #(
    .IODELAY_GRP     (IODELAY_GRP),
    .IBUF_LOW_PWR    (IBUF_LOW_PWR),
    .IOSTANDARD_DQ   (IOSTANDARD_DQ),
    .IOSTANDARD_DQS  (IOSTANDARD_DQS),
    .SLEW_DQ         (SLEW_DQ),
    .SLEW_DQS        (SLEW_DQS),
    .REFCLK_FREQUENCY(REFCLK_FREQUENCY),
    .HIGH_PERFORMANCE_MODE(HIGH_PERFORMANCE_MODE)
  ) byte_lane0_i (
    .dq              (dq[7:0]),          // DQ  I/O pads
    .dm              (dml),              // DM  I/O pad (actually only output)
    .dqs             (dqsl),             //  DQS I/O pad
    .ndqs            (ndqsl),            // ~DQS I/O pad
    .clk             (clk),              // free-running system clock, same frequency as iclk (shared for R/W)
    .clk_div         (clk_div),          // free-running half clk frequency, front aligned to clk (shared for R/W)
    .inv_clk_div     (inv_clk_div),      // invert clk_div for R channels (clk_div is shared between R and W)
    .rst             (rst),
    .dci_disable_dqs (dci_disable_dqs),  // disable DCI termination during writes and idle for dqs
    .dci_disable_dq  (dci_disable_dq),   // disable DCI termination during writes and idle for dq and dm signals
    .din             (din[31:0]),        // parallel data to be sent out (4 bits per DQ I/O))
    .din_dm          (din_dm[3:0]),      // parallel data to be sent out over DM
    .tin_dq          (tin_dq[3:0]),      // tristate for data out (sent out earlier than data!) and dm 
    .din_dqs         (din_dqs[3:0]),     // parallel data to be sent out over DQS
    .tin_dqs         (tin_dqs[3:0]),     // tristate for DQS out (sent out earlier than data!) 
    .dout            (dout[31:0]),       // parallel data received from DDR3 memory, 4 bits per DQ I/O
    .dly_data        (dly_data[7:0]),    // delay value (3 LSB - fine delay)
    .dly_addr        (dly_addr[4:0]),    // select which delay to program
    .ld_delay        (ld_data_l),        // load delay data to selected iodelayl (clk_div synchronous)
    .set             (set)               // clk_div synchronous set all delays from previously loaded values
);

  byte_lane #(
    .IODELAY_GRP     (IODELAY_GRP),
    .IBUF_LOW_PWR    (IBUF_LOW_PWR),
    .IOSTANDARD_DQ   (IOSTANDARD_DQ),
    .IOSTANDARD_DQS  (IOSTANDARD_DQS),
    .SLEW_DQ         (SLEW_DQ),
    .SLEW_DQS        (SLEW_DQS),
    .REFCLK_FREQUENCY(REFCLK_FREQUENCY),
    .HIGH_PERFORMANCE_MODE(HIGH_PERFORMANCE_MODE)
  ) byte_lane1_i (
    .dq              (dq[15:8]),         // DQ  I/O pads
    .dm              (dmu),              // DM  I/O pad (actually only output)
    .dqs             (dqsu),             //  DQS I/O pad
    .ndqs            (ndqsu),            // ~DQS I/O pad
    .clk             (clk),              // free-running system clock, same frequency as iclk (shared for R/W)
    .clk_div         (clk_div),          // free-running half clk frequency, front aligned to clk (shared for R/W)
    .inv_clk_div     (inv_clk_div),      // invert clk_div for R channels (clk_div is shared between R and W)
    .rst             (rst),
    .dci_disable_dqs (dci_disable_dqs),  // disable DCI termination during writes and idle for dqs
    .dci_disable_dq  (dci_disable_dq),   // disable DCI termination during writes and idle for dq and dm signals
    .din             (din[63:32]),       // parallel data to be sent out (4 bits per DQ I/O))
    .din_dm          (din_dm[7:4]),      // parallel data to be sent out over DM
    .tin_dq          (tin_dq[7:4]),      // tristate for data out (sent out earlier than data!) and dm 
    .din_dqs         (din_dqs[7:4]),     // parallel data to be sent out over DQS
    .tin_dqs         (tin_dqs[7:4]),     // tristate for DQS out (sent out earlier than data!) 
    .dout            (dout[63:32]),      // parallel data received from DDR3 memory, 4 bits per DQ I/O
    .dly_data        (dly_data[7:0]),    // delay value (3 LSB - fine delay)
    .dly_addr        (dly_addr[4:0]),    // select which delay to program
    .ld_delay        (ld_data_h),        // load delay data to selected iodelayl (clk_div synchronous)
    .set             (set)               // clk_div synchronous set all delays from previously loaded values
);
//ddr3_clk
wire sdclk; // BUFIO
    oddr_ds #(
        .IOSTANDARD(IOSTANDARD_CLK),
        .SLEW(SLEW_CLK)
    ) oddr_ds_i (
        .clk(sdclk), // input
        .ce(1'b1), // input
        .rst(1'b0), //rst_n_clk), // input no need to reset?
        .set(1'b0), // input
        .din(2'b01), // input[1:0] 
        .tin(rst),   // tristate at reset
        .dq(ddr3_clk), // output
        .ndq(ddr3_nclk) // output
    );
// Clocks: MMCM is used to generate ddr3 differential clock (no dynamic phase shift),
// clk - write bit clock, phase dynamically adjusted, BUFR (initially 400MHz)
// clk_div  half bit frequency clock, phase dynamically adjusted, BUFR. Used also for delay/phase control (200MHz)
// mclk - same frequency as clk_div (same dynamic phase adjust), but with BUFG to be used in other regions. Phase to be
// statically adjusted for clock boundary crossing
// Phase control included, allowing setting phase in +/- 127 steps, each 1/56 of 1/Fvco (~22ps for Fvco=800MHz)
// So shifting phase dynamically by plus/- 113 moves SDCLK by a full period (2.5ns) forward and backward (113= 0x71)
wire clk_pre, clk_div_pre, sdclk_pre, mclk_pre, clk_fb;
BUFR clk_bufr_i (.O(clk), .CE(), .CLR(), .I(clk_pre));
BUFR clk_div_bufr_i (.O(clk_div), .CE(), .CLR(), .I(clk_div_pre));
BUFIO iclk_bufio_i (.O(sdclk), .I(sdclk_pre) );
//BUFIO clk_ref_i (.O(clk_ref), .I(clk_ref_pre));
//assign clk_ref=clk_ref_pre;
//BUFH clk_ref_i (.O(clk_ref), .I(clk_ref_pre));
BUFG clk_ref_i (.O(clk_ref), .I(clk_ref_pre));
BUFG mclk_i (.O(mclk),.I(mclk_pre) );
    /* Instance template for module mmcm_phase_cntr */
    mmcm_phase_cntr #(
        .PHASE_WIDTH         (PHASE_WIDTH),
        .CLKIN_PERIOD        (CLKIN_PERIOD),
        .BANDWIDTH           (BANDWIDTH),
        .CLKFBOUT_MULT_F     (CLKFBOUT_MULT),
        .DIVCLK_DIVIDE       (DIVCLK_DIVIDE),
        .CLKFBOUT_PHASE      (CLKFBOUT_PHASE),
        .CLKOUT0_PHASE       (SDCLK_PHASE),
        .CLKOUT1_PHASE       (CLK_PHASE),
        .CLKOUT2_PHASE       (CLK_DIV_PHASE),
        .CLKOUT3_PHASE       (MCLK_PHASE),
//        .CLKOUT4_PHASE          (0.000),
//        .CLKOUT5_PHASE          (0.000),
//        .CLKOUT6_PHASE          (0.000),
        .CLKFBOUT_USE_FINE_PS     ("FALSE"),
        .CLKOUT0_USE_FINE_PS      ("FALSE"),
        .CLKOUT1_USE_FINE_PS ("TRUE"),
        .CLKOUT2_USE_FINE_PS ("TRUE"),
        .CLKOUT3_USE_FINE_PS ("TRUE"),
//        .CLKOUT4_USE_FINE_PS("FALSE"),
//        .CLKOUT5_USE_FINE_PS("FALSE"),
//        .CLKOUT6_USE_FINE_PS("FALSE"),
        .CLKOUT0_DIVIDE_F    (2.000),
        .CLKOUT1_DIVIDE      (2),
        .CLKOUT2_DIVIDE      (4),
        .CLKOUT3_DIVIDE      (4),
//        .CLKOUT4_DIVIDE(1),
//        .CLKOUT5_DIVIDE(1),
//        .CLKOUT6_DIVIDE(1),
        .COMPENSATION             ("ZHOLD"),
        .REF_JITTER1         (REF_JITTER1),
//        .REF_JITTER2(0.010),
        .SS_EN               (SS_EN),
        .SS_MODE             (SS_MODE),
        .SS_MOD_PERIOD       (SS_MOD_PERIOD),
        .STARTUP_WAIT             ("FALSE")
    ) mmcm_phase_cntr_i (
        .clkin               (clk_in), // input
        .clkfbin             (clk_fb), // input
//        .rst                 (rst), // input
        .rst                 (rst_in), // input
        .pwrdwn                 (1'b0), // input
        .psclk               (clk_div), // input
        .ps_we               (ld_mmcm), // input
        .ps_din              (dly_data), // input[7:0] 
        .ps_ready            (ps_rdy), // output
        .ps_dout             (ps_out), // output[7:0] reg 
        .clkout0             (sdclk_pre), // output
        .clkout1             (clk_pre), // output
        .clkout2             (clk_div_pre), // output
        .clkout3             (mclk_pre), // output
        .clkout4(), // output
        .clkout5(), // output
        .clkout6(), // output
        .clkout0b(), // output
        .clkout1b(), // output
        .clkout2b(), // output
        .clkout3b(), // output
        .clkfbout            (clk_fb), // output
        .clkfboutb(), // output
        .locked              (locked_mmcm),
        .clkin_stopped       (clkin_stopped_mmcm), // output
        .clkfb_stopped       (clkfb_stopped_mmcm) // output
         // output
    );

// Generate reference clock for the I/O delays
    pll_base #(
        .CLKIN_PERIOD(CLKIN_PERIOD),
        .BANDWIDTH("OPTIMIZED"),
        .CLKFBOUT_MULT(CLKFBOUT_MULT_REF),
        .CLKOUT0_DIVIDE(CLKFBOUT_DIV_REF),
        .REF_JITTER1(0.010),
        .STARTUP_WAIT("FALSE")
    ) pll_base_i (
        .clkin(clk_in), // input
        .clkfbin(clkfb_ref), // input
//        .rst(rst), // input
        .rst(rst_in), // input
        .pwrdwn(1'b0), // input
        .clkout0(clk_ref_pre), // output
        .clkout1(), // output
        .clkout2(), // output
        .clkout3(), // output
        .clkout4(), // output
        .clkout5(), // output
        .clkfbout(clkfb_ref), // output
        .locked(locked_pll) // output
    );
// Does it need to be re-calibrated periodically - yes when temperature changes, same as dci_reset
    idelay_ctrl# (
        .IODELAY_GRP("IODELAY_MEMORY")
    ) idelay_ctrl_i (
        .refclk(clk_ref),
        .rst(rst || dly_rst),
        .rdy(dly_ready)
    );
    dci_reset dci_reset_i (
        .reset(rst || dci_rst), // input
        .ready(dci_ready) // output
    );
//assign dci_ready= !(rst || dci_rst); 
endmodule