// Exploiting Expendable Process-Margins in DRAMs for Run-Time Performance Optimization
// Authors: Karthik Chandrasekar*, Sven Goossens**, Christian Weis***, Martijn Koedam**
//          Benny Akesson****, Norbert Wehn***, Kees Goossens**
// *    Computer Engineering, TU Delft, The Netherlands
// **   Electronic Systems Group, TU Eindhoven, The Netherlands
// ***  Microelectronic Systems Design, TU Kaiserslautern, Germany
// **** Control Engineering, Czech Technical University, Czech Republic
// ML605 DRAM Characterization. Copyright TU Delft, Eindhoven University of Technology.
// Contact: k.chandrasekar {at} tudelft.nl, s.l.m.goossens {at} tue.nl
// www.drampower.info
// www.compsoc.eu
#include "mb_interface.h"
#include "stdio.h"
#include "fsl.h"
#include "math.h"
#include "../memctrl/memctrl_driver.h"
#include "../memctrl/memctrl_config.h"
#include "../cores/global_memmap.h"

// Traffic generator configuration register selection.
// cfgTG() shifts the selector left by 29 bits and ORs it with the payload,
// so (assuming a 32-bit unsigned int - TODO confirm for the target) only the
// top three bits of the command word carry the selector.
const unsigned int CMD_START         = 0x0;  // written with 1 by testShared() to start a run
const unsigned int CMD_ADDR_INC_L    = 0x1;  // address increment, low part (main writes 128)
const unsigned int CMD_ADDR_INC_H    = 0x2;  // address increment, high part (main writes 0)
const unsigned int CMD_ADDR_MAX_L    = 0x3;  // low 16 bits of addr_max (see main)
const unsigned int CMD_ADDR_MAX_H    = 0x4;  // high 16 bits of addr_max (see main)
const unsigned int CMD_BLOCK_SIZE_C  = 0x5;  // block size (main writes 1)
const unsigned int CMD_HALT_ON_ERROR = 0x6;  // main writes 0: run through the whole range
const unsigned int CMD_LOOP          = 0x7;  // main writes 0: traverse the range once

// Datasheet-based timings.
// The sweeps in bestPatterns() start from these values and the final
// ("conservative") results are clamped so they never exceed them.
// The pattern builders turn RCD/RTP/RP/WR/WL/BL into NOP counts, i.e. they
// are used as numbers of command slots.
// NOTE(review): RL, AL, RFC, NAW, LAW, RRD and CCD are not referenced in the
// visible part of this file; presumably kept for completeness.
const int RC = 21;   // upper bound for the derived RC (RCD + RTP + RP)
const int RCD = 6;   // ACT -> RD/WR distance in the generated patterns
const int RL = 6;
const int WL = 5;    // added to the WR -> PRE distance of the write pattern
const int AL = 0;
const int RP = 6;    // number of slots reserved after PRE (PRE plus rp - 1 NOPs)
const int RFC = 44;
const int RAS = 15;  // upper bound for the derived RAS (RCD + RTP)
const int RTP = 4;   // RD -> PRE distance in the read pattern
const int WR = 6;    // added to the WR -> PRE distance of the write pattern
const int NAW = 4;
const int LAW = 20;
const int RRD = 4;
const int CCD = 4;
const int WTR = 4;   // passed to configureReadPattern(), which does not use it
const int BL = 8;    // BL / 2 slots are added to the WR -> PRE distance

// Start offsets handed to configureWritePattern() / configureReadPattern().
// addPattern() adds the offset to every command index before calling
// cfgCommand(), so the write and read patterns land at different indices.
const int WR_PATTERN_START = 2;
const int RD_PATTERN_START = 32;

// Generic maximum / minimum of two expressions.
// Both are plain macros: each argument may be evaluated twice, so do not pass
// expressions with side effects (e.g. i++ or function calls that must run once).
#define max(X,Y) ((X) > (Y) ? (X) : (Y))
#define min(X,Y) ((X) < (Y) ? (X) : (Y))

// One entry of a command pattern that is built in a local array and then
// programmed into the memory controller by addPattern().
typedef struct {
  unsigned char cmd;   // command code (CMD_ACT, CMD_WR, CMD_RD, CMD_PRE, CMD_NOP)
  unsigned int last;   // flag; the pattern builders set it to 1 on the final two entries
  unsigned int bank;   // bank argument forwarded to cfgCommand(); always 0 in this file
  unsigned int addr;   // not read by addPattern(), which uses the array index instead
} cmd_test_t;


// Set of timing values under test. testCommon() modifies one member at a time
// through a pointer while the others stay fixed.
typedef struct timings_s {
  int RCD;  // swept by bestPatterns()
  int RP;   // swept by bestPatterns()
  int RTP;  // swept by bestPatterns()
  int WR;   // swept by bestPatterns()
  int RAS;  // derived in bestPatterns() from RCD + RTP; not printed by printTimings()
} timings_t;

// Print the four swept timing values (RCD, RP, RTP, WR) on one line.
// The RAS member is intentionally not part of this output.
void printTimings(const timings_t* t)
{
  const int rcd = t->RCD;
  const int rp  = t->RP;
  const int rtp = t->RTP;
  const int wr  = t->WR;

  xil_printf("RCD: %d, RP: %d, RTP: %d, WR: %d\r\n", rcd, rp, rtp, wr);
}

// Write one configuration word to the traffic generator over the tg_fsl link.
//   regId: register selector (one of the CMD_* constants), placed above bit 28
//   data:  payload, ORed into the same word; it is not masked, so the caller
//          must keep it from overlapping the selector bits
void cfgTG(unsigned int regId, unsigned int data)
{
  const unsigned int selector = regId << 29;
  unsigned int word = selector | data;

  putfslx(word, tg_fsl, FSL_DEFAULT);
}

// Run one traffic generator test and report whether it passed.
//   testID: currently unused
// Returns 1 when the word read back from the traffic generator is 0
// (test passed) and 0 for any other value (test failed).
int testShared(const char* testID)
{
  unsigned int report;

  // Kick off the test run ...
  cfgTG(CMD_START, 1);
  // ... and wait for the result word.
  getfslx(report, tg_fsl, FSL_DEFAULT);

  return report == 0;
}

// Store one command in slot `addr` of `pattern`.
//   cmd:     command code
//   last:    value for the entry's `last` flag
//   bank:    bank the command targets
//   addr:    index of the slot to fill; the caller must keep it inside the array
//   pattern: array that receives the entry
// The slot index is also recorded in the entry's `addr` member so that no
// member of a written entry is left indeterminate.
void addCmd (unsigned char cmd,
             unsigned int last,
             unsigned int bank,
             unsigned int addr,
             cmd_test_t* pattern)
{
  cmd_test_t* slot = &pattern[addr];

  slot->cmd = cmd;
  slot->last = last;
  slot->bank = bank;
  slot->addr = addr;
}

// Append `nNOPs` NOP commands to `pattern`, starting at slot `i`.
// A zero or negative count appends nothing.
// Returns the index of the first slot after the appended NOPs.
int writeNOPs(int i, int nNOPs, cmd_test_t* pattern)
{
  const int end = i + nNOPs;

  while (i < end) {
    addCmd(CMD_NOP, 0, 0, i, pattern);
    i++;
  }
  return i;
}

// Program the first `length` entries of `pattern` into the memory controller.
// Entry k is written to command index k + startOffset via cfgCommand().
//   printPattern: currently unused
void addPattern(cmd_test_t* pattern, int startOffset, int length, int printPattern)
{
  int slot = 0;

  while (slot < length) {
    const cmd_test_t* entry = &pattern[slot];
    cfgCommand(entry->cmd, entry->last, entry->bank, slot + startOffset, &memctrl_arch_params);
    slot++;
  }
}


// Build the write pattern
//   ACT, (rcd - 1) NOPs, WR, (wl + wr + BL/2 - 1) NOPs, PRE, (rp - 1) NOPs
// and program it into the controller starting at index `startOffset`.
// All commands target bank 0. `printPattern` is only forwarded to addPattern().
void configureWritePattern(int startOffset, int rcd, int wr, int rp, int wl, int printPattern)
{
  cmd_test_t seq[100];
  int len;
  int trimmed;

  addCmd(CMD_ACT, 0, 0, 0, seq);
  len = writeNOPs(1, rcd - 1, seq);

  addCmd(CMD_WR, 0, 0, len, seq);
  len = writeNOPs(len + 1, wl + wr + BL / 2 - 1, seq);

  addCmd(CMD_PRE, 0, 0, len, seq);
  len = writeNOPs(len + 1, rp - 1, seq);

  // The controller separates patterns by 2 NOPs on its own, so up to two
  // trailing NOPs can be dropped from the end of the pattern.
  for (trimmed = 0; trimmed < 2 && seq[len - 1].cmd == CMD_NOP; trimmed++) {
    len--;
  }

  // Flag the final two remaining entries.
  seq[len - 2].last = 1;
  seq[len - 1].last = 1;

  addPattern(seq, startOffset, len, printPattern);
}

// Build the read pattern
//   ACT, (rcd - 1) NOPs, RD, (rtp - 1) NOPs, PRE, (rp - 1) NOPs
// and program it into the controller starting at index `startOffset`.
// All commands target bank 0. `wl` and `wtr` are accepted but not used;
// `printPattern` is only forwarded to addPattern().
void configureReadPattern(int startOffset, int rcd, int rtp, int rp, int wl, int wtr, int printPattern)
{
  cmd_test_t seq[100];
  int len;
  int trimmed;

  addCmd(CMD_ACT, 0, 0, 0, seq);
  len = writeNOPs(1, rcd - 1, seq);

  addCmd(CMD_RD, 0, 0, len, seq);
  len = writeNOPs(len + 1, rtp - 1, seq);

  addCmd(CMD_PRE, 0, 0, len, seq);
  len = writeNOPs(len + 1, rp - 1, seq);

  // The controller separates patterns by 2 NOPs on its own, so up to two
  // trailing NOPs can be dropped from the end of the pattern.
  for (trimmed = 0; trimmed < 2 && seq[len - 1].cmd == CMD_NOP; trimmed++) {
    len--;
  }

  // Flag the final two remaining entries.
  seq[len - 2].last = 1;
  seq[len - 1].last = 1;

  addPattern(seq, startOffset, len, printPattern);
}

// Sweep one timing value downwards and find the lowest value that still works.
//   timings:   full timing set used to build the write and read patterns
//   testID:    name printed in the log and forwarded to testShared()
//   start:     first (highest) value to try
//   stop:      last (lowest) value to try, inclusive
//   timingPtr: points at the member of *timings that is being swept
// For every value from start down to stop the patterns are reprogrammed and a
// test is run; the sweep ends at the first failure. On return *timingPtr holds
// the returned value.
// Returns the lowest value that passed. If no value passed (the first test
// failed, or start < stop so nothing was tested) `start` is returned as before,
// but a warning is printed so the result is not mistaken for a verified one.
int testCommon(timings_t* timings, const char* testID, int start, int stop, int* timingPtr)
{
  int best = start;
  int anyPassed = 0;
  int val;

  xil_printf("Using timings: ");
  printTimings(timings);
  xil_printf("\tSweeping %s from %d to %d.\r\n", testID, start, stop);

  for (val = start; val >= stop; val--) {
    *timingPtr = val;
    configureWritePattern(WR_PATTERN_START, timings->RCD, timings->WR,  timings->RP, WL, 0);
    configureReadPattern( RD_PATTERN_START, timings->RCD, timings->RTP, timings->RP, WL, WTR, 0);

    if (!testShared(testID)) {
      // First failing value ends the sweep; `best` still holds the last pass.
      break;
    }
    best = val;
    anyPassed = 1;
  }

  if (!anyPassed) {
    xil_printf("\tWARNING: no value passed the test; %d below is unverified.\r\n", best);
  }

  // Leave the swept member at the reported value, not at the failing one.
  *timingPtr = best;
  xil_printf("\tLowest functioning value: %d.\r\n", best);
  return best;
}

// Characterize the DRAM: sweep RCD, RP, RTP and WR downwards from their
// datasheet values, pick the better of two RTP/WR combinations, add a safety
// margin, derive RC and RAS, print everything and run one final test with the
// resulting ("conservative") timings.
void bestPatterns()
{
  // The pattern builders insert (value - 1) NOPs between ACT and RD/WR and
  // between RD and PRE. Values 0 and 1 therefore generate identical patterns,
  // and sweeping down to 0 would report 0 whenever 1 works. Stop at 1 instead.
  const int minSpacing = 1;

  int bestWR_ind;
  int bestWR_rtp;
  int bestRTP_ind;
  int bestRTP_wr;

  timings_t t;
  t.RCD = RCD;
  t.RP  = RP;
  t.RTP = RTP;
  t.WR  = WR;
  t.RAS = RAS;  // not used by the sweeps; set so no member is indeterminate

  // Each sweep leaves its best value in the swept member of t, so RCD and RP
  // stay at their best values for all following sweeps.
  testCommon(&t, "RCD",  RCD, minSpacing, &(t.RCD));
  // NOTE(review): RP stops at 2, presumably because the controller inserts
  // 2 NOPs between patterns on its own - confirm.
  testCommon(&t, "RP",    RP, 2, &(t.RP));

  // RTP and WR, each swept with the other one at its datasheet value.
  bestRTP_ind = testCommon(&t, "RTP",  RTP, minSpacing, &(t.RTP));
  t.RTP = RTP;
  bestWR_ind  = testCommon(&t, "WR",    WR, 0, &(t.WR));

  // RTP swept while WR is at its independent best ...
  bestRTP_wr  = testCommon(&t, "RTP",  RTP, minSpacing, &(t.RTP));
  // ... and WR swept while RTP is at its independent best.
  t.RTP = bestRTP_ind;
  t.WR  = WR;
  bestWR_rtp  = testCommon(&t, "WR",    WR, 0, &(t.WR));

  xil_printf( "Non-conservative timings:\r\n"
              "\tRCD: %d\r\n"
              "\tRP: %d\r\n"
              "\tRTP independent: %d\r\n"
              "\tRTP with best WR: %d\r\n"
              "\tWR independent: %d\r\n"
              "\tWR with best RTP: %d\r\n"
              "\r\n",
              t.RCD,
              t.RP,
              bestRTP_ind,
              bestRTP_wr,
              bestWR_ind,
              bestWR_rtp
  );

  // Choose the RTP/WR pair with the smaller sum, comparing each value plus
  // one and clamped to its datasheet value. Ties go to (RTP with best WR,
  // WR independent).
  if (min(bestRTP_wr + 1, RTP) + min(bestWR_ind + 1, WR) <= min(bestWR_rtp + 1, WR) + min(bestRTP_ind + 1, RTP)) {
    t.RTP = bestRTP_wr;
    t.WR = bestWR_ind;
  } else {
    t.RTP = bestRTP_ind;
    t.WR = bestWR_rtp;
  }

  xil_printf("Selected WR: %d and RTP: %d\r\n", t.WR, t.RTP);

  // Apply a per-timing margin factor, round up, and never exceed the
  // datasheet value. NOTE(review): the origin of the factors is not visible
  // in this file.
  t.RCD      = min((int)ceil((double)t.RCD * 1.133) , RCD);
  t.RP       = min((int)ceil((double)t.RP  * 1.135) , RP);
  t.RTP      = min((int)ceil((double)t.RTP * 1.2055), RTP);
  t.WR       = min((int)ceil((double)t.WR  * 1.1471), WR);
  // Derived timings, again clamped to the datasheet values.
  t.RAS      = min(t.RCD + t.RTP, RAS);
  int bestRC = min(t.RCD + t.RTP + t.RP, RC);

  xil_printf("Conservative timings:\r\n"
             "\tRCD: %d\r\n"
             "\tRP: %d\r\n"
             "\tRTP: %d\r\n"
             "\tWR: %d\r\n"
             "\tDerived timings:\r\n"
             "\tRC: %d\r\n"
             "\tRAS: %d\r\n"
             "\r\n",
             t.RCD,
             t.RP,
             t.RTP,
             t.WR,
             bestRC,
             t.RAS
  );

  xil_printf("Running final test with conservative timings:\r\n");
  // start == stop: a single test run with the final timing set.
  testCommon(&t, "", t.RCD, t.RCD, &(t.RCD));
}

// Entry point: program the controller and the traffic generator, print the
// banner, run the characterization and report completion.
int main()
{
  // When the generator reaches addr_max, the test report is sent to the
  // microblaze. The value is handed over as two 16-bit halves.
  const unsigned int addr_max    = 0x20000000 | 64;
  const unsigned int addr_max_lo = 0xFFFF & addr_max;
  const unsigned int addr_max_hi = 0xFFFF & (addr_max >> 16);

  // Address decoder and initial pattern set; the patterns are overwritten
  // again while the tests run.
  cfgPatterns(&pattern_set, &memctrl_arch_params, 0);

  // Do not halt on errors: cover the whole address range and report at the end.
  cfgTG(CMD_HALT_ON_ERROR, 0);
  // Traverse the range a single time.
  cfgTG(CMD_LOOP,          0);
  // Block size of 1, i.e. 2 words, times 4 when width converted = 8 = BL.
  cfgTG(CMD_BLOCK_SIZE_C,  1);
  // Each request is worth 128 bytes.
  cfgTG(CMD_ADDR_INC_L,    128);
  cfgTG(CMD_ADDR_INC_H,    0);

  cfgTG(CMD_ADDR_MAX_L,    addr_max_lo);
  cfgTG(CMD_ADDR_MAX_H,    addr_max_hi);

  // Banner.
  xil_printf("\t\"Exploiting Expendable Process-Margins in DRAMs for Run-Time Performance Optimization\"\r\n");
  xil_printf("\tAuthors: Karthik Chandrasekar, Sven Goossens, Christian Weis, Martijn Koedam, Benny Akesson, Norbert Wehn, Kees Goossens\r\n");
  xil_printf("\tML605 DRAM Characterization. Copyright TU Delft, Eindhoven University of Technology.\r\n");
  xil_printf("\tContact: k.chandrasekar {at} tudelft.nl, s.l.m.goossens {at} tue.nl\r\n\r\n");
  xil_printf("\twww.drampower.info\r\n");
  xil_printf("\twww.compsoc.eu\r\n");

  // Status messages.
  xil_printf("Programmed patterns and traffic generator\r\n");
  xil_printf("Starting traffic generator\r\n");
  xil_printf(">>> Testing all chips\r\n");

  // Run the complete timing characterization.
  bestPatterns();

  xil_printf("FINISHED\r\n");
  return 0;
}