///////////////////////////////////////////////////////////////////////
// File: top.cpp 
// Author: Alain Greiner 
// Copyright: UPMC/LIP6
// Date : may 2013
// This program is released under the GNU public license
/////////////////////////////////////////////////////////////////////////
// This file defines a generic TSAR architecture.
// The physical address space is 40 bits.
//
// The number of clusters cannot be larger than 256.
// The number of processors per cluster cannot be larger than 8.
// 
// - It uses four dspin_local_crossbar per cluster as local interconnect 
// - It uses two virtual_dspin routers per cluster as global interconnect
// - It uses the vci_cc_vcache_wrapper 
// - It uses the vci_mem_cache
// - It contains one vci_xicu per cluster.
// - It contains one vci_multi_dma per cluster.
// - It contains one vci_simple_ram per cluster to model the L3 cache.
//
// The communication between the MemCache and the Xram is 64 bits.
//
// All clusters are identical, but the cluster 0 (called io_cluster), 
// contains 6 extra components:
// - the boot rom (BROM)
// - the disk controller (BDEV)
// - the multi-channel network controller (MNIC)
// - the multi-channel chained buffer dma controller (CDMA)
// - the multi-channel tty controller (MTTY)
// - the frame buffer controller (FBUF)
//
// It is built with one single component implementing a cluster,
// defined in files tsar_xbar_cluster.* (with * = cpp, h, sd)
//
// The IRQs are connected to XICUs as follows:
// - The IRQ_IN[0] to IRQ_IN[7] ports are not used in all clusters.
// - The DMA IRQs are connected to IRQ_IN[8] to IRQ_IN[15] in all clusters.
// - The TTY IRQs are connected to IRQ_IN[16] to IRQ_IN[30] in I/O cluster.
// - The BDEV IRQ is connected to IRQ_IN[31] in I/O cluster.
// 
// Some hardware parameters are used when compiling the OS, and are used 
// by this top.cpp file. They must be defined in the hard_config.h file :
// - CLUSTER_X        : number of clusters in a row (power of 2)
// - CLUSTER_Y        : number of clusters in a column (power of 2)
// - CLUSTER_SIZE     : size of the segment allocated to a cluster
// - NB_PROCS_MAX     : number of processors per cluster (power of 2)
// - NB_DMA_CHANNELS  : number of DMA channels per cluster (< 9)
// - NB_TTY_CHANNELS  : number of TTY channels in I/O cluster (< 16)
// - NB_NIC_CHANNELS  : number of NIC channels in I/O cluster (< 9)
// 
// Some other hardware parameters are not used when compiling the OS,
// and can be directly defined in this top.cpp file:
// - XRAM_LATENCY     : external ram latency 
// - MEMC_WAYS        : L2 cache number of ways
// - MEMC_SETS        : L2 cache number of sets
// - L1_IWAYS     
// - L1_ISETS    
// - L1_DWAYS   
// - L1_DSETS  
// - FBUF_X_SIZE      : width of frame buffer (pixels)
// - FBUF_Y_SIZE      : height of frame buffer (lines)
// - BDEV_SECTOR_SIZE : block size for block device
// - BDEV_IMAGE_NAME  : file pathname for block device 
// - NIC_RX_NAME      : file pathname for NIC received packets
// - NIC_TX_NAME      : file pathname for NIC transmitted packets
// - NIC_TIMEOUT      : max number of cycles before closing a container
/////////////////////////////////////////////////////////////////////////
// General policy for 40 bits physical address decoding:
// All physical segments base addresses are multiple of 1 Mbytes
// (=> the 24 LSB bits = 0, and the 16 MSB bits define the target) 
// The (x_width + y_width) MSB bits (left aligned) define
// the cluster index, and the LADR bits define the local index:
//      | X_ID  | Y_ID  |---| LADR |     OFFSET          |
//      |x_width|y_width|---|  8   |       24            |
/////////////////////////////////////////////////////////////////////////
// General policy for 14 bits SRCID decoding:
// Each component is identified by (x_id, y_id, l_id) tuple.
//      | X_ID  | Y_ID  |---| L_ID |
//      |x_width|y_width|---|  6   |
/////////////////////////////////////////////////////////////////////////

#include <systemc>
#include <sys/time.h>
#include <iostream>
#include <sstream>
#include <cstdlib>
#include <cstdarg>
#include <cstring>
#include <stdint.h>

#include "gdbserver.h"
#include "mapping_table.h"
#include "alloc_elems.h"
#include "tsar_super_cluster.h"

// Target OS selection: exactly one of USE_ALMOS / USE_GIET must be defined.
//#define USE_ALMOS 1
#define USE_GIET 

#if defined(USE_ALMOS) && defined(USE_GIET)
#error "Can't use Two different OS"
#endif

#if !defined(USE_ALMOS) && !defined(USE_GIET)
#error "You need to specify one OS"
#endif

#ifdef USE_ALMOS
   #define PREFIX_OS "almos/"
   #include "almos/hard_config.h"
#endif
#ifdef USE_GIET
   #include "hard_config.h"

   // PREFIX_OS is pasted in front of the NIC_RX_NAME / NIC_TX_NAME pathnames
   // further down in this file. Fall back to an empty prefix when the GIET
   // hard_config.h does not provide one, so those macros still expand to
   // valid string literals.
   #ifndef PREFIX_OS
   #define PREFIX_OS ""
   #endif

   // Local target indexes (second level of the direct-network routing table)
   #define RAM_TGTID            0
   #define XCU_TGTID            1
   #define DMA_TGTID            2
   #define TTY_TGTID            3
   #define IOC_TGTID            4
   #define FBF_TGTID            5
   #define NIC_TGTID            6
   #define CMA_TGTID            7
   #define ROM_TGTID            8
   #define SIM_TGTID            9

   #define FBF_X_SIZE		FBUF_X_SIZE
   #define FBF_Y_SIZE		FBUF_Y_SIZE

   // 3D folding of the 2D mesh described by hard_config.h:
   // the Z_WIDTH3D low-order bits of the 2D Y coordinate become the Z
   // coordinate, and the remaining Y bits become the 3D Y coordinate.
   #define Z_WIDTH3D		1
   #define Z_SIZE3D		(1 << Z_WIDTH3D)
   #define Z_IO3D		(Y_IO & ((1 << Z_WIDTH3D) - 1))
   #define Y_SIZE3D		(Y_SIZE / Z_SIZE3D)
   #define Y_WIDTH3D		(Y_WIDTH - Z_WIDTH3D)
   #define Y_IO3D		(Y_IO >> Z_WIDTH3D)
   #define X_SIZE3D		X_SIZE
   #define X_WIDTH3D		X_WIDTH
   #define X_IO3D		X_IO

   // (x, y) position handed to every TsarSuperCluster as ELEVATOR_X/ELEVATOR_Y
   #define ELEVATOR_X		0
   #define ELEVATOR_Y		0
#endif

///////////////////////////////////////////////////
//               Parallelisation
///////////////////////////////////////////////////


#ifdef _OPENMP
#include <omp.h>
#endif

//  nluster index (computed from x,y coordinates)
#ifdef USE_ALMOS
   #define cluster(x,y,z)   (z + y * Z_SIZE3D  + x * Y_SIZE3D)
#else
   #define cluster(x,y,z)   ((x << (Y_WIDTH3D + Z_WIDTH3D))  + (y << Z_WIDTH3D) + z)
#endif


#define min(x, y) (x < y ? x : y)

///////////////////////////////////////////////////////////
//          DSPIN parameters           
///////////////////////////////////////////////////////////

// Flit widths (in bits) used as template arguments of the DspinSignals
// and TsarSuperCluster instances built in _main.
#define dspin_cmd_width      39
#define dspin_rsp_width      32

///////////////////////////////////////////////////////////
//          VCI parameters           
///////////////////////////////////////////////////////////

// Cell width of the internal (vci_param_int) and external (vci_param_ext)
// VCI parameter sets; presumably expressed in bytes (4 => 32 bits,
// 8 => 64 bits, matching the 64 bits MemCache/Xram link) - TODO confirm.
#define vci_cell_width_int    4
#define vci_cell_width_ext    8

// The physical address width depends on the selected OS; _main asserts
// that the value matches the OS (32 bits for ALMOS, 40 bits for the GIET).
#ifdef USE_ALMOS
#define vci_address_width     32
#endif
#ifdef USE_GIET
#define vci_address_width     40
#endif
#define vci_plen_width        8
#define vci_rerror_width      1
#define vci_clen_width        1
#define vci_rflag_width       1
#define vci_srcid_width       14
#define vci_pktid_width       4
#define vci_trdid_width       4
#define vci_wrplen_width      1

////////////////////////////////////////////////////////////
//    Secondary Hardware Parameters         
////////////////////////////////////////////////////////////


#define XRAM_LATENCY          0

#define MEMC_WAYS             16
#define MEMC_SETS             256

#define L1_IWAYS              4
#define L1_ISETS              64

#define L1_DWAYS              4
#define L1_DSETS              64

// With the GIET, the frame buffer size is not defined here
// (FBF_X_SIZE / FBF_Y_SIZE are aliases of FBUF_X_SIZE / FBUF_Y_SIZE,
// which must then come from hard_config.h).
#ifdef USE_ALMOS
#define FBUF_X_SIZE           1024
#define FBUF_Y_SIZE           1024
#endif

// Block device: sector size and default disk image
// (the image pathname can be overridden with the -DISK argument).
#ifdef USE_GIET
#define BDEV_SECTOR_SIZE      512
#define BDEV_IMAGE_NAME       "virt_hdd.dmg"
#endif
#ifdef USE_ALMOS
#define BDEV_SECTOR_SIZE      4096
#define BDEV_IMAGE_NAME       PREFIX_OS"hdd-img.bin"
#endif

// NIC packet files. PREFIX_OS must be a string literal macro defined
// before these macros are expanded (they are used to initialise the
// nic_rx_name / nic_tx_name buffers in _main).
#define NIC_RX_NAME           PREFIX_OS"nic/rx_packets.txt"
#define NIC_TX_NAME           PREFIX_OS"nic/tx_packets.txt"
#define NIC_TIMEOUT           10000

////////////////////////////////////////////////////////////
//    Software to be loaded in ROM & RAM         
////////////////////////////////////////////////////////////

// Pathname of the ELF file given to the soclib Loader in _main.
#ifdef USE_ALMOS
#define soft_name       PREFIX_OS"preloader.elf"
#endif
#ifdef USE_GIET
#define soft_name   "../../softs/tsar_boot/preloader.elf"
#endif

////////////////////////////////////////////////////////////
//     DEBUG Parameters default values         
////////////////////////////////////////////////////////////

// Default value of frozen_cycles (can be overridden with -FROZEN).
#define MAX_FROZEN_CYCLES     100000000


////////////////////////////////////////////////////////////////////
//     TGTID definition in direct space
// For all components:  global TGTID = global SRCID = cluster_index
////////////////////////////////////////////////////////////////////


/////////////////////////////////////////////////////////
//    Physical segments definition
/////////////////////////////////////////////////////////
// There are 3 segments replicated in all clusters
// and 5 specific segments in the "IO" cluster 
// (containing address 0xBF000000)
// NOTE(review): _main registers 7 I/O-cluster segments (mtty, fbuf,
// bdev, brom, mnic, cdma, simh); the count above looks outdated.
/////////////////////////////////////////////////////////

#ifdef USE_ALMOS
   // 2^19 is the offset for the local id (8 bits for global ID :
   // 1 bit for Memcache or Peripheral, 4 for local peripheral id)
   // (Almos supports 32 bits physical addresses)
#endif

// Early-exit flag: set to true by handler() (installed on SIGINT by
// sc_main) and polled by the simulation loops of _main.
// NOTE(review): this is a plain bool written from a signal handler;
// a volatile sig_atomic_t would be the portable type - confirm that no
// other translation unit declares it before changing it.
bool stop_called = false;

/////////////////////////////////
int _main(int argc, char *argv[])
{
   using namespace sc_core;
   using namespace soclib::caba;
   using namespace soclib::common;

   const int64_t max_cycles   = 5000000;             // Maximum number of cycles simulated in one sc_start call
   int64_t ncycles            = 0x7FFFFFFFFFFFFFFF;  // simulated cycles
   char     disk_name[256]    = BDEV_IMAGE_NAME;    // pathname to the disk image
   char     nic_rx_name[256]  = NIC_RX_NAME;        // pathname to the rx packets file
   char     nic_tx_name[256]  = NIC_TX_NAME;        // pathname to the tx packets file
   ssize_t  threads_nr        = 1;                  // simulator's threads number
   bool     debug_ok          = false;              // trace activated
   size_t   debug_period      = 1;                  // trace period
   size_t   debug_memc_id     = 0;                  // index of memc to be traced 
   size_t   debug_proc_id     = 0;                  // index of proc to be traced
   int64_t  debug_from        = 0;                  // trace start cycle
   int64_t  frozen_cycles     = MAX_FROZEN_CYCLES;  // monitoring frozen processor
   int64_t  reset_counters    = -1;
   int64_t  dump_counters     = -1;
   bool     do_reset_counters = false;
   bool     do_dump_counters  = false;
   struct   timeval t1, t2;
   uint64_t ms1, ms2;

   ////////////// command line arguments //////////////////////
   if (argc > 1) {
      for (int n = 1; n < argc; n = n + 2) {
         if ((strcmp(argv[n], "-NCYCLES") == 0) && (n + 1 < argc)) {
            ncycles = (int64_t) strtol(argv[n + 1], NULL, 0);
         }
         else if ((strcmp(argv[n],"-DISK") == 0) && (n + 1 < argc)) {
            strcpy(disk_name, argv[n + 1]);
         }
         else if ((strcmp(argv[n],"-DEBUG") == 0) && (n + 1 < argc)) {
            debug_ok = true;
            debug_from = (int64_t) strtol(argv[n + 1], NULL, 0);
         }
         else if ((strcmp(argv[n], "-MEMCID") == 0) && (n + 1 < argc)) {
            debug_memc_id = (size_t) strtol(argv[n + 1], NULL, 0);
#ifdef USE_ALMOS
            assert((debug_memc_id < (X_SIZE3D * Y_SIZE3D)) &&
                   "debug_memc_id larger than X_SIZE3D * Y_SIZE3D" );
#else
            size_t x = debug_memc_id >> (Y_WIDTH3D + Z_WIDTH3D);
            size_t y = (debug_memc_id >> Z_WIDTH3D) & ((1 << Y_WIDTH3D) - 1);
            size_t z = debug_memc_id & ((1 << Z_WIDTH3D) - 1);

            assert( (x <= X_SIZE3D) and (y <= Y_SIZE3D) and (z <= Z_SIZE3D) &&
                  "MEMCID parameter refers a not valid memory cache");
#endif
         }
         else if ((strcmp(argv[n], "-PROCID") == 0) && (n + 1 < argc)) {
            debug_proc_id = (size_t) strtol(argv[n + 1], NULL, 0);
#ifdef USE_ALMOS
            assert((debug_proc_id < (X_SIZE3D * Y_SIZE3D * Z_SIZE3D * NB_PROCS_MAX)) && 
                   "debug_proc_id larger than X_SIZE3D * Y_SIZE3D * Z_SIZE3D * NB_PROCS");
#else
            size_t cluster_xyz = debug_proc_id / NB_PROCS_MAX ;
            size_t x = cluster_xyz >> (Y_WIDTH3D + Z_WIDTH3D);
            size_t y = (cluster_xyz >> Z_WIDTH3D) & ((1 << Y_WIDTH3D) - 1);
            size_t z = cluster_xyz & ((1 << Z_WIDTH3D) - 1);

            assert( (x <= X_SIZE3D) and (y <= Y_SIZE3D) and (z <= Z_SIZE3D) &&
                  "PROCID parameter refers a not valid processor");
#endif
         }
         else if ((strcmp(argv[n], "-THREADS") == 0) && ((n + 1) < argc)) {
            threads_nr = (ssize_t) strtol(argv[n + 1], NULL, 0);
            threads_nr = (threads_nr < 1) ? 1 : threads_nr;
         }
         else if ((strcmp(argv[n], "-FROZEN") == 0) && (n + 1 < argc)) {
            frozen_cycles = (int64_t) strtol(argv[n + 1], NULL, 0);
         }
         else if ((strcmp(argv[n], "-PERIOD") == 0) && (n + 1 < argc)) {
            debug_period = (size_t) strtol(argv[n + 1], NULL, 0);
         }
         else if ((strcmp(argv[n], "--reset-counters") == 0) && (n + 1 < argc)) {
            reset_counters = (int64_t) strtol(argv[n + 1], NULL, 0);
            do_reset_counters = true;
         }
         else if ((strcmp(argv[n], "--dump-counters") == 0) && (n + 1 < argc)) {
            dump_counters = (int64_t) strtol(argv[n + 1], NULL, 0);
            do_dump_counters = true;
         }
         else {
            std::cout << "   Arguments are (key,value) couples." << std::endl;
            std::cout << "   The order is not important." << std::endl;
            std::cout << "   Accepted arguments are :" << std::endl << std::endl;
            std::cout << "     -SOFT pathname_for_embedded_soft" << std::endl;
            std::cout << "     -DISK pathname_for_disk_image" << std::endl;
            std::cout << "     -NCYCLES number_of_simulated_cycles" << std::endl;
            std::cout << "     -DEBUG debug_start_cycle" << std::endl;
            std::cout << "     -THREADS simulator's threads number" << std::endl;
            std::cout << "     -FROZEN max_number_of_lines" << std::endl;
            std::cout << "     -PERIOD number_of_cycles between trace" << std::endl;
            std::cout << "     -MEMCID index_memc_to_be_traced" << std::endl;
            std::cout << "     -PROCID index_proc_to_be_traced" << std::endl;
            exit(0);
         }
      }
   }

    // checking hardware parameters
    assert( ( (X_SIZE3D == 1) or (X_SIZE3D == 2) or (X_SIZE3D == 4) or
              (X_SIZE3D == 8) or (X_SIZE3D == 16) ) and
              "The X_SIZE3D parameter must be 1, 2, 4, 8 or 16" );

    assert( ( (Y_SIZE3D == 1) or (Y_SIZE3D == 2) or (Y_SIZE3D == 4) or
              (Y_SIZE3D == 8) or (Y_SIZE3D == 16) ) and
              "The Y_SIZE3D parameter must be 1, 2, 4, 8 or 16" );

    assert( ( (Z_SIZE3D == 1) or (Z_SIZE3D == 2) or (Z_SIZE3D == 4) or
              (Z_SIZE3D == 8) or (Z_SIZE3D == 16) ) and
              "The Z_SIZE3D parameter must be 1, 2, 4, 8 or 16" );

    assert( ( (NB_PROCS_MAX == 1) or (NB_PROCS_MAX == 2) or
              (NB_PROCS_MAX == 4) or (NB_PROCS_MAX == 8) ) and
             "The NB_PROCS_MAX parameter must be 1, 2, 4 or 8" );

    assert( (NB_DMA_CHANNELS < 9) and
            "The NB_DMA_CHANNELS parameter must be smaller than 9" );

    assert( (NB_TTY_CHANNELS < 15) and
            "The NB_TTY_CHANNELS parameter must be smaller than 15" );

    assert( (NB_NIC_CHANNELS < 9) and
            "The NB_NIC_CHANNELS parameter must be smaller than 9" );

#ifdef USE_GIET
    assert( (vci_address_width == 40) and
            "VCI address width with the GIET must be 40 bits" );
#endif

#ifdef USE_ALMOS
    assert( (vci_address_width == 32) and
            "VCI address width with ALMOS must be 32 bits" );
#endif


    std::cout << std::endl;
    std::cout << " - X_SIZE3D         = " << X_SIZE3D << std::endl;
    std::cout << " - Y_SIZE3D         = " << Y_SIZE3D << std::endl;
    std::cout << " - Z_SIZE3D         = " << Z_SIZE3D << std::endl;
    std::cout << " - NB_PROCS_MAX     = " << NB_PROCS_MAX <<  std::endl;
    std::cout << " - NB_DMA_CHANNELS  = " << NB_DMA_CHANNELS <<  std::endl;
    std::cout << " - NB_TTY_CHANNELS  = " << NB_TTY_CHANNELS <<  std::endl;
    std::cout << " - NB_NIC_CHANNELS  = " << NB_NIC_CHANNELS <<  std::endl;
    std::cout << " - MEMC_WAYS        = " << MEMC_WAYS << std::endl;
    std::cout << " - MEMC_SETS        = " << MEMC_SETS << std::endl;
    std::cout << " - RAM_LATENCY      = " << XRAM_LATENCY << std::endl;
    std::cout << " - MAX_FROZEN       = " << frozen_cycles << std::endl;

    std::cout << std::endl;
    // Internal and External VCI parameters definition
    typedef soclib::caba::VciParams<vci_cell_width_int,
                                    vci_plen_width,
                                    vci_address_width,
                                    vci_rerror_width,
                                    vci_clen_width,
                                    vci_rflag_width,
                                    vci_srcid_width,
                                    vci_pktid_width,
                                    vci_trdid_width,
                                    vci_wrplen_width> vci_param_int;

    typedef soclib::caba::VciParams<vci_cell_width_ext,
                                    vci_plen_width,
                                    vci_address_width,
                                    vci_rerror_width,
                                    vci_clen_width,
                                    vci_rflag_width,
                                    vci_srcid_width,
                                    vci_pktid_width,
                                    vci_trdid_width,
                                    vci_wrplen_width> vci_param_ext;

#ifdef _OPENMP
   omp_set_dynamic(false);
   omp_set_num_threads(threads_nr);
   std::cerr << "Built with openmp version " << _OPENMP << std::endl;
#endif

   // Define parameters depending on mesh size
   size_t   x_width;
   size_t   y_width;
   size_t   z_width;

#ifdef USE_ALMOS
   if      (X_SIZE3D == 1) x_width = 0;
   else if (X_SIZE3D == 2) x_width = 1;
   else if (X_SIZE3D <= 4) x_width = 2;
   else if (X_SIZE3D <= 8) x_width = 3;
   else                  x_width = 4;

   if      (Y_SIZE3D == 1) y_width = 0;
   else if (Y_SIZE3D == 2) y_width = 1;
   else if (Y_SIZE3D <= 4) y_width = 2;
   else if (Y_SIZE3D <= 8) y_width = 3;
   else                  y_width = 4;

   if      (Z_SIZE3D == 1) z_width = 0;
   else if (Z_SIZE3D == 2) z_width = 1;
   else if (Z_SIZE3D <= 4) z_width = 2;
   else if (Z_SIZE3D <= 8) z_width = 3;
   else                  z_width = 4;

#else
   x_width = X_WIDTH3D;
   y_width = Y_WIDTH3D;
   z_width = Z_WIDTH3D;

   assert(((X_WIDTH3D + Y_WIDTH3D + Z_WIDTH3D) <= 8) and
           "Up to 256 clusters");

   assert((X_SIZE3D <= (1 << X_WIDTH3D)) and (Y_SIZE3D <= (1 << Y_WIDTH3D)) and
           (Z_SIZE3D <= (1 << Z_WIDTH3D)) and
	   "The X_WIDTH3D and Y_WIDTH3D and Z_WIDTH3D parameter are insufficient");

#endif

   /////////////////////
   //  Mapping Tables
   /////////////////////

   // internal network
   MappingTable maptabd(vci_address_width, 
                        IntTab(x_width + y_width + z_width, 16 - x_width - y_width - z_width), 
                        IntTab(x_width + y_width + z_width, vci_srcid_width - x_width - y_width - z_width), 
                        0x00FF000000);

   for (size_t x = 0; x < X_SIZE3D; x++) {
      for (size_t y = 0; y < Y_SIZE3D; y++) {
         for (size_t z = 0; z < Z_SIZE3D; z++) {
             sc_uint<vci_address_width> offset;
             offset = (sc_uint<vci_address_width>) cluster(x,y,z) 
                   << (vci_address_width - x_width - y_width - z_width);

             std::ostringstream    si;
             si << "seg_xicu_" << x << "_" << y << "_" << z;
             maptabd.add(Segment(si.str(), SEG_XCU_BASE + offset, SEG_XCU_SIZE, 
                  IntTab(cluster(x,y,z), XCU_TGTID), false));

             std::ostringstream    sd;
             sd << "seg_mdma_" << x << "_" << y << "_" << z;
             maptabd.add(Segment(sd.str(), SEG_DMA_BASE + offset, SEG_DMA_SIZE, 
                  IntTab(cluster(x,y,z), DMA_TGTID), false));

             std::ostringstream    sh;
             sh << "seg_memc_" << x << "_" << y << "_" << z;
             maptabd.add(Segment(sh.str(), SEG_RAM_BASE + offset, SEG_RAM_SIZE, 
                  IntTab(cluster(x,y,z), RAM_TGTID), true));

             if (x == X_IO3D && y == Y_IO3D && z == Z_IO3D) {
                maptabd.add(Segment("seg_mtty", SEG_TTY_BASE, SEG_TTY_SIZE, 
                        IntTab(cluster(x,y,z),TTY_TGTID), false));
                maptabd.add(Segment("seg_fbuf", SEG_FBF_BASE, SEG_FBF_SIZE, 
                        IntTab(cluster(x,y,z),FBF_TGTID), false));
                maptabd.add(Segment("seg_bdev", SEG_IOC_BASE, SEG_IOC_SIZE, 
                        IntTab(cluster(x,y,z),IOC_TGTID), false));
                maptabd.add(Segment("seg_brom", SEG_ROM_BASE, SEG_ROM_SIZE, 
                        IntTab(cluster(x,y,z),ROM_TGTID), true));
                maptabd.add(Segment("seg_mnic", SEG_NIC_BASE, SEG_NIC_SIZE, 
                        IntTab(cluster(x,y,z),NIC_TGTID), false));
                maptabd.add(Segment("seg_cdma", SEG_CMA_BASE, SEG_CMA_SIZE, 
                        IntTab(cluster(x,y,z),CMA_TGTID), false));
                maptabd.add(Segment("seg_simh", SEG_SIM_BASE, SEG_SIM_SIZE, 
                        IntTab(cluster(x,y,z),SIM_TGTID), false));
             }
	 }
      }
   }
   std::cout << maptabd << std::endl;

   // external network
   MappingTable maptabx(vci_address_width, 
                        IntTab(x_width + y_width + z_width), 
                        IntTab(x_width + y_width + z_width), 
                        0xFFFF000000ULL);

   for (size_t x = 0; x < X_SIZE3D; x++) {
      for (size_t y = 0; y < Y_SIZE3D ; y++) {
         for (size_t z = 0; z < Z_SIZE3D ; z++) {

            sc_uint<vci_address_width> offset;
            offset = (sc_uint<vci_address_width>) cluster(x,y,z) 
                   << (vci_address_width - x_width - y_width - z_width);

            std::ostringstream sh;
            sh << "x_seg_memc_" << x << "_" << y << "_" << z;

            maptabx.add(Segment(sh.str(), SEG_RAM_BASE + offset, 
                     SEG_RAM_SIZE, IntTab(cluster(x,y,z)), false));
	 }
      }
   }
   std::cout << maptabx << std::endl;

   ////////////////////
   // Signals
   ///////////////////

   sc_clock           signal_clk("clk");
   sc_signal<bool>    signal_resetn("resetn");

   // Z-axis inter-clusters DSPIN signals
   DspinSignals<dspin_cmd_width>* signal_dspin_z_cmd_inc =
      alloc_elems<DspinSignals<dspin_cmd_width> >("signal_dspin_h_cmd_inc", Z_SIZE3D + 1);
   DspinSignals<dspin_cmd_width>* signal_dspin_z_cmd_dec =
      alloc_elems<DspinSignals<dspin_cmd_width> >("signal_dspin_h_cmd_dec", Z_SIZE3D + 1);

   DspinSignals<dspin_rsp_width>* signal_dspin_z_rsp_inc =
      alloc_elems<DspinSignals<dspin_rsp_width> >("signal_dspin_h_rsp_inc", Z_SIZE3D + 1);
   DspinSignals<dspin_rsp_width>* signal_dspin_z_rsp_dec =
      alloc_elems<DspinSignals<dspin_rsp_width> >("signal_dspin_h_rsp_dec", Z_SIZE3D + 1);

   DspinSignals<dspin_cmd_width>* signal_dspin_z_m2p_inc =
      alloc_elems<DspinSignals<dspin_cmd_width> >("signal_dspin_h_m2p_inc", Z_SIZE3D + 1);
   DspinSignals<dspin_cmd_width>* signal_dspin_z_m2p_dec =
      alloc_elems<DspinSignals<dspin_cmd_width> >("signal_dspin_h_m2p_dec", Z_SIZE3D + 1);

   DspinSignals<dspin_rsp_width>* signal_dspin_z_p2m_inc =
      alloc_elems<DspinSignals<dspin_rsp_width> >("signal_dspin_h_p2m_inc", Z_SIZE3D + 1);
   DspinSignals<dspin_rsp_width>* signal_dspin_z_p2m_dec =
      alloc_elems<DspinSignals<dspin_rsp_width> >("signal_dspin_h_p2m_dec", Z_SIZE3D + 1);

   DspinSignals<dspin_cmd_width>* signal_dspin_z_cla_inc =
      alloc_elems<DspinSignals<dspin_cmd_width> >("signal_dspin_h_cla_inc", Z_SIZE3D + 1);
   DspinSignals<dspin_cmd_width>* signal_dspin_z_cla_dec =
      alloc_elems<DspinSignals<dspin_cmd_width> >("signal_dspin_h_cla_dec", Z_SIZE3D + 1);

   ////////////////////////////
   //      Loader    
   ////////////////////////////

   soclib::common::Loader loader(soft_name);

   typedef soclib::common::GdbServer<soclib::common::Mips32ElIss> proc_iss;
   proc_iss::set_loader(loader);

   ////////////////////////////
   // Clusters construction
   ////////////////////////////

   TsarSuperCluster<dspin_cmd_width,
                   dspin_rsp_width,
                   vci_param_int,
                   vci_param_ext> * clusters[Z_SIZE3D];

   for (size_t z = 0; z  < Z_SIZE3D; z++) {
            std::cout << std::endl;
            std::cout << "SuperCluster_" << z << std::endl;

            std::ostringstream sc;
            sc << "scluster_" << z;
            clusters[z] = new TsarSuperCluster<dspin_cmd_width,
                                                 dspin_rsp_width,
                                                 vci_param_int,
                                                 vci_param_ext>
            (
                sc.str().c_str(),
                NB_PROCS_MAX,
                NB_TTY_CHANNELS,
                NB_DMA_CHANNELS,
                X_SIZE3D,
                Y_SIZE3D,
                z,
		ELEVATOR_X,
		ELEVATOR_Y,
                maptabd,
                maptabx,
                x_width,
                y_width,
                z_width,
                P_WIDTH,
		vci_srcid_width,
                RAM_TGTID,
                XCU_TGTID,
                DMA_TGTID,
                FBF_TGTID,
                TTY_TGTID,
                ROM_TGTID,
                NIC_TGTID,
                CMA_TGTID,
                IOC_TGTID,
                SIM_TGTID,
                MEMC_WAYS,
                MEMC_SETS,
                L1_IWAYS,
                L1_ISETS,
                L1_DWAYS,
                L1_DSETS,
                IRQ_PER_PROCESSOR,
                XRAM_LATENCY,
                X_IO3D,
		Y_IO3D,
		Z_IO3D,
                FBF_X_SIZE,
                FBF_Y_SIZE,
                disk_name,
                BDEV_SECTOR_SIZE,
                NB_NIC_CHANNELS,
                nic_rx_name,
                nic_tx_name,
                NIC_TIMEOUT,
                NB_CMA_CHANNELS,
                loader,
                frozen_cycles,
                debug_from,
                debug_ok,
                debug_ok
            );

    }

   ///////////////////////////////////////////////////////////////
   //     Net-list 
   ///////////////////////////////////////////////////////////////

   for (int z = 0; z < Z_SIZE3D; z++) {
       // Clock & RESET
       clusters[z]->p_clk                      (signal_clk);
       clusters[z]->p_resetn                   (signal_resetn);

       // Inter Clusters Z connections
#define UP    0
#define DOWN  1
         clusters[z]->p_cmd_out[UP]      (signal_dspin_z_cmd_inc[z + 1]);
         clusters[z]->p_cmd_in[DOWN]     (signal_dspin_z_cmd_inc[z]);
         clusters[z]->p_cmd_in[UP]       (signal_dspin_z_cmd_dec[z + 1]);
         clusters[z]->p_cmd_out[DOWN]    (signal_dspin_z_cmd_dec[z]);

         clusters[z]->p_rsp_out[UP]      (signal_dspin_z_rsp_inc[z + 1]);
         clusters[z]->p_rsp_in[DOWN]     (signal_dspin_z_rsp_inc[z]);
         clusters[z]->p_rsp_in[UP]       (signal_dspin_z_rsp_dec[z + 1]);
         clusters[z]->p_rsp_out[DOWN]    (signal_dspin_z_rsp_dec[z]);

         clusters[z]->p_m2p_out[UP]      (signal_dspin_z_m2p_inc[z + 1]);
         clusters[z]->p_m2p_in[DOWN]     (signal_dspin_z_m2p_inc[z]);
         clusters[z]->p_m2p_in[UP]       (signal_dspin_z_m2p_dec[z + 1]);
         clusters[z]->p_m2p_out[DOWN]    (signal_dspin_z_m2p_dec[z]);

         clusters[z]->p_p2m_out[UP]      (signal_dspin_z_p2m_inc[z + 1]);
         clusters[z]->p_p2m_in[DOWN]     (signal_dspin_z_p2m_inc[z]);
         clusters[z]->p_p2m_in[UP]       (signal_dspin_z_p2m_dec[z + 1]);
         clusters[z]->p_p2m_out[DOWN]    (signal_dspin_z_p2m_dec[z]);

         clusters[z]->p_cla_out[UP]      (signal_dspin_z_cla_inc[z + 1]);
         clusters[z]->p_cla_in[DOWN]     (signal_dspin_z_cla_inc[z]);
         clusters[z]->p_cla_in[UP]       (signal_dspin_z_cla_dec[z + 1]);
         clusters[z]->p_cla_out[DOWN]    (signal_dspin_z_cla_dec[z]);
   }
   std::cout << std::endl << "Z connections done" << std::endl;

#ifdef WT_IDL
    std::list<VciCcVCacheWrapper<vci_param_int,
        dspin_cmd_width,
        dspin_rsp_width,
        GdbServer<Mips32ElIss> > * > l1_caches;

   for (int x = 0; x < X_SIZE3D; x++) {
      for (int y = 0; y < Y_SIZE3D; y++) {
         for (int proc = 0; proc < NB_PROCS_MAX; proc++) {
            l1_caches.push_back(clusters[x][y]->proc[proc]);
         }
      }
   }

   for (int x = 0; x < X_SIZE3D; x++) {
      for (int y = 0; y < Y_SIZE3D; y++) {
         clusters[x][y]->memc->set_vcache_list(l1_caches);
      }
   }
#endif


// #define SC_TRACE
#ifdef SC_TRACE
   sc_trace_file * tf = sc_create_vcd_trace_file("my_trace_file");

#if 0
   for (int x = 0; x < X_SIZE3D - 1; x++) {
      for (int y = 0; y < Y_SIZE3D; y++) {
         for (int k = 0; k < 3; k++) {
            signal_dspin_h_cmd_inc[x][y][k].trace(tf, "dspin_h_cmd_inc");
            signal_dspin_h_cmd_dec[x][y][k].trace(tf, "dspin_h_cmd_dec");
         }

         for (int k = 0; k < 2; k++) {
            signal_dspin_h_rsp_inc[x][y][k].trace(tf, "dspin_h_rsp_inc");
            signal_dspin_h_rsp_dec[x][y][k].trace(tf, "dspin_h_rsp_dec");
         }
      }
   }

   for (int y = 0; y < Y_SIZE3D - 1; y++) {
      for (int x = 0; x < X_SIZE3D; x++) {
         for (int k = 0; k < 3; k++) {
            signal_dspin_v_cmd_inc[x][y][k].trace(tf, "dspin_v_cmd_inc");
            signal_dspin_v_cmd_dec[x][y][k].trace(tf, "dspin_v_cmd_dec");
         }

         for (int k = 0; k < 2; k++) {
            signal_dspin_v_rsp_inc[x][y][k].trace(tf, "dspin_v_rsp_inc");
            signal_dspin_v_rsp_dec[x][y][k].trace(tf, "dspin_v_rsp_dec");
         }
      }
   }

   for (int x = 0; x < (X_SIZE3D); x++) {
      for (int y = 0; y < Y_SIZE3D; y++) {
         std::ostringstream signame;
         signame << "cluster" << x << "_" << y;
         clusters[x][y]->trace(tf, signame.str());
      }
   }
#endif
   for (size_t z = 0; z < (Z_SIZE3D); z++) {
         clusters[z]->trace(tf);
   }
#endif


   ////////////////////////////////////////////////////////
   //   Simulation
   ///////////////////////////////////////////////////////

   sc_start(sc_core::sc_time(0, SC_NS));
   signal_resetn = false;

   // set network boundaries signals default values
   // for all boundary clusters
   
   signal_dspin_z_cmd_inc[0].write = false;
   signal_dspin_z_cmd_inc[0].read = true;
   signal_dspin_z_cmd_dec[0].write = false;
   signal_dspin_z_cmd_dec[0].read = true;
   signal_dspin_z_cmd_inc[Z_SIZE3D].write = false;
   signal_dspin_z_cmd_inc[Z_SIZE3D].read = true;
   signal_dspin_z_cmd_dec[Z_SIZE3D].write = false;
   signal_dspin_z_cmd_dec[Z_SIZE3D].read = true;

   signal_dspin_z_rsp_inc[0].write = false;
   signal_dspin_z_rsp_inc[0].read = true;
   signal_dspin_z_rsp_dec[0].write = false;
   signal_dspin_z_rsp_dec[0].read = true;
   signal_dspin_z_rsp_inc[Z_SIZE3D].write = false;
   signal_dspin_z_rsp_inc[Z_SIZE3D].read = true;
   signal_dspin_z_rsp_dec[Z_SIZE3D].write = false;
   signal_dspin_z_rsp_dec[Z_SIZE3D].read = true;

   signal_dspin_z_p2m_inc[0].write = false;
   signal_dspin_z_p2m_inc[0].read = true;
   signal_dspin_z_p2m_dec[0].write = false;
   signal_dspin_z_p2m_dec[0].read = true;
   signal_dspin_z_p2m_inc[Z_SIZE3D].write = false;
   signal_dspin_z_p2m_inc[Z_SIZE3D].read = true;
   signal_dspin_z_p2m_dec[Z_SIZE3D].write = false;
   signal_dspin_z_p2m_dec[Z_SIZE3D].read = true;

   signal_dspin_z_m2p_inc[0].write = false;
   signal_dspin_z_m2p_inc[0].read = true;
   signal_dspin_z_m2p_dec[0].write = false;
   signal_dspin_z_m2p_dec[0].read = true;
   signal_dspin_z_m2p_inc[Z_SIZE3D].write = false;
   signal_dspin_z_m2p_inc[Z_SIZE3D].read = true;
   signal_dspin_z_m2p_dec[Z_SIZE3D].write = false;
   signal_dspin_z_m2p_dec[Z_SIZE3D].read = true;

   signal_dspin_z_cla_inc[0].write = false;
   signal_dspin_z_cla_inc[0].read = true;
   signal_dspin_z_cla_dec[0].write = false;
   signal_dspin_z_cla_dec[0].read = true;
   signal_dspin_z_cla_inc[Z_SIZE3D].write = false;
   signal_dspin_z_cla_inc[Z_SIZE3D].read = true;
   signal_dspin_z_cla_dec[Z_SIZE3D].write = false;
   signal_dspin_z_cla_dec[Z_SIZE3D].read = true;

   for (int z = 0; z < Z_SIZE3D; z++) {
       clusters[z]->reset();
   }

   sc_start(sc_core::sc_time(1, SC_NS));
   signal_resetn = true;

   if (debug_ok) {
      if (gettimeofday(&t1, NULL) != 0) {
         perror("gettimeofday");
         return EXIT_FAILURE;
      }

      for (int64_t n = 1; n < ncycles && !stop_called; n++) {
         if ((n % max_cycles) == 0) {

            if (gettimeofday(&t2, NULL) != 0) {
               perror("gettimeofday");
               return EXIT_FAILURE;
            }

            ms1 = (uint64_t) t1.tv_sec * 1000ULL + (uint64_t) t1.tv_usec / 1000;
            ms2 = (uint64_t) t2.tv_sec * 1000ULL + (uint64_t) t2.tv_usec / 1000;
            std::cerr << "platform clock frequency " << (double) 5000000 / (double) (ms2 - ms1) << "Khz" << std::endl;

            if (gettimeofday(&t1, NULL) != 0)
            {
               perror("gettimeofday");
               return EXIT_FAILURE;
            }
         }


         if ((n > debug_from) and (n % debug_period == 0)) {
            std::cout << "****************** cycle " << std::dec << n ;
            std::cout << "************************************************" << std::endl;
	 }

         sc_start(sc_core::sc_time(1, SC_NS));
      }
   }
   else {
      int64_t n = 0;
      while (!stop_called && n != ncycles) {
         if (gettimeofday(&t1, NULL) != 0) {
            perror("gettimeofday");
            return EXIT_FAILURE;
         }
         int64_t nb_cycles = min(max_cycles, ncycles - n);
         if (do_reset_counters) {
            nb_cycles = min(nb_cycles, reset_counters - n);
         }
         if (do_dump_counters) {
            nb_cycles = min(nb_cycles, dump_counters - n);
         }

         sc_start(sc_core::sc_time(nb_cycles, SC_NS));
         n += nb_cycles;

         if (gettimeofday(&t2, NULL) != 0) {
            perror("gettimeofday");
            return EXIT_FAILURE;
         }
         ms1 = (uint64_t) t1.tv_sec * 1000ULL + (uint64_t) t1.tv_usec / 1000;
         ms2 = (uint64_t) t2.tv_sec * 1000ULL + (uint64_t) t2.tv_usec / 1000;
         std::cerr << std::dec << "cycle " << n << " platform clock frequency " << (double) nb_cycles / (double) (ms2 - ms1) << "Khz" << std::endl;
      }
   }


   return EXIT_SUCCESS;
}


// SIGINT handler (installed by sc_main): raises the stop_called flag polled
// by the simulation loops, then asks the SystemC kernel to stop.
void handler(int dummy = 0)
{
   (void) dummy;   // the signal number is not used

   stop_called = true;
   sc_stop();
}

// Signal handler that deliberately does nothing
// (its only reference in this file is a commented-out SIGPIPE registration).
void voidhandler(int dummy = 0)
{
   (void) dummy;   // the signal number is not used
}

// SystemC entry point: installs the SIGINT handler, then runs _main.
// Returns the status of _main, or 1 when _main throws a std::exception
// (whose message is printed). Any other exception is reported and rethrown.
int sc_main (int argc, char *argv[]) {
   signal(SIGINT, handler);
   //signal(SIGPIPE, voidhandler);

   int status = 1;   // value returned when a std::exception is caught

   try {
      status = _main(argc, argv);
   }
   catch (std::exception &e) {
      std::cout << e.what() << std::endl;
   }
   catch (...) {
      std::cout << "Unknown exception occured" << std::endl;
      throw;
   }

   return status;
}


// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3
