/////////////////////////////////////////////////////////////////////////
// File: top.cpp 
// Author: Alain Greiner 
// Copyright: UPMC/LIP6
// Date : august 2012
// This program is released under the GNU public license
/////////////////////////////////////////////////////////////////////////
// This file defines a generic, clusterized communication architecture,
// as used in the TSAR project, using only synthetic VCI initiators
// and targets on the direct network, and synthetic packet generators
// on the coherence networks.
// It can be used to characterize the communication infrastructure.
// The number of clusters cannot be larger than 1024.
// 
// The communication infrastructure contains 3 "independent" networks:
// - direct network (VCI command/response)
// - cc40   coherence network (one-way DSPIN network)
// - cc33   coherence network (one-way DSPIN network)
//
// It is built with one single component. The SimpleCluster contains:
// - four dspin_local_crossbar per cluster as local interconnect 
// - two virtual_dspin routers per cluster as global interconnect
// - one VCI initiator per cluster on direct network.
// - one VCI target per cluster on direct network.
// - one source per cluster on cc40 coherence network
// - one source per cluster on cc33 coherence network
//
// The packet length and offered load can be independently defined
// on the three networks, and broadcast packets can be generated
// with a fixed period on the cc40 and cc33 DSPIN networks.
//
// The main parameters are
// - x_size        : number of clusters in a row (power of 2)
// - y_size        : number of clusters in a column (power of 2)
// - load_d        : offered load per initiator on direct network (*1000)
// - load_c40      : offered load per initiator on cc40 network (*1000)
// - load_c33      : offered load per initiator on cc33 network (*1000)
// - plen_d        : packet length (in flits) on direct network
// - plen_c40      : packet length (in flits) on cc40 network
// - plen_c33      : packet length (in flits) on cc33 network
// - bcp_c40       : broadcast period (in cycles) on cc40 network
// - bcp_c33       : broadcast period (in cycles) on cc33 network
//
// The dspin_local crossbar does not use any routing table, and
// directly decodes the MSB bits of either the VCI address or the
// DSPIN first flit: the (x_width + y_width) MSB bits (left aligned)
// define the cluster index, and the l_width LSB bits are not used here.
//      | X_ID  | Y_ID  | L_ID  |     OFFSET          |
//      |x_width|y_width|l_width|---------------------|
/////////////////////////////////////////////////////////////////////////

#include <systemc>
#include <sys/time.h>
#include <iostream>
#include <sstream>
#include <cassert>
#include <cstdlib>
#include <cstdarg>
#include <cstring>
#include <stdint.h>

#include "simple_cluster.h"
#include "alloc_elems.h"

///////////////////////////////////////////////////
//               Parallelisation
///////////////////////////////////////////////////
// Compile-time switch: when set to a non-zero value, <omp.h> is included
// and the OpenMP-specific sections of _main (thread-count setup and the
// "#pragma omp" directives around cluster construction) are compiled in.
// With the value 0 used here, those sections are left out of the build.
#define USE_OPENMP            0

#if USE_OPENMP
#include <omp.h>
#endif

///////////////////////////////////////////////////////////
//          DSPIN networks parameters           
///////////////////////////////////////////////////////////

// Template arguments given to DspinSignals<> and SimpleCluster<> in _main:
// cmd_width is used for every "cmd" signal array, rsp_width for every
// "rsp" signal array.
// NOTE(review): presumably the DSPIN flit widths in bits (the file header
// names the networks cc40 and cc33) -- confirm against the DSPIN components.
#define cmd_width            40
#define rsp_width            33

///////////////////////////////////////////////////////////
//          VCI parameters           
///////////////////////////////////////////////////////////

// Field widths used to instantiate soclib::caba::VciParams<> in _main.
// They are passed in this order: cell, plen, address, error, clen, rflag,
// srcid, pktid, trdid, wrplen.
// NOTE(review): the unit of each value (bits vs. bytes) is defined by
// VciParams, which is not visible in this file -- confirm before changing.
#define cell_width            4
#define address_width         32
#define plen_width            8
#define error_width           2
#define clen_width            1
#define rflag_width           1
#define srcid_width           14
#define pktid_width           4
#define trdid_width           4
#define wrplen_width          1

/////////////////////////////////
int _main(int argc, char *argv[])
{
   using namespace sc_core;
   using namespace soclib::caba;
   using namespace soclib::common;

   size_t   ncycles          = 1000000000;         // simulated cycles
   size_t   threads          = 1;                  // simulator's threads number
   size_t   x_size           = 2;                  // number of columns in 2D mesh
   size_t   y_size           = 2;                  // number of rows in 2D mesh
   size_t   load_d           = 50;                 // load (*1000) on direct network.
   size_t   load_c40         = 0;                  // load (*1000) on cc40 network.
   size_t   load_c33         = 0;                  // load (*1000) on cc33 network.
   size_t   plen_d           = 4;                  // packet length on direct network.
   size_t   plen_c40         = 2;                  // packet length on cc40 network.
   size_t   plen_c33         = 2;                  // packet length on cc33 network.
   size_t   bcp_c40          = 0;                  // broadcast period on cc40 network
   size_t   bcp_c33          = 0;                  // broadcast period on cc33 network

   bool     debug_ok         = false;              // trace activated
   uint32_t debug_from       = 0;                  // trace start cycle
   bool     stats_ok         = false;              // stats activated
   uint32_t stats_period     = 0;                  // period

   ////////////// command line arguments //////////////////////
   if (argc > 1)
   {
      for (int n = 1; n < argc; n = n + 2)
      {
         if ((strcmp(argv[n],"-NCYCLES") == 0) && (n+1<argc))
         {
            ncycles = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-XSIZE") == 0) && (n+1<argc))
         {
            x_size  = atoi(argv[n+1]);
            assert( ( (x_size == 1) or (x_size == 2) or (x_size == 4) or
                      (x_size == 8) or (x_size == 16) or (x_size == 32) ) and
                      "The x_size parameter must be 1, 2, 4, 8, 16 or 32" );
         }
         else if ((strcmp(argv[n],"-YSIZE") == 0) && (n+1<argc))
         {
            y_size  = atoi(argv[n+1]);
            assert( ( (y_size == 1) or (y_size == 2) or (y_size == 4) or
                      (y_size == 8) or (y_size == 16) or (y_size == 32) ) and
                      "The y_size parameter must be 1, 2, 4, 8, 16 or 32" );
         }
         else if ((strcmp(argv[n],"-BCP40") == 0) && (n+1<argc))
         {
            bcp_c40 = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-BCP33") == 0) && (n+1<argc))
         {
            bcp_c33 = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-LOAD") == 0) && (n+1<argc))
         {
            load_d = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-LOAD40") == 0) && (n+1<argc))
         {
            load_c40 = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-LOAD33") == 0) && (n+1<argc))
         {
            load_c33 = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-TRACE") == 0) && (n+1<argc) )
         {
            debug_ok = true;
            debug_from = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n],"-STATS") == 0) && (n+1<argc) )
         {
            stats_ok = true;
            stats_period = atoi(argv[n+1]);
         }
         else if ((strcmp(argv[n], "-THREADS") == 0) && ((n+1) < argc))
         {
            threads = atoi(argv[n+1]);
            threads = (threads < 1) ? 1 : threads;
         }
         else
         {
            std::cout << "   Arguments are (key,value) couples." << std::endl;
            std::cout << "   The order is not important." << std::endl;
            std::cout << "     -NCYCLES number_of_simulated_cycles" << std::endl;
            std::cout << "     -XSIZE   number_of_columns" << std::endl;
            std::cout << "     -YSIZE   number_of_rows" << std::endl;
            std::cout << "     -BCP40   broadcast_period_on_CC40_network" << std::endl;
            std::cout << "     -BCP33   broadcast_period_on_CC33_network" << std::endl;
            std::cout << "     -LOAD    load*1000_on_direct_network" << std::endl;
            std::cout << "     -LOAD40  load*1000_on_cc40_network" << std::endl;
            std::cout << "     -LOAD33  load*1000_on_cc33_network" << std::endl;
            std::cout << "     -TRACE   debug_start_cycle" << std::endl;
            std::cout << "     -STATS   period" << std::endl;
            std::cout << "     -THREADS simulator_threads_number" << std::endl;
            exit(0);
         }
      }
   }

   std::cout << std::endl;
   std::cout << "- x_size     = " << x_size << std::endl;
   std::cout << "- y_size     = " << y_size << std::endl;
   std::cout << "- load_d     = " << load_d <<  std::endl;
   std::cout << "- load_c40   = " << load_c40 <<  std::endl;
   std::cout << "- load_c33   = " << load_c33 <<  std::endl;
   std::cout << "- bcp_c40    = " << bcp_c40 <<  std::endl;
   std::cout << "- bcp_c33    = " << bcp_c33 <<  std::endl;

   std::cout << std::endl;

#if USE_OPENMP
   omp_set_dynamic(false);
   omp_set_num_threads(threads_nr);
   std::cerr << "Built with openmp version " << _OPENMP << std::endl;
#endif

   // Define VCI parameters
   typedef soclib::caba::VciParams<cell_width,
           plen_width,
           address_width,
           error_width,                                   
           clen_width,
           rflag_width,
           srcid_width,
           pktid_width,
           trdid_width,
           wrplen_width> vci_param;

   // Define parameters depending on mesh size
   size_t   x_width;
   size_t   y_width;

   if      (x_size == 1)  x_width = 0;
   else if (x_size == 2)  x_width = 1;
   else if (x_size <= 4)  x_width = 2;
   else if (x_size <= 8)  x_width = 3;
   else if (x_size <= 16) x_width = 4;
   else                   x_width = 5;

   if      (y_size == 1)  y_width = 0;
   else if (y_size == 2)  y_width = 1;
   else if (y_size <= 4)  y_width = 2;
   else if (y_size <= 8)  y_width = 3;
   else if (y_size <= 16) y_width = 4;
   else                   y_width = 5;

   ////////////////////
   // Signals
   ///////////////////

   sc_clock      signal_clk("clk");
   sc_signal<bool>    signal_resetn("resetn");

   // Horizontal inter-clusters DSPIN signals
   DspinSignals<cmd_width>*** signal_dspin_h_cmd_inc =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_h_cmd_inc", x_size-1, y_size, 2);
   DspinSignals<cmd_width>*** signal_dspin_h_cmd_dec =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_h_cmd_dec", x_size-1, y_size, 2);
   DspinSignals<rsp_width>*** signal_dspin_h_rsp_inc =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_h_rsp_inc", x_size-1, y_size, 2);
   DspinSignals<rsp_width>*** signal_dspin_h_rsp_dec =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_h_rsp_dec", x_size-1, y_size, 2);

   // Vertical inter-clusters DSPIN signals
   DspinSignals<cmd_width>*** signal_dspin_v_cmd_inc =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_v_cmd_inc", x_size, y_size-1, 2);
   DspinSignals<cmd_width>*** signal_dspin_v_cmd_dec =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_v_cmd_dec", x_size, y_size-1, 2);
   DspinSignals<rsp_width>*** signal_dspin_v_rsp_inc =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_v_rsp_inc", x_size, y_size-1, 2);
   DspinSignals<rsp_width>*** signal_dspin_v_rsp_dec =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_v_rsp_dec", x_size, y_size-1, 2);

   // Mesh boundaries DSPIN signals
   DspinSignals<cmd_width>**** signal_dspin_false_cmd_in =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_false_cmd_in", x_size, y_size, 2, 4);
   DspinSignals<cmd_width>**** signal_dspin_false_cmd_out =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_false_cmd_out", x_size, y_size, 2, 4);
   DspinSignals<rsp_width>**** signal_dspin_false_rsp_in =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_false_rsp_in", x_size, y_size, 2, 4);
   DspinSignals<rsp_width>**** signal_dspin_false_rsp_out =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_false_rsp_out", x_size, y_size, 2, 4);

    ////////////////////////////
    // Clusters construction
    ////////////////////////////

    SimpleCluster<vci_param, cmd_width, rsp_width>* cluster[32][32];

#if USE_OPENMP
#pragma omp parallel
    {
#pragma omp for
#endif
        for(size_t i = 0; i  < (x_size * y_size); i++)
        {
            size_t x = i / y_size;
            size_t y = i % y_size;

#if USE_OPENMP
#pragma omp critical
            {
#endif
                std::ostringstream sc;
                sc << "cluster_" << x << "_" << y;
                std::cout << "****************** " << sc.str().c_str() 
                          << " ******************" << std::endl;

                cluster[x][y] = new SimpleCluster<vci_param, cmd_width, rsp_width>
                                    ( sc.str().c_str(),
                                      x,
                                      y,
                                      x_width,
                                      y_width,
                                      load_d,
                                      plen_d,
                                      load_c40,
                                      plen_c40,
                                      load_c33,
                                      plen_c33,
                                      bcp_c40,
                                      bcp_c33 );

#if USE_OPENMP
            } // end critical
#endif
        } // end for
#if USE_OPENMP
    }
#endif

    ///////////////////////////////////////////////////////////////
    //     Net-list 
    ///////////////////////////////////////////////////////////////

    // Clock & RESET
    for (size_t x = 0; x < (x_size); x++)
    {
        for (size_t y = 0; y < y_size; y++)
        {
            cluster[x][y]->p_clk     (signal_clk);
            cluster[x][y]->p_resetn  (signal_resetn);
        }
    }

    // Inter Clusters horizontal connections
    for (size_t x = 0; x < (x_size-1); x++)
    {
        for (size_t y = 0; y < y_size; y++)
        {
            for (size_t k = 0; k < 2; k++)
            {
                cluster[x][y]->p_cmd_out[k][EAST]     (signal_dspin_h_cmd_inc[x][y][k]);
                cluster[x+1][y]->p_cmd_in[k][WEST]    (signal_dspin_h_cmd_inc[x][y][k]);
                cluster[x][y]->p_cmd_in[k][EAST]      (signal_dspin_h_cmd_dec[x][y][k]);
                cluster[x+1][y]->p_cmd_out[k][WEST]   (signal_dspin_h_cmd_dec[x][y][k]);
                cluster[x][y]->p_rsp_out[k][EAST]     (signal_dspin_h_rsp_inc[x][y][k]);
                cluster[x+1][y]->p_rsp_in[k][WEST]    (signal_dspin_h_rsp_inc[x][y][k]);
                cluster[x][y]->p_rsp_in[k][EAST]      (signal_dspin_h_rsp_dec[x][y][k]);
                cluster[x+1][y]->p_rsp_out[k][WEST]   (signal_dspin_h_rsp_dec[x][y][k]);
            }
        }
    }
    std::cout << std::endl << "Horizontal connections established" << std::endl;   

    // Inter Clusters vertical connections
    for (size_t y = 0; y < (y_size-1); y++)
    {
        for (size_t x = 0; x < x_size; x++)
        {
            for (size_t k = 0; k < 2; k++)
            {
                cluster[x][y]->p_cmd_out[k][NORTH]    (signal_dspin_v_cmd_inc[x][y][k]);
                cluster[x][y+1]->p_cmd_in[k][SOUTH]   (signal_dspin_v_cmd_inc[x][y][k]);
                cluster[x][y]->p_cmd_in[k][NORTH]     (signal_dspin_v_cmd_dec[x][y][k]);
                cluster[x][y+1]->p_cmd_out[k][SOUTH]  (signal_dspin_v_cmd_dec[x][y][k]);
                cluster[x][y]->p_rsp_out[k][NORTH]    (signal_dspin_v_rsp_inc[x][y][k]);
                cluster[x][y+1]->p_rsp_in[k][SOUTH]   (signal_dspin_v_rsp_inc[x][y][k]);
                cluster[x][y]->p_rsp_in[k][NORTH]     (signal_dspin_v_rsp_dec[x][y][k]);
                cluster[x][y+1]->p_rsp_out[k][SOUTH]  (signal_dspin_v_rsp_dec[x][y][k]);
            }
        }
    }
    std::cout << "Vertical connections established" << std::endl;

    // East & West boundary cluster connections
    for (size_t y = 0; y < y_size; y++)
    {
        for (size_t k = 0; k < 2; k++)
        {
            cluster[0][y]->p_cmd_in[k][WEST]         (signal_dspin_false_cmd_in[0][y][k][WEST]);
            cluster[0][y]->p_cmd_out[k][WEST]        (signal_dspin_false_cmd_out[0][y][k][WEST]);
            cluster[0][y]->p_rsp_in[k][WEST]         (signal_dspin_false_rsp_in[0][y][k][WEST]);
            cluster[0][y]->p_rsp_out[k][WEST]        (signal_dspin_false_rsp_out[0][y][k][WEST]);

            cluster[x_size-1][y]->p_cmd_in[k][EAST]  (signal_dspin_false_cmd_in[x_size-1][y][k][EAST]);
            cluster[x_size-1][y]->p_cmd_out[k][EAST] (signal_dspin_false_cmd_out[x_size-1][y][k][EAST]);
            cluster[x_size-1][y]->p_rsp_in[k][EAST]  (signal_dspin_false_rsp_in[x_size-1][y][k][EAST]);
            cluster[x_size-1][y]->p_rsp_out[k][EAST] (signal_dspin_false_rsp_out[x_size-1][y][k][EAST]);
        }
    }

    // North & South boundary cluster connections
    for (size_t x = 0; x < x_size; x++)
    {
        for (size_t k = 0; k < 2; k++)
        {
            cluster[x][0]->p_cmd_in[k][SOUTH]         (signal_dspin_false_cmd_in[x][0][k][SOUTH]);
            cluster[x][0]->p_cmd_out[k][SOUTH]        (signal_dspin_false_cmd_out[x][0][k][SOUTH]);
            cluster[x][0]->p_rsp_in[k][SOUTH]         (signal_dspin_false_rsp_in[x][0][k][SOUTH]);
            cluster[x][0]->p_rsp_out[k][SOUTH]        (signal_dspin_false_rsp_out[x][0][k][SOUTH]);

            cluster[x][y_size-1]->p_cmd_in[k][NORTH]  (signal_dspin_false_cmd_in[x][y_size-1][k][NORTH]);
            cluster[x][y_size-1]->p_cmd_out[k][NORTH] (signal_dspin_false_cmd_out[x][y_size-1][k][NORTH]);
            cluster[x][y_size-1]->p_rsp_in[k][NORTH]  (signal_dspin_false_rsp_in[x][y_size-1][k][NORTH]);
            cluster[x][y_size-1]->p_rsp_out[k][NORTH] (signal_dspin_false_rsp_out[x][y_size-1][k][NORTH]);
        }
    }

    std::cout << std::endl;

    ////////////////////////////////////////////////////////
    //   Simulation
    ///////////////////////////////////////////////////////

    sc_start(sc_core::sc_time(0, SC_NS));
    signal_resetn = false;

    // network boundaries signals
    for (size_t x = 0; x < x_size ; x++)
    {
        for (size_t y = 0; y < y_size ; y++)
        {
            for (size_t k = 0; k < 2; k++)
            {
                for (size_t a = 0; a < 4; a++)
                {
                    signal_dspin_false_cmd_in[x][y][k][a].write = false;
                    signal_dspin_false_cmd_in[x][y][k][a].read = true;
                    signal_dspin_false_cmd_out[x][y][k][a].write = false;
                    signal_dspin_false_cmd_out[x][y][k][a].read = true;

                    signal_dspin_false_rsp_in[x][y][k][a].write = false;
                    signal_dspin_false_rsp_in[x][y][k][a].read = true;
                    signal_dspin_false_rsp_out[x][y][k][a].write = false;
                    signal_dspin_false_rsp_out[x][y][k][a].read = true;
                }
            }
        }
    }

    sc_start(sc_core::sc_time(1, SC_NS));
    signal_resetn = true;

    for (size_t n = 1; n < ncycles; n++)
    {
        if ( debug_ok and (n > debug_from) )
        {
            std::cout << "****************** cycle " << std::dec << n ;
            std::cout << " ************************************************" << std::endl;

            cluster[0][0]->ini_d->print_trace();
            cluster[0][0]->w_ini_d->print_trace();
            cluster[0][0]->tgt_d->print_trace();
            cluster[0][0]->w_tgt_d->print_trace();

            cluster[0][0]->ini_c->print_trace();
            cluster[0][0]->tgt_c->print_trace();

            cluster[0][0]->xbar_cmd_d->print_trace();
            cluster[0][0]->xbar_rsp_d->print_trace();
            cluster[0][0]->xbar_cmd_c->print_trace();
            cluster[0][0]->xbar_rsp_c->print_trace();

            cluster[0][0]->router_cmd->print_trace();
            cluster[0][0]->router_rsp->print_trace();

            std::cout << "---" << std::endl;

            cluster[0][0]->signal_vci_ini.print_trace("0_0/sig_vci_ini");
            cluster[0][0]->signal_dspin_ini_cmd_d.print_trace("0_0/sig_dspin_ini_cmd_d");
            cluster[0][0]->signal_dspin_ini_rsp_d.print_trace("0_0/sig_dspin_ini_rsp_d");
            cluster[0][0]->signal_dspin_ini_cmd_c.print_trace("0_0/sig_dspin_ini_cmd_c");
            cluster[0][0]->signal_dspin_ini_rsp_c.print_trace("0_0/sig_dspin_ini_rsp_c");
            cluster[0][0]->signal_dspin_tgt_cmd_d.print_trace("0_0/sig_dspin_tgt_cmd_d");
            cluster[0][0]->signal_dspin_tgt_rsp_d.print_trace("0_0/sig_dspin_tgt_rsp_d");
            cluster[0][0]->signal_dspin_tgt_cmd_c.print_trace("0_0/sig_dspin_tgt_cmd_c");
            cluster[0][0]->signal_dspin_tgt_rsp_c.print_trace("0_0/sig_dspin_tgt_rsp_c");
        }

        if( stats_ok and (n % stats_period == 0) ) 
        {
            for( size_t i = 0 ; i < x_size ; i++ )
            {
                for( size_t j = 0 ; j < y_size ; j++ )
                {
                    cluster[i][j]->ini_d->print_stats();
                    cluster[i][j]->ini_c->print_stats();
                    cluster[i][j]->tgt_c->print_stats();
                }
            }
        }

        sc_start(sc_core::sc_time(1, SC_NS));
    }

    for( size_t i = 0 ; i < x_size ; i++ )
    {
        for( size_t j = 0 ; j < y_size ; j++ )
        {
            cluster[i][j]->ini_d->print_stats();
            cluster[i][j]->ini_c->print_stats();
            cluster[i][j]->tgt_c->print_stats();
        }
    }

    return EXIT_SUCCESS;
}

// SystemC entry point: runs _main and converts a std::exception into an
// error message plus a return status of 1. Any other exception is
// reported and then re-thrown to the caller.
int sc_main (int argc, char *argv[])
{
   // Status returned when _main is interrupted by a std::exception
   int status = 1;

   try
   {
      status = _main(argc, argv);
   }
   catch (const std::exception &err)
   {
      std::cout << err.what() << std::endl;
   }
   catch (...)
   {
      std::cout << "Unknown exception occured" << std::endl;
      throw;
   }

   return status;
}


// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3




