/////////////////////////////////////////////////////////////////////////
// File: tsarv4_generic_xbar.cpp
// Author: Alain Greiner 
// Copyright: UPMC/LIP6
// Date : april 2011
// This program is released under the GNU public license
/////////////////////////////////////////////////////////////////////////
// This file defines a generic TSAR architecture without virtual memory.
// - It uses vci_local_crossbar as local interconnect 
// - It uses virtual_dspin as global interconnect
// - It uses the vci_cc_xcache_wrapper_v4 
// - It uses the vci_mem_cache_v4
// - It uses the vci_xicu, with one vci_multi_tty controller and one
//   vci_multi_dma controller per cluster.
// The physical address space is 32 bits.
// The number of clusters cannot be larger than 256.
// The number of processors per cluster cannot be larger than 4.
// The parameters must be powers of 2.
// - xmax   : number of clusters in a row
// - ymax   : number of clusters in a column
// - nprocs : number of processors per cluster
//
// The peripherals BDEV, FBUF, and the boot BROM
// are in the cluster containing address 0xBFC00000.
// - The nprocs TTY IRQs are connected to IRQ_IN[0] to IRQ_IN[3]
// - The nprocs DMA IRQs are connected to IRQ_IN[4] to IRQ_IN[7]
// - The IOC IRQ is connected to IRQ_IN[8]
// 
// General policy for 32 bits address decoding in direct space:
// All segments base addresses are multiple of 64 Kbytes
// Therefore the 16 address MSB bits completely define the target: 
// The (x_width + y_width) MSB bits (left aligned) define
// the cluster index, and the 8 LSB bits define the local index:
// 
//      | X_ID  | Y_ID  |---| LADR |     OFFSET          |
//      |x_width|y_width|---|  8   |       16            |
//
// Half of all clusters being in the protected address space domain 
// (addresses 0x80000000 and above), software must execute in
// kernel mode to access memory if we want to exploit locality,
// because some stacks and heaps will be in the protected domain.
/////////////////////////////////////////////////////////////////////////

#include <systemc>
#include <sys/time.h>
#include <iostream>
#include <sstream>
#include <cstdlib>
#include <cstdarg>
#include <stdint.h>

#include "gdbserver.h"
#include "mapping_table.h"
#include "tsarv4_cluster_xbar.h"
#include "alloc_elems.h"

///////////////////////////////////////////////////
//               Parallelisation
///////////////////////////////////////////////////

#define USE_OPENMP               0
#define OPENMP_THREADS_NR        8

#if USE_OPENMP
#include <omp.h>
#endif


//  cluster index (computed from x,y coordinates)
//  NOTE: the expansion refers to a variable named ymax, which must be in
//  scope wherever the macro is used. Both arguments are parenthesized so
//  that compound expressions such as cluster(x+1,y) expand correctly.
#define cluster(x,y)	((y) + ymax*(x))

// flit widths for the DSPIN network
#define cmd_width	         40
#define rsp_width	         33

// VCI format
#define cell_width	         4
#define address_width	         32
#define plen_width	         8
#define error_width	         2
#define clen_width	         1
#define rflag_width	         1
#define srcid_width	         14
#define pktid_width	         4
#define trdid_width	         4
#define wrplen_width	         1

///////////////////////////////////////////////////
//     Parameters default values         
///////////////////////////////////////////////////

#define MESH_XMAX		2
#define MESH_YMAX		2

#define NPROCS			4
#define XRAM_LATENCY            0

#define MEMC_WAYS               16
#define MEMC_SETS               256

#define L1_IWAYS                4
#define L1_ISETS                64

#define L1_DWAYS                4
#define L1_DSETS                64

#define FBUF_X_SIZE             128 
#define FBUF_Y_SIZE             128 

#define	BDEV_SECTOR_SIZE 	128
#define BDEV_IMAGE_NAME	        "../../softs/soft_transpose_giet/images.raw"

#define BOOT_SOFT_NAME	 	"../../softs/soft_transpose_giet/bin.soft"

/////////////////////////////////////////////////////////
// Segments definition 
/////////////////////////////////////////////////////////
// There are 5 segments replicated in all clusters:
// - seg_icu 	-> ICU  / LADR = 0xF0
// - seg_tty 	-> MTTY / LADR = 0xF1
// - seg_dma 	-> CDMA / LADR = 0xF2
// - seg_stack	-> RAM  / LADR = 0x80 to 0x8F
// - seg_heap	-> RAM  / LADR = 0x30 to 0x7F
//
// There are 3 specific segments in the "IO" cluster 
// (containing address 0xBF000000)
// - seg_reset	-> BROM / LADR = 0xC0 to 0xCF
// - seg_fbuf	-> FBUF / LADR = 0xD0 to 0xEF
// - seg_bdev	-> BDEV / LADR = 0xF3
//
// There are 3 specific segments in the "kcode" cluster
// (containing address 0x80000000)
// - seg_kcode	-> RAM  / LADR = 0x00 to 0x0F
// - seg_kdata	-> RAM  / LADR = 0x10 to 0x1F
// - seg_kunc 	-> RAM  / LADR = 0x20 to 0x2F
//
// There are 2 specific segments in the "code" cluster
// (containing address 0x00000000)
// - seg_code	-> RAM  / LADR = 0x00 to 0x0F
// - seg_data 	-> RAM  / LADR = 0x10 to 0x1F
//
// There is one special segment corresponding to
// the processors in the coherence address space
// - seg_proc	-> PROCS / LADR = 0xB0 to 0xBF
///////////////////////////////////////////////////

// specific segments in "kcode" cluster

#define KCOD_BASE               0x80000000      
#define KCOD_SIZE               0x00010000

#define KDAT_BASE               0x80100000      
#define KDAT_SIZE               0x00010000

#define KUNC_BASE               0x80200000      
#define KUNC_SIZE               0x00010000

// specific segments in "code" cluster

#define CODE_BASE               0x00000000      
#define CODE_SIZE               0x00010000

#define DATA_BASE               0x00100000      
#define DATA_SIZE               0x00010000

// specific segments in "IO" cluster

#define BROM_BASE               0xBFC00000      
#define BROM_SIZE               0x00010000

#define FBUF_BASE               0xBFD00000      
#define FBUF_SIZE               0x00200000

#define BDEV_BASE               0xBFF30000      
#define BDEV_SIZE               0x00000020

// replicated segments

#define HEAP_BASE               0x00300000      
#define HEAP_SIZE               0x00500000

#define STAK_BASE               0x00800000      
#define STAK_SIZE               0x00100000

#define XICU_BASE               0x00F00000      
#define XICU_SIZE               0x00001000

#define MTTY_BASE               0x00F10000      
#define MTTY_SIZE               0x00000040

#define CDMA_BASE               0x00F20000      
#define CDMA_SIZE               0x00000080

#define PROC_BASE               0x00B00000      
#define PROC_SIZE               0x00000010

////////////////////////////////////////////////////////////////////
//     TGTID definition in direct space
// For all components:  global TGTID = global SRCID = cluster_index
////////////////////////////////////////////////////////////////////

#define MEMC_TGTID               0
#define XICU_TGTID               1
#define MTTY_TGTID               2
#define CDMA_TGTID               3
#define FBUF_TGTID               4
#define BROM_TGTID               5
#define BDEV_TGTID               6

///////////////////////////////////////////////////
// service functions for VCI & DSPIN signal trace
//////////////////////////////////////////////////

// Trace helper: dumps the current state of a VCI signal bundle on std::cout.
// - name : label printed at the start of each trace line
// - sig  : any object exposing the fields read below (cmdval, cmd.read(),
//          address, wdata, srcid, trdid, eop, cmdack, rspval, rerror,
//          rdata, rsrcid, rtrdid, reop, rspack)
// One line is printed when sig.cmdval is set, and one when sig.rspval is set.
// std::hex is inserted into std::cout and never reset, so the stream stays
// in hexadecimal mode after the call.
template <typename T>
void  print_vci_signal(std::string name, T &sig) 
{
    std::ostream &out = std::cout;

    // command part
    if ( sig.cmdval )
    {
        // cmd values 1, 2, 3 and 0 are printed as RD, WR, LL and SC;
        // any other value prints no mnemonic
        const char *opcode = "";
        if      ( sig.cmd.read() == 1 ) opcode = "RD ";
        else if ( sig.cmd.read() == 2 ) opcode = "WR ";
        else if ( sig.cmd.read() == 3 ) opcode = "LL ";
        else if ( sig.cmd.read() == 0 ) opcode = "SC ";

        out << name << std::hex << " CMD VCI : " << opcode;
        out << " @ = " << sig.address;
        out << " | wdata = " << sig.wdata;
        out << " | srcid = " << sig.srcid;
        out << " | trdid = " << sig.trdid;
        out << " | eop = " << sig.eop;
        out << " | ack = " << sig.cmdack;
        out << std::endl;
    }

    // response part
    if ( sig.rspval )
    {
        out << name << std::hex;
        out << " RSP VCI : rerror = " << sig.rerror;
        out << " | rdata = " << sig.rdata;
        out << " | rsrcid = " << sig.rsrcid;
        out << " | rtrdid = " << sig.rtrdid;
        out << " | reop = " << sig.reop;
        out << " | ack = " << sig.rspack;
        out << std::endl;
    }
}

// Trace helper: prints one line on std::cout describing a DSPIN signal
// bundle, but only when sig.write is set.
// - name : label printed at the start of the trace line
// - sig  : any object exposing the fields write, data and read
// std::hex is inserted into std::cout and never reset, so the stream stays
// in hexadecimal mode after a line has been printed.
template <typename T>
void print_dspin_signal(std::string name, T &sig)
{
    // nothing is printed unless sig.write is set
    if ( !sig.write ) return;

    std::cout << name << " DSPIN : data = ";
    std::cout << std::hex << sig.data;
    std::cout << " | ack = " << sig.read;
    std::cout << std::endl;
}

/////////////////////////////////
int _main(int argc, char *argv[])
{
    using namespace sc_core;
    using namespace soclib::caba;
    using namespace soclib::common;
    
    
    char    soft_name[256] = BOOT_SOFT_NAME;  	// pathname to binary code
    size_t  ncycles        = 1000000000;       	// simulated cycles
    size_t  xmax           = MESH_XMAX;  	// number of clusters in a row
    size_t  ymax           = MESH_YMAX;         // number of clusters in a column
    size_t  nprocs         = NPROCS;    	// number of processors per cluster
    size_t  xfb            = FBUF_X_SIZE;	// frameBuffer column number
    size_t  yfb            = FBUF_Y_SIZE;      	// frameBuffer lines number
    size_t  memc_ways      = MEMC_WAYS;
    size_t  memc_sets      = MEMC_SETS;
    size_t  l1_d_ways      = L1_DWAYS;
    size_t  l1_d_sets      = L1_DSETS;
    size_t  l1_i_ways      = L1_IWAYS;
    size_t  l1_i_sets      = L1_ISETS;
    char    disk_name[256] = BDEV_IMAGE_NAME;  	// pathname to the disk image
    size_t  blk_size       = BDEV_SECTOR_SIZE;  // block size (in bytes)
    size_t  xram_latency   = XRAM_LATENCY;	// external RAM latency
    bool    trace_ok       = false;            	// debug activated
    size_t  from_cycle     = 0;                	// debug start cycle

    ////////////// command line arguments //////////////////////
    if (argc > 1)
    {
        for( int n=1 ; n<argc ; n=n+2 )
        {
            if( (strcmp(argv[n],"-NCYCLES") == 0) && (n+1<argc) )
            {
                ncycles = atoi(argv[n+1]);
            }
            else if( (strcmp(argv[n],"-NPROCS") == 0) && (n+1<argc) )
            {
                nprocs = atoi(argv[n+1]);
                assert( ((nprocs == 1) || (nprocs == 2) || (nprocs == 4)) &&
                        "NPROCS must be equal to 1, 2, or 4");
            }
            else if( (strcmp(argv[n],"-XMAX") == 0) && (n+1<argc) )
            {
                xmax = atoi(argv[n+1]);
                assert( ((xmax == 1) || (xmax == 2) || (xmax == 4) || (xmax == 8) || (xmax == 16)) 
                         && "The XMAX parameter must be 2, 4, 8, or 16" );
            }
            
	    else if( (strcmp(argv[n],"-YMAX") == 0) && (n+1<argc) )
            {
                ymax = atoi(argv[n+1]);
                assert( ((ymax == 1) || (ymax == 2) || (ymax == 4) || (ymax == 8) || (ymax == 16)) 
                         && "The YMAX parameter must be 2, 4, 8, or 16" );
            }
	    else if( (strcmp(argv[n],"-XFB") == 0) && (n+1<argc) )
            {
	        xfb = atoi(argv[n+1]);
            }
	    else if( (strcmp(argv[n],"-YFB") == 0) && (n+1<argc) )
            {
                yfb = atoi(argv[n+1]);
            }
            else if( (strcmp(argv[n],"-SOFT") == 0) && (n+1<argc) )
            {
                strcpy(soft_name, argv[n+1]);
            }
            else if( (strcmp(argv[n],"-DISK") == 0) && (n+1<argc) )
            {
                strcpy(disk_name, argv[n+1]);
            }
            else if( (strcmp(argv[n],"-TRACE") == 0) && (n+1<argc) )
            {
                trace_ok = true;
                from_cycle = atoi(argv[n+1]);
            }
	    else if((strcmp(argv[n], "-MCWAYS") == 0) && (n+1 < argc))
	    {
	        memc_ways = atoi(argv[n+1]);
	    }
	    else if((strcmp(argv[n], "-MCSETS") == 0) && (n+1 < argc))
	    {
	        memc_sets = atoi(argv[n+1]);
	    }
	    else if((strcmp(argv[n], "-XLATENCY") == 0) && (n+1 < argc))
	    {
	        xram_latency = atoi(argv[n+1]);
	    }
            else
            {
                std::cout << "   Arguments on the command line are (key,value) couples." << std::endl;
                std::cout << "   The order is not important." << std::endl;
                std::cout << "   Accepted arguments are :" << std::endl << std::endl;
                std::cout << "     -SOFT pathname_for_embedded_soft" << std::endl;
                std::cout << "     -DISK pathname_for_disk_image" << std::endl;
                std::cout << "     -NCYCLES number_of_simulated_cycles" << std::endl;
                std::cout << "     -NPROCS number_of_processors_per_cluster" << std::endl;
                std::cout << "     -XMAX number_of_clusters_in_a_row" << std::endl;
                std::cout << "     -YMAX number_of_clusters_in_a_column" << std::endl;
                std::cout << "     -TRACE debug_start_cycle" << std::endl;
                std::cout << "     -MCWAYS memory_cache_number_of_ways" << std::endl;
                std::cout << "     -MCSETS memory_cache_number_of_sets" << std::endl;
                std::cout << "     -XLATENCY external_ram_latency_value" << std::endl;
                std::cout << "     -XFB fram_buffer_number_of_pixels" << std::endl;
                std::cout << "     -YFB fram_buffer_number_of_lines" << std::endl;
                exit(0);
            }
        }
    }

    std::cout << std::endl;
    std::cout << " - NPROCS    = " << nprocs <<  std::endl;
    std::cout << " - NCLUSTERS = " << xmax*ymax << std::endl;
    std::cout << std::endl;

#if USE_OPENMP
        omp_set_dynamic(false);
        omp_set_num_threads(threads_nr);
        std::cerr << "Built with openmp version " << _OPENMP << std::endl;
#endif

    // Define VCI parameters
    typedef soclib::caba::VciParams<cell_width,
                                    plen_width,
                                    address_width,
                                    error_width,                                   
                                    clen_width,
                                    rflag_width,
                                    srcid_width,
                                    pktid_width,
                                    trdid_width,
                                    wrplen_width> vci_param;

    size_t	cluster_io_index;
    size_t	cluster_code_index;
    size_t	cluster_kcode_index;
    size_t	x_width;
    size_t	y_width;

    if      (xmax == 1) x_width = 0;
    else if (xmax == 2) x_width = 1;
    else if (xmax <= 4) x_width = 2;
    else if (xmax <= 8) x_width = 3;
    else                x_width = 4;

    if      (ymax == 1) y_width = 0;
    else if (ymax == 2) y_width = 1;
    else if (ymax <= 4) y_width = 2;
    else if (ymax <= 8) y_width = 3;
    else                y_width = 4;

    cluster_io_index = 0xBF >> (8 - x_width - y_width);
    cluster_kcode_index = 0x80 >> (8 - x_width - y_width);
    cluster_code_index = 0;
    
    /////////////////////
    //  Mapping Tables
    /////////////////////

    // direct network
    MappingTable maptabd(address_width, 
                         IntTab(x_width + y_width, 16 - x_width - y_width), 
                         IntTab(x_width + y_width, srcid_width - x_width - y_width), 
                         0x00FF0000);

    for ( size_t x = 0 ; x < xmax ; x++)
    {
        for ( size_t y = 0 ; y < ymax ; y++)
        {
            sc_uint<address_width> offset  = cluster(x,y) << (address_width-x_width-y_width);

            std::ostringstream 	sh;
            sh << "d_seg_heap_" << x << "_" << y;
            maptabd.add(Segment(sh.str(), HEAP_BASE+offset, HEAP_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));

            std::ostringstream 	ss;
            ss << "d_seg_stak_" << x << "_" << y;
            maptabd.add(Segment(ss.str(), STAK_BASE+offset, STAK_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));

            std::ostringstream 	si;
            si << "d_seg_xicu_" << x << "_" << y;
            maptabd.add(Segment(si.str(), XICU_BASE+offset, XICU_SIZE, IntTab(cluster(x,y),XICU_TGTID), false));

            std::ostringstream 	st;
            st << "d_seg_mtty_" << x << "_" << y;
            maptabd.add(Segment(st.str(), MTTY_BASE+offset, MTTY_SIZE, IntTab(cluster(x,y),MTTY_TGTID), false));

            std::ostringstream 	sd;
            sd << "d_seg_cdma_" << x << "_" << y;
            maptabd.add(Segment(sd.str(), CDMA_BASE+offset, CDMA_SIZE, IntTab(cluster(x,y),CDMA_TGTID), false));

            if ( cluster(x,y) == cluster_io_index )
            {
	      maptabd.add(Segment("d_seg_fbuf    ", FBUF_BASE, FBUF_SIZE, IntTab(cluster(x,y),FBUF_TGTID), false));
	      maptabd.add(Segment("d_seg_bdev    ", BDEV_BASE, BDEV_SIZE, IntTab(cluster(x,y),BDEV_TGTID), false));
	      maptabd.add(Segment("d_seg_brom    ", BROM_BASE, BROM_SIZE, IntTab(cluster(x,y),BROM_TGTID), true));
            }
            if ( cluster(x,y) == cluster_code_index )
            {
	      maptabd.add(Segment("d_seg_code    ", CODE_BASE, CODE_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));
	      maptabd.add(Segment("d_seg_data    ", DATA_BASE, DATA_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));
            }
            if ( cluster(x,y) == cluster_kcode_index )
            {
	      maptabd.add(Segment("d_seg_kcod    ", KCOD_BASE, KCOD_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));
	      maptabd.add(Segment("d_seg_kdat    ", KDAT_BASE, KDAT_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));
	      maptabd.add(Segment("d_seg_kunc    ", KUNC_BASE, KUNC_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));
            }
        }
    }
    std::cout << maptabd << std::endl;

    // coherence network
    // - tgtid_c_proc = srcid_c_proc = local procid
    // - tgtid_c_memc = srcid_c_memc = nprocs
    MappingTable maptabc(address_width, 
                         IntTab(x_width + y_width, 16 - x_width - y_width), 
                         IntTab(x_width + y_width, srcid_width - x_width - y_width), 
                         0x00FF0000);

    for ( size_t x = 0 ; x < xmax ; x++)
    {
        for ( size_t y = 0 ; y < ymax ; y++)
        {
            sc_uint<address_width> offset  = cluster(x,y) << (address_width-x_width-y_width);

            // cleanup requests regarding the heap segment must be routed to the memory cache
            std::ostringstream sh;
            sh << "c_seg_heap_" << x << "_" << y;
            maptabc.add(Segment(sh.str(), HEAP_BASE+offset, HEAP_SIZE, IntTab(cluster(x,y), nprocs), false));

            // cleanup requests regarding the stack segment must be routed to the memory cache
            std::ostringstream ss;
            ss << "c_seg_stak_" << x << "_" << y;
            maptabc.add(Segment(ss.str(), STAK_BASE+offset, STAK_SIZE, IntTab(cluster(x,y), nprocs), false));

            // cleanup requests regarding the BROM segment are also routed to the memory cache
            if ( cluster(x,y) == cluster_io_index )
            {
                maptabc.add(Segment("c_seg_brom    ", BROM_BASE, BROM_SIZE, IntTab(cluster(x,y), nprocs), false));
            }

            // cleanup requests regarding the code and data segments must be sent to the memory cache
            if ( cluster(x,y) == cluster_code_index )
            {
                maptabc.add(Segment("c_seg_code    ", CODE_BASE, CODE_SIZE, IntTab(cluster(x,y), nprocs), false));
                maptabc.add(Segment("c_seg_data    ", DATA_BASE, DATA_SIZE, IntTab(cluster(x,y), nprocs), false));
            }
            // cleanup requests regarding the kcode, kunc, and kdata segments must be sent to the memory cache
            if ( cluster(x,y) == cluster_kcode_index )
            {
                maptabc.add(Segment("c_seg_kcod    ", KCOD_BASE, KCOD_SIZE, IntTab(cluster(x,y), nprocs), false));
                maptabc.add(Segment("c_seg_kdat    ", KDAT_BASE, KDAT_SIZE, IntTab(cluster(x,y), nprocs), false));
                maptabc.add(Segment("c_seg_kunc    ", KUNC_BASE, KUNC_SIZE, IntTab(cluster(x,y), nprocs), false));
            }

            // update & invalidate requests must be routed to the proper processor
	    for ( size_t p = 0 ; p < nprocs ; p++)
            {
                std::ostringstream sp;
	        sp << "c_seg_proc_" << x << "_" << y << "_" << p;
	        maptabc.add(Segment(sp.str(), PROC_BASE+offset+(p*0x10000), PROC_SIZE, 
                            IntTab(cluster(x,y), p), false, true, IntTab(cluster(x,y), p))); 
            }
        }
    }
    std::cout << maptabc << std::endl;

    // external network
    MappingTable maptabx(address_width, IntTab(1), IntTab(x_width+y_width), 0xF0000000);

    for ( size_t x = 0 ; x < xmax ; x++)
    {
        for ( size_t y = 0 ; y < ymax ; y++)
        { 

            sc_uint<address_width> offset  = cluster(x,y) << (address_width-x_width-y_width);

            std::ostringstream sh;
            sh << "x_seg_heap_" << x << "_" << y;
            maptabx.add(Segment(sh.str(), HEAP_BASE+offset, HEAP_SIZE, IntTab(cluster(x,y)), false));

            std::ostringstream ss;
            ss << "x_seg_stak_" << x << "_" << y;
            maptabx.add(Segment(ss.str(), STAK_BASE+offset, STAK_SIZE, IntTab(cluster(x,y)), false));

            if ( cluster(x,y) == cluster_code_index )
            {
                maptabx.add(Segment("x_seg_code    ", CODE_BASE, CODE_SIZE, IntTab(cluster(x,y)), false));
                maptabx.add(Segment("x_seg_data    ", DATA_BASE, DATA_SIZE, IntTab(cluster(x,y)), false));
            }
            if ( cluster(x,y) == cluster_kcode_index )
            {
                maptabx.add(Segment("x_seg_kcod    ", KCOD_BASE, KCOD_SIZE, IntTab(cluster(x,y)), false));
                maptabx.add(Segment("x_seg_kdat    ", KDAT_BASE, KDAT_SIZE, IntTab(cluster(x,y)), false));
                maptabx.add(Segment("x_seg_kunc    ", KUNC_BASE, KUNC_SIZE, IntTab(cluster(x,y)), false));
            }
        }
    }
    std::cout << maptabx << std::endl;

    ////////////////////
    // Signals
    ///////////////////

    sc_clock		signal_clk("clk");
    sc_signal<bool> 	signal_resetn("resetn");

    // Horizontal inter-clusters DSPIN signals
    DspinSignals<cmd_width>*** signal_dspin_h_cmd_inc =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_h_cmd_inc", xmax-1, ymax, 2);
    DspinSignals<cmd_width>*** signal_dspin_h_cmd_dec =
      alloc_elems<DspinSignals<cmd_width> >("signal_dspin_h_cmd_dec", xmax-1, ymax, 2);
    DspinSignals<rsp_width>*** signal_dspin_h_rsp_inc =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_h_rsp_inc", xmax-1, ymax, 2);
    DspinSignals<rsp_width>*** signal_dspin_h_rsp_dec =
      alloc_elems<DspinSignals<rsp_width> >("signal_dspin_h_rsp_dec", xmax-1, ymax, 2);

    // Vertical inter-clusters DSPIN signals
    DspinSignals<cmd_width>*** signal_dspin_v_cmd_inc =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_v_cmd_inc", xmax, ymax-1, 2);
    DspinSignals<cmd_width>*** signal_dspin_v_cmd_dec =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_v_cmd_dec", xmax, ymax-1, 2);
    DspinSignals<rsp_width>*** signal_dspin_v_rsp_inc =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_v_rsp_inc", xmax, ymax-1, 2);
    DspinSignals<rsp_width>*** signal_dspin_v_rsp_dec =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_v_rsp_dec", xmax, ymax-1, 2);

    // Mesh boundaries DSPIN signals
    DspinSignals<cmd_width>**** signal_dspin_false_cmd_in =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_false_cmd_in", xmax, ymax, 2, 4);
    DspinSignals<cmd_width>**** signal_dspin_false_cmd_out =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_false_cmd_out", xmax, ymax, 2, 4);
    DspinSignals<rsp_width>**** signal_dspin_false_rsp_in =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_false_rsp_in", xmax, ymax, 2, 4);
    DspinSignals<rsp_width>**** signal_dspin_false_rsp_out =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_false_rsp_out", xmax, ymax, 2, 4);


    ////////////////////////////
    //      Components
    ////////////////////////////

#if USE_ALMOS
    soclib::common::Loader loader("bootloader.bin",
				  "arch-info.bin@"TO_STR(BOOT_INFO_BLOCK)":D",
				  "kernel-soclib.bin@"TO_STR(KERNEL_BIN_IMG)":D");
#else
    soclib::common::Loader loader(soft_name);
#endif

    typedef soclib::common::GdbServer<soclib::common::Mips32ElIss> proc_iss;
    proc_iss::set_loader(loader);

    TsarV4ClusterXbar<vci_param, proc_iss, cmd_width, rsp_width>* clusters[xmax][ymax];

#if USE_OPENMP

#pragma omp parallel
{
#pragma omp for
    for( size_t i = 0 ; i  < (xmax * ymax); i++)
    {
        size_t x = i / ymax;
        size_t y = i % ymax;

#pragma omp critical
	std::ostringstream sc;
	sc << "cluster_" << x << "_" << y;
	clusters[x][y] = new TsarV4ClusterXbar<vci_param, proc_iss, cmd_width, rsp_width>
	    (sc.str().c_str(),
             nprocs,
	     x,
	     y,
	     cluster(x,y),
	     maptabd,
	     maptabc,
	     maptabx,
	     x_width,
	     y_width,
	     MEMC_TGTID,
	     XICU_TGTID,
	     FBUF_TGTID,
	     MTTY_TGTID,
	     BROM_TGTID,
	     BDEV_TGTID,
	     CDMA_TGTID,
             memc_ways,
             memc_sets,
             l1_i_ways,
             l1_i_sets,
             l1_d_ways,
             l1_d_sets,
             xram_latency,
	     (cluster(x,y) == cluster_io_index),
	     xfb,
	     yfb,
	     disk_name,
	     blk_size,
	     loader);
	}

#else  // USE_OPENMP

    for( size_t x = 0 ; x  < xmax ; x++)
    {
        for( size_t y = 0 ; y < ymax ; y++ )
        {

std::cout << "building cluster_" << x << "_" << y << std::endl;

	    std::ostringstream sc;
	    sc << "cluster_" << x << "_" << y;
	    clusters[x][y] = new TsarV4ClusterXbar<vci_param, proc_iss, cmd_width, rsp_width>
	    (sc.str().c_str(),
             nprocs,
	     x,
	     y,
	     cluster(x,y),
	     maptabd,
	     maptabc,
	     maptabx,
	     x_width,
	     y_width,
	     MEMC_TGTID,
	     XICU_TGTID,
	     FBUF_TGTID,
	     MTTY_TGTID,
	     BROM_TGTID,
	     BDEV_TGTID,
	     CDMA_TGTID,
             memc_ways,
             memc_sets,
             l1_i_ways,
             l1_i_sets,
             l1_d_ways,
             l1_d_sets,
             xram_latency,
	     (cluster(x,y) == cluster_io_index),
	     xfb,
	     yfb,
	     disk_name,
	     blk_size,
	     loader);

std::cout << "cluster_" << x << "_" << y << " constructed" << std::endl;

	}
    }
    
#endif	// USE_OPENMP

    ///////////////////////////////////////////////////////////////
    //     Net-list 
    ///////////////////////////////////////////////////////////////

    // Clock & RESET
    for ( size_t x = 0 ; x < (xmax) ; x++ )
    {
        for ( size_t y = 0 ; y < ymax ; y++ )
        {
            clusters[x][y]->p_clk			(signal_clk);
            clusters[x][y]->p_resetn			(signal_resetn);
        }
    }

    // Inter Clusters horizontal connections
    if ( xmax > 1 )
    {
        for ( size_t x = 0 ; x < (xmax-1) ; x++ )
        {
            for ( size_t y = 0 ; y < ymax ; y++ )
            {
                for ( size_t k = 0 ; k < 2 ; k++ )
                {
		clusters[x][y]->p_cmd_out[k][EAST]      (signal_dspin_h_cmd_inc[x][y][k]);
                clusters[x+1][y]->p_cmd_in[k][WEST]     (signal_dspin_h_cmd_inc[x][y][k]);
                clusters[x][y]->p_cmd_in[k][EAST]       (signal_dspin_h_cmd_dec[x][y][k]);
                clusters[x+1][y]->p_cmd_out[k][WEST]    (signal_dspin_h_cmd_dec[x][y][k]);
                clusters[x][y]->p_rsp_out[k][EAST]      (signal_dspin_h_rsp_inc[x][y][k]);
                clusters[x+1][y]->p_rsp_in[k][WEST]     (signal_dspin_h_rsp_inc[x][y][k]);
                clusters[x][y]->p_rsp_in[k][EAST]       (signal_dspin_h_rsp_dec[x][y][k]);
                clusters[x+1][y]->p_rsp_out[k][WEST]    (signal_dspin_h_rsp_dec[x][y][k]);
                }
            }
        }
    }
    std::cout << "Horizontal connections established" << std::endl;	

    // Inter Clusters vertical connections
    if ( ymax > 1 )
    {
        for ( size_t y = 0 ; y < (ymax-1) ; y++ )
        {
            for ( size_t x = 0 ; x < xmax ; x++ )
            {
                for ( size_t k = 0 ; k < 2 ; k++ )
                {
                clusters[x][y]->p_cmd_out[k][NORTH]     (signal_dspin_v_cmd_inc[x][y][k]);
                clusters[x][y+1]->p_cmd_in[k][SOUTH]    (signal_dspin_v_cmd_inc[x][y][k]);
                clusters[x][y]->p_cmd_in[k][NORTH]      (signal_dspin_v_cmd_dec[x][y][k]);
                clusters[x][y+1]->p_cmd_out[k][SOUTH]   (signal_dspin_v_cmd_dec[x][y][k]);
                clusters[x][y]->p_rsp_out[k][NORTH]     (signal_dspin_v_rsp_inc[x][y][k]);
                clusters[x][y+1]->p_rsp_in[k][SOUTH]    (signal_dspin_v_rsp_inc[x][y][k]);
                clusters[x][y]->p_rsp_in[k][NORTH]      (signal_dspin_v_rsp_dec[x][y][k]);
                clusters[x][y+1]->p_rsp_out[k][SOUTH]   (signal_dspin_v_rsp_dec[x][y][k]);
                }
            }
        }
    }
    std::cout << "Vertical connections established" << std::endl;

    // East & West boundary cluster connections
    for ( size_t y = 0 ; y < ymax ; y++ )
    {
        for ( size_t k = 0 ; k < 2 ; k++ )
        {
	    clusters[0][y]->p_cmd_in[k][WEST]       	(signal_dspin_false_cmd_in[0][y][k][WEST]);
	    clusters[0][y]->p_cmd_out[k][WEST]      	(signal_dspin_false_cmd_out[0][y][k][WEST]);
	    clusters[0][y]->p_rsp_in[k][WEST]       	(signal_dspin_false_rsp_in[0][y][k][WEST]);
	    clusters[0][y]->p_rsp_out[k][WEST]      	(signal_dspin_false_rsp_out[0][y][k][WEST]);
	  
	    clusters[xmax-1][y]->p_cmd_in[k][EAST]  	(signal_dspin_false_cmd_in[xmax-1][y][k][EAST]);
	    clusters[xmax-1][y]->p_cmd_out[k][EAST] 	(signal_dspin_false_cmd_out[xmax-1][y][k][EAST]);
	    clusters[xmax-1][y]->p_rsp_in[k][EAST]  	(signal_dspin_false_rsp_in[xmax-1][y][k][EAST]);
	    clusters[xmax-1][y]->p_rsp_out[k][EAST] 	(signal_dspin_false_rsp_out[xmax-1][y][k][EAST]);
	}
    }
    
    // North & South boundary clusters connections
    for ( size_t x = 0 ; x < xmax ; x++ )
    {
        for ( size_t k = 0 ; k < 2 ; k++ )
        {
	    clusters[x][0]->p_cmd_in[k][SOUTH]      	(signal_dspin_false_cmd_in[x][0][k][SOUTH]);
	    clusters[x][0]->p_cmd_out[k][SOUTH]     	(signal_dspin_false_cmd_out[x][0][k][SOUTH]);
	    clusters[x][0]->p_rsp_in[k][SOUTH]      	(signal_dspin_false_rsp_in[x][0][k][SOUTH]);
	    clusters[x][0]->p_rsp_out[k][SOUTH]     	(signal_dspin_false_rsp_out[x][0][k][SOUTH]);
	    
	    clusters[x][ymax-1]->p_cmd_in[k][NORTH] 	(signal_dspin_false_cmd_in[x][ymax-1][k][NORTH]);
	    clusters[x][ymax-1]->p_cmd_out[k][NORTH]	(signal_dspin_false_cmd_out[x][ymax-1][k][NORTH]);
	    clusters[x][ymax-1]->p_rsp_in[k][NORTH] 	(signal_dspin_false_rsp_in[x][ymax-1][k][NORTH]);
	    clusters[x][ymax-1]->p_rsp_out[k][NORTH]	(signal_dspin_false_rsp_out[x][ymax-1][k][NORTH]);
	}
    }
      

    ////////////////////////////////////////////////////////
    //   Simulation
    ///////////////////////////////////////////////////////

    sc_start(sc_core::sc_time(0, SC_NS));
    signal_resetn = false;

    // network boundaries signals
    for(size_t x=0; x<xmax ; x++)
    {
        for(size_t y=0 ; y<ymax ; y++)
        {
            for (size_t k=0; k<2; k++)
            {
                for(size_t a=0; a<4; a++)
                {
		        signal_dspin_false_cmd_in[x][y][k][a].write = false;
		        signal_dspin_false_cmd_in[x][y][k][a].read = true;
                        signal_dspin_false_cmd_out[x][y][k][a].write = false;
                        signal_dspin_false_cmd_out[x][y][k][a].read = true;

                        signal_dspin_false_rsp_in[x][y][k][a].write = false;
                        signal_dspin_false_rsp_in[x][y][k][a].read = true;
                        signal_dspin_false_rsp_out[x][y][k][a].write = false;
                        signal_dspin_false_rsp_out[x][y][k][a].read = true;
		}
            }
        }
    }

    sc_start(sc_core::sc_time(1, SC_NS));
    signal_resetn = true;

    for ( size_t n=0 ; n<ncycles ; n++)
    {
        sc_start(sc_core::sc_time(1, SC_NS));
        if ( trace_ok && (n > from_cycle) )
        {
            std::cout << "****************** cycle " << std::dec << n ;
            std::cout << " ***********************************" << std::endl;
/*
            clusters[0][0]->proc[0]->print_trace();
            clusters[0][0]->proc[1]->print_trace();
            clusters[0][0]->proc[2]->print_trace();
            clusters[0][0]->proc[3]->print_trace();

            std::cout << std::endl;  

            clusters[0][1]->proc[0]->print_trace();
            clusters[0][1]->proc[1]->print_trace();
            clusters[0][1]->proc[2]->print_trace();
            clusters[0][1]->proc[3]->print_trace();

            std::cout << std::endl;  

            clusters[1][0]->proc[0]->print_trace();
            clusters[1][0]->proc[1]->print_trace();
*/
            clusters[1][0]->iniwrapperd->print_trace();
            clusters[1][0]->proc[2]->print_trace();
            print_vci_signal("proc_1_0_2_tgt_c", clusters[1][0]->signal_vci_tgt_c_proc[2]);
            print_vci_signal("proc_1_0_2_d", clusters[1][0]->signal_vci_ini_d_proc[2]);
            print_vci_signal("memc_0_0_d", clusters[0][0]->signal_vci_tgt_d_memc);
            print_vci_signal("g2l_0_0_d", clusters[0][0]->signal_vci_g2l_d);
            print_dspin_signal("l2g_0_0_d RSP", clusters[0][0]->signal_dspin_rsp_l2g_d);
            print_dspin_signal("c10_to_c00 RSP", signal_dspin_h_rsp_dec[0][0][0]);
            print_dspin_signal("c00_to_c10 RSP", signal_dspin_h_rsp_inc[0][0][0]);
            print_dspin_signal("g2l_1_0_d RSP", clusters[1][0]->signal_dspin_rsp_g2l_d);
            print_vci_signal("l2g_1_0_d", clusters[1][0]->signal_vci_l2g_d);
/*
            clusters[1][0]->proc[3]->print_trace();

            std::cout << std::endl;  

            clusters[1][1]->proc[0]->print_trace();
            clusters[1][1]->proc[1]->print_trace();
            clusters[1][1]->proc[2]->print_trace();
            clusters[1][1]->proc[3]->print_trace();

            std::cout << std::endl;  

            clusters[0][0]->memc->print_trace();
            clusters[0][1]->memc->print_trace();
            clusters[1][0]->memc->print_trace();
            clusters[1][1]->memc->print_trace();

            clusters[0][0]->iniwrapperd->print_trace();
            clusters[0][0]->tgtwrapperd->print_trace();
            clusters[1][0]->iniwrapperd->print_trace();
            clusters[1][0]->tgtwrapperd->print_trace();
            clusters[0][1]->iniwrapperd->print_trace();
            clusters[0][1]->tgtwrapperd->print_trace();
            clusters[1][1]->iniwrapperd->print_trace();
            clusters[1][1]->tgtwrapperd->print_trace();

            std::cout << std::endl;  

            print_vci_signal("proc_0_0_0_d", clusters[0][0]->signal_vci_ini_d_proc[0]);
            print_vci_signal("proc_1_0_0_d", clusters[1][0]->signal_vci_ini_d_proc[0]);
            print_vci_signal("proc_0_1_0_d", clusters[0][1]->signal_vci_ini_d_proc[0]);
            print_vci_signal("proc_1_1_0_d", clusters[1][1]->signal_vci_ini_d_proc[0]);

            print_vci_signal("proc_0_0_0_c", clusters[0][0]->signal_vci_tgt_c_proc[0]);
            print_vci_signal("proc_1_0_0_c", clusters[1][0]->signal_vci_tgt_c_proc[0]);
            print_vci_signal("proc_0_1_0_c", clusters[0][1]->signal_vci_tgt_c_proc[0]);
            print_vci_signal("proc_1_1_0_c", clusters[1][1]->signal_vci_tgt_c_proc[0]);

            print_vci_signal("memc_0_0_d", clusters[0][0]->signal_vci_tgt_d_memc);
            print_vci_signal("memc_1_0_d", clusters[1][0]->signal_vci_tgt_d_memc);
            print_vci_signal("memc_0_1_d", clusters[0][1]->signal_vci_tgt_d_memc);
            print_vci_signal("memc_1_1_d", clusters[1][1]->signal_vci_tgt_d_memc);

            print_vci_signal("memc_1_0_ini_c", clusters[1][0]->signal_vci_ini_c_memc);

            print_vci_signal("l2g_1_0_c", clusters[1][0]->signal_vci_l2g_c);

            print_dspin_signal("l2g_1_0_c CMD", clusters[1][0]->signal_dspin_cmd_l2g_c);

            print_vci_signal("l2g_0_0_d", clusters[0][0]->signal_vci_l2g_d);
            print_vci_signal("g2l_0_0_d", clusters[0][0]->signal_vci_g2l_d);

            print_vci_signal("l2g_1_0_d", clusters[1][0]->signal_vci_l2g_d);
            print_vci_signal("g2l_1_0_d", clusters[1][0]->signal_vci_g2l_d);

            print_dspin_signal("l2g_0_0_d CMD", clusters[0][0]->signal_dspin_cmd_l2g_d);
            print_dspin_signal("g2l_0_0_d CMD", clusters[0][0]->signal_dspin_cmd_g2l_d);
            print_dspin_signal("l2g_0_0_d RSP", clusters[0][0]->signal_dspin_rsp_l2g_d);
            print_dspin_signal("g2l_0_0_d RSP", clusters[0][0]->signal_dspin_rsp_g2l_d);

            print_dspin_signal("l2g_1_0_d CMD", clusters[1][0]->signal_dspin_cmd_l2g_d);
            print_dspin_signal("g2l_1_0_d CMD", clusters[1][0]->signal_dspin_cmd_g2l_d);
            print_dspin_signal("l2g_1_0_d RSP", clusters[1][0]->signal_dspin_rsp_l2g_d);
            print_dspin_signal("g2l_1_0_d RSP", clusters[1][0]->signal_dspin_rsp_g2l_d);

            print_vci_signal("bdev_tgt", clusters[1][0]->signal_vci_tgt_d_bdev);
            print_vci_signal("bdev_ini", clusters[1][0]->signal_vci_ini_d_bdev);

            print_vci_signal("brom_tgt", clusters[1][0]->signal_vci_tgt_d_brom);
            
            if ( clusters[0][0]->signal_irq_bdev.read() != 0) std::cout << " IRQ_BDEV" << std::endl;
            if ( clusters[0][0]->signal_proc_it[0].read() != 0) std::cout << " IRQ_PROC" << std::endl;
*/
        }
    }
    return EXIT_SUCCESS;
}

/////////////////////////////////////////////////////////////////////////
// sc_main : program entry point.
// It only delegates to _main() and reports exceptions escaping from it:
// - a std::exception has its what() message printed on std::cout,
//   and the function then returns 1.
// - any other exception is reported with a generic message and is
//   re-thrown unchanged, so it is not silently swallowed here.
// Arguments argc / argv are forwarded untouched to _main().
// Returns the value returned by _main(), or 1 after a std::exception.
/////////////////////////////////////////////////////////////////////////
int sc_main (int argc, char *argv[])
{
	try {
		return _main(argc, argv);
	} catch (const std::exception &e) {
		// The exception is only read: catch it by const reference.
		std::cout << e.what() << std::endl;
	} catch (...) {
		std::cout << "Unknown exception occurred" << std::endl;
		throw;
	}
	// Reached only after a std::exception has been reported above.
	return 1;
}
