/////////////////////////////////////////////////////////////////////////
// File: tsarv4_dspin_generic_32_top.cpp
// Author: Alain Greiner
// Copyright: UPMC/LIP6
// Date : november 5 2010
// This program is released under the GNU public license
/////////////////////////////////////////////////////////////////////////
// This file defines a generic TSAR architecture without virtual memory.
// - It uses the virtual_dspin network as global interconnect 
// - It uses the vci_local_ring_fast as local interconnect 
// - It uses the vci_cc_xcache (No MMU)  
// The physical address space is 32 bits.
// The number of clusters cannot be larger than 256.
// The three parameters are 
// - xmax : number of clusters in a row
// - ymax : number of clusters in a column
// - nprocs : number of processors per cluster
//
// Each cluster contains nprocs processors, one Memory Cache,
// and one XICU component.
// The peripherals BDEV, CDMA, FBUF, MTTY and the boot BROM
// are in the cluster containing address 0xBFC00000.
// - The bdev_irq is connected to IRQ_IN[0]
// - The cdma_irq is connected to IRQ_IN[1]
// - The tty_irq[i] is connected to IRQ_IN[i+2]
// For all clusters, the XICU component contains nprocs timers.
//
// As we target up to 256 clusters, each cluster can contain 
// at most 16 Mbytes (in a 4Gbytes address space).
// - Each memory cache contains 9 Mbytes.
// - The Frame buffer contains 2 Mbytes.
// - The Boot ROM contains 1 Mbytes.
//
// General policy for 32 bits address decoding:
// To simplify, all segment base addresses are aligned
// on 1 Mbyte addresses. Therefore the 12 address MSB bits
// define the target in the direct address space.
// In these 12 bits, the (x_width + y_width) MSB bits define
// the cluster index, and the 4 LSB bits define the local index:
// 
//      | X_ID  | Y_ID  |---| L_ID |     OFFSET          |
//      |x_width|y_width|---|  4   |       20            |
/////////////////////////////////////////////////////////////////////////

#include <systemc>
#include <sys/time.h>
#include <iostream>
#include <sstream>
#include <cstdlib>
#include <cstdarg>

#include "mapping_table.h"
#include "mips32.h"
#include "vci_simple_ram.h"
#include "vci_multi_tty.h"
#include "vci_mem_cache_v4.h"
#include "vci_cc_xcache_wrapper_v4.h"
#include "vci_xicu.h"
#include "vci_vgmn.h"
#include "vci_local_ring_fast.h"
#include "virtual_dspin_router.h"
#include "vci_framebuffer.h"
#include "vci_dma_tsar_v2.h"
#include "vci_block_device_tsar_v2.h"
#include "gdbserver.h"

// Sector size handed to the block device (last constructor argument of bdev).
#define SECTOR_SIZE     2048

// Frame buffer X and Y sizes handed to the fbuf constructor.
#define FBUF_XSIZE      960
#define FBUF_YSIZE      960

// Number of TTY terminals: sizes the signal_irq_mtty array and the TTY
// share (2 + N_TTYS) of the XICU hardware IRQ inputs in the I/O cluster.
// NOTE(review): the mtty component is built with four hard-coded terminal
// names ("tty0".."tty3"), so this value presumably has to stay in sync
// with that list -- confirm before changing it.
#define N_TTYS		4

//////////////////////////////////////////////
// segments definition in direct space.
// The (x_width + y_width) address MSB bits select the cluster (x,y):
// _main() adds cluster(x,y) << (address_width - x_width - y_width)
// to the MEMC and XICU base addresses below. With the largest
// accepted mesh (16 x 16) this is 8 bits, i.e. 16 Mbytes of
// address space per cluster.
// Each memory cache contains up to 9 Mbytes.
// There is one MEMC segment and one XICU segment per cluster.
// The peripherals BDEV, FBUF, MTTY, CDMA and the boot BROM
// are mapped in the cluster containing address 0xBFC00000,
// and their base addresses are used as is (no cluster offset added).

// Memory cache segment: cluster-relative base, 9 Mbytes.
#define	MEMC_BASE	0x00000000	
#define MEMC_SIZE	0x00900000

// XICU segment: cluster-relative base, 4 Kbytes.
#define	XICU_BASE	0x00900000	
#define XICU_SIZE	0x00001000

// Frame buffer segment: absolute base, 2 Mbytes.
#define	FBUF_BASE	0xBFA00000	
#define FBUF_SIZE	0x00200000

// Boot ROM segment: absolute base, 1 Mbyte.
#define	BROM_BASE	0xBFC00000	
#define BROM_SIZE	0x00100000

// Block device segment: absolute base, 256 bytes.
#define	BDEV_BASE	0xBFD00000	
#define BDEV_SIZE	0x00000100

// Multi-TTY segment: absolute base, 256 bytes.
#define	MTTY_BASE	0xBFE00000	
#define MTTY_SIZE	0x00000100

// DMA controller segment: absolute base, 256 bytes.
#define	CDMA_BASE	0xBFF00000	
#define CDMA_SIZE	0x00000100

///////////////////////////////////////////////////////////
//     TGTID & SRCID definition in direct space
// For all components:  global TGTID = global SRCID = cluster_index
// For processors, the local SRCID is between 0 & nprocs-1
//
// The local TGTID values are also used as p_to_target[] port indexes
// on the direct ring. MEMC and XICU are connected in every cluster;
// FBUF, MTTY, BROM, BDEV and CDMA are connected only in the I/O cluster.

#define MEMC_TGTID	0
#define XICU_TGTID	1
#define FBUF_TGTID	2
#define MTTY_TGTID	3
#define BROM_TGTID	4
#define BDEV_TGTID	5
#define CDMA_TGTID	6

// Local SRCID of the first processor; processor p uses PROC_SRCID + p.
// The two I/O initiators (BDEV, CDMA) are numbered right after the
// processors. BDEV_SRCID and CDMA_SRCID expand to expressions on the
// local variable nprocs of _main(), so they are only usable where
// nprocs is in scope; they are also used as p_to_initiator[] port
// indexes on the direct ring of the I/O cluster.
#define PROC_SRCID      0
#define BDEV_SRCID      nprocs
#define CDMA_SRCID      (nprocs+1)

////////////////////////////////////////////////////////
// Local TGTID & SRCID definition in coherence space
// (The global TGTID & SRCID is the cluster index)
// For processors, the local SRCID & TGTID are between 0 & nprocs-1
// For MEMC, the local SRCID & TGTID are nprocs

/////////////////////////
// Router ports index
// Second index of the p_in[k][port] / p_out[k][port] arrays of the
// cmd and rsp DSPIN routers. On the LOCAL port, k = 0 is bound to the
// direct ring signals and k = 1 to the coherence ring signals.

#define NORTH		0
#define SOUTH		1
#define EAST		2
#define WEST		3
#define LOCAL		4

///////////////////////////////////
// flit widths for the DSPIN network
// Used as template arguments of the DspinSignals, VirtualDspinRouter
// and VciLocalRingFast instances: cmd_width for the command network,
// rsp_width for the response network.

#define cmd_width	40
#define rsp_width	33

//////////////////
// VCI format
// Field widths passed as template arguments to soclib::caba::VciParams
// in _main(). Note that the order of the defines below is not the order
// of the template arguments (plen_width comes before address_width there).
// address_width is also the address width given to the mapping tables,
// and srcid_width sizes their SRCID fields.

#define cell_width	4
#define address_width	32
#define plen_width	8
#define error_width	1
#define clen_width	1
#define rflag_width	1
#define srcid_width	14
#define pktid_width	4
#define trdid_width	4
#define wrplen_width	1

//  cluster index (computed from x,y coordinates)
//  index = y + ymax*x, where ymax is NOT a macro parameter: it is taken
//  from the scope in which the macro is expanded (the local variable
//  ymax of _main()).
//  Both arguments are parenthesized so that non-trivial expressions such
//  as cluster(x+1,y) expand with the intended precedence.
#define cluster(x,y)	((y) + ymax*(x))

/////////////////////////////////
int _main(int argc, char *argv[])
{
    using namespace sc_core;
    using namespace soclib::caba;
    using namespace soclib::common;

    char    soft_name[128]  = "undefined_binary_file";  // pathname to binary code
    char    disk_name[128]  = "undefined_disk_image";   // pathname to the disk image
    size_t  ncycles         = 1000000000;       	// simulated cycles
    size_t  xmax            = 2;         		// number of clusters in a row
    size_t  ymax            = 2;                	// number of clusters in a column
    size_t  nprocs          = 1;         		// number of processors per cluster
    bool    debug_ok        = false;            	// debug activated
    size_t  from_cycle      = 0;                	// debug start cycle
    size_t  to_cycle        = 1000000000;       	// debug end cycle

    ////////////// command line arguments //////////////////////
    if (argc > 1)
    {
        for( int n=1 ; n<argc ; n=n+2 )
        {
            if( (strcmp(argv[n],"-NCYCLES") == 0) && (n+1<argc) )
            {
                ncycles = atoi(argv[n+1]);
            }
            else if( (strcmp(argv[n],"-NPROCS") == 0) && (n+1<argc) )
            {
                nprocs = atoi(argv[n+1]);
                assert( (nprocs <= 8) && "The number of processors per cluster cannot be larger than 8");
            }
            else if( (strcmp(argv[n],"-XMAX") == 0) && (n+1<argc) )
            {
                xmax = atoi(argv[n+1]);
                assert( ((xmax >= 2) && (xmax <= 16))
                     && "The XMAX parameter (number of clusters in a row) must be in the range [2,16]" );
            }
            else if( (strcmp(argv[n],"-YMAX") == 0) && (n+1<argc) )
            {
                ymax = atoi(argv[n+1]);
                assert( ((ymax >= 2) && (ymax <= 16))
                     && "The YMAX parameter (number of clusters in a column) must be in the range [2,16]" );
            }
            else if( (strcmp(argv[n],"-SOFT") == 0) && (n+1<argc) )
            {
                strcpy(soft_name, argv[n+1]);
            }
            else if( (strcmp(argv[n],"-DISK") == 0) && (n+1<argc) )
            {
                strcpy(disk_name, argv[n+1]);
            }
            else if( (strcmp(argv[n],"-DEBUG") == 0) && (n+1<argc) )
            {
                debug_ok = true;
                from_cycle = atoi(argv[n+1]);
            }
            else if( (strcmp(argv[n],"-TOCYCLE") == 0) && (n+1<argc) )
            {
                to_cycle = atoi(argv[n+1]);
            }
            else
            {
                std::cout << "   Arguments on the command line are (key,value) couples." << std::endl;
                std::cout << "   The order is not important." << std::endl;
                std::cout << "   Accepted arguments are :" << std::endl << std::endl;
                std::cout << "     -SOFT elf_file_name" << std::endl;
                std::cout << "     -DISK disk_image_file_name" << std::endl;
                std::cout << "     -NCYCLES number_of_simulated_cycles" << std::endl;
                std::cout << "     -NPROCS number_of_processors_per_cluster" << std::endl;
                std::cout << "     -XMAX number_of_clusters_in_a_row" << std::endl;
                std::cout << "     -YMAX number_of_clusters_in_a_column" << std::endl;
                std::cout << "     -DEBUG debug_start_cycle" << std::endl;
                std::cout << "     -TOCYCLE debug_end_cycle" << std::endl;
                exit(0);
            }
        }
    }

    std::cout << std::endl << "***********  TSAR ARCHITECTURE  **************" << std::endl
              << " - Interconnect = DSPIN & RING" << std::endl
              << " - Number of clusters = " << xmax << " * " << ymax << std::endl
              << " - Number of processors per cluster = " << nprocs << std::endl
              << "**********************************************" << std::endl
              << std::endl;

    // Define VCI parameters
    typedef soclib::caba::VciParams<cell_width,
                                    plen_width,
                                    address_width,
                                    error_width,                                   
                                    clen_width,
                                    rflag_width,
                                    srcid_width,
                                    pktid_width,
                                    trdid_width,
                                    wrplen_width> vci_param;

    size_t			cluster_io_index;
    size_t			x_width;
    size_t			y_width;

    if      (xmax == 2) x_width = 1;
    else if (xmax <= 4) x_width = 2;
    else if (xmax <= 8) x_width = 3;
    else                x_width = 4;

    if      (ymax == 2) y_width = 1;
    else if (ymax <= 4) y_width = 2;
    else if (ymax <= 8) y_width = 3;
    else                y_width = 4;

    cluster_io_index = 0xBF >> (8 - x_width - y_width);

    /////////////////////
    //  Mapping Tables
    /////////////////////

    // direct network
    MappingTable maptabd(address_width, 
                         IntTab(x_width + y_width, 12 - x_width - y_width), 
                         IntTab(x_width + y_width, srcid_width - x_width - y_width), 
                         0x00F00000);
    for ( size_t x = 0 ; x < xmax ; x++)
    {
        for ( size_t y = 0 ; y < ymax ; y++)
        {
            sc_uint<address_width> offset  = cluster(x,y) << (address_width-x_width-y_width);
            std::ostringstream 	sm;
            sm << "d_seg_memc_" << x << "_" << y;
            maptabd.add(Segment(sm.str(), MEMC_BASE+offset, MEMC_SIZE, IntTab(cluster(x,y),MEMC_TGTID), true));
            std::ostringstream 	si;
            si << "d_seg_xicu_" << x << "_" << y;
            maptabd.add(Segment(si.str(), XICU_BASE+offset, XICU_SIZE, IntTab(cluster(x,y),XICU_TGTID), false));
            if ( cluster(x,y) == cluster_io_index )
            {
            maptabd.add(Segment("d_seg_fbuf", FBUF_BASE, FBUF_SIZE, IntTab(cluster(x,y),FBUF_TGTID), false));
            maptabd.add(Segment("d_seg_bdev", BDEV_BASE, BDEV_SIZE, IntTab(cluster(x,y),BDEV_TGTID), false));
            maptabd.add(Segment("d_seg_mtty", MTTY_BASE, MTTY_SIZE, IntTab(cluster(x,y),MTTY_TGTID), false));
            maptabd.add(Segment("d_seg_brom", BROM_BASE, BROM_SIZE, IntTab(cluster(x,y),BROM_TGTID), true));
            maptabd.add(Segment("d_seg_cdma", CDMA_BASE, CDMA_SIZE, IntTab(cluster(x,y),CDMA_TGTID), false));
            }
        }
    }
    std::cout << maptabd << std::endl;

    // coherence network
    MappingTable maptabc(address_width,
                         IntTab(x_width + y_width, 12 - x_width - y_width),
                         IntTab(x_width + y_width, srcid_width - x_width - y_width),
                         0xF0000000);

    for ( size_t x = 0 ; x < xmax ; x++)
    {
        for ( size_t y = 0 ; y < ymax ; y++)
        {
            sc_uint<address_width> offset  = cluster(x,y) << (address_width-x_width-y_width);

            std::ostringstream sm;
            sm << "c_seg_memc_" << x << "_" << y;
            maptabc.add(Segment(sm.str(), MEMC_BASE+offset, MEMC_SIZE, IntTab(cluster(x,y), nprocs), false));
            // the segment base and size will be modified 
            // when the segmentation of the coherence space will be simplified

            if ( cluster(x,y) == cluster_io_index )
            {
                std::ostringstream sr;
                sr << "c_seg_brom_" << x << "_" << y;
                maptabc.add(Segment(sr.str(), BROM_BASE, BROM_SIZE, IntTab(cluster(x,y), nprocs), false));
            }

            sc_uint<address_width> avoid_collision  = 0;
            for ( size_t p = 0 ; p < nprocs ; p++)
            {
                sc_uint<address_width> base = MEMC_SIZE + (p*0x100000) + offset;
                // the following test is to avoid a collision between the c_seg_brom segment
                // and a c_seg_proc segment (all segments base addresses being multiple of 1Mbytes)
                if ( base == BROM_BASE ) avoid_collision = 0x100000;
                std::ostringstream sp;
                sp << "c_seg_proc_" << x << "_" << y << "_" << p;
                maptabc.add(Segment(sp.str(), base + avoid_collision, 0x20, IntTab(cluster(x,y), p), false,
                                  true, IntTab(cluster(x,y), p)));
                // the two last arguments will be removed 
                // when the segmentation of the coherence space will be simplified
            }
        }
    }
    std::cout << maptabc << std::endl;

    // external network
    MappingTable maptabx(address_width, IntTab(1), IntTab(10), 0xF0000000);
    for ( size_t x = 0 ; x < xmax ; x++)
    {
        for ( size_t y = 0 ; y < ymax ; y++)
        {
            sc_uint<address_width> offset  = cluster(x,y) << (address_width-x_width-y_width);
            std::ostringstream sx;
            sx << "seg_xram_" << x << "_" << y;
            maptabx.add(Segment(sx.str(), MEMC_BASE + offset, MEMC_SIZE, IntTab(0), false));
        }
    }
    std::cout << maptabx << std::endl;

    ////////////////////
    // Signals
    ///////////////////

    sc_clock		signal_clk("clk");
    sc_signal<bool> 	signal_resetn("resetn");
    sc_signal<bool>	signal_false;
   
    // IRQ signals (one signal per proc)

    sc_signal<bool>*** 	signal_proc_it =
        alloc_elems<sc_signal<bool> >("signal_proc_it", xmax, ymax, nprocs);

    sc_signal<bool>*    signal_irq_mtty =
        alloc_elems<sc_signal<bool> >("signal_irq_mtty", N_TTYS);

    sc_signal<bool>     signal_irq_bdev;
    sc_signal<bool>     signal_irq_cdma;

    // Direct VCI signals

    VciSignals<vci_param>*** signal_vci_ini_d_proc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_ini_d_proc", xmax, ymax, nprocs);

    VciSignals<vci_param>** signal_vci_tgt_d_memc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_tgt_d_memc", xmax, ymax);

    VciSignals<vci_param>** signal_vci_tgt_d_xicu = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_tgt_d_xicu", xmax, ymax);

    VciSignals<vci_param> signal_vci_tgt_d_mtty("signal_vci_tgt_d_mtty");
    VciSignals<vci_param> signal_vci_tgt_d_brom("signal_vci_tgt_d_brom");
    VciSignals<vci_param> signal_vci_tgt_d_bdev("signal_vci_tgt_d_bdev");
    VciSignals<vci_param> signal_vci_tgt_d_cdma("signal_vci_tgt_d_cdma");
    VciSignals<vci_param> signal_vci_tgt_d_fbuf("signal_vci_tgt_d_fbuf");

    VciSignals<vci_param> signal_vci_ini_d_bdev("signal_vci_ini_d_bdev");
    VciSignals<vci_param> signal_vci_ini_d_cdma("signal_vci_ini_d_cdma");

    // Coherence VCI signals

    VciSignals<vci_param>*** signal_vci_ini_c_proc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_ini_c_proc", xmax, ymax, nprocs);

    VciSignals<vci_param>*** signal_vci_tgt_c_proc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_tgt_c_proc", xmax, ymax, nprocs);

    VciSignals<vci_param>** signal_vci_ini_c_memc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_ini_c_memc", xmax, ymax);

    VciSignals<vci_param>** signal_vci_tgt_c_memc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_tgt_c_memc", xmax, ymax);

    // DSPIN signals between local ring & global interconnects

    DspinSignals<cmd_width>** signal_dspin_cmd_l2g_d =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_cmd_l2g_d", xmax, ymax);
    DspinSignals<cmd_width>** signal_dspin_cmd_g2l_d =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_cmd_g2l_d", xmax, ymax);

    DspinSignals<cmd_width>** signal_dspin_cmd_l2g_c =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_cmd_l2g_c", xmax, ymax);
    DspinSignals<cmd_width>** signal_dspin_cmd_g2l_c =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_cmd_g2l_c", xmax, ymax);

    DspinSignals<rsp_width>** signal_dspin_rsp_l2g_d =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_rsp_l2g_d", xmax, ymax);
    DspinSignals<rsp_width>** signal_dspin_rsp_g2l_d =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_rsp_g2l_d", xmax, ymax);

    DspinSignals<rsp_width>** signal_dspin_rsp_l2g_c =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_rsp_l2g_c", xmax, ymax);
    DspinSignals<rsp_width>** signal_dspin_rsp_g2l_c =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_rsp_g2l_c", xmax, ymax);

    // Horizontal inter-clusters DSPIN signals
    DspinSignals<cmd_width>*** signal_dspin_h_cmd_inc =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_h_cmd_inc", xmax-1, ymax, 2);
    DspinSignals<cmd_width>*** signal_dspin_h_cmd_dec =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_h_cmd_dec", xmax-1, ymax, 2);
    DspinSignals<rsp_width>*** signal_dspin_h_rsp_inc =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_h_rsp_inc", xmax-1, ymax, 2);
    DspinSignals<rsp_width>*** signal_dspin_h_rsp_dec =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_h_rsp_dec", xmax-1, ymax, 2);

    // Vertical inter-clusters DSPIN signals
    DspinSignals<cmd_width>*** signal_dspin_v_cmd_inc =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_v_cmd_inc", xmax, ymax-1, 2);
    DspinSignals<cmd_width>*** signal_dspin_v_cmd_dec =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_v_cmd_dec", xmax, ymax-1, 2);
    DspinSignals<rsp_width>*** signal_dspin_v_rsp_inc =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_v_rsp_inc", xmax, ymax-1, 2);
    DspinSignals<rsp_width>*** signal_dspin_v_rsp_dec =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_v_rsp_dec", xmax, ymax-1, 2);

    // Mesh boundaries DSPIN signals
    DspinSignals<cmd_width>**** signal_dspin_false_cmd_in =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_false_cmd_in", xmax,  ymax, 2, 2);
    DspinSignals<cmd_width>**** signal_dspin_false_cmd_out =
        alloc_elems<DspinSignals<cmd_width> >("signal_dspin_false_cmd_out", xmax, ymax, 2, 2);
    DspinSignals<rsp_width>**** signal_dspin_false_rsp_in =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_false_rsp_in", xmax, ymax, 2, 2);
    DspinSignals<rsp_width>**** signal_dspin_false_rsp_out =
        alloc_elems<DspinSignals<rsp_width> >("signal_dspin_false_rsp_out", xmax, ymax, 2, 2);

    // Xternal network VCI signals
    VciSignals<vci_param> signal_vci_tgt_x_xram("signal_vci_tgt_x_xram");
    VciSignals<vci_param>** signal_vci_ini_x_memc = 
        alloc_elems<VciSignals<vci_param> >("signal_vci_ini_x_memc", xmax, ymax);

    ////////////////////////////
    //      Components
    ////////////////////////////

    typedef soclib::common::GdbServer<soclib::common::Mips32ElIss> proc_iss;

    soclib::common::Loader loader(soft_name);
    proc_iss::set_loader(loader);

    // External RAM
    VciSimpleRam<vci_param> xram(
        "xram", 
        IntTab(0), 
        maptabx, 
        loader);

    // External network
    VciVgmn<vci_param> xnoc(
        "xnoc",
        maptabx, 
        xmax*ymax,
        1, 
        2, 2);

    // Peripherals : TTY, Frame Buffer, Block Device, DMA & Boot ROM 
    VciSimpleRam<vci_param> brom(
        "brom", 
        IntTab(cluster_io_index, BROM_TGTID), 
        maptabd, 
        loader);

    VciMultiTty<vci_param> mtty(
        "mtty",
        IntTab(cluster_io_index, MTTY_TGTID),
        maptabd,
        "tty0","tty1","tty2","tty3",NULL);

    VciFrameBuffer<vci_param> fbuf(
        "fbuf", 
        IntTab(cluster_io_index, FBUF_TGTID),
        maptabd, 
        FBUF_XSIZE,
        FBUF_YSIZE);

    VciBlockDeviceTsarV2<vci_param> bdev(
        "bdev", 
        maptabd, 
        IntTab(cluster_io_index, BDEV_SRCID),	// SRCID_D
        IntTab(cluster_io_index, BDEV_TGTID),	// TGTID_D
        disk_name,
        SECTOR_SIZE); 

    VciDmaTsarV2<vci_param> cdma(
        "cdma",
        maptabd,
        IntTab(cluster_io_index,CDMA_SRCID),    // SRCID_D
        IntTab(cluster_io_index,CDMA_TGTID),    // TGTID_D
        64);

    // processors (nprocs per cluster)    
    VciCcXCacheWrapperV4<vci_param, proc_iss> *proc[xmax][ymax][nprocs];

    for( size_t x = 0 ; x < xmax ; x++ )
    {
        for( size_t y = 0 ; y < ymax ; y++ )
        {
            for ( size_t p = 0 ; p < nprocs ; p++ ) 
            {
                std::ostringstream sp;
                sp << "proc_" << x << "_" << y << "_" << p;

                proc[x][y][p] = new VciCcXCacheWrapperV4<vci_param, proc_iss>(
                    sp.str().c_str(),
                    p+nprocs*cluster(x,y), 
                    maptabd, maptabc,
                    IntTab(cluster(x,y),PROC_SRCID+p),	// SRCID_D
                    IntTab(cluster(x,y),PROC_SRCID+p),	// SRCID_C
                    IntTab(cluster(x,y),PROC_SRCID+p),	// TGTID_C
                    4,64,16,4,64,16); 			// Icache and Dcache sizes
            }
        }
    }

    //  memory caches (one per cluster)
    VciMemCacheV4<vci_param>* memc[xmax][ymax];

    for( size_t x = 0 ; x < xmax ; x++ )
    {
        for( size_t y = 0 ; y < ymax ; y++ )
        {
            std::ostringstream sm;
            sm << "memc_" << x << "_" << y;
	    memc[x][y] = new VciMemCacheV4<vci_param>(
                sm.str().c_str(),
                maptabd, maptabc, maptabx,
                IntTab(cluster(x,y)),			// SRCID_X
                IntTab(cluster(x,y), nprocs),		// SRCID_C
                IntTab(cluster(x,y),MEMC_TGTID),	// TGTID_D
                IntTab(cluster(x,y), nprocs), 	        // TGTID_C
                16,256,16,				// CACHE SIZE
                4096);					// HEAP SIZE
        }
    }

    // XICU (one per cluster)
    VciXicu<vci_param>* xicu[xmax][ymax];

    for( size_t x = 0 ; x < xmax ; x++ )
    {
        for( size_t y = 0 ; y < ymax ; y++ )
        {
            std::ostringstream si;
            si << "xicu_" << x << "_" << y;
            size_t	nhwi;
            if ( cluster(x,y) == cluster_io_index )	nhwi = 2 + N_TTYS;
            else					nhwi = 0;
	    xicu[x][y] = new VciXicu<vci_param>(
                si.str().c_str(),
                maptabd,
                IntTab(cluster(x,y), XICU_TGTID),	// TGTID_D
               	nprocs,					// number of TIMERS
		nhwi,					// number of hard IRQs
		nprocs,					// number of soft IRQs
		nprocs);				// number of output IRQ lines
        }
    }
                                                
    // Local interconnects : one direct ring & one coherence ring per cluster
    VciLocalRingFast<vci_param,cmd_width,rsp_width>* ringd[xmax][ymax];
    VciLocalRingFast<vci_param,cmd_width,rsp_width>* ringc[xmax][ymax];

    for( size_t x = 0 ; x < xmax ; x++ )
    {
        for( size_t y = 0 ; y < ymax ; y++ )
        {
            std::ostringstream sd;
            sd << "ringd_" << x << "_" << y;
            size_t nb_direct_initiators 	= nprocs;
            size_t nb_direct_targets	        = 2;
            if ( cluster(x,y) == cluster_io_index ) 
            {
                nb_direct_initiators		= nprocs + 2;
                nb_direct_targets		= 7;
            }
	    ringd[x][y] = new VciLocalRingFast<vci_param,cmd_width,rsp_width>(
                sd.str().c_str(),
                maptabd,
                IntTab(cluster(x,y)),			// cluster index
                4,					// wrapper fifo depth
                18,					// gateway fifo depth
                nb_direct_initiators,			// number of initiators
                nb_direct_targets);			// number of targets

            std::ostringstream sc;
            sc << "ringc_" << x << "_" << y;
	    ringc[x][y] = new VciLocalRingFast<vci_param,cmd_width,rsp_width>(
                sc.str().c_str(),
                maptabc,
                IntTab(cluster(x,y)),			// cluster index
                4,					// wrapper fifo depth
                18,					// gateway fifo depth
                nprocs+1,				// number of initiators
		nprocs+1);				// number of targets
        }
    }

    // Distributed Global Interconnect : one cmd router & one rsp router per cluster
    VirtualDspinRouter<cmd_width>* cmdrouter[xmax][ymax];
    VirtualDspinRouter<rsp_width>* rsprouter[xmax][ymax];

    for ( size_t x = 0 ; x < xmax ; x++ )
    {
        for ( size_t y = 0 ; y < ymax ; y++ )
        {
            std::ostringstream scmd;
            scmd << "cmdrouter_" << x << "_" << y;
            cmdrouter[x][y] = new VirtualDspinRouter<cmd_width>(
                scmd.str().c_str(),
                x,y,					// coordinate in the mesh
                x_width, y_width,			// x & y fields width
                4,4);					// input & output fifo depths

            std::ostringstream srsp;
            srsp << "rsprouter_" << x << "_" << y;
            rsprouter[x][y] = new VirtualDspinRouter<rsp_width>(
                srsp.str().c_str(),
                x,y,					// coordinates in mesh
                x_width, y_width,			// x & y fields width
                4,4);					// input & output fifo depths
        }
    }

    ///////////////////////////////////////////////////////////////
    //     Net-list 
    ///////////////////////////////////////////////////////////////

    // External Ram (one instance)
    xram.p_clk						(signal_clk);
    xram.p_resetn					(signal_resetn);
    xram.p_vci						(signal_vci_tgt_x_xram);	

    // External Network (one instance)
    xnoc.p_clk						(signal_clk);
    xnoc.p_resetn					(signal_resetn);
    xnoc.p_to_target[0]					(signal_vci_tgt_x_xram);
    for ( size_t x = 0 ; x < xmax ; x++ )
    {
        for ( size_t y = 0 ; y < ymax ; y++ )
        {
            xnoc.p_to_initiator[cluster(x,y)]           (signal_vci_ini_x_memc[x][y]);
        }
    }

    // Distributed components (in clusters)

    for ( size_t x = 0 ; x < xmax ; x++ )
    {
        for ( size_t y = 0 ; y < ymax ; y++ )
        {
            // cmd DSPIN router
            cmdrouter[x][y]->p_clk			(signal_clk);
            cmdrouter[x][y]->p_resetn			(signal_resetn);
            cmdrouter[x][y]->p_out[0][LOCAL]		(signal_dspin_cmd_g2l_d[x][y]);
            cmdrouter[x][y]->p_out[1][LOCAL]		(signal_dspin_cmd_g2l_c[x][y]);
            cmdrouter[x][y]->p_in[0][LOCAL]		(signal_dspin_cmd_l2g_d[x][y]);
            cmdrouter[x][y]->p_in[1][LOCAL]		(signal_dspin_cmd_l2g_c[x][y]);

            // rsp DSPIN router
            rsprouter[x][y]->p_clk			(signal_clk);
            rsprouter[x][y]->p_resetn			(signal_resetn);
            rsprouter[x][y]->p_out[0][LOCAL]		(signal_dspin_rsp_g2l_d[x][y]);
            rsprouter[x][y]->p_out[1][LOCAL]		(signal_dspin_rsp_g2l_c[x][y]);
            rsprouter[x][y]->p_in[0][LOCAL]		(signal_dspin_rsp_l2g_d[x][y]);
            rsprouter[x][y]->p_in[1][LOCAL]		(signal_dspin_rsp_l2g_c[x][y]);

            // direct ring
            ringd[x][y]->p_clk				(signal_clk);
            ringd[x][y]->p_resetn			(signal_resetn);
            ringd[x][y]->p_gate_cmd_out			(signal_dspin_cmd_l2g_d[x][y]);
	    ringd[x][y]->p_gate_cmd_in 			(signal_dspin_cmd_g2l_d[x][y]);
            ringd[x][y]->p_gate_rsp_out   		(signal_dspin_rsp_l2g_d[x][y]);
	    ringd[x][y]->p_gate_rsp_in 			(signal_dspin_rsp_g2l_d[x][y]);
            ringd[x][y]->p_to_target[MEMC_TGTID]	(signal_vci_tgt_d_memc[x][y]);
            ringd[x][y]->p_to_target[XICU_TGTID]	(signal_vci_tgt_d_xicu[x][y]);
	    for ( size_t p = 0 ; p < nprocs ; p++ )
            {
	        ringd[x][y]->p_to_initiator[p]		(signal_vci_ini_d_proc[x][y][p]);
	    }

            // coherence ring
            ringc[x][y]->p_clk				(signal_clk);
            ringc[x][y]->p_resetn			(signal_resetn);
            ringc[x][y]->p_gate_cmd_out			(signal_dspin_cmd_l2g_c[x][y]);
	    ringc[x][y]->p_gate_cmd_in 			(signal_dspin_cmd_g2l_c[x][y]);
            ringc[x][y]->p_gate_rsp_out   		(signal_dspin_rsp_l2g_c[x][y]);
	    ringc[x][y]->p_gate_rsp_in 			(signal_dspin_rsp_g2l_c[x][y]);
            ringc[x][y]->p_to_initiator[nprocs]		(signal_vci_ini_c_memc[x][y]);
            ringc[x][y]->p_to_target[nprocs]		(signal_vci_tgt_c_memc[x][y]);
            for ( size_t p = 0 ; p < nprocs ; p++ )
            {
                ringc[x][y]->p_to_target[p]		(signal_vci_tgt_c_proc[x][y][p]);
                ringc[x][y]->p_to_initiator[p]		(signal_vci_ini_c_proc[x][y][p]);
            }

	    // Processors
	    for ( size_t p = 0 ; p < nprocs ; p++ )
            {
                proc[x][y][p]->p_clk			(signal_clk);  
                proc[x][y][p]->p_resetn			(signal_resetn);  
                proc[x][y][p]->p_vci_ini_rw		(signal_vci_ini_d_proc[x][y][p]);
                proc[x][y][p]->p_vci_ini_c		(signal_vci_ini_c_proc[x][y][p]);
                proc[x][y][p]->p_vci_tgt		(signal_vci_tgt_c_proc[x][y][p]);
                proc[x][y][p]->p_irq[0]		        (signal_proc_it[x][y][p]);
                for ( size_t j = 1 ; j < 6 ; j++ )
                {
                    proc[x][y][p]->p_irq[j]		(signal_false); 
                }
	    }

            // XICU
	    xicu[x][y]->p_clk				(signal_clk);
            xicu[x][y]->p_resetn			(signal_resetn);
	    xicu[x][y]->p_vci				(signal_vci_tgt_d_xicu[x][y]);
	    for ( size_t p = 0 ; p < nprocs ; p++ )
            {
                xicu[x][y]->p_irq[p]			(signal_proc_it[x][y][p]);
            }

            // MEMC
            memc[x][y]->p_clk				(signal_clk);
            memc[x][y]->p_resetn			(signal_resetn);
            memc[x][y]->p_vci_tgt			(signal_vci_tgt_d_memc[x][y]);	
            memc[x][y]->p_vci_ini			(signal_vci_ini_c_memc[x][y]);
            memc[x][y]->p_vci_tgt_cleanup		(signal_vci_tgt_c_memc[x][y]);
            memc[x][y]->p_vci_ixr			(signal_vci_ini_x_memc[x][y]);

            // I/O peripherals
            if ( cluster(x,y) == cluster_io_index )
            {
	        bdev.p_clk				(signal_clk);
	        bdev.p_resetn				(signal_resetn);
	        bdev.p_irq				(signal_irq_bdev); 
	        bdev.p_vci_target			(signal_vci_tgt_d_bdev);
	        bdev.p_vci_initiator			(signal_vci_ini_d_bdev);

                cdma.p_clk                              (signal_clk);
                cdma.p_resetn                           (signal_resetn);
                cdma.p_irq                              (signal_irq_cdma);
                cdma.p_vci_target                       (signal_vci_tgt_d_cdma);
                cdma.p_vci_initiator                    (signal_vci_ini_d_cdma);

	        fbuf.p_clk				(signal_clk); 
	        fbuf.p_resetn				(signal_resetn); 
	        fbuf.p_vci				(signal_vci_tgt_d_fbuf); 

	        brom.p_clk				(signal_clk);
	        brom.p_resetn				(signal_resetn);
	        brom.p_vci				(signal_vci_tgt_d_brom);

	        mtty.p_clk				(signal_clk);
	        mtty.p_resetn				(signal_resetn);
	        mtty.p_vci				(signal_vci_tgt_d_mtty);
                for(size_t i=0 ; i<N_TTYS ; i++)
                {
                    mtty.p_irq[i]                       (signal_irq_mtty[i]);
                }

                ringd[x][y]->p_to_target[BROM_TGTID]	(signal_vci_tgt_d_brom);
                ringd[x][y]->p_to_target[MTTY_TGTID]	(signal_vci_tgt_d_mtty);
	        ringd[x][y]->p_to_target[BDEV_TGTID]	(signal_vci_tgt_d_bdev);
	        ringd[x][y]->p_to_target[FBUF_TGTID]	(signal_vci_tgt_d_fbuf);
	        ringd[x][y]->p_to_target[CDMA_TGTID]	(signal_vci_tgt_d_cdma);

	        ringd[x][y]->p_to_initiator[BDEV_SRCID]	(signal_vci_ini_d_bdev);
	        ringd[x][y]->p_to_initiator[CDMA_SRCID]	(signal_vci_ini_d_cdma);

                xicu[x][y]->p_hwi[0]			(signal_irq_bdev);
                xicu[x][y]->p_hwi[1]			(signal_irq_cdma);
                for(size_t i=0 ; i<N_TTYS ; i++)
                {
                    xicu[x][y]->p_hwi[2+i]              (signal_irq_mtty[i]);
                }
            }
        } // end for y
    } // end for x

    // Inter Clusters horizontal connections
    for ( size_t x = 0 ; x < (xmax-1) ; x++ )
    {
        for ( size_t y = 0 ; y < ymax ; y++ )
        {
            for ( size_t k = 0 ; k < 2 ; k++ )
            {
                cmdrouter[x][y]->p_out[k][EAST]		(signal_dspin_h_cmd_inc[x][y][k]);		
                cmdrouter[x+1][y]->p_in[k][WEST]		(signal_dspin_h_cmd_inc[x][y][k]);

                cmdrouter[x][y]->p_in[k][EAST]		(signal_dspin_h_cmd_dec[x][y][k]);		
                cmdrouter[x+1][y]->p_out[k][WEST]	(signal_dspin_h_cmd_dec[x][y][k]);

                rsprouter[x][y]->p_out[k][EAST]		(signal_dspin_h_rsp_inc[x][y][k]);		
                rsprouter[x+1][y]->p_in[k][WEST]		(signal_dspin_h_rsp_inc[x][y][k]);

                rsprouter[x][y]->p_in[k][EAST]		(signal_dspin_h_rsp_dec[x][y][k]);		
                rsprouter[x+1][y]->p_out[k][WEST]	(signal_dspin_h_rsp_dec[x][y][k]);
            }
        }
    }

    // Inter Clusters vertical connections
    for ( size_t y = 0 ; y < (ymax-1) ; y++ )
    {
        for ( size_t x = 0 ; x < xmax ; x++ )
        {
            for ( size_t k = 0 ; k < 2 ; k++ )
            {
                cmdrouter[x][y]->p_out[k][NORTH]		(signal_dspin_v_cmd_inc[x][y][k]);		
                cmdrouter[x][y+1]->p_in[k][SOUTH]	(signal_dspin_v_cmd_inc[x][y][k]);

                cmdrouter[x][y]->p_in[k][NORTH]		(signal_dspin_v_cmd_dec[x][y][k]);		
                cmdrouter[x][y+1]->p_out[k][SOUTH]	(signal_dspin_v_cmd_dec[x][y][k]);

                rsprouter[x][y]->p_out[k][NORTH]		(signal_dspin_v_rsp_inc[x][y][k]);		
                rsprouter[x][y+1]->p_in[k][SOUTH]	(signal_dspin_v_rsp_inc[x][y][k]);

                rsprouter[x][y]->p_in[k][NORTH]		(signal_dspin_v_rsp_dec[x][y][k]);		
                rsprouter[x][y+1]->p_out[k][SOUTH]	(signal_dspin_v_rsp_dec[x][y][k]);
            }
        }
    }

    // East & West boundary cluster connections
    for ( size_t y = 0 ; y < ymax ; y++ )
    {
        for ( size_t k = 0 ; k < 2 ; k++ )
        {
            cmdrouter[0][y]->p_in[k][WEST]          	(signal_dspin_false_cmd_in[0][y][k][0]);
            cmdrouter[0][y]->p_out[k][WEST]         	(signal_dspin_false_cmd_out[0][y][k][0]);
            rsprouter[0][y]->p_in[k][WEST]          	(signal_dspin_false_rsp_in[0][y][k][0]);
            rsprouter[0][y]->p_out[k][WEST]         	(signal_dspin_false_rsp_out[0][y][k][0]);

            cmdrouter[xmax-1][y]->p_in[k][EAST]  	(signal_dspin_false_cmd_in[xmax-1][y][k][0]);
            cmdrouter[xmax-1][y]->p_out[k][EAST] 	(signal_dspin_false_cmd_out[xmax-1][y][k][0]);
            rsprouter[xmax-1][y]->p_in[k][EAST]  	(signal_dspin_false_rsp_in[xmax-1][y][k][0]);
            rsprouter[xmax-1][y]->p_out[k][EAST] 	(signal_dspin_false_rsp_out[xmax-1][y][k][0]);
        }
    }

    // North & South boundary clusters connections
    for ( size_t x = 0 ; x < xmax ; x++ )
    {
        for ( size_t k = 0 ; k < 2 ; k++ )
        {
            cmdrouter[x][0]->p_in[k][SOUTH]          	(signal_dspin_false_cmd_in[x][0][k][1]);
            cmdrouter[x][0]->p_out[k][SOUTH]         	(signal_dspin_false_cmd_out[x][0][k][1]);
            rsprouter[x][0]->p_in[k][SOUTH]          	(signal_dspin_false_rsp_in[x][0][k][1]);
            rsprouter[x][0]->p_out[k][SOUTH]         	(signal_dspin_false_rsp_out[x][0][k][1]);

            cmdrouter[x][ymax-1]->p_in[k][NORTH]  	(signal_dspin_false_cmd_in[x][ymax-1][k][1]);
            cmdrouter[x][ymax-1]->p_out[k][NORTH] 	(signal_dspin_false_cmd_out[x][xmax-1][k][1]);
            rsprouter[x][ymax-1]->p_in[k][NORTH]  	(signal_dspin_false_rsp_in[x][ymax-1][k][1]);
            rsprouter[x][ymax-1]->p_out[k][NORTH] 	(signal_dspin_false_rsp_out[x][ymax-1][k][1]);
        }
    }

    ////////////////////////////////////////////////////////
    //   Simulation
    ///////////////////////////////////////////////////////

    sc_start(sc_core::sc_time(0, SC_NS));
    signal_resetn = false;

    // network boundaries signals
    for(size_t x=0; x<xmax ; x++)
    {
        for(size_t y=0 ; y<ymax ; y++)
        {
            for (size_t k=0; k<2; k++)
            {
                for(size_t a=0; a<2; a++)
                {
                        signal_dspin_false_cmd_in[x][y][k][a].write = false;
                        signal_dspin_false_cmd_in[x][y][k][a].read = true;
                        signal_dspin_false_cmd_out[x][y][k][a].write = false;
                        signal_dspin_false_cmd_out[x][y][k][a].read = true;

                        signal_dspin_false_rsp_in[x][y][k][a].write = false;
                        signal_dspin_false_rsp_in[x][y][k][a].read = true;
                        signal_dspin_false_rsp_out[x][y][k][a].write = false;
                        signal_dspin_false_rsp_out[x][y][k][a].read = true;
               }
            }
        }
    }


    sc_start(sc_core::sc_time(1, SC_NS));
    signal_resetn = true;

    for(size_t i=1 ; i<ncycles ; i++)
    {
        sc_start(sc_core::sc_time(1, SC_NS));

        if( debug_ok && (i > from_cycle) && (i < to_cycle) )
        {
        std::cout << std::dec << "*************** cycle " << i 
                  << "    *******************************************************" << std::endl;
        proc[0][0][0]->print_trace();
        proc[0][1][0]->print_trace();
        proc[1][0][0]->print_trace();
        proc[1][1][0]->print_trace();
        std::cout << std::endl;
        ringd[0][0]->print_trace();
        ringd[0][1]->print_trace();
        ringd[1][0]->print_trace();
        ringd[1][1]->print_trace();
        ringc[0][0]->print_trace();
        ringc[0][1]->print_trace();
        ringc[1][0]->print_trace();
        ringc[1][1]->print_trace();
        std::cout << std::endl;
        cmdrouter[0][0]->print_trace(0);
        cmdrouter[0][1]->print_trace(0);
        cmdrouter[1][0]->print_trace(0);
        cmdrouter[1][1]->print_trace(0);
        std::cout << std::endl;
        cmdrouter[0][0]->print_trace(1);
        cmdrouter[0][1]->print_trace(1);
        cmdrouter[1][0]->print_trace(1);
        cmdrouter[1][1]->print_trace(1);
        std::cout << std::endl;
        memc[0][0]->print_trace();
        memc[0][1]->print_trace();
        memc[1][0]->print_trace();
        memc[1][1]->print_trace();
//        if ( i%5 == 0) getchar();
        }
    }

    std::cout << "Hit ENTER to end simulation" << std::endl;
    char buf[1];
    std::cin.getline(buf,1);

    return EXIT_SUCCESS;
}

/////////////////////////////////////////////////////////////////////////
// sc_main : thin wrapper around _main().
// - Returns the value produced by _main() when it completes normally.
// - If a std::exception escapes from _main(), its what() message is
//   printed on std::cout and 1 is returned.
// - Any other exception is reported on std::cout and then re-thrown
//   to the caller.
/////////////////////////////////////////////////////////////////////////
int sc_main (int argc, char *argv[])
{
    // Failure status by default: only overwritten when _main() returns.
    int status = 1;

    try
    {
        status = _main(argc, argv);
    }
    catch (const std::exception &error)
    {
        std::cout << error.what() << std::endl;
    }
    catch (...)
    {
        std::cout << "Unknown exception occured" << std::endl;
        throw;	// unknown failure: let it propagate after logging
    }

    return status;
}
