Changeset 502 for soft


Ignore:
Timestamp:
Feb 8, 2015, 9:20:45 PM (10 years ago)
Author:
alain
Message:

1) Introduce distributed barriers in the multi-threaded applications
(classif, transpose, convol, sort, gameoflife).

2) Introducing support for architectures containing empty clusters
in the mapping of these multi-threaded applications.

3) Removing the "command line arguments" in the sort application
(replaced by the giet_procs_number() system call).

Location:
soft/giet_vm/applications
Files:
1 added
10 edited

Legend:

Unmodified
Added
Removed
  • soft/giet_vm/applications/classif/classif.py

    r488 r502  
    33from mapping import *
    44
    5 ######################################################################################
     5###################################################################################
    66#   file   : classif.py 
    77#   date   : november 2014
    88#   author : Alain Greiner
    9 #######################################################################################
     9###################################################################################
    1010#  This file describes the mapping of the multi-threaded "classif"
    1111#  application on a multi-clusters, multi-processors architecture.
    1212#  The mapping of tasks on processors is the following:
    13 #  - one "load" task per cluster,
    14 #  - one "store" task per cluster,
    15 #  - (nprocs-2) "analyse" task per cluster.
    16 #  The mapping of virtual segments on the clusters is the following:
    17 #    - The code vsegs are replicated on all clusters.
     13#    - one "load" task per cluster containing processors,
     14#    - one "store" task per cluster containing processors,
     15#    - (nprocs-2) "analyse" task per cluster containing processors.
     16#  The mapping of virtual segments is the following:
    1817#    - There is one shared data vseg in cluster[0][0]
    19 #    - There is one heap vseg per cluster.
    20 #    - The stacks vsegs are distibuted on all clusters.
     18#    - The code vsegs are replicated on all clusters containing processors.
     19#    - There is one heap vseg per cluster containing processors.
     20#    - The stacks vsegs are distibuted on all clusters containing processors.
    2121#  This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
    2222#    - x_size    : number of clusters in a row
     
    2828#  WARNING: The target architecture cannot contain less
    2929#           than 3 processors per cluster.
    30 ####################################################################################
     30##################################################################################
    3131
    3232#########################
     
    4949
    5050    heap_base  = 0x30000000
    51     heap_size  = 0x00008000     # 32 Kbytes (per cluster)     
     51    heap_size  = 0x00040000     # 256 Kbytes (per cluster)     
    5252
    5353    stack_base = 0x40000000
     
    6363                     local = False )
    6464
    65     # heap_x_y vsegs : shared / one per cluster
     65    # heap vsegs : shared (one per cluster)
    6666    for x in xrange (x_size):
    6767        for y in xrange (y_size):
    68             base = heap_base + ( (4*x + y) * heap_size )
     68            cluster_id = (x * y_size) + y
     69            if ( mapping.clusters[cluster_id].procs ):
     70                size  = heap_size
     71                base  = heap_base + (cluster_id * size)
    6972
    70             mapping.addVseg( vspace, 'classif_heap_%d_%d' %(x,y), base , heap_size,
    71                              'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
    72                              local = False )
     73                mapping.addVseg( vspace, 'classif_heap_%d_%d' %(x,y), base , size,
     74                                 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
     75                                 local = False )
    7376
    7477    # code vsegs : local (one copy in each cluster)
    7578    for x in xrange (x_size):
    7679        for y in xrange (y_size):
     80            cluster_id = (x * y_size) + y
     81            if ( mapping.clusters[cluster_id].procs ):
    7782
    78             mapping.addVseg( vspace, 'classif_code_%d_%d' %(x,y), code_base , code_size,
    79                              'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
    80                              binpath = 'build/classif/classif.elf',
    81                              local = True )
     83                mapping.addVseg( vspace, 'classif_code_%d_%d' %(x,y),
     84                                 code_base , code_size,
     85                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
     86                                 binpath = 'build/classif/classif.elf',
     87                                 local = True )
    8288
    83     # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)           
     89    # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
    8490    for x in xrange (x_size):
    8591        for y in xrange (y_size):
    86             for p in xrange( nprocs ):
    87                 proc_id = (((x * y_size) + y) * nprocs) + p
    88                 size    = (stack_size / nprocs) & 0xFFFFF000
    89                 base    = stack_base + (proc_id * size)
     92            cluster_id = (x * y_size) + y
     93            if ( mapping.clusters[cluster_id].procs ):
     94                for p in xrange( nprocs ):
     95                    proc_id = (((x * y_size) + y) * nprocs) + p
     96                    size    = (stack_size / nprocs) & 0xFFFFF000
     97                    base    = stack_base + (proc_id * size)
    9098
    91                 mapping.addVseg( vspace, 'classif_stack_%d_%d_%d' % (x,y,p), base, size,
    92                                  'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
    93                                  local = True, big = True )
     99                    mapping.addVseg( vspace, 'classif_stack_%d_%d_%d' % (x,y,p),
     100                                     base, size, 'C_WU', vtype = 'BUFFER',
     101                                     x = x , y = y , pseg = 'RAM',
     102                                     local = True, big = True )
    94103
    95104    # distributed tasks / one task per processor
    96105    for x in xrange (x_size):
    97106        for y in xrange (y_size):
    98             for p in xrange( nprocs ):
    99                 trdid = (((x * y_size) + y) * nprocs) + p
    100                 if  ( p== 0 ):                              # task load
    101                     task_index = 0
    102                     task_name  = 'load_%d_%d_%d' %(x,y,p)           
    103                 elif  ( p== 1 ):                            # task store
    104                     task_index = 1
    105                     task_name  = 'store_%d_%d_%d' %(x,y,p)           
    106                 else :                                      # task analyse
    107                     task_index = 2
    108                     task_name  = 'analyse_%d_%d_%d' % (x,y,p)
     107            cluster_id = (x * y_size) + y
     108            if ( mapping.clusters[cluster_id].procs ):
     109                for p in xrange( nprocs ):
     110                    trdid = (((x * y_size) + y) * nprocs) + p
     111                    if  ( p== 0 ):                              # task load
     112                        task_index = 0
     113                        task_name  = 'load_%d_%d_%d' %(x,y,p)           
     114                    elif  ( p== 1 ):                            # task store
     115                        task_index = 1
     116                        task_name  = 'store_%d_%d_%d' %(x,y,p)           
     117                    else :                                      # task analyse
     118                        task_index = 2
     119                        task_name  = 'analyse_%d_%d_%d' % (x,y,p)
    109120
    110                 mapping.addTask( vspace, task_name, trdid, x, y, p,
    111                                  'classif_stack_%d_%d_%d' % (x,y,p),
    112                                  'classif_heap_%d_%d' % (x,y),
    113                                  task_index )
     121                    mapping.addTask( vspace, task_name, trdid, x, y, p,
     122                                     'classif_stack_%d_%d_%d' % (x,y,p),
     123                                     'classif_heap_%d_%d' % (x,y),
     124                                     task_index )
    114125
    115126    # extend mapping name
     
    118129    return vspace  # useful for test
    119130           
    120 ################################ test ######################################################
     131################################ test ################################################
    121132
    122133if __name__ == '__main__':
  • soft/giet_vm/applications/classif/main.c

    r488 r502  
    1 /////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for classif application)
    33// Date   : november 2014
    44// author : Alain Greiner
    5 /////////////////////////////////////////////////////////////////////////////////////////
     5///////////////////////////////////////////////////////////////////////////////////////
    66// This multi-threaded application takes a stream of Gigabit Ethernet packets,
    77// and makes packet analysis and classification, based on the source MAC address.
     
    99// component to receive and send packets on the Gigabit Ethernet port.
    1010//
    11 // This application is described as a TCG (Task and Communication Graph) containing
    12 // (N+2) tasks per cluster:
     11// It can run on architectures containing up to 16 * 16 clusters,
     12// and up to 8 processors per cluster.
     13//
     14// This application is described as a TCG (Task and Communication Graph)
     15// containing (N+2) tasks per cluster:
    1316// - one "load" task
     17// - one "store" task
    1418// - N "analyse" tasks
    15 // - one "store" task
    16 // The 4 Kbytes containers are diributed (N+2 containers per cluster):
     19// The containers are distributed (N+2 containers per cluster):
    1720// - one RX container (part of the kernel rx_chbuf), in the kernel heap.
    1821// - one TX container (part of the kernel tx-chbuf), in the kernel heap.
     
    3033// The MWMR fifo descriptors array is defined as a global variable in cluster[0][0].
    3134//
    32 // Initialisation is done in two steps by the "load" tasks:
    33 // - Task "load" in cluster[0][0] initialises NIC & CMA channel, and initialises
    34 //   the barrier between all "load" tasks. Other "load" tasks are waiting on the
    35 //   global_sync synchronisation variable.
    36 // - In each cluster[x][y], the "load" task allocates the working containers
    37 //   and the MWMR fifos descriptors in the local heap.
    38 //   The "analyse" tasks are waiting on the sync[x][y] variables.
     35// Initialisation is done in two steps by the "load" & "store" tasks:
     36// - Task "load" in cluster[0][0] initialises the barrier between all "load" tasks,
     37//   allocates NIC & CMA RX channel, and starts the NIC_CMA RX transfer.
     38//   Other "load" tasks are waiting on the load_sync synchronisation variable.
     39//   Task "store" in cluster[0][0] initialises the barrier between all "store" tasks,
     40//   allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer.
     41//   Other "store" tasks are waiting on the store_sync synchronisation variable.
     42// - When this global initialisation is completed, the "load" task in all clusters
     43//   allocates the working containers and the MWMR fifos descriptors from the
     44//   user local heap. In each cluster, the "analyse" and "store" tasks are waiting
     45//   the local initialisation completion on the local_sync[x][y] variables.
    3946//
    40 // Instrumentation results display is done by the "store" task in cluster[0][0]
    41 // when all "store" tasks completed the number of clusters specified by the
    42 // CONTAINERS_MAX parameter.
    43 //     
    4447// When initialisation is completed, all tasks loop on containers:
    4548// 1) The "load" task get an empty working container from the fifo_s2l,
     
    4750//    and transfer ownership of this container to one "analysis" task by writing
    4851//    into the fifo_l2a.   
    49 //
    5052// 2) The "analyse" task get one working container from the fifo_l2a, analyse
    5153//    each packet header, compute the packet type (depending on the SRC MAC address),
    5254//    increment the correspondint classification counter, and transpose the SRC
    5355//    and the DST MAC addresses fot TX tranmission.
    54 //
    5556// 3) The "store" task transfer get a full working container from the fifo_a2s,
    5657//    transfer this user container content to the the kernel tx_chbuf,
    5758//    and transfer ownership of this empty container to the "load" task by writing
    5859//    into the fifo_s2l.   
    59 //
    60 // This application uses the following hardware parameters (hard_config.h file):
    61 // - X_SIZE       : number of clusters in a row
    62 // - Y_SIZE       : number of clusters in a column
    63 // - NB_PROCS_MAX : number of processors per cluster
    64 /////////////////////////////////////////////////////////////////////////////////////////
     60//     
     61// Instrumentation results display is done by the "store" task in cluster[0][0]
     62// when all "store" tasks completed the number of clusters specified by the
     63// CONTAINERS_MAX parameter.
     64///////////////////////////////////////////////////////////////////////////////////////
    6565
    6666#include "stdio.h"
    67 #include "barrier.h"
     67#include "user_barrier.h"
    6868#include "malloc.h"
    6969#include "user_lock.h"
    7070#include "mwmr_channel.h"
    71 #include "hard_config.h"
    72 
    73 #define CONTAINERS_MAX  5
    74 #define VERBOSE_ANALYSE 1
    75 #define ANALYSIS_TASKS  (NB_PROCS_MAX - 2)
    76 
    77 /////////////////////////////////////////////////////////////////////////////////////////
     71
     72#define X_SIZE_MAX      16
     73#define Y_SIZE_MAX      16
     74#define NPROCS_MAX      8
     75#define CONTAINERS_MAX  500
     76#define VERBOSE_ANALYSE 0
     77
     78///////////////////////////////////////////////////////////////////////////////////////
    7879//    Global variables
    7980// The MWMR channels (descriptors and buffers), as well as the working containers
     
    8182// But the pointers on these distributed structures are shared arrays
    8283// stored in cluster[0][0].
    83 /////////////////////////////////////////////////////////////////////////////////////////
    84 
    85 // pointers on distributed temp[x][y][n] containers
    86 unsigned int*       container[X_SIZE][Y_SIZE][ANALYSIS_TASKS]; 
     84///////////////////////////////////////////////////////////////////////////////////////
     85
     86// pointers on distributed containers
     87unsigned int*       container[X_SIZE_MAX][Y_SIZE_MAX][NPROCS_MAX-2]; 
    8788
    8889// pointers on distributed mwmr fifos containing : temp[x][y][l] container descriptors
    89 mwmr_channel_t*     mwmr_l2a[X_SIZE][Y_SIZE]; 
    90 mwmr_channel_t*     mwmr_a2s[X_SIZE][Y_SIZE];
    91 mwmr_channel_t*     mwmr_s2l[X_SIZE][Y_SIZE];
     90mwmr_channel_t*     mwmr_l2a[X_SIZE_MAX][Y_SIZE_MAX]; 
     91mwmr_channel_t*     mwmr_a2s[X_SIZE_MAX][Y_SIZE_MAX];
     92mwmr_channel_t*     mwmr_s2l[X_SIZE_MAX][Y_SIZE_MAX];
    9293
    9394// local synchros signaling local MWMR fifos initialisation completion
    94 unsigned int        local_sync[X_SIZE][Y_SIZE]; 
     95volatile unsigned int        local_sync[X_SIZE_MAX][Y_SIZE_MAX]; 
    9596
    9697// global synchro signaling global initialisation completion
    97 unsigned int        load_sync  = 0;
    98 unsigned int        store_sync = 0;
     98volatile unsigned int        load_sync  = 0;
     99volatile unsigned int        store_sync = 0;
    99100
    100101// instrumentation counters
    101102unsigned int        counter[16];
    102103
    103 // distributed barriers (between "load" and "store" tasks)
    104 giet_sbt_barrier_t  rx_barrier;
    105 giet_sbt_barrier_t  tx_barrier;
     104// distributed barrier between "load" tasks
     105giet_sqt_barrier_t  rx_barrier;
     106
     107// distributed barrier between "store" tasks
     108giet_sqt_barrier_t  tx_barrier;
    106109
    107110// NIC_RX and NIC_TX channel index
     
    113116/////////////////////////////////////////
    114117{
     118    // each "load" task get platform parameters
     119    unsigned int    x_size;                                             // number of clusters in a row
     120    unsigned int    y_size;                     // number of clusters in a column
     121    unsigned int    nprocs;                     // number of processors per cluster
     122    giet_procs_number( &x_size, &y_size, &nprocs );
     123
     124    giet_assert( (x_size <= X_SIZE_MAX) &&
     125                 (y_size <= Y_SIZE_MAX) &&
     126                 (nprocs <= NPROCS_MAX) ,
     127                 "[CLASSIF ERROR] illegal platform parameters" );
     128
    115129    // each "load" task get processor identifiers
    116130    unsigned int    x;
     
    119133    giet_proc_xyp( &x, &y, &l );
    120134
    121     // "load" task[0][0] initialises barrier between load tasks,
     135    // "load" task[0][0] initialises barrier between all load tasks,
    122136    // allocates the NIC & CMA RX channels, and start the NIC_CMA RX transfer.
    123137    // Other "load" tasks wait completion
    124138    if ( (x==0) && (y==0) )
    125139    {
    126         giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n",
    127                         x , y , l , giet_proctime() );
     140        giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n"
     141                        "  x_size = %d / y_size = %d / nprocs = %d\n",
     142                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
    128143 
    129         sbt_barrier_init( &rx_barrier, X_SIZE*Y_SIZE , 1 );
    130         nic_rx_channel = giet_nic_rx_alloc();
     144        sqt_barrier_init( &rx_barrier, x_size , y_size , 1 );
     145        nic_rx_channel = giet_nic_rx_alloc( x_size , y_size );
    131146        giet_nic_rx_start( nic_rx_channel );
    132147        load_sync = 1;
     
    137152    }   
    138153
    139     // all load tasks allocate containers[x][y][n] (from local heap)
     154    // each load tasks allocates containers[x][y][n] (from local heap)
    140155    // and register pointers in the local stack
    141156    unsigned int   n;
    142     unsigned int*  cont[ANALYSIS_TASKS];
    143 
    144     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
     157    unsigned int*  cont[NPROCS_MAX-2];
     158    unsigned int   analysis_tasks = nprocs-2;
     159
     160    for ( n = 0 ; n < analysis_tasks ; n++ )
    145161    {
    146162        container[x][y][n] = malloc( 4096 );
     
    148164    }
    149165   
    150     // all load tasks allocate data buffers for mwmr fifos (from local heap)
    151     unsigned int*  data_l2a = malloc( ANALYSIS_TASKS<<2 );
    152     unsigned int*  data_a2s = malloc( ANALYSIS_TASKS<<2 );
    153     unsigned int*  data_s2l = malloc( ANALYSIS_TASKS<<2 );
    154 
    155     // all load tasks allocate mwmr fifos descriptors (from local heap)
     166    // each load task allocates data buffers for mwmr fifos (from local heap)
     167    unsigned int*  data_l2a = malloc( analysis_tasks<<2 );
     168    unsigned int*  data_a2s = malloc( analysis_tasks<<2 );
     169    unsigned int*  data_s2l = malloc( analysis_tasks<<2 );
     170
     171    // each load task allocates mwmr fifos descriptors (from local heap)
    156172    mwmr_l2a[x][y] = malloc( sizeof(mwmr_channel_t) );
    157173    mwmr_a2s[x][y] = malloc( sizeof(mwmr_channel_t) );
    158174    mwmr_s2l[x][y] = malloc( sizeof(mwmr_channel_t) );
    159175
    160     // all "load" tasks register local pointers on mwmr fifos in local stack
     176    // each load task registers local pointers on mwmr fifos in local stack
    161177    mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y];
    162178    mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y];
    163179    mwmr_channel_t* fifo_s2l = mwmr_s2l[x][y];
    164180
    165     // all "load" tasks initialise local mwmr fifos descriptors
     181    // each load task initialises local mwmr fifos descriptors
    166182    // ( width = 4 bytes / depth = number of analysis tasks )
    167     mwmr_init( fifo_l2a , data_l2a , 1 , ANALYSIS_TASKS );
    168     mwmr_init( fifo_a2s , data_a2s , 1 , ANALYSIS_TASKS );
    169     mwmr_init( fifo_s2l , data_s2l , 1 , ANALYSIS_TASKS );
     183    mwmr_init( fifo_l2a , data_l2a , 1 , analysis_tasks );
     184    mwmr_init( fifo_a2s , data_a2s , 1 , analysis_tasks );
     185    mwmr_init( fifo_s2l , data_s2l , 1 , analysis_tasks );
    170186
    171187   
    172     // all "load" tasks initialise local containers as empty in fifo_s2l
    173     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ ) mwmr_write( fifo_s2l , &n , 1 );
    174 
    175     // each "load" task[x][y] signals mwmr fifos initialisation completion
     188    // each load task initialises local containers as empty in fifo_s2l
     189    for ( n = 0 ; n < analysis_tasks ; n++ ) mwmr_write( fifo_s2l , &n , 1 );
     190
     191    // each load task[x][y] signals mwmr fifos initialisation completion
    176192    // to other tasks in same cluster[x][y]
    177193    local_sync[x][y] = 1;
    178194
    179     // "load" task[0][0] displays status
     195    // load task[0][0] displays status
    180196    if ( (x==0) && (y==0) )
    181197    giet_shr_printf("\n*** Task load on P[%d,%d,%d] enters main loop at cycle %d\n"
     
    192208                    (unsigned int)fifo_s2l, (unsigned int)data_s2l,
    193209                    (unsigned int)cont[0],
    194                     X_SIZE, Y_SIZE, NB_PROCS_MAX );
     210                    x_size, y_size, nprocs );
    195211 
    196212    /////////////////////////////////////////////////////////////
    197     // All "load" tasks enter the main loop (on containers)
    198     unsigned int count = 0;     // loaded containers count
    199     unsigned int index;         // available container index
    200     unsigned int* temp;         // pointer on available container
     213    // All load tasks enter the main loop (on containers)
     214    unsigned int  count = 0;     // loaded containers count
     215    unsigned int  index;         // available container index
     216    unsigned int* temp;          // pointer on available container
    201217
    202218    while ( count < CONTAINERS_MAX )
    203219    {
    204         // get one empty count index from fifo_s2l
     220        // get one empty container index from fifo_s2l
    205221        mwmr_read( fifo_s2l , &index , 1 );
    206222        temp = cont[index];
    207223
    208         // get one count from  kernel rx_chbuf
     224        // get one container from  kernel rx_chbuf
    209225        giet_nic_rx_move( nic_rx_channel, temp );
    210226
     
    213229        unsigned int nwords   = temp[0] >> 16;
    214230
    215         if ( (x==X_SIZE-1) && (y==Y_SIZE-1) )
     231        if ( (x==0) && (y==0) )
    216232        giet_shr_printf("\n*** Task load on P[%d,%d,%d] get container %d at cycle %d"
    217233                        " : %d packets / %d words\n",
    218234                        x, y, l, count, giet_proctime(), npackets, nwords );
    219235
    220         // put the full count index to fifo_l2a
     236        // put the full container index to fifo_l2a
    221237        mwmr_write( fifo_l2a, &index , 1 );
    222238
     
    225241
    226242    // all "load" tasks synchronise before stats
    227     sbt_barrier_wait( &rx_barrier );
     243    sqt_barrier_wait( &rx_barrier );
    228244
    229245    // "load" task[0][0] stops the NIC_CMA RX transfer and displays stats
     
    244260//////////////////////////////////////////
    245261{
     262    // each "load" task get platform parameters
     263    unsigned int    x_size;                                             // number of clusters in row
     264    unsigned int    y_size;                     // number of clusters in a column
     265    unsigned int    nprocs;                     // number of processors per cluster
     266    giet_procs_number( &x_size, &y_size, &nprocs );
     267
    246268    // get processor identifiers
    247269    unsigned int    x;
     
    250272    giet_proc_xyp( &x, &y, &l );
    251273
    252 
    253274    // "store" task[0][0] initialises the barrier between all "store" tasks,
    254275    // allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer.
     
    256277    if ( (x==0) && (y==0) )
    257278    {
    258         giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n",
    259                         x , y , l , giet_proctime() );
     279        giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n"
     280                        "  x_size = %d / y_size = %d / nprocs = %d\n",
     281                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
    260282 
    261         sbt_barrier_init( &tx_barrier , X_SIZE*Y_SIZE , 1 );
    262         nic_tx_channel = giet_nic_tx_alloc();
     283        sqt_barrier_init( &tx_barrier , x_size , y_size , 1 );
     284        nic_tx_channel = giet_nic_tx_alloc( x_size , y_size );
    263285        giet_nic_tx_start( nic_tx_channel );
    264286        store_sync = 1;
     
    272294    while ( local_sync[x][y] == 0 ) asm volatile ("nop");
    273295
    274     // all "store" tasks register pointers on working containers in local stack
     296    // each "store" tasks register pointers on working containers in local stack
    275297    unsigned int   n;
    276     unsigned int*  cont[ANALYSIS_TASKS];
    277     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
     298    unsigned int   analysis_tasks = nprocs-2;
     299    unsigned int*  cont[NPROCS_MAX-2];
     300
     301    for ( n = 0 ; n < analysis_tasks ; n++ )
    278302    {
    279303        cont[n] = container[x][y][n];
     
    318342        unsigned int nwords   = temp[0] >> 16;
    319343
    320         if ( (x==X_SIZE-1) && (y==Y_SIZE-1) )
     344        if ( (x==0) && (y==0) )
    321345        giet_shr_printf("\n*** Task store on P[%d,%d,%d] get container %d at cycle %d"
    322346                        " : %d packets / %d words\n",
     
    330354
    331355    // all "store" tasks synchronise before result display
    332     sbt_barrier_wait( &tx_barrier );
     356    sqt_barrier_wait( &tx_barrier );
    333357
    334358    // "store" task[0,0] stops NIC_CMA TX transfer and displays results
     
    377401////////////////////////////////////////////
    378402{
     403    // each "load" task get platform parameters
     404    unsigned int    x_size;                                             // number of clusters in row
     405    unsigned int    y_size;                     // number of clusters in a column
     406    unsigned int    nprocs;                     // number of processors per cluster
     407    giet_procs_number( &x_size, &y_size, &nprocs );
     408
    379409    // get processor identifiers
    380410    unsigned int    x;
     
    385415    if ( (x==0) && (y==0) )
    386416    {
    387         giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n",
    388                         x , y , l , giet_proctime() );
     417        giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n"
     418                        "  x_size = %d / y_size = %d / nprocs = %d\n",
     419                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
    389420    }
    390421 
     
    394425    // all "analyse" tasks register pointers on working containers in local stack
    395426    unsigned int   n;
    396     unsigned int*  cont[ANALYSIS_TASKS];
    397     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
     427    unsigned int   analysis_tasks = nprocs-2;
     428    unsigned int*  cont[NPROCS_MAX-2];
     429    for ( n = 0 ; n < analysis_tasks ; n++ )
    398430    {
    399431        cont[n] = container[x][y][n];
     
    471503            unsigned int word2 = temp[first + 2];
    472504
     505#if VERBOSE_ANALYSE
    473506            unsigned long long dst = ((unsigned long long)(word1 & 0xFFFF0000)>>16) |
    474507                                     (((unsigned long long)word0)<<16);
    475508            unsigned long long src = ((unsigned long long)(word1 & 0x0000FFFF)<<32) |
    476509                                     ((unsigned long long)word2);
    477 #if VERBOSE_ANALYSE
    478510            if ( p < 10 )
    479511            {
  • soft/giet_vm/applications/convol/convol.py

    r457 r502  
    1212#  This include both the mapping of virtual segments on the clusters,
    1313#  and the mapping of tasks on processors.
     14#  There is one task per processor.
     15#  The mapping of virtual segments is the following:
     16#    - There is one shared data vseg in cluster[0][0]
     17#    - The code vsegs are replicated on all clusters containing processors.
     18#    - There is one heap vseg per cluster containing processors.
     19#    - The stacks vsegs are distibuted on all clusters containing processors.
    1420#  This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
    15 - x_size    : number of clusters in a row
    16 - y_size    : number of clusters in a column
    17 - x_width   : number of bits coding x coordinate
    18 - y_width   : number of bits coding y coordinate
    19 - nprocs : number of processors per cluster
     21  - x_size    : number of clusters in a row
     22  - y_size    : number of clusters in a column
     23  - x_width   : number of bits coding x coordinate
     24  - y_width   : number of bits coding y coordinate
     25  - nprocs : number of processors per cluster
    2026####################################################################################
    2127
     
    4652   
    4753    # data vseg in cluster[0,0] : non local
    48     mapping.addVseg( vspace, 'conv_data', data_base , data_size, 'C_WU', vtype = 'ELF',
    49                      x = 0, y = 0, pseg = 'RAM', binpath = 'build/convol/convol.elf',
     54    mapping.addVseg( vspace, 'conv_data', data_base , data_size,
     55                     'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM',
     56                     binpath = 'build/convol/convol.elf',
    5057                     local = False )
    5158
     
    5360    for x in xrange (x_size):
    5461        for y in xrange (y_size):
    55             size       = code_size
    56             base       = code_base
    57             mapping.addVseg( vspace, 'conv_code_%d_%d' % (x,y), base, size,
    58                              'CXWU', vtype = 'ELF', x = x , y = y , pseg = 'RAM',
    59                              binpath = 'build/convol/convol.elf',
    60                              local = True )
     62            cluster_id = (x * y_size) + y
     63            if ( mapping.clusters[cluster_id].procs ):
     64                size       = code_size
     65                base       = code_base
     66
     67                mapping.addVseg( vspace, 'conv_code_%d_%d' % (x,y), base, size,
     68                                 'CXWU', vtype = 'ELF', x = x , y = y , pseg = 'RAM',
     69                                 binpath = 'build/convol/convol.elf',
     70                                 local = True )
    6171
    6272    # stack vsegs : local (one stack per processor)
    6373    for x in xrange (x_size):
    6474        for y in xrange (y_size):
    65             for p in xrange( nprocs ):
    66                 proc_id = (((x * y_size) + y) * nprocs) + p
    67                 size    = (stack_size / nprocs) & 0xFFFFF000
    68                 base    = stack_base + (proc_id * size)
    69                 mapping.addVseg( vspace, 'conv_stack_%d_%d_%d' % (x,y,p), base, size,
    70                                  'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
    71                                  local = True, big = True )
     75            cluster_id = (x * y_size) + y
     76            if ( mapping.clusters[cluster_id].procs ):
     77                for p in xrange( nprocs ):
     78                    proc_id = (((x * y_size) + y) * nprocs) + p
     79                    size    = (stack_size / nprocs) & 0xFFFFF000
     80                    base    = stack_base + (proc_id * size)
     81
     82                    mapping.addVseg( vspace, 'conv_stack_%d_%d_%d' % (x,y,p),
     83                                     base, size, 'C_WU', vtype = 'BUFFER',
     84                                     x = x , y = y , pseg = 'RAM',
     85                                     local = True, big = True )
    7286           
    73     # heap vsegs : distributed but non local (all heap vsegs can be accessed by all tasks)
     87    # heap vsegs : distributed but non local (any heap can be accessed by any task)
    7488    for x in xrange (x_size):
    7589        for y in xrange (y_size):
    7690            cluster_id = (x * y_size) + y
    77             size       = heap_size
    78             base       = heap_base + (cluster_id * size)
    79             mapping.addVseg( vspace, 'conv_heap_%d_%d' % (x,y), base, size,
    80                              'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
    81                              local = False, big = True )
     91            if ( mapping.clusters[cluster_id].procs ):
     92                size       = heap_size
     93                base       = heap_base + (cluster_id * size)
     94
     95                mapping.addVseg( vspace, 'conv_heap_%d_%d' % (x,y), base, size,
     96                                 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
     97                                 local = False, big = True )
    8298
    8399    # distributed tasks : one task per processor
    84100    for x in xrange (x_size):
    85101        for y in xrange (y_size):
    86             for p in xrange( nprocs ):
    87                 trdid = (((x * y_size) + y) * nprocs) + p
    88                 mapping.addTask( vspace, 'conv_%d_%d_%d' % (x,y,p), trdid, x, y, p,
    89                                  'conv_stack_%d_%d_%d' % (x,y,p),
    90                                  'conv_heap_%d_%d' % (x,y), 0 )
     102            cluster_id = (x * y_size) + y
     103            if ( mapping.clusters[cluster_id].procs ):
     104                for p in xrange( nprocs ):
     105                    trdid = (((x * y_size) + y) * nprocs) + p
     106
     107                    mapping.addTask( vspace, 'conv_%d_%d_%d' % (x,y,p),
     108                                     trdid, x, y, p,
     109                                     'conv_stack_%d_%d_%d' % (x,y,p),
     110                                     'conv_heap_%d_%d' % (x,y), 0 )
    91111
    92112    # extend mapping name
     
    95115    return vspace  # useful for test
    96116           
    97 ################################ test ######################################################
     117################################ test ################################################
    98118
    99119if __name__ == '__main__':
  • soft/giet_vm/applications/convol/main.c

    r488 r502  
    1 ////////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for convol application)
    33// Date   : june 2014
    44// author : Alain Greiner
    5 ////////////////////////////////////////////////////////////////////////////////////////////
     5///////////////////////////////////////////////////////////////////////////////////////
    66// This multi-threaded application implements a 2D convolution product.
    77// The convolution kernel is [201]*[35] pixels, but it can be factored in two
    88// independent line and column convolution products.
    99// It can run on a multi-processors, multi-clusters architecture, with one thread
    10 // per processor. It uses the he following hardware parameters, that must be defined
    11 // in the hard_config.h file:
    12 // - X_SIZE       : number of clusters in a row
    13 // - Y_SIZE       : number of clusters in a column
    14 // - NB_PROCS_MAX : number of processors per cluster
    15 // - FBUF_X_SIZE  : number of pixels per line in frame buffer
    16 // - FBUF_Y_SIZE  : number of lines  in frame buffer
     10// per processor.
    1711//
    1812// The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
    1913//
    20 // - The number of clusters containing processors must be a power of 2.
    21 // - The number of processors per cluster must be a power of 2.
    22 ////////////////////////////////////////////////////////////////////////////////////////////
    23 
    24 #include "hard_config.h"
     14// - number of clusters containing processors must be power of 2 no larger than 256.
     15// - number of processors per cluster must be power of 2 no larger than 8.
     16///////////////////////////////////////////////////////////////////////////////////////
     17
    2518#include "stdio.h"
    2619#include "stdlib.h"
    27 #include "barrier.h"
     20#include "user_barrier.h"
    2821#include "malloc.h"
    2922
    30 #define USE_SBT_BARRIER            1
     23#define USE_SQT_BARRIER            1
    3124#define VERBOSE                    0
    3225#define SUPER_VERBOSE              0
    3326
     27#define X_SIZE_MAX                 16
     28#define Y_SIZE_MAX                 16
     29#define PROCS_MAX                  8
     30#define CLUSTERS_MAX               (X_SIZE_MAX * Y_SIZE_MAX)
     31
    3432#define INITIAL_DISPLAY_ENABLE     0
    3533#define FINAL_DISPLAY_ENABLE       1
    3634
    37 #define NB_CLUSTERS                (X_SIZE * Y_SIZE)
    3835#define PIXEL_SIZE                 2
    3936#define NL                         1024
     
    5350// global instrumentation counters [cluster_id][lpid]
    5451
    55 unsigned int START[NB_CLUSTERS][NB_PROCS_MAX];
    56 unsigned int H_BEG[NB_CLUSTERS][NB_PROCS_MAX];
    57 unsigned int H_END[NB_CLUSTERS][NB_PROCS_MAX];
    58 unsigned int V_BEG[NB_CLUSTERS][NB_PROCS_MAX];
    59 unsigned int V_END[NB_CLUSTERS][NB_PROCS_MAX];
    60 unsigned int D_BEG[NB_CLUSTERS][NB_PROCS_MAX];
    61 unsigned int D_END[NB_CLUSTERS][NB_PROCS_MAX];
     52unsigned int START[CLUSTERS_MAX][PROCS_MAX];
     53unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX];
     54unsigned int H_END[CLUSTERS_MAX][PROCS_MAX];
     55unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX];
     56unsigned int V_END[CLUSTERS_MAX][PROCS_MAX];
     57unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX];
     58unsigned int D_END[CLUSTERS_MAX][PROCS_MAX];
    6259
    6360// global synchronization barrier
    6461
    65 #if USE_SBT_BARRIER
    66 giet_sbt_barrier_t  barrier;
     62#if USE_SQT_BARRIER
     63giet_sqt_barrier_t  barrier;
    6764#else
    6865giet_barrier_t      barrier;
     
    7471
    7572// global pointers on distributed buffers in all clusters
    76 unsigned short * GA[NB_CLUSTERS];
    77 int *            GB[NB_CLUSTERS];
    78 int *            GC[NB_CLUSTERS];
    79 int *            GD[NB_CLUSTERS];
    80 unsigned char *  GZ[NB_CLUSTERS];
     73unsigned short * GA[CLUSTERS_MAX];
     74int *            GB[CLUSTERS_MAX];
     75int *            GC[CLUSTERS_MAX];
     76int *            GD[CLUSTERS_MAX];
     77unsigned char *  GZ[CLUSTERS_MAX];
    8178
    8279///////////////////////////////////////////
     
    109106    int z; // vertical filter index for loops
    110107
     108    // platform parameters
     109    unsigned int x_size;             // number of clusters in a row
     110    unsigned int y_size;             // number of clusters in a column
     111    unsigned int nprocs;             // number of processors per cluster
     112   
     113    giet_procs_number( &x_size , &y_size , &nprocs );
     114
    111115    // processor identifiers
    112     unsigned int x;                                           // x coordinate
    113     unsigned int y;                                           // y coordinate
    114     unsigned int lpid;                                        // local proc/task id
     116    unsigned int x;                                         // x coordinate
     117    unsigned int y;                                         // y coordinate
     118    unsigned int lpid;                                      // local proc/task id
    115119    giet_proc_xyp( &x, &y, &lpid );
    116120
    117     int          file        = 0;                             // file descriptor
    118     unsigned int nprocs      = NB_PROCS_MAX;                  // procs per cluster
    119     unsigned int nclusters   = NB_CLUSTERS;                   // number of clusters
    120     unsigned int cluster_id  = (x * Y_SIZE) + y;              // continuous cluster index
    121     unsigned int task_id     = (cluster_id * nprocs) + lpid;  // continuous task index
    122     unsigned int ntasks      = nclusters * nprocs;            // number of tasks
    123     unsigned int frame_size  = FRAME_SIZE;                    // total size (bytes)
    124     unsigned int nblocks     = frame_size / 512;              // number of blocks per frame
    125 
    126     unsigned int lines_per_task     = NL / ntasks;            // lines per task
    127     unsigned int lines_per_cluster  = NL / nclusters;         // lines per cluster
    128     unsigned int pixels_per_task    = NP / ntasks;            // columns per task
    129     unsigned int pixels_per_cluster = NP / nclusters;         // columns per cluster
     121    int          file       = 0;                            // file descriptor
     122    unsigned int nclusters  = x_size * y_size;              // number of clusters
     123    unsigned int cluster_id = (x * y_size) + y;             // continuous cluster index
     124    unsigned int task_id    = (cluster_id * nprocs) + lpid; // continuous task index
     125    unsigned int ntasks     = nclusters * nprocs;           // number of tasks
     126    unsigned int frame_size = FRAME_SIZE;                   // total size (bytes)
     127    unsigned int nblocks    = frame_size / 512;             // number of blocks/frame
     128
     129    unsigned int lines_per_task     = NL / ntasks;          // lines per task
     130    unsigned int lines_per_cluster  = NL / nclusters;       // lines per cluster
     131    unsigned int pixels_per_task    = NP / ntasks;          // columns per task
     132    unsigned int pixels_per_cluster = NP / nclusters;       // columns per cluster
    130133
    131134    int first, last;
     
    140143     // parameters checking
    141144   
    142     if ( (NP != FBUF_X_SIZE) || (NL != FBUF_Y_SIZE) )
    143     {
    144         giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
    145     }
    146     if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
    147         giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2 or 4\n");
    148 
    149     if ((X_SIZE!=1) && (X_SIZE!=2) && (X_SIZE!=4) && (X_SIZE!=8) && (X_SIZE!=16))
    150         giet_exit( "[CONVOL ERROR] X_SIZE must be 1, 2, 4, 8, 16\n");
     145    if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4) && (nprocs != 8))
     146        giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2, 4 or 8\n");
     147
     148    if ((x_size!=1) && (x_size!=2) && (x_size!=4) && (x_size!=8) && (x_size!=16))
     149        giet_exit( "[CONVOL ERROR] x_size must be 1, 2, 4, 8, 16\n");
    151150       
    152     if ((Y_SIZE!=1) && (Y_SIZE!=2) && (Y_SIZE!=4) && (Y_SIZE!=8) && (Y_SIZE!=16))
    153         giet_exit( "[CONVOL ERROR] Y_SIZE must be 1, 2, 4, 8, 16\n");
     151    if ((y_size!=1) && (y_size!=2) && (y_size!=4) && (y_size!=8) && (y_size!=16))
     152        giet_exit( "[CONVOL ERROR] y_size must be 1, 2, 4, 8, 16\n");
    154153
    155154    if ( NL % nclusters != 0 )
    156         giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NL");
     155        giet_exit( "[CONVOL ERROR] CLUSTERS_MAX must be a divider of NL");
    157156
    158157    if ( NP % nclusters != 0 )
    159         giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NP");
     158        giet_exit( "[CONVOL ERROR] CLUSTERS_MAX must be a divider of NP");
    160159
    161160   
     
    166165    if ( (x==0) && (y==0) && (lpid==0) )
    167166    {
    168         // parameters checking
    169         if ( (NP != FBUF_X_SIZE) || (NL != FBUF_Y_SIZE) )
    170             giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
    171        
    172         if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
    173             giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2 or 4\n");
    174 
    175         if ((X_SIZE!=1) && (X_SIZE!=2) && (X_SIZE!=4) && (X_SIZE!=8) && (X_SIZE!=16))
    176             giet_exit( "[CONVOL ERROR] X_SIZE must be 1, 2, 4, 8, 16\n");
    177        
    178         if ((Y_SIZE!=1) && (Y_SIZE!=2) && (Y_SIZE!=4) && (Y_SIZE!=8) && (Y_SIZE!=16))
    179             giet_exit( "[CONVOL ERROR] Y_SIZE must be 1, 2, 4, 8, 16\n");
    180 
    181         if ( NL % nclusters != 0 )
    182             giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NL");
    183 
    184         if ( NP % nclusters != 0 )
    185             giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NP");
    186 
    187    
    188167        giet_shr_printf("\n[CONVOL] task[0,0,0] starts barrier init at cycle %d\n"
    189                         "- NB_CLUSTERS     = %d\n"
    190                         "- NB_PROCS_MAX    = %d\n"
    191                         "- NB_TASKS        = %d\n"
    192                         "- NB_BLOCKS       = %x\n",
     168                        "- CLUSTERS  = %d\n"
     169                        "- PROCS     = %d\n"
     170                        "- TASKS     = %d\n"
     171                        "- BLOCKS    = %d\n",
    193172                        giet_proctime(), nclusters, nprocs, ntasks, nblocks );
    194 #if USE_SBT_BARRIER
    195         sbt_barrier_init( &barrier, nclusters , nprocs );
     173#if USE_SQT_BARRIER
     174        sqt_barrier_init( &barrier, x_size , y_size , nprocs );
    196175#else
    197176        barrier_init( &barrier, ntasks );
     
    216195
    217196#if VERBOSE
    218 giet_shr_printf( "\n[CONVOL] task[%d,%d,%d] enters malloc at cycle %d\n", x,y,lpid, date );
     197giet_shr_printf( "\n[CONVOL] task[%d,%d,%d] enters malloc at cycle %d\n",
     198                 x,y,lpid, date );
    219199#endif
    220200
     
    242222
    243223    ///////////////////////////////
    244     #if USE_SBT_BARRIER
    245     sbt_barrier_wait( &barrier );
     224    #if USE_SQT_BARRIER
     225    sqt_barrier_wait( &barrier );
    246226    #else
    247227    barrier_wait( &barrier );
     
    253233    ///////////////////////////////////////////////////////////////////
    254234
    255     unsigned short * A[NB_CLUSTERS];
    256     int *            B[NB_CLUSTERS];
    257     int *            C[NB_CLUSTERS];
    258     int *            D[NB_CLUSTERS];
    259     unsigned char *  Z[NB_CLUSTERS];
     235    unsigned short * A[CLUSTERS_MAX];
     236    int            * B[CLUSTERS_MAX];
     237    int            * C[CLUSTERS_MAX];
     238    int            * D[CLUSTERS_MAX];
     239    unsigned char  * Z[CLUSTERS_MAX];
    260240
    261241    for (c = 0; c < nclusters; c++)
     
    283263                         " at cycle %d\n", giet_proctime() );
    284264
    285         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
     265        for ( c = 0 ; c < nclusters ; c++ )
    286266        {
    287267            giet_shr_printf( "\n[CONVOL] task[0,0,0] starts load "
     
    341321
    342322        ////////////////////////////
    343         #if USE_SBT_BARRIER
    344         sbt_barrier_wait( &barrier );
     323        #if USE_SQT_BARRIER
     324        sqt_barrier_wait( &barrier );
    345325        #else
    346326        barrier_wait( &barrier );
     
    447427
    448428    /////////////////////////////
    449     #if USE_SBT_BARRIER
    450     sbt_barrier_wait( &barrier );
     429    #if USE_SQT_BARRIER
     430    sqt_barrier_wait( &barrier );
    451431    #else
    452432    barrier_wait( &barrier );
     
    567547
    568548    ////////////////////////////
    569     #if USE_SBT_BARRIER
    570     sbt_barrier_wait( &barrier );
     549    #if USE_SQT_BARRIER
     550    sqt_barrier_wait( &barrier );
    571551    #else
    572552    barrier_wait( &barrier );
     
    626606     
    627607    //////////////////////////////
    628     #if USE_SBT_BARRIER
    629     sbt_barrier_wait( &barrier );
     608    #if USE_SQT_BARRIER
     609    sqt_barrier_wait( &barrier );
    630610    #else
    631611    barrier_wait( &barrier );
  • soft/giet_vm/applications/gameoflife/gameoflife.ld

    r251 r502  
    33*****************************************************************************/
    44
    5 seg_data_base      = 0x00800000;
    6 seg_code_base      = 0x00400000;
     5seg_data_base      = 0x20000000;
     6seg_code_base      = 0x10000000;
    77
    88/***************************************************************************
     
    2222        *(.ctors)
    2323        *(.rodata)
    24         /* . = ALIGN(4); */
    2524        *(.rodata.*)
    26         /* . = ALIGN(4); */
    2725        *(.data)
    28         /* . = ALIGN(4); */
    2926        *(.lit8)
    3027        *(.lit4)
    3128        *(.sdata)
    32         /* . = ALIGN(4); */
    3329        *(.bss)
    3430        *(COMMON)
  • soft/giet_vm/applications/gameoflife/main.c

    r444 r502  
    1 /*
    2  * This application is an emulation of the game of life automaton
    3  * It must be deployed from processor 0 and use contiguous processor
    4  * (example 0,1,2,3)
    5  */
    6 
     1//////////////////////////////////////////////////////////////////////////////////
     2// File : main.c  (for gameoflife)
     3// Date : November 2013
     4// Author :  Alexandre Joannou <alexandre.joannou@lip6.fr>
     5//
     6// This application is an emulation of the game of life automaton.
     7// The world size is defined by the HEIGHT and WIDTH parameters.
     8// There is one task per processor, and each task computes HEIGHT/nbprocs lines.
     9// The number of processors must be a power of 2 not larger than HEIGHT.
     10//////////////////////////////////////////////////////////////////////////////////
    711
    812#include "stdio.h"
    913#include "limits.h"
    10 #include "barrier.h"
    11 #include "hard_config.h"
     14#include "user_barrier.h"
    1215#include "mapping_info.h"
    1316
    1417#define WIDTH           128
    1518#define HEIGHT          128
    16 #define NB_CLUSTER_MAX  256
    1719#define NB_ITERATION    1000000000
    1820
    19 #define PRINTF(...) ({ if ( proc_id==0) { giet_tty_printf(__VA_ARGS__); } })
    20 
    21 giet_barrier_t barriers[2];
    22 
    23 unsigned int init_ok = 1;
    24 
    25 #define NEW 0
    26 #define OLD 1
     21#define PRINTF(...) ({ if ( proc_id==0) { giet_shr_printf(__VA_ARGS__); } })
     22
     23giet_sqt_barrier_t barrier;
     24
     25unsigned int init_ok = 0;
     26
     27#define OLD 0
     28#define NEW 1
     29#define DSP 2
    2730
    2831typedef unsigned char uint8_t;
    2932typedef unsigned int size_t;
    3033
    31 uint8_t world[2][HEIGHT][WIDTH];
    32 uint8_t world_yuv[HEIGHT][WIDTH];
    33 
    34 /* Generate binary values for world between base_line and base_line + nb_line */
     34uint8_t world[3][HEIGHT][WIDTH];
     35
     36/////////////////////////////////////////////////
    3537void init_world(size_t base_line, size_t nb_line)
    3638{
    3739   size_t x,y;
    38    for (y = base_line ; y < base_line + nb_line; y++){
    39       for(x = 0; x < WIDTH ; x++) {
    40          // TODO OPTIMIZE RANDOM INIT
     40   for (y = base_line ; y < base_line + nb_line; y++)
     41   {
     42      for(x = 0; x < WIDTH ; x++)
     43      {
    4144         world[OLD][y][x] = giet_rand() % 2; 
    4245      }
     
    4447}
    4548
     49/////////////////////////////////////////////////
    4650uint8_t number_of_alive_neigh(size_t x, size_t y)
    4751{
     
    6064}
    6165
    62 /* Compute cell x,y */
     66/////////////////////////////////////////////////
    6367uint8_t compute_cell(size_t x, size_t y)
    6468{
    6569   uint8_t nb_neighbours_alive = number_of_alive_neigh(x,y);
    66    if (world[OLD][y][x] == 1) {
    67       if (nb_neighbours_alive == 2 ||
    68           nb_neighbours_alive == 3)
    69       {
    70          return 1;   
    71       }
    72    }
    73    else {
    74       if (nb_neighbours_alive == 3) {
    75          return 1;
    76       }
    77       else {
    78          return world[OLD][y][x];
    79       }
     70   if (world[OLD][y][x] == 1)
     71   {
     72      if (nb_neighbours_alive == 2 || nb_neighbours_alive == 3)  return 1;
     73   }
     74   else
     75   {
     76      if (nb_neighbours_alive == 3) return 1;
     77      else                          return world[OLD][y][x];
    8078   }
    8179   return 0;
     
    9997{
    10098   size_t x,y;
    101    for (y = base_line; y < base_line + nb_line; y++){
    102       for(x = 0; x < WIDTH ; x++) {
    103          //world_yuv[y][x] = world[NEW][y][x]*100; 
    104          world[NEW][y][x] = world[NEW][y][x]*255; 
     99   for (y = base_line; y < base_line + nb_line; y++)
     100   {
     101      for(x = 0; x < WIDTH ; x++)
     102      {
     103         world[DSP][y][x] = world[OLD][y][x]*255; 
    105104      }
    106105   }
    107106
    108107   giet_fbf_sync_write( base_line * WIDTH ,
    109                        &world[NEW][base_line][0],
    110                        nb_line * WIDTH);
     108                        &world[DSP][base_line][0],
     109                        nb_line * WIDTH );
    111110}
    112111
     
    133132   giet_proc_xyp( &x, &y, &p );
    134133
     134   // get processors number
     135   unsigned int x_size;
     136   unsigned int y_size;
     137   unsigned int n_local_procs;
     138   giet_procs_number( &x_size, &y_size, &n_local_procs );
     139
    135140   // compute continuous processor index
    136    unsigned int proc_id = (((x * Y_SIZE) + y) * NB_PROCS_MAX) + p; 
    137 
    138    unsigned int nlocal_procs  = NB_PROCS_MAX;               // processors per cluster
    139    unsigned int nclusters     = X_SIZE*Y_SIZE;              // number of clusters
    140    unsigned int nglobal_procs = nclusters * nlocal_procs;   // number of processors
     141   unsigned int proc_id = (((x * y_size) + y) * n_local_procs) + p; 
     142
     143   unsigned int n_clusters     = x_size * y_size;            // number of clusters
     144   unsigned int n_global_procs = n_clusters * n_local_procs; // number of processors
    141145   size_t i;
    142146
    143    size_t       nb_line       = HEIGHT / nglobal_procs;
     147   if ( n_global_procs > HEIGHT )
     148   {
     149       PRINTF("[GAMEOFLIFE ERROR] Number or processors too large :"
     150              " nb_procs = %d / image heigth = %d\n", n_global_procs, HEIGHT );
     151       giet_exit("error");
     152   }
     153
     154   size_t       nb_line       = HEIGHT / n_global_procs;
    144155   size_t       base_line     = nb_line * proc_id;
    145156   
    146    PRINTF("*** Starting init at cycle %d ***\n", giet_proctime());
    147 
    148    //  barriers initialization
     157   PRINTF("\n*** Starting barrier initialisation at cycle %d ***\n"
     158          " nprocs = %d / nlines = %d\n",
     159          giet_proctime() , n_global_procs, HEIGHT );
     160
     161   // barrier initialization
    149162   if ( proc_id == 0 )
    150163   {
    151       barrier_init(&barriers[0], nglobal_procs);
    152       barrier_init(&barriers[1], nglobal_procs);
    153 
    154       init_ok = 0;
     164      sqt_barrier_init( &barrier , x_size , y_size , n_local_procs );
     165      init_ok = 1;
    155166   }
    156167   else
    157168   {
    158       while ( init_ok == 1 );
    159    }
    160 
    161    init_world(base_line, nb_line);
    162 
    163    PRINTF("*** Completing init at cycle %d ***\n", giet_proctime());
    164    barrier_wait(&barriers[0]);
     169      while ( init_ok == 0 ) asm volatile("nop");
     170   }
     171
     172   PRINTF("\n*** Starting world initialisation at cycle %d ***\n",
     173          giet_proctime() );
     174
     175   //  parallel world  initialization
     176   init_world( base_line , nb_line );
     177
     178PRINTF("coucou 0\n");
     179
     180   display_world( base_line , nb_line );
     181
     182PRINTF("coucou 1\n");
     183
     184   sqt_barrier_wait( &barrier );
     185
     186   PRINTF("\n*** Starting life at cycle %d ***\n",
     187          giet_proctime() );
    165188   
    166189   for (i = 0; i < NB_ITERATION; i++)
    167190   {
    168       compute_new_gen(base_line, nb_line);
    169       grow_old_world(base_line, nb_line);
    170       display_world(base_line, nb_line);
    171       barrier_wait(&barriers[1]);
    172       barrier_init(&barriers[1], nglobal_procs);
    173    }
    174 
    175    PRINTF("*** End of main at cycle %d ***\n", giet_proctime());
     191      compute_new_gen( base_line, nb_line );
     192      grow_old_world( base_line, nb_line );
     193      display_world( base_line, nb_line );
     194
     195      sqt_barrier_wait( &barrier );
     196
     197      PRINTF(" - iteration %d completed\n", i );
     198   }
     199
     200   PRINTF("\n*** End of main at cycle %d ***\n", giet_proctime());
    176201
    177202   giet_exit("Completed");
  • soft/giet_vm/applications/sort/main.c

    r432 r502  
    99//      barrier routines to apply a sort algorithm in several stages.
    1010//
    11 //      Considerations :
    12 //
    13 //          - It supports up to 256 processors and the number of processors
    14 //            must be a power of 2.
    15 //
    16 //          - If there is only one TTY available, this application uses a spin
    17 //            lock to avoid several threads writting at the same time.
    18 //
    19 //          - This application must be executed on a cache coherent
    20 //            architecture. Otherwise some modifications must be applied
    21 //
    22 //          - The processors executing this application must have a contiguous
    23 //            processor id and the first processor must have id 0.
     11//      Constraints :
     12//
     13//      - It supports up to 1024 processors and the number of processors
     14//        must be a power of 2.
     15//
     16//      - The array of values to be sorted (ARRAY_LENGTH) must be a power of 2
     17//        larger than the number of processors.
     18//
     19//      - This application must be executed on a cache coherent architecture.
    2420//
    2521///////////////////////////////////////////////////////////////////////////////
     
    2824#include "mapping_info.h"
    2925#include "hard_config.h"
    30 #include "barrier.h"
    31 
    32 #define ARRAY_LENGTH    512
    33 #define IPT             (ARRAY_LENGTH / *nb_thread) // ITEMS PER THREAD
     26#include "user_barrier.h"
     27
     28#define ARRAY_LENGTH    4096
     29#define IPT             (ARRAY_LENGTH / threads) // ITEMS PER THREAD
    3430
    3531////////////////////////////////////////////////////////////////////////////////
    36 // Processors other than 0 display algorithm state
    37 // The processor 0 always displays some information so this does not affect him
     32// Processors other than 0 display algorithm state if VERBOSE non zero
    3833
    3934#define VERBOSE         1
    4035
    4136////////////////////////////////////////////////////////////////////////////////
    42 // Define printf according to verbosity option and number of available
    43 // TTY
     37// Define printf according to verbosity option and number of available TTY
    4438
    4539#if (VERBOSE == 1)
     
    5044
    5145#define task0_printf(...) if(thread_id == 0) giet_shr_printf(__VA_ARGS__)
    52 
    53 #define exit    giet_exit
    54 #define procid  giet_procid
    55 #define rand    giet_rand
    5646
    5747int array0[ARRAY_LENGTH];
     
    7363        int init_pos_result);
    7464
    75 ///////////////////////////////////////////////////
    76 // This application support at most 256 processors
    77 // Number of barriers = log2(nb_thread)
    78 
    79 giet_barrier_t barrier[8];
     65///////////////////////////////////////////////////////
     66// This application supports at most 1024 processors
     67// Number of barriers = log2(threads)
     68
     69giet_barrier_t barrier[10];
    8070
    8171//////////////////////////////////////////
     
    8373{
    8474    int thread_id = giet_thread_id();
    85     unsigned int* nb_thread;
    8675    int * src_array = NULL;
    8776    int * dst_array = NULL;
     
    9180    unsigned int time_end;   
    9281
    93     giet_vobj_get_vbase( "sort" ,
    94                          "sort_args",
    95                          (unsigned int*)&nb_thread );
    96    
    97     task0_printf("\n[ Thread 0 ] Starting sort application with %u threads "
    98                  "at cycle %u\n", *nb_thread, time_start);
     82    // compute number of threads (one thread per proc)
     83    unsigned int x_size;
     84    unsigned int y_size;
     85    unsigned int nprocs;
     86    unsigned int threads;
     87    giet_procs_number( &x_size , &y_size , &nprocs );
     88    threads = x_size * y_size * nprocs;
     89
     90    if ( (threads != 1)   && (threads != 2)   && (threads != 4)   &&
     91         (threads != 8)   && (threads != 16 ) && (threads != 32)  &&
     92         (threads != 64)  && (threads != 128) && (threads != 256) &&
     93         (threads != 512) && (threads != 1024) )
     94    {
     95        task0_printf("[SORT ERROR] Number of processors must be power of 2\n"
     96                     "  x_size = %d / y_size = %d / nprocs = %d\n",
     97                     x_size , y_size , nprocs );
     98        giet_exit("error");
     99    }
     100
     101    task0_printf("\n[ Thread 0 ] Starting sort application with %d threads "
     102                 "at cycle %d\n", threads, time_start);
    99103
    100104    ///////////////////////////
     
    103107    if (thread_id == 0)
    104108    {
    105         for (i = 0; i < __builtin_ctz(*nb_thread); i++)
    106         {
    107             barrier_init(&barrier[i], *nb_thread >> i);
     109        for (i = 0; i < __builtin_ctz( threads ); i++)
     110        {
     111            barrier_init(&barrier[i], threads >> i);
    108112        }
    109113
     
    120124    for (i = IPT * thread_id; i < IPT * (thread_id + 1); i++)
    121125    {
    122         array0[i] = rand();
     126        array0[i] = giet_rand();
    123127    }
    124128
     
    132136    printf("[ Thread %d ] Finishing Stage 0\n\r", thread_id);
    133137
    134     for (i = 0; i < __builtin_ctz(*nb_thread); i++)
     138    for (i = 0; i < __builtin_ctz( threads ); i++)
    135139    {
    136140        barrier_wait(&barrier[i]);
     
    139143        {
    140144            printf("[ Thread %d ] Quit\n\r", thread_id );
    141             exit("Completed");
     145            giet_exit("Completed");
    142146        }
    143147
     
    173177    if(thread_id != 0)
    174178    {
    175         exit("error: only thread 0 should get here");
     179        giet_exit("error: only thread 0 should get here");
    176180    }
    177181
     
    196200    if (success)
    197201    {
    198         exit("!!! Success !!!");
     202        giet_exit("!!! Success !!!");
    199203    }
    200204    else
     
    206210            printf("array[%d] = %d\n", i, dst_array[i]);
    207211        }
    208         exit("!!!  Failure !!!");
    209     }
    210 
    211     exit("Completed");
     212        giet_exit("!!!  Failure !!!");
     213    }
     214
     215    giet_exit("Completed");
    212216}
    213217
  • soft/giet_vm/applications/sort/sort.py

    r434 r502  
    3333    # define vsegs base & size
    3434    code_base  = 0x10000000
    35     code_size  = 0x00200000     # 2 Mbytes (replicated in each cluster)
     35    code_size  = 0x00010000     # 64 Kbytes (replicated in each cluster)
    3636
    3737    data_base  = 0x20000000
    38     data_size  = 0x00100000     # 1 Mbyte (non replicated)
    39 
    40     args_base  = 0x20100000
    41     args_size  = 0x00000004     # 4 bytes (non replicated)
     38    data_size  = 0x00010000     # 64 Kbytes (non replicated)
    4239
    4340    stack_base = 0x40000000
     
    5451                     'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM',
    5552                     binpath = 'build/sort/sort.elf',
    56                      local = False, big = True )
    57 
    58     # args vseg : non local (only in cluster[0,0])
    59     mapping.addVseg( vspace, 'sort_args', args_base , args_size,
    60                      'C_WU', vtype = 'CONST', x = 0, y = 0, pseg = 'RAM',
    61                      init = ntasks,
    62                      local = False, big = True )
     53                     local = False )
    6354
    6455    # code vsegs : local (one copy per cluster)
    6556    for x in xrange (x_size):
    6657        for y in xrange (y_size):
    67             mapping.addVseg( vspace, 'sort_code', code_base , code_size,
    68                              'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
    69                              binpath = 'build/sort/sort.elf',
    70                              local = True, big = True )
     58            cluster_id = (x * y_size) + y
     59            if ( mapping.clusters[cluster_id].procs ):
     60
     61                mapping.addVseg( vspace, 'sort_code', code_base , code_size,
     62                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
     63                                 binpath = 'build/sort/sort.elf',
     64                                 local = True )
    7165
    7266    # stacks vsegs : local (one stack per task)
    7367    for x in xrange (x_size):
    7468        for y in xrange (y_size):
    75             for p in xrange (nprocs):
    76                 proc_id = (((x * y_size) + y) * nprocs) + p
    77                 size    = stack_size / nprocs
    78                 base    = stack_base + (proc_id * size)
    79                 mapping.addVseg( vspace, 'sort_stack_%d_%d_%d' % (x,y,p), base, size,
     69            cluster_id = (x * y_size) + y
     70            if ( mapping.clusters[cluster_id].procs ):
     71                for p in xrange (nprocs):
     72                    proc_id = (((x * y_size) + y) * nprocs) + p
     73                    size    = stack_size / nprocs
     74                    base    = stack_base + (proc_id * size)
     75
     76                    mapping.addVseg( vspace, 'sort_stack_%d_%d_%d' % (x,y,p),
     77                                     base, size, 'C_WU', vtype = 'BUFFER',
     78                                     x = x, y = y, pseg = 'RAM',
     79                                     local = True, big = True )
     80
     81    # heap vsegs : distributed but non local (all tasks can access all heap vsegs)
     82    for x in xrange (x_size):
     83        for y in xrange (y_size):
     84            cluster_id = (x * y_size) + y
     85            if ( mapping.clusters[cluster_id].procs ):
     86                size       = heap_size
     87                base       = heap_base + (cluster_id * size)
     88
     89                mapping.addVseg( vspace, 'sort_heap_%d_%d' % (x,y), base, size,
    8090                                 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
    81                                  local = True, big = True )
    82 
    83             # heap vsegs : distributed but non local (all tasks can access all heap vsegs)
    84             cluster_id = (x * y_size) + y
    85             size       = heap_size
    86             base       = heap_base + (cluster_id * size)
    87             mapping.addVseg( vspace, 'sort_heap_%d_%d' % (x,y), base, size,
    88                              'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
    89                              local = False, big = True )
     91                                 local = False, big = True )
    9092
    9193    # distributed tasks / one task per processor
    9294    for x in xrange (x_size):
    9395        for y in xrange (y_size):
    94             for p in xrange( nprocs ):
    95                 trdid = (((x * y_size) + y) * nprocs) + p
    96                 mapping.addTask( vspace, 'sort_%d_%d_%d' % (x,y,p), trdid, x, y, p,
    97                                  'sort_stack_%d_%d_%d' % (x,y,p),
    98                                  'sort_heap_%d_%d' % (x,y), 0 )
     96            cluster_id = (x * y_size) + y
     97            if ( mapping.clusters[cluster_id].procs ):
     98                for p in xrange( nprocs ):
     99                    trdid = (((x * y_size) + y) * nprocs) + p
     100
     101                    mapping.addTask( vspace, 'sort_%d_%d_%d' % (x,y,p),
     102                                     trdid, x, y, p,
     103                                     'sort_stack_%d_%d_%d' % (x,y,p),
     104                                     'sort_heap_%d_%d' % (x,y), 0 )
    99105
    100106    # extend mapping name
  • soft/giet_vm/applications/transpose/main.c

    r444 r502  
    1 /////////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for transpose application)
    33// Date   : february 2014
    44// author : Alain Greiner
    5 /////////////////////////////////////////////////////////////////////////////////////////////
    6 // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images.
     5///////////////////////////////////////////////////////////////////////////////////////
     6// This multi-threaded application makes a transpose for a NN*NN pixels
     7// sequence of images.
    78// It can run on a multi-processors, multi-clusters architecture, with one thread
    8 // per processor. It uses the he following hardware parameters, that must be defined
    9 // in the hard_config.h file:
    10 // - X_SIZE       : number of clusters in a row
    11 // - Y_SIZE       : number of clusters in a column
    12 // - NB_PROCS_MAX : number of processors per cluster
    13 // - FBUF_X_SIZE  : number of pixels per line in frame buffer
    14 // - FBUF_Y_SIZE  : number of lines  in frame buffer
    15 //
     9// per processor.
     10//
    1611// The image sequence is read from a file (one byte per pixel).
    1712// The input and output buffers containing the image are distributed in all clusters.
    1813//
    19 // - The image size NN must be a power of 2 and must fit the frame buffer size.
    20 // - The number of clusters containing processors must be a power of 2.
    21 // - The number of processors per cluster must be a power of 2.
    22 // - The image size NN must be larger or equal to the total number of processor.
     14// - The image size NN must fit the frame buffer size: 128 bytes
     15// - The block size in block device must be 512 bytes.
     16// - The number of clusters  must be a power of 2 no larger than 32
     17// - The number of processors per cluster must be a power of 2 no larger than 4
    2318//
    2419// For each image the application makes a self test (checksum for each line).
    2520// The actual display on the frame buffer depends on frame buffer availability.
    26 /////////////////////////////////////////////////////////////////////////////////////////////
    27 
    28 #include "hard_config.h"
     21///////////////////////////////////////////////////////////////////////////////////////
     22
    2923#include "stdio.h"
    30 #include "barrier.h"
     24#include "user_barrier.h"
    3125#include "malloc.h"
    3226
     27#define BLOCK_SIZE          512                 // block size on disk
     28#define CLUSTERS_MAX        32                  // max number of clusters
     29#define PROCS_MAX           4                   // max number of processors per cluster
    3330#define NN                  128                 // image size : nlines = npixels = 128
    3431#define NB_IMAGES           5                   // number of images to be handled
    3532#define FILE_PATHNAME       "misc/images.raw"   // file pathname on disk
    36 #define NB_CLUSTERS         (X_SIZE * Y_SIZE)   // number of clusters
    3733#define INSTRUMENTATION_OK  0                   // display statistics on TTY when non zero
    3834
     
    4137///////////////////////////////////////////////////////
    4238
    43 // instrumentation counters
    44 // for each processor (up to 4 processors)
    45 // in each cluster (up to 32 clusters)
    46 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX];
    47 unsigned int LOAD_END  [NB_CLUSTERS][NB_PROCS_MAX];
    48 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX];
    49 unsigned int TRSP_END  [NB_CLUSTERS][NB_PROCS_MAX];
    50 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX];
    51 unsigned int DISP_END  [NB_CLUSTERS][NB_PROCS_MAX];
     39// instrumentation counters for each processor in each cluster
     40unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX];
     41unsigned int LOAD_END  [CLUSTERS_MAX][PROCS_MAX];
     42unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX];
     43unsigned int TRSP_END  [CLUSTERS_MAX][PROCS_MAX];
     44unsigned int DISP_START[CLUSTERS_MAX][PROCS_MAX];
     45unsigned int DISP_END  [CLUSTERS_MAX][PROCS_MAX];
    5246
    5347// arrays of pointers on distributed buffers
    5448// one input buffer & one output buffer per cluster
    55 unsigned char*  buf_in [NB_CLUSTERS];
    56 unsigned char*  buf_out[NB_CLUSTERS];
     49unsigned char*  buf_in [CLUSTERS_MAX];
     50unsigned char*  buf_out[CLUSTERS_MAX];
    5751
    5852// checksum variables
     
    6155
    6256// global synchronisation barrier
    63 giet_barrier_t barrier;
     57giet_sqt_barrier_t barrier;
    6458
    6559volatile unsigned int init_ok = 0;
     
    7064{
    7165
    72     int          file = 0;                                         // file descriptor
    73     unsigned int l;                                                // line index for loops
    74     unsigned int p;                                                // pixel index for loops
    75     unsigned int c;                                                // cluster index for loops
    76 
    77     // get processor identifiers
    78     unsigned int x;                                                // x cluster coordinate
    79     unsigned int y;                                                // y cluster coordinate
    80     unsigned int lpid;                                             // local processor index
     66    unsigned int l;                  // line index for loops
     67    unsigned int p;                  // pixel index for loops
     68    unsigned int c;                  // cluster index for loops
     69
     70    // processor identifiers
     71    unsigned int x;                  // x cluster coordinate
     72    unsigned int y;                  // y cluster coordinate
     73    unsigned int lpid;               // local processor index
     74
     75    // platform parameters
     76    unsigned int x_size;             // number of clusters in a row
     77    unsigned int y_size;             // number of clusters in a column
     78    unsigned int nprocs;             // number of processors per cluster
     79   
    8180    giet_proc_xyp( &x, &y, &lpid);             
    8281
    83     unsigned int npixels    = NN * NN;                             // pixels per image
    84     unsigned int nblocks    = npixels / 512;                       // blocks per image
    85     unsigned int image      = 0;                                   // image counter
    86 
    87     unsigned int cluster_id = (x * Y_SIZE) + y;                    // "continuous" index   
    88     unsigned int ntasks     = NB_CLUSTERS * NB_PROCS_MAX;          // number of tasks
    89     unsigned int task_id    = (cluster_id * NB_PROCS_MAX) + lpid;  // "continuous" task index
     82    giet_procs_number( &x_size , &y_size , &nprocs );
     83
     84    giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n"
     85                    " - x_size = %d\n"
     86                    " - y_size = %d\n"
     87                    " - nprocs = %d\n",
     88                    x, y, lpid, giet_proctime(), x_size , y_size , nprocs );
     89
     90    unsigned int nclusters  = x_size * y_size;               // number of clusters
     91    unsigned int ntasks     = x_size * y_size * nprocs;      // number of tasks
     92    unsigned int npixels    = NN * NN;                       // pixels per image
     93    unsigned int nblocks    = npixels / BLOCK_SIZE;          // blocks per image
     94    unsigned int image      = 0;                             // image counter
     95    int          file       = 0;                             // file descriptor
     96    unsigned int cluster_id = (x * y_size) + y;              // "continuous" index   
     97    unsigned int task_id    = (cluster_id * nprocs) + lpid;  // "continuous" task index
    9098
    9199    // Processor [0,0,0] makes initialisation
    92     // It includes parameters checking, barriers initialization,
     100    // It includes parameters checking, barrier initialization,
    93101    // distributed buffers allocation, and file open
    94102    if ( (x==0) && (y==0) && (lpid==0) )
    95103    {
    96         // Parameters checking
    97         if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) )
    98         {
    99             giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
    100         }
    101         if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4))
     104        if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
    102105        {
    103             giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAX must be 1, 2 or 4");
    104         }
    105         if ((NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS != 4) &&
    106             (NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS != 32) )
     106            giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");
     107        }
     108        if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) &&
     109            (nclusters != 8) && (nclusters != 16) && (nclusters != 32) )
    107110        {
    108111            giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32");
     
    113116        }
    114117
    115         giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"
    116                         " - x_size    = %d\n"
    117                         " - y_size    = %d\n"
    118                         " - nprocs    = %d\n"
    119                         " - nclusters = %d\n"
    120                         " - ntasks    = %d\n",
    121                         giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, NB_CLUSTERS, ntasks );
    122 
    123118        // Barrier initialisation
    124         barrier_init( &barrier, ntasks );
     119        sqt_barrier_init( &barrier, x_size , y_size , nprocs );
    125120
    126121        giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n",
     
    128123
    129124        // Distributed buffers allocation
    130         // The buffers containing one image are distributed in clusters
    131         // (one buf_in and one buf_out per cluster).
    132         // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.
    133         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
    134         {
    135             unsigned int rx = c / Y_SIZE;
    136             unsigned int ry = c % Y_SIZE;
    137 
    138             buf_in[c]  = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
    139             buf_out[c] = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
     125        // The buffers containing one image are distributed in the user
     126        // heap (one buf_in and one buf_out per cluster).
     127        // Each buffer contains (NN*NN / nclusters) bytes.
     128        for ( c = 0 ; c < nclusters ; c++ )
     129        {
     130            unsigned int rx = c / y_size;
     131            unsigned int ry = c % y_size;
     132
     133            buf_in[c]  = remote_malloc( npixels/nclusters, rx, ry );
     134            buf_out[c] = remote_malloc( npixels/nclusters, rx, ry );
    140135
    141136            giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation"
     
    167162    {
    168163        while ( init_ok == 0 );
    169         giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",
    170                         x, y, lpid, giet_proctime() );
    171164    }
    172165   
     
    175168    while (image < NB_IMAGES)
    176169    {
    177         // pseudo parallel load from disk to buf_in buffer : nblocks/NB_CLUSTERS blocks
     170        // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks
    178171        // only task running on processor with (lpid == 0) does it
    179172
     
    184177            giet_fat_read( file,
    185178                           buf_in[cluster_id],
    186                            (nblocks / NB_CLUSTERS),
    187                            ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) );
    188 
     179                           (nblocks / nclusters),
     180                           ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) );
     181
     182            if ( (x==0) && (y==0) )
    189183            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load"
    190184                            "  for image %d at cycle %d\n",
     
    194188        LOAD_END[cluster_id][lpid] = giet_proctime();
    195189
    196         /////////////////////////
    197         barrier_wait( &barrier );
     190        /////////////////////////////
     191        sqt_barrier_wait( &barrier );
    198192
    199193        // parallel transpose from buf_in to buf_out
     
    206200
    207201        unsigned int nlt   = NN / ntasks;      // number of lines per task
    208         unsigned int nlc   = NN / NB_CLUSTERS;   // number of lines per cluster
     202        unsigned int nlc   = NN / nclusters;   // number of lines per cluster
    209203
    210204        unsigned int src_cluster;
     
    242236        if ( lpid == 0 )
    243237        {
     238            if ( (x==0) && (y==0) )
    244239            giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose"
    245240                            " for image %d at cycle %d\n",
     
    249244        TRSP_END[cluster_id][lpid] = giet_proctime();
    250245
    251         /////////////////////////
    252         barrier_wait( &barrier );
     246        /////////////////////////////
     247        sqt_barrier_wait( &barrier );
    253248
    254249        // optional parallel display from local buf_out to frame buffer
     
    265260                                 npt );
    266261
    267             if ( lpid == 0 )
    268             {
    269                 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display"
    270                                 " for image %d at cycle %d\n",
    271                                 x, y, image, giet_proctime() );
    272             }
     262            if ( (x==0) && (y==0) && (lpid==0) )
     263            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display"
     264                            " for image %d at cycle %d\n",
     265                            x, y, lpid, image, giet_proctime() );
    273266
    274267            DISP_END[cluster_id][lpid] = giet_proctime();
    275268
    276             /////////////////////////
    277             barrier_wait( &barrier );
     269            /////////////////////////////
     270            sqt_barrier_wait( &barrier );
    278271        }
    279272
     
    318311        }
    319312
    320         /////////////////////////
    321         barrier_wait( &barrier );
     313        /////////////////////////////
     314        sqt_barrier_wait( &barrier );
    322315
    323316        // instrumentation done by processor [0,0,0]
     
    338331            unsigned int max_disp_ended = 0;
    339332
    340             for (cc = 0; cc < NB_CLUSTERS; cc++)
     333            for (cc = 0; cc < nclusters; cc++)
    341334            {
    342335                for (pp = 0; pp < NB_PROCS_MAX; pp++)
     
    384377        image++;
    385378
    386         /////////////////////////
    387         barrier_wait( &barrier );
     379        /////////////////////////////
     380        sqt_barrier_wait( &barrier );
    388381
    389382    } // end while image     
     
    392385    if ( (x==0) && (y==0) && (lpid==0) )
    393386    {
    394         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
     387        for ( c = 0 ; c < nclusters ; c++ )
    395388        {
    396389            free( buf_in[c] );
  • soft/giet_vm/applications/transpose/transpose.py

    r457 r502  
    33from mapping import *
    44
    5 ######################################################################################
     5##################################################################################
    66#   file   : transpose.py  (for the transpose application)
    77#   date   : may 2014
    88#   author : Alain Greiner
    9 #######################################################################################
     9##################################################################################
    1010#  This file describes the mapping of the multi-threaded "transpose"
    1111#  application on a multi-clusters, multi-processors architecture.
    1212#  This includes both the mapping of virtual segments on the clusters,
    1313#  and the mapping of tasks on processors.
     14#  There is one task per processor.
     15#  The mapping of virtual segments is the following:
     16#    - There is one shared data vseg in cluster[0][0]
     17#    - The code vsegs are replicated on all clusters containing processors.
     18#    - There is one heap vseg per cluster containing processors.
     19#    - The stacks vsegs are distributed on all clusters containing processors.
    1420#  This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
    15 - x_size    : number of clusters in a row
    16 - y_size    : number of clusters in a column
    17 - x_width   : number of bits coding x coordinate
    18 - y_width   : number of bits coding y coordinate
    19 - nprocs    : number of processors per cluster
    20 ####################################################################################
     21#  - x_size    : number of clusters in a row
     22#  - y_size    : number of clusters in a column
     23#  - x_width   : number of bits coding x coordinate
     24#  - y_width   : number of bits coding y coordinate
     25#  - nprocs    : number of processors per cluster
     26##################################################################################
    2127
    2228#########################
     
    5460    for x in xrange (x_size):
    5561        for y in xrange (y_size):
    56             mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size,
    57                              'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
    58                              binpath = 'build/transpose/transpose.elf',
    59                              local = True )
     62            cluster_id = (x * y_size) + y
     63            if ( mapping.clusters[cluster_id].procs ):
    6064
    61     # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)           
    62     for x in xrange (x_size):
    63         for y in xrange (y_size):
    64             for p in xrange( nprocs ):
    65                 proc_id = (((x * y_size) + y) * nprocs) + p
    66                 size    = (stack_size / nprocs) & 0xFFFFF000
    67                 base    = stack_base + (proc_id * size)
    68                 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size,
    69                                  'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
    70                                  local = True, big = True )
     65                mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y),
     66                                 code_base , code_size,
     67                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
     68                                 binpath = 'build/transpose/transpose.elf',
     69                                 local = True )
    7170
    72     # heap vsegs: distributed but non local (all heap vsegs can be accessed by all tasks)
     71    # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
    7372    for x in xrange (x_size):
    7473        for y in xrange (y_size):
    7574            cluster_id = (x * y_size) + y
    76             size  = heap_size
    77             base  = heap_base + (cluster_id * size)
    78             mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
    79                              'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
    80                              local = False, big = True )
     75            if ( mapping.clusters[cluster_id].procs ):
     76                for p in xrange( nprocs ):
     77                    proc_id = (((x * y_size) + y) * nprocs) + p
     78                    size    = (stack_size / nprocs) & 0xFFFFF000
     79                    base    = stack_base + (proc_id * size)
     80
     81                    mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p),
     82                                     base, size, 'C_WU', vtype = 'BUFFER',
     83                                     x = x , y = y , pseg = 'RAM',
     84                                     local = True, big = True )
     85
     86    # heap vsegs: distributed non local (all heap vsegs can be accessed by all tasks)
     87    for x in xrange (x_size):
     88        for y in xrange (y_size):
     89            cluster_id = (x * y_size) + y
     90            if ( mapping.clusters[cluster_id].procs ):
     91                size  = heap_size
     92                base  = heap_base + (cluster_id * size)
     93
     94                mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
     95                                 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
     96                                 local = False, big = True )
    8197
    8298    # distributed tasks / one task per processor
    8399    for x in xrange (x_size):
    84100        for y in xrange (y_size):
    85             for p in xrange( nprocs ):
    86                 trdid = (((x * y_size) + y) * nprocs) + p
    87                 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p,
    88                                  'trsp_stack_%d_%d_%d' % (x,y,p),
    89                                  'trsp_heap_%d_%d' % (x,y), 0 )
     101            cluster_id = (x * y_size) + y
     102            if ( mapping.clusters[cluster_id].procs ):
     103                for p in xrange( nprocs ):
     104                    trdid = (((x * y_size) + y) * nprocs) + p
     105
     106                    mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p),
     107                                     trdid, x, y, p,
     108                                     'trsp_stack_%d_%d_%d' % (x,y,p),
     109                                     'trsp_heap_%d_%d' % (x,y), 0 )
    90110
    91111    # extend mapping name
     
    94114    return vspace  # useful for test
    95115           
    96 ################################ test ######################################################
     116################################ test ##################################################
    97117
    98118if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.