Changeset 502 for soft/giet_vm/applications
- Timestamp: Feb 8, 2015, 9:20:45 PM (10 years ago)
- Location: soft/giet_vm/applications
- Files: 1 added, 10 edited
Legend:
- Unmodified lines are shown unprefixed
- Added lines are prefixed with "+"
- Removed lines are prefixed with "-"
- "…" marks unmodified lines elided from a diff
soft/giet_vm/applications/classif/classif.py
diff r488 → r502:

 from mapping import *

-####################################################################################
+###################################################################################
 # file   : classif.py
 # date   : november 2014
 # author : Alain Greiner
-#####################################################################################
+###################################################################################
 # This file describes the mapping of the multi-threaded "classif"
 # application on a multi-clusters, multi-processors architecture.
 # The mapping of tasks on processors is the following:
-# - one "load" task per cluster,
-# - one "store" task per cluster,
-# - (nprocs-2) "analyse" task per cluster.
-# The mapping of virtual segments on the clusters is the following:
-# - The code vsegs are replicated on all clusters.
+# - one "load" task per cluster containing processors,
+# - one "store" task per cluster containing processors,
+# - (nprocs-2) "analyse" task per cluster containing processors.
+# The mapping of virtual segments is the following:
 # - There is one shared data vseg in cluster[0][0]
-# - There is one heap vseg per cluster.
-# - The stacks vsegs are distibuted on all clusters.
+# - The code vsegs are replicated on all clusters containing processors.
+# - There is one heap vseg per cluster containing processors.
+# - The stacks vsegs are distibuted on all clusters containing processors.
 # This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
 # - x_size    : number of clusters in a row
…
 # WARNING: The target architecture cannot contain less
 # than 3 processors per cluster.
-####################################################################################
+##################################################################################

 #########################
…

 heap_base  = 0x30000000
-heap_size  = 0x00008000     # 32 Kbytes (per cluster)
+heap_size  = 0x00040000     # 256 Kbytes (per cluster)

 stack_base = 0x40000000
…
                      local = False )

-    # heap_x_y vsegs : shared / one per cluster
+    # heap vsegs : shared (one per cluster)
     for x in xrange (x_size):
         for y in xrange (y_size):
-            base = heap_base + ( (4*x + y) * heap_size )
-
-            mapping.addVseg( vspace, 'classif_heap_%d_%d' %(x,y), base , heap_size,
-                             'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
-                             local = False )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                size = heap_size
+                base = heap_base + (cluster_id * size)
+
+                mapping.addVseg( vspace, 'classif_heap_%d_%d' %(x,y), base , size,
+                                 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
+                                 local = False )

    # code vsegs : local (one copy in each cluster)
    for x in xrange (x_size):
        for y in xrange (y_size):
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):

-            mapping.addVseg( vspace, 'classif_code_%d_%d' %(x,y), code_base , code_size,
-                             'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
-                             binpath = 'build/classif/classif.elf',
-                             local = True )
+                mapping.addVseg( vspace, 'classif_code_%d_%d' %(x,y),
+                                 code_base , code_size,
+                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
+                                 binpath = 'build/classif/classif.elf',
+                                 local = True )

    # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
    for x in xrange (x_size):
        for y in xrange (y_size):
-            for p in xrange( nprocs ):
-                proc_id = (((x * y_size) + y) * nprocs) + p
-                size    = (stack_size / nprocs) & 0xFFFFF000
-                base    = stack_base + (proc_id * size)
-
-                mapping.addVseg( vspace, 'classif_stack_%d_%d_%d' % (x,y,p), base, size,
-                                 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
-                                 local = True, big = True )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                for p in xrange( nprocs ):
+                    proc_id = (((x * y_size) + y) * nprocs) + p
+                    size    = (stack_size / nprocs) & 0xFFFFF000
+                    base    = stack_base + (proc_id * size)
+
+                    mapping.addVseg( vspace, 'classif_stack_%d_%d_%d' % (x,y,p),
+                                     base, size, 'C_WU', vtype = 'BUFFER',
+                                     x = x , y = y , pseg = 'RAM',
+                                     local = True, big = True )

    # distributed tasks / one task per processor
    for x in xrange (x_size):
        for y in xrange (y_size):
-            for p in xrange( nprocs ):
-                trdid = (((x * y_size) + y) * nprocs) + p
-                if   ( p== 0 ):             # task load
-                    task_index = 0
-                    task_name  = 'load_%d_%d_%d' %(x,y,p)
-                elif ( p== 1 ):             # task store
-                    task_index = 1
-                    task_name  = 'store_%d_%d_%d' %(x,y,p)
-                else :                      # task analyse
-                    task_index = 2
-                    task_name  = 'analyse_%d_%d_%d' % (x,y,p)
-
-                mapping.addTask( vspace, task_name, trdid, x, y, p,
-                                 'classif_stack_%d_%d_%d' % (x,y,p),
-                                 'classif_heap_%d_%d' % (x,y),
-                                 task_index )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                for p in xrange( nprocs ):
+                    trdid = (((x * y_size) + y) * nprocs) + p
+                    if   ( p== 0 ):             # task load
+                        task_index = 0
+                        task_name  = 'load_%d_%d_%d' %(x,y,p)
+                    elif ( p== 1 ):             # task store
+                        task_index = 1
+                        task_name  = 'store_%d_%d_%d' %(x,y,p)
+                    else :                      # task analyse
+                        task_index = 2
+                        task_name  = 'analyse_%d_%d_%d' % (x,y,p)
+
+                    mapping.addTask( vspace, task_name, trdid, x, y, p,
+                                     'classif_stack_%d_%d_%d' % (x,y,p),
+                                     'classif_heap_%d_%d' % (x,y),
+                                     task_index )

    # extend mapping name
…
    return vspace  # useful for test

-################################ test ######################################################
+################################ test ################################################

 if __name__ == '__main__':
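The recurring pattern in this mapping script (and in the other mapping scripts of this changeset) is the row-major cluster index cluster_id = (x * y_size) + y, together with a guard that skips clusters containing no processors. A minimal standalone C sketch of this index arithmetic and its inverse; the program and its platform dimensions are illustrative, not part of the GIET_VM API:

    #include <stdio.h>

    /* Row-major cluster indexing as used by the mapping loops:
     * cluster_id = (x * y_size) + y, and its inverse. */
    int main(void)
    {
        unsigned int x_size = 4;   /* clusters per row    (example value) */
        unsigned int y_size = 4;   /* clusters per column (example value) */
        unsigned int x, y;

        for (x = 0; x < x_size; x++)
        {
            for (y = 0; y < y_size; y++)
            {
                unsigned int cluster_id = (x * y_size) + y;  /* continuous index */
                unsigned int rx = cluster_id / y_size;       /* recover x        */
                unsigned int ry = cluster_id % y_size;       /* recover y        */
                printf("cluster[%u][%u] -> id %u -> [%u][%u]\n",
                       x, y, cluster_id, rx, ry);
            }
        }
        return 0;
    }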
soft/giet_vm/applications/classif/main.c
diff r488 → r502:

-/////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////
 // File       : main.c  (for classif application)
 // Date       : november 2014
 // author     : Alain Greiner
-/////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////
 // This multi-threaded application takes a stream of Gigabit Ethernet packets,
 // and makes packet analysis and classification, based on the source MAC address.
…
 // component to receive and send packets on the Gigabit Ethernet port.
 //
-// This application is described as a TCG (Task and Communication Graph) containing
-// (N+2) tasks per cluster:
+// It can run on architectures containing up to 16 * 16 clusters,
+// and up to 8 processors per cluster.
+//
+// This application is described as a TCG (Task and Communication Graph)
+// containing (N+2) tasks per cluster:
 // - one "load" task
+// - one "store" task
 // - N "analyse" tasks
-// - one "store" task
-// The 4 Kbytes containers are diributed (N+2 containers per cluster):
+// The containers are distributed (N+2 containers per cluster):
 // - one RX container (part of the kernel rx_chbuf), in the kernel heap.
 // - one TX container (part of the kernel tx-chbuf), in the kernel heap.
…
 // The MWMR fifo descriptors array is defined as a global variable in cluster[0][0].
 //
-// Initialisation is done in two steps by the "load" tasks:
-// - Task "load" in cluster[0][0] initialises NIC & CMA channel, and initialises
-//   the barrier between all "load" tasks. Other "load" tasks are waiting on the
-//   global_sync synchronisation variable.
-// - In each cluster[x][y], the "load" task allocates the working containers
-//   and the MWMR fifos descriptors in the local heap.
-//   The "analyse" tasks are waiting on the sync[x][y] variables.
+// Initialisation is done in two steps by the "load" & "store" tasks:
+// - Task "load" in cluster[0][0] initialises the barrier between all "load" tasks,
+//   allocates NIC & CMA RX channel, and starts the NIC_CMA RX transfer.
+//   Other "load" tasks are waiting on the load_sync synchronisation variable.
+//   Task "store" in cluster[0][0] initialises the barrier between all "store" tasks,
+//   allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer.
+//   Other "store" tasks are waiting on the store_sync synchronisation variable.
+// - When this global initialisation is completed, the "load" task in all clusters
+//   allocates the working containers and the MWMR fifos descriptors from the
+//   user local heap. In each cluster, the "analyse" and "store" tasks are waiting
+//   the local initialisation completion on the local_sync[x][y] variables.
 //
-// Instrumentation results display is done by the "store" task in cluster[0][0]
-// when all "store" tasks completed the number of clusters specified by the
-// CONTAINERS_MAX parameter.
-//
 // When initialisation is completed, all tasks loop on containers:
 // 1) The "load" task get an empty working container from the fifo_s2l,
…
 //    and transfer ownership of this container to one "analysis" task by writing
 //    into the fifo_l2a.
-//
 // 2) The "analyse" task get one working container from the fifo_l2a, analyse
 //    each packet header, compute the packet type (depending on the SRC MAC address),
 //    increment the correspondint classification counter, and transpose the SRC
 //    and the DST MAC addresses fot TX tranmission.
-//
 // 3) The "store" task transfer get a full working container from the fifo_a2s,
 //    transfer this user container content to the the kernel tx_chbuf,
 //    and transfer ownership of this empty container to the "load" task by writing
 //    into the fifo_s2l.
-//
-// This application uses the following hardware parameters (hard_config.h file):
-// - X_SIZE       : number of clusters in a row
-// - Y_SIZE       : number of clusters in a column
-// - NB_PROCS_MAX : number of processors per cluster
-/////////////////////////////////////////////////////////////////////////////////////////
+//
+// Instrumentation results display is done by the "store" task in cluster[0][0]
+// when all "store" tasks completed the number of clusters specified by the
+// CONTAINERS_MAX parameter.
+///////////////////////////////////////////////////////////////////////////////////////

 #include "stdio.h"
-#include "barrier.h"
+#include "user_barrier.h"
 #include "malloc.h"
 #include "user_lock.h"
 #include "mwmr_channel.h"
-#include "hard_config.h"
-
-#define CONTAINERS_MAX  5
-#define VERBOSE_ANALYSE 1
-#define ANALYSIS_TASKS  (NB_PROCS_MAX - 2)
-
-/////////////////////////////////////////////////////////////////////////////////////////
+
+#define X_SIZE_MAX      16
+#define Y_SIZE_MAX      16
+#define NPROCS_MAX      8
+#define CONTAINERS_MAX  500
+#define VERBOSE_ANALYSE 0
+
+///////////////////////////////////////////////////////////////////////////////////////
 // Global variables
 // The MWMR channels (descriptors and buffers), as well as the working containers
…
 // But the pointers on these distributed structures are shared arrays
 // stored in cluster[0][0].
-/////////////////////////////////////////////////////////////////////////////////////////
-
-// pointers on distributed temp[x][y][n] containers
-unsigned int*  container[X_SIZE][Y_SIZE][ANALYSIS_TASKS];
+///////////////////////////////////////////////////////////////////////////////////////
+
+// pointers on distributed containers
+unsigned int*  container[X_SIZE_MAX][Y_SIZE_MAX][NPROCS_MAX-2];

 // pointers on distributed mwmr fifos containing : temp[x][y][l] container descriptors
-mwmr_channel_t*  mwmr_l2a[X_SIZE][Y_SIZE];
-mwmr_channel_t*  mwmr_a2s[X_SIZE][Y_SIZE];
-mwmr_channel_t*  mwmr_s2l[X_SIZE][Y_SIZE];
+mwmr_channel_t*  mwmr_l2a[X_SIZE_MAX][Y_SIZE_MAX];
+mwmr_channel_t*  mwmr_a2s[X_SIZE_MAX][Y_SIZE_MAX];
+mwmr_channel_t*  mwmr_s2l[X_SIZE_MAX][Y_SIZE_MAX];

 // local synchros signaling local MWMR fifos initialisation completion
-unsigned int  local_sync[X_SIZE][Y_SIZE];
+volatile unsigned int  local_sync[X_SIZE_MAX][Y_SIZE_MAX];

 // global synchro signaling global initialisation completion
-unsigned int  load_sync  = 0;
-unsigned int  store_sync = 0;
+volatile unsigned int  load_sync  = 0;
+volatile unsigned int  store_sync = 0;

 // instrumentation counters
 unsigned int  counter[16];

-// distributed barriers (between "load" and "store" tasks)
-giet_sbt_barrier_t  rx_barrier;
-giet_sbt_barrier_t  tx_barrier;
+// distributed barrier between "load" tasks
+giet_sqt_barrier_t  rx_barrier;
+
+// distributed barrier between "store" tasks
+giet_sqt_barrier_t  tx_barrier;

 // NIC_RX and NIC_TX channel index
…
 /////////////////////////////////////////
 {
+    // each "load" task get platform parameters
+    unsigned int x_size;   // number of clusters in a row
+    unsigned int y_size;   // number of clusters in a column
+    unsigned int nprocs;   // number of processors per cluster
+    giet_procs_number( &x_size, &y_size, &nprocs );
+
+    giet_assert( (x_size <= X_SIZE_MAX) &&
+                 (y_size <= Y_SIZE_MAX) &&
+                 (nprocs <= NPROCS_MAX) ,
+                 "[CLASSIF ERROR] illegal platform parameters" );
+
     // each "load" task get processor identifiers
     unsigned int x;
…
     giet_proc_xyp( &x, &y, &l );

-    // "load" task[0][0] initialises barrier between load tasks,
+    // "load" task[0][0] initialises barrier between all load tasks,
     // allocates the NIC & CMA RX channels, and start the NIC_CMA RX transfer.
     // Other "load" tasks wait completion
     if ( (x==0) && (y==0) )
     {
-        giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n",
-                        x , y , l , giet_proctime() );
+        giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n"
+                        " x_size = %d / y_size = %d / nprocs = %d\n",
+                        x , y , l , giet_proctime() , x_size, y_size, nprocs );

-        sbt_barrier_init( &rx_barrier, X_SIZE*Y_SIZE, 1 );
-        nic_rx_channel = giet_nic_rx_alloc();
+        sqt_barrier_init( &rx_barrier, x_size , y_size , 1 );
+        nic_rx_channel = giet_nic_rx_alloc( x_size , y_size );
         giet_nic_rx_start( nic_rx_channel );
         load_sync = 1;
…
     }

-    // all load tasks allocate containers[x][y][n] (from local heap)
+    // each load tasks allocates containers[x][y][n] (from local heap)
     // and register pointers in the local stack
     unsigned int  n;
-    unsigned int* cont[ANALYSIS_TASKS];
-
-    for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
+    unsigned int* cont[NPROCS_MAX-2];
+    unsigned int  analysis_tasks = nprocs-2;
+
+    for ( n = 0 ; n < analysis_tasks ; n++ )
     {
         container[x][y][n] = malloc( 4096 );
…
     }

-    // all load tasks allocate data buffers for mwmr fifos (from local heap)
-    unsigned int* data_l2a = malloc( ANALYSIS_TASKS<<2 );
-    unsigned int* data_a2s = malloc( ANALYSIS_TASKS<<2 );
-    unsigned int* data_s2l = malloc( ANALYSIS_TASKS<<2 );
-
-    // all load tasks allocate mwmr fifos descriptors (from local heap)
+    // each load task allocates data buffers for mwmr fifos (from local heap)
+    unsigned int* data_l2a = malloc( analysis_tasks<<2 );
+    unsigned int* data_a2s = malloc( analysis_tasks<<2 );
+    unsigned int* data_s2l = malloc( analysis_tasks<<2 );
+
+    // each load task allocates mwmr fifos descriptors (from local heap)
     mwmr_l2a[x][y] = malloc( sizeof(mwmr_channel_t) );
     mwmr_a2s[x][y] = malloc( sizeof(mwmr_channel_t) );
     mwmr_s2l[x][y] = malloc( sizeof(mwmr_channel_t) );

-    // all "load" tasks register local pointers on mwmr fifos in local stack
+    // each load task registers local pointers on mwmr fifos in local stack
     mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y];
     mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y];
     mwmr_channel_t* fifo_s2l = mwmr_s2l[x][y];

-    // all "load" tasks initialise local mwmr fifos descriptors
+    // each load task initialises local mwmr fifos descriptors
     // ( width = 4 bytes / depth = number of analysis tasks )
-    mwmr_init( fifo_l2a , data_l2a , 1 , ANALYSIS_TASKS );
-    mwmr_init( fifo_a2s , data_a2s , 1 , ANALYSIS_TASKS );
-    mwmr_init( fifo_s2l , data_s2l , 1 , ANALYSIS_TASKS );
-
-    // all "load" tasks initialise local containers as empty in fifo_s2l
-    for ( n = 0 ; n < ANALYSIS_TASKS ; n++ ) mwmr_write( fifo_s2l , &n , 1 );
-
-    // each "load" task[x][y] signals mwmr fifos initialisation completion
+    mwmr_init( fifo_l2a , data_l2a , 1 , analysis_tasks );
+    mwmr_init( fifo_a2s , data_a2s , 1 , analysis_tasks );
+    mwmr_init( fifo_s2l , data_s2l , 1 , analysis_tasks );
+
+    // each load task initialises local containers as empty in fifo_s2l
+    for ( n = 0 ; n < analysis_tasks ; n++ ) mwmr_write( fifo_s2l , &n , 1 );
+
+    // each load task[x][y] signals mwmr fifos initialisation completion
     // to other tasks in same cluster[x][y]
     local_sync[x][y] = 1;

-    // "load" task[0][0] displays status
+    // load task[0][0] displays status
     if ( (x==0) && (y==0) )
         giet_shr_printf("\n*** Task load on P[%d,%d,%d] enters main loop at cycle %d\n"
…
                         (unsigned int)fifo_s2l, (unsigned int)data_s2l,
                         (unsigned int)cont[0],
-                        X_SIZE, Y_SIZE, NB_PROCS_MAX );
+                        x_size, y_size, nprocs );

     /////////////////////////////////////////////////////////////
-    // All "load" tasks enter the main loop (on containers)
+    // All load tasks enter the main loop (on containers)
     unsigned int  count = 0;   // loaded containers count
     unsigned int  index;       // available container index
     unsigned int* temp;        // pointer on available container

     while ( count < CONTAINERS_MAX )
     {
-        // get one empty count index from fifo_s2l
+        // get one empty container index from fifo_s2l
         mwmr_read( fifo_s2l , &index , 1 );
         temp = cont[index];

-        // get one count from kernel rx_chbuf
+        // get one container from kernel rx_chbuf
         giet_nic_rx_move( nic_rx_channel, temp );
…
         unsigned int nwords   = temp[0] >> 16;

-        if ( (x==X_SIZE-1) && (y==Y_SIZE-1) )
+        if ( (x==0) && (y==0) )
             giet_shr_printf("\n*** Task load on P[%d,%d,%d] get container %d at cycle %d"
                             " : %d packets / %d words\n",
                             x, y, l, count, giet_proctime(), npackets, nwords );

-        // put the full count index to fifo_l2a
+        // put the full container index to fifo_l2a
         mwmr_write( fifo_l2a, &index , 1 );
…
     // all "load" tasks synchronise before stats
-    sbt_barrier_wait( &rx_barrier );
+    sqt_barrier_wait( &rx_barrier );

     // "load" task[0][0] stops the NIC_CMA RX transfer and displays stats
…
 //////////////////////////////////////////
 {
+    // each "load" task get platform parameters
+    unsigned int x_size;   // number of clusters in row
+    unsigned int y_size;   // number of clusters in a column
+    unsigned int nprocs;   // number of processors per cluster
+    giet_procs_number( &x_size, &y_size, &nprocs );
+
     // get processor identifiers
     unsigned int x;
…
     giet_proc_xyp( &x, &y, &l );

-
     // "store" task[0][0] initialises the barrier between all "store" tasks,
     // allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer.
…
     if ( (x==0) && (y==0) )
     {
-        giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n",
-                        x , y , l , giet_proctime() );
+        giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n"
+                        " x_size = %d / y_size = %d / nprocs = %d\n",
+                        x , y , l , giet_proctime() , x_size, y_size, nprocs );

-        sbt_barrier_init( &tx_barrier , X_SIZE*Y_SIZE, 1 );
-        nic_tx_channel = giet_nic_tx_alloc();
+        sqt_barrier_init( &tx_barrier , x_size , y_size , 1 );
+        nic_tx_channel = giet_nic_tx_alloc( x_size , y_size );
         giet_nic_tx_start( nic_tx_channel );
         store_sync = 1;
…
     while ( local_sync[x][y] == 0 ) asm volatile ("nop");

-    // all "store" tasks register pointers on working containers in local stack
+    // each "store" tasks register pointers on working containers in local stack
     unsigned int  n;
-    unsigned int* cont[ANALYSIS_TASKS];
-    for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
+    unsigned int  analysis_tasks = nprocs-2;
+    unsigned int* cont[NPROCS_MAX-2];
+
+    for ( n = 0 ; n < analysis_tasks ; n++ )
     {
         cont[n] = container[x][y][n];
…
         unsigned int nwords   = temp[0] >> 16;

-        if ( (x==X_SIZE-1) && (y==Y_SIZE-1) )
+        if ( (x==0) && (y==0) )
             giet_shr_printf("\n*** Task store on P[%d,%d,%d] get container %d at cycle %d"
                             " : %d packets / %d words\n",
…
     // all "store" tasks synchronise before result display
-    sbt_barrier_wait( &tx_barrier );
+    sqt_barrier_wait( &tx_barrier );

     // "store" task[0,0] stops NIC_CMA TX transfer and displays results
…
 ////////////////////////////////////////////
 {
+    // each "load" task get platform parameters
+    unsigned int x_size;   // number of clusters in row
+    unsigned int y_size;   // number of clusters in a column
+    unsigned int nprocs;   // number of processors per cluster
+    giet_procs_number( &x_size, &y_size, &nprocs );
+
     // get processor identifiers
     unsigned int x;
…
     if ( (x==0) && (y==0) )
     {
-        giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n",
-                        x , y , l , giet_proctime() );
+        giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n"
+                        " x_size = %d / y_size = %d / nprocs = %d\n",
+                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
     }
…
     // all "analyse" tasks register pointers on working containers in local stack
     unsigned int  n;
-    unsigned int* cont[ANALYSIS_TASKS];
-    for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
+    unsigned int  analysis_tasks = nprocs-2;
+    unsigned int* cont[NPROCS_MAX-2];
+    for ( n = 0 ; n < analysis_tasks ; n++ )
     {
         cont[n] = container[x][y][n];
…
             unsigned int word2 = temp[first + 2];

+#if VERBOSE_ANALYSE
             unsigned long long dst = ((unsigned long long)(word1 & 0xFFFF0000)>>16) |
                                      (((unsigned long long)word0)<<16);
             unsigned long long src = ((unsigned long long)(word1 & 0x0000FFFF)<<32) |
                                      ((unsigned long long)word2);
-#if VERBOSE_ANALYSE
             if ( p < 10 )
             {
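The load_sync, store_sync and local_sync flags become volatile here: late tasks spin on them while task[0][0] performs the one-time global initialisation, relying on the platform's cache coherence to make the write visible. A portable sketch of the same publish/spin handshake using C11 atomics instead of volatile (thread count and names are illustrative, not GIET_VM code):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    atomic_uint load_sync = 0;   /* set once by the initialising thread */

    void* load_task(void* arg)
    {
        long id = (long)arg;
        if (id == 0)
        {
            /* thread 0 performs the global initialisation ... */
            printf("thread 0 : global init done\n");
            atomic_store(&load_sync, 1);        /* publish completion   */
        }
        else
        {
            while (atomic_load(&load_sync) == 0)
                ;                               /* spin until published */
            printf("thread %ld : init observed\n", id);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        for (long i = 0; i < 4; i++) pthread_create(&t[i], NULL, load_task, (void*)i);
        for (long i = 0; i < 4; i++) pthread_join(t[i], NULL);
        return 0;
    }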
soft/giet_vm/applications/convol/convol.py
diff r457 → r502:

 # This include both the mapping of virtual segments on the clusters,
 # and the mapping of tasks on processors.
+# There is one task per processor.
+# The mapping of virtual segments is the following:
+# - There is one shared data vseg in cluster[0][0]
+# - The code vsegs are replicated on all clusters containing processors.
+# - There is one heap vseg per cluster containing processors.
+# - The stacks vsegs are distibuted on all clusters containing processors.
 # This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
-# - x_size  : number of clusters in a row
-# - y_size  : number of clusters in a column
-# - x_width : number of bits coding x coordinate
-# - y_width : number of bits coding y coordinate
-# - nprocs  : number of processors per cluster
+# - x_size  : number of clusters in a row
+# - y_size  : number of clusters in a column
+# - x_width : number of bits coding x coordinate
+# - y_width : number of bits coding y coordinate
+# - nprocs  : number of processors per cluster
 ####################################################################################
…
    # data vseg in cluster[0,0] : non local
-    mapping.addVseg( vspace, 'conv_data', data_base , data_size, 'C_WU', vtype = 'ELF',
-                     x = 0, y = 0, pseg = 'RAM', binpath = 'build/convol/convol.elf',
+    mapping.addVseg( vspace, 'conv_data', data_base , data_size,
+                     'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM',
+                     binpath = 'build/convol/convol.elf',
                      local = False )

    # code vsegs : local (one copy per cluster)
    for x in xrange (x_size):
        for y in xrange (y_size):
-            size = code_size
-            base = code_base
-            mapping.addVseg( vspace, 'conv_code_%d_%d' % (x,y), base, size,
-                             'CXWU', vtype = 'ELF', x = x , y = y , pseg = 'RAM',
-                             binpath = 'build/convol/convol.elf',
-                             local = True )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                size = code_size
+                base = code_base
+
+                mapping.addVseg( vspace, 'conv_code_%d_%d' % (x,y), base, size,
+                                 'CXWU', vtype = 'ELF', x = x , y = y , pseg = 'RAM',
+                                 binpath = 'build/convol/convol.elf',
+                                 local = True )

    # stack vsegs : local (one stack per processor)
    for x in xrange (x_size):
        for y in xrange (y_size):
-            for p in xrange( nprocs ):
-                proc_id = (((x * y_size) + y) * nprocs) + p
-                size    = (stack_size / nprocs) & 0xFFFFF000
-                base    = stack_base + (proc_id * size)
-                mapping.addVseg( vspace, 'conv_stack_%d_%d_%d' % (x,y,p), base, size,
-                                 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
-                                 local = True, big = True )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                for p in xrange( nprocs ):
+                    proc_id = (((x * y_size) + y) * nprocs) + p
+                    size    = (stack_size / nprocs) & 0xFFFFF000
+                    base    = stack_base + (proc_id * size)
+
+                    mapping.addVseg( vspace, 'conv_stack_%d_%d_%d' % (x,y,p),
+                                     base, size, 'C_WU', vtype = 'BUFFER',
+                                     x = x , y = y , pseg = 'RAM',
+                                     local = True, big = True )

-    # heap vsegs : distributed but non local (all heap vsegs can be accessed by all tasks)
+    # heap vsegs : distributed but non local (any heap can be accessed by any task)
    for x in xrange (x_size):
        for y in xrange (y_size):
            cluster_id = (x * y_size) + y
-            size       = heap_size
-            base       = heap_base + (cluster_id * size)
-            mapping.addVseg( vspace, 'conv_heap_%d_%d' % (x,y), base, size,
-                             'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
-                             local = False, big = True )
+            if ( mapping.clusters[cluster_id].procs ):
+                size = heap_size
+                base = heap_base + (cluster_id * size)
+
+                mapping.addVseg( vspace, 'conv_heap_%d_%d' % (x,y), base, size,
+                                 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
+                                 local = False, big = True )

    # distributed tasks : one task per processor
    for x in xrange (x_size):
        for y in xrange (y_size):
-            for p in xrange( nprocs ):
-                trdid = (((x * y_size) + y) * nprocs) + p
-                mapping.addTask( vspace, 'conv_%d_%d_%d' % (x,y,p), trdid, x, y, p,
-                                 'conv_stack_%d_%d_%d' % (x,y,p),
-                                 'conv_heap_%d_%d' % (x,y), 0 )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                for p in xrange( nprocs ):
+                    trdid = (((x * y_size) + y) * nprocs) + p
+
+                    mapping.addTask( vspace, 'conv_%d_%d_%d' % (x,y,p),
+                                     trdid, x, y, p,
+                                     'conv_stack_%d_%d_%d' % (x,y,p),
+                                     'conv_heap_%d_%d' % (x,y), 0 )

    # extend mapping name
…
    return vspace  # useful for test

-################################ test ######################################################
+################################ test ################################################

 if __name__ == '__main__':
soft/giet_vm/applications/convol/main.c
diff r488 → r502:

-////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////
 // File   : main.c  (for convol application)
 // Date   : june 2014
 // author : Alain Greiner
-////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////
 // This multi-threaded application application implements a 2D convolution product.
 // The convolution kernel is [201]*[35] pixels, but it can be factored in two
 // independant line and column convolution products.
 // It can run on a multi-processors, multi-clusters architecture, with one thread
-// per processor. It uses the he following hardware parameters, that must be defined
-// in the hard_config.h file:
-// - X_SIZE       : number of clusters in a row
-// - Y_SIZE       : number of clusters in a column
-// - NB_PROCS_MAX : number of processors per cluster
-// - FBUF_X_SIZE  : number of pixels per line in frame buffer
-// - FBUF_Y_SIZE  : number of lines in frame buffer
+// per processor.
 //
 // The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
 //
-// - The number of clusters containing processors must be a power of 2.
-// - The number of processors per cluster must be a power of 2.
-////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "hard_config.h"
+// - number of clusters containing processors must be power of 2 no larger than 256.
+// - number of processors per cluster must be power of 2 no larger than 8.
+///////////////////////////////////////////////////////////////////////////////////////
+
 #include "stdio.h"
 #include "stdlib.h"
-#include "barrier.h"
+#include "user_barrier.h"
 #include "malloc.h"

-#define USE_SBT_BARRIER        1
+#define USE_SQT_BARRIER        1
 #define VERBOSE                0
 #define SUPER_VERBOSE          0

+#define X_SIZE_MAX             16
+#define Y_SIZE_MAX             16
+#define PROCS_MAX              8
+#define CLUSTERS_MAX           (X_SIZE_MAX * Y_SIZE_MAX)
+
 #define INITIAL_DISPLAY_ENABLE 0
 #define FINAL_DISPLAY_ENABLE   1

-#define NB_CLUSTERS            (X_SIZE * Y_SIZE)
 #define PIXEL_SIZE             2
 #define NL                     1024
…
 // global instrumentation counters (cluster_id, lpid]

-unsigned int START[NB_CLUSTERS][NB_PROCS_MAX];
-unsigned int H_BEG[NB_CLUSTERS][NB_PROCS_MAX];
-unsigned int H_END[NB_CLUSTERS][NB_PROCS_MAX];
-unsigned int V_BEG[NB_CLUSTERS][NB_PROCS_MAX];
-unsigned int V_END[NB_CLUSTERS][NB_PROCS_MAX];
-unsigned int D_BEG[NB_CLUSTERS][NB_PROCS_MAX];
-unsigned int D_END[NB_CLUSTERS][NB_PROCS_MAX];
+unsigned int START[CLUSTERS_MAX][PROCS_MAX];
+unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX];
+unsigned int H_END[CLUSTERS_MAX][PROCS_MAX];
+unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX];
+unsigned int V_END[CLUSTERS_MAX][PROCS_MAX];
+unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX];
+unsigned int D_END[CLUSTERS_MAX][PROCS_MAX];

 // global synchronization barrier

-#if USE_SBT_BARRIER
-giet_sbt_barrier_t barrier;
+#if USE_SQT_BARRIER
+giet_sqt_barrier_t barrier;
 #else
 giet_barrier_t     barrier;
…
 // global pointers on distributed buffers in all clusters
-unsigned short * GA[NB_CLUSTERS];
-int *            GB[NB_CLUSTERS];
-int *            GC[NB_CLUSTERS];
-int *            GD[NB_CLUSTERS];
-unsigned char *  GZ[NB_CLUSTERS];
+unsigned short * GA[CLUSTERS_MAX];
+int *            GB[CLUSTERS_MAX];
+int *            GC[CLUSTERS_MAX];
+int *            GD[CLUSTERS_MAX];
+unsigned char *  GZ[CLUSTERS_MAX];

 ///////////////////////////////////////////
…
     int z;      // vertical filter index for loops

+    // plat-form parameters
+    unsigned int x_size;    // number of clusters in a row
+    unsigned int y_size;    // number of clusters in a column
+    unsigned int nprocs;    // number of processors per cluster
+
+    giet_procs_number( &x_size , &y_size , &nprocs );
+
     // processor identifiers
-    unsigned int x;
-    unsigned int y;
-    unsigned int lpid;
+    unsigned int x;         // x coordinate
+    unsigned int y;         // y coordinate
+    unsigned int lpid;      // local proc/task id
     giet_proc_xyp( &x, &y, &lpid );

-    int          file = 0;                                  // file descriptor
-    unsigned int nprocs     = NB_PROCS_MAX;                 // procs per cluster
-    unsigned int nclusters  = NB_CLUSTERS;                  // number of clusters
-    unsigned int cluster_id = (x * Y_SIZE) + y;             // continuous cluster index
-    unsigned int task_id    = (cluster_id * nprocs) + lpid; // continuous task index
-    unsigned int ntasks     = nclusters * nprocs;           // number of tasks
-    unsigned int frame_size = FRAME_SIZE;                   // total size (bytes)
-    unsigned int nblocks    = frame_size / 512;             // number of blocks per frame
+    int          file = 0;                                  // file descriptor
+    unsigned int nclusters  = x_size * y_size;              // number of clusters
+    unsigned int cluster_id = (x * y_size) + y;             // continuous cluster index
+    unsigned int task_id    = (cluster_id * nprocs) + lpid; // continuous task index
+    unsigned int ntasks     = nclusters * nprocs;           // number of tasks
+    unsigned int frame_size = FRAME_SIZE;                   // total size (bytes)
+    unsigned int nblocks    = frame_size / 512;             // number of blocks/frame

     unsigned int lines_per_task     = NL / ntasks;          // lines per task
     unsigned int lines_per_cluster  = NL / nclusters;       // lines per cluster
     unsigned int pixels_per_task    = NP / ntasks;          // columns per task
     unsigned int pixels_per_cluster = NP / nclusters;       // columns per cluster

     int first, last;
…
     // parameters checking

-    if ( (NP != FBUF_X_SIZE) || (NL != FBUF_Y_SIZE) )
-    {
-        giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
-    }
-    if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
-        giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2 or 4\n");
-
-    if ((X_SIZE!=1) && (X_SIZE!=2) && (X_SIZE!=4) && (X_SIZE!=8) && (X_SIZE!=16))
-        giet_exit( "[CONVOL ERROR] X_SIZE must be 1, 2, 4, 8, 16\n");
+    if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4) && (nprocs != 8))
+        giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2, 4 or 8\n");
+
+    if ((x_size!=1) && (x_size!=2) && (x_size!=4) && (x_size!=8) && (x_size!=16))
+        giet_exit( "[CONVOL ERROR] x_size must be 1, 2, 4, 8, 16\n");

-    if ((Y_SIZE!=1) && (Y_SIZE!=2) && (Y_SIZE!=4) && (Y_SIZE!=8) && (Y_SIZE!=16))
-        giet_exit( "[CONVOL ERROR] Y_SIZE must be 1, 2, 4, 8, 16\n");
+    if ((y_size!=1) && (y_size!=2) && (y_size!=4) && (y_size!=8) && (y_size!=16))
+        giet_exit( "[CONVOL ERROR] y_size must be 1, 2, 4, 8, 16\n");

     if ( NL % nclusters != 0 )
-        giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NL");
+        giet_exit( "[CONVOL ERROR] CLUSTERS_MAX must be a divider of NL");

     if ( NP % nclusters != 0 )
-        giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NP");
+        giet_exit( "[CONVOL ERROR] CLUSTERS_MAX must be a divider of NP");
…
     if ( (x==0) && (y==0) && (lpid==0) )
     {
-        // parameters checking
-        if ( (NP != FBUF_X_SIZE) || (NL != FBUF_Y_SIZE) )
-            giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
-
-        if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
-            giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2 or 4\n");
-
-        if ((X_SIZE!=1) && (X_SIZE!=2) && (X_SIZE!=4) && (X_SIZE!=8) && (X_SIZE!=16))
-            giet_exit( "[CONVOL ERROR] X_SIZE must be 1, 2, 4, 8, 16\n");
-
-        if ((Y_SIZE!=1) && (Y_SIZE!=2) && (Y_SIZE!=4) && (Y_SIZE!=8) && (Y_SIZE!=16))
-            giet_exit( "[CONVOL ERROR] Y_SIZE must be 1, 2, 4, 8, 16\n");
-
-        if ( NL % nclusters != 0 )
-            giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NL");
-
-        if ( NP % nclusters != 0 )
-            giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NP");
-
-
         giet_shr_printf("\n[CONVOL] task[0,0,0] starts barrier init at cycle %d\n"
-                        "- NB_CLUSTERS  = %d\n"
-                        "- NB_PROCS_MAX = %d\n"
-                        "- NB_TASKS     = %d\n"
-                        "- NB_BLOCKS    = %x\n",
+                        "- CLUSTERS = %d\n"
+                        "- PROCS    = %d\n"
+                        "- TASKS    = %d\n"
+                        "- BLOCKS   = %d\n",
                         giet_proctime(), nclusters, nprocs, ntasks, nblocks );
-#if USE_SBT_BARRIER
-        sbt_barrier_init( &barrier, nclusters, nprocs );
+#if USE_SQT_BARRIER
+        sqt_barrier_init( &barrier, x_size , y_size , nprocs );
 #else
         barrier_init( &barrier, ntasks );
…
 #if VERBOSE
-    giet_shr_printf( "\n[CONVOL] task[%d,%d,%d] enters malloc at cycle %d\n", x,y,lpid, date );
+    giet_shr_printf( "\n[CONVOL] task[%d,%d,%d] enters malloc at cycle %d\n",
+                     x,y,lpid, date );
 #endif
…
     ///////////////////////////////
-#if USE_SBT_BARRIER
-    sbt_barrier_wait( &barrier );
+#if USE_SQT_BARRIER
+    sqt_barrier_wait( &barrier );
 #else
     barrier_wait( &barrier );
…
-    unsigned short * A[NB_CLUSTERS];
-    int *            B[NB_CLUSTERS];
-    int *            C[NB_CLUSTERS];
-    int *            D[NB_CLUSTERS];
-    unsigned char *  Z[NB_CLUSTERS];
+    unsigned short * A[CLUSTERS_MAX];
+    int *            B[CLUSTERS_MAX];
+    int *            C[CLUSTERS_MAX];
+    int *            D[CLUSTERS_MAX];
+    unsigned char *  Z[CLUSTERS_MAX];

     for (c = 0; c < nclusters; c++)
…
                         " at cycle %d\n", giet_proctime() );

-    for ( c = 0 ; c < NB_CLUSTERS ; c++ )
+    for ( c = 0 ; c < nclusters ; c++ )
     {
         giet_shr_printf( "\n[CONVOL] task[0,0,0] starts load "
…
     ////////////////////////////
-#if USE_SBT_BARRIER
-    sbt_barrier_wait( &barrier );
+#if USE_SQT_BARRIER
+    sqt_barrier_wait( &barrier );
 #else
     barrier_wait( &barrier );
…
     /////////////////////////////
-#if USE_SBT_BARRIER
-    sbt_barrier_wait( &barrier );
+#if USE_SQT_BARRIER
+    sqt_barrier_wait( &barrier );
 #else
     barrier_wait( &barrier );
…
     ////////////////////////////
-#if USE_SBT_BARRIER
-    sbt_barrier_wait( &barrier );
+#if USE_SQT_BARRIER
+    sqt_barrier_wait( &barrier );
 #else
     barrier_wait( &barrier );
…
     //////////////////////////////
-#if USE_SBT_BARRIER
-    sbt_barrier_wait( &barrier );
+#if USE_SQT_BARRIER
+    sqt_barrier_wait( &barrier );
 #else
     barrier_wait( &barrier );
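The refactor repeated throughout this file (and in the other main.c files below) replaces the compile-time X_SIZE / Y_SIZE / NB_PROCS_MAX constants with values returned at run time by giet_procs_number(). A standalone sketch of the derived quantities, with the syscall stubbed out so the arithmetic can be checked on a host machine (the stubbed platform dimensions are illustrative):

    #include <stdio.h>

    #define NL 1024   /* image lines, as in convol */

    /* host-side stand-in for the GIET_VM syscall giet_procs_number() */
    static void giet_procs_number(unsigned int* x, unsigned int* y, unsigned int* p)
    {
        *x = 4; *y = 4; *p = 4;   /* example platform: 16 clusters, 4 procs each */
    }

    int main(void)
    {
        unsigned int x_size, y_size, nprocs;
        giet_procs_number(&x_size, &y_size, &nprocs);

        unsigned int nclusters = x_size * y_size;        /* clusters               */
        unsigned int ntasks    = nclusters * nprocs;     /* one task per processor */
        unsigned int lines_per_task    = NL / ntasks;    /* work split per task    */
        unsigned int lines_per_cluster = NL / nclusters;

        printf("%u clusters, %u tasks, %u lines/task, %u lines/cluster\n",
               nclusters, ntasks, lines_per_task, lines_per_cluster);
        return 0;
    }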
soft/giet_vm/applications/gameoflife/gameoflife.ld
diff r251 → r502:

 *****************************************************************************/

-seg_data_base = 0x00800000;
-seg_code_base = 0x00400000;
+seg_data_base = 0x20000000;
+seg_code_base = 0x10000000;

 /***************************************************************************
…
         *(.ctors)
         *(.rodata)
-        /* . = ALIGN(4); */
         *(.rodata.*)
-        /* . = ALIGN(4); */
         *(.data)
-        /* . = ALIGN(4); */
         *(.lit8)
         *(.lit4)
         *(.sdata)
-        /* . = ALIGN(4); */
         *(.bss)
         *(COMMON)
soft/giet_vm/applications/gameoflife/main.c
diff r444 → r502:

-/*
- * This application is an emulation of the game of life automaton
- * It must be deployed from processor 0 and use contiguous processor
- * (example 0,1,2,3)
- */
-
+//////////////////////////////////////////////////////////////////////////////////
+// File   : main.c  (for gameoflife)
+// Date   : November 2013
+// Author : Alexandre Joannou <alexandre.joannou@lip6.fr>
+//
+// This application is an emulation of the game of life automaton.
+// The world size is defined by the HEIGHT and WIDTH parameters.
+// There is one task per processor, and each task compute HEIGHT/nbprocs lines.
+// The number of processors must be a power of 2 not larger than HEIGHT.
+//////////////////////////////////////////////////////////////////////////////////

 #include "stdio.h"
 #include "limits.h"
-#include "barrier.h"
-#include "hard_config.h"
+#include "user_barrier.h"
 #include "mapping_info.h"

 #define WIDTH  128
 #define HEIGHT 128
-#define NB_CLUSTER_MAX 256
 #define NB_ITERATION 1000000000

-#define PRINTF(...) ({ if ( proc_id==0) { giet_tty_printf(__VA_ARGS__); } })
-
-giet_barrier_t barriers[2];
-
-unsigned int init_ok = 1;
-
-#define NEW 0
-#define OLD 1
+#define PRINTF(...) ({ if ( proc_id==0) { giet_shr_printf(__VA_ARGS__); } })
+
+giet_sqt_barrier_t barrier;
+
+unsigned int init_ok = 0;
+
+#define OLD 0
+#define NEW 1
+#define DSP 2

 typedef unsigned char uint8_t;
 typedef unsigned int size_t;

-uint8_t world[2][HEIGHT][WIDTH];
-uint8_t world_yuv[HEIGHT][WIDTH];
-
-/* Generate binary values for world between base_line and base_line + nb_line */
+uint8_t world[3][HEIGHT][WIDTH];
+
+/////////////////////////////////////////////////
 void init_world(size_t base_line, size_t nb_line)
 {
     size_t x,y;
-    for (y = base_line ; y < base_line + nb_line; y++){
-        for(x = 0; x < WIDTH ; x++) {
-            // TODO OPTIMIZE RANDOM INIT
+    for (y = base_line ; y < base_line + nb_line; y++)
+    {
+        for(x = 0; x < WIDTH ; x++)
+        {
             world[OLD][y][x] = giet_rand() % 2;
         }
…
 }

+/////////////////////////////////////////////////
 uint8_t number_of_alive_neigh(size_t x, size_t y)
 {
…
 }

-/* Compute cell x,y */
+/////////////////////////////////////////////////
 uint8_t compute_cell(size_t x, size_t y)
 {
     uint8_t nb_neighbours_alive = number_of_alive_neigh(x,y);
-    if (world[OLD][y][x] == 1) {
-        if (nb_neighbours_alive == 2 ||
-            nb_neighbours_alive == 3)
-        {
-            return 1;
-        }
-    }
-    else {
-        if (nb_neighbours_alive == 3) {
-            return 1;
-        }
-        else {
-            return world[OLD][y][x];
-        }
+    if (world[OLD][y][x] == 1)
+    {
+        if (nb_neighbours_alive == 2 || nb_neighbours_alive == 3) return 1;
+    }
+    else
+    {
+        if (nb_neighbours_alive == 3) return 1;
+        else return world[OLD][y][x];
     }
     return 0;
…
 {
     size_t x,y;
-    for (y = base_line; y < base_line + nb_line; y++){
-        for(x = 0; x < WIDTH ; x++) {
-            //world_yuv[y][x] = world[NEW][y][x]*100;
-            world[NEW][y][x] = world[NEW][y][x]*255;
+    for (y = base_line; y < base_line + nb_line; y++)
+    {
+        for(x = 0; x < WIDTH ; x++)
+        {
+            world[DSP][y][x] = world[OLD][y][x]*255;
         }
     }

     giet_fbf_sync_write( base_line * WIDTH ,
-                         &world[NEW][base_line][0],
-                         nb_line * WIDTH);
+                         &world[DSP][base_line][0],
+                         nb_line * WIDTH );
 }
…
     giet_proc_xyp( &x, &y, &p );

+    // get processors number
+    unsigned int x_size;
+    unsigned int y_size;
+    unsigned int n_local_procs;
+    giet_procs_number( &x_size, &y_size, &n_local_procs );
+
     // compute continuous processor index
-    unsigned int proc_id = (((x * Y_SIZE) + y) * NB_PROCS_MAX) + p;
-
-    unsigned int nlocal_procs  = NB_PROCS_MAX;              // processors per cluster
-    unsigned int nclusters     = X_SIZE*Y_SIZE;             // number of clusters
-    unsigned int nglobal_procs = nclusters * nlocal_procs;  // number of processors
+    unsigned int proc_id = (((x * y_size) + y) * n_local_procs) + p;
+
+    unsigned int n_clusters     = x_size * y_size;              // number of clusters
+    unsigned int n_global_procs = n_clusters * n_local_procs;   // number of processors
     size_t i;

-    size_t nb_line = HEIGHT / nglobal_procs;
+    if ( n_global_procs > HEIGHT )
+    {
+        PRINTF("[GAMEOFLIFE ERROR] Number or processors too large :"
+               " nb_procs = %d / image heigth = %d\n", n_global_procs, HEIGHT );
+        giet_exit("error");
+    }
+
+    size_t nb_line   = HEIGHT / n_global_procs;
     size_t base_line = nb_line * proc_id;

-    PRINTF("*** Starting init at cycle %d ***\n", giet_proctime());
-
-    // barriers initialization
+    PRINTF("\n*** Starting barrier initialisation at cycle %d ***\n"
+           " nprocs = %d / nlines = %d\n",
+           giet_proctime() , n_global_procs, HEIGHT );
+
+    // barrier initialization
     if ( proc_id == 0 )
     {
-        barrier_init(&barriers[0], nglobal_procs);
-        barrier_init(&barriers[1], nglobal_procs);
-
-        init_ok = 0;
+        sqt_barrier_init( &barrier , x_size , y_size , n_local_procs );
+        init_ok = 1;
     }
     else
     {
-        while ( init_ok == 1 );
-    }
-
-    init_world(base_line, nb_line);
-
-    PRINTF("*** Completing init at cycle %d ***\n", giet_proctime());
-    barrier_wait(&barriers[0]);
+        while ( init_ok == 0 ) asm volatile("nop");
+    }
+
+    PRINTF("\n*** Starting world initialisation at cycle %d ***\n",
+           giet_proctime() );
+
+    // parallel world initialization
+    init_world( base_line , nb_line );
+
+    PRINTF("coucou 0\n");
+
+    display_world( base_line , nb_line );
+
+    PRINTF("coucou 1\n");
+
+    sqt_barrier_wait( &barrier );
+
+    PRINTF("\n*** Starting life at cycle %d ***\n",
+           giet_proctime() );

     for (i = 0; i < NB_ITERATION; i++)
     {
-        compute_new_gen(base_line, nb_line);
-        grow_old_world(base_line, nb_line);
-        display_world(base_line, nb_line);
-        barrier_wait(&barriers[1]);
-        barrier_init(&barriers[1], nglobal_procs);
-    }
-
-    PRINTF("*** End of main at cycle %d ***\n", giet_proctime());
+        compute_new_gen( base_line, nb_line );
+        grow_old_world( base_line, nb_line );
+        display_world( base_line, nb_line );
+
+        sqt_barrier_wait( &barrier );
+
+        PRINTF(" - iteration %d completed\n", i );
+    }
+
+    PRINTF("\n*** End of main at cycle %d ***\n", giet_proctime());

     giet_exit("Completed");
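The world array grows a third plane here: OLD is read, NEW is written, and DSP holds the 0/255 pixels sent to the frame buffer, so the display no longer clobbers the generation being computed. A minimal host-side sketch of one generation step with this three-plane layout; the world size is reduced for brevity and the program is illustrative, not the application code:

    #include <stdio.h>
    #include <string.h>

    #define H 8
    #define W 8
    #define OLD 0   /* generation being read    */
    #define NEW 1   /* generation being written */
    #define DSP 2   /* display plane (0 or 255) */

    static unsigned char world[3][H][W];

    static unsigned char alive_neighbours(int x, int y)
    {
        unsigned char n = 0;
        for (int dy = -1; dy <= 1; dy++)
            for (int dx = -1; dx <= 1; dx++)
                if (dx || dy)
                    n += world[OLD][(y + dy + H) % H][(x + dx + W) % W];
        return n;
    }

    int main(void)
    {
        world[OLD][2][1] = world[OLD][2][2] = world[OLD][2][3] = 1;  /* a blinker */

        for (int y = 0; y < H; y++)
            for (int x = 0; x < W; x++)
            {
                unsigned char n = alive_neighbours(x, y);
                world[NEW][y][x] = world[OLD][y][x] ? (n == 2 || n == 3) : (n == 3);
                world[DSP][y][x] = world[OLD][y][x] * 255;  /* pixels for display */
            }

        memcpy(world[OLD], world[NEW], sizeof(world[OLD])); /* grow_old_world step */
        printf("center cell after one step : %u\n", world[OLD][2][2]);
        return 0;
    }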
soft/giet_vm/applications/sort/main.c
diff r432 → r502:

 // barrier routines to apply a sort algorithm in several stages.
 //
-// Considerations :
-//
-//   - It supports up to 256 processors and the number of processors
-//     must be a power of 2.
-//
-//   - If there is only one TTY available, this application uses a spin
-//     lock to avoid several threads writting at the same time.
-//
-//   - This application must be executed on a cache coherent
-//     architecture. Otherwise some modifications must be applied
-//
-//   - The processors executing this application must have a contiguous
-//     processor id and the first processor must have id 0.
+// Constraints :
+//
+//   - It supports up to 1024 processors and the number of processors
+//     must be a power of 2.
+//
+//   _ The array of values to be sorted (ARRAY_LENGTH) must be power of 2
+//     larger than the number of processors.
+//
+//   - This application must be executed on a cache coherent architecture.
 //
 ///////////////////////////////////////////////////////////////////////////////
…
 #include "mapping_info.h"
 #include "hard_config.h"
-#include "barrier.h"
-
-#define ARRAY_LENGTH    512
-#define IPT             (ARRAY_LENGTH / *nb_thread) // ITEMS PER THREAD
+#include "user_barrier.h"
+
+#define ARRAY_LENGTH    4096
+#define IPT             (ARRAY_LENGTH / threads) // ITEMS PER THREAD

 ////////////////////////////////////////////////////////////////////////////////
-// Processors other than 0 display algorithm state
-// The processor 0 always displays some information so this does not affect him
+// Processors other than 0 display algorithm state if VERBOSE non zero

 #define VERBOSE         1

 ////////////////////////////////////////////////////////////////////////////////
-// Define printf according to verbosity option and number of available
-// TTY
+// Define printf according to verbosity option and number of available TTY

 #if (VERBOSE == 1)
…
 #define task0_printf(...) if(thread_id == 0) giet_shr_printf(__VA_ARGS__)

-#define exit    giet_exit
-#define procid  giet_procid
-#define rand    giet_rand
-
 int array0[ARRAY_LENGTH];
…
                int init_pos_result);

-///////////////////////////////////////////////////
-// This application support at most 256 processors
-// Number of barriers = log2(nb_thread)
-
-giet_barrier_t barrier[8];
+///////////////////////////////////////////////////////
+// This application supports at most 1024 processors
+// Number of barriers = log2(threads)
+
+giet_barrier_t barrier[10];

 //////////////////////////////////////////
…
 {
     int thread_id = giet_thread_id();
-    unsigned int* nb_thread;
     int * src_array = NULL;
     int * dst_array = NULL;
…
     unsigned int time_end;

-    giet_vobj_get_vbase( "sort" ,
-                         "sort_args",
-                         (unsigned int*)&nb_thread );
-
-    task0_printf("\n[ Thread 0 ] Starting sort application with %u threads "
-                 "at cycle %u\n", *nb_thread, time_start);
+    // compute number of threads (one thread per proc)
+    unsigned int x_size;
+    unsigned int y_size;
+    unsigned int nprocs;
+    unsigned int threads;
+    giet_procs_number( &x_size , &y_size , &nprocs );
+    threads = x_size * y_size * nprocs;
+
+    if ( (threads != 1)   && (threads != 2)   && (threads != 4)   &&
+         (threads != 8)   && (threads != 16 ) && (threads != 32)  &&
+         (threads != 64)  && (threads != 128) && (threads != 256) &&
+         (threads != 512) && (threads != 1024) )
+    {
+        task0_printf("[SORT ERROR] Number of processors must be power of 2\n"
+                     " x_size = %d / y_size = %d / nprocs = %d\n",
+                     x_size , y_size , nprocs );
+        giet_exit("error");
+    }
+
+    task0_printf("\n[ Thread 0 ] Starting sort application with %d threads "
+                 "at cycle %d\n", threads, time_start);

     ///////////////////////////
…
     if (thread_id == 0)
     {
-        for (i = 0; i < __builtin_ctz(*nb_thread); i++)
-        {
-            barrier_init(&barrier[i], *nb_thread >> i);
+        for (i = 0; i < __builtin_ctz( threads ); i++)
+        {
+            barrier_init(&barrier[i], threads >> i);
         }
…
     for (i = IPT * thread_id; i < IPT * (thread_id + 1); i++)
     {
-        array0[i] = rand();
+        array0[i] = giet_rand();
     }
…
     printf("[ Thread %d ] Finishing Stage 0\n\r", thread_id);

-    for (i = 0; i < __builtin_ctz(*nb_thread); i++)
+    for (i = 0; i < __builtin_ctz( threads ); i++)
     {
         barrier_wait(&barrier[i]);
…
     {
         printf("[ Thread %d ] Quit\n\r", thread_id );
-        exit("Completed");
+        giet_exit("Completed");
     }
…
     if(thread_id != 0)
     {
-        exit("error: only thread 0 should get here");
+        giet_exit("error: only thread 0 should get here");
     }
…
     if (success)
     {
-        exit("!!! Success !!!");
+        giet_exit("!!! Success !!!");
     }
     else
…
             printf("array[%d] = %d\n", i, dst_array[i]);
         }
-        exit("!!! Failure !!!");
-    }
-
-    exit("Completed");
+        giet_exit("!!! Failure !!!");
+    }
+
+    giet_exit("Completed");
 }
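With threads now computed at run time, the number of merge stages is __builtin_ctz(threads) (which equals log2 for a power of 2), and barrier[i] synchronises threads >> i participants, since half of the threads retire at each stage. A small host-side sketch of that sizing logic (no GIET calls; the thread count is an example value):

    #include <stdio.h>

    int main(void)
    {
        unsigned int threads = 8;                       /* must be a power of 2 */
        unsigned int stages  = __builtin_ctz(threads);  /* log2(threads) = 3    */

        /* stage 0: 8 threads sort their slice; then 4 threads merge pairs,
         * then 2, then a single thread produces the fully sorted array.   */
        for (unsigned int i = 0; i < stages; i++)
            printf("barrier[%u] : %u participants\n", i, threads >> i);
        return 0;
    }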
soft/giet_vm/applications/sort/sort.py
diff r434 → r502:

    # define vsegs base & size
    code_base  = 0x10000000
-    code_size  = 0x00200000     # 2 Mbytes (replicated in each cluster)
+    code_size  = 0x00010000     # 64 Kbytes (replicated in each cluster)

    data_base  = 0x20000000
-    data_size  = 0x00100000     # 1 Mbyte (non replicated)
-
-    args_base  = 0x20100000
-    args_size  = 0x00000004     # 4 bytes (non replicated)
+    data_size  = 0x00010000     # 64 Kbyte (non replicated)

    stack_base = 0x40000000
…
                      'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM',
                      binpath = 'build/sort/sort.elf',
-                     local = False, big = True )
-
-    # args vseg : non local (only in cluster[0,0])
-    mapping.addVseg( vspace, 'sort_args', args_base , args_size,
-                     'C_WU', vtype = 'CONST', x = 0, y = 0, pseg = 'RAM',
-                     init = ntasks,
-                     local = False, big = True )
+                     local = False )

    # code vsegs : local (one copy per cluster)
    for x in xrange (x_size):
        for y in xrange (y_size):
-            mapping.addVseg( vspace, 'sort_code', code_base , code_size,
-                             'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
-                             binpath = 'build/sort/sort.elf',
-                             local = True, big = True )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+
+                mapping.addVseg( vspace, 'sort_code', code_base , code_size,
+                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
+                                 binpath = 'build/sort/sort.elf',
+                                 local = True )

    # stacks vsegs : local (one stack per task)
    for x in xrange (x_size):
        for y in xrange (y_size):
-            for p in xrange (nprocs):
-                proc_id = (((x * y_size) + y) * nprocs) + p
-                size    = stack_size / nprocs
-                base    = stack_base + (proc_id * size)
-                mapping.addVseg( vspace, 'sort_stack_%d_%d_%d' % (x,y,p), base, size,
-                                 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
-                                 local = True, big = True )
-
-            # heap vsegs : distributed but non local (all tasks can access all heap vsegs)
-            cluster_id = (x * y_size) + y
-            size       = heap_size
-            base       = heap_base + (cluster_id * size)
-            mapping.addVseg( vspace, 'sort_heap_%d_%d' % (x,y), base, size,
-                             'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
-                             local = False, big = True )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                for p in xrange (nprocs):
+                    proc_id = (((x * y_size) + y) * nprocs) + p
+                    size    = stack_size / nprocs
+                    base    = stack_base + (proc_id * size)
+
+                    mapping.addVseg( vspace, 'sort_stack_%d_%d_%d' % (x,y,p),
+                                     base, size, 'C_WU', vtype = 'BUFFER',
+                                     x = x, y = y, pseg = 'RAM',
+                                     local = True, big = True )
+
+    # heap vsegs : distributed but non local (all tasks can access all heap vsegs)
+    for x in xrange (x_size):
+        for y in xrange (y_size):
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                size = heap_size
+                base = heap_base + (cluster_id * size)
+
+                mapping.addVseg( vspace, 'sort_heap_%d_%d' % (x,y), base, size,
+                                 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
+                                 local = False, big = True )

    # distributed tasks / one task per processor
    for x in xrange (x_size):
        for y in xrange (y_size):
-            for p in xrange( nprocs ):
-                trdid = (((x * y_size) + y) * nprocs) + p
-                mapping.addTask( vspace, 'sort_%d_%d_%d' % (x,y,p), trdid, x, y, p,
-                                 'sort_stack_%d_%d_%d' % (x,y,p),
-                                 'sort_heap_%d_%d' % (x,y), 0 )
+            cluster_id = (x * y_size) + y
+            if ( mapping.clusters[cluster_id].procs ):
+                for p in xrange( nprocs ):
+                    trdid = (((x * y_size) + y) * nprocs) + p
+
+                    mapping.addTask( vspace, 'sort_%d_%d_%d' % (x,y,p),
+                                     trdid, x, y, p,
+                                     'sort_stack_%d_%d_%d' % (x,y,p),
+                                     'sort_heap_%d_%d' % (x,y), 0 )

    # extend mapping name
soft/giet_vm/applications/transpose/main.c
r444 r502

- ///////////////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////////
  // File   : main.c   (for transpose application)
  // Date   : february 2014
  // author : Alain Greiner
- ///////////////////////////////////////////////////////////////////////////////////////////
- // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images.
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // This multi-threaded application makes a transpose for a NN*NN pixels
+ // sequence of images.
  // It can run on a multi-processors, multi-clusters architecture, with one thread
- // per processor. It uses the he following hardware parameters, that must be defined
- // in the hard_config.h file:
- // - X_SIZE       : number of clusters in a row
- // - Y_SIZE       : number of clusters in a column
- // - NB_PROCS_MAX : number of processors per cluster
- // - FBUF_X_SIZE  : number of pixels per line in frame buffer
- // - FBUF_Y_SIZE  : number of lines in frame buffer
- //
+ // per processor.
+ //
  // The image sequence is read from a file (one byte per pixel).
  // The input and output buffers containing the image are distributed in all clusters.
  //
- // - The image size NN must be a power of 2 and must fit the frame buffer size.
- // - The number of clusters containing processors must be a power of 2.
- // - The number of processors per cluster must be a power of 2.
- // - The image size NN must be larger or equal to the total number of processor.
+ // - The image size NN must fit the frame buffer size: 128 bytes
+ // - The block size in block device must be 512 bytes.
+ // - The number of clusters must be a power of 2 no larger than 32
+ // - The number of processors per cluster must be a power of 2 no larger than 4
  //
  // For each image the application makes a self test (checksum for each line).
  // The actual display on the frame buffer depends on frame buffer availability.
- /////////////////////////////////////////////////////////////////////////////////////////////
-
- #include "hard_config.h"
+ ///////////////////////////////////////////////////////////////////////////////////////
+
  #include "stdio.h"
- #include "barrier.h"
+ #include "user_barrier.h"
  #include "malloc.h"

+ #define BLOCK_SIZE          512                 // block size on disk
+ #define CLUSTERS_MAX        32                  // max number of clusters
+ #define PROCS_MAX           4                   // max number of processors per cluster
  #define NN                  128                 // image size : nlines = npixels = 128
  #define NB_IMAGES           5                   // number of images to be handled
  #define FILE_PATHNAME       "misc/images.raw"   // file pathname on disk
- #define NB_CLUSTERS         (X_SIZE * Y_SIZE)   // number of clusters
  #define INSTRUMENTATION_OK  0                   // display statistics on TTY when non zero
…
  ///////////////////////////////////////////////////////

- // instrumentation counters
- // for each processor (up to 4 processors)
- // in each cluster (up to 32 clusters)
- unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX];
- unsigned int LOAD_END  [NB_CLUSTERS][NB_PROCS_MAX];
- unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX];
- unsigned int TRSP_END  [NB_CLUSTERS][NB_PROCS_MAX];
- unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX];
- unsigned int DISP_END  [NB_CLUSTERS][NB_PROCS_MAX];
+ // instrumentation counters for each processor in each cluster
+ unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX];
+ unsigned int LOAD_END  [CLUSTERS_MAX][PROCS_MAX];
+ unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX];
+ unsigned int TRSP_END  [CLUSTERS_MAX][PROCS_MAX];
+ unsigned int DISP_START[CLUSTERS_MAX][PROCS_MAX];
+ unsigned int DISP_END  [CLUSTERS_MAX][PROCS_MAX];

  // arrays of pointers on distributed buffers
  // one input buffer & one output buffer per cluster
- unsigned char* buf_in [NB_CLUSTERS];
- unsigned char* buf_out[NB_CLUSTERS];
+ unsigned char* buf_in [CLUSTERS_MAX];
+ unsigned char* buf_out[CLUSTERS_MAX];

  // checksum variables
…

  // global synchronisation barrier
- giet_barrier_t barrier;
+ giet_sqt_barrier_t barrier;

  volatile unsigned int init_ok = 0;
…
  {

- int          file = 0;   // file descriptor
- unsigned int l;          // line index for loops
- unsigned int p;          // pixel index for loops
- unsigned int c;          // cluster index for loops
-
- // get processor identifiers
- unsigned int x;          // x cluster coordinate
- unsigned int y;          // y cluster coordinate
- unsigned int lpid;       // local processor index
+ unsigned int l;          // line index for loops
+ unsigned int p;          // pixel index for loops
+ unsigned int c;          // cluster index for loops
+
+ // processor identifiers
+ unsigned int x;          // x cluster coordinate
+ unsigned int y;          // y cluster coordinate
+ unsigned int lpid;       // local processor index
+
+ // plat-form parameters
+ unsigned int x_size;     // number of clusters in a row
+ unsigned int y_size;     // number of clusters in a column
+ unsigned int nprocs;     // number of processors per cluster
+
  giet_proc_xyp( &x, &y, &lpid);

- unsigned int npixels    = NN * NN;                            // pixels per image
- unsigned int nblocks    = npixels / 512;                      // blocks per image
- unsigned int image      = 0;                                  // image counter
-
- unsigned int cluster_id = (x * Y_SIZE) + y;                   // "continuous" index
- unsigned int ntasks     = NB_CLUSTERS * NB_PROCS_MAX;         // number of tasks
- unsigned int task_id    = (cluster_id * NB_PROCS_MAX) + lpid; // "continuous" task index
+ giet_procs_number( &x_size , &y_size , &nprocs );
+
+ giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n"
+                 " - x_size = %d\n"
+                 " - y_size = %d\n"
+                 " - nprocs = %d\n",
+                 x, y, lpid, giet_proctime(), x_size , y_size , nprocs );
+
+ unsigned int nclusters  = x_size * y_size;               // number of clusters
+ unsigned int ntasks     = x_size * y_size * nprocs;      // number of tasks
+ unsigned int npixels    = NN * NN;                       // pixels per image
+ unsigned int nblocks    = npixels / BLOCK_SIZE;          // blocks per image
+ unsigned int image      = 0;                             // image counter
+ int          file       = 0;                             // file descriptor
+ unsigned int cluster_id = (x * y_size) + y;              // "continuous" index
+ unsigned int task_id    = (cluster_id * nprocs) + lpid;  // "continuous" task index

  // Processor [0,0,0] makes initialisation
- // It includes parameters checking, barriers initialization,
+ // It includes parameters checking, barrier initialization,
  // distributed buffers allocation, and file open
  if ( (x==0) && (y==0) && (lpid==0) )
  {
-     // Parameters checking
-     if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) )
-     {
-         giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
-     }
-     if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4))
+     if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
      {
-         giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAX must be 1, 2 or 4");
+         giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");
      }
-     if ((NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS != 4) &&
-         (NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS != 32) )
+     if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) &&
+         (nclusters != 8) && (nclusters != 16) && (nclusters != 32) )
      {
          giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32");
…
      }

-     giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"
-                     " - x_size    = %d\n"
-                     " - y_size    = %d\n"
-                     " - nprocs    = %d\n"
-                     " - nclusters = %d\n"
-                     " - ntasks    = %d\n",
-                     giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, NB_CLUSTERS, ntasks );
-
      // Barrier initialisation
-     barrier_init( &barrier, ntasks );
+     sqt_barrier_init( &barrier, x_size , y_size , nprocs );

      giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n",
…

      // Distributed buffers allocation
-     // The buffers containing one image are distributed in clusters
-     // (one buf_in and one buf_out per cluster).
-     // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.
-     for ( c = 0 ; c < NB_CLUSTERS ; c++ )
-     {
-         unsigned int rx = c / Y_SIZE;
-         unsigned int ry = c % Y_SIZE;
-
-         buf_in[c]  = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
-         buf_out[c] = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
+     // The buffers containing one image are distributed in the user
+     // heap (one buf_in and one buf_out per cluster).
+     // Each buffer contains (NN*NN / nclusters) bytes.
+     for ( c = 0 ; c < nclusters ; c++ )
+     {
+         unsigned int rx = c / y_size;
+         unsigned int ry = c % y_size;
+
+         buf_in[c]  = remote_malloc( npixels/nclusters, rx, ry );
+         buf_out[c] = remote_malloc( npixels/nclusters, rx, ry );

          giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation"
…
  {
      while ( init_ok == 0 );
-     giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",
-                     x, y, lpid, giet_proctime() );
  }

…
  while (image < NB_IMAGES)
  {
-     // pseudo parallel load from disk to buf_in buffer : nblocks/NB_CLUSTERS blocks
+     // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks
      // only task running on processor with (lpid == 0) does it
…
          giet_fat_read( file,
                         buf_in[cluster_id],
-                        (nblocks / NB_CLUSTERS),
-                        ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) );
-
+                        (nblocks / nclusters),
+                        ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) );
+
+         if ( (x==0) && (y==0) )
          giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load"
                          " for image %d at cycle %d\n",
…
      LOAD_END[cluster_id][lpid] = giet_proctime();

-     /////////////////////////
-     barrier_wait( &barrier );
+     /////////////////////////////
+     sqt_barrier_wait( &barrier );

      // parallel transpose from buf_in to buf_out
…

      unsigned int nlt = NN / ntasks;        // number of lines per task
-     unsigned int nlc = NN / NB_CLUSTERS;   // number of lines per cluster
+     unsigned int nlc = NN / nclusters;     // number of lines per cluster

      unsigned int src_cluster;
…
      if ( lpid == 0 )
      {
+         if ( (x==0) && (y==0) )
          giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose"
                          " for image %d at cycle %d\n",
…
      TRSP_END[cluster_id][lpid] = giet_proctime();

-     /////////////////////////
-     barrier_wait( &barrier );
+     /////////////////////////////
+     sqt_barrier_wait( &barrier );

      // optional parallel display from local buf_out to frame buffer
…
                           npt );

-     if ( lpid == 0 )
-     {
-         giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display"
-                         " for image %d at cycle %d\n",
-                         x, y, image, giet_proctime() );
-     }
+     if ( (x==0) && (y==0) && (lpid==0) )
+     giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display"
+                     " for image %d at cycle %d\n",
+                     x, y, lpid, image, giet_proctime() );

      DISP_END[cluster_id][lpid] = giet_proctime();

-     /////////////////////////
-     barrier_wait( &barrier );
+     /////////////////////////////
+     sqt_barrier_wait( &barrier );
  }

…
      }

-     /////////////////////////
-     barrier_wait( &barrier );
+     /////////////////////////////
+     sqt_barrier_wait( &barrier );

      // instrumentation done by processor [0,0,0]
…
      unsigned int max_disp_ended = 0;

-     for (cc = 0; cc < NB_CLUSTERS; cc++)
+     for (cc = 0; cc < nclusters; cc++)
      {
          for (pp = 0; pp < NB_PROCS_MAX; pp++)
…
      image++;

-     /////////////////////////
-     barrier_wait( &barrier );
+     /////////////////////////////
+     sqt_barrier_wait( &barrier );

  } // end while image

…
  if ( (x==0) && (y==0) && (lpid==0) )
  {
-     for ( c = 0 ; c < NB_CLUSTERS ; c++ )
+     for ( c = 0 ; c < nclusters ; c++ )
      {
          free( buf_in[c] );
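In the new main.c the mesh geometry is no longer a set of compile-time constants from hard_config.h: it is read at run time with giet_procs_number(), and every index is derived from it. The sketch below is not part of the changeset; it transcribes that index arithmetic in Python with assumed mesh dimensions, and the line range shown for a task assumes a block distribution of the NN lines over the tasks (the actual assignment sits in an elided hunk of the diff).

    # illustrative sketch (assumed parameters), runnable with Python 2
    NN     = 128   # image size, as defined in main.c
    x_size = 2     # assumed : value returned by giet_procs_number()
    y_size = 2     # assumed
    nprocs = 4     # assumed

    nclusters = x_size * y_size      # 4  clusters
    ntasks    = nclusters * nprocs   # 16 tasks
    nlt       = NN / ntasks          # 8  lines per task
    nlc       = NN / nclusters       # 32 lines per cluster

    # "continuous" indexes for processor [x=1, y=0, lpid=2],
    # computed with the same formulas as main.c
    x, y, lpid = 1, 0, 2
    cluster_id = (x * y_size) + y               # 2
    task_id    = (cluster_id * nprocs) + lpid   # 10

    # under a block distribution this task would handle
    # lines [task_id*nlt, task_id*nlt + nlt - 1] = [80, 87]
    print cluster_id, task_id, task_id * nlt

-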
soft/giet_vm/applications/transpose/transpose.py
r457 r502

  from mapping import *

- ######################################################################################
+ ##################################################################################
  #   file   : transpose.py  (for the transpose application)
  #   date   : may 2014
  #   author : Alain Greiner
- #######################################################################################
+ ##################################################################################
  #  This file describes the mapping of the multi-threaded "transpose"
  #  application on a multi-clusters, multi-processors architecture.
  #  This include both the mapping of virtual segments on the clusters,
  #  and the mapping of tasks on processors.
+ #  There is one task per processor.
+ #  The mapping of virtual segments is the following:
+ #    - There is one shared data vseg in cluster[0][0]
+ #    - The code vsegs are replicated on all clusters containing processors.
+ #    - There is one heap vseg per cluster containing processors.
+ #    - The stacks vsegs are distibuted on all clusters containing processors.
  #  This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
- #  - x_size   : number of clusters in a row
- #  - y_size   : number of clusters in a column
- #  - x_width  : number of bits coding x coordinate
- #  - y_width  : number of bits coding y coordinate
- #  - nprocs   : number of processors per cluster
- ####################################################################################
+ #    - x_size   : number of clusters in a row
+ #    - y_size   : number of clusters in a column
+ #    - x_width  : number of bits coding x coordinate
+ #    - y_width  : number of bits coding y coordinate
+ #    - nprocs   : number of processors per cluster
+ ##################################################################################

  #########################
…
  for x in xrange (x_size):
      for y in xrange (y_size):
-         mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size,
-                          'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
-                          binpath = 'build/transpose/transpose.elf',
-                          local = True )
-
- # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
- for x in xrange (x_size):
-     for y in xrange (y_size):
-         for p in xrange( nprocs ):
-             proc_id = (((x * y_size) + y) * nprocs) + p
-             size    = (stack_size / nprocs) & 0xFFFFF000
-             base    = stack_base + (proc_id * size)
-             mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size,
-                              'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
-                              local = True, big = True )
+         cluster_id = (x * y_size) + y
+         if ( mapping.clusters[cluster_id].procs ):
+
+             mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y),
+                              code_base , code_size,
+                              'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
+                              binpath = 'build/transpose/transpose.elf',
+                              local = True )

- # heap vsegs: distributed but non local (all heap vsegs can be accessed by all tasks)
+ # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
  for x in xrange (x_size):
      for y in xrange (y_size):
          cluster_id = (x * y_size) + y
-         size = heap_size
-         base = heap_base + (cluster_id * size)
-         mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
-                          'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
-                          local = False, big = True )
+         if ( mapping.clusters[cluster_id].procs ):
+             for p in xrange( nprocs ):
+                 proc_id = (((x * y_size) + y) * nprocs) + p
+                 size    = (stack_size / nprocs) & 0xFFFFF000
+                 base    = stack_base + (proc_id * size)
+
+                 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p),
+                                  base, size, 'C_WU', vtype = 'BUFFER',
+                                  x = x , y = y , pseg = 'RAM',
+                                  local = True, big = True )
+
+ # heap vsegs: distributed non local (all heap vsegs can be accessed by all tasks)
+ for x in xrange (x_size):
+     for y in xrange (y_size):
+         cluster_id = (x * y_size) + y
+         if ( mapping.clusters[cluster_id].procs ):
+             size = heap_size
+             base = heap_base + (cluster_id * size)
+
+             mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
+                              'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
+                              local = False, big = True )

  # distributed tasks / one task per processor
  for x in xrange (x_size):
      for y in xrange (y_size):
-         for p in xrange( nprocs ):
-             trdid = (((x * y_size) + y) * nprocs) + p
-             mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p,
-                              'trsp_stack_%d_%d_%d' % (x,y,p),
-                              'trsp_heap_%d_%d' % (x,y), 0 )
+         cluster_id = (x * y_size) + y
+         if ( mapping.clusters[cluster_id].procs ):
+             for p in xrange( nprocs ):
+                 trdid = (((x * y_size) + y) * nprocs) + p
+
+                 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p),
+                                  trdid, x, y, p,
+                                  'trsp_stack_%d_%d_%d' % (x,y,p),
+                                  'trsp_heap_%d_%d' % (x,y), 0 )

  # extend mapping name
…
  return vspace  # useful for test

- ################################ test ######################################################
+ ################################ test ##################################################

  if __name__ == '__main__':
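One detail of the stack mapping deserves a note: the per-processor stack size is masked with 0xFFFFF000, which clears the low 12 bits and so truncates the size to a multiple of 4 Kbytes, keeping every stack vseg aligned on a page boundary even when the division does not fall on one. A minimal sketch, not part of the changeset and using assumed values:

    # illustrative sketch (assumed parameters), runnable with Python 2
    stack_size = 0x00200000        # assumed : 2 Mbytes of stack space per cluster
    nprocs     = 3                 # a non power of 2 makes the truncation visible

    raw  = stack_size / nprocs     # 0x000aaaaa : not page aligned
    size = raw & 0xFFFFF000        # 0x000aa000 : aligned, slightly smaller

    print '0x%08x -> 0x%08x' % (raw, size)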