Ignore:
Timestamp:
Feb 8, 2015, 9:20:45 PM (10 years ago)
Author:
alain
Message:

1) Introduce distributed barriers in the multi-threaded applications
(classif, transpose, convol, sort, gameoflife)

2) Introducing support for architectures containing empty clusters
in the mapping of these multi-threaded applications.

3) Removing the "command line arguments" in the sort application
(replaced by the giet_procs_number() system call).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • soft/giet_vm/applications/classif/main.c

    r488 r502  
    1 /////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for classif application)
    33// Date   : november 2014
    44// author : Alain Greiner
    5 /////////////////////////////////////////////////////////////////////////////////////////
     5///////////////////////////////////////////////////////////////////////////////////////
    66// This multi-threaded application takes a stream of Gigabit Ethernet packets,
    77// and makes packet analysis and classification, based on the source MAC address.
     
    99// component to receive and send packets on the Gigabit Ethernet port.
    1010//
    11 // This application is described as a TCG (Task and Communication Graph) containing
    12 // (N+2) tasks per cluster:
     11// It can run on architectures containing up to 16 * 16 clusters,
     12// and up to 8 processors per cluster.
     13//
     14// This application is described as a TCG (Task and Communication Graph)
     15// containing (N+2) tasks per cluster:
    1316// - one "load" task
     17// - one "store" task
    1418// - N "analyse" tasks
    15 // - one "store" task
    16 // The 4 Kbytes containers are diributed (N+2 containers per cluster):
     19// The containers are distributed (N+2 containers per cluster):
    1720// - one RX container (part of the kernel rx_chbuf), in the kernel heap.
    1821// - one TX container (part of the kernel tx-chbuf), in the kernel heap.
     
    3033// The MWMR fifo descriptors array is defined as a global variable in cluster[0][0].
    3134//
    32 // Initialisation is done in two steps by the "load" tasks:
    33 // - Task "load" in cluster[0][0] initialises NIC & CMA channel, and initialises
    34 //   the barrier between all "load" tasks. Other "load" tasks are waiting on the
    35 //   global_sync synchronisation variable.
    36 // - In each cluster[x][y], the "load" task allocates the working containers
    37 //   and the MWMR fifos descriptors in the local heap.
    38 //   The "analyse" tasks are waiting on the sync[x][y] variables.
     35// Initialisation is done in two steps by the "load" & "store" tasks:
     36// - Task "load" in cluster[0][0] initialises the barrier between all "load" tasks,
     37//   allocates NIC & CMA RX channel, and starts the NIC_CMA RX transfer.
     38//   Other "load" tasks are waiting on the load_sync synchronisation variable.
     39//   Task "store" in cluster[0][0] initialises the barrier between all "store" tasks,
     40//   allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer.
     41//   Other "store" tasks are waiting on the store_sync synchronisation variable.
     42// - When this global initialisation is completed, the "load" task in all clusters
     43//   allocates the working containers and the MWMR fifos descriptors from the
     44//   user local heap. In each cluster, the "analyse" and "store" tasks are waiting
     45//   the local initialisation completion on the local_sync[x][y] variables.
    3946//
    40 // Instrumentation results display is done by the "store" task in cluster[0][0]
    41 // when all "store" tasks completed the number of clusters specified by the
    42 // CONTAINERS_MAX parameter.
    43 //     
    4447// When initialisation is completed, all tasks loop on containers:
    4548// 1) The "load" task get an empty working container from the fifo_s2l,
     
    4750//    and transfer ownership of this container to one "analysis" task by writing
    4851//    into the fifo_l2a.   
    49 //
    5052// 2) The "analyse" task get one working container from the fifo_l2a, analyse
    5153//    each packet header, compute the packet type (depending on the SRC MAC address),
    5254//    increment the corresponding classification counter, and transpose the SRC
    5355//    and the DST MAC addresses for TX transmission.
    54 //
    5556// 3) The "store" task gets a full working container from the fifo_a2s,
    5657//    transfer this user container content to the kernel tx_chbuf,
    5758//    and transfer ownership of this empty container to the "load" task by writing
    5859//    into the fifo_s2l.   
    59 //
    60 // This application uses the following hardware parameters (hard_config.h file):
    61 // - X_SIZE       : number of clusters in a row
    62 // - Y_SIZE       : number of clusters in a column
    63 // - NB_PROCS_MAX : number of processors per cluster
    64 /////////////////////////////////////////////////////////////////////////////////////////
     60//     
     61// Instrumentation results display is done by the "store" task in cluster[0][0]
     62// when all "store" tasks completed the number of containers specified by the
     63// CONTAINERS_MAX parameter.
     64///////////////////////////////////////////////////////////////////////////////////////
    6565
    6666#include "stdio.h"
    67 #include "barrier.h"
     67#include "user_barrier.h"
    6868#include "malloc.h"
    6969#include "user_lock.h"
    7070#include "mwmr_channel.h"
    71 #include "hard_config.h"
    72 
    73 #define CONTAINERS_MAX  5
    74 #define VERBOSE_ANALYSE 1
    75 #define ANALYSIS_TASKS  (NB_PROCS_MAX - 2)
    76 
    77 /////////////////////////////////////////////////////////////////////////////////////////
     71
     72#define X_SIZE_MAX      16
     73#define Y_SIZE_MAX      16
     74#define NPROCS_MAX      8
     75#define CONTAINERS_MAX  500
     76#define VERBOSE_ANALYSE 0
     77
     78///////////////////////////////////////////////////////////////////////////////////////
    7879//    Global variables
    7980// The MWMR channels (descriptors and buffers), as well as the working containers
     
    8182// But the pointers on these distributed structures are shared arrays
    8283// stored in cluster[0][0].
    83 /////////////////////////////////////////////////////////////////////////////////////////
    84 
    85 // pointers on distributed temp[x][y][n] containers
    86 unsigned int*       container[X_SIZE][Y_SIZE][ANALYSIS_TASKS]; 
     84///////////////////////////////////////////////////////////////////////////////////////
     85
     86// pointers on distributed containers
     87unsigned int*       container[X_SIZE_MAX][Y_SIZE_MAX][NPROCS_MAX-2]; 
    8788
    8889// pointers on distributed mwmr fifos containing : temp[x][y][l] container descriptors
    89 mwmr_channel_t*     mwmr_l2a[X_SIZE][Y_SIZE]; 
    90 mwmr_channel_t*     mwmr_a2s[X_SIZE][Y_SIZE];
    91 mwmr_channel_t*     mwmr_s2l[X_SIZE][Y_SIZE];
     90mwmr_channel_t*     mwmr_l2a[X_SIZE_MAX][Y_SIZE_MAX]; 
     91mwmr_channel_t*     mwmr_a2s[X_SIZE_MAX][Y_SIZE_MAX];
     92mwmr_channel_t*     mwmr_s2l[X_SIZE_MAX][Y_SIZE_MAX];
    9293
    9394// local synchros signaling local MWMR fifos initialisation completion
    94 unsigned int        local_sync[X_SIZE][Y_SIZE]; 
     95volatile unsigned int        local_sync[X_SIZE_MAX][Y_SIZE_MAX]; 
    9596
    9697// global synchro signaling global initialisation completion
    97 unsigned int        load_sync  = 0;
    98 unsigned int        store_sync = 0;
     98volatile unsigned int        load_sync  = 0;
     99volatile unsigned int        store_sync = 0;
    99100
    100101// instrumentation counters
    101102unsigned int        counter[16];
    102103
    103 // distributed barriers (between "load" and "store" tasks)
    104 giet_sbt_barrier_t  rx_barrier;
    105 giet_sbt_barrier_t  tx_barrier;
     104// distributed barrier between "load" tasks
     105giet_sqt_barrier_t  rx_barrier;
     106
     107// distributed barrier between "store" tasks
     108giet_sqt_barrier_t  tx_barrier;
    106109
    107110// NIC_RX and NIC_TX channel index
     
    113116/////////////////////////////////////////
    114117{
     118    // each "load" task get platform parameters
     119    unsigned int    x_size;                                             // number of clusters in a row
     120    unsigned int    y_size;                     // number of clusters in a column
     121    unsigned int    nprocs;                     // number of processors per cluster
     122    giet_procs_number( &x_size, &y_size, &nprocs );
     123
     124    giet_assert( (x_size <= X_SIZE_MAX) &&
     125                 (y_size <= Y_SIZE_MAX) &&
     126                 (nprocs <= NPROCS_MAX) ,
     127                 "[CLASSIF ERROR] illegal platform parameters" );
     128
    115129    // each "load" task get processor identifiers
    116130    unsigned int    x;
     
    119133    giet_proc_xyp( &x, &y, &l );
    120134
    121     // "load" task[0][0] initialises barrier between load tasks,
     135    // "load" task[0][0] initialises barrier between all load tasks,
    122136    // allocates the NIC & CMA RX channels, and start the NIC_CMA RX transfer.
    123137    // Other "load" tasks wait completion
    124138    if ( (x==0) && (y==0) )
    125139    {
    126         giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n",
    127                         x , y , l , giet_proctime() );
     140        giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n"
     141                        "  x_size = %d / y_size = %d / nprocs = %d\n",
     142                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
    128143 
    129         sbt_barrier_init( &rx_barrier, X_SIZE*Y_SIZE , 1 );
    130         nic_rx_channel = giet_nic_rx_alloc();
     144        sqt_barrier_init( &rx_barrier, x_size , y_size , 1 );
     145        nic_rx_channel = giet_nic_rx_alloc( x_size , y_size );
    131146        giet_nic_rx_start( nic_rx_channel );
    132147        load_sync = 1;
     
    137152    }   
    138153
    139     // all load tasks allocate containers[x][y][n] (from local heap)
     154    // each load task allocates containers[x][y][n] (from local heap)
    140155    // and register pointers in the local stack
    141156    unsigned int   n;
    142     unsigned int*  cont[ANALYSIS_TASKS];
    143 
    144     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
     157    unsigned int*  cont[NPROCS_MAX-2];
     158    unsigned int   analysis_tasks = nprocs-2;
     159
     160    for ( n = 0 ; n < analysis_tasks ; n++ )
    145161    {
    146162        container[x][y][n] = malloc( 4096 );
     
    148164    }
    149165   
    150     // all load tasks allocate data buffers for mwmr fifos (from local heap)
    151     unsigned int*  data_l2a = malloc( ANALYSIS_TASKS<<2 );
    152     unsigned int*  data_a2s = malloc( ANALYSIS_TASKS<<2 );
    153     unsigned int*  data_s2l = malloc( ANALYSIS_TASKS<<2 );
    154 
    155     // all load tasks allocate mwmr fifos descriptors (from local heap)
     166    // each load task allocates data buffers for mwmr fifos (from local heap)
     167    unsigned int*  data_l2a = malloc( analysis_tasks<<2 );
     168    unsigned int*  data_a2s = malloc( analysis_tasks<<2 );
     169    unsigned int*  data_s2l = malloc( analysis_tasks<<2 );
     170
     171    // each load task allocates mwmr fifos descriptors (from local heap)
    156172    mwmr_l2a[x][y] = malloc( sizeof(mwmr_channel_t) );
    157173    mwmr_a2s[x][y] = malloc( sizeof(mwmr_channel_t) );
    158174    mwmr_s2l[x][y] = malloc( sizeof(mwmr_channel_t) );
    159175
    160     // all "load" tasks register local pointers on mwmr fifos in local stack
     176    // each load task registers local pointers on mwmr fifos in local stack
    161177    mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y];
    162178    mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y];
    163179    mwmr_channel_t* fifo_s2l = mwmr_s2l[x][y];
    164180
    165     // all "load" tasks initialise local mwmr fifos descriptors
     181    // each load task initialises local mwmr fifos descriptors
    166182    // ( width = 4 bytes / depth = number of analysis tasks )
    167     mwmr_init( fifo_l2a , data_l2a , 1 , ANALYSIS_TASKS );
    168     mwmr_init( fifo_a2s , data_a2s , 1 , ANALYSIS_TASKS );
    169     mwmr_init( fifo_s2l , data_s2l , 1 , ANALYSIS_TASKS );
     183    mwmr_init( fifo_l2a , data_l2a , 1 , analysis_tasks );
     184    mwmr_init( fifo_a2s , data_a2s , 1 , analysis_tasks );
     185    mwmr_init( fifo_s2l , data_s2l , 1 , analysis_tasks );
    170186
    171187   
    172     // all "load" tasks initialise local containers as empty in fifo_s2l
    173     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ ) mwmr_write( fifo_s2l , &n , 1 );
    174 
    175     // each "load" task[x][y] signals mwmr fifos initialisation completion
     188    // each load task initialises local containers as empty in fifo_s2l
     189    for ( n = 0 ; n < analysis_tasks ; n++ ) mwmr_write( fifo_s2l , &n , 1 );
     190
     191    // each load task[x][y] signals mwmr fifos initialisation completion
    176192    // to other tasks in same cluster[x][y]
    177193    local_sync[x][y] = 1;
    178194
    179     // "load" task[0][0] displays status
     195    // load task[0][0] displays status
    180196    if ( (x==0) && (y==0) )
    181197    giet_shr_printf("\n*** Task load on P[%d,%d,%d] enters main loop at cycle %d\n"
     
    192208                    (unsigned int)fifo_s2l, (unsigned int)data_s2l,
    193209                    (unsigned int)cont[0],
    194                     X_SIZE, Y_SIZE, NB_PROCS_MAX );
     210                    x_size, y_size, nprocs );
    195211 
    196212    /////////////////////////////////////////////////////////////
    197     // All "load" tasks enter the main loop (on containers)
    198     unsigned int count = 0;     // loaded containers count
    199     unsigned int index;         // available container index
    200     unsigned int* temp;         // pointer on available container
     213    // All load tasks enter the main loop (on containers)
     214    unsigned int  count = 0;     // loaded containers count
     215    unsigned int  index;         // available container index
     216    unsigned int* temp;          // pointer on available container
    201217
    202218    while ( count < CONTAINERS_MAX )
    203219    {
    204         // get one empty count index from fifo_s2l
     220        // get one empty container index from fifo_s2l
    205221        mwmr_read( fifo_s2l , &index , 1 );
    206222        temp = cont[index];
    207223
    208         // get one count from  kernel rx_chbuf
     224        // get one container from  kernel rx_chbuf
    209225        giet_nic_rx_move( nic_rx_channel, temp );
    210226
     
    213229        unsigned int nwords   = temp[0] >> 16;
    214230
    215         if ( (x==X_SIZE-1) && (y==Y_SIZE-1) )
     231        if ( (x==0) && (y==0) )
    216232        giet_shr_printf("\n*** Task load on P[%d,%d,%d] get container %d at cycle %d"
    217233                        " : %d packets / %d words\n",
    218234                        x, y, l, count, giet_proctime(), npackets, nwords );
    219235
    220         // put the full count index to fifo_l2a
     236        // put the full container index to fifo_l2a
    221237        mwmr_write( fifo_l2a, &index , 1 );
    222238
     
    225241
    226242    // all "load" tasks synchronise before stats
    227     sbt_barrier_wait( &rx_barrier );
     243    sqt_barrier_wait( &rx_barrier );
    228244
    229245    // "load" task[0][0] stops the NIC_CMA RX transfer and displays stats
     
    244260//////////////////////////////////////////
    245261{
     262    // each "store" task get platform parameters
     263    unsigned int    x_size;                                             // number of clusters in row
     264    unsigned int    y_size;                     // number of clusters in a column
     265    unsigned int    nprocs;                     // number of processors per cluster
     266    giet_procs_number( &x_size, &y_size, &nprocs );
     267
    246268    // get processor identifiers
    247269    unsigned int    x;
     
    250272    giet_proc_xyp( &x, &y, &l );
    251273
    252 
    253274    // "store" task[0][0] initialises the barrier between all "store" tasks,
    254275    // allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer.
     
    256277    if ( (x==0) && (y==0) )
    257278    {
    258         giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n",
    259                         x , y , l , giet_proctime() );
     279        giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n"
     280                        "  x_size = %d / y_size = %d / nprocs = %d\n",
     281                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
    260282 
    261         sbt_barrier_init( &tx_barrier , X_SIZE*Y_SIZE , 1 );
    262         nic_tx_channel = giet_nic_tx_alloc();
     283        sqt_barrier_init( &tx_barrier , x_size , y_size , 1 );
     284        nic_tx_channel = giet_nic_tx_alloc( x_size , y_size );
    263285        giet_nic_tx_start( nic_tx_channel );
    264286        store_sync = 1;
     
    272294    while ( local_sync[x][y] == 0 ) asm volatile ("nop");
    273295
    274     // all "store" tasks register pointers on working containers in local stack
     296    // each "store" task registers pointers on working containers in local stack
    275297    unsigned int   n;
    276     unsigned int*  cont[ANALYSIS_TASKS];
    277     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
     298    unsigned int   analysis_tasks = nprocs-2;
     299    unsigned int*  cont[NPROCS_MAX-2];
     300
     301    for ( n = 0 ; n < analysis_tasks ; n++ )
    278302    {
    279303        cont[n] = container[x][y][n];
     
    318342        unsigned int nwords   = temp[0] >> 16;
    319343
    320         if ( (x==X_SIZE-1) && (y==Y_SIZE-1) )
     344        if ( (x==0) && (y==0) )
    321345        giet_shr_printf("\n*** Task store on P[%d,%d,%d] get container %d at cycle %d"
    322346                        " : %d packets / %d words\n",
     
    330354
    331355    // all "store" tasks synchronise before result display
    332     sbt_barrier_wait( &tx_barrier );
     356    sqt_barrier_wait( &tx_barrier );
    333357
    334358    // "store" task[0,0] stops NIC_CMA TX transfer and displays results
     
    377401////////////////////////////////////////////
    378402{
     403    // each "analyse" task get platform parameters
     404    unsigned int    x_size;                                             // number of clusters in row
     405    unsigned int    y_size;                     // number of clusters in a column
     406    unsigned int    nprocs;                     // number of processors per cluster
     407    giet_procs_number( &x_size, &y_size, &nprocs );
     408
    379409    // get processor identifiers
    380410    unsigned int    x;
     
    385415    if ( (x==0) && (y==0) )
    386416    {
    387         giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n",
    388                         x , y , l , giet_proctime() );
     417        giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n"
     418                        "  x_size = %d / y_size = %d / nprocs = %d\n",
     419                        x , y , l , giet_proctime() , x_size, y_size, nprocs );
    389420    }
    390421 
     
    394425    // all "analyse" tasks register pointers on working containers in local stack
    395426    unsigned int   n;
    396     unsigned int*  cont[ANALYSIS_TASKS];
    397     for ( n = 0 ; n < ANALYSIS_TASKS ; n++ )
     427    unsigned int   analysis_tasks = nprocs-2;
     428    unsigned int*  cont[NPROCS_MAX-2];
     429    for ( n = 0 ; n < analysis_tasks ; n++ )
    398430    {
    399431        cont[n] = container[x][y][n];
     
    471503            unsigned int word2 = temp[first + 2];
    472504
     505#if VERBOSE_ANALYSE
    473506            unsigned long long dst = ((unsigned long long)(word1 & 0xFFFF0000)>>16) |
    474507                                     (((unsigned long long)word0)<<16);
    475508            unsigned long long src = ((unsigned long long)(word1 & 0x0000FFFF)<<32) |
    476509                                     ((unsigned long long)word2);
    477 #if VERBOSE_ANALYSE
    478510            if ( p < 10 )
    479511            {
Note: See TracChangeset for help on using the changeset viewer.