Ignore:
Timestamp:
Feb 8, 2015, 9:20:45 PM (10 years ago)
Author:
alain
Message:

1) Introduce distributed barriers in the multi-threads applications
(classif) transpose, convol, sort, gameoflife)

2) Introducing support for architectures containing empty clusters
in the mapping of these multi-threaded applications.

3) Removing the "command line arguments" in the sort application
(replaced by the giet_procs_number() system call.

Location:
soft/giet_vm/applications/transpose
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • soft/giet_vm/applications/transpose/main.c

    r444 r502  
    1 /////////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for transpose application)
    33// Date   : february 2014
    44// author : Alain Greiner
    5 /////////////////////////////////////////////////////////////////////////////////////////////
    6 // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images.
     5///////////////////////////////////////////////////////////////////////////////////////
     6// This multi-threaded application makes a transpose for a NN*NN pixels
     7// sequence of images.
    78// It can run on a multi-processors, multi-clusters architecture, with one thread
    8 // per processor. It uses the he following hardware parameters, that must be defined
    9 // in the hard_config.h file:
    10 // - X_SIZE       : number of clusters in a row
    11 // - Y_SIZE       : number of clusters in a column
    12 // - NB_PROCS_MAX : number of processors per cluster
    13 // - FBUF_X_SIZE  : number of pixels per line in frame buffer
    14 // - FBUF_Y_SIZE  : number of lines  in frame buffer
    15 //
     9// per processor.
     10//
    1611// The image sequence is read from a file (one byte per pixel).
    1712// The input and output buffers containing the image are distributed in all clusters.
    1813//
    19 // - The image size NN must be a power of 2 and must fit the frame buffer size.
    20 // - The number of clusters containing processors must be a power of 2.
    21 // - The number of processors per cluster must be a power of 2.
    22 // - The image size NN must be larger or equal to the total number of processor.
     14// - The image size NN must fit the frame buffer size: 128 bytes
     15// - The block size in block device must be 512 bytes.
     16// - The number of clusters  must be a power of 2 no larger than 32
     17// - The number of processors per cluster must be a power of 2 no larger than 4
    2318//
    2419// For each image the application makes a self test (checksum for each line).
    2520// The actual display on the frame buffer depends on frame buffer availability.
    26 /////////////////////////////////////////////////////////////////////////////////////////////
    27 
    28 #include "hard_config.h"
     21///////////////////////////////////////////////////////////////////////////////////////
     22
    2923#include "stdio.h"
    30 #include "barrier.h"
     24#include "user_barrier.h"
    3125#include "malloc.h"
    3226
     27#define BLOCK_SIZE          512                 // block size on disk
     28#define CLUSTERS_MAX        32                  // max number of clusters
     29#define PROCS_MAX           4                   // max number of processors per cluster
    3330#define NN                  128                 // image size : nlines = npixels = 128
    3431#define NB_IMAGES           5                   // number of images to be handled
    3532#define FILE_PATHNAME       "misc/images.raw"   // file pathname on disk
    36 #define NB_CLUSTERS         (X_SIZE * Y_SIZE)   // number of clusters
    3733#define INSTRUMENTATION_OK  0                   // display statistics on TTY when non zero
    3834
     
    4137///////////////////////////////////////////////////////
    4238
    43 // instrumentation counters
    44 // for each processor (up to 4 processors)
    45 // in each cluster (up to 32 clusters)
    46 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX];
    47 unsigned int LOAD_END  [NB_CLUSTERS][NB_PROCS_MAX];
    48 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX];
    49 unsigned int TRSP_END  [NB_CLUSTERS][NB_PROCS_MAX];
    50 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX];
    51 unsigned int DISP_END  [NB_CLUSTERS][NB_PROCS_MAX];
     39// instrumentation counters for each processor in each cluster
     40unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX];
     41unsigned int LOAD_END  [CLUSTERS_MAX][PROCS_MAX];
     42unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX];
     43unsigned int TRSP_END  [CLUSTERS_MAX][PROCS_MAX];
     44unsigned int DISP_START[CLUSTERS_MAX][PROCS_MAX];
     45unsigned int DISP_END  [CLUSTERS_MAX][PROCS_MAX];
    5246
    5347// arrays of pointers on distributed buffers
    5448// one input buffer & one output buffer per cluster
    55 unsigned char*  buf_in [NB_CLUSTERS];
    56 unsigned char*  buf_out[NB_CLUSTERS];
     49unsigned char*  buf_in [CLUSTERS_MAX];
     50unsigned char*  buf_out[CLUSTERS_MAX];
    5751
    5852// checksum variables
     
    6155
    6256// global synchronisation barrier
    63 giet_barrier_t barrier;
     57giet_sqt_barrier_t barrier;
    6458
    6559volatile unsigned int init_ok = 0;
     
    7064{
    7165
    72     int          file = 0;                                         // file descriptor
    73     unsigned int l;                                                // line index for loops
    74     unsigned int p;                                                // pixel index for loops
    75     unsigned int c;                                                // cluster index for loops
    76 
    77     // get processor identifiers
    78     unsigned int x;                                                // x cluster coordinate
    79     unsigned int y;                                                // y cluster coordinate
    80     unsigned int lpid;                                             // local processor index
     66    unsigned int l;                  // line index for loops
     67    unsigned int p;                  // pixel index for loops
     68    unsigned int c;                  // cluster index for loops
     69
     70    // processor identifiers
     71    unsigned int x;                  // x cluster coordinate
     72    unsigned int y;                  // y cluster coordinate
     73    unsigned int lpid;               // local processor index
     74
     75    // plat-form parameters
     76    unsigned int x_size;             // number of clusters in a row
     77    unsigned int y_size;             // number of clusters in a column
     78    unsigned int nprocs;             // number of processors per cluster
     79   
    8180    giet_proc_xyp( &x, &y, &lpid);             
    8281
    83     unsigned int npixels    = NN * NN;                             // pixels per image
    84     unsigned int nblocks    = npixels / 512;                       // blocks per image
    85     unsigned int image      = 0;                                   // image counter
    86 
    87     unsigned int cluster_id = (x * Y_SIZE) + y;                    // "continuous" index   
    88     unsigned int ntasks     = NB_CLUSTERS * NB_PROCS_MAX;          // number of tasks
    89     unsigned int task_id    = (cluster_id * NB_PROCS_MAX) + lpid;  // "continuous" task index
     82    giet_procs_number( &x_size , &y_size , &nprocs );
     83
     84    giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n"
     85                    " - x_size = %d\n"
     86                    " - y_size = %d\n"
     87                    " - nprocs = %d\n",
     88                    x, y, lpid, giet_proctime(), x_size , y_size , nprocs );
     89
     90    unsigned int nclusters  = x_size * y_size;               // number of clusters
     91    unsigned int ntasks     = x_size * y_size * nprocs;      // number of tasks
     92    unsigned int npixels    = NN * NN;                       // pixels per image
     93    unsigned int nblocks    = npixels / BLOCK_SIZE;          // blocks per image
     94    unsigned int image      = 0;                             // image counter
     95    int          file       = 0;                             // file descriptor
     96    unsigned int cluster_id = (x * y_size) + y;              // "continuous" index   
     97    unsigned int task_id    = (cluster_id * nprocs) + lpid;  // "continuous" task index
    9098
    9199    // Processor [0,0,0] makes initialisation
    92     // It includes parameters checking, barriers initialization,
     100    // It includes parameters checking, barrier initialization,
    93101    // distributed buffers allocation, and file open
    94102    if ( (x==0) && (y==0) && (lpid==0) )
    95103    {
    96         // Parameters checking
    97         if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) )
    98         {
    99             giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
    100         }
    101         if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4))
     104        if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
    102105        {
    103             giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAX must be 1, 2 or 4");
    104         }
    105         if ((NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS != 4) &&
    106             (NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS != 32) )
     106            giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");
     107        }
     108        if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) &&
     109            (nclusters != 8) && (nclusters != 16) && (nclusters != 32) )
    107110        {
    108111            giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32");
     
    113116        }
    114117
    115         giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"
    116                         " - x_size    = %d\n"
    117                         " - y_size    = %d\n"
    118                         " - nprocs    = %d\n"
    119                         " - nclusters = %d\n"
    120                         " - ntasks    = %d\n",
    121                         giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, NB_CLUSTERS, ntasks );
    122 
    123118        // Barrier initialisation
    124         barrier_init( &barrier, ntasks );
     119        sqt_barrier_init( &barrier, x_size , y_size , nprocs );
    125120
    126121        giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n",
     
    128123
    129124        // Distributed buffers allocation
    130         // The buffers containing one image are distributed in clusters
    131         // (one buf_in and one buf_out per cluster).
    132         // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.
    133         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
    134         {
    135             unsigned int rx = c / Y_SIZE;
    136             unsigned int ry = c % Y_SIZE;
    137 
    138             buf_in[c]  = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
    139             buf_out[c] = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
     125        // The buffers containing one image are distributed in the user
     126        // heap (one buf_in and one buf_out per cluster).
     127        // Each buffer contains (NN*NN / nclusters) bytes.
     128        for ( c = 0 ; c < nclusters ; c++ )
     129        {
     130            unsigned int rx = c / y_size;
     131            unsigned int ry = c % y_size;
     132
     133            buf_in[c]  = remote_malloc( npixels/nclusters, rx, ry );
     134            buf_out[c] = remote_malloc( npixels/nclusters, rx, ry );
    140135
    141136            giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation"
     
    167162    {
    168163        while ( init_ok == 0 );
    169         giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",
    170                         x, y, lpid, giet_proctime() );
    171164    }
    172165   
     
    175168    while (image < NB_IMAGES)
    176169    {
    177         // pseudo parallel load from disk to buf_in buffer : nblocks/NB_CLUSTERS blocks
     170        // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks
    178171        // only task running on processor with (lpid == 0) does it
    179172
     
    184177            giet_fat_read( file,
    185178                           buf_in[cluster_id],
    186                            (nblocks / NB_CLUSTERS),
    187                            ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) );
    188 
     179                           (nblocks / nclusters),
     180                           ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) );
     181
     182            if ( (x==0) && (y==0) )
    189183            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load"
    190184                            "  for image %d at cycle %d\n",
     
    194188        LOAD_END[cluster_id][lpid] = giet_proctime();
    195189
    196         /////////////////////////
    197         barrier_wait( &barrier );
     190        /////////////////////////////
     191        sqt_barrier_wait( &barrier );
    198192
    199193        // parallel transpose from buf_in to buf_out
     
    206200
    207201        unsigned int nlt   = NN / ntasks;      // number of lines per task
    208         unsigned int nlc   = NN / NB_CLUSTERS;   // number of lines per cluster
     202        unsigned int nlc   = NN / nclusters;   // number of lines per cluster
    209203
    210204        unsigned int src_cluster;
     
    242236        if ( lpid == 0 )
    243237        {
     238            if ( (x==0) && (y==0) )
    244239            giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose"
    245240                            " for image %d at cycle %d\n",
     
    249244        TRSP_END[cluster_id][lpid] = giet_proctime();
    250245
    251         /////////////////////////
    252         barrier_wait( &barrier );
     246        /////////////////////////////
     247        sqt_barrier_wait( &barrier );
    253248
    254249        // optional parallel display from local buf_out to frame buffer
     
    265260                                 npt );
    266261
    267             if ( lpid == 0 )
    268             {
    269                 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display"
    270                                 " for image %d at cycle %d\n",
    271                                 x, y, image, giet_proctime() );
    272             }
     262            if ( (x==0) && (y==0) && (lpid==0) )
     263            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display"
     264                            " for image %d at cycle %d\n",
     265                            x, y, lpid, image, giet_proctime() );
    273266
    274267            DISP_END[cluster_id][lpid] = giet_proctime();
    275268
    276             /////////////////////////
    277             barrier_wait( &barrier );
     269            /////////////////////////////
     270            sqt_barrier_wait( &barrier );
    278271        }
    279272
     
    318311        }
    319312
    320         /////////////////////////
    321         barrier_wait( &barrier );
     313        /////////////////////////////
     314        sqt_barrier_wait( &barrier );
    322315
    323316        // instrumentation done by processor [0,0,0]
     
    338331            unsigned int max_disp_ended = 0;
    339332
    340             for (cc = 0; cc < NB_CLUSTERS; cc++)
     333            for (cc = 0; cc < nclusters; cc++)
    341334            {
    342335                for (pp = 0; pp < NB_PROCS_MAX; pp++)
     
    384377        image++;
    385378
    386         /////////////////////////
    387         barrier_wait( &barrier );
     379        /////////////////////////////
     380        sqt_barrier_wait( &barrier );
    388381
    389382    } // end while image     
     
    392385    if ( (x==0) && (y==0) && (lpid==0) )
    393386    {
    394         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
     387        for ( c = 0 ; c < nclusters ; c++ )
    395388        {
    396389            free( buf_in[c] );
  • soft/giet_vm/applications/transpose/transpose.py

    r457 r502  
    33from mapping import *
    44
    5 ######################################################################################
     5##################################################################################
    66#   file   : transpose.py  (for the transpose application)
    77#   date   : may 2014
    88#   author : Alain Greiner
    9 #######################################################################################
     9##################################################################################
    1010#  This file describes the mapping of the multi-threaded "transpose"
    1111#  application on a multi-clusters, multi-processors architecture.
    1212#  This include both the mapping of virtual segments on the clusters,
    1313#  and the mapping of tasks on processors.
     14#  There is one task per processor.
     15#  The mapping of virtual segments is the following:
     16#    - There is one shared data vseg in cluster[0][0]
     17#    - The code vsegs are replicated on all clusters containing processors.
     18#    - There is one heap vseg per cluster containing processors.
     19#    - The stacks vsegs are distibuted on all clusters containing processors.
    1420#  This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
    15 - x_size    : number of clusters in a row
    16 - y_size    : number of clusters in a column
    17 - x_width   : number of bits coding x coordinate
    18 - y_width   : number of bits coding y coordinate
    19 - nprocs    : number of processors per cluster
    20 ####################################################################################
     21  - x_size    : number of clusters in a row
     22  - y_size    : number of clusters in a column
     23  - x_width   : number of bits coding x coordinate
     24  - y_width   : number of bits coding y coordinate
     25  - nprocs    : number of processors per cluster
     26##################################################################################
    2127
    2228#########################
     
    5460    for x in xrange (x_size):
    5561        for y in xrange (y_size):
    56             mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size,
    57                              'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
    58                              binpath = 'build/transpose/transpose.elf',
    59                              local = True )
     62            cluster_id = (x * y_size) + y
     63            if ( mapping.clusters[cluster_id].procs ):
    6064
    61     # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)           
    62     for x in xrange (x_size):
    63         for y in xrange (y_size):
    64             for p in xrange( nprocs ):
    65                 proc_id = (((x * y_size) + y) * nprocs) + p
    66                 size    = (stack_size / nprocs) & 0xFFFFF000
    67                 base    = stack_base + (proc_id * size)
    68                 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size,
    69                                  'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
    70                                  local = True, big = True )
     65                mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y),
     66                                 code_base , code_size,
     67                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
     68                                 binpath = 'build/transpose/transpose.elf',
     69                                 local = True )
    7170
    72     # heap vsegs: distributed but non local (all heap vsegs can be accessed by all tasks)
     71    # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
    7372    for x in xrange (x_size):
    7473        for y in xrange (y_size):
    7574            cluster_id = (x * y_size) + y
    76             size  = heap_size
    77             base  = heap_base + (cluster_id * size)
    78             mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
    79                              'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
    80                              local = False, big = True )
     75            if ( mapping.clusters[cluster_id].procs ):
     76                for p in xrange( nprocs ):
     77                    proc_id = (((x * y_size) + y) * nprocs) + p
     78                    size    = (stack_size / nprocs) & 0xFFFFF000
     79                    base    = stack_base + (proc_id * size)
     80
     81                    mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p),
     82                                     base, size, 'C_WU', vtype = 'BUFFER',
     83                                     x = x , y = y , pseg = 'RAM',
     84                                     local = True, big = True )
     85
     86    # heap vsegs: distributed non local (all heap vsegs can be accessed by all tasks)
     87    for x in xrange (x_size):
     88        for y in xrange (y_size):
     89            cluster_id = (x * y_size) + y
     90            if ( mapping.clusters[cluster_id].procs ):
     91                size  = heap_size
     92                base  = heap_base + (cluster_id * size)
     93
     94                mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
     95                                 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
     96                                 local = False, big = True )
    8197
    8298    # distributed tasks / one task per processor
    8399    for x in xrange (x_size):
    84100        for y in xrange (y_size):
    85             for p in xrange( nprocs ):
    86                 trdid = (((x * y_size) + y) * nprocs) + p
    87                 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p,
    88                                  'trsp_stack_%d_%d_%d' % (x,y,p),
    89                                  'trsp_heap_%d_%d' % (x,y), 0 )
     101            cluster_id = (x * y_size) + y
     102            if ( mapping.clusters[cluster_id].procs ):
     103                for p in xrange( nprocs ):
     104                    trdid = (((x * y_size) + y) * nprocs) + p
     105
     106                    mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p),
     107                                     trdid, x, y, p,
     108                                     'trsp_stack_%d_%d_%d' % (x,y,p),
     109                                     'trsp_heap_%d_%d' % (x,y), 0 )
    90110
    91111    # extend mapping name
     
    94114    return vspace  # useful for test
    95115           
    96 ################################ test ######################################################
     116################################ test ##################################################
    97117
    98118if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.