Ignore:
Timestamp:
Feb 8, 2015, 9:20:45 PM (10 years ago)
Author:
alain
Message:

1) Introduce distributed barriers in the multi-threaded applications
(classif, transpose, convol, sort, gameoflife).

2) Introducing support for architectures containing empty clusters
in the mapping of these multi-threaded applications.

3) Removing the "command line arguments" in the sort application
(replaced by the giet_procs_number() system call).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • soft/giet_vm/applications/transpose/main.c

    r444 r502  
    1 /////////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for transpose application)
    33// Date   : february 2014
    44// author : Alain Greiner
    5 /////////////////////////////////////////////////////////////////////////////////////////////
    6 // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images.
     5///////////////////////////////////////////////////////////////////////////////////////
     6// This multi-threaded application makes a transpose for a NN*NN pixels
     7// sequence of images.
    78// It can run on a multi-processors, multi-clusters architecture, with one thread
    8 // per processor. It uses the following hardware parameters, that must be defined
    9 // in the hard_config.h file:
    10 // - X_SIZE       : number of clusters in a row
    11 // - Y_SIZE       : number of clusters in a column
    12 // - NB_PROCS_MAX : number of processors per cluster
    13 // - FBUF_X_SIZE  : number of pixels per line in frame buffer
    14 // - FBUF_Y_SIZE  : number of lines  in frame buffer
    15 //
     9// per processor.
     10//
    1611// The image sequence is read from a file (one byte per pixel).
    1712// The input and output buffers containing the image are distributed in all clusters.
    1813//
    19 // - The image size NN must be a power of 2 and must fit the frame buffer size.
    20 // - The number of clusters containing processors must be a power of 2.
    21 // - The number of processors per cluster must be a power of 2.
    22 // - The image size NN must be larger or equal to the total number of processor.
     14// - The image size NN must fit the frame buffer size: 128 bytes
     15// - The block size in block device must be 512 bytes.
     16// - The number of clusters  must be a power of 2 no larger than 32
     17// - The number of processors per cluster must be a power of 2 no larger than 4
    2318//
    2419// For each image the application makes a self test (checksum for each line).
    2520// The actual display on the frame buffer depends on frame buffer availability.
    26 /////////////////////////////////////////////////////////////////////////////////////////////
    27 
    28 #include "hard_config.h"
     21///////////////////////////////////////////////////////////////////////////////////////
     22
    2923#include "stdio.h"
    30 #include "barrier.h"
     24#include "user_barrier.h"
    3125#include "malloc.h"
    3226
     27#define BLOCK_SIZE          512                 // block size on disk
     28#define CLUSTERS_MAX        32                  // max number of clusters
     29#define PROCS_MAX           4                   // max number of processors per cluster
    3330#define NN                  128                 // image size : nlines = npixels = 128
    3431#define NB_IMAGES           5                   // number of images to be handled
    3532#define FILE_PATHNAME       "misc/images.raw"   // file pathname on disk
    36 #define NB_CLUSTERS         (X_SIZE * Y_SIZE)   // number of clusters
    3733#define INSTRUMENTATION_OK  0                   // display statistics on TTY when non zero
    3834
     
    4137///////////////////////////////////////////////////////
    4238
    43 // instrumentation counters
    44 // for each processor (up to 4 processors)
    45 // in each cluster (up to 32 clusters)
    46 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX];
    47 unsigned int LOAD_END  [NB_CLUSTERS][NB_PROCS_MAX];
    48 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX];
    49 unsigned int TRSP_END  [NB_CLUSTERS][NB_PROCS_MAX];
    50 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX];
    51 unsigned int DISP_END  [NB_CLUSTERS][NB_PROCS_MAX];
     39// instrumentation counters for each processor in each cluster
     40unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX];
     41unsigned int LOAD_END  [CLUSTERS_MAX][PROCS_MAX];
     42unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX];
     43unsigned int TRSP_END  [CLUSTERS_MAX][PROCS_MAX];
     44unsigned int DISP_START[CLUSTERS_MAX][PROCS_MAX];
     45unsigned int DISP_END  [CLUSTERS_MAX][PROCS_MAX];
    5246
    5347// arrays of pointers on distributed buffers
    5448// one input buffer & one output buffer per cluster
    55 unsigned char*  buf_in [NB_CLUSTERS];
    56 unsigned char*  buf_out[NB_CLUSTERS];
     49unsigned char*  buf_in [CLUSTERS_MAX];
     50unsigned char*  buf_out[CLUSTERS_MAX];
    5751
    5852// checksum variables
     
    6155
    6256// global synchronisation barrier
    63 giet_barrier_t barrier;
     57giet_sqt_barrier_t barrier;
    6458
    6559volatile unsigned int init_ok = 0;
     
    7064{
    7165
    72     int          file = 0;                                         // file descriptor
    73     unsigned int l;                                                // line index for loops
    74     unsigned int p;                                                // pixel index for loops
    75     unsigned int c;                                                // cluster index for loops
    76 
    77     // get processor identifiers
    78     unsigned int x;                                                // x cluster coordinate
    79     unsigned int y;                                                // y cluster coordinate
    80     unsigned int lpid;                                             // local processor index
     66    unsigned int l;                  // line index for loops
     67    unsigned int p;                  // pixel index for loops
     68    unsigned int c;                  // cluster index for loops
     69
     70    // processor identifiers
     71    unsigned int x;                  // x cluster coordinate
     72    unsigned int y;                  // y cluster coordinate
     73    unsigned int lpid;               // local processor index
     74
     75    // platform parameters
     76    unsigned int x_size;             // number of clusters in a row
     77    unsigned int y_size;             // number of clusters in a column
     78    unsigned int nprocs;             // number of processors per cluster
     79   
    8180    giet_proc_xyp( &x, &y, &lpid);             
    8281
    83     unsigned int npixels    = NN * NN;                             // pixels per image
    84     unsigned int nblocks    = npixels / 512;                       // blocks per image
    85     unsigned int image      = 0;                                   // image counter
    86 
    87     unsigned int cluster_id = (x * Y_SIZE) + y;                    // "continuous" index   
    88     unsigned int ntasks     = NB_CLUSTERS * NB_PROCS_MAX;          // number of tasks
    89     unsigned int task_id    = (cluster_id * NB_PROCS_MAX) + lpid;  // "continuous" task index
     82    giet_procs_number( &x_size , &y_size , &nprocs );
     83
     84    giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n"
     85                    " - x_size = %d\n"
     86                    " - y_size = %d\n"
     87                    " - nprocs = %d\n",
     88                    x, y, lpid, giet_proctime(), x_size , y_size , nprocs );
     89
     90    unsigned int nclusters  = x_size * y_size;               // number of clusters
     91    unsigned int ntasks     = x_size * y_size * nprocs;      // number of tasks
     92    unsigned int npixels    = NN * NN;                       // pixels per image
     93    unsigned int nblocks    = npixels / BLOCK_SIZE;          // blocks per image
     94    unsigned int image      = 0;                             // image counter
     95    int          file       = 0;                             // file descriptor
     96    unsigned int cluster_id = (x * y_size) + y;              // "continuous" index   
     97    unsigned int task_id    = (cluster_id * nprocs) + lpid;  // "continuous" task index
    9098
    9199    // Processor [0,0,0] makes initialisation
    92     // It includes parameters checking, barriers initialization,
     100    // It includes parameters checking, barrier initialization,
    93101    // distributed buffers allocation, and file open
    94102    if ( (x==0) && (y==0) && (lpid==0) )
    95103    {
    96         // Parameters checking
    97         if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) )
    98         {
    99             giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
    100         }
    101         if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4))
     104        if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
    102105        {
    103             giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAX must be 1, 2 or 4");
    104         }
    105         if ((NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS != 4) &&
    106             (NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS != 32) )
     106            giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");
     107        }
     108        if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) &&
     109            (nclusters != 8) && (nclusters != 16) && (nclusters != 32) )
    107110        {
    108111            giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32");
     
    113116        }
    114117
    115         giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"
    116                         " - x_size    = %d\n"
    117                         " - y_size    = %d\n"
    118                         " - nprocs    = %d\n"
    119                         " - nclusters = %d\n"
    120                         " - ntasks    = %d\n",
    121                         giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, NB_CLUSTERS, ntasks );
    122 
    123118        // Barrier initialisation
    124         barrier_init( &barrier, ntasks );
     119        sqt_barrier_init( &barrier, x_size , y_size , nprocs );
    125120
    126121        giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n",
     
    128123
    129124        // Distributed buffers allocation
    130         // The buffers containing one image are distributed in clusters
    131         // (one buf_in and one buf_out per cluster).
    132         // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.
    133         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
    134         {
    135             unsigned int rx = c / Y_SIZE;
    136             unsigned int ry = c % Y_SIZE;
    137 
    138             buf_in[c]  = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
    139             buf_out[c] = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
     125        // The buffers containing one image are distributed in the user
     126        // heap (one buf_in and one buf_out per cluster).
     127        // Each buffer contains (NN*NN / nclusters) bytes.
     128        for ( c = 0 ; c < nclusters ; c++ )
     129        {
     130            unsigned int rx = c / y_size;
     131            unsigned int ry = c % y_size;
     132
     133            buf_in[c]  = remote_malloc( npixels/nclusters, rx, ry );
     134            buf_out[c] = remote_malloc( npixels/nclusters, rx, ry );
    140135
    141136            giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation"
     
    167162    {
    168163        while ( init_ok == 0 );
    169         giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",
    170                         x, y, lpid, giet_proctime() );
    171164    }
    172165   
     
    175168    while (image < NB_IMAGES)
    176169    {
    177         // pseudo parallel load from disk to buf_in buffer : nblocks/NB_CLUSTERS blocks
     170        // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks
    178171        // only task running on processor with (lpid == 0) does it
    179172
     
    184177            giet_fat_read( file,
    185178                           buf_in[cluster_id],
    186                            (nblocks / NB_CLUSTERS),
    187                            ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) );
    188 
     179                           (nblocks / nclusters),
     180                           ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) );
     181
     182            if ( (x==0) && (y==0) )
    189183            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load"
    190184                            "  for image %d at cycle %d\n",
     
    194188        LOAD_END[cluster_id][lpid] = giet_proctime();
    195189
    196         /////////////////////////
    197         barrier_wait( &barrier );
     190        /////////////////////////////
     191        sqt_barrier_wait( &barrier );
    198192
    199193        // parallel transpose from buf_in to buf_out
     
    206200
    207201        unsigned int nlt   = NN / ntasks;      // number of lines per task
    208         unsigned int nlc   = NN / NB_CLUSTERS;   // number of lines per cluster
     202        unsigned int nlc   = NN / nclusters;   // number of lines per cluster
    209203
    210204        unsigned int src_cluster;
     
    242236        if ( lpid == 0 )
    243237        {
     238            if ( (x==0) && (y==0) )
    244239            giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose"
    245240                            " for image %d at cycle %d\n",
     
    249244        TRSP_END[cluster_id][lpid] = giet_proctime();
    250245
    251         /////////////////////////
    252         barrier_wait( &barrier );
     246        /////////////////////////////
     247        sqt_barrier_wait( &barrier );
    253248
    254249        // optional parallel display from local buf_out to frame buffer
     
    265260                                 npt );
    266261
    267             if ( lpid == 0 )
    268             {
    269                 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display"
    270                                 " for image %d at cycle %d\n",
    271                                 x, y, image, giet_proctime() );
    272             }
     262            if ( (x==0) && (y==0) && (lpid==0) )
     263            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display"
     264                            " for image %d at cycle %d\n",
     265                            x, y, lpid, image, giet_proctime() );
    273266
    274267            DISP_END[cluster_id][lpid] = giet_proctime();
    275268
    276             /////////////////////////
    277             barrier_wait( &barrier );
     269            /////////////////////////////
     270            sqt_barrier_wait( &barrier );
    278271        }
    279272
     
    318311        }
    319312
    320         /////////////////////////
    321         barrier_wait( &barrier );
     313        /////////////////////////////
     314        sqt_barrier_wait( &barrier );
    322315
    323316        // instrumentation done by processor [0,0,0]
     
    338331            unsigned int max_disp_ended = 0;
    339332
    340             for (cc = 0; cc < NB_CLUSTERS; cc++)
     333            for (cc = 0; cc < nclusters; cc++)
    341334            {
    342335                for (pp = 0; pp < NB_PROCS_MAX; pp++)
     
    384377        image++;
    385378
    386         /////////////////////////
    387         barrier_wait( &barrier );
     379        /////////////////////////////
     380        sqt_barrier_wait( &barrier );
    388381
    389382    } // end while image     
     
    392385    if ( (x==0) && (y==0) && (lpid==0) )
    393386    {
    394         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
     387        for ( c = 0 ; c < nclusters ; c++ )
    395388        {
    396389            free( buf_in[c] );
Note: See TracChangeset for help on using the changeset viewer.