Ignore:
Timestamp:
Feb 8, 2015, 9:20:45 PM (9 years ago)
Author:
alain
Message:

1) Introduce distributed barriers in the multi-threaded applications
(classif, transpose, convol, sort, gameoflife)

2) Introducing support for architectures containing empty clusters
in the mapping of these multi-threaded applications.

3) Removing the "command line arguments" in the sort application
(replaced by the giet_procs_number() system call).

Location:
soft/giet_vm/applications/transpose
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • soft/giet_vm/applications/transpose/main.c

    r444 r502  
    1 /////////////////////////////////////////////////////////////////////////////////////////////
     1///////////////////////////////////////////////////////////////////////////////////////
    22// File   : main.c   (for transpose application)
    33// Date   : february 2014
    44// author : Alain Greiner
    5 /////////////////////////////////////////////////////////////////////////////////////////////
    6 // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images.
     5///////////////////////////////////////////////////////////////////////////////////////
     6// This multi-threaded application makes a transpose for a NN*NN pixels
     7// sequence of images.
    78// It can run on a multi-processors, multi-clusters architecture, with one thread
    8 // per processor. It uses the he following hardware parameters, that must be defined
    9 // in the hard_config.h file:
    10 // - X_SIZE       : number of clusters in a row
    11 // - Y_SIZE       : number of clusters in a column
    12 // - NB_PROCS_MAX : number of processors per cluster
    13 // - FBUF_X_SIZE  : number of pixels per line in frame buffer
    14 // - FBUF_Y_SIZE  : number of lines  in frame buffer
    15 //
     9// per processor.
     10//
    1611// The image sequence is read from a file (one byte per pixel).
    1712// The input and output buffers containing the image are distributed in all clusters.
    1813//
    19 // - The image size NN must be a power of 2 and must fit the frame buffer size.
    20 // - The number of clusters containing processors must be a power of 2.
    21 // - The number of processors per cluster must be a power of 2.
    22 // - The image size NN must be larger or equal to the total number of processor.
     14// - The image size NN must fit the frame buffer size: 128 bytes
     15// - The block size in block device must be 512 bytes.
     16// - The number of clusters  must be a power of 2 no larger than 32
     17// - The number of processors per cluster must be a power of 2 no larger than 4
    2318//
    2419// For each image the application makes a self test (checksum for each line).
    2520// The actual display on the frame buffer depends on frame buffer availability.
    26 /////////////////////////////////////////////////////////////////////////////////////////////
    27 
    28 #include "hard_config.h"
     21///////////////////////////////////////////////////////////////////////////////////////
     22
    2923#include "stdio.h"
    30 #include "barrier.h"
     24#include "user_barrier.h"
    3125#include "malloc.h"
    3226
     27#define BLOCK_SIZE          512                 // block size on disk
     28#define CLUSTERS_MAX        32                  // max number of clusters
     29#define PROCS_MAX           4                   // max number of processors per cluster
    3330#define NN                  128                 // image size : nlines = npixels = 128
    3431#define NB_IMAGES           5                   // number of images to be handled
    3532#define FILE_PATHNAME       "misc/images.raw"   // file pathname on disk
    36 #define NB_CLUSTERS         (X_SIZE * Y_SIZE)   // number of clusters
    3733#define INSTRUMENTATION_OK  0                   // display statistics on TTY when non zero
    3834
     
    4137///////////////////////////////////////////////////////
    4238
    43 // instrumentation counters
    44 // for each processor (up to 4 processors)
    45 // in each cluster (up to 32 clusters)
    46 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX];
    47 unsigned int LOAD_END  [NB_CLUSTERS][NB_PROCS_MAX];
    48 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX];
    49 unsigned int TRSP_END  [NB_CLUSTERS][NB_PROCS_MAX];
    50 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX];
    51 unsigned int DISP_END  [NB_CLUSTERS][NB_PROCS_MAX];
     39// instrumentation counters for each processor in each cluster
     40unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX];
     41unsigned int LOAD_END  [CLUSTERS_MAX][PROCS_MAX];
     42unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX];
     43unsigned int TRSP_END  [CLUSTERS_MAX][PROCS_MAX];
     44unsigned int DISP_START[CLUSTERS_MAX][PROCS_MAX];
     45unsigned int DISP_END  [CLUSTERS_MAX][PROCS_MAX];
    5246
    5347// arrays of pointers on distributed buffers
    5448// one input buffer & one output buffer per cluster
    55 unsigned char*  buf_in [NB_CLUSTERS];
    56 unsigned char*  buf_out[NB_CLUSTERS];
     49unsigned char*  buf_in [CLUSTERS_MAX];
     50unsigned char*  buf_out[CLUSTERS_MAX];
    5751
    5852// checksum variables
     
    6155
    6256// global synchronisation barrier
    63 giet_barrier_t barrier;
     57giet_sqt_barrier_t barrier;
    6458
    6559volatile unsigned int init_ok = 0;
     
    7064{
    7165
    72     int          file = 0;                                         // file descriptor
    73     unsigned int l;                                                // line index for loops
    74     unsigned int p;                                                // pixel index for loops
    75     unsigned int c;                                                // cluster index for loops
    76 
    77     // get processor identifiers
    78     unsigned int x;                                                // x cluster coordinate
    79     unsigned int y;                                                // y cluster coordinate
    80     unsigned int lpid;                                             // local processor index
     66    unsigned int l;                  // line index for loops
     67    unsigned int p;                  // pixel index for loops
     68    unsigned int c;                  // cluster index for loops
     69
     70    // processor identifiers
     71    unsigned int x;                  // x cluster coordinate
     72    unsigned int y;                  // y cluster coordinate
     73    unsigned int lpid;               // local processor index
     74
     75    // plat-form parameters
     76    unsigned int x_size;             // number of clusters in a row
     77    unsigned int y_size;             // number of clusters in a column
     78    unsigned int nprocs;             // number of processors per cluster
     79   
    8180    giet_proc_xyp( &x, &y, &lpid);             
    8281
    83     unsigned int npixels    = NN * NN;                             // pixels per image
    84     unsigned int nblocks    = npixels / 512;                       // blocks per image
    85     unsigned int image      = 0;                                   // image counter
    86 
    87     unsigned int cluster_id = (x * Y_SIZE) + y;                    // "continuous" index   
    88     unsigned int ntasks     = NB_CLUSTERS * NB_PROCS_MAX;          // number of tasks
    89     unsigned int task_id    = (cluster_id * NB_PROCS_MAX) + lpid;  // "continuous" task index
     82    giet_procs_number( &x_size , &y_size , &nprocs );
     83
     84    giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n"
     85                    " - x_size = %d\n"
     86                    " - y_size = %d\n"
     87                    " - nprocs = %d\n",
     88                    x, y, lpid, giet_proctime(), x_size , y_size , nprocs );
     89
     90    unsigned int nclusters  = x_size * y_size;               // number of clusters
     91    unsigned int ntasks     = x_size * y_size * nprocs;      // number of tasks
     92    unsigned int npixels    = NN * NN;                       // pixels per image
     93    unsigned int nblocks    = npixels / BLOCK_SIZE;          // blocks per image
     94    unsigned int image      = 0;                             // image counter
     95    int          file       = 0;                             // file descriptor
     96    unsigned int cluster_id = (x * y_size) + y;              // "continuous" index   
     97    unsigned int task_id    = (cluster_id * nprocs) + lpid;  // "continuous" task index
    9098
    9199    // Processor [0,0,0] makes initialisation
    92     // It includes parameters checking, barriers initialization,
     100    // It includes parameters checking, barrier initialization,
    93101    // distributed buffers allocation, and file open
    94102    if ( (x==0) && (y==0) && (lpid==0) )
    95103    {
    96         // Parameters checking
    97         if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) )
    98         {
    99             giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");
    100         }
    101         if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4))
     104        if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
    102105        {
    103             giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAX must be 1, 2 or 4");
    104         }
    105         if ((NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS != 4) &&
    106             (NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS != 32) )
     106            giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");
     107        }
     108        if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) &&
     109            (nclusters != 8) && (nclusters != 16) && (nclusters != 32) )
    107110        {
    108111            giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32");
     
    113116        }
    114117
    115         giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"
    116                         " - x_size    = %d\n"
    117                         " - y_size    = %d\n"
    118                         " - nprocs    = %d\n"
    119                         " - nclusters = %d\n"
    120                         " - ntasks    = %d\n",
    121                         giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, NB_CLUSTERS, ntasks );
    122 
    123118        // Barrier initialisation
    124         barrier_init( &barrier, ntasks );
     119        sqt_barrier_init( &barrier, x_size , y_size , nprocs );
    125120
    126121        giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n",
     
    128123
    129124        // Distributed buffers allocation
    130         // The buffers containing one image are distributed in clusters
    131         // (one buf_in and one buf_out per cluster).
    132         // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.
    133         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
    134         {
    135             unsigned int rx = c / Y_SIZE;
    136             unsigned int ry = c % Y_SIZE;
    137 
    138             buf_in[c]  = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
    139             buf_out[c] = remote_malloc( npixels/NB_CLUSTERS, rx, ry );
     125        // The buffers containing one image are distributed in the user
     126        // heap (one buf_in and one buf_out per cluster).
     127        // Each buffer contains (NN*NN / nclusters) bytes.
     128        for ( c = 0 ; c < nclusters ; c++ )
     129        {
     130            unsigned int rx = c / y_size;
     131            unsigned int ry = c % y_size;
     132
     133            buf_in[c]  = remote_malloc( npixels/nclusters, rx, ry );
     134            buf_out[c] = remote_malloc( npixels/nclusters, rx, ry );
    140135
    141136            giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation"
     
    167162    {
    168163        while ( init_ok == 0 );
    169         giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",
    170                         x, y, lpid, giet_proctime() );
    171164    }
    172165   
     
    175168    while (image < NB_IMAGES)
    176169    {
    177         // pseudo parallel load from disk to buf_in buffer : nblocks/NB_CLUSTERS blocks
     170        // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks
    178171        // only task running on processor with (lpid == 0) does it
    179172
     
    184177            giet_fat_read( file,
    185178                           buf_in[cluster_id],
    186                            (nblocks / NB_CLUSTERS),
    187                            ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) );
    188 
     179                           (nblocks / nclusters),
     180                           ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) );
     181
     182            if ( (x==0) && (y==0) )
    189183            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load"
    190184                            "  for image %d at cycle %d\n",
     
    194188        LOAD_END[cluster_id][lpid] = giet_proctime();
    195189
    196         /////////////////////////
    197         barrier_wait( &barrier );
     190        /////////////////////////////
     191        sqt_barrier_wait( &barrier );
    198192
    199193        // parallel transpose from buf_in to buf_out
     
    206200
    207201        unsigned int nlt   = NN / ntasks;      // number of lines per task
    208         unsigned int nlc   = NN / NB_CLUSTERS;   // number of lines per cluster
     202        unsigned int nlc   = NN / nclusters;   // number of lines per cluster
    209203
    210204        unsigned int src_cluster;
     
    242236        if ( lpid == 0 )
    243237        {
     238            if ( (x==0) && (y==0) )
    244239            giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose"
    245240                            " for image %d at cycle %d\n",
     
    249244        TRSP_END[cluster_id][lpid] = giet_proctime();
    250245
    251         /////////////////////////
    252         barrier_wait( &barrier );
     246        /////////////////////////////
     247        sqt_barrier_wait( &barrier );
    253248
    254249        // optional parallel display from local buf_out to frame buffer
     
    265260                                 npt );
    266261
    267             if ( lpid == 0 )
    268             {
    269                 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display"
    270                                 " for image %d at cycle %d\n",
    271                                 x, y, image, giet_proctime() );
    272             }
     262            if ( (x==0) && (y==0) && (lpid==0) )
     263            giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display"
     264                            " for image %d at cycle %d\n",
     265                            x, y, lpid, image, giet_proctime() );
    273266
    274267            DISP_END[cluster_id][lpid] = giet_proctime();
    275268
    276             /////////////////////////
    277             barrier_wait( &barrier );
     269            /////////////////////////////
     270            sqt_barrier_wait( &barrier );
    278271        }
    279272
     
    318311        }
    319312
    320         /////////////////////////
    321         barrier_wait( &barrier );
     313        /////////////////////////////
     314        sqt_barrier_wait( &barrier );
    322315
    323316        // instrumentation done by processor [0,0,0]
     
    338331            unsigned int max_disp_ended = 0;
    339332
    340             for (cc = 0; cc < NB_CLUSTERS; cc++)
     333            for (cc = 0; cc < nclusters; cc++)
    341334            {
    342335                for (pp = 0; pp < NB_PROCS_MAX; pp++)
     
    384377        image++;
    385378
    386         /////////////////////////
    387         barrier_wait( &barrier );
     379        /////////////////////////////
     380        sqt_barrier_wait( &barrier );
    388381
    389382    } // end while image     
     
    392385    if ( (x==0) && (y==0) && (lpid==0) )
    393386    {
    394         for ( c = 0 ; c < NB_CLUSTERS ; c++ )
     387        for ( c = 0 ; c < nclusters ; c++ )
    395388        {
    396389            free( buf_in[c] );
  • soft/giet_vm/applications/transpose/transpose.py

    r457 r502  
    33from mapping import *
    44
    5 ######################################################################################
     5##################################################################################
    66#   file   : transpose.py  (for the transpose application)
    77#   date   : may 2014
    88#   author : Alain Greiner
    9 #######################################################################################
     9##################################################################################
    1010#  This file describes the mapping of the multi-threaded "transpose"
    1111#  application on a multi-clusters, multi-processors architecture.
    1212#  This include both the mapping of virtual segments on the clusters,
    1313#  and the mapping of tasks on processors.
     14#  There is one task per processor.
     15#  The mapping of virtual segments is the following:
     16#    - There is one shared data vseg in cluster[0][0]
     17#    - The code vsegs are replicated on all clusters containing processors.
     18#    - There is one heap vseg per cluster containing processors.
     19#    - The stacks vsegs are distibuted on all clusters containing processors.
    1420#  This mapping uses 5 platform parameters, (obtained from the "mapping" argument)
    15 - x_size    : number of clusters in a row
    16 - y_size    : number of clusters in a column
    17 - x_width   : number of bits coding x coordinate
    18 - y_width   : number of bits coding y coordinate
    19 - nprocs    : number of processors per cluster
    20 ####################################################################################
     21  - x_size    : number of clusters in a row
     22  - y_size    : number of clusters in a column
     23  - x_width   : number of bits coding x coordinate
     24  - y_width   : number of bits coding y coordinate
     25  - nprocs    : number of processors per cluster
     26##################################################################################
    2127
    2228#########################
     
    5460    for x in xrange (x_size):
    5561        for y in xrange (y_size):
    56             mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size,
    57                              'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
    58                              binpath = 'build/transpose/transpose.elf',
    59                              local = True )
     62            cluster_id = (x * y_size) + y
     63            if ( mapping.clusters[cluster_id].procs ):
    6064
    61     # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)           
    62     for x in xrange (x_size):
    63         for y in xrange (y_size):
    64             for p in xrange( nprocs ):
    65                 proc_id = (((x * y_size) + y) * nprocs) + p
    66                 size    = (stack_size / nprocs) & 0xFFFFF000
    67                 base    = stack_base + (proc_id * size)
    68                 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size,
    69                                  'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM',
    70                                  local = True, big = True )
     65                mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y),
     66                                 code_base , code_size,
     67                                 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM',
     68                                 binpath = 'build/transpose/transpose.elf',
     69                                 local = True )
    7170
    72     # heap vsegs: distributed but non local (all heap vsegs can be accessed by all tasks)
     71    # stacks vsegs: local (one stack per processor => nprocs stacks per cluster)
    7372    for x in xrange (x_size):
    7473        for y in xrange (y_size):
    7574            cluster_id = (x * y_size) + y
    76             size  = heap_size
    77             base  = heap_base + (cluster_id * size)
    78             mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
    79                              'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM',
    80                              local = False, big = True )
     75            if ( mapping.clusters[cluster_id].procs ):
     76                for p in xrange( nprocs ):
     77                    proc_id = (((x * y_size) + y) * nprocs) + p
     78                    size    = (stack_size / nprocs) & 0xFFFFF000
     79                    base    = stack_base + (proc_id * size)
     80
     81                    mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p),
     82                                     base, size, 'C_WU', vtype = 'BUFFER',
     83                                     x = x , y = y , pseg = 'RAM',
     84                                     local = True, big = True )
     85
     86    # heap vsegs: distributed non local (all heap vsegs can be accessed by all tasks)
     87    for x in xrange (x_size):
     88        for y in xrange (y_size):
     89            cluster_id = (x * y_size) + y
     90            if ( mapping.clusters[cluster_id].procs ):
     91                size  = heap_size
     92                base  = heap_base + (cluster_id * size)
     93
     94                mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size,
     95                                 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',
     96                                 local = False, big = True )
    8197
    8298    # distributed tasks / one task per processor
    8399    for x in xrange (x_size):
    84100        for y in xrange (y_size):
    85             for p in xrange( nprocs ):
    86                 trdid = (((x * y_size) + y) * nprocs) + p
    87                 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p,
    88                                  'trsp_stack_%d_%d_%d' % (x,y,p),
    89                                  'trsp_heap_%d_%d' % (x,y), 0 )
     101            cluster_id = (x * y_size) + y
     102            if ( mapping.clusters[cluster_id].procs ):
     103                for p in xrange( nprocs ):
     104                    trdid = (((x * y_size) + y) * nprocs) + p
     105
     106                    mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p),
     107                                     trdid, x, y, p,
     108                                     'trsp_stack_%d_%d_%d' % (x,y,p),
     109                                     'trsp_heap_%d_%d' % (x,y), 0 )
    90110
    91111    # extend mapping name
     
    94114    return vspace  # useful for test
    95115           
    96 ################################ test ######################################################
     116################################ test ##################################################
    97117
    98118if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.