Changeset 652 for trunk/user


Ignore:
Timestamp:
Nov 14, 2019, 3:56:51 PM (5 years ago)
Author:
alain
Message:

Introduce the three placement modes in "transpose", "convol", "fft" applications.

Location:
trunk/user
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/user/convol/convol.c

    r645 r652  
    55///////////////////////////////////////////////////////////////////////////////////////
    66// This multi-threaded application implements a 2D convolution product. 
    7 // It can run on a multi-processors, multi-clusters architecture, with one thread
    8 // per processor, and uses the POSIX threads API.
     7// It can run on a multi-cores, multi-clusters architecture, with one thread
     8// per core, and uses the POSIX threads API.
    99//
    1010// The main() function can be launched on any processor P[x,y,l].
     
    1414// when the parallel execution is completed.
    1515//
    16 // The convolution kernel is [201]*[35] pixels, but it can be factored in two
    17 // independant line and column convolution products.
     16// The convolution kernel is defined in the execute() function.
     17// It can be factored in two independent line and column convolution products.
    1818// The five buffers containing the image are distributed in clusters.
     19// For the philips image, the kernel is a [201]*[35] pixels rectangle.
    1920//
    2021// The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
    2122//
    2223// - number of clusters containing processors must be power of 2 no larger than 256.
    23 // - number of processors per cluster must be power of 2 no larger than 8.
     24// - number of processors per cluster must be power of 2 no larger than 4.
     25//
     26// The number N of working threads is always defined by the number of cores available
     27// in the architecture, but this application supports three placement modes.
     28// In all modes, the working threads are identified by the [tid] continuous index
     29// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
     30// This continuous index can always be decomposed in two continuous sub-indexes:
     31// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
     32//
     33// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
     34//   threads are created by the main thread, but the placement is done by the OS, using
     35//   the DQDT for load balancing, and two working threads can be placed on the same core.
     36//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
     37//   cluster or a physical core. In this mode, the main thread runs on any cluster,
     38//   but has tid = 0 (i.e. cid = 0 & lid = 0).
     39//
     40// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement of
     41//   the threads on the cores is explicitly controlled by the main thread to have
     42//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
     43//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
     44//   physical cluster identifier, and [lid] is the local core index.
     45//
     46// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
     47//   non standard pthread_parallel_create() function to avoid the costly sequential
     48//   loops for pthread_create() and pthread_join(). It guarantees one working thread
     49//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
     50//
     51// The [tid] continuous index defines how the work is shared amongst the threads:
     52// - each thread handles NL/nthreads lines for the horizontal filter.
     53// - each thread handles NP/nthreads columns for the vertical filter.
    2454///////////////////////////////////////////////////////////////////////////////////////
    2555
     56#include <sys/mman.h>
    2657#include <stdio.h>
    2758#include <stdlib.h>
     
    2960#include <unistd.h>
    3061#include <pthread.h>
     62#include <string.h>
    3163#include <almosmkh.h>
    3264#include <hal_macros.h>
    3365
    34 #define IMAGE_IN_PATH              "misc/philips_1024.raw"
    35 
    36 #define USE_SQT_BARRIER            1
    37 #define VERBOSE                    1
    38 #define SUPER_VERBOSE              0
    39 
    40 #define USE_DQT_BARRIER            1
     66#define VERBOSE_MAIN               1
     67#define VERBOSE_EXEC               1
    4168
    4269#define X_MAX                      16
    4370#define Y_MAX                      16
    44 #define PROCS_MAX                  4
     71#define CORES_MAX                  4
    4572#define CLUSTERS_MAX               (X_MAX * Y_MAX)
    46 #define THREADS_MAX                (X_MAX * Y_MAX * PROCS_MAX]
    47 
    48 #define INITIAL_DISPLAY_ENABLE     1
    49 #define FINAL_DISPLAY_ENABLE       1
    50 
    51 #define PIXEL_SIZE                 2       // input image has 2 bytes per pixel
    52 #define FBF_TYPE                   420     // output image has 1 byte per pixel
    53 
     73#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)
     74
     75#define IMAGE_IN_PATH              "misc/philips_1024_2.raw"
     76#define IMAGE_IN_PIXEL_SIZE        2                               // 2 bytes per pixel
     77
     78#define IMAGE_OUT_PATH             "misc/philips_after_1O24.raw"
     79#define IMAGE_OUT_PIXEL_SIZE       1                               // 1 bytes per pixel
     80
     81#define FBF_TYPE                   420
    5482#define NL                         1024
    5583#define NP                         1024
    5684#define NB_PIXELS                  (NP * NL)
    57 #define FRAME_SIZE                 (NB_PIXELS * PIXEL_SIZE)
    58 
     85
     86#define NO_PLACEMENT               0
     87#define EXPLICIT_PLACEMENT         0
     88#define PARALLEL_PLACEMENT         1
     89
     90#define USE_DQT_BARRIER            1
     91#define INITIAL_DISPLAY_ENABLE     1
     92#define FINAL_DISPLAY_ENABLE       1
    5993
    6094#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
     
    68102
    69103//////////////////////////////////////////////////////////
    70 //   global variables stored in seg_data in cluster[0,0]
     104//            global variables
    71105//////////////////////////////////////////////////////////
    72106
    73 // Instrumentation counters (cluster_id, lpid]
    74 unsigned int START[CLUSTERS_MAX][PROCS_MAX];
    75 unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX];
    76 unsigned int H_END[CLUSTERS_MAX][PROCS_MAX];
    77 unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX];
    78 unsigned int V_END[CLUSTERS_MAX][PROCS_MAX];
    79 unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX];
    80 unsigned int D_END[CLUSTERS_MAX][PROCS_MAX];
    81 
    82 // file pointers on input image
    83 FILE * f_image_in;
    84 FILE * f_instrum;
     107// global instrumentation counters for the main thread
     108unsigned int SEQUENCIAL_TIME = 0;
     109unsigned int PARALLEL_TIME   = 0;
     110
     111// instrumentation counters for thread[tid] in cluster[cid]
     112unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     113unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     114unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     115unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     116unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     117unsigned int D_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     118unsigned int D_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     119
     120// pointer on buffer containing the input image, mapped by the main to the input file
     121unsigned char *  image_in;
     122
     123// pointer on buffer containing the output image, maped by the main to the output file
     124unsigned char *  image_out;
    85125
    86126// return values at thread exit
     
    91131pthread_barrier_t     barrier;
    92132
    93 // coordinates of core executing the main thread
    94 unsigned int cxy_main;
    95 unsigned int lid_main;
     133// platform parameters
     134unsigned int  x_size;              // number of clusters in a row
     135unsigned int  y_size;              // number of clusters in a column
     136unsigned int  ncores;              // number of processors per cluster
    96137
    97138// arrays of pointers on distributed buffers in all clusters
    98139unsigned short * GA[CLUSTERS_MAX];
    99 int *            GB[CLUSTERS_MAX];
    100 int *            GC[CLUSTERS_MAX];
    101 int *            GD[CLUSTERS_MAX];
    102 unsigned char *  GZ[CLUSTERS_MAX];
    103 
    104 // trdid[] array for execution threads
    105 // 1D array if no explicit threads placement / 2D array if explicit placement
    106 pthread_t        trdid[CLUSTERS_MAX][PROCS_MAX];
    107 //pthread_t        trdid[THREADS_MAX];
    108 
    109 // attr[] array for execution threads
    110 // unused if no explicit threads placement
    111 pthread_attr_t   attr[CLUSTERS_MAX][PROCS_MAX];
     140int            * GB[CLUSTERS_MAX];
     141int            * GC[CLUSTERS_MAX];
     142int            * GD[CLUSTERS_MAX];
     143unsigned char  * GZ[CLUSTERS_MAX];
     144
     145// array of threads kernel identifiers / indexed by [tid]
     146pthread_t        exec_trdid[THREADS_MAX];
     147
     148// array of threads attributes / indexed by [tid]
     149pthread_attr_t   exec_attr[THREADS_MAX];
     150
     151// array of execute() function arguments / indexed by [tid]
     152pthread_parallel_work_args_t exec_args[THREADS_MAX];
     153
     154// main thread continuous index
     155unsigned int     tid_main;
    112156
    113157/////////////////////////////////////////////////////////////////////////////////////
     
    115159/////////////////////////////////////////////////////////////////////////////////////
    116160
    117 void execute( void );
    118 
    119 void instrument( unsigned int nclusters,
    120                  unsigned int ncores );
     161void execute( pthread_parallel_work_args_t * args );
     162
     163void instrument( FILE * f , char * filename );
    121164
    122165/////////////////
    123166void main( void )
    124167{
    125     unsigned int x_size;                 // number of clusters in a row
    126     unsigned int y_size;                 // number of clusters in a column
    127     unsigned int ncores;                 // number of processors per cluster
    128 
    129     unsigned long long  date;
    130 
    131     char         name[64];               // instrumentation file name
    132     char         path[128];              // instrumentation path name
     168    unsigned long long start_cycle;
     169    unsigned long long end_sequencial_cycle;
     170    unsigned long long end_parallel_cycle;
    133171
    134172    int          error;
    135173
    136     // get platform parameters
    137     if ( get_config( &x_size , &y_size , &ncores ) )
    138     {
    139         printf("\n[convol error] cannot get hardware configuration\n");
     174    char         instru_name[32];               // instrumentation file name
     175    char         instru_path[64];              // instrumentation path name
     176
     177    /////////////////////////////////////////////////////////////////////////////////
     178    get_cycle( &start_cycle );
     179    /////////////////////////////////////////////////////////////////////////////////
     180
     181    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
     182    {
     183        printf("\n[convol error] illegal placement\n");
    140184        exit( 0 );
    141185    }
    142186
    143     // get core executing this main thread
    144     // and register these coordinates in global variables
    145     get_core_id( &cxy_main , &lid_main );
    146    
    147     // check ncores
    148     if( (ncores != 1) && (ncores != 2) && (ncores != 4) )
     187    // get & check platform parameters
     188    get_config( &x_size , &y_size , &ncores );
     189
     190    if((ncores != 1) && (ncores != 2) && (ncores != 4))
    149191    {
    150192        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
     
    152194    }
    153195
    154     // check x_size
    155     if( (x_size != 1) && (x_size != 2) && (x_size != 4) && (x_size != 8) && (x_size != 16) )
     196    if( (x_size != 1) && (x_size != 2) && (x_size != 4) &&
     197        (x_size != 8) && (x_size != 16) )
    156198    {
    157199        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
    158200        exit( 0 );
    159201    }
    160 
    161     // check y_size
    162     if( (y_size != 1) && (y_size != 2) && (y_size != 4) && (y_size != 8) && (y_size != 16) )
     202       
     203    if( (y_size != 1) && (y_size != 2) && (y_size != 4) &&
     204        (y_size != 8) && (y_size != 16) )
    163205    {
    164206        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
    165207        exit( 0 );
    166208    }
     209       
     210    // main thread get identifiers for core executing main
     211    unsigned int  cxy_main;
     212    unsigned int  lid_main;
     213    get_core_id( &cxy_main , &lid_main );
    167214
    168215    // compute nthreads and nclusters
    169     unsigned int nthreads  = x_size * y_size * ncores;
    170216    unsigned int nclusters = x_size * y_size;
    171 
    172     get_cycle( &date );
    173     printf("\n[convol] starts on core[%x,%d] / %d thread(s) / cycle %d\n",
    174     cxy_main, lid_main, nthreads, (unsigned int)date );
    175 
    176     // build instrumentation file name
    177     if( USE_DQT_BARRIER )
    178     snprintf( name , 64 , "p_convol_dqt_%d_%d", x_size * y_size , ncores );
    179     else
    180     snprintf( name , 64 , "p_convol_smp_%d_%d", x_size * y_size , ncores );
    181 
    182     // build pathname
    183     snprintf( path , 128 , "/home/%s", name );
     217    unsigned int nthreads  = nclusters * ncores;
     218
     219    // main thread get FBF size and type
     220    unsigned int   fbf_width;
     221    unsigned int   fbf_height;
     222    unsigned int   fbf_type;
     223    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
     224
     225    if( (fbf_width != NP) || (fbf_height != NL) || (fbf_type != FBF_TYPE) )
     226    {
     227        printf("\n[convol error] image does not fit FBF size or type\n");
     228        exit( 0 );
     229    }
     230
     231    if( nthreads > NL )
     232    {
     233        printf("\n[convol error] number of threads larger than number of lines\n");
     234        exit( 0 );
     235    }
     236
     237    // define instrumentation file name
     238    if( NO_PLACEMENT )
     239    {
     240        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
     241        nclusters, ncores, fbf_width, fbf_height, getpid() );
     242
     243        // build instrumentation file name
     244        if( USE_DQT_BARRIER )
     245        snprintf( instru_name , 32 , "conv_dqt_no_place_%d_%d", x_size * y_size , ncores );
     246        else
     247        snprintf( instru_name , 32 , "conv_smp_no_place_%d_%d", x_size * y_size , ncores );
     248    }
     249
     250    if( EXPLICIT_PLACEMENT )
     251    {
     252        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
     253        nclusters, ncores, fbf_width, fbf_height, getpid() );
     254
     255        // build instrumentation file name
     256        if( USE_DQT_BARRIER )
     257        snprintf( instru_name , 32 , "conv_dqt_explicit_%d_%d_%d", x_size * y_size , ncores );
     258        else
     259        snprintf( instru_name , 32 , "conv_smp_explicit_%d_%d_%d", x_size * y_size , ncores );
     260    }
     261
     262    if( PARALLEL_PLACEMENT )
     263    {
     264        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
     265        nclusters, ncores, fbf_width, fbf_height, getpid() );
     266
     267        // build instrumentation file name
     268        if( USE_DQT_BARRIER )
     269        snprintf( instru_name , 32 , "conv_dqt_parallel_%d_%d_%d", x_size * y_size , ncores );
     270        else
     271        snprintf( instru_name , 32 , "conv_smp_parallel_%d_%d_%d", x_size * y_size , ncores );
     272    }
    184273
    185274    // open instrumentation file
    186     f_instrum = fopen( path , NULL );
    187     if ( f_instrum == NULL )
     275    snprintf( instru_path , 64 , "/home/%s", instru_name );
     276    FILE * f_instru = fopen( instru_path , NULL );
     277    if ( f_instru == NULL )
    188278    {
    189         printf("\n[convol error] cannot open instrumentation file <%s>\n", path );
     279        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
    190280        exit( 0 );
    191281    }
    192282
    193 #if DEBUG_MAIN
    194 get_cycle( &date );
    195 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n",
    196 cxy_main, lid_main, path, (unsigned int)date );
    197 #endif
    198 
    199     // open input file
    200     f_image_in = fopen( IMAGE_IN_PATH , NULL );
    201     if ( f_image_in == NULL )
    202     {
    203         printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH );
    204         exit( 0 );
    205     }
    206 
    207 #if DEBUG_MAIN
    208 get_cycle( &date );
    209 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n",
    210 cxy_main, lid_main, path, (unsigned int)date );
    211 #endif
    212    
    213     // get FBF config
    214     unsigned int  fbf_width;
    215     unsigned int  fbf_height;
    216     unsigned int  fbf_type;
    217     fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
    218 
    219     // check FBF size
    220     if ( (fbf_width != NP) || (fbf_height != NL) )
    221     {
    222         printf("\n[convol error] bad FBF size\n");
    223         exit( 0 );
    224     }
    225 
    226     // check FBF subsampling
    227     if ( fbf_type != FBF_TYPE )
    228     {
    229         printf("\n[convol error] bad FBF subsampling\n");
    230         exit( 0 );
    231     }
    232 
    233     // initialise barrier
     283#if  VERBOSE_MAIN
     284printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
     285cxy_main, lid_main, instru_path );
     286#endif
     287
     288    // main initialise barrier
    234289    if( USE_DQT_BARRIER )
    235290    {
     
    251306    }
    252307
    253     get_cycle( &date );
    254     printf("\n[convol] main on core[%x,%d] completes initialisation at cycle %d\n"
    255            "- CLUSTERS     = %d\n"
    256            "- PROCS        = %d\n"
    257            "- THREADS      = %d\n",
    258            cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads );
    259 
    260     // launch exec threads with explicit placement
    261     unsigned int x;
    262     unsigned int y;
    263     unsigned int l;
    264     unsigned int cxy;
    265  
    266     for( x = 0 ; x < x_size ; x++ )
    267     {
    268         for( y = 0 ; y < y_size ; y++ )
    269         {
    270            cxy = HAL_CXY_FROM_XY(x,y);
    271            for( l = 0 ; l < ncores ; l++ )
    272            {
    273                // no other thread on the core running the main
    274                if( (cxy != cxy_main) || (l != lid_main) )
    275                {
    276                    // define thread attributes
    277                    attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    278                    attr[cxy][l].cxy        = cxy;
    279                    attr[cxy][l].lid        = l;
     308#if VERBOSE_MAIN
     309printf("\n[convol] main on core[%x,%d] completes barrier init\n",
     310cxy_main, lid_main );
     311#endif
     312
     313    // main open input file
     314    int fd_in = open( IMAGE_IN_PATH , O_RDONLY , 0 );
     315
     316    if ( fd_in < 0 )
     317    {
     318        printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH );
     319        exit( 0 );
     320    }
     321
     322#if VERBOSE_MAIN
     323printf("\n[convol] main on core[%x,%d] open file <%s>\n",
     324cxy_main, lid_main, IMAGE_IN_PATH );
     325#endif
     326   
     327    // main thread map image_in buffer to input file
     328    image_in = (unsigned char *)mmap( NULL,
     329                                      NB_PIXELS * IMAGE_IN_PIXEL_SIZE,
     330                                      PROT_READ,
     331                                      MAP_FILE | MAP_SHARED,
     332                                      fd_in,
     333                                      0 );           // offset
     334    if ( image_in == NULL )
     335    {
     336        printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_IN_PATH );
     337        exit( 0 );
     338    }
     339
     340#if  VERBOSE_MAIN
     341printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n",
     342cxy_main, lid_main, IMAGE_IN_PATH );
     343#endif
     344
     345    // main thread open output file
     346    int fd_out = open( IMAGE_OUT_PATH , O_CREAT , 0 );
     347
     348    if ( fd_out < 0 )
     349    {
     350        printf("\n[convol error] main cannot open file %s\n", IMAGE_OUT_PATH );
     351        exit( 0 );
     352    }
     353
     354#if  VERBOSE_MAIN
     355printf("\n[convol] main on core[%x,%d] open file <%s>\n",
     356cxy_main, lid_main, IMAGE_OUT_PATH );
     357#endif
     358
     359    // main thread map image_out buffer to output file
     360    image_out = (unsigned char *)mmap( NULL,
     361                                       NB_PIXELS + IMAGE_OUT_PIXEL_SIZE,
     362                                       PROT_WRITE,
     363                                       MAP_FILE | MAP_SHARED,
     364                                       fd_out,
     365                                       0 );     // offset
     366    if ( image_out == NULL )
     367    {
     368        printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_OUT_PATH );
     369        exit( 0 );
     370    }
     371
     372#if  VERBOSE_MAIN
     373printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n",
     374cxy_main, lid_main, IMAGE_OUT_PATH );
     375#endif
     376
     377    /////////////////////////////////////////////////////////////////////////////////////
     378    get_cycle( &end_sequencial_cycle );
     379    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
     380    /////////////////////////////////////////////////////////////////////////////////////
     381
     382    //////////////////
     383    if( NO_PLACEMENT )
     384    {
     385        // the tid value for the main thread is always 0
     386        // main thread creates new threads with tid in [1,nthreads-1] 
     387        unsigned int tid;
     388        for ( tid = 0 ; tid < nthreads ; tid++ )
     389        {
     390            // register tid value in exec_args[tid] array
     391            exec_args[tid].tid = tid;
     392           
     393            // create other threads
     394            if( tid > 0 )
     395            {
     396                if ( pthread_create( &exec_trdid[tid],
     397                                     NULL,                  // no attribute
     398                                     &execute,
     399                                     &exec_args[tid] ) )
     400                {
     401                    printf("\n[convol error] cannot create thread %d\n", tid );
     402                    exit( 0 );
     403                }
     404
     405#if VERBOSE_MAIN
     406printf("\n[convol] main created thread %d\n", tid );
     407#endif
     408
     409            }
     410            else
     411            {
     412                tid_main = 0;
     413            }
     414        }  // end for tid
     415
     416        // main thread calls itself the execute() function
     417        execute( &exec_args[0] );
     418
     419        // main thread wait other threads completion
     420        for ( tid = 1 ; tid < nthreads ; tid++ )
     421        {
     422            unsigned int * status;
     423
     424            // main wait thread[tid] status
     425            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
     426            {
     427                printf("\n[convol error] main cannot join thread %d\n", tid );
     428                exit( 0 );
     429            }
     430       
     431            // check status
     432            if( *status != THREAD_EXIT_SUCCESS )
     433            {
     434                printf("\n[convol error] thread %x returned failure\n", tid );
     435                exit( 0 );
     436            }
     437
     438#if VERBOSE_MAIN
     439printf("\n[convol] main successfully joined thread %x\n", tid );
     440#endif
     441       
     442        }  // end for tid
     443
     444    }  // end if no_placement
     445
     446    ////////////////////////
     447    if( EXPLICIT_PLACEMENT )
     448    {
     449        // main thread places each other threads on a specific core[cxy][lid]
     450        // but the actual thread creation is sequencial
     451        unsigned int x;
     452        unsigned int y;
     453        unsigned int l;
     454        unsigned int cxy;                   // cluster identifier
     455        unsigned int tid;                   // thread continuous index
     456
     457        for( x = 0 ; x < x_size ; x++ )
     458        {
     459            for( y = 0 ; y < y_size ; y++ )
     460            {
     461                cxy = HAL_CXY_FROM_XY( x , y );
     462                for( l = 0 ; l < ncores ; l++ )
     463                {
     464                    // compute thread continuous index
     465                    tid = (((x  * y_size) + y) * ncores) + l;
     466
     467                    // register tid value in exec_args[tid] array
     468                    exec_args[tid].tid = tid;
     469
     470                    // no thread created on the core running the main
     471                    if( (cxy != cxy_main) || (l != lid_main) )
     472                    {
     473                        // define thread attributes
     474                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
     475                                                    PT_ATTR_CORE_DEFINED;
     476                        exec_attr[tid].cxy        = cxy;
     477                        exec_attr[tid].lid        = l;
    280478 
    281                    // create thread on core[x,y,l]
    282                    if (pthread_create( &trdid[cxy][l],
    283                                        &attr[cxy][l],   
    284                                        &execute,
    285                                        NULL ) )     // execute has no argument
    286                    {
    287                        printf("\n[convol error] created thread %x on core[%x][%d]\n",
    288                        trdid[cxy][l] , cxy , l );
    289                        exit( 0 );
    290                    }
    291                 }
    292             }
    293         }
    294     }   
    295 
    296 /*
    297     // launch other threads without explicit placement
    298     for ( n = 1 ; n < nthreads ; n++ )
    299     {
    300         if ( giet_pthread_create( &trdid[n],
    301                                   NULL,                  // no attribute
    302                                   &execute,
    303                                   NULL ) )               // no argument
    304         {
    305             printf("\n[convol error] creating thread %x\n", trdid[n] );
    306             exit( 0 );
    307         }
    308     }
    309 */
    310 
    311     // the main thread run itself the execute() function
    312     execute();
    313 
    314     // wait other threads completions if explicit threads placement
    315     for( x = 0 ; x < x_size ; x++ )
    316     {
    317         for( y = 0 ; y < y_size ; y++ )
    318         {
    319             unsigned int cxy = HAL_CXY_FROM_XY(x,y);
    320             for( l = 0 ; l < ncores ; l++ )
    321             {
    322                 // no other thread on the core running the main
    323                 if( (cxy != cxy_main) || (l != lid_main) )
    324                 {
    325                     unsigned int * exit_status;
    326 
    327                     // wait thread running on core[x,y,l]
    328                     if (pthread_join( trdid[cxy][l] , (void*)(&exit_status) ) )
     479                        // create thread[tid] on core[cxy][l]
     480                        if ( pthread_create( &exec_trdid[tid],   
     481                                             &exec_attr[tid],   
     482                                             &execute,
     483                                             &exec_args[tid] ) )       
     484                        {
     485                            printf("\n[convol error] cannot create thread %d\n", tid );
     486                            exit( 0 );
     487                        }
     488#if VERBOSE_MAIN
     489printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
     490#endif
     491                    }
     492                    else
    329493                    {
    330                         printf("\n[convol error] main cannot join thread[%x,%d]\n", cxy, l );
    331                         exit( 0 );
    332                     }
    333 
    334                     // check exit_status
    335                     if( *exit_status != 0 )
    336                     {
    337                         printf("\n[convol error] thread[%x,%d]return failure\n", cxy, l );
    338                         exit( 0 );
     494                        tid_main = tid;
    339495                    }
    340496                }
    341497            }
    342498        }
    343     }
    344 /*   
    345     // wait other threads completion when no explicit threads placement
    346     for ( n = 1 ; n < nthreads ; n++ )
    347     {
    348         if ( pthread_join( trdid[n], NULL ) )
    349         {
    350             printf("\n[convol error] joining thread %x\n", trdid[n] );
     499
     500        // main thread calls itself the execute() function
     501        execute( &exec_args[tid_main] );
     502
     503        // main thread wait other threads completion
     504        for( tid = 0 ; tid < nthreads ; tid++ )
     505        {
     506            // no other thread on the core running the main
     507            if( tid != tid_main )
     508            {
     509                unsigned int * status;
     510
     511                // wait thread[tid]
     512                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
     513                {
     514                    printf("\n[convol error] main cannot join thread %d\n", tid );
     515                    exit( 0 );
     516                }
     517       
     518                // check status
     519                if( *status != THREAD_EXIT_SUCCESS )
     520                {
     521                    printf("\n[convol error] thread %d returned failure\n", tid );
     522                    exit( 0 );
     523                }
     524#if VERBOSE_MAIN
     525printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
     526#endif
     527            }
     528        }
     529    }  // end if explicit_placement
     530
     531    ////////////////////////
     532    if( PARALLEL_PLACEMENT )
     533    {
     534        // compute covering DQT size and level
     535        unsigned int z          = (x_size > y_size) ? x_size : y_size;
     536        unsigned int root_level = ((z == 1) ? 0 :
     537                                  ((z == 2) ? 1 :
     538                                  ((z == 4) ? 2 :
     539                                  ((z == 8) ? 3 : 4))));
     540
     541        // create & execute the working threads
     542        if( pthread_parallel_create( root_level , &execute ) )
     543        {
     544            printf("\n[convol error] in %s\n", __FUNCTION__ );
    351545            exit( 0 );
    352546        }
    353     }
    354 */
    355     // call the instrument() function
    356     instrument( nclusters , ncores );
    357 
     547    }  // end if parallel_placement
     548
     549    /////////////////////////////////////////////////////////////////////////////
     550    get_cycle( &end_parallel_cycle );
     551    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
     552    /////////////////////////////////////////////////////////////////////////////
     553
     554    // main thread register instrumentation results
     555    instrument( f_instru , instru_name );
     556
     557    // main thread close input file
     558    close( fd_in );
     559
     560    // main thread close output file
     561    close( fd_out );
     562
     563    // main thread close instrumentation file
     564    fclose( f_instru );
     565
     566    // main thread suicide
    358567    exit( 0 );
    359568   
     
    362571
    363572
    364 //////////////
    365 void execute()
     573
     574
     575
     576///////////////////////////////////////////////////
     577void execute( pthread_parallel_work_args_t * args )
    366578{
    367579    unsigned long long date;
    368580
    369     // Each thread[x,y,p] initialises the convolution kernel parameters in local stack.
     581    // Each thread initialises the convolution kernel parameters in local stack.
    370582    // The values defined in the next 12 lines are Philips proprietary information.
    371583
     
    382594    unsigned int hnorm  = 201;
    383595
    384     // get plat-form config
    385     unsigned int x_size;            // number of clusters in a row
    386     unsigned int y_size;            // number of clusters in a column
    387     unsigned int ncores;            // number of processors per cluster
    388     get_config( &x_size , &y_size , &ncores );
    389 
    390     // get cluster indentifier and core local index
    391     unsigned int cxy;
    392     unsigned int lid;
    393     get_core_id( &cxy , &lid );
    394     unsigned int x = HAL_X_FROM_CXY( cxy );
    395     unsigned int y = HAL_Y_FROM_CXY( cxy );
     596    // WARNING
     597    // A thread is identified by the tid index, defined in the "args" structure.
     598    // This index being in range [0,nclusters*ncores-1] we can always write
     599    //       tid == cid * ncores + lid
     600    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
     601    // If NO_PLACEMENT, there is no relation between these
     602    // thread [cid][lid] indexes and the core coordinates [cxy][lpid].
     603
     604    // get thread abstract identifiers
     605    unsigned int tid = args->tid;
     606    unsigned int cid = tid / ncores;   
     607    unsigned int lid = tid % ncores;
     608
     609#if VERBOSE_EXEC
     610unsigned int cxy;              // core cluster identifier
     611unsigned int lpid;             // core local identifier
     612get_core_id( &cxy , &lpid );
     613printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec\n",
     614tid , cxy , lpid );
     615#endif
     616
     617    // build total number of threads and clusters from global variables
     618    unsigned int nclusters = x_size * y_size;
     619    unsigned int nthreads  = nclusters * ncores;
    396620
    397621    // indexes for loops
     
    401625    unsigned int z;                 // vertical filter index
    402626
    403     unsigned int nclusters  = x_size * y_size;              // number of clusters
    404     unsigned int cluster_id = (x * y_size) + y;             // continuous cluster index
    405     unsigned int thread_id  = (cluster_id * ncores) + lid;  // continuous thread index
    406     unsigned int nthreads   = nclusters * ncores;           // number of threads
    407     unsigned int frame_size = FRAME_SIZE;                   // total size (bytes)
    408     unsigned int lines_per_thread   = NL / nthreads;        // lines per thread
    409     unsigned int lines_per_cluster  = NL / nclusters;       // lines per cluster
    410     unsigned int pixels_per_thread  = NP / nthreads;        // columns per thread
    411     unsigned int pixels_per_cluster = NP / nclusters;       // columns per cluster
     627    unsigned int lines_per_thread   = NL / nthreads;
     628    unsigned int lines_per_cluster  = NL / nclusters;
     629    unsigned int pixels_per_thread  = NP / nthreads;
     630    unsigned int pixels_per_cluster = NP / nclusters;
     631
     632    // compute number of pixels stored in one abstract cluster cid
     633    unsigned int local_pixels = NL * NP / nclusters;       
    412634
    413635    unsigned int first, last;
    414636
    415637    get_cycle( &date );
    416     START[cluster_id][lid] = (unsigned int)date;
    417 
    418     // Each thread[cxy][0] allocate the global buffers in cluster cxy
     638    START[cid][lid] = (unsigned int)date;
     639
     640    // Each thread[cid][0] allocates 5 local buffers,
     641    // shared by all threads that have the same cid
    419642    if ( lid == 0 )
    420643    {
    421 
    422 #if VERBOSE
    423 printf( "\n[convol] thread[%x,%d] enters malloc at cycle %d\n",
    424 cxy , lid , (unsigned int)date );
    425 #endif
    426 
    427         GA[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)   , cxy );
    428         GB[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
    429         GC[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
    430         GD[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
    431         GZ[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)/2 , cxy );
    432        
    433 #if VERBOSE
    434 printf( "\n[convol]  Shared Buffer Virtual Addresses in cluster %x\n"
    435         "### GA = %x\n"
    436         "### GB = %x\n"               
    437         "### GC = %x\n"               
    438         "### GD = %x\n"               
    439         "### GZ = %x\n",
    440         cxy,
    441         GA[cluster_id],
    442         GB[cluster_id],
    443         GC[cluster_id],
    444         GD[cluster_id],
    445         GZ[cluster_id] );
     644        GA[cid] = malloc( local_pixels * sizeof( unsigned short ) );
     645        GB[cid] = malloc( local_pixels * sizeof( int ) );
     646        GC[cid] = malloc( local_pixels * sizeof( int ) );
     647        GD[cid] = malloc( local_pixels * sizeof( int ) );
     648        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );
     649
     650        if( (GA[cid] == NULL) || (GB[cid] == NULL) || (GC[cid] == NULL) ||
     651            (GD[cid] == NULL) || (GZ[cid] == NULL) )
     652        {
     653            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
     654            pthread_exit( &THREAD_EXIT_FAILURE );
     655        }
     656
     657#if VERBOSE_EXEC
     658printf( "\n[convol] exec[%d] on core[%x,%d] allocated shared buffers\n"
     659"### GA = %x\n"
     660"### GB = %x\n"               
     661"### GC = %x\n"               
     662"### GD = %x\n"               
     663"### GZ = %x\n",
     664tid, cxy , lpid, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
    446665#endif
    447666   
     
    451670    pthread_barrier_wait( &barrier );
    452671
    453     // Each thread[cxy,p] initialise in its private stack a copy of the
    454     // arrays of pointers on the shared, distributed buffers.
     672    // Each thread[cid,lid] allocates and initialises in its private stack
     673    // a copy of the arrays of pointers on the distributed buffers.
    455674    unsigned short * A[CLUSTERS_MAX];
    456675    int            * B[CLUSTERS_MAX];
     
    468687    }
    469688
    470     // Each thread[x,y,0] access the file containing the input image, to load
    471     // the local A[c] buffer (frame_size / nclusters loaded in each cluster).
    472     // Other threads are waiting on the barrier.
     689    // Each thread[cid,0] accesses the file containing the input image, to load
     690    // the local A[cid] buffer. Other threads are waiting on the barrier.
    473691    if ( lid==0 )
    474692    {
    475         unsigned int offset = (frame_size/nclusters)*cluster_id;
    476         unsigned int size   = frame_size/nclusters;
    477 
    478         // seek the pointer in file
    479         if ( fseek( f_image_in,
    480                     offset,
    481                     SEEK_SET ) )
    482         {
    483             printf("\n[convol error] in %s : thread[%x,%d] cannot seek input file\n",
    484             __FUNCTION__ , cxy , lid );
    485             pthread_exit( &THREAD_EXIT_FAILURE );
    486         }
    487 
    488         if ( fread( A[cluster_id],
    489                     1,
    490                     size,
    491                     f_image_in ) != size )
    492         {
    493             printf("\n[convol error] in %s : thread[%x,%d] cannot read input file\n",
    494             __FUNCTION__ , cxy , lid );
    495             pthread_exit( &THREAD_EXIT_FAILURE );
    496         }
     693        unsigned int size   = local_pixels * sizeof( unsigned short );
     694        unsigned int offset = size * cid;
     695
     696        memcpy( A[cid],
     697                image_in + offset,
     698                size );
    497699 
    498 #if VERBOSE
     700#if VERBOSE_EXEC
    499701get_cycle( &date );
    500 printf( "\n[convol] thread[%x,%d] load input file at cycle %d\n",
    501 cxy , lid , (unsigned int)date );
     702printf( "\n[convol] thread %d on core[%x,%d] load input file in A[%d]\n",
     703tid , cxy , lpid , cid );
    502704#endif
    503705
     
    505707
    506708    // Optionnal parallel display of the initial image stored in A[c] buffers.
    507     // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).
     709    // Each thread[cid,lid] displays (NL/nthreads) lines.
    508710
    509711    if ( INITIAL_DISPLAY_ENABLE )
     
    516718            line = offset + l;
    517719
     720            // copy TA[cid] to TZ[cid]
    518721            for ( p = 0 ; p < NP ; p++ )
    519722            {
    520                 TZ(cluster_id, line, p) = (unsigned char)(TA(cluster_id, line, p) >> 8);
     723                TZ(cid, line, p) = (unsigned char)(TA(cid, line, p) >> 8);
    521724            }
    522725
    523             if (fbf_write( &TZ(cluster_id, line, 0),                  // first pixel in TZ
    524                            NP,                                        // number of bytes
    525                            NP*(l + (thread_id * lines_per_thread))))  // offset in FBF
     726            // display one line to frame buffer
     727            if (fbf_write( &TZ(cid, line, 0),                     // first pixel in TZ
     728                           NP,                                    // number of bytes
     729                           NP*(l + (tid * lines_per_thread))))    // offset in FBF
    526730            {
    527731                printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n",
     
    531735        }
    532736
    533 #if VERBOSE
     737#if VERBOSE_EXEC
    534738get_cycle( &date );
    535 printf( "\n[convol] thread[%x,%d] completes initial display at cycle %d\n",
    536 cxy , lid , (unsigned int)date );
     739printf( "\n[convol] thread[%d] on core[%x,%d] completes initial display\n",
     740tid , cxy , lpid );
    537741#endif
    538742
     
    543747    ////////////////////////////////////////////////////////////
    544748    // parallel horizontal filter :
    545     // B <= transpose(FH(A))
     749    // B <= convol(FH(A))
    546750    // D <= A - FH(A)
    547     // Each thread computes (NL/nthreads) lines 
     751    // Each thread computes (NL/nthreads) lines.
    548752    // The image must be extended :
    549     // if (z<0)    TA(cluster_id,l,z) == TA(cluster_id,l,0)
    550     // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1)
     753    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
     754    // if (z>NP-1) TA(cid,l,z) == TA(cid,l,NP-1)
    551755    ////////////////////////////////////////////////////////////
    552756
    553757    get_cycle( &date );
    554     H_BEG[cluster_id][lid] = (unsigned int)date;
    555 
    556 #if VERBOSE
    557 printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n",
    558 cxy , lid , (unsigned int)date );
     758    H_BEG[cid][lid] = (unsigned int)date;
     759
     760#if VERBOSE_EXEC
     761printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n",
     762tid , cxy , lpid );
    559763#else
    560 if ( (cxy == cxy_main) && (lid == lid_main) )
    561 printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n",
    562 cxy , lid , (unsigned int)date );
     764if ( tid == tid_main )
     765printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n",
     766tid , cxy , lpid );
    563767#endif
    564768
     
    566770    // first & last define which lines are handled by a given thread
    567771
    568     first = thread_id * lines_per_thread;
     772    first = tid * lines_per_thread;
    569773    last  = first + lines_per_thread;
    570774
     
    626830
    627831    get_cycle( &date );
    628     H_END[cluster_id][lid] = (unsigned int)date;
    629 
    630 #if VERBOSE
    631 printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n",
    632 cxy , lid, (unsigned int)date );
     832    H_END[cid][lid] = (unsigned int)date;
     833
     834#if VERBOSE_EXEC
     835printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n",
     836tid , cxy , lpid );
    633837#else
    634 if ( (cxy == cxy_main) && (lid == lid_main) )
    635 printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n",
    636 cxy , lid, (unsigned int)date );
     838if ( tid == tid_main )
     839printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n",
     840tid , cxy , lpid );
    637841#endif
    638842
     
    645849    // Each thread computes (NP/nthreads) columns
    646850    // The image must be extended :
    647     // if (l<0)    TB(cluster_id,p,l) == TB(cluster_id,p,0)
    648     // if (l>NL-1)   TB(cluster_id,p,l) == TB(cluster_id,p,NL-1)
     851    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
     852    // if (l>NL-1)   TB(cid,p,l) == TB(cid,p,NL-1)
    649853    ///////////////////////////////////////////////////////////////
    650854
    651855    get_cycle( &date );
    652     V_BEG[cluster_id][lid] = (unsigned int)date;
    653 
    654 #if VERBOSE
    655 printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n",
    656 cxy , lid , (unsigned int)date );
     856    V_BEG[cid][lid] = (unsigned int)date;
     857
     858#if VERBOSE_EXEC
     859printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n",
     860tid , cxy , lpid );
    657861#else
    658 if ( (cxy == cxy_main) && (lid == lid_main) )
    659 printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n",
    660 cxy , lid, (unsigned int)date );
     862if ( tid == tid_main )
     863printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n",
     864tid , cxy , lpid );
    661865#endif
    662866
     
    664868    // first & last define which pixels are handled by a given thread
    665869
    666     first = thread_id * pixels_per_thread;
     870    first = tid * pixels_per_thread;
    667871    last  = first + pixels_per_thread;
    668872
     
    740944
    741945    get_cycle( &date );
    742     V_END[cluster_id][lid] = (unsigned int)date;
    743 
    744 #if VERBOSE
    745 printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n",
    746 cxy , lid , (unsigned int)date );
     946    V_END[cid][lid] = (unsigned int)date;
     947
     948#if VERBOSE_EXEC
     949printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n",
     950tid , cxy , lid );
    747951#else
    748 if ( (cxy == cxy_main) && (lid == lid_main) )
    749 printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n",
    750 cxy , lid, (unsigned int)date );
     952if ( tid == tid_main )
     953printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n",
     954tid , cxy , lid );
    751955#endif
    752956
     
    755959
    756960    // Optional parallel display of the final image Z <= D + C
    757     // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).
     961    // Each thread[x,y,p] displays (NL/nthreads) lines.
    758962
    759963    if ( FINAL_DISPLAY_ENABLE )
    760964    {
    761965        get_cycle( &date );
    762         D_BEG[cluster_id][lid] = (unsigned int)date;
    763 
    764 #if VERBOSE
    765 printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n",
    766 cxy , lid , (unsigned int)date );
     966        D_BEG[cid][lid] = (unsigned int)date;
     967
     968#if VERBOSE_EXEC
     969printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n",
     970tid , cxy , lid );
    767971#else
    768 if ( (cxy == cxy_main) && (lid == lid_main) )
    769 printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n",
    770 cxy , lid, (unsigned int)date );
     972if ( tid == tid_main )
     973printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n",
     974tid , cxy , lid );
    771975#endif
    772976
     
    780984            for ( p = 0 ; p < NP ; p++ )
    781985            {
    782                 TZ(cluster_id, line, p) =
    783                    (unsigned char)( (TD(cluster_id, line, p) +
    784                                      TC(cluster_id, line, p) ) >> 8 );
     986                TZ(cid, line, p) =
     987                   (unsigned char)( (TD(cid, line, p) +
     988                                     TC(cid, line, p) ) >> 8 );
    785989            }
    786990
    787             if (fbf_write( &TZ(cluster_id, line, 0),                  // first pixel in TZ
    788                            NP,                                        // number of bytes
    789                            NP*(l + (thread_id * lines_per_thread))))  // offset in FBF
     991            if (fbf_write( &TZ(cid, line, 0),                   // first pixel in TZ
     992                           NP,                                  // number of bytes
     993                           NP*(l + (tid * lines_per_thread))))  // offset in FBF
    790994            {
    791                 printf("\n[convol error] in %s : thread[%d,%d,%d] cannot access FBF\n",
    792                 __FUNCTION__ , x , y , lid );
     995                printf("\n[convol error] thread[%d] cannot access FBF\n", tid );
    793996                pthread_exit( &THREAD_EXIT_FAILURE );
    794997            }
     
    796999
    7971000        get_cycle( &date );
    798         D_END[cluster_id][lid] = (unsigned int)date;
    799 
    800 #if VERBOSE
    801 printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n",
    802 cxy , lid , (unsigned int)date );
     1001        D_END[cid][lid] = (unsigned int)date;
     1002
     1003#if VERBOSE_EXEC
     1004printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n",
     1005tid , cxy , lid );
    8031006#else
    804 if ( (cxy == cxy_main) && (lid == lid_main) )
    805 printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n",
    806 cxy , lid , (unsigned int)date );
    807 #endif
    808      
    809         ////////////////////////////////
    810         pthread_barrier_wait( &barrier );
     1007if ( tid == tid_main )
     1008printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n",
     1009tid , cxy , lid );
     1010#endif
     1011
    8111012    }
    8121013
    8131014    // all threads (but the one executing main) exit
    814     if ( (cxy != cxy_main) || (lid != lid_main) )
     1015    if ( tid != tid_main )
    8151016    {
    8161017        pthread_exit( &THREAD_EXIT_SUCCESS );
     
    8211022
    8221023
    823 /////////////////////////////////////////
    824 void instrument( unsigned int nclusters,
    825                  unsigned int ncores )
     1024//////////////////////////
     1025void instrument( FILE * f,
     1026                 char * filename )
    8261027{
    827         unsigned int cc, pp;
    828 
    829         unsigned int min_start = 0xFFFFFFFF;
    830         unsigned int max_start = 0;
    831 
    832         unsigned int min_h_beg = 0xFFFFFFFF;
    833         unsigned int max_h_beg = 0;
    834 
    835         unsigned int min_h_end = 0xFFFFFFFF;
    836         unsigned int max_h_end = 0;
    837 
    838         unsigned int min_v_beg = 0xFFFFFFFF;
    839         unsigned int max_v_beg = 0;
    840 
    841         unsigned int min_v_end = 0xFFFFFFFF;
    842         unsigned int max_v_end = 0;
    843 
    844         unsigned int min_d_beg = 0xFFFFFFFF;
    845         unsigned int max_d_beg = 0;
    846 
    847         unsigned int min_d_end = 0xFFFFFFFF;
    848         unsigned int max_d_end = 0;
    849 
    850         for (cc = 0; cc < nclusters; cc++)
    851         {
    852             for (pp = 0; pp < ncores; pp++ )
    853             {
    854                 if (START[cc][pp] < min_start) min_start = START[cc][pp];
    855                 if (START[cc][pp] > max_start) max_start = START[cc][pp];
    856 
    857                 if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
    858                 if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
    859 
    860                 if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
    861                 if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
    862 
    863                 if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
    864                 if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
    865 
    866                 if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
    867                 if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
    868 
    869                 if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp];
    870                 if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp];
    871 
    872                 if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp];
    873                 if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp];
    874             }
    875         }
    876 
    877         printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
    878                min_start, max_start, (min_start+max_start)/2, max_start-min_start);
    879 
    880         printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
    881                min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
    882 
    883         printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
    884                min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
    885 
    886         printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
    887                min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
    888 
    889         printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
    890                min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
    891 
    892         printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
    893                min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);
    894 
    895         printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
    896                min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);
    897 
    898         printf( "\n General Scenario (Kcycles for each step)\n" );
    899         printf( " - BOOT OS           = %d\n", (min_start            )/1000 );
    900         printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
    901         printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
    902         printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
    903         printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
    904         printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
    905         printf( " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );
    906 
    907         // TODO save these results on f_instrum
     1028    unsigned int nclusters = x_size * y_size;
     1029
     1030    unsigned int cc, pp;
     1031
     1032    unsigned int min_start = 0xFFFFFFFF;
     1033    unsigned int max_start = 0;
     1034
     1035    unsigned int min_h_beg = 0xFFFFFFFF;
     1036    unsigned int max_h_beg = 0;
     1037
     1038    unsigned int min_h_end = 0xFFFFFFFF;
     1039    unsigned int max_h_end = 0;
     1040
     1041    unsigned int min_v_beg = 0xFFFFFFFF;
     1042    unsigned int max_v_beg = 0;
     1043
     1044    unsigned int min_v_end = 0xFFFFFFFF;
     1045    unsigned int max_v_end = 0;
     1046
     1047    unsigned int min_d_beg = 0xFFFFFFFF;
     1048    unsigned int max_d_beg = 0;
     1049
     1050    unsigned int min_d_end = 0xFFFFFFFF;
     1051    unsigned int max_d_end = 0;
     1052
     1053    for (cc = 0; cc < nclusters; cc++)
     1054    {
     1055        for (pp = 0; pp < ncores; pp++ )
     1056        {
     1057            if (START[cc][pp] < min_start) min_start = START[cc][pp];
     1058            if (START[cc][pp] > max_start) max_start = START[cc][pp];
     1059
     1060            if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
     1061            if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
     1062
     1063            if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
     1064            if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
     1065
     1066            if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
     1067            if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
     1068
     1069            if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
     1070            if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
     1071
     1072            if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp];
     1073            if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp];
     1074
     1075            if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp];
     1076            if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp];
     1077        }
     1078    }
     1079
     1080    // display on terminal
     1081    printf( "\n ------ %s ------\n" , filename );
     1082
     1083    printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
     1084           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
     1085
     1086    printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1087           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
     1088
     1089    printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
     1090           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
     1091
     1092    printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1093           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
     1094
     1095    printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
     1096           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
     1097
     1098    printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1099           min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);
     1100
     1101    printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
     1102           min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);
     1103
     1104    printf( "\n General Scenario (Kcycles for each step)\n" );
     1105    printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
     1106    printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
     1107    printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
     1108    printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
     1109    printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
     1110    printf( " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );
     1111    printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
     1112
     1113    // save on disk
     1114    fprintf( f ,  "\n ------ %s ------\n" , filename );
     1115
     1116    fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n",
     1117           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
     1118
     1119    fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1120           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
     1121
     1122    fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n",
     1123           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
     1124
     1125    fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1126           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
     1127
     1128    fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n",
     1129           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
     1130
     1131    fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1132           min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);
     1133
     1134    fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n",
     1135           min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);
     1136
     1137    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
     1138    fprintf( f ,  " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
     1139    fprintf( f ,  " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
     1140    fprintf( f ,  " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
     1141    fprintf( f ,  " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
     1142    fprintf( f ,  " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
     1143    fprintf( f ,  " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );
     1144    fprintf( f ,  " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
    9081145
    9091146} // end instrument()
  • trunk/user/display/display.ld

    r644 r652  
    1 /****************************************************************************
     1/***************************************************************************
    22* Define the base address for user code (both .text and .data)
    3 *****************************************************************************/
     3***************************************************************************/
    44
    55seg_code_base      = 0x400000;
  • trunk/user/fft/fft.c

    r649 r652  
    1515/*************************************************************************/
    1616
    17 ///////////////////////////////////////////////////////////////////////////
     17////////////////////////////////////////////////////////////////////////////////////////
    1818// This port of the SPLASH FFT benchmark on the ALMOS-MKH OS has been
    1919// done by Alain Greiner (august 2018).
     
    4545// that contains all coefs required for a rootN points FFT.
    4646//
    47 // There is one working thread per core.
    4847// The actual number of cores and cluster in a given hardware architecture
    4948// is obtained by the get_config() syscall (x_size, y_size, ncores).
     
    5150// The max number of cores per cluster is bounded by CORES_MAX.
    5251//
    53 // Several configuration parameters can be defined below:
    54 //  - PRINT_ARRAY : Print out complex data points arrays.
    55 //  - CHECK       : Perform both FFT and inverse FFT to check output/input.
    56 //  - DEBUG_MAIN  : Display intermediate results in main()
    57 //  - DEBUG_FFT1D : Display intermediate results in FFT1D()
    58 //  - DEBUG_ROW   : Display intermedite results in FFTrow()
     52// The number N of working threads is always defined by the number of cores available
     53// in the architecture, but this application supports three placement modes.
     54// In all modes, the working threads are identified by the [tid] continuous index
     55// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
     56// This continuous index can always be decomposed in two continuous sub-indexes:
     57// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
     58//
     59// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
     60//   threads are created by the main thread, but the placement is done by the OS, using
     61//   the DQDT for load balancing, and two working threads can be placed on the same core.
     62//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
     63//   cluster or a physical core. In this mode, the main thread runs on any cluster,
     64//   but has tid = 0 (i.e. cid = 0 & lid = 0).
     65//
     66// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
     67//   of the threads on the cores is explicitly controlled by the main thread to have
     68//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
     69//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
     70//   physical cluster identifier, and [lid] is the local core index.
     71//
     72// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
     73//   non standard pthread_parallel_create() function to avoid the costly sequential
     74//   loops for pthread_create() and pthread_join(). It guarantees one working thread
     75//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
     76//
     77// Several other configuration parameters can be defined below:
     78//  - USE_DQT_BARRIER : use a hierarchical barrier for working threads synchro
     79//  - PRINT_ARRAY     : Print out complex data points arrays.
     80//  - CHECK           : Perform both FFT and inverse FFT to check output/input.
     81//  - DEBUG_MAIN      : Display intermediate results in main()
     82//  - DEBUG_FFT1D     : Display intermediate results in FFT1D()
     83//  - DEBUG_ROW       : Display intermediate results in FFTrow()
    5984//
    6085// Regarding final instrumentation:
     
    6691//   is computed by each thread(i) in the work() function.
    6792// The results are displayed on the TXT terminal, and registered on disk.
    68 ///////////////////////////////////////////////////////////////////////////
     93///////////////////////////////////////////////////////////////////////////////////////
    6994
    7095#include <math.h>
     
    92117// parameters
    93118
     119#define NO_PLACEMENT            1
     120#define EXPLICIT_PLACEMENT      0
     121#define PARALLEL_PLACEMENT      0
     122
    94123#define DEFAULT_M               18              // 256 K complex points
    95124#define USE_DQT_BARRIER         1               // use DQT barrier if non zero
     
    110139/////////////////////////////////////////////////////////////////////////////////////
    111140
    112 // work function arguments
    113 typedef struct work_args_s
    114 {
    115     unsigned int        tid;               // thread continuous index
    116     unsigned int        lid;               // core local index
    117     unsigned int        cid;               // cluster continuous index
    118     pthread_barrier_t * parent_barrier;    // parent barrier to signal completion
    119 }
    120 work_args_t;
     141unsigned int   x_size;                     // platform global parameter
     142unsigned int   y_size;                     // platform global parameter
     143unsigned int   ncores;                     // platform global parameter
    121144
    122145unsigned int   nthreads;                   // total number of threads (one thread per core)
     
    130153// arrays of pointers on distributed buffers (one sub-buffer per cluster)
    131154double *       data[CLUSTERS_MAX];         // original time-domain data
    132 double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
     155double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
    133156double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    134157double *       bloup[CLUSTERS_MAX];        // used as auxiliary space for DFT
     
    146169pthread_barrierattr_t  barrier_attr;
    147170
    148 /////////////////////////////////////////////////////////////////////////////////////
    149 //             Global variables required by parallel_pthread_create()
    150 /////////////////////////////////////////////////////////////////////////////////////
    151 
    152 // 2D arrays of input arguments for the <work> threads
    153 // These arrays are initialised by the application main thread
    154 
    155 work_args_t       work_args[CLUSTERS_MAX][CORES_MAX];  // work function arguments
    156 work_args_t     * work_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
    157 
    158 // 1D array of barriers to allow the <work> threads to signal termination
    159 // this array is initialised in each cluster by the <build[cxy][0]> thread
    160  
    161 pthread_barrier_t parent_barriers[CLUSTERS_MAX];        // termination barrier
     171//return values at thread exit
     172unsigned int   THREAD_EXIT_SUCCESS = 0;
     173unsigned int   THREAD_EXIT_FAILURE = 1;
     174
     175// main thread continuous index
     176unsigned int     tid_main;
     177
     178// array of kernel thread identifiers / indexed by [tid]
     179pthread_t      work_trdid[CLUSTERS_MAX * CORES_MAX];   
     180
     181// array of thread attributes / indexed by [tid]
     182pthread_attr_t work_attr[CLUSTERS_MAX * CORES_MAX];
     183
     184// array of work function arguments / indexed by [tid]
     185pthread_parallel_work_args_t work_args[CLUSTERS_MAX * CORES_MAX];
    162186
    163187/////////////////////////////////////////////////////////////////////////////////////
     
    165189/////////////////////////////////////////////////////////////////////////////////////
    166190
    167 void work( work_args_t * args );
     191void work( pthread_parallel_work_args_t * args );
    168192
    169193double CheckSum( void );
     
    234258    int                 error;
    235259
    236     unsigned int        x_size;            // number of clusters per row
    237     unsigned int        y_size;            // number of clusters per column
    238     unsigned int        ncores;            // max number of cores per cluster
    239 
    240 
    241     unsigned int        x;                 // current index for cluster X coordinate
    242     unsigned int        y;                 // current index for cluster Y coordinate
    243     unsigned int        lid;               // current index for core in a cluster
    244260    unsigned int        tid;               // continuous thread index
    245     unsigned int        cid;               // cluster continuous index
    246     unsigned int        cxy;               // hardware specific cluster identifier
    247261
    248262    char                name[64];          // instrumentation file name
     
    265279    int                 pid = getpid();
    266280
     281    // check placement mode
     282    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
     283    {
     284        printf("\n[fft error] illegal placement mode\n");
     285        exit( 0 );
     286    }
     287
    267288    // get FFT application start cycle
    268289    get_cycle( &start_init_cycle );
     
    295316        exit( 0 );
    296317    }
     318
     319    // get identifiers for core executing main
     320    unsigned int  cxy_main;
     321    unsigned int  lid_main;
     322    get_core_id( &cxy_main , &lid_main );
    297323
    298324    // compute nthreads and nclusters
     
    317343    }
    318344
    319     printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n",
    320     N, nthreads, pid, (unsigned int)start_init_cycle );
    321 
    322     // build instrumentation file name
    323     if( USE_DQT_BARRIER )
    324     snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
    325     else
    326     snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
    327 
    328     // build pathname
     345    // define instrumentation file name
     346    if( NO_PLACEMENT )
     347    {
     348        printf("\n[fft] starts / %d points / %d thread(s) / PID %x / NO_PLACE\n",
     349        N, nthreads, pid );
     350
     351        // build instrumentation file name
     352        if( USE_DQT_BARRIER )
     353        snprintf( name , 64 , "fft_dqt_no_place_%d_%d_%d", M , x_size * y_size , ncores );
     354        else
     355        snprintf( name , 64 , "fft_smp_no_place_%d_%d_%d", M , x_size * y_size , ncores );
     356    }
     357
     358    if( EXPLICIT_PLACEMENT )
     359    {
     360        printf("\n[fft] starts / %d points / %d thread(s) / PID %x / EXPLICIT\n",
     361        N, nthreads, pid );
     362
     363        // build instrumentation file name
     364        if( USE_DQT_BARRIER )
     365        snprintf( name , 64 , "fft_dqt_explicit_%d_%d_%d", M , x_size * y_size , ncores );
     366        else
     367        snprintf( name , 64 , "fft_smp_explicit_%d_%d_%d", M , x_size * y_size , ncores );
     368    }
     369
     370    if( PARALLEL_PLACEMENT )
     371    {
     372        printf("\n[fft] starts / %d points / %d thread(s) / PID %x / PARALLEL\n",
     373        N, nthreads, pid );
     374
     375        // build instrumentation file name
     376        if( USE_DQT_BARRIER )
     377        snprintf( name , 64 , "fft_dqt_parallel_%d_%d_%d", M , x_size * y_size , ncores );
     378        else
     379        snprintf( name , 64 , "fft_smp_parallel_%d_%d_%d", M , x_size * y_size , ncores );
     380    }
     381
     382    // build instrumentation file pathname
    329383    snprintf( path , 128 , "/home/%s", name );
    330384
     
    339393#if DEBUG_MAIN
    340394get_cycle( &debug_cycle );
    341 printf("\n[fft] main open file <%s> at cycle %d\n",
     395printf("\n[fft] main open instrumentation file <%s> at cycle %d\n",
    342396path, (unsigned int)debug_cycle );
    343397#endif
     
    381435#if DEBUG_MAIN
    382436get_cycle( &debug_cycle );
    383 printf("\n[fft] main completes barrier init at cycle %d\n",
     437printf("\n[fft] main completes sequencial initialisation at cycle %d\n",
    384438(unsigned int)debug_cycle );
    385439#endif
    386 
    387     // build array of arguments for the <work> threads
    388     for (x = 0 ; x < x_size ; x++)
    389     {
    390         for (y = 0 ; y < y_size ; y++)
    391         {
    392             // compute cluster identifier
    393             cxy = HAL_CXY_FROM_XY( x , y );
    394 
    395             for ( lid = 0 ; lid < ncores ; lid++ )
    396             {
    397                 // compute cluster continuous index
    398                 cid = (x * y_size) + y;
    399 
    400                 // compute work thread continuous index
    401                 tid = (cid * ncores) + lid;
    402                
    403                 // initialize 2D array of arguments
    404                 work_args[cxy][lid].tid            = tid;
    405                 work_args[cxy][lid].lid            = lid;
    406                 work_args[cxy][lid].cid            = cid;
    407                 work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
    408 
    409                 // initialize 2D array of pointers
    410                 work_ptrs[cxy][lid] = &work_args[cxy][lid];
    411             }
    412         }
    413     }
    414440
    415441    // register sequencial time
     
    417443    init_time = (unsigned int)(end_init_cycle - start_init_cycle);
    418444
     445    //////////////////
     446    if( NO_PLACEMENT )
     447    {
     448        // the tid value for the main thread is always 0
     449        // main thread creates new threads with tid in [1,nthreads-1] 
     450        unsigned int tid;
     451        for ( tid = 0 ; tid < nthreads ; tid++ )
     452        {
     453            // register tid value in work_args[tid] array
     454            work_args[tid].tid = tid;
     455           
     456            // create other threads
     457            if( tid > 0 )
     458            {
     459                if ( pthread_create( &work_trdid[tid],
     460                                     NULL,                  // no attribute
     461                                     &work,
     462                                     &work_args[tid] ) )
     463                {
     464                    printf("\n[fft error] cannot create thread %d\n", tid );
     465                    exit( 0 );
     466                }
     467
    419468#if DEBUG_MAIN
    420 printf("\n[fft] main completes <work> threads arguments at cycle %d\n",
    421 (unsigned int)end_init_cycle );
    422 #endif
    423 
    424     // create and execute the working threads
    425     if( pthread_parallel_create( root_level,
    426                                  &work,
    427                                  &work_ptrs[0][0],
    428                                  &parent_barriers[0] ) )
    429     {
    430         printf("\n[fft error] creating threads\n");
    431         exit( 0 );
     469printf("\n[fft] main created thread %d\n", tid );
     470#endif
     471
     472            }
     473            else
     474            {
     475                tid_main = 0;
     476            }
     477        }  // end for tid
     478
     479        // main thread calls itself the execute() function
     480        work( &work_args[0] );
     481
     482        // main thread wait other threads completion
     483        for ( tid = 1 ; tid < nthreads ; tid++ )
     484        {
     485            unsigned int * status;
     486
     487            // main wait thread[tid] status
     488            if ( pthread_join( work_trdid[tid], (void*)(&status)) )
     489            {
     490                printf("\n[fft error] main cannot join thread %d\n", tid );
     491                exit( 0 );
     492            }
     493       
     494            // check status
     495            if( *status != THREAD_EXIT_SUCCESS )
     496            {
     497                printf("\n[fft error] thread %x returned failure\n", tid );
     498                exit( 0 );
     499            }
     500
     501#if DEBUG_MAIN
     502printf("\n[fft] main successfully joined thread %x\n", tid );
     503#endif
     504       
     505        }  // end for tid
     506
     507    }  // end if no_placement
     508
     509    ////////////////////////
     510    if( EXPLICIT_PLACEMENT )
     511    {
     512        // main thread places each thread[tid] on a specific core[cxy][lid]
     513        // but the actual thread creation is sequential
     514        unsigned int x;
     515        unsigned int y;
     516        unsigned int l;
     517        unsigned int cxy;                   // cluster identifier
     518        unsigned int tid;                   // thread continuous index
     519
     520        for( x = 0 ; x < x_size ; x++ )
     521        {
     522            for( y = 0 ; y < y_size ; y++ )
     523            {
     524                cxy = HAL_CXY_FROM_XY( x , y );
     525                for( l = 0 ; l < ncores ; l++ )
     526                {
     527                    // compute thread continuous index
     528                    tid = (((x  * y_size) + y) * ncores) + l;
     529
     530                    // register tid value in work_args[tid] array
     531                    work_args[tid].tid = tid;
     532
     533                    // no thread created on the core running the main
     534                    if( (cxy != cxy_main) || (l != lid_main) )
     535                    {
     536                        // define thread attributes
     537                        work_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
     538                                                    PT_ATTR_CORE_DEFINED;
     539                        work_attr[tid].cxy        = cxy;
     540                        work_attr[tid].lid        = l;
     541 
     542                        // create thread[tid] on core[cxy][l]
     543                        if ( pthread_create( &work_trdid[tid],   
     544                                             &work_attr[tid],   
     545                                             &work,
     546                                             &work_args[tid] ) )       
     547                        {
     548                            printf("\n[fft error] cannot create thread %d\n", tid );
     549                            exit( 0 );
     550                        }
     551#if DEBUG_MAIN
     552printf("\n[fft] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
     553#endif
     554                    }
     555                    else
     556                    {
     557                        tid_main = tid;
     558                    }
     559                }
     560            }
     561        }
     562
     563        // main thread calls itself the work() function
     564        work( &work_args[tid_main] );
     565
     566        // main thread wait other threads completion
     567        for( tid = 0 ; tid < nthreads ; tid++ )
     568        {
     569            // no other thread on the core running the main
     570            if( tid != tid_main )
     571            {
     572                unsigned int * status;
     573
     574                // wait thread[tid]
     575                if( pthread_join( work_trdid[tid] , (void*)(&status) ) )
     576                {
     577                    printf("\n[fft error] main cannot join thread %d\n", tid );
     578                    exit( 0 );
     579                }
     580       
     581                // check status
     582                if( *status != THREAD_EXIT_SUCCESS )
     583                {
     584                    printf("\n[fft error] thread %d returned failure\n", tid );
     585                    exit( 0 );
     586                }
     587#if DEBUG_MAIN
     588printf("\n[fft] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
     589#endif
     590            }
     591        }
     592    }  // end if explicit_placement
     593
     594    ////////////////////////
     595    if( PARALLEL_PLACEMENT )
     596    {
     597        // create and execute the working threads
     598        if( pthread_parallel_create( root_level , &work ) )
     599        {
     600            printf("\n[fft error] cannot create threads\n");
     601            exit( 0 );
     602        }
    432603    }
    433604
     
    533704// This function is executed in parallel by all <work> threads.
    534705/////////////////////////////////////////////////////////////////
    535 void work( work_args_t * args )
     706void work( pthread_parallel_work_args_t * args )
    536707{
    537708    unsigned int        tid;              // this thread continuous index
     
    549720    unsigned long long  barrier_stop;
    550721
     722    get_cycle( &parallel_start );
     723
    551724    // get thread arguments
    552725    tid            = args->tid;
    553     lid            = args->lid;             
    554     cid            = args->cid;             
    555     parent_barrier = args->parent_barrier;
    556 
    557     get_cycle( &parallel_start );
    558 
     726    parent_barrier = args->barrier;
     727
     728    // compute lid and cid from tid
     729    lid            = tid % ncores;             
     730    cid            = tid / ncores;
     731           
    559732#if DEBUG_WORK
    560733printf("\n[fft] %s : thread %d enter / cycle %d\n",
     
    602775printf("\n[fft] %s : thread %d exit barrier for buffer allocation / cycle %d\n",
    603776__FUNCTION__, tid, (unsigned int)barrier_stop );
    604 #endif
    605 
    606 #if DISPLAY_SCHED_AND_VMM
    607     unsigned int x_size;
    608     unsigned int y_size;
    609     unsigned int ncores;
    610     get_config( &x_size , &y_size , &ncores );
    611     unsigned int x   = cid / y_size;
    612     unsigned int y   = cid % y_size;
    613     unsigned int cxy = HAL_CXY_FROM_XY( x , y );
    614 display_sched( cxy , lid );
    615 if( lid == 0 ) display_vmm( cxy , getpid() , 0 );
    616777#endif
    617778
     
    9191080// contained in the distributed buffers x[nclusters][points_per_cluster].
    9201081// It handles the (N) points 1D array as a (rootN*rootN) points 2D array.
    921 // 1) it transpose (rootN/nthreads ) rows from x to tmp.
     1082// 1) it transpose (rootN/nthreads ) rows from x to tmp.
    9221083// 2) it make (rootN/nthreads) FFT on the tmp rows and apply the twiddle factor.
    923 // 3) it transpose (rootN/nthreads) columns from tmp to x.
     1084// 3) it transpose (rootN/nthreads) columns from tmp to x.
    9241085// 4) it make (rootN/nthreads) FFT on the x rows.
    9251086// It calls the FFTRow() 2*(rootN/nthreads) times to perform the in place FFT
     
    9461107#endif
    9471108
    948     // transpose (rootN/nthreads) rows from x to tmp
     1109    // transpose (rootN/nthreads) rows from x to tmp
    9491110    Transpose( x , tmp , MyFirst , MyLast );
    9501111
    9511112#if( DEBUG_FFT1D & 1 )
    9521113get_cycle( &cycle );
    953 printf("\n[fft] %s : thread %d after first transpose / cycle %d\n",
     1114printf("\n[fft] %s : thread %d after first fft / cycle %d\n",
    9541115__FUNCTION__, tid, (unsigned int)cycle );
    9551116if( PRINT_ARRAY ) PrintArray( tmp , N );
     
    9641125#if( DEBUG_FFT1D & 1 )
    9651126get_cycle( &cycle );
    966 printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n",
     1127printf("\n[fft] %s : thread %d exit barrier after first fft / cycle %d\n",
    9671128__FUNCTION__, tid, (unsigned int)cycle );
    9681129#endif
     
    9921153    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    9931154
    994     // transpose tmp to x
     1155    // transpose tmp to x
    9951156    Transpose( tmp , x , MyFirst , MyLast );
    9961157
    9971158#if( DEBUG_FFT1D & 1 )
    998 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);
     1159printf("\n[fft] %s : thread %d after second fft\n", __FUNCTION__, tid);
    9991160if( PRINT_ARRAY ) PrintArray( x , N );
    10001161#endif
     
    10061167
    10071168#if( DEBUG_FFT1D & 1 )
    1008 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);
     1169printf("\n[fft] %s : thread %d exit barrier after second fft\n", __FUNCTION__, tid);
    10091170#endif
    10101171
     
    10331194    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    10341195
    1035     // transpose x to tmp
     1196    // transpose x to tmp
    10361197    Transpose( x , tmp , MyFirst , MyLast );
    10371198
    10381199#if( DEBUG_FFT1D & 1 )
    1039 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);
     1200printf("\n[fft] %s : thread %x after third fft\n", __FUNCTION__, tid);
    10401201if( PRINT_ARRAY ) PrintArray( x , N );
    10411202#endif
     
    10471208
    10481209#if( DEBUG_FFT1D & 1 )
    1049 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);
     1210printf("\n[fft] %s : thread %d exit barrier after third fft\n", __FUNCTION__, tid);
    10501211#endif
    10511212
  • trunk/user/ksh/ksh.c

    r647 r652  
    6161#define DEBUG_CMD_CAT       0
    6262#define DEBUG_CMD_CP        0
    63 #define DEBUG_CMD_LOAD      1
     63#define DEBUG_CMD_LOAD      0
    6464#define DEBUG_CMD_LS        0
    6565#define DEBUG_CMD_PS        0
     
    12261226
    12271227
    1228 /* 1. first direct command
     1228// 1. first direct command
    12291229if( sem_wait( &semaphore ) )
    12301230{
     
    12381238    execute( cmd );
    12391239}
    1240 */
     1240//
    12411241
    12421242
  • trunk/user/sort/sort.c

    r637 r652  
    6969#define INSTRUMENTATION     1               // register computation times on file
    7070
    71 ///////////////////////////////////////////////////////////////////////////////////
    72 //            Arguments for the sort() function
    73 ///////////////////////////////////////////////////////////////////////////////////
    74 
    75 typedef struct
    76 {
    77     unsigned int        tid;                // continuous thread index
    78     unsigned int        threads;            // total number of threads
    79     pthread_barrier_t * parent_barrier;     // pointer on termination barrier
    80 }
    81 sort_args_t;
    82 
    8371////////////////////////////////////////////////////////////////////////////////////
    8472//            Sort specific global variables
     
    8876int                 array1[ARRAY_LENGTH];   
    8977
     78unsigned int        threads;                // total number of working threads
     79
    9080pthread_barrier_t   barrier;                 // synchronisation variables
    9181
     
    9383//             Global variables required by parallel_pthread_create()
    9484/////////////////////////////////////////////////////////////////////////////////////
    95 
    96 // 2D arrays of input arguments for the <sort> threads
    97 // These arrays are initialised by the application main thread
    98 
    99 sort_args_t       sort_args[CLUSTERS_MAX][CORES_MAX];  // sort function arguments
    100 sort_args_t     * sort_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
    101 
    102 // 1D array of barriers to allow the <sort> threads to signal termination
    103 // this array is initialised by the pthread_parallel_create() function
    104  
    105 pthread_barrier_t parent_barriers[CLUSTERS_MAX];       // termination barrier
    10685
    10786
     
    174153}  // end merge()
    175154
    176 //////////////////////////////
    177 void sort( sort_args_t * ptr )
     155///////////////////////////////////////////////
     156void sort( pthread_parallel_work_args_t * ptr )
    178157{
    179158    unsigned int        i;
     
    183162    // get arguments
    184163    unsigned int        tid            = ptr->tid;
    185     unsigned int        threads        = ptr->threads;
    186     pthread_barrier_t * parent_barrier = ptr->parent_barrier;
     164    pthread_barrier_t * parent_barrier = ptr->barrier;
    187165
    188166    unsigned int        items      = ARRAY_LENGTH / threads;
     
    190168
    191169#if DEBUG_SORT
    192 printf("\n[sort] start : ptr %x / tid %d / threads %d / barrier %x\n",
     170printf("\n[sort] start : ptr %x / tid %d / threads %d / parent_barrier %x\n",
    193171ptr, tid, threads, parent_barrier );
    194172#endif
     
    249227    }  // end for stages
    250228
    251     // sort thread signal completion to main thread
     229    // sort thread signal completion to pthread_parallel_create()
    252230    pthread_barrier_wait( parent_barrier );
    253231
     
    269247    unsigned int           y_size;             // number of columns
    270248    unsigned int           ncores;             // number of cores per cluster
    271     unsigned int           total_threads;      // total number of threads
    272     unsigned int           x;                  // X coordinate for a sort thread
    273     unsigned int           y;                  // Y coordinate for a sort thread
    274     unsigned int           cxy;                // cluster identifier for a sort thead
    275     unsigned int           lid;                // core local index for a thread
    276     unsigned int           tid;                // sort thread continuous index
    277249    pthread_barrierattr_t  barrier_attr;       // barrier attributes (used for DQT)
    278250    unsigned int           n;                  // index in array to sort
     
    285257    get_cycle( &start_cycle );
    286258 
    287     // compute number of threads (one thread per core)
     259    // compute number of working threads (one thread per core)
    288260    get_config( &x_size , &y_size , &ncores );
    289     total_threads = x_size * y_size * ncores;
     261    threads = x_size * y_size * ncores;
    290262
    291263    // compute covering DQT size and level
     
    294266
    295267    // checks number of threads
    296     if ( (total_threads != 1)   && (total_threads != 2)   && (total_threads != 4)   &&
    297          (total_threads != 8)   && (total_threads != 16 ) && (total_threads != 32)  &&
    298          (total_threads != 64)  && (total_threads != 128) && (total_threads != 256) &&
    299          (total_threads != 512) && (total_threads != 1024) )
     268    if ( (threads != 1)   && (threads != 2)   && (threads != 4)   &&
     269         (threads != 8)   && (threads != 16 ) && (threads != 32)  &&
     270         (threads != 64)  && (threads != 128) && (threads != 256) &&
     271         (threads != 512) && (threads != 1024) )
    300272    {
    301273        printf("\n[sort] ERROR : number of cores must be power of 2\n");
     
    304276
    305277    // check array size
    306     if ( ARRAY_LENGTH % total_threads)
     278    if ( ARRAY_LENGTH % threads)
    307279    {
    308280        printf("\n[sort] ERROR : array size must be multiple of number of threads\n");
     
    311283
    312284    printf("\n[sort] main starts / %d threads / %d items / pid %x / cycle %d\n",
    313     total_threads, ARRAY_LENGTH, getpid(), (unsigned int)start_cycle );
     285    threads, ARRAY_LENGTH, getpid(), (unsigned int)start_cycle );
    314286
    315287    // initialize barrier
     
    319291        barrier_attr.y_size   = y_size;
    320292        barrier_attr.nthreads = ncores;
    321         error = pthread_barrier_init( &barrier, &barrier_attr , total_threads );
     293        error = pthread_barrier_init( &barrier, &barrier_attr , threads );
    322294    }
    323295    else // use SIMPLE_BARRIER
    324296    {
    325         error = pthread_barrier_init( &barrier, NULL , total_threads );
     297        error = pthread_barrier_init( &barrier, NULL , threads );
    326298    }
    327299
     
    352324#endif
    353325
    354     // build array of arguments for the <sort> threads
    355     for (x = 0 ; x < x_size ; x++)
    356     {
    357         for (y = 0 ; y < y_size ; y++)
    358         {
    359             // compute cluster identifier
    360             cxy = HAL_CXY_FROM_XY( x , y );
    361 
    362             for ( lid = 0 ; lid < ncores ; lid++ )
    363             {
    364                 // compute thread continuous index
    365                 tid = (((x * y_size) + y) * ncores) + lid;
    366 
    367                 // initialize 2D array of arguments
    368                 sort_args[cxy][lid].tid            = tid;
    369                 sort_args[cxy][lid].threads        = total_threads;
    370                 sort_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
    371 
    372                 // initialize 2D array of pointers
    373                 sort_ptrs[cxy][lid] = &sort_args[cxy][lid];
    374             }
    375         }
    376     }
    377 
    378326    ///////////////////////////
    379327    get_cycle( &seq_end_cycle );
     
    386334    // create and execute the working threads
    387335    if( pthread_parallel_create( root_level,
    388                                  &sort,
    389                                  &sort_ptrs[0][0],
    390                                  &parent_barriers[0] ) )
     336                                 &sort ) )
    391337    {
    392338        printf("\n[sort] ERROR : cannot create threads\n");
     
    412358#if CHECK_RESULT
    413359    int    success = 1;
    414     int *  res_array = ( (total_threads ==   2) ||
    415                          (total_threads ==   8) ||
    416                          (total_threads ==  32) ||
    417                          (total_threads == 128) ||
    418                          (total_threads == 512) ) ? array1 : array0;
     360    int *  res_array = ( (threads ==   2) ||
     361                         (threads ==   8) ||
     362                         (threads ==  32) ||
     363                         (threads == 128) ||
     364                         (threads == 512) ) ? array1 : array0;
    419365
    420366    for( n=0 ; n<(ARRAY_LENGTH-2) ; n++ )
  • trunk/user/transpose/transpose.c

    r646 r652  
    55//////////////////////////////////////////////////////////////////////////////////////////
    66// This multi-threaded application reads a raw image (one byte per pixel)
    7 // stored on disk, transpose it, display the result on the frame buffer,
    8 // and store the transposed image on disk.
    9 // It can run on a multi-cores, multi-clusters architecture, with one thread
     7// stored on disk, transposes it, displays the result on the frame buffer,
     8// and stores the transposed image on disk.
    109//
    11 // per core, and uses the POSIX threads API.
    12 // It uses the mmap() syscall to directly access the input and output files
    13 // and the fbf_write() syscall to display the images.
     10// The image size and the pixel encoding type are defined by the IMAGE_SIZE and
     11// IMAGE_TYPE global parameters.
    1412//
    15 // The main() function can be launched on any core[cxy,l].
    16 // It makes the initialisations, launch (N-1) threads to run the execute() function
    17 // on the (N-1) other cores, calls himself the execute() function, and finally calls
    18 // the instrument() function to display instrumentation results when the parallel
    19 // execution is completed. The placement of threads on the cores can be done
    20 // automatically by the operating system, or can be done explicitely by the main thread
    21 // (when the EXPLICIT_PLACEMENT global parameter is set).
     13// It can run on a multi-cores, multi-clusters architecture, where (X_SIZE * Y_SIZE)
     14// is the number of clusters and NCORES the number of cores per cluster.
     15// A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier,
     16// (that is NOT required to be a continuous index), and lid is the local core index,
     17// (that must be in the [0,NCORES-1] range).
    2218//
    23 // The buf_in[x,y] and buf_out[put buffers containing the direct ans transposed images
    24 // are distributed in clusters: In each cluster[cxy], the thread running on core[cxy,0]
    25 // map the buf_in[cxy] and // buf_out[cxy] buffers containing a subset of lines.
    26 // Then, all threads in cluster[xy] read pixels from the local buf_in[cxy] buffer, and
    27 // write the pixels to all remote buf_out[cxy] buffers. Finally, each thread display
    28 // a part of the transposed image to the frame buffer.
     19// The main() function can run on any core in any cluster. This main thread
     20// makes the initialisations, uses the pthread_create() syscall to launch (NTHREADS-1)
     21// other threads in "attached" mode running in parallel the execute() function, calls
     22// itself the execute() function, waits for completion of the (NTHREADS-1) other threads
     23// with a pthread_join(), and finally calls the instrument() function to display
     24// and register the instrumentation results when execution is completed.
     25// All threads run the execute() function, but each thread transposes only
     26// (NLINES / NTHREADS) lines. This requires that NLINES == k * NTHREADS.
     27//
     28// The number N of working threads is always defined by the number of cores available
     29// in the architecture, but this application supports three placement modes.
     30// In all modes, the working threads are identified by the [tid] continuous index
     31// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
     32// This continuous index can always be decomposed in two continuous sub-indexes:
     33// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
     34//
     35// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
     36//   threads are created by the main thread, but the placement is done by the OS, using
     37//   the DQDT for load balancing, and two working threads can be placed on the same core.
     38//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
     39//   cluster or a physical core. In this mode, the main thread runs on any cluster,
     40//   but has tid = 0 (i.e. cid = 0 & lid = 0).
     41//
     42// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
     43//   of the threads on the cores is explicitly controlled by the main thread to have
     44//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
     45//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
     46//   physical cluster identifier, and [lid] is the local core index.
     47//
     48// - PARALLEL_PLACEMENT: the main thread is no longer a working thread, and uses the
     49//   non-standard pthread_parallel_create() function to avoid the costly sequential
     50//   loops for pthread_create() and pthread_join(). It guarantees one working thread
     51//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
     52//   
     53// The buf_in[cid] and buf_out[cid] buffers containing the direct and transposed images
     54// are distributed in clusters: each thread[cid][0] allocates a local input buffer
     55// and loads in this buffer all lines that must be handled by the threads sharing the
     56// same cid, from the mapper of the input image file.
     57// In the execute function, all threads in the group defined by the cid index read pixels
     58// from the local buf_in[cid] buffer, and write pixels to all remote buf_out[cid] buffers.
     59// Finally, each thread displays a part of the transposed image to the frame buffer.
    2960//
    3061// - The image  must fit the frame buffer size, that must be power of 2.
    3162// - The number of clusters  must be a power of 2 no larger than 256.
    3263// - The number of cores per cluster must be a power of 2 no larger than 4.
    33 // - The number of clusters cannot be larger than (IMAGE_SIZE * IMAGE_SIZE) / 4096,
    34 //   because the size of buf_in[x,y] and buf_out[x,y] must be multiple of 4096.
     64// - The number of threads cannot be larger than IMAGE_SIZE.
    3565//
    3666//////////////////////////////////////////////////////////////////////////////////////////
     
    5080#define CORES_MAX             4                            // max number of cores per cluster
    5181#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters
    52 
    53 #define IMAGE_SIZE            256                          // image size
     82#define THREADS_MAX           (X_MAX * Y_MAX * CORES_MAX)  // max number of threads
     83
     84#define IMAGE_SIZE            512                          // image size
    5485#define IMAGE_TYPE            420                          // pixel encoding type
    55 #define INPUT_FILE_PATH       "/misc/lena_256.raw"         // input file pathname
    56 #define OUTPUT_FILE_PATH      "/home/trsp_256.raw"         // output file pathname
    57 
     86#define INPUT_FILE_PATH       "/misc/couple_512.raw"       // input file pathname
     87#define OUTPUT_FILE_PATH      "/misc/transposed_512.raw"   // output file pathname
     88
     89#define SAVE_RESULT_FILE      0                            // save result image on disk
    5890#define USE_DQT_BARRIER       1                            // quad-tree barrier if non zero
    59 #define EXPLICIT_PLACEMENT    1                            // explicit thread placement
    60 #define VERBOSE               1                            // print comments on TTY
     91
     92#define NO_PLACEMENT          0                            // uncontrolled thread placement
     93#define EXPLICIT_PLACEMENT    0                            // explicit threads placement
     94#define PARALLEL_PLACEMENT    1                            // parallel threads placement
     95
     96#define VERBOSE_MAIN          0                            // main function print comments
     97#define VERBOSE_EXEC          0                            // exec function print comments
     98#define VERBOSE_INSTRU        0                            // instru function print comments
    6199
    62100
     
    65103///////////////////////////////////////////////////////
    66104
    67 // instrumentation counters for each processor in each cluster
    68 unsigned int MMAP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
    69 unsigned int MMAP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     105// global instrumentation counters for the main thread
     106unsigned int SEQUENCIAL_TIME = 0;
     107unsigned int PARALLEL_TIME   = 0;
     108
     109// instrumentation counters for each thread in each cluster
     110// indexed by [cid][lid] : cluster continuous index / thread local index
     111unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     112unsigned int LOAD_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
    70113unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
    71114unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     
    73116unsigned int DISP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
    74117
    75 // arrays of pointers on distributed buffers
    76 // one input buffer & one output buffer per cluster
    77 unsigned char *  buf_in [CLUSTERS_MAX];
    78 unsigned char *  buf_out[CLUSTERS_MAX];
    79 
    80 // synchronisation barrier (all threads)
     118// pointer on buffer containing the input image, mapped by the main to the input file
     119unsigned char *  image_in;
     120
     121// pointer on buffer containing the output image, mapped by the main to the output file
     122unsigned char *  image_out;
     123
     124// arrays of pointers on distributed buffers indexed by [cid] : cluster continuous index
     125unsigned char *  buf_in_ptr [CLUSTERS_MAX];
     126unsigned char *  buf_out_ptr[CLUSTERS_MAX];
     127
     128// synchronisation barrier (all working threads)
    81129pthread_barrier_t   barrier;
    82130
    83131// platform parameters
    84 unsigned int  x_size;                       // number of clusters in a row
    85 unsigned int  y_size;                       // number of clusters in a column
    86 unsigned int  ncores;                       // number of processors per cluster
    87 
    88 // cluster identifier & local index of core running the main thread
    89 unsigned int  cxy_main;
    90 unsigned int  lid_main;
    91 
    92 // input & output file descriptors
    93 int  fd_in;
    94 int  fd_out;
    95 
    96 #if EXPLICIT_PLACEMENT
    97 
    98 // thread index allocated by the kernel
    99 pthread_t        trdid[CLUSTERS_MAX][CORES_MAX];   
    100 
    101 // user defined continuous thread index
    102 unsigned int     tid[CLUSTERS_MAX][CORES_MAX];
    103 
    104 // thread attributes only used if explicit placement
    105 pthread_attr_t   attr[CLUSTERS_MAX][CORES_MAX];
    106 
    107 #else
    108 
    109 // thread index allocated by the kernel
    110 pthread_t        trdid[CLUSTERS_MAX * CORES_MAX];   
    111 
    112 // user defined continuous thread index
    113 unsigned int     tid[CLUSTERS_MAX * CORES_MAX];
    114 
    115 #endif
     132unsigned int  x_size;              // number of clusters in a row
     133unsigned int  y_size;              // number of clusters in a column
     134unsigned int  ncores;              // number of cores per cluster
     135
     136// main thread continuous index
     137unsigned int     tid_main;
    116138
    117139//return values at thread exit
     
    119141unsigned int THREAD_EXIT_FAILURE = 1;
    120142
     143// array of kernel thread identifiers / indexed by [tid]
     144pthread_t                     exec_trdid[THREADS_MAX];   
     145
     146// array of execute function arguments / indexed by [tid]
     147pthread_parallel_work_args_t  exec_args[THREADS_MAX];
     148
     149// array of thread attributes / indexed by [tid]
     150pthread_attr_t                exec_attr[THREADS_MAX];
     151
    121152////////////////////////////////////////////////////////////////
    122153//             functions declaration
    123154////////////////////////////////////////////////////////////////
    124155
    125 void execute( unsigned int * ptid );
    126 
    127 void instrument( void );
    128 
    129 ///////////
    130 void main()
     156void execute( pthread_parallel_work_args_t * args );
     157
     158void instrument( FILE * f , char * filename );
     159
     160/////////////////
     161void main( void )
    131162{
    132     unsigned long long date;
     163    unsigned long long start_cycle;
     164    unsigned long long end_sequencial_cycle;
     165    unsigned long long end_parallel_cycle;
     166
     167    char               filename[32];      // instrumentation file name
     168    char               pathname[64];      // instrumentation file pathname
    133169
    134170    int error;
    135171
    136 printf("\n bloup 0\n");
    137 
    138     // get identifiers for core executing main
    139     get_core_id( &cxy_main , &lid_main );
    140 
    141 printf("\n bloup 1\n");
     172    /////////////////////////////////////////////////////////////////////////////////
     173    get_cycle( &start_cycle );
     174    /////////////////////////////////////////////////////////////////////////////////
     175
     176    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
     177    {
     178        printf("\n[transpose error] illegal placement\n");
     179        exit( 0 );
     180    }
    142181
    143182    // get & check plat-form parameters
    144     get_config( &x_size , &y_size , &ncores );
    145 
    146 printf("\n bloup 2\n");
    147 
    148     if((ncores != 1) && (ncores != 2) && (ncores == 4))
     183    get_config( &x_size,
     184                &y_size,
     185                &ncores );
     186
     187    if((ncores != 1) && (ncores != 2) && (ncores != 4))
    149188    {
    150189        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
     
    166205    }
    167206       
    168 printf("\n bloup 3\n");
     207    // main thread get identifiers for core executing main
     208    unsigned int  cxy_main;
     209    unsigned int  lid_main;
     210    get_core_id( &cxy_main , &lid_main );
    169211
    170212    // compute number of threads
     
    172214    unsigned int nthreads  = nclusters * ncores;
    173215
    174 printf("\n bloup 4\n");
    175 
    176     // get FBF ownership and FBF size
     216    // main thread get FBF size and type
    177217    unsigned int   fbf_width;
    178218    unsigned int   fbf_height;
     
    180220    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
    181221
    182 printf("\n bloup 5\n");
    183 
    184222    if( (fbf_width != IMAGE_SIZE) || (fbf_height != IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) )
    185223    {
     
    188226    }
    189227
    190     get_cycle( &date );
    191     printf("\n[transpose] starts at cycle %d on %d cores / FBF = %d * %d pixels\n",
    192     (unsigned int)date , nthreads , fbf_width , fbf_height );
    193 
    194     // open input file
    195     fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 );    // read-only
    196     if ( fd_in < 0 )
     228    if( nthreads > IMAGE_SIZE )
     229    {
     230        printf("\n[transpose error] number of threads larger than number of lines\n");
     231        exit( 0 );
     232    }
     233
     234    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;
     235
     236    // define instrumentation file name
     237    if( NO_PLACEMENT )
     238    {
     239        printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
     240        nclusters, ncores, fbf_width, fbf_height, getpid() );
     241
     242        // build instrumentation file name
     243        if( USE_DQT_BARRIER )
     244        snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d",
     245        IMAGE_SIZE , x_size * y_size , ncores );
     246        else
     247        snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d",
     248        IMAGE_SIZE , x_size * y_size , ncores );
     249    }
     250
     251    if( EXPLICIT_PLACEMENT )
     252    {
     253        printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
     254        nclusters, ncores, fbf_width, fbf_height, getpid() );
     255
     256        // build instrumentation file name
     257        if( USE_DQT_BARRIER )
     258        snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d",
     259        IMAGE_SIZE , x_size * y_size , ncores );
     260        else
     261        snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d",
     262        IMAGE_SIZE , x_size * y_size , ncores );
     263    }
     264
     265    if( PARALLEL_PLACEMENT )
     266    {
     267        printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
     268        nclusters, ncores, fbf_width, fbf_height, getpid() );
     269
     270        // build instrumentation file name
     271        if( USE_DQT_BARRIER )
     272        snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d",
     273        IMAGE_SIZE , x_size * y_size , ncores );
     274        else
     275        snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d",
     276        IMAGE_SIZE , x_size * y_size , ncores );
     277    }
     278
     279    // open instrumentation file
     280    snprintf( pathname , 64 , "/home/%s", filename );
     281    FILE * f = fopen( pathname , NULL );
     282    if ( f == NULL )
    197283    {
    198         printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
    199         exit( 0 );
    200     }
    201 
    202 #if VERBOSE
    203 printf("\n[transpose] main open file %s / fd = %d\n", INPUT_FILE_PATH , fd_in );
    204 #endif
    205 
    206     // open output file
    207     fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 );   // create if required
    208     if ( fd_out < 0 )
    209     {
    210         printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
    211         exit( 0 );
    212     }
    213 
    214 #if  VERBOSE
    215 printf("\n[transpose] main open file %s / fd = %d\n", OUTPUT_FILE_PATH , fd_out );
    216 #endif
    217 
    218     // initialise barrier
     284        printf("\n[transpose error] cannot open instrumentation file %s\n", pathname );
     285        exit( 0 );
     286    }
     287
     288#if  VERBOSE_MAIN
     289printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n",
     290cxy_main, lid_main, pathname );
     291#endif
     292
     293    // main thread initializes barrier
    219294    if( USE_DQT_BARRIER )
    220295    {
     
    236311    }
    237312
    238     get_cycle( &date );
    239     printf("\n[transpose] main on core[%x,%d] completes initialisation at cycle %d\n"
    240            "- CLUSTERS     = %d\n"
    241            "- PROCS        = %d\n"
    242            "- THREADS      = %d\n",
    243            cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads );
    244 
    245 //////////////////////
    246 #if EXPLICIT_PLACEMENT
    247 
    248     // main thread launch other threads
    249     unsigned int x;
    250     unsigned int y;
    251     unsigned int l;
    252     unsigned int cxy;
    253     for( x = 0 ; x < x_size ; x++ )
    254     {
    255         for( y = 0 ; y < y_size ; y++ )
     313#if  VERBOSE_MAIN
     314printf("\n[transpose] main on core[%x,%d] completes barrier initialisation\n",
     315cxy_main, lid_main );
     316#endif
     317
     318    // main thread open input file
     319    int fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 );
     320
     321    if ( fd_in < 0 )
     322    {
     323        printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
     324        exit( 0 );
     325    }
     326
     327#if  VERBOSE_MAIN
     328printf("\n[transpose] main open file <%s> / fd = %d\n", INPUT_FILE_PATH , fd_in );
     329#endif
     330
     331    // main thread map image_in buffer to input image file
     332    image_in = (unsigned char *)mmap( NULL,
     333                                      npixels,
     334                                      PROT_READ,
     335                                      MAP_FILE | MAP_SHARED,
     336                                      fd_in,
     337                                      0 );     // offset
     338    if ( image_in == NULL )
     339    {
     340        printf("\n[transpose error] main cannot map buffer to file %s\n", INPUT_FILE_PATH );
     341        exit( 0 );
     342    }
     343
     344#if  VERBOSE_MAIN
     345printf("\n[transpose] main map buffer to file <%s>\n", INPUT_FILE_PATH );
     346#endif
     347
     348    // main thread display input image on FBF
     349    if( fbf_write( image_in,
     350                   npixels,
     351                   0 ) )
     352    {
     353        printf("\n[transpose error] main cannot access FBF\n");
     354        exit( 0 );
     355    }
     356
     357#if SAVE_RESULT_IMAGE
     358
     359    // main thread open output file
     360    int fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 );
     361
     362    if ( fd_out < 0 )
     363    {
     364        printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
     365        exit( 0 );
     366    }
     367
     368#if  VERBOSE_MAIN
     369printf("\n[transpose] main open file <%s> / fd = %d\n", OUTPUT_FILE_PATH , fd_out );
     370#endif
     371
     372    // main thread map image_out buffer to output image file
     373    image_out = (unsigned char *)mmap( NULL,
     374                                       npixels,
     375                                       PROT_WRITE,
     376                                       MAP_FILE | MAP_SHARED,
     377                                       fd_out,
     378                                       0 );     // offset
     379    if ( image_out == NULL )
     380    {
     381        printf("\n[transpose error] main cannot map buf_out to file %s\n", OUTPUT_FILE_PATH );
     382        exit( 0 );
     383    }
     384
     385#if  VERBOSE_MAIN
     386printf("\n[transpose] main map buffer to file <%s>\n", OUTPUT_FILE_PATH );
     387#endif
     388
     389#endif  // SAVE_RESULT_IMAGE
     390
     391    /////////////////////////////////////////////////////////////////////////////////////
     392    get_cycle( &end_sequencial_cycle );
     393    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
     394    /////////////////////////////////////////////////////////////////////////////////////
     395
     396    //////////////////
     397    if( NO_PLACEMENT )
     398    {
     399        // the tid value for the main thread is always 0
     400        // main thread creates new threads with tid in [1,nthreads-1] 
     401        unsigned int tid;
     402        for ( tid = 0 ; tid < nthreads ; tid++ )
    256403        {
    257             cxy = HAL_CXY_FROM_XY( x , y );
    258             for( l = 0 ; l < ncores ; l++ )
     404            // register tid value in exec_args[tid] array
     405            exec_args[tid].tid = tid;
     406           
     407            // create other threads
     408            if( tid > 0 )
    259409            {
    260                 // no other thread on the core running the main
    261                 if( (cxy != cxy_main) || (l != lid_main) )
     410                if ( pthread_create( &exec_trdid[tid],
     411                                     NULL,                  // no attribute
     412                                     &execute,
     413                                     &exec_args[tid] ) )
    262414                {
    263                     // define thread attributes
    264                     attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    265                     attr[cxy][l].cxy        = cxy;
    266                     attr[cxy][l].lid        = l;
    267 
    268                     tid[cxy][l] = (((x  * y_size) + y) * ncores) + l;
     415                    printf("\n[transpose error] cannot create thread %d\n", tid );
     416                    exit( 0 );
     417                }
     418
     419#if VERBOSE_MAIN
     420printf("\n[transpose] main created thread %d\n", tid );
     421#endif
     422
     423            }
     424            else
     425            {
     426                tid_main = 0;
     427            }
     428        }  // end for tid
     429
     430        // main thread calls itself the execute() function
     431        execute( &exec_args[0] );
     432
     433        // main thread wait other threads completion
     434        for ( tid = 1 ; tid < nthreads ; tid++ )
     435        {
     436            unsigned int * status;
     437
     438            // main wait thread[tid] status
     439            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
     440            {
     441                printf("\n[transpose error] main cannot join thread %d\n", tid );
     442                exit( 0 );
     443            }
     444       
     445            // check status
     446            if( *status != THREAD_EXIT_SUCCESS )
     447            {
     448                printf("\n[transpose error] thread %x returned failure\n", tid );
     449                exit( 0 );
     450            }
     451
     452#if VERBOSE_MAIN
     453printf("\n[transpose] main successfully joined thread %x\n", tid );
     454#endif
     455       
     456        }  // end for tid
     457
     458    }  // end if no_placement
     459
     460    ////////////////////////
     461    if( EXPLICIT_PLACEMENT )
     462    {
     463        // main thread places each other threads on a specific core[cxy][lid]
     464        // but the actual thread creation is sequencial
     465        unsigned int x;
     466        unsigned int y;
     467        unsigned int l;
     468        unsigned int cxy;                   // cluster identifier
     469        unsigned int tid;                   // thread continuous index
     470
     471        for( x = 0 ; x < x_size ; x++ )
     472        {
     473            for( y = 0 ; y < y_size ; y++ )
     474            {
     475                cxy = HAL_CXY_FROM_XY( x , y );
     476                for( l = 0 ; l < ncores ; l++ )
     477                {
     478                    // compute thread continuous index
     479                    tid = (((x  * y_size) + y) * ncores) + l;
     480
     481                    // register tid value in exec_args[tid] array
     482                    exec_args[tid].tid = tid;
     483
     484                    // no thread created on the core running the main
     485                    if( (cxy != cxy_main) || (l != lid_main) )
     486                    {
     487                        // define thread attributes
     488                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
     489                                                    PT_ATTR_CORE_DEFINED;
     490                        exec_attr[tid].cxy        = cxy;
     491                        exec_attr[tid].lid        = l;
    269492 
    270                     // create thread on core[cxy,l]
    271                     if (pthread_create( &trdid[cxy][l],   
    272                                         &attr[cxy][l],   
    273                                         &execute,
    274                                         &tid[cxy][l] ) )       
     493                        // create thread[tid] on core[cxy][l]
     494                        if ( pthread_create( &exec_trdid[tid],   
     495                                             &exec_attr[tid],   
     496                                             &execute,
     497                                             &exec_args[tid] ) )       
     498                        {
     499                            printf("\n[transpose error] cannot create thread %d\n", tid );
     500                            exit( 0 );
     501                        }
     502#if VERBOSE_MAIN
     503printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
     504#endif
     505                    }
     506                    else
    275507                    {
    276                         printf("\n[convol error] created thread %x on core[%x][%d]\n",
    277                         trdid[cxy][l] , cxy , l );
    278                         exit( 0 );
     508                        tid_main = tid;
    279509                    }
    280 #if VERBOSE
    281 printf("\n[transpose] main created thread[%x,%d]\n", cxy, l );
    282 #endif
    283510                }
    284511            }
    285512        }
    286     }   
    287 
    288     // main thread calls itself the execute() function
    289     execute( &tid[cxy_main][lid_main] );
    290 
    291     // main thread wait other threads completion
    292     for( x = 0 ; x < x_size ; x++ )
    293     {
    294         for( y = 0 ; y < y_size ; y++ )
     513
     514        // main thread calls itself the execute() function
     515        execute( &exec_args[tid_main] );
     516
     517        // main thread wait other threads completion
     518        for( tid = 0 ; tid < nthreads ; tid++ )
    295519        {
    296             cxy = HAL_CXY_FROM_XY( x , y );
    297             for( l = 0 ; l < ncores ; l++ )
     520            // no other thread on the core running the main
     521            if( tid != tid_main )
    298522            {
    299                 // no other thread on the core running the main
    300                 if( (cxy != cxy_main) || (l != lid_main) )
     523                unsigned int * status;
     524
     525                // wait thread[tid]
     526                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
    301527                {
    302                     unsigned int * status;
    303 
    304                     // wait thread[cxy][l]
    305                     if( pthread_join( trdid[cxy][l] , (void*)(&status) ) )
    306                     {
    307                         printf("\n[transpose error] main cannot join thread[%x,%d]\n", cxy, l );
    308                         exit( 0 );
    309                     }
     528                    printf("\n[transpose error] main cannot join thread %d\n", tid );
     529                    exit( 0 );
     530                }
    310531       
    311                     // check status
    312                     if( *status != THREAD_EXIT_SUCCESS )
    313                     {
    314                         printf("\n[transpose error] thread[%x,%d] returned failure\n", cxy, l );
    315                         exit( 0 );
    316                     }
    317 #if VERBOSE
    318 printf("\n[transpose] main joined thread[%x,%d]\n", cxy, l );
    319 #endif
     532                // check status
     533                if( *status != THREAD_EXIT_SUCCESS )
     534                {
     535                    printf("\n[transpose error] thread %d returned failure\n", tid );
     536                    exit( 0 );
    320537                }
     538#if VERBOSE_MAIN
     539printf("\n[transpose] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
     540#endif
    321541            }
    322542        }
    323     }
    324 
    325 ///////////////////////////////
    326 #else  // no explicit placement
    327 
    328     // main thread launch other threads
    329     unsigned int n;
    330     for ( n = 1 ; n < nthreads ; n++ )
    331     {
    332         tid[n] = n;
    333         if ( pthread_create( &trdid[n],
    334                              NULL,                  // no attribute
    335                              &execute,
    336                              &tid[n] ) )
     543    }  // end if explicit_placement
     544
     545    ////////////////////////
     546    if( PARALLEL_PLACEMENT )
     547    {
     548        // compute covering DQT size and level
     549        unsigned int z          = (x_size > y_size) ? x_size : y_size;
     550        unsigned int root_level = ((z == 1) ? 0 :
     551                                  ((z == 2) ? 1 :
     552                                  ((z == 4) ? 2 :
     553                                  ((z == 8) ? 3 : 4))));
     554
     555        // create & execute the working threads
     556        if( pthread_parallel_create( root_level , &execute ) )
    337557        {
    338             printf("\n[transpose error] cannot create thread %d\n", n );
     558            printf("\n[transpose error] in %s\n", __FUNCTION__ );
    339559            exit( 0 );
    340560        }
    341 
    342 #if VERBOSE
    343 printf("\n[transpose] main created thread %d\n", tid[n] );
    344 #endif
    345 
    346     }
    347 
    348     // main thread calls itself the execute() function
    349     execute( &tid[0] );
    350 
    351     // main thread wait other threads completion
    352     for ( n = 1 ; n < nthreads ; n++ )
    353     {
    354         unsigned int * status;
    355 
    356         // main wait thread[n] status
    357         if ( pthread_join( trdid[n], (void*)(&status)) )
    358         {
    359             printf("\n[transpose error] main cannot join thread %d\n", n );
    360             exit( 0 );
    361         }
    362        
    363         // check status
    364         if( *status != THREAD_EXIT_SUCCESS )
    365         {
    366             printf("\n[transpose error] thread %x returned failure\n", n );
    367             exit( 0 );
    368         }
    369 
    370 #if VERBOSE
    371 printf("\n[transpose] main successfully joined thread %x\n", tid[n] );
    372 #endif
    373        
    374     }
    375 
    376 #endif
    377 
    378     // instrumentation
    379     instrument();
    380 
    381     // close input and output files
     561    }  // end if parallel_placement
     562
     563
     564    /////////////////////////////////////////////////////////////////////////////
     565    get_cycle( &end_parallel_cycle );
     566    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
     567    /////////////////////////////////////////////////////////////////////////////
     568
     569    // main thread register instrumentation results
     570    instrument( f , filename );
     571
     572    // main thread close input file
    382573    close( fd_in );
     574
     575#if SAVE_RESULT_IMAGE
     576
     577    // main thread close output file
    383578    close( fd_out );
    384579
    385     // suicide
     580#endif
     581
     582    // main close instrumentation file
     583    fclose( f );
     584
     585    // main thread suicide
    386586    exit( 0 );
    387587   
     
    390590
    391591
    392 ///////////////////////////////////
    393 void execute( unsigned int * ptid )
     592
     593///////////////////////////////////////////////////
     594void execute( pthread_parallel_work_args_t * args )
    394595{
    395596    unsigned long long   date;
    396597 
    397     unsigned int l;                         // line index for loops
    398     unsigned int p;                         // pixel index for loops
    399 
    400     // get thread continuous index
    401     unsigned int my_tid = *ptid;
     598    unsigned int l;                         // line index for loop
     599    unsigned int p;                         // pixel index for loop
     600
     601    // WARNING
     602    // A thread is identified by the tid index, defined in the "args" structure.
     603    // This index being in range [0,nclusters*ncores-1], we can always write
     604    //       tid == cid * ncores + lid
     605    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
     606    // If NO_PLACEMENT, there is no relation between these abstract
     607    // thread indexes [cid][lid] and the physical core coordinates [cxy][lpid].
     608
     609    // get thread abstract identifiers
     610    unsigned int tid = args->tid;
     611    unsigned int cid = tid / ncores;   
     612    unsigned int lid = tid % ncores;
     613
     614#if VERBOSE_EXEC
     615unsigned int cxy;
     616unsigned int lpid;
     617get_core_id( &cxy , &lpid );   // get core physical identifiers
     618printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n",
     619tid , cxy , lpid );
     620#endif
     621
     622    get_cycle( &date );
     623    LOAD_START[cid][lid] = (unsigned int)date;
    402624
    403625    // build total number of pixels per image
    404626    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;     
    405627
    406     // nuild total number of threads and clusters
    407     unsigned int nthreads  = x_size * y_size * ncores;
     628    // build total number of threads and clusters
    408629    unsigned int nclusters = x_size * y_size;
    409 
    410     // get cluster continuous index and core index from tid
    411     // we use (tid == cid * ncores + lid)
    412     unsigned int cid = my_tid / ncores;     // continuous index   
    413     unsigned int lid = my_tid % ncores;     // core local index
    414 
    415     // get cluster identifier from cid
    416     // we use (cid == x * y_size + y)
    417     unsigned int x   = cid / y_size;        // X cluster coordinate
    418     unsigned int y   = cid % y_size;        // Y cluster coordinate
    419     unsigned int cxy = HAL_CXY_FROM_XY(x,y);
    420    
    421 #if VERBOSE
    422 printf("\n[transpose] thread[%d] start on core[%x,%d]\n", my_tid , cxy , lid );
    423 #endif
    424 
    425     // In each cluster cxy,  thread[cxy,0] map input file
    426     // to buf_in[cxy] and map output file to buf_in[cxy]
    427 
    428     get_cycle( &date );
    429     MMAP_START[cxy][lid] = (unsigned int)date;
    430 
    431     if ( lid == 0 )
    432     {
    433         unsigned int length = npixels / nclusters;
    434         unsigned int offset = length * cid;
    435        
    436         // map buf_in
    437         buf_in[cid] =  mmap( NULL,
    438                              length,
    439                              PROT_READ,
    440                              MAP_SHARED,
    441                              fd_in,
    442                              offset );
    443 
    444         if ( buf_in[cid] == NULL )
     630    unsigned int nthreads  = nclusters * ncores;
     631
     632    unsigned int buf_size = npixels / nclusters;     // number of bytes in buf_in & buf_out
     633    unsigned int offset   = cid * buf_size;       // offset in file (bytes)
     634
     635    unsigned char  * buf_in = NULL;        // private pointer on local input buffer
     636    unsigned char  * buf_out = NULL;       // private pointer on local output buffer
     637
     638    // Each thread[cid,0] allocates a local buffer buf_in, and registers
     639    // its base address in the global variable buf_in_ptr[cid].
     640    // This local buffer is shared by all threads with the same cid.
     641    if( lid == 0 )
     642    {
     643        // allocate buf_in
     644        buf_in = (unsigned char *)malloc( buf_size );
     645
     646        if( buf_in == NULL )
    445647        {
    446             printf("\n[transpose error] thread[%x,%d] cannot map input file\n", cxy, lid);
     648            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
    447649            pthread_exit( &THREAD_EXIT_FAILURE );
    448650        }
    449                  
    450 #if VERBOSE
    451 printf("\n[transpose] thread[%x,%d] map input file / length %x / offset %x / buf_in %x\n",
    452 cxy, lid, length, offset, buf_in[cid] );
    453 #endif
    454 
    455         // map buf_out           
    456         buf_out[cid] = mmap( NULL,
    457                              length,
    458                              PROT_WRITE,
    459                              MAP_SHARED,
    460                              fd_out,
    461                              offset );
    462 
    463         if ( buf_out[cid] == NULL )
     651
     652        // register buf_in buffer in global array of pointers
     653        buf_in_ptr[cid] = buf_in;
     654
     655#if VERBOSE_EXEC
     656printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n",
     657tid , cxy , lpid , buf_in );
     658#endif
     659
     660    }
     661
     662    // Each thread[cid,0] copy relevant part of the image_in to buf_in
     663    if( lid == 0 )
     664    {
     665        memcpy( buf_in,
     666                image_in + offset,
     667                buf_size );
     668    }
     669
     670#if VERBOSE_EXEC
     671printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n",
     672tid , cxy , lpid , cid );
     673#endif
     674
     675    // Each thread[cid,0] allocates a local buffer buf_out, and registers
     676    // its base address in the global variable buf_out_ptr[cid].
     677    if( lid == 0 )
     678    {
     679        // allocate buf_out
     680        buf_out = (unsigned char *)malloc( buf_size );
     681
     682        if( buf_out == NULL )
    464683        {
    465             printf("\n[transpose error] thread[%x,%d] cannot map output file\n", cxy, lid);
     684            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
    466685            pthread_exit( &THREAD_EXIT_FAILURE );
    467686        }
    468                    
    469 #if VERBOSE
    470 printf("\n[transpose] thread[%x,%d] map output file / length %x / offset %x / buf_out %x\n",
    471 cxy, lid, length, offset, buf_out[cid] );
    472 #endif
    473 
    474     }
    475 
     687
     688        // register buf_out buffer in global array of pointers
     689        buf_out_ptr[cid] = buf_out;
     690
     691#if VERBOSE_EXEC
     692printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n",
     693tid , cxy , lpid , buf_out );
     694#endif
     695
     696    }
     697   
    476698    get_cycle( &date );
    477     MMAP_END[cxy][lid] = (unsigned int)date;
     699    LOAD_END[cid][lid] = (unsigned int)date;
    478700
    479701    /////////////////////////////////
    480702    pthread_barrier_wait( &barrier );
    481703
    482     // parallel transpose from buf_in to buf_out
    483     // each thread makes the transposition for nlt lines (nlt = IMAGE_SIZE/nthreads)
     704    get_cycle( &date );
     705    TRSP_START[cid][lid] = (unsigned int)date;
     706
     707    // All threads contribute to parallel transpose from buf_in to buf_out
     708    // each thread makes the transposition for nlt lines (nlt = IMAGE_SIZE/nthreads)
    484709    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
    485710    // (p,l) are the absolute pixel coordinates in the source image
     711    // (l,p) are the absolute pixel coordinates in the source image
     712    // (p,l) are the absolute pixel coordinates in the dest image
    486713
    487714    get_cycle( &date );
    488     TRSP_START[cxy][lid] = (unsigned int)date;
     715    TRSP_START[cid][lid] = (unsigned int)date;
    489716
    490717    unsigned int nlt   = IMAGE_SIZE / nthreads;    // number of lines per thread
    491718    unsigned int nlc   = IMAGE_SIZE / nclusters;   // number of lines per cluster
    492719
    493     unsigned int src_cluster;
     720    unsigned int src_cid;
    494721    unsigned int src_index;
    495     unsigned int dst_cluster;
     722    unsigned int dst_cid;
    496723    unsigned int dst_index;
    497724
    498725    unsigned char byte;
    499726
    500     unsigned int first = my_tid * nlt;     // first line index for a given thread
     727    unsigned int first = tid * nlt;     // first line index for a given thread
    501728    unsigned int last  = first + nlt;      // last line index for a given thread
    502729
     730    // loop on lines handled by this thread
    503731    for ( l = first ; l < last ; l++ )
    504732    {
    505         // in each iteration we transfer one byte
     733        // loop on pixels in one line (one pixel per iteration)
    506734        for ( p = 0 ; p < IMAGE_SIZE ; p++ )
    507735        {
    508736            // read one byte from local buf_in
    509             src_cluster = l / nlc;
    510             src_index   = (l % nlc) * IMAGE_SIZE + p;
    511             byte        = buf_in[src_cluster][src_index];
     737            src_cid   = l / nlc;
     738            src_index = (l % nlc) * IMAGE_SIZE + p;
     739
     740            byte        = buf_in_ptr[src_cid][src_index];
    512741
    513742            // write one byte to remote buf_out
    514             dst_cluster = p / nlc;
    515             dst_index   = (p % nlc) * IMAGE_SIZE + l;
    516 
    517             buf_out[dst_cluster][dst_index] = byte;
     743            dst_cid  = p / nlc;
     744            dst_index = (p % nlc) * IMAGE_SIZE + l;
     745
     746            buf_out_ptr[dst_cid][dst_index] = byte;
    518747        }
    519748    }
    520749
    521 #if VERBOSE
    522 printf("\n[transpose] thread[%x,%d] completes transposed\n", cxy, lid );
     750#if VERBOSE_EXEC
     751printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n",
     752tid , cxy , lpid );
    523753#endif
    524754
    525755    get_cycle( &date );
    526     TRSP_END[cxy][lid] = (unsigned int)date;
     756    TRSP_END[cid][lid] = (unsigned int)date;
    527757
    528758    /////////////////////////////////
    529759    pthread_barrier_wait( &barrier );
    530760
    531     // parallel display from local buf_out to frame buffer
    532     // all threads contribute to display
    533 
    534761    get_cycle( &date );
    535     DISP_START[cxy][lid] = (unsigned int)date;
    536 
     762    DISP_START[cid][lid] = (unsigned int)date;
     763
     764    // All threads contribute to parallel display
     765    // from local buf_out to frame buffer
    537766    unsigned int  npt   = npixels / nthreads;   // number of pixels per thread
    538767
    539     if( fbf_write( &buf_out[cid][lid * npt],
     768    if( fbf_write( &buf_out_ptr[cid][lid * npt],
    540769                   npt,
    541                    npt * my_tid ) )
    542     {
    543         printf("\n[transpose error] thread[%x,%d] cannot access FBF\n", cxy, lid );
     770                   npt * tid ) )
     771    {
     772        printf("\n[transpose error] thread[%d] cannot access FBF\n", tid );
    544773        pthread_exit( &THREAD_EXIT_FAILURE );
    545774    }
    546775
    547 #if VERBOSE
    548 printf("\n[transpose] thread[%x,%d] completes display\n", cxy, lid );
     776#if VERBOSE_EXEC
     777printf("\n[transpose] exec[%d] on core [%x,%d] completes display\n",
     778tid, cxy , lpid );
    549779#endif
    550780
    551781    get_cycle( &date );
    552     DISP_END[cxy][lid] = (unsigned int)date;
     782    DISP_END[cid][lid] = (unsigned int)date;
    553783
    554784    /////////////////////////////////
    555785    pthread_barrier_wait( &barrier );
    556786
    557     // all threads, but thread[0,0,0], suicide
    558     if ( (cxy != cxy_main) || (lid !=  lid_main) )
    559     {
     787#if SAVE_RESULT_IMAGE
     788
     789    // Each thread[cid,0] copy buf_out to relevant part of image_out
     790    if( lid == 0 )
     791    {
     792        memcpy( image_out + offset,
     793                buf_out,
     794                buf_size );
     795    }
     796
     797#if VERBOSE_EXEC
     798printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n",
     799tid , cxy , lpid , cid );
     800#endif
     801
     802#endif
     803
     804    // Each thread[cid,0] releases local buffers buf_in and buf_out
     805    if( lid == 0 )
     806    {
     807        // release buf_in and buf_out
     808        free( buf_in );
     809        free( buf_out );
     810    }
     811   
     812    // thread termination depends on the placement policy
     813    if( PARALLEL_PLACEMENT )   
     814    {
     815        // <work> threads are running in detached mode:
     816        // each thread must signal completion by waiting on the barrier
     817        // passed in arguments before exiting
     818
     819        pthread_barrier_wait( args->barrier );
     820
    560821        pthread_exit( &THREAD_EXIT_SUCCESS );
    561822    }
     823    else
     824    {
     825        // <work> threads are running in attached mode
     826        // each thread, but the main, simply exits
     827        if ( tid != tid_main )  pthread_exit( &THREAD_EXIT_SUCCESS );
     828    }
    562829
    563830} // end execute()
     
    565832
    566833
    567 ///////////////////////
    568 void instrument( void )
     834///////////////////////////
     835void instrument( FILE * f,
     836                 char * filename )
    569837{
    570838    unsigned int x, y, l;
     839
     840#if VERBOSE_EXEC
     841printf("\n[transpose] main enters instrument\n" );
     842#endif
    571843
    572844    unsigned int min_load_start = 0xFFFFFFFF;
     
    583855    unsigned int max_disp_ended = 0;
    584856 
    585     char string[64];
    586 
    587     snprintf( string , 64 , "/home/transpose_%d_%d_%d" , x_size , y_size , ncores );
    588 
    589     // open instrumentation file
    590     FILE * f = fopen( string , NULL );
    591     if ( f == NULL )
    592     {
    593         printf("\n[transpose error] cannot open instrumentation file %s\n", string );
    594         exit( 0 );
    595     }
    596 
    597857    for (x = 0; x < x_size; x++)
    598858    {
    599859        for (y = 0; y < y_size; y++)
    600860        {
    601             unsigned int cxy = HAL_CXY_FROM_XY( x , y );
     861            unsigned int cid = y_size * x + y;
    602862
    603863            for ( l = 0 ; l < ncores ; l++ )
    604864            {
    605                 if (MMAP_START[cxy][l] < min_load_start)  min_load_start = MMAP_START[cxy][l];
    606                 if (MMAP_START[cxy][l] > max_load_start)  max_load_start = MMAP_START[cxy][l];
    607                 if (MMAP_END[cxy][l]   < min_load_ended)  min_load_ended = MMAP_END[cxy][l];
    608                 if (MMAP_END[cxy][l]   > max_load_ended)  max_load_ended = MMAP_END[cxy][l];
    609                 if (TRSP_START[cxy][l] < min_trsp_start)  min_trsp_start = TRSP_START[cxy][l];
    610                 if (TRSP_START[cxy][l] > max_trsp_start)  max_trsp_start = TRSP_START[cxy][l];
    611                 if (TRSP_END[cxy][l]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cxy][l];
    612                 if (TRSP_END[cxy][l]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cxy][l];
    613                 if (DISP_START[cxy][l] < min_disp_start)  min_disp_start = DISP_START[cxy][l];
    614                 if (DISP_START[cxy][l] > max_disp_start)  max_disp_start = DISP_START[cxy][l];
    615                 if (DISP_END[cxy][l]   < min_disp_ended)  min_disp_ended = DISP_END[cxy][l];
    616                 if (DISP_END[cxy][l]   > max_disp_ended)  max_disp_ended = DISP_END[cxy][l];
     865                if (LOAD_START[cid][l] < min_load_start)  min_load_start = LOAD_START[cid][l];
     866                if (LOAD_START[cid][l] > max_load_start)  max_load_start = LOAD_START[cid][l];
     867                if (LOAD_END[cid][l]   < min_load_ended)  min_load_ended = LOAD_END[cid][l];
     868                if (LOAD_END[cid][l]   > max_load_ended)  max_load_ended = LOAD_END[cid][l];
     869                if (TRSP_START[cid][l] < min_trsp_start)  min_trsp_start = TRSP_START[cid][l];
     870                if (TRSP_START[cid][l] > max_trsp_start)  max_trsp_start = TRSP_START[cid][l];
     871                if (TRSP_END[cid][l]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cid][l];
     872                if (TRSP_END[cid][l]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cid][l];
     873                if (DISP_START[cid][l] < min_disp_start)  min_disp_start = DISP_START[cid][l];
     874                if (DISP_START[cid][l] > max_disp_start)  max_disp_start = DISP_START[cid][l];
     875                if (DISP_END[cid][l]   < min_disp_ended)  min_disp_ended = DISP_END[cid][l];
     876                if (DISP_END[cid][l]   > max_disp_ended)  max_disp_ended = DISP_END[cid][l];
    617877            }
    618878        }
    619879    }
    620880
    621     printf( "\n ------ %s ------\n" , string );
    622     fprintf( f , "\n ------ %s ------\n" , string );
    623 
    624     printf( " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n",
    625            min_load_start, max_load_start, (min_load_start+max_load_start)/2,
    626            max_load_start-min_load_start );
    627 
    628     fprintf( f , " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n",
    629            min_load_start, max_load_start, (min_load_start+max_load_start)/2,
    630            max_load_start-min_load_start );
    631 
    632     printf( " - MMAP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    633            min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2,
    634            max_load_ended-min_load_ended );
    635 
    636     fprintf( f , " - MMAP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    637            min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2,
    638            max_load_ended-min_load_ended );
    639 
    640     printf( " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
    641            min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2,
    642            max_trsp_start-min_trsp_start );
    643 
    644     fprintf( f , " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
    645            min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2,
    646            max_trsp_start-min_trsp_start );
    647 
    648     printf( " - TRSP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    649            min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2,
    650            max_trsp_ended-min_trsp_ended );
    651 
    652     fprintf( f , " - TRSP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    653            min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2,
    654            max_trsp_ended-min_trsp_ended );
    655 
    656     printf( " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
    657            min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2,
    658            max_disp_start-min_disp_start );
    659 
    660     fprintf( f , " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
    661            min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2,
    662            max_disp_start-min_disp_start );
    663 
    664     printf( " - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    665            min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2,
    666            max_disp_ended-min_disp_ended );
    667 
    668     fprintf( f , " - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    669            min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2,
    670            max_disp_ended-min_disp_ended );
    671 
    672     fclose( f );
     881    printf( "\n ------ %s ------\n" , filename );
     882    fprintf( f , "\n ------ %s ------\n" , filename );
     883
     884    printf( " - LOAD_START : min = %d / max = %d / delta = %d\n",
     885           min_load_start, max_load_start, max_load_start-min_load_start );
     886    fprintf( f , " - LOAD_START : min = %d / max = %d / delta = %d\n",
     887           min_load_start, max_load_start, max_load_start-min_load_start );
     888
     889    printf( " - LOAD_END   : min = %d / max = %d / delta = %d\n",
     890           min_load_ended, max_load_ended, max_load_ended-min_load_ended );
     891    fprintf( f , " - LOAD_END   : min = %d / max = %d / delta = %d\n",
     892           min_load_ended, max_load_ended, max_load_ended-min_load_ended );
     893
     894    printf( " - TRSP_START : min = %d / max = %d / delta = %d\n",
     895           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start );
     896    fprintf( f , " - TRSP_START : min = %d / max = %d / delta = %d\n",
     897           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start );
     898
     899    printf( " - TRSP_END   : min = %d / max = %d / delta = %d\n",
     900           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended );
     901    fprintf( f , " - TRSP_END   : min = %d / max = %d / delta = %d\n",
     902           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended );
     903
     904    printf( " - DISP_START : min = %d / max = %d / delta = %d\n",
     905           min_disp_start, max_disp_start, max_disp_start-min_disp_start );
     906    fprintf( f , " - DISP_START : min = %d / max = %d / delta = %d\n",
     907           min_disp_start, max_disp_start, max_disp_start-min_disp_start );
     908
     909    printf( " - DISP_END   : min = %d / max = %d / delta = %d\n",
     910           min_disp_ended, max_disp_ended, max_disp_ended-min_disp_ended );
     911    fprintf( f , " - DISP_END   : min = %d / max = %d / delta = %d\n",
     912           min_disp_ended, max_disp_ended, max_disp_ended-min_disp_ended );
     913
     914    printf( "\n   Sequencial = %d / Parallel = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
     915    fprintf( f , "\n   Sequencial = %d / Parallel = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
    673916
    674917}  // end instrument()
  • trunk/user/transpose/transpose.ld

    r646 r652  
    1 /****************************************************************************
     1/***************************************************************************
    22* Definition of the base address for all virtual segments
    3 *****************************************************************************/
     3***************************************************************************/
    44
    55seg_code_base      = 0x400000;
     6
     7/***************************************************************************
     8* Define code entry point (e_entry field in .elf file)
     9***************************************************************************/
     10
     11ENTRY( main )
    612
    713/***************************************************************************
Note: See TracChangeset for help on using the changeset viewer.