//////////////////////////////////////////////////////////////////////////////////////////
// File   : transpose.c   
// Date   : september 2019
// author : Alain Greiner
//////////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application reads a raw image (one byte per pixel)
// stored on disk, transposes it, displays the result on the frame buffer,
// and stores the transposed image on disk.
//
// The image size and the pixel encoding type are defined by the IMAGE_SIZE and
// IMAGE_TYPE global parameters.
//
// It can run on a multi-cores, multi-clusters architecture, where (X_SIZE * Y_SIZE)
// is the number of clusters and NCORES the number of cores per cluster.
// A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier,
// (that is NOT required to be a continuous index), and lid is the local core index,
// (that must be in the [0,NCORES-1] range).
//
// The main() function can run on any core in any cluster. This main thread
// makes the initialisations, load the input file to the "image_in" buffer,
// launches the working threads, calls the instrument() function when all working
// threads complete, and saves the result "image_out" buffer to the output file.
//
// The number of working threads is always defined by the number of cores available
// in the architecture, but this application supports three placement modes.
// In all modes, the working threads are identified by the continuous index [tid]
// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
// This continuous index can always be decomposed in two continuous sub-indexes:
// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
//
// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
//   threads are created by the main thread, but the placement is done by the OS, using
//   the DQDT for load balancing, and two working threads can be placed on the same core.
//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
//   cluster or a physical core. In this mode, the main thread runs on any cluster,
//   but has tid = 0 (i.e. cid = 0 & lid = 0).
//
// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
//   of the threads on the cores is explicitly controlled by the main thread to have
//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
//   physical cluster identifier, and [lid] is the local core index.
//
// - PARALLEL_PLACEMENT: the main thread is no longer a working thread, and uses the
//   non-standard pthread_parallel_create() function to avoid the costly sequential
//   loops for pthread_create() and pthread_join(). It guarantees one working thread
//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
//    
// Each working thread[cid][lid] runs the "execute" function, that uses the "buf_in" and
// "buf_out" local buffers, containing the direct and transposed images:
// Each thread[cid][0] allocates two buf_in[cid] and buf_out[cid] buffers, loads from
// "image_in" to buf_in[cid] all lines that must be handled by the threads sharing the
// same cid, and finally saves from buf_out[cid] to "image_out" all lines that have been
// transposed to buf_out[cid].
// Each thread[cid][lid] in the group defined by the cid index reads pixels from the
// local buf_in[cid] buffer, and writes pixels to all remote buf_out[cid] buffers.
//
// - The image  must fit the frame buffer size, that must be power of 2.
// - The number of clusters  must be a power of 2 no larger than 256.
// - The number of cores per cluster must be a power of 2 no larger than 4.
// - The number of threads cannot be larger than IMAGE_SIZE.
//
//////////////////////////////////////////////////////////////////////////////////////////

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <almosmkh.h>
#include <fcntl.h>
#include <hal_macros.h>

// Upper bounds on the platform size: they dimension the statically allocated
// per-cluster and per-thread arrays declared below (instrumentation counters,
// distributed buffer pointers, thread identifiers, arguments and attributes).
#define X_MAX                 16                           // max number of clusters in a row
#define Y_MAX                 16                           // max number of clusters in a column
#define CORES_MAX             4                            // max number of cores per cluster
#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters
#define THREADS_MAX           (X_MAX * Y_MAX * CORES_MAX)  // max number of threads

// pixel encoding type: main() checks it against the type returned by fbf_get_config()
#define IMAGE_TYPE            420                          // pixel encoding type

//#define IMAGE_SIZE            128                          // image size 
//#define INPUT_FILE_PATH       "/misc/images_128.raw"       // input file pathname 
//#define OUTPUT_FILE_PATH      "/misc/transposed_128.raw"   // output file pathname 

//#define IMAGE_SIZE            256                          // image size 
//#define INPUT_FILE_PATH       "/misc/lena_256.raw"         // input file pathname 
//#define OUTPUT_FILE_PATH      "/misc/transposed_256.raw"   // output file pathname 

//#define IMAGE_SIZE            512                          // image size 
//#define INPUT_FILE_PATH       "/misc/couple_512.raw"       // input file pathname 
//#define OUTPUT_FILE_PATH      "/misc/transposed_512.raw"   // output file pathname 

// Selected image: the image is square (IMAGE_SIZE lines of IMAGE_SIZE pixels)
#define IMAGE_SIZE            1024                         // number of lines / pixels per line
#define INPUT_FILE_PATH       "/misc/philips_1024.raw"     // input file pathname 
#define OUTPUT_FILE_PATH      "/misc/transposed_1024.raw"  // output file pathname 

// NOTE(review): SAVE_RESULT_FILE is not tested in the visible part of this file
#define SAVE_RESULT_FILE      0                            // save result image on disk
#define USE_DQT_BARRIER       0                            // quad-tree barrier if non zero 

// Placement mode: exactly one of the three values below must be non zero
// (this is checked at the beginning of the main() function).
#define NO_PLACEMENT          0                            // uncontrolled threads placement
#define EXPLICIT_PLACEMENT    1                            // explicit threads placement
#define PARALLEL_PLACEMENT    0                            // parallel threads placement

#define VERBOSE_MAIN          1                            // main function prints progress messages
#define VERBOSE_MAIN_DETAILED 0                            // main function prints one message per created / joined thread
#define VERBOSE_EXEC          1                            // exec function prints progress messages


///////////////////////////////////////////////////////
//                global variables 
///////////////////////////////////////////////////////

// global instrumentation counters for the main thread:
// number of cycles measured by main() before and during the threads execution
unsigned int SEQUENCIAL_TIME = 0;
unsigned int PARALLEL_TIME   = 0;

// instrumentation counters for each thread in each cluster 
// indexed by [cid][lid] : cluster continuous index / thread local index.
// Each entry is the date returned by get_cycle() (cast to unsigned int), registered
// by the execute() function at the start / end of the phase named by the array.
unsigned int ALOC_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int ALOC_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int LOAD_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int SAVE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int SAVE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int FREE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int FREE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};

// buffer containing the input image, loaded by the main from input file
unsigned char  image_in[IMAGE_SIZE * IMAGE_SIZE];

// buffer containing the output image, saved by the main to output file
unsigned char  image_out[IMAGE_SIZE * IMAGE_SIZE];

// arrays of pointers on distributed buffers indexed by [cid] 
// (each entry is allocated and released by thread[cid][0] in execute())
unsigned char *  buf_in [CLUSTERS_MAX];
unsigned char *  buf_out[CLUSTERS_MAX];

// pointer and identifier for dynamically allocated FBF window 
// (both are set by the fbf_create_window() call in main())
void   *  win_buf;
int       wid;

// synchronisation barrier (all working threads), initialised by main()
pthread_barrier_t   barrier;

// platform parameters (copied by main() from the structure filled by get_config())
unsigned int  x_size;              // number of clusters in a row
unsigned int  y_size;              // number of clusters in a column
unsigned int  ncores;              // number of cores per cluster

// main thread continuous index
// (set by main() in the NO_PLACEMENT and EXPLICIT_PLACEMENT modes)
unsigned int     tid_main; 

// return values at thread exit: the address of one of these variables is passed
// to pthread_exit(), and the pointed value is checked by main() after pthread_join()
unsigned int THREAD_EXIT_SUCCESS = 0;
unsigned int THREAD_EXIT_FAILURE = 1;

// array of kernel thread identifiers / indexed by [tid]
pthread_t                     exec_trdid[THREADS_MAX];   

// array of execute function arguments / indexed by [tid]
pthread_parallel_work_args_t  exec_args[THREADS_MAX];

// array of thread attributes / indexed by [tid]
// (only used in the EXPLICIT_PLACEMENT mode)
pthread_attr_t                exec_attr[THREADS_MAX];

////////////////////////////////////////////////////////////////
//             functions declaration
////////////////////////////////////////////////////////////////

// Function executed by each working thread.
// The <arguments> pointer is cast to a (pthread_parallel_work_args_t *).
void * execute( void * arguments );

// Called by main() when all working threads have completed, with the open
// instrumentation file <f> and the instrumentation file name <filename>.
void instrument( FILE * f , char * filename );

////////////////
int main( void )
{
    unsigned long long start_cycle;
    unsigned long long end_sequencial_cycle;
    unsigned long long end_parallel_cycle;

    char               filename[32];      // instrumentation file name
    char               pathname[64];      // instrumentation file pathname

    int error;

    /////////////////////////////////////////////////////////////////////////////////
    get_cycle( &start_cycle );
    /////////////////////////////////////////////////////////////////////////////////

    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
    {
        printf("\n[transpose error] illegal placement\n");
        exit( 0 );
    }

    // get & check plat-form parameters
    hard_config_t  config;
    get_config( &config );
    x_size = config.x_size;
    y_size = config.y_size;
    ncores = config.ncores;

    if((ncores != 1) && (ncores != 2) && (ncores != 4))
    {
        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
        exit( 0 );
    }

    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
        (x_size != 8) && (x_size != 16) )
    {
        printf("\n[transpose error] x_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
        (y_size != 8) && (y_size != 16) )
    {
        printf("\n[transpose error] y_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    // get identifiers for core executing main
    unsigned int  cxy_main;
    unsigned int  lid_main;
    get_core_id( &cxy_main , &lid_main );

    // compute number of threads
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

    if( nthreads > IMAGE_SIZE )
    {
        printf("\n[transpose error] number of threads larger than number of lines\n");
        exit( 0 );
    }

    // get FBF size and type
    unsigned int   fbf_width;
    unsigned int   fbf_height;
    unsigned int   fbf_type;
    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );

    if( (fbf_width < IMAGE_SIZE) || (fbf_height < IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) )
    {
        printf("\n[transpose error] image does not fit FBF size or type\n");
        exit( 0 );
    }

    // define total number of pixels
    int npixels = IMAGE_SIZE * IMAGE_SIZE;

    // define instrumentation file name
    if( NO_PLACEMENT )
    {
        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / NO_PLACE\n",
        nclusters, ncores, INPUT_FILE_PATH , getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d",
        IMAGE_SIZE , x_size * y_size , ncores );
        else
        snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d",
        IMAGE_SIZE , x_size * y_size , ncores );
    }

    if( EXPLICIT_PLACEMENT )
    {
        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / EXPLICIT\n",
        nclusters, ncores, INPUT_FILE_PATH , getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d",
        IMAGE_SIZE , x_size * y_size , ncores );
        else
        snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d",
        IMAGE_SIZE , x_size * y_size , ncores );
    }

    if( PARALLEL_PLACEMENT )
    {
        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / PARALLEL\n",
        nclusters, ncores, INPUT_FILE_PATH , getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d",
        IMAGE_SIZE , x_size * y_size , ncores );
        else
        snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d",
        IMAGE_SIZE , x_size * y_size , ncores );
    }

    // open a window in FBF
    wid = fbf_create_window( 0,             // l_zero
                             0,             // p_zero
                             IMAGE_SIZE,    // lines
                             IMAGE_SIZE,    // pixels
                             &win_buf );
    if( wid < 0) 
    {
        printf("\n[transpose error] cannot open FBF window\n");
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[transpose] main on core[%x,%d] created FBF window %d / buffer %x\n",
cxy_main, lid_main, wid , win_buf );
#endif

    // open instrumentation file
    snprintf( pathname , 64 , "/home/%s", filename );
    FILE * f = fopen( pathname , NULL );

    if ( f == NULL ) 
    { 
        printf("\n[transpose error] cannot open instru file %s\n", pathname );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n",
cxy_main, lid_main, pathname );
#endif

    // main thread initializes barrier 
    if( USE_DQT_BARRIER )
    {
        pthread_barrierattr_t attr;
        attr.x_size   = x_size;
        attr.y_size   = y_size;
        attr.nthreads = ncores;
        error = pthread_barrier_init( &barrier, &attr , nthreads );
    }
    else
    {
        error = pthread_barrier_init( &barrier, NULL , nthreads );
    }

    if( error )
    { 
        printf("\n[transpose error] main cannot initialize barrier\n" );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[transpose] main on core[%x,%d] completed barrier initialisation\n",
cxy_main, lid_main );
#endif

    // open input file
    int fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); 

    if ( fd_in < 0 ) 
    { 
        printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[transpose] main open file <%s> / fd = %d\n", INPUT_FILE_PATH , fd_in );
#endif

    // open output file
    int fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); 

    if ( fd_out < 0 ) 
    { 
        printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
        exit( 0 );
    }

    // move input image to input buffer
    if( read( fd_in , image_in , npixels ) != npixels )
    {
        printf("\n[transpose error] main cannot read input image\n");
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[transpose] main moved file <%s> to buf_in\n", INPUT_FILE_PATH );
#endif

    /////////////////////////////////////////////////////////////////////////////////////
    get_cycle( &end_sequencial_cycle );
    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
    /////////////////////////////////////////////////////////////////////////////////////

    //////////////////
    if( NO_PLACEMENT )
    {
        // the tid value for the main thread is always 0
        // main thread creates new threads with tid in [1,nthreads-1]  
        unsigned int tid;
        for ( tid = 0 ; tid < nthreads ; tid++ )
        {
            // register tid value in exec_args[tid] array
            exec_args[tid].tid = tid;
            
            // create other threads
            if( tid > 0 )
            {
                if ( pthread_create( &exec_trdid[tid], 
                                     NULL,                  // no attribute
                                     &execute,
                                     &exec_args[tid] ) ) 
                {
                    printf("\n[transpose error] cannot create thread %d\n", tid );
                    exit( 0 );
                }

#if VERBOSE_MAIN_DETAILED
printf("\n[transpose] main created thread %d\n", tid );
#endif

            }
            else
            {
                tid_main = 0;
            }
        }  // end for tid

        // main thread calls itself the execute() function
        execute( &exec_args[0] );

        // main thread wait other threads completion
        for ( tid = 1 ; tid < nthreads ; tid++ )
        {
            unsigned int * status;

            // main wait thread[tid] status
            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
            {
                printf("\n[transpose error] main cannot join thread %d\n", tid );
                exit( 0 );
            }
       
            // check status
            if( *status != THREAD_EXIT_SUCCESS )
            {
                printf("\n[transpose error] thread %x returned failure\n", tid );
                exit( 0 );
            }

#if VERBOSE_MAIN_DETAILED
printf("\n[transpose] main joined thread %x\n", tid );
#endif
        
        }  // end for tid

    }  // end if no_placement

    ////////////////////////
    if( EXPLICIT_PLACEMENT )
    {
        // main thread places each other threads on a specific core[cxy][lid]
        // but the actual thread creation is sequencial
        unsigned int x;
        unsigned int y;
        unsigned int l;
        unsigned int cxy;                   // cluster identifier
        unsigned int tid;                   // thread continuous index

        for( x = 0 ; x < x_size ; x++ )
        {
            for( y = 0 ; y < y_size ; y++ )
            {
                cxy = HAL_CXY_FROM_XY( x , y );
                for( l = 0 ; l < ncores ; l++ )
                {
                    // compute thread continuous index
                    tid = (((x  * y_size) + y) * ncores) + l;

                    // register tid value in exec_args[tid] array
                    exec_args[tid].tid = tid;

                    // no thread created on the core running the main
                    if( (cxy != cxy_main) || (l != lid_main) )
                    {
                        // define thread attributes
                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
                                                    PT_ATTR_CORE_DEFINED;
                        exec_attr[tid].cxy        = cxy;
                        exec_attr[tid].lid        = l;
  
                        // create thread[tid] on core[cxy][l]
                        if ( pthread_create( &exec_trdid[tid],    
                                             &exec_attr[tid],    
                                             &execute,
                                             &exec_args[tid] ) )       
                        {
                            printf("\n[transpose error] cannot create thread %d\n", tid );
                            exit( 0 );
                        }

#if VERBOSE_MAIN_DETAILED
printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
#endif
                    }
                    else
                    {
                        tid_main = tid;
                    }
                }
            }
        }

        // main thread calls itself the execute() function
        execute( &exec_args[tid_main] );

        // main thread wait other threads completion
        for( tid = 0 ; tid < nthreads ; tid++ )
        {
            // no other thread on the core running the main
            if( tid != tid_main )
            {
                unsigned int * status;

                // wait thread[tid]
                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
                {
                    printf("\n[transpose error] main cannot join thread %d\n", tid );
                    exit( 0 );
                }
       
                // check status
                if( *status != THREAD_EXIT_SUCCESS )
                {
                    printf("\n[transpose error] thread %d returned failure\n", tid );
                    exit( 0 );
                }

#if VERBOSE_MAIN_DETAILED
printf("\n[transpose] main joined thread %d\n", tid );
#endif
            }
        }
    }  // end if explicit_placement

    ////////////////////////
    if( PARALLEL_PLACEMENT )
    {
        // compute covering DQT size an level
        unsigned int z          = (x_size > y_size) ? x_size : y_size;
        unsigned int root_level = ((z == 1) ? 0 : 
                                  ((z == 2) ? 1 : 
                                  ((z == 4) ? 2 : 
                                  ((z == 8) ? 3 : 4))));

        // create & execute the working threads
        if( pthread_parallel_create( root_level , &execute ) )
        {
            printf("\n[transpose error] in %s\n", __FUNCTION__ );
            exit( 0 );
        }
    }  // end if parallel_placement


    /////////////////////////////////////////////////////////////////////////////
    get_cycle( &end_parallel_cycle );
    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
    /////////////////////////////////////////////////////////////////////////////

    // register instrumentation results
    instrument( f , filename );

#if VERBOSE_MAIN
printf("\n[transpose] main completed instrumentation\n");
#endif

/*
    printf("\n> ");
    getchar();

    // move window
    if( fbf_move_window( wid , 100 , 100 ) )
    {
        printf("\n[transpose error] main cannot move FBF window\n");
        exit( 0 );
    }

    printf("\n> ");
    getchar();
*/    
    // save image_out to output file
    if( write( fd_out , image_out , npixels ) != npixels )
    {
        printf("\n[transpose error] main cannot write output image\n");
        exit( 0 );
    }

#if VERBOSE_MAIN
printf("\n[transpose] main saved buf_out to output file\n");
#endif

    // close input file 
    close( fd_in );

#if VERBOSE_MAIN
printf("\n[transpose] main closed input file\n");
#endif

    // close output file 
    close( fd_out );

#if VERBOSE_MAIN
printf("\n[transpose] main closed output file\n");
#endif

    // close instrumentation file
    fclose( f );

#if VERBOSE_MAIN
printf("\n[transpose] main closed instrumentation file\n");
#endif

    // delete FBF window 
    if( fbf_delete_window( wid ) )
    {
        printf("\n[transpose error] main cannot delete FBF window\n");
        exit( 0 );
    }

    // main thread suicide
    exit( 0 );
    
    return 0;

} // end main()




//////////////////////////////////
// This function is executed by each working thread (including the main thread in the
// NO_PLACEMENT and EXPLICIT_PLACEMENT modes). It contains five phases, separated by
// the global <barrier>, and registers the start / end dates of each phase in the
// instrumentation arrays indexed by [cid][lid]:
//   - ALOC : thread[cid][0] allocates the buf_in[cid] and buf_out[cid] buffers.
//   - LOAD : each thread copies its lines from image_in to buf_in[cid] and to the
//            FBF window, and refreshes these lines in the window.
//   - TRSP : each thread transposes its lines from buf_in[] to the buf_out[] buffers.
//   - SAVE : each thread copies its part of buf_out[cid] to the FBF window
//            (with a refresh) and to image_out.
//   - FREE : thread[cid][0] releases the buf_in[cid] and buf_out[cid] buffers.
// @ arguments : pointer on a pthread_parallel_work_args_t structure: the <tid> field
//               is always used, the <barrier> field is only used if PARALLEL_PLACEMENT.
// @ returns NULL when executed by the main thread. The other threads do not return:
//   they call pthread_exit() with the address of THREAD_EXIT_SUCCESS / THREAD_EXIT_FAILURE.
void * execute( void * arguments ) 
{
    unsigned long long   date;
    unsigned int         l;         // line index for loop
    unsigned int         p;         // pixel index for loop
    int                  error;

    // FBF window buffer accessed as an array of bytes
    unsigned char      * wbuf = win_buf;
 
    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;

    // WARNING 
    // A thread is identified by the tid index, defined in the "args" structure.
    // This index being in range [0,nclusters*ncores-1] we can always write
    //       tid == cid * ncores + lid 
    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
    // if NO_PLACEMENT, there is no relation between these
    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]

    // get thread abstract identifiers 
    unsigned int tid = args->tid;
    unsigned int cid = tid / ncores;    // abstract cluster index
    unsigned int lid = tid % ncores;    // local thread index

#if VERBOSE_EXEC
unsigned int cxy;
unsigned int lpid;
get_core_id( &cxy , &lpid );   // get core physical identifiers
printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n",
tid , cxy , lpid );
#endif

    get_cycle( &date );
    ALOC_START[cid][lid] = (unsigned int)date;

    // compute total number of pixels per image
    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;      

    // compute total number of threads and clusters
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

    // compute number of pixels per cid & per thread
    unsigned int pixels_per_cid = npixels / nclusters;
    unsigned int pixels_per_lid = pixels_per_cid / ncores;

    // compute number of lines per cid & per thread
    unsigned int lines_per_cid = pixels_per_cid / IMAGE_SIZE;
    unsigned int lines_per_lid = pixels_per_lid / IMAGE_SIZE;

    // compute first line handled by this thread, and first line of the next thread
    unsigned int line_first = (cid * lines_per_cid) + (lid * lines_per_lid);
    unsigned int line_last  = line_first + lines_per_lid;

    // Each thread[cid,0] allocates two local buffers, and register the base 
    // adresses in the global arrays buf_in[cid] & buf_out[cid].
    // NOTE(review): a thread exiting here on allocation failure never reaches the
    // barriers below, that have been initialised by main for all working threads.
    
    if( lid == 0 )
    {
        // allocate buf_in
        buf_in[cid] = (unsigned char *)malloc( pixels_per_cid );

        if( buf_in[cid] == NULL )
        {
            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n",
tid , cxy , lpid , buf_in[cid] );
#endif

        // allocate buf_out
        buf_out[cid] = (unsigned char *)malloc( pixels_per_cid );

        if( buf_out[cid] == NULL )
        {
            printf("\n[transpose error] thread[%d] cannot allocate buf_out\n", tid );

            // release the already allocated buf_in before exit
            free( buf_in[cid] );
            buf_in[cid] = NULL;

            pthread_exit( &THREAD_EXIT_FAILURE );
        }

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n",
tid , cxy , lpid , buf_out[cid] );
#endif

    }

    get_cycle( &date );
    ALOC_END[cid][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );
    /////////////////////////////////

    get_cycle( &date );
    LOAD_START[cid][lid] = (unsigned int)date;

    // all threads copy relevant part of the image_in to buf_in[cid]
    memcpy( buf_in[cid] + (lid * pixels_per_lid), 
            image_in + (cid * pixels_per_cid) + (lid * pixels_per_lid),
            pixels_per_lid );

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n",
tid , cxy , lpid , cid );
#endif

    // all local threads copy part of buf_in[cid] to FBF window for display
    memcpy( wbuf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
            buf_in[cid] + (lid * pixels_per_lid),
            pixels_per_lid );

#if  VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in to FBF (first %d / last %d)\n",
tid , cxy , lpid , line_first , line_last );
#endif

    // refresh window
    error = fbf_refresh_window( wid , line_first , line_last );

    if( error )
    {
        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
        exit( 0 );
    }

    get_cycle( &date );
    LOAD_END[cid][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );
    /////////////////////////////////

    get_cycle( &date );
    TRSP_START[cid][lid] = (unsigned int)date;

    // All threads contribute to parallel transpose from buf_in to buf_out:
    // each thread makes the transposition for nlt lines (nlt = IMAGE_SIZE/nthreads)
    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
    // (p,l) are the absolute pixel coordinates in the source image
    // (l,p) are the absolute pixel coordinates in the dest image

    unsigned int nlt   = IMAGE_SIZE / nthreads;    // number of lines per thread
    unsigned int nlc   = IMAGE_SIZE / nclusters;   // number of lines per cluster

    unsigned int src_cid;
    unsigned int src_index;
    unsigned int dst_cid;
    unsigned int dst_index;

    unsigned char byte;

    unsigned int first = tid * nlt;        // first line index for a given thread
    unsigned int last  = first + nlt;      // last line index (excluded) for a given thread

    // loop on lines handled by this thread
    for ( l = first ; l < last ; l++ )
    {
        // loop on pixels in one line (one pixel per iteration)
        for ( p = 0 ; p < IMAGE_SIZE ; p++ )
        {
            // read one byte from buf_in: source line l is stored in buf_in[l / nlc]
            src_cid   = l / nlc;
            src_index = (l % nlc) * IMAGE_SIZE + p;

            byte = buf_in[src_cid][src_index];

            // write one byte to buf_out: dest line p is stored in buf_out[p / nlc]
            dst_cid   = p / nlc; 
            dst_index = (p % nlc) * IMAGE_SIZE + l;

            buf_out[dst_cid][dst_index] = byte;
        }
    }

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n",
tid , cxy , lpid );
#endif

    get_cycle( &date );
    TRSP_END[cid][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );
    /////////////////////////////////

    get_cycle( &date );
    SAVE_START[cid][lid] = (unsigned int)date;

    // each local threads copy part of buf_out[cid] to FBF window for display
    memcpy( wbuf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
            buf_out[cid] + (lid * pixels_per_lid),
            pixels_per_lid );

#if  VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_out to FBF (first %d / last %d)\n",
tid , cxy , lpid , line_first , line_last );
#endif

    // refresh window
    error = fbf_refresh_window( wid , line_first , line_last );

    if( error )
    {
        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
        exit( 0 );
    }

    // each local thread copy relevant part of buf_out to image_out
    memcpy( image_out + (cid * pixels_per_cid) + (lid * pixels_per_lid),
            buf_out[cid] + (lid * pixels_per_lid),
            pixels_per_lid );

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n",
tid , cxy , lpid , cid );
#endif

    get_cycle( &date );
    SAVE_END[cid][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );
    /////////////////////////////////

    get_cycle( &date );
    FREE_START[cid][lid] = (unsigned int)date;

    // Each thread[cid,0] release local buffers buf_in & buf_out
    // (no thread accesses these buffers after the previous barrier)

    if( lid == 0 )
    {
        // release local buffers
        free( buf_in[cid] );
        free( buf_out[cid] );

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] released buf_in & buf_out\n",
tid , cxy , lpid );
#endif

    }

    get_cycle( &date );
    FREE_END[cid][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );
    /////////////////////////////////
    
    // thread termination depends on the placement policy
    if( PARALLEL_PLACEMENT )   
    {
        // <work> threads are running in detached mode, and
        // each thread must signal completion by calling barrier
        // passed in arguments before exit

        pthread_barrier_wait( args->barrier );

        pthread_exit( &THREAD_EXIT_SUCCESS );
    }
    else
    {
        // <work> threads are running in attached mode
        // each thread, but the main, simply exit
        if ( tid != tid_main )  
        {

#if VERBOSE_EXEC
printf("\n[transpose] exec[%d] on core[%x,%d] exit\n",
tid , cxy , lpid );
#endif
            pthread_exit( &THREAD_EXIT_SUCCESS );
        }
    }

    // only reached by the main thread, that returns to the main() function
    return NULL;

} // end execute()



//////////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument(): widen the [*min , *max] interval so that it contains <value>.
//////////////////////////////////////////////////////////////////////////////////////////
static void instrument_update_range( unsigned int   value,
                                     unsigned int * min,
                                     unsigned int * max )
{
    if( value < *min )  *min = value;
    if( value > *max )  *max = value;
}

//////////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument(): write one "min / max / delta" report line, identified
// by <label>, both on the standard output and on the <f> stream.
// Using one single helper for the two outputs guarantees that both always
// report the same values.
//////////////////////////////////////////////////////////////////////////////////////////
static void instrument_print_range( FILE       * f,
                                    const char * label,
                                    unsigned int min,
                                    unsigned int max )
{
    printf( " - %s : min = %d / max = %d / delta = %d\n",
            label, min, max, max - min );
    fprintf( f , " - %s : min = %d / max = %d / delta = %d\n",
            label, min, max, max - min );
}

//////////////////////////////////////////////////////////////////////////////////////////
// This function scans the instrumentation arrays (ALOC, LOAD, TRSP, SAVE, FREE,
// each one with a _START and a _END variant), indexed by [cid][lid], for all
// (x_size * y_size) clusters and all <ncores> cores per cluster.
// For each array, it computes the minimal and maximal recorded values, and
// reports min / max / delta. It then reports the SEQUENCIAL_TIME and PARALLEL_TIME
// values, and the duration of each phase (max END - min START), all divided by 1000.
// The same report is written on the standard output and on the <f> stream.
//
// @ f        : stream receiving a copy of the report.
// @ filename : name displayed in the report header.
//////////////////////////////////////////////////////////////////////////////////////////
void instrument( FILE * f,
                 char * filename )
{
    unsigned int cid;
    unsigned int l;

    // each min starts at the largest possible value, and each max at zero,
    // to be overwritten by the first scanned entry
    unsigned int min_aloc_start = 0xFFFFFFFF;
    unsigned int max_aloc_start = 0;
    unsigned int min_aloc_ended = 0xFFFFFFFF;
    unsigned int max_aloc_ended = 0;
    unsigned int min_load_start = 0xFFFFFFFF;
    unsigned int max_load_start = 0;
    unsigned int min_load_ended = 0xFFFFFFFF;
    unsigned int max_load_ended = 0;
    unsigned int min_trsp_start = 0xFFFFFFFF;
    unsigned int max_trsp_start = 0;
    unsigned int min_trsp_ended = 0xFFFFFFFF;
    unsigned int max_trsp_ended = 0;
    unsigned int min_save_start = 0xFFFFFFFF;
    unsigned int max_save_start = 0;
    unsigned int min_save_ended = 0xFFFFFFFF;
    unsigned int max_save_ended = 0;
    unsigned int min_free_start = 0xFFFFFFFF;
    unsigned int max_free_start = 0;
    unsigned int min_free_ended = 0xFFFFFFFF;
    unsigned int max_free_ended = 0;

    // scan all [cid][l] entries to get the extreme values of each array
    for( cid = 0 ; cid < (x_size * y_size) ; cid++ )
    {
        for( l = 0 ; l < ncores ; l++ )
        {
            instrument_update_range( ALOC_START[cid][l], &min_aloc_start, &max_aloc_start );
            instrument_update_range( ALOC_END[cid][l]  , &min_aloc_ended, &max_aloc_ended );
            instrument_update_range( LOAD_START[cid][l], &min_load_start, &max_load_start );
            instrument_update_range( LOAD_END[cid][l]  , &min_load_ended, &max_load_ended );
            instrument_update_range( TRSP_START[cid][l], &min_trsp_start, &max_trsp_start );
            instrument_update_range( TRSP_END[cid][l]  , &min_trsp_ended, &max_trsp_ended );
            instrument_update_range( SAVE_START[cid][l], &min_save_start, &max_save_start );
            instrument_update_range( SAVE_END[cid][l]  , &min_save_ended, &max_save_ended );
            instrument_update_range( FREE_START[cid][l], &min_free_start, &max_free_start );
            instrument_update_range( FREE_END[cid][l]  , &min_free_ended, &max_free_ended );
        }
    }

    // report header
    printf( "\n ------ %s ------\n" , filename );
    fprintf( f , "\n ------ %s ------\n" , filename );

    // one line per instrumentation array
    // (labels are padded to keep the ':' separators aligned)
    instrument_print_range( f , "ALOC_START", min_aloc_start, max_aloc_start );
    instrument_print_range( f , "ALOC_END  ", min_aloc_ended, max_aloc_ended );
    instrument_print_range( f , "LOAD_START", min_load_start, max_load_start );
    instrument_print_range( f , "LOAD_END  ", min_load_ended, max_load_ended );
    instrument_print_range( f , "TRSP_START", min_trsp_start, max_trsp_start );
    instrument_print_range( f , "TRSP_END  ", min_trsp_ended, max_trsp_ended );
    instrument_print_range( f , "SAVE_START", min_save_start, max_save_start );
    instrument_print_range( f , "SAVE_END  ", min_save_ended, max_save_ended );
    instrument_print_range( f , "FREE_START", min_free_start, max_free_start );
    instrument_print_range( f , "FREE_END  ", min_free_ended, max_free_ended );

    // duration of each phase : from the first thread entering the phase
    // to the last thread leaving it, divided by 1000
    unsigned int aloc_time = (max_aloc_ended - min_aloc_start) / 1000;
    unsigned int load_time = (max_load_ended - min_load_start) / 1000;
    unsigned int trsp_time = (max_trsp_ended - min_trsp_start) / 1000;
    unsigned int save_time = (max_save_ended - min_save_start) / 1000;
    unsigned int free_time = (max_free_ended - min_free_start) / 1000;

    printf( "\n   Sequencial %d"
            "\n   Parallel   %d"
            "\n   Alloc      %d"
            "\n   Load       %d"
            "\n   Transpose  %d"
            "\n   Save       %d"
            "\n   Free       %d\n" ,
            SEQUENCIAL_TIME / 1000 ,
            PARALLEL_TIME / 1000 ,
            aloc_time ,
            load_time ,
            trsp_time ,
            save_time ,
            free_time );

    fprintf( f , "\n   Sequencial %d"
            "\n   Parallel   %d"
            "\n   Alloc      %d"
            "\n   Load       %d"
            "\n   Transpose  %d"
            "\n   Save       %d"
            "\n   Free       %d\n" ,
            SEQUENCIAL_TIME / 1000 ,
            PARALLEL_TIME / 1000 ,
            aloc_time ,
            load_time ,
            trsp_time ,
            save_time ,
            free_time );

}  // end instrument()




