///////////////////////////////////////////////////////////////////////////////////////
// File   : convol.c  
// Date   : june 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a 2D convolution product.  
// It can run on a multi-cores, multi-clusters architecture, with one thread
// per core, and uses the POSIX threads API.
// 
// The main() function can be launched on any processor P[x,y,l].
// It performs the initialisations, launches (N-1) threads to run the execute()
// function on the (N-1) processors other than P[x,y,l], calls the execute()
// function itself, and finally calls the instrument() function to display the
// instrumentation results when the parallel execution is completed.
//
// The convolution kernel is defined in the execute() function.
// It can be factored in two independent line and column convolution products.
// The five buffers containing the image are distributed in clusters.
// For the philips image, the kernel is a [201]*[35] pixels rectangle.
// 
// The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
//
// - number of clusters containing processors must be power of 2 no larger than 256.
// - number of processors per cluster must be power of 2 no larger than 4.
//
// The number N of working threads is always defined by the number of cores available
// in the architecture, but this application supports three placement modes.
// In all modes, the working threads are identified by the [tid] continuous index
// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
// This continuous index can always be decomposed in two continuous sub-indexes:
// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
//
// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
//   threads are created by the main thread, but the placement is done by the OS, using
//   the DQDT for load balancing, and two working threads can be placed on the same core.
//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
//   cluster or a physical core. In this mode, the main thread runs on any cluster,
//   but has tid = 0 (i.e. cid = 0 & lid = 0).
//
// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
//   of the threads on the cores is explicitly controlled by the main thread to have
//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
//   physical cluster identifier, and [lid] is the local core index.
//
// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
//   non standard pthread_parallel_create() function to avoid the costly sequential
//   loops for pthread_create() and pthread_join(). It guarantees one working thread
//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
//
// The [tid] continuous index defines how the work is shared amongst the threads:
// - each thread handles NL/nthreads lines for the horizontal filter.
// - each thread handles NP/nthreads columns for the vertical filter.
///////////////////////////////////////////////////////////////////////////////////////

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <almosmkh.h>
#include <hal_macros.h>

// Debug traces: when non-zero, the "#if VERBOSE_MAIN" sections of main() and the
// "#if VERBOSE_EXEC" sections of execute() print progress messages.
#define VERBOSE_MAIN               1
#define VERBOSE_EXEC               1

// Upper bounds on the platform size, used to dimension the static arrays
// indexed by cluster, by core, or by thread.
#define X_MAX                      16
#define Y_MAX                      16
#define CORES_MAX                  4
#define CLUSTERS_MAX               (X_MAX * Y_MAX)
#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)

// Input image file, mapped read-only by main()
#define IMAGE_IN_PATH              "misc/philips_1024_2.raw"
#define IMAGE_IN_PIXEL_SIZE        2                               // 2 bytes per pixel

// Output image file, created and mapped by main()
// NOTE(review): the file name contains the letter 'O' ("1O24"), not the digit
// zero - confirm this is intended.
#define IMAGE_OUT_PATH             "misc/philips_after_1O24.raw"
#define IMAGE_OUT_PIXEL_SIZE       1                               // 1 bytes per pixel

// Expected frame buffer type and image geometry: main() exits if the values
// returned by fbf_get_config() differ from FBF_TYPE / NP (width) / NL (height).
#define FBF_TYPE                   420 
#define NL                         1024
#define NP                         1024
#define NB_PIXELS                  (NP * NL)

// Placement mode selection: exactly one of these three flags must be 1
// (the sum is checked at the beginning of main()).
#define NO_PLACEMENT               0
#define EXPLICIT_PLACEMENT         0
#define PARALLEL_PLACEMENT         1

// USE_DQT_BARRIER : when non-zero, main() initialises the barrier with a
//                   pthread_barrierattr_t (x_size, y_size, nthreads) instead of NULL.
// *_DISPLAY_ENABLE: when non-zero, execute() writes the initial / final image
//                   to the frame buffer with fbf_write().
#define USE_DQT_BARRIER            1
#define INITIAL_DISPLAY_ENABLE     1
#define FINAL_DISPLAY_ENABLE       1

// Accessors to one element of the distributed buffers. They expand to accesses
// on arrays named A, B, C, D, Z (arrays of per-cluster buffer pointers), which
// must be in scope at the point of use.
// - c is the cluster index selecting the buffer.
// - TA / TC / TD / TZ are indexed by (line, pixel), with a line stride of NP.
// - TB is indexed by (pixel, line), with a stride of NL (transposed layout).
#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
#define TB(c,p,l)  (B[c][((NL) * (p)) + (l)])
#define TC(c,l,p)  (C[c][((NP) * (l)) + (p)])
#define TD(c,l,p)  (D[c][((NP) * (l)) + (p)])
#define TZ(c,l,p)  (Z[c][((NP) * (l)) + (p)])

// WARNING: each argument appears twice in the expansion, so an argument with
// side effects may be evaluated twice.
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

//////////////////////////////////////////////////////////
//            global variables 
//////////////////////////////////////////////////////////

// global instrumentation counters for the main thread, written by main():
// - SEQUENCIAL_TIME : cycles between the start of main() and the end of the
//                     sequential initialisation.
// - PARALLEL_TIME   : cycles between the end of the sequential initialisation
//                     and the end of the placement-mode section of main().
// Both are truncated to 32 bits.
unsigned int SEQUENCIAL_TIME = 0;
unsigned int PARALLEL_TIME   = 0;

// instrumentation counters written by execute(), indexed by [cid][lid]
// where cid = tid / ncores and lid = tid % ncores (dates truncated to 32 bits):
// - START         : date at the beginning of the buffer allocation phase
// - H_BEG / H_END : dates at the start / end of the horizontal filter
// - V_BEG / V_END : dates at the start / end of the vertical filter
// - D_BEG / D_END : dates at the start / end of the final display
unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int D_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int D_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};

// pointer on buffer containing the input image, mapped by main() to the input file
unsigned char *  image_in;

// pointer on buffer containing the output image, mapped by main() to the output file
unsigned char *  image_out;

// return values at thread exit: these are variables (not macros) because
// execute() passes their address to pthread_exit(), and main() dereferences
// the pointer obtained from pthread_join() to compare it with THREAD_EXIT_SUCCESS.
unsigned int THREAD_EXIT_SUCCESS = 0;
unsigned int THREAD_EXIT_FAILURE = 1;

// synchronization barrier, initialised by main() for nthreads threads
pthread_barrier_t     barrier;

// platform parameters, set by main() with get_config()
unsigned int  x_size;              // number of clusters in a row
unsigned int  y_size;              // number of clusters in a column
unsigned int  ncores;              // number of processors per cluster

// arrays of pointers on distributed buffers in all clusters, indexed by [cid];
// each entry is allocated in execute() by the thread that has lid == 0
unsigned short * GA[CLUSTERS_MAX];
int            * GB[CLUSTERS_MAX];
int            * GC[CLUSTERS_MAX];
int            * GD[CLUSTERS_MAX];
unsigned char  * GZ[CLUSTERS_MAX];

// array of threads kernel identifiers / indexed by [tid] 
pthread_t        exec_trdid[THREADS_MAX];

// array of threads attributes / indexed by [tid]
// (only filled by main() in EXPLICIT_PLACEMENT mode)
pthread_attr_t   exec_attr[THREADS_MAX]; 

// array of execute() function arguments / indexed by [tid]
pthread_parallel_work_args_t exec_args[THREADS_MAX];

// main thread continuous index: assigned by main() in the NO_PLACEMENT and
// EXPLICIT_PLACEMENT modes; main() does not assign it in PARALLEL_PLACEMENT mode.
unsigned int     tid_main;

/////////////////////////////////////////////////////////////////////////////////////
//           functions declaration
/////////////////////////////////////////////////////////////////////////////////////

// Function executed by each working thread: main() either calls it directly,
// or passes it to pthread_create() / pthread_parallel_create().
// The thread continuous index is read from args->tid.
void execute( pthread_parallel_work_args_t * args );

// Called by main() after the parallel section, with the open instrumentation
// stream and the instrumentation file name.
// NOTE(review): the definition is not visible in this part of the file;
// presumably it writes the instrumentation counters to <f> - confirm.
void instrument( FILE * f , char * filename );

/////////////////
// Entry point of the application. It does not return: every path ends with exit( 0 ).
// NOTE(review): error paths also call exit( 0 ), as elsewhere in this file.
//
// Sequential phase:
// 1. check that exactly one placement mode is selected,
// 2. get and check the platform parameters (x_size, y_size, ncores) and the
//    frame buffer configuration (must match NP, NL and FBF_TYPE),
// 3. build the instrumentation file name and open it under /home/,
// 4. initialise the barrier for nthreads threads,
// 5. open and map the input image file and the output image file.
//
// Parallel phase, depending on the selected placement mode:
// - NO_PLACEMENT       : create (nthreads-1) threads without attributes, run
//                        execute() as thread 0, then join the other threads.
// - EXPLICIT_PLACEMENT : create one thread per core [cxy][l] except on the core
//                        running main(), run execute() as thread tid_main,
//                        then join the other threads.
// - PARALLEL_PLACEMENT : delegate creation and execution of the working
//                        threads to pthread_parallel_create().
//
// Final phase: call instrument(), close the three files, and exit.
void main( void )
{
    unsigned long long start_cycle;
    unsigned long long end_sequencial_cycle;
    unsigned long long end_parallel_cycle;

    int          error;

    char         instru_name[32];               // instrumentation file name
    char         instru_path[64];               // instrumentation path name

    /////////////////////////////////////////////////////////////////////////////////
    get_cycle( &start_cycle );
    /////////////////////////////////////////////////////////////////////////////////

    // exactly one placement mode must be selected
    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
    {
        printf("\n[convol error] illegal placement\n");
        exit( 0 );
    }

    // get & check platform parameters
    get_config( &x_size , &y_size , &ncores );

    if((ncores != 1) && (ncores != 2) && (ncores != 4))
    {
        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
        exit( 0 );
    }

    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
        (x_size != 8) && (x_size != 16) )
    {
        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
        (y_size != 8) && (y_size != 16) )
    {
        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    // main thread get identifiers for core executing main
    unsigned int  cxy_main;
    unsigned int  lid_main;
    get_core_id( &cxy_main , &lid_main );

    // compute nthreads and nclusters
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

    // main thread get FBF size and type
    unsigned int   fbf_width;
    unsigned int   fbf_height;
    unsigned int   fbf_type;
    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );

    if( (fbf_width != NP) || (fbf_height != NL) || (fbf_type != FBF_TYPE) )
    {
        printf("\n[convol error] image does not fit FBF size or type\n");
        exit( 0 );
    }

    // each thread must handle at least one line
    if( nthreads > NL )
    {
        printf("\n[convol error] number of threads larger than number of lines\n");
        exit( 0 );
    }

    // define instrumentation file name:
    // conv_<barrier type>_<placement mode>_<nclusters>_<ncores>
    if( NO_PLACEMENT )
    {
        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
        nclusters, ncores, fbf_width, fbf_height, getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( instru_name , sizeof(instru_name) ,
                  "conv_dqt_no_place_%d_%d", x_size * y_size , ncores );
        else
        snprintf( instru_name , sizeof(instru_name) ,
                  "conv_smp_no_place_%d_%d", x_size * y_size , ncores );
    }

    if( EXPLICIT_PLACEMENT )
    {
        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
        nclusters, ncores, fbf_width, fbf_height, getpid() );

        // build instrumentation file name
        // (two conversions for two arguments, as in the NO_PLACEMENT case)
        if( USE_DQT_BARRIER )
        snprintf( instru_name , sizeof(instru_name) ,
                  "conv_dqt_explicit_%d_%d", x_size * y_size , ncores );
        else
        snprintf( instru_name , sizeof(instru_name) ,
                  "conv_smp_explicit_%d_%d", x_size * y_size , ncores );
    }

    if( PARALLEL_PLACEMENT )
    {
        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
        nclusters, ncores, fbf_width, fbf_height, getpid() );

        // build instrumentation file name
        // (two conversions for two arguments, as in the NO_PLACEMENT case)
        if( USE_DQT_BARRIER )
        snprintf( instru_name , sizeof(instru_name) ,
                  "conv_dqt_parallel_%d_%d", x_size * y_size , ncores );
        else
        snprintf( instru_name , sizeof(instru_name) ,
                  "conv_smp_parallel_%d_%d", x_size * y_size , ncores );
    }

    // open instrumentation file
    // NOTE(review): the mode argument is NULL; presumably this is what the
    // target libc expects - confirm before porting to another C library.
    snprintf( instru_path , sizeof(instru_path) , "/home/%s", instru_name );
    FILE * f_instru = fopen( instru_path , NULL );
    if ( f_instru == NULL ) 
    { 
        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
cxy_main, lid_main, instru_path );
#endif

    // main initialise barrier 
    if( USE_DQT_BARRIER )
    {
        pthread_barrierattr_t attr;
        attr.x_size   = x_size;
        attr.y_size   = y_size;
        attr.nthreads = ncores;
        error = pthread_barrier_init( &barrier, &attr , nthreads );
    }
    else
    {
        error = pthread_barrier_init( &barrier, NULL , nthreads );
    }

    if( error )
    {
        printf("\n[convol error] cannot initialize barrier\n");
        exit( 0 );
    }

#if VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] completes barrier init\n", 
cxy_main, lid_main );
#endif

    // main open input file
    int fd_in = open( IMAGE_IN_PATH , O_RDONLY , 0 );

    if ( fd_in < 0 ) 
    { 
        printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH );
        exit( 0 );
    }

#if VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] open file <%s>\n",
cxy_main, lid_main, IMAGE_IN_PATH );
#endif
    
    // main thread map image_in buffer to input file 
    // NOTE(review): failure is detected by comparing the result with NULL;
    // presumably this matches the target mmap() - confirm (POSIX uses MAP_FAILED).
    image_in = (unsigned char *)mmap( NULL,
                                      NB_PIXELS * IMAGE_IN_PIXEL_SIZE,
                                      PROT_READ,
                                      MAP_FILE | MAP_SHARED,
                                      fd_in,
                                      0 );           // offset
    if ( image_in == NULL ) 
    { 
        printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_IN_PATH );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n",
cxy_main, lid_main, IMAGE_IN_PATH );
#endif

    // main thread open output file
    int fd_out = open( IMAGE_OUT_PATH , O_CREAT , 0 ); 

    if ( fd_out < 0 ) 
    { 
        printf("\n[convol error] main cannot open file %s\n", IMAGE_OUT_PATH );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] open file <%s>\n",
cxy_main, lid_main, IMAGE_OUT_PATH );
#endif

    // main thread map image_out buffer to output file
    // the mapping length is (number of pixels) * (bytes per pixel),
    // computed the same way as for the input image
    image_out = (unsigned char *)mmap( NULL,
                                       NB_PIXELS * IMAGE_OUT_PIXEL_SIZE,
                                       PROT_WRITE,
                                       MAP_FILE | MAP_SHARED,
                                       fd_out,
                                       0 );     // offset 
    if ( image_out == NULL ) 
    { 
        printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_OUT_PATH );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n",
cxy_main, lid_main, IMAGE_OUT_PATH );
#endif

    /////////////////////////////////////////////////////////////////////////////////////
    get_cycle( &end_sequencial_cycle );
    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
    /////////////////////////////////////////////////////////////////////////////////////

    //////////////////
    if( NO_PLACEMENT )
    {
        // the tid value for the main thread is always 0
        // main thread creates new threads with tid in [1,nthreads-1]  
        unsigned int tid;
        for ( tid = 0 ; tid < nthreads ; tid++ )
        {
            // register tid value in exec_args[tid] array
            exec_args[tid].tid = tid;
            
            // create other threads
            if( tid > 0 )
            {
                if ( pthread_create( &exec_trdid[tid], 
                                     NULL,                  // no attribute
                                     &execute,
                                     &exec_args[tid] ) ) 
                {
                    printf("\n[convol error] cannot create thread %d\n", tid );
                    exit( 0 );
                }

#if VERBOSE_MAIN
printf("\n[convol] main created thread %d\n", tid );
#endif

            }
            else
            {
                tid_main = 0;
            }
        }  // end for tid

        // main thread calls itself the execute() function
        execute( &exec_args[0] );

        // main thread wait other threads completion
        for ( tid = 1 ; tid < nthreads ; tid++ )
        {
            // receives the address passed to pthread_exit() by thread[tid]
            unsigned int * status;

            // main wait thread[tid] status
            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
            {
                printf("\n[convol error] main cannot join thread %d\n", tid );
                exit( 0 );
            }
       
            // check status
            if( *status != THREAD_EXIT_SUCCESS )
            {
                printf("\n[convol error] thread %x returned failure\n", tid );
                exit( 0 );
            }

#if VERBOSE_MAIN 
printf("\n[convol] main successfully joined thread %x\n", tid );
#endif
        
        }  // end for tid

    }  // end if no_placement

    ////////////////////////
    if( EXPLICIT_PLACEMENT )
    {
        // main thread places each other threads on a specific core[cxy][lid]
        // but the actual thread creation is sequential
        unsigned int x;
        unsigned int y;
        unsigned int l;
        unsigned int cxy;                   // cluster identifier
        unsigned int tid;                   // thread continuous index

        for( x = 0 ; x < x_size ; x++ )
        {
            for( y = 0 ; y < y_size ; y++ )
            {
                cxy = HAL_CXY_FROM_XY( x , y );
                for( l = 0 ; l < ncores ; l++ )
                {
                    // compute thread continuous index
                    tid = (((x  * y_size) + y) * ncores) + l;

                    // register tid value in exec_args[tid] array
                    exec_args[tid].tid = tid;

                    // no thread created on the core running the main
                    if( (cxy != cxy_main) || (l != lid_main) )
                    {
                        // define thread attributes
                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
                                                    PT_ATTR_CORE_DEFINED;
                        exec_attr[tid].cxy        = cxy;
                        exec_attr[tid].lid        = l;
  
                        // create thread[tid] on core[cxy][l]
                        if ( pthread_create( &exec_trdid[tid],    
                                             &exec_attr[tid],    
                                             &execute,
                                             &exec_args[tid] ) )       
                        {
                            printf("\n[convol error] cannot create thread %d\n", tid );
                            exit( 0 );
                        }
#if VERBOSE_MAIN 
printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
#endif
                    }
                    else
                    {
                        tid_main = tid;
                    }
                }
            }
        }

        // main thread calls itself the execute() function
        execute( &exec_args[tid_main] );

        // main thread wait other threads completion
        for( tid = 0 ; tid < nthreads ; tid++ )
        {
            // no other thread on the core running the main
            if( tid != tid_main )
            {
                // receives the address passed to pthread_exit() by thread[tid]
                unsigned int * status;

                // wait thread[tid]
                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
                {
                    printf("\n[convol error] main cannot join thread %d\n", tid );
                    exit( 0 );
                }
       
                // check status
                if( *status != THREAD_EXIT_SUCCESS )
                {
                    printf("\n[convol error] thread %d returned failure\n", tid );
                    exit( 0 );
                }
#if VERBOSE_MAIN 
// report the core recorded for thread[tid] at creation time: the cxy and l
// loop variables only hold the values left by the end of the creation loops
printf("\n[convol] main joined thread %d on core[%x,%d]\n",
tid , exec_attr[tid].cxy , exec_attr[tid].lid );
#endif
            }
        }
    }  // end if explicit_placement

    ////////////////////////
    if( PARALLEL_PLACEMENT )
    {
        // compute covering DQT size and level:
        // root_level is log2 of the largest mesh dimension (1/2/4/8/16 -> 0/1/2/3/4)
        unsigned int z          = (x_size > y_size) ? x_size : y_size;
        unsigned int root_level = ((z == 1) ? 0 : 
                                  ((z == 2) ? 1 : 
                                  ((z == 4) ? 2 : 
                                  ((z == 8) ? 3 : 4))));

        // create & execute the working threads
        if( pthread_parallel_create( root_level , &execute ) )
        {
            printf("\n[convol error] in %s\n", __FUNCTION__ );
            exit( 0 );
        }
    }  // end if parallel_placement

    /////////////////////////////////////////////////////////////////////////////
    get_cycle( &end_parallel_cycle );
    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
    /////////////////////////////////////////////////////////////////////////////

    // main thread register instrumentation results
    instrument( f_instru , instru_name );

    // main thread close input file
    close( fd_in );

    // main thread close output file
    close( fd_out );

    // main thread close instrumentation file
    fclose( f_instru );

    // main thread suicide 
    exit( 0 );
    
} // end main() 






///////////////////////////////////////////////////
void execute( pthread_parallel_work_args_t * args )
{
    unsigned long long date;

    // Each thread initialises the convolution kernel parameters in local stack.
    // The values defined in the next 12 lines are Philips proprietary information.

    int   vnorm  = 115;
    int   vf[35] = { 1, 1, 2, 2, 2,
                     2, 3, 3, 3, 4,
                     4, 4, 4, 5, 5,
                     5, 5, 5, 5, 5,
                     5, 5, 4, 4, 4,
                     4, 3, 3, 3, 2,
                     2, 2, 2, 1, 1 };

    unsigned int hrange = 100;
    unsigned int hnorm  = 201;

    // WARNING 
    //A thread is identified by the tid index, defined in the "args" structure.
    // This index being in range [0,nclusters*ncores-1] we can always write
    //       tid == cid * ncores + lid 
    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
    // if NO_PLACEMENT, there is no relation between these
    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]

    // get thread abstract identifiers 
    unsigned int tid = args->tid;
    unsigned int cid = tid / ncores;   
    unsigned int lid = tid % ncores;

#if VERBOSE_EXEC
unsigned int cxy;              // core cluster identifier
unsigned int lpid;             // core local identifier
get_core_id( &cxy , &lpid );
printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec\n",
tid , cxy , lpid );
#endif

    // build total number of threads and clusters from global variables
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

    // indexes for loops
    unsigned int c;                 // cluster index 
    unsigned int l;                 // line index 
    unsigned int p;                 // pixel index 
    unsigned int z;                 // vertical filter index 

    unsigned int lines_per_thread   = NL / nthreads;
    unsigned int lines_per_cluster  = NL / nclusters;
    unsigned int pixels_per_thread  = NP / nthreads;
    unsigned int pixels_per_cluster = NP / nclusters;

    // compute number of pixels stored in one abstract cluster cid
    unsigned int local_pixels = NL * NP / nclusters;        

    unsigned int first, last;

    get_cycle( &date );
    START[cid][lid] = (unsigned int)date;

    // Each thread[cid][0] allocates 5 local buffers, 
    // shared by all threads that have the same cid 
    if ( lid == 0 )
    {
        GA[cid] = malloc( local_pixels * sizeof( unsigned short ) );
        GB[cid] = malloc( local_pixels * sizeof( int ) );
        GC[cid] = malloc( local_pixels * sizeof( int ) );
        GD[cid] = malloc( local_pixels * sizeof( int ) );
        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );

        if( (GA[cid] == NULL) || (GB[cid] == NULL) || (GC[cid] == NULL) || 
            (GD[cid] == NULL) || (GZ[cid] == NULL) )
        {
            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }

#if VERBOSE_EXEC
printf( "\n[convol] exec[%d] on core[%x,%d] allocated shared buffers\n"
"### GA = %x\n"
"### GB = %x\n"               
"### GC = %x\n"               
"### GD = %x\n"               
"### GZ = %x\n",
tid, cxy , lpid, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
#endif
    
    }

    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    // Each thread[cid,lid] allocate and initialise in its private stack 
    // a copy of the arrays of pointers on the distributed buffers.
    unsigned short * A[CLUSTERS_MAX];
    int            * B[CLUSTERS_MAX];
    int            * C[CLUSTERS_MAX];
    int            * D[CLUSTERS_MAX];
    unsigned char  * Z[CLUSTERS_MAX];

    for( c = 0 ; c < nclusters ; c++ )
    {
        A[c] = GA[c];
        B[c] = GB[c];
        C[c] = GC[c];
        D[c] = GD[c];
        Z[c] = GZ[c];
    }

    // Each thread[cid,0] access the file containing the input image, to load
    // the local A[cid] buffer. Other threads are waiting on the barrier.
    if ( lid==0 )
    {
        unsigned int size   = local_pixels * sizeof( unsigned short );
        unsigned int offset = size * cid;

        memcpy( A[cid],
                image_in + offset,
                size );
 
#if VERBOSE_EXEC
get_cycle( &date );
printf( "\n[convol] thread %d on core[%x,%d] load input file in A[%d]\n", 
tid , cxy , lpid , cid );
#endif

    }

    // Optionnal parallel display of the initial image stored in A[c] buffers.
    // Eah thread[cid,lid] displays (NL/nthreads) lines. 

    if ( INITIAL_DISPLAY_ENABLE )
    {
        unsigned int line;
        unsigned int offset = lines_per_thread * lid;

        for ( l = 0 ; l < lines_per_thread ; l++ )
        {
            line = offset + l;

            // copy TA[cid] to TZ[cid]
            for ( p = 0 ; p < NP ; p++ )
            {
                TZ(cid, line, p) = (unsigned char)(TA(cid, line, p) >> 8);
            }

            // display one line to frame buffer
            if (fbf_write( &TZ(cid, line, 0),                     // first pixel in TZ
                           NP,                                    // number of bytes
                           NP*(l + (tid * lines_per_thread))))    // offset in FBF
            {
                printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n",
                __FUNCTION__ , cxy , lid );
                pthread_exit( &THREAD_EXIT_FAILURE );
            }
        }

#if VERBOSE_EXEC 
get_cycle( &date );
printf( "\n[convol] thread[%d] on core[%x,%d] completes initial display\n",
tid , cxy , lpid );
#endif

        ////////////////////////////////
        pthread_barrier_wait( &barrier );
    }

    ////////////////////////////////////////////////////////////
    // parallel horizontal filter : 
    // B <= convol(FH(A))
    // D <= A - FH(A)
    // Each thread computes (NL/nthreads) lines.
    // The image must be extended :
    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
    // if (z>NP-1) TA(cid,l,z) == TA(cid,l,NP-1)
    ////////////////////////////////////////////////////////////

    get_cycle( &date );
    H_BEG[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC 
printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n",
tid , cxy , lpid );
#else
if ( tid == tid_main ) 
printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n",
tid , cxy , lpid );
#endif

    // l = absolute line index / p = absolute pixel index  
    // first & last define which lines are handled by a given thread

    first = tid * lines_per_thread;
    last  = first + lines_per_thread;

    for (l = first; l < last; l++)
    {
        // src_c and src_l are the cluster index and the line index for A & D
        int src_c = l / lines_per_cluster;
        int src_l = l % lines_per_cluster;

        // We use the specific values of the horizontal ep-filter for optimisation:
        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
        // To minimize the number of tests, the loop on pixels is split in three domains 

        int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
        for (z = 1; z < hrange; z++)
        {
            sum_p = sum_p + TA(src_c, src_l, z);
        }

        // first domain : from 0 to hrange
        for (p = 0; p < hrange + 1; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }
        // second domain : from (hrange+1) to (NP-hrange-1)
        for (p = hrange + 1; p < NP - hrange; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) 
                          - (int) TA(src_c, src_l, p - hrange - 1);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }
        // third domain : from (NP-hrange) to (NP-1)
        for (p = NP - hrange; p < NP; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) 
                          - (int) TA(src_c, src_l, p - hrange - 1);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }

#if SUPER_VERBOSE
get_cycle( &date );
printf(" - line %d computed at cycle %d\n", l, (unsigned int)date );
#endif    

    }

    get_cycle( &date );
    H_END[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC 
printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n",
tid , cxy , lpid );
#else
if ( tid == tid_main ) 
printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n",
tid , cxy , lpid );
#endif

    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    ///////////////////////////////////////////////////////////////
    // parallel vertical filter : 
    // C <= transpose(FV(B))
    // Each thread computes (NP/nthreads) columns
    // The image must be extended :
    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
    // if (l>NL-1)   TB(cid,p,l) == TB(cid,p,NL-1)
    ///////////////////////////////////////////////////////////////

    get_cycle( &date );
    V_BEG[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC 
printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n",
tid , cxy , lpid );
#else
if ( tid == tid_main ) 
printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n",
tid , cxy , lpid );
#endif

    // l = absolute line index / p = absolute pixel index
    // first & last define which pixels are handled by a given thread

    first = tid * pixels_per_thread;
    last  = first + pixels_per_thread;

    for (p = first; p < last; p++)
    {
        // src_c and src_p are the cluster index and the pixel index for B
        int src_c = p / pixels_per_cluster;
        int src_p = p % pixels_per_cluster;

        int sum_l;

        // We use the specific values of the vertical ep-filter
        // To minimize the number of tests, the NL lines are split in three domains 

        // first domain : explicit computation for the first 18 values
        for (l = 0; l < 18; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            for (z = 0, sum_l = 0; z < 35; z++)
            {
                sum_l = sum_l + vf[z] * TB(src_c, src_p, max(l - 17 + z,0) );
            }
            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }
        // second domain
        for (l = 18; l < NL - 17; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, l + 4)
                  + TB(src_c, src_p, l + 8)
                  + TB(src_c, src_p, l + 11)
                  + TB(src_c, src_p, l + 15)
                  + TB(src_c, src_p, l + 17)
                  - TB(src_c, src_p, l - 5)
                  - TB(src_c, src_p, l - 9)
                  - TB(src_c, src_p, l - 12)
                  - TB(src_c, src_p, l - 16)
                  - TB(src_c, src_p, l - 18);

            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }
        // third domain
        for (l = NL - 17; l < NL; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1))
                  + TB(src_c, src_p, min(l + 8, NL - 1))
                  + TB(src_c, src_p, min(l + 11, NL - 1))
                  + TB(src_c, src_p, min(l + 15, NL - 1))
                  + TB(src_c, src_p, min(l + 17, NL - 1))
                  - TB(src_c, src_p, l - 5)
                  - TB(src_c, src_p, l - 9)
                  - TB(src_c, src_p, l - 12)
                  - TB(src_c, src_p, l - 16)
                  - TB(src_c, src_p, l - 18);

            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }

#if SUPER_VERBOSE
get_cycle( &date );
printf(" - column %d computed at cycle %d\n", p, (unsigned int)date );
#endif 

    }

    get_cycle( &date );
    V_END[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC 
printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n",
tid , cxy , lid );
#else
if ( tid == tid_main ) 
printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n",
tid , cxy , lid );
#endif

    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    // Optional parallel display of the final image Z <= D + C
    // Each thread[x,y,p] displays (NL/nthreads) lines.

    if ( FINAL_DISPLAY_ENABLE )
    {
        get_cycle( &date );
        D_BEG[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC
printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n",
tid , cxy , lid );
#else
if ( tid == tid_main ) 
printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n",
tid , cxy , lid );
#endif

        unsigned int line;
        unsigned int offset = lines_per_thread * lid;

        for ( l = 0 ; l < lines_per_thread ; l++ )
        {
            line = offset + l;

            for ( p = 0 ; p < NP ; p++ )
            {
                TZ(cid, line, p) = 
                   (unsigned char)( (TD(cid, line, p) + 
                                     TC(cid, line, p) ) >> 8 );
            }

            if (fbf_write( &TZ(cid, line, 0),                   // first pixel in TZ
                           NP,                                  // number of bytes
                           NP*(l + (tid * lines_per_thread))))  // offset in FBF
            {
                printf("\n[convol error] thread[%d] cannot access FBF\n", tid );
                pthread_exit( &THREAD_EXIT_FAILURE );
            }
        }

        get_cycle( &date );
        D_END[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC
printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n",
tid , cxy , lid );
#else
if ( tid == tid_main ) 
printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n",
tid , cxy , lid );
#endif

    }

    // all threads (but the one executing main) exit
    if ( tid != tid_main )
    {
        pthread_exit( &THREAD_EXIT_SUCCESS );
    }

} // end execute()



///////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument(): widens the [*lo , *hi] interval so that it contains
// <value>.
///////////////////////////////////////////////////////////////////////////////////////
static void instrument_min_max( unsigned int   value,
                                unsigned int * lo,
                                unsigned int * hi )
{
    if ( value < *lo )
    {
        *lo = value;
    }
    if ( value > *hi )
    {
        *hi = value;
    }
}

///////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument(): returns the midpoint of [lo , hi].
// Written as lo + (hi - lo)/2 because (lo + hi) can wrap around on 32 bits.
///////////////////////////////////////////////////////////////////////////////////////
static unsigned int instrument_med( unsigned int lo,
                                    unsigned int hi )
{
    return lo + (hi - lo) / 2;
}

///////////////////////////////////////////////////////////////////////////////////////
// This function scans the START, H_BEG, H_END, V_BEG, V_END, D_BEG and D_END
// instrumentation arrays for all (x_size * y_size) clusters and all <ncores> cores
// per cluster, computes the min / max values of each array, and reports
// - the min / max / med / delta values for each array,
// - the duration of each step of the scenario, in Kcycles,
// - the SEQUENCIAL_TIME and PARALLEL_TIME values.
// The same report is displayed on the terminal and written to the <f> stream.
//
// @ f        : stream receiving a copy of the report (must be open for writing).
// @ filename : string displayed in the header line of the report.
//
// All values are unsigned and are printed with the %u conversion, so that
// values above 0x7FFFFFFF are not displayed as negative numbers.
//
// NOTE(review): D_BEG / D_END are only written by execute() when
// FINAL_DISPLAY_ENABLE is set; when the display is disabled the D_BEG, D_END,
// BARRIER VERT/DISP and DISPLAY figures are presumably not meaningful.
///////////////////////////////////////////////////////////////////////////////////////
void instrument( FILE * f,
                 char * filename )
{
    unsigned int nclusters = x_size * y_size;

    unsigned int cc, pp;

    // each [min,max] pair is initialised as an empty interval
    unsigned int min_start = 0xFFFFFFFF;
    unsigned int max_start = 0;

    unsigned int min_h_beg = 0xFFFFFFFF;
    unsigned int max_h_beg = 0;

    unsigned int min_h_end = 0xFFFFFFFF;
    unsigned int max_h_end = 0;

    unsigned int min_v_beg = 0xFFFFFFFF;
    unsigned int max_v_beg = 0;

    unsigned int min_v_end = 0xFFFFFFFF;
    unsigned int max_v_end = 0;

    unsigned int min_d_beg = 0xFFFFFFFF;
    unsigned int max_d_beg = 0;

    unsigned int min_d_end = 0xFFFFFFFF;
    unsigned int max_d_end = 0;

    // scan the instrumentation arrays for all cores in all clusters
    for (cc = 0; cc < nclusters; cc++)
    {
        for (pp = 0; pp < ncores; pp++ )
        {
            instrument_min_max( START[cc][pp], &min_start, &max_start );
            instrument_min_max( H_BEG[cc][pp], &min_h_beg, &max_h_beg );
            instrument_min_max( H_END[cc][pp], &min_h_end, &max_h_end );
            instrument_min_max( V_BEG[cc][pp], &min_v_beg, &max_v_beg );
            instrument_min_max( V_END[cc][pp], &min_v_end, &max_v_end );
            instrument_min_max( D_BEG[cc][pp], &min_d_beg, &max_d_beg );
            instrument_min_max( D_END[cc][pp], &min_d_end, &max_d_end );
        }
    }

    // midpoint of each interval
    unsigned int med_start = instrument_med( min_start, max_start );
    unsigned int med_h_beg = instrument_med( min_h_beg, max_h_beg );
    unsigned int med_h_end = instrument_med( min_h_end, max_h_end );
    unsigned int med_v_beg = instrument_med( min_v_beg, max_v_beg );
    unsigned int med_v_end = instrument_med( min_v_end, max_v_end );
    unsigned int med_d_beg = instrument_med( min_d_beg, max_d_beg );
    unsigned int med_d_end = instrument_med( min_d_end, max_d_end );

    // duration of each step, in Kcycles : each step is measured from the first
    // thread entering it to the last thread leaving it
    unsigned int kc_load    = (min_h_beg - min_start) / 1000;
    unsigned int kc_h_filt  = (max_h_end - min_h_beg) / 1000;
    unsigned int kc_barr_hv = (min_v_beg - max_h_end) / 1000;
    unsigned int kc_v_filt  = (max_v_end - min_v_beg) / 1000;
    unsigned int kc_barr_vd = (min_d_beg - max_v_end) / 1000;
    unsigned int kc_display = (max_d_end - min_d_beg) / 1000;

    // display on terminal
    printf( "\n ------ %s ------\n" , filename );

    printf(" - START : min = %u / max = %u / med = %u / delta = %u\n",
           min_start, max_start, med_start, max_start - min_start);

    printf(" - H_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_beg, max_h_beg, med_h_beg, max_h_beg - min_h_beg);

    printf(" - H_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_end, max_h_end, med_h_end, max_h_end - min_h_end);

    printf(" - V_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_beg, max_v_beg, med_v_beg, max_v_beg - min_v_beg);

    printf(" - V_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_end, max_v_end, med_v_end, max_v_end - min_v_end);

    printf(" - D_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_d_beg, max_d_beg, med_d_beg, max_d_beg - min_d_beg);

    printf(" - D_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_d_end, max_d_end, med_d_end, max_d_end - min_d_end);

    printf( "\n General Scenario (Kcycles for each step)\n" );
    printf( " - LOAD IMAGE        = %u\n", kc_load );
    printf( " - H_FILTER          = %u\n", kc_h_filt );
    printf( " - BARRIER HORI/VERT = %u\n", kc_barr_hv );
    printf( " - V_FILTER          = %u\n", kc_v_filt );
    printf( " - BARRIER VERT/DISP = %u\n", kc_barr_vd );
    printf( " - DISPLAY           = %u\n", kc_display );
    // NOTE(review): SEQUENCIAL_TIME / PARALLEL_TIME are declared outside this
    // function; the conversion is left unchanged - confirm it matches their type.
    printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );

    // same report written to the <f> stream
    fprintf( f ,  "\n ------ %s ------\n" , filename );

    fprintf( f , " - START : min = %u / max = %u / med = %u / delta = %u\n",
           min_start, max_start, med_start, max_start - min_start);

    fprintf( f , " - H_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_beg, max_h_beg, med_h_beg, max_h_beg - min_h_beg);

    fprintf( f , " - H_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_end, max_h_end, med_h_end, max_h_end - min_h_end);

    fprintf( f , " - V_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_beg, max_v_beg, med_v_beg, max_v_beg - min_v_beg);

    fprintf( f , " - V_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_end, max_v_end, med_v_end, max_v_end - min_v_end);

    fprintf( f , " - D_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_d_beg, max_d_beg, med_d_beg, max_d_beg - min_d_beg);

    fprintf( f , " - D_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_d_end, max_d_end, med_d_end, max_d_end - min_d_end);

    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
    fprintf( f ,  " - LOAD IMAGE        = %u\n", kc_load );
    fprintf( f ,  " - H_FILTER          = %u\n", kc_h_filt );
    fprintf( f ,  " - BARRIER HORI/VERT = %u\n", kc_barr_hv );
    fprintf( f ,  " - V_FILTER          = %u\n", kc_v_filt );
    fprintf( f ,  " - BARRIER VERT/DISP = %u\n", kc_barr_vd );
    fprintf( f ,  " - DISPLAY           = %u\n", kc_display );
    fprintf( f ,  " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );

} // end instrument()





// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3


