///////////////////////////////////////////////////////////////////////////////////////
// File   : convol.c  
// Date   : june 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a 2D convolution product.  
// It can run on a multi-processors, multi-clusters architecture, with one thread
// per processor, and uses the POSIX threads API.
// 
// The main() function can be launched on any processor P[x,y,l].
// It performs the initialisations, launches (N-1) threads running the execute()
// function on the (N-1) processors other than P[x,y,l], calls the execute()
// function itself, and finally calls the instrument() function to display the
// instrumentation results when the parallel execution is completed.
//
// The convolution kernel is [201]*[35] pixels, but it can be factored in two
// independent line and column convolution products.
// The five buffers containing the image are distributed in clusters.
// 
// The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
//
// - the number of clusters in each dimension (x_size, y_size) must be a power of 2
//   no larger than 16, i.e. at most 256 clusters containing processors.
// - the number of processors per cluster must be 1, 2 or 4.
///////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>
#include <almosmkh.h>
#include <hal_macros.h>

// pathname of the input image file (raw format, PIXEL_SIZE bytes per pixel)
#define IMAGE_IN_PATH              "misc/philips_1024.raw"

#define USE_SQT_BARRIER            1       // not referenced by the visible code
#define VERBOSE                    1       // each thread traces each step when non zero
#define SUPER_VERBOSE              0       // one trace per line / column when non zero

// when non zero, the barrier is initialised with explicit
// (x_size, y_size, nthreads) attributes
#define USE_DQT_BARRIER            1

// maximal hardware configuration supported by the statically allocated arrays
#define X_MAX                      16
#define Y_MAX                      16
#define PROCS_MAX                  4
#define CLUSTERS_MAX               (X_MAX * Y_MAX)
#define THREADS_MAX                (X_MAX * Y_MAX * PROCS_MAX)

#define INITIAL_DISPLAY_ENABLE     1
#define FINAL_DISPLAY_ENABLE       1

#define PIXEL_SIZE                 2       // input image has 2 bytes per pixel
#define FBF_TYPE                   420     // output image has 1 byte per pixel

#define NL                         1024    // number of lines in image
#define NP                         1024    // number of pixels per line
#define NB_PIXELS                  (NP * NL)
#define FRAME_SIZE                 (NB_PIXELS * PIXEL_SIZE)

// Accessors to the distributed buffers. They expect arrays of pointers named
// A, B, C, D, Z to be visible in the calling scope :
// - c is the index in the array of pointers (one buffer per cluster),
// - l is a line index and p is a pixel index, both local to buffer [c].
// A, C, D, Z are stored line by line (NP pixels per line),
// B is stored column by column (NL values per column), i.e. transposed.
#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
#define TB(c,p,l)  (B[c][((NL) * (p)) + (l)])
#define TC(c,l,p)  (C[c][((NP) * (l)) + (p)])
#define TD(c,l,p)  (D[c][((NP) * (l)) + (p)])
#define TZ(c,l,p)  (Z[c][((NP) * (l)) + (p)])

// WARNING : these macros evaluate their arguments twice, and the comparison
// follows the usual arithmetic conversions : when one argument is unsigned,
// a "negative" value is seen as a very large one (max(u - 1, 0) never clamps).
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

//////////////////////////////////////////////////////////
//   global variables stored in seg_data in cluster[0,0]
//////////////////////////////////////////////////////////

// Instrumentation counters, indexed by [cluster_id][lid], where cluster_id is the
// continuous cluster index (x * y_size + y) computed in execute().
// Each thread running execute() registers the date (in cycles, truncated to 32 bits)
// at which it reaches a given step. The instrument() function scans these arrays.
unsigned int START[CLUSTERS_MAX][PROCS_MAX];    // thread enters execute()
unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX];    // horizontal filter begins
unsigned int H_END[CLUSTERS_MAX][PROCS_MAX];    // horizontal filter completes
unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX];    // vertical filter begins
unsigned int V_END[CLUSTERS_MAX][PROCS_MAX];    // vertical filter completes
unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX];    // final display begins
unsigned int D_END[CLUSTERS_MAX][PROCS_MAX];    // final display completes

// file pointers on the input image, and on the instrumentation file,
// both opened by the main() function
FILE * f_image_in;
FILE * f_instrum;

// return values at thread exit : the address of one of these two variables
// is the argument of the pthread_exit() calls done in execute()
unsigned int THREAD_EXIT_SUCCESS = 0;
unsigned int THREAD_EXIT_FAILURE = 1;

// synchronization barrier, initialised by main(), shared by all threads
pthread_barrier_t     barrier;

// coordinates of core executing the main thread (set by main() with get_core_id())
unsigned int cxy_main;
unsigned int lid_main;

// arrays of pointers on distributed buffers in all clusters,
// indexed by the continuous cluster index, and set by execute() :
// GA : input image / GB, GC, GD : intermediate results / GZ : displayed image
unsigned short * GA[CLUSTERS_MAX];
int *            GB[CLUSTERS_MAX];
int *            GC[CLUSTERS_MAX];
int *            GD[CLUSTERS_MAX];
unsigned char *  GZ[CLUSTERS_MAX];

// trdid[] array for execution threads
// 1D array if no explicit threads placement / 2D array if explicit placement
// The 2D array is indexed by [cxy][lid], where cxy is the cluster identifier
// returned by HAL_CXY_FROM_XY() (not the continuous cluster index).
pthread_t        trdid[CLUSTERS_MAX][PROCS_MAX];
//pthread_t        trdid[THREADS_MAX];

// attr[] array for execution threads, indexed by [cxy][lid] as the trdid[] array
// unused if no explicit threads placement
pthread_attr_t   attr[CLUSTERS_MAX][PROCS_MAX]; 

/////////////////////////////////////////////////////////////////////////////////////
//           functions declaration
/////////////////////////////////////////////////////////////////////////////////////

// Function executed by all threads : it is the entry function of the threads
// created by main(), and it is also directly called by the main thread.
void execute( void );

// Function called by the main thread when execute() returns, to display
// the min / max values of the instrumentation counters.
// - nclusters : number of clusters to scan in the counters arrays
// - ncores    : number of cores per cluster to scan in the counters arrays
void instrument( unsigned int nclusters,
                 unsigned int ncores );

/////////////////
/////////////////////////////////////////////////////////////////////////////////////
// This function is executed by the main thread only. It
// - gets and checks the hardware configuration (x_size, y_size, ncores),
// - opens the instrumentation file and the input image file,
// - checks the frame buffer geometry, and initialises the barrier,
// - creates one thread running execute() on each core other than its own core,
// - runs execute() itself, and joins all the other threads,
// - calls instrument() to display the instrumentation results.
// Any detected error is reported on the terminal, and the process exits.
/////////////////////////////////////////////////////////////////////////////////////
void main( void )
{
    unsigned int x_size;                 // number of clusters in a row
    unsigned int y_size;                 // number of clusters in a column
    unsigned int ncores;                 // number of processors per cluster

    unsigned long long  date;

    char         name[64];               // instrumentation file name
    char         path[128];              // instrumentation path name

    int          error;

    // get platform parameters
    if ( get_config( &x_size , &y_size , &ncores ) )
    {
        printf("\n[convol error] cannot get hardware configuration\n");
        exit( 0 );
    }

    // get core executing this main thread
    // and register these coordinates in global variables
    get_core_id( &cxy_main , &lid_main );
    
    // check ncores
    if( (ncores != 1) && (ncores != 2) && (ncores != 4) )
    {
        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
        exit( 0 );
    }

    // check x_size
    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && (x_size != 8) && (x_size != 16) )
    {
        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
        exit( 0 );
    }

    // check y_size
    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && (y_size != 8) && (y_size != 16) )
    {
        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
        exit( 0 );
    }

    // compute nthreads and nclusters
    unsigned int nthreads  = x_size * y_size * ncores;
    unsigned int nclusters = x_size * y_size;

    get_cycle( &date );
    printf("\n[convol] starts on core[%x,%d] / %d thread(s) / cycle %d\n",
    cxy_main, lid_main, nthreads, (unsigned int)date );

    // build instrumentation file name
    if( USE_DQT_BARRIER )
    {
        snprintf( name , 64 , "p_convol_dqt_%d_%d", x_size * y_size , ncores );
    }
    else
    {
        snprintf( name , 64 , "p_convol_smp_%d_%d", x_size * y_size , ncores );
    }

    // build pathname
    snprintf( path , 128 , "/home/%s", name );

    // open instrumentation file
    f_instrum = fopen( path , NULL );
    if ( f_instrum == NULL ) 
    { 
        printf("\n[convol error] cannot open instrumentation file <%s>\n", path );
        exit( 0 );
    }

#if DEBUG_MAIN
get_cycle( &date );
printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n",
cxy_main, lid_main, path, (unsigned int)date );
#endif

    // open input file
    f_image_in = fopen( IMAGE_IN_PATH , NULL );
    if ( f_image_in == NULL ) 
    { 
        printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH );
        exit( 0 );
    }

    // this trace reports the input image file, not the instrumentation file
#if DEBUG_MAIN
get_cycle( &date );
printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n",
cxy_main, lid_main, IMAGE_IN_PATH, (unsigned int)date );
#endif
    
    // get FBF config
    unsigned int  fbf_width;
    unsigned int  fbf_height;
    unsigned int  fbf_type;
    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );

    // check FBF size
    if ( (fbf_width != NP) || (fbf_height != NL) )
    {
        printf("\n[convol error] bad FBF size\n");
        exit( 0 );
    }

    // check FBF subsampling
    if ( fbf_type != FBF_TYPE )
    {
        printf("\n[convol error] bad FBF subsampling\n");
        exit( 0 );
    }

    // initialise barrier : the main thread is one of the <nthreads> participants
    if( USE_DQT_BARRIER )
    {
        // this local variable hides the global attr[][] array in this scope only
        pthread_barrierattr_t attr;
        attr.x_size   = x_size;
        attr.y_size   = y_size;
        attr.nthreads = ncores;
        error = pthread_barrier_init( &barrier, &attr , nthreads );
    }
    else
    {
        error = pthread_barrier_init( &barrier, NULL , nthreads );
    }

    if( error )
    {
        printf("\n[convol error] cannot initialize barrier\n");
        exit( 0 );
    }

    get_cycle( &date );
    printf("\n[convol] main on core[%x,%d] completes initialisation at cycle %d\n" 
           "- CLUSTERS     = %d\n"
           "- PROCS        = %d\n" 
           "- THREADS      = %d\n",
           cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads );

    // launch exec threads with explicit placement
    unsigned int x;
    unsigned int y;
    unsigned int l;
    unsigned int cxy;
 
    for( x = 0 ; x < x_size ; x++ )
    {
        for( y = 0 ; y < y_size ; y++ )
        {
            cxy = HAL_CXY_FROM_XY(x,y);
            for( l = 0 ; l < ncores ; l++ )
            {
                // no other thread on the core running the main
                if( (cxy != cxy_main) || (l != lid_main) )
                {
                    // define thread attributes
                    attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
                    attr[cxy][l].cxy        = cxy;
                    attr[cxy][l].lid        = l;
  
                    // create thread on core[x,y,l]
                    if (pthread_create( &trdid[cxy][l],
                                        &attr[cxy][l],    
                                        &execute,
                                        NULL ) )     // execute has no argument
                    {
                        // the thread identifier is not significant when creation fails
                        printf("\n[convol error] cannot create thread on core[%x,%d]\n",
                        cxy , l );
                        exit( 0 );
                    }
                }
            }
        }
    }    

    // the main thread run itself the execute() function
    execute();

    // wait other threads completions (explicit threads placement)
    for( x = 0 ; x < x_size ; x++ )
    {
        for( y = 0 ; y < y_size ; y++ )
        {
            cxy = HAL_CXY_FROM_XY(x,y);
            for( l = 0 ; l < ncores ; l++ )
            {
                // no other thread on the core running the main
                if( (cxy != cxy_main) || (l != lid_main) )
                {
                    // will point on the variable whose address has been
                    // passed to pthread_exit() by the joined thread
                    unsigned int * exit_status = NULL;

                    // wait thread running on core[x,y,l]
                    if (pthread_join( trdid[cxy][l] , (void*)(&exit_status) ) )
                    {
                        printf("\n[convol error] main cannot join thread[%x,%d]\n", cxy, l );
                        exit( 0 );
                    }

                    // check exit_status : a missing status is handled as a failure
                    if( (exit_status == NULL) || (*exit_status != 0) )
                    {
                        printf("\n[convol error] thread[%x,%d]return failure\n", cxy, l );
                        exit( 0 );
                    }
                }
            }
        }
    }

    // call the instrument() function
    instrument( nclusters , ncores );

    exit( 0 );
    
} // end main() 



//////////////
/////////////////////////////////////////////////////////////////////////////////////
// This function is executed in parallel by all threads (one thread per core).
// Each thread identifies itself with get_core_id(), and handles (NL / nthreads)
// lines or (NP / nthreads) columns of the image in each step :
// 1. thread[cxy][0] allocates the five buffers (A,B,C,D,Z) in cluster cxy,
// 2. thread[cxy][0] loads one slice of the input image in buffer A[cluster_id],
// 3. optional display of the initial image,
// 4. horizontal filter : B <= transpose(FH(A)) and D <= A - FH(A),
// 5. vertical filter   : C <= transpose(FV(B)),
// 6. optional display of the final image Z <= D + C.
// The steps are separated by barriers involving all threads.
// All threads but the main thread exit at the end of this function.
// In case of failure a thread exits with the THREAD_EXIT_FAILURE status,
// without reaching the next barrier.
/////////////////////////////////////////////////////////////////////////////////////
void execute( void )
{
    unsigned long long date;

    // Each thread[x,y,p] initialises the convolution kernel parameters in local stack.
    // The values defined in the next 12 lines are Philips proprietary information.

    // vertical filter : 35 coefficients, whose sum is vnorm
    int   vnorm  = 115;
    int   vf[35] = { 1, 1, 2, 2, 2,
                     2, 3, 3, 3, 4,
                     4, 4, 4, 5, 5,
                     5, 5, 5, 5, 5,
                     5, 5, 4, 4, 4,
                     4, 3, 3, 3, 2,
                     2, 2, 2, 1, 1 };

    // horizontal filter : (2 * hrange + 1) coefficients, all equal to 1
    unsigned int hrange = 100;
    unsigned int hnorm  = 201;

    // get platform config
    unsigned int x_size;            // number of clusters in a row
    unsigned int y_size;            // number of clusters in a column
    unsigned int ncores;            // number of processors per cluster
    get_config( &x_size , &y_size , &ncores );

    // get cluster identifier and core local index
    unsigned int cxy; 
    unsigned int lid; 
    get_core_id( &cxy , &lid );
    unsigned int x = HAL_X_FROM_CXY( cxy );
    unsigned int y = HAL_Y_FROM_CXY( cxy );

    // indexes for loops
    unsigned int c;                 // cluster index 
    unsigned int l;                 // line index 
    unsigned int p;                 // pixel index 
    unsigned int z;                 // vertical filter index 

    unsigned int nclusters  = x_size * y_size;              // number of clusters
    unsigned int cluster_id = (x * y_size) + y;             // continuous cluster index
    unsigned int thread_id  = (cluster_id * ncores) + lid;  // continuous thread index
    unsigned int nthreads   = nclusters * ncores;           // number of threads
    unsigned int frame_size = FRAME_SIZE;                   // total size (bytes)
    unsigned int lines_per_thread   = NL / nthreads;        // lines per thread
    unsigned int lines_per_cluster  = NL / nclusters;       // lines per cluster
    unsigned int pixels_per_thread  = NP / nthreads;        // columns per thread
    unsigned int pixels_per_cluster = NP / nclusters;       // columns per cluster

    unsigned int first, last;

    get_cycle( &date );
    START[cluster_id][lid] = (unsigned int)date;

    // Each thread[cxy][0] allocates the global buffers in cluster cxy :
    // A has 2 bytes per pixel, B, C, D have 4 bytes per pixel, Z has 1 byte per pixel.
    if ( lid == 0 )
    {

#if VERBOSE
printf( "\n[convol] thread[%x,%d] enters malloc at cycle %d\n", 
cxy , lid , (unsigned int)date );
#endif

        GA[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)   , cxy );
        GB[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
        GC[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
        GD[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
        GZ[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)/2 , cxy );

        // report an allocation failure, rather than using a NULL buffer later
        // (assumes remote_malloc() returns NULL on failure, as malloc() does)
        if ( (GA[cluster_id] == NULL) || (GB[cluster_id] == NULL) ||
             (GC[cluster_id] == NULL) || (GD[cluster_id] == NULL) ||
             (GZ[cluster_id] == NULL) )
        {
            printf("\n[convol error] in %s : thread[%x,%d] cannot allocate buffers\n",
            __FUNCTION__ , cxy , lid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }
        
#if VERBOSE
printf( "\n[convol]  Shared Buffer Virtual Addresses in cluster %x\n"
        "### GA = %x\n"
        "### GB = %x\n"               
        "### GC = %x\n"               
        "### GD = %x\n"               
        "### GZ = %x\n",
        cxy,
        GA[cluster_id],
        GB[cluster_id],
        GC[cluster_id],
        GD[cluster_id],
        GZ[cluster_id] );
#endif
    
    }

    // all buffers must be allocated before the global arrays of pointers are copied
    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    // Each thread[cxy,p] initialises in its private stack a copy of the
    // arrays of pointers on the shared, distributed buffers.
    // These local arrays are implicitly used by the TA/TB/TC/TD/TZ macros.
    unsigned short * A[CLUSTERS_MAX];
    int            * B[CLUSTERS_MAX];
    int            * C[CLUSTERS_MAX];
    int            * D[CLUSTERS_MAX];
    unsigned char  * Z[CLUSTERS_MAX];

    for( c = 0 ; c < nclusters ; c++ )
    {
        A[c] = GA[c];
        B[c] = GB[c];
        C[c] = GC[c];
        D[c] = GD[c];
        Z[c] = GZ[c];
    }

    // Each thread[x,y,0] accesses the file containing the input image, to load
    // the local A[c] buffer (frame_size / nclusters loaded in each cluster).
    // Other threads are waiting on the barrier.
    // NOTE(review): all loading threads share the f_image_in stream, and the
    // fseek() / fread() sequence below is not protected by a lock. This is only
    // safe if the library serialises these accesses - to be confirmed.
    if ( lid==0 )
    {
        unsigned int offset = (frame_size/nclusters)*cluster_id;
        unsigned int size   = frame_size/nclusters;

        // seek the pointer in file
        if ( fseek( f_image_in,
                    offset,
                    SEEK_SET ) )
        {
            printf("\n[convol error] in %s : thread[%x,%d] cannot seek input file\n",
            __FUNCTION__ , cxy , lid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        } 

        if ( fread( A[cluster_id],
                    1, 
                    size,
                    f_image_in ) != size )
        {
            printf("\n[convol error] in %s : thread[%x,%d] cannot read input file\n",
            __FUNCTION__ , cxy , lid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }
 
#if VERBOSE
get_cycle( &date );
printf( "\n[convol] thread[%x,%d] load input file at cycle %d\n", 
cxy , lid , (unsigned int)date );
#endif

    }

    // The A[cluster_id] buffer is read by all threads of the cluster in the next
    // steps : no thread can go further before the image is completely loaded.
    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    // Optional parallel display of the initial image stored in A[c] buffers.
    // Each thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).

    if ( INITIAL_DISPLAY_ENABLE )
    {
        unsigned int line;
        unsigned int offset = lines_per_thread * lid;   // first line in local buffer

        for ( l = 0 ; l < lines_per_thread ; l++ )
        {
            line = offset + l;

            // keep the most significant byte of each pixel
            for ( p = 0 ; p < NP ; p++ )
            {
                TZ(cluster_id, line, p) = (unsigned char)(TA(cluster_id, line, p) >> 8);
            }

            if (fbf_write( &TZ(cluster_id, line, 0),                  // first pixel in TZ
                           NP,                                        // number of bytes
                           NP*(l + (thread_id * lines_per_thread))))  // offset in FBF
            {
                printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n",
                __FUNCTION__ , cxy , lid );
                pthread_exit( &THREAD_EXIT_FAILURE );
            }
        }

#if VERBOSE 
get_cycle( &date );
printf( "\n[convol] thread[%x,%d] completes initial display at cycle %d\n",
cxy , lid , (unsigned int)date );
#endif

        ////////////////////////////////
        pthread_barrier_wait( &barrier );
    }

    ////////////////////////////////////////////////////////////
    // parallel horizontal filter : 
    // B <= transpose(FH(A))
    // D <= A - FH(A)
    // Each thread computes (NL/nthreads) lines 
    // The image must be extended :
    // if (z<0)    TA(cluster_id,l,z) == TA(cluster_id,l,0)
    // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1)
    ////////////////////////////////////////////////////////////

    get_cycle( &date );
    H_BEG[cluster_id][lid] = (unsigned int)date;

#if VERBOSE 
printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n",
cxy , lid , (unsigned int)date );
#else
if ( (cxy == cxy_main) && (lid == lid_main) ) 
printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n",
cxy , lid , (unsigned int)date );
#endif

    // l = absolute line index / p = absolute pixel index  
    // first & last define which lines are handled by a given thread

    first = thread_id * lines_per_thread;
    last  = first + lines_per_thread;

    for (l = first; l < last; l++)
    {
        // src_c and src_l are the cluster index and the line index for A & D
        int src_c = l / lines_per_cluster;
        int src_l = l % lines_per_cluster;

        // We use the specific values of the horizontal ep-filter for optimisation:
        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
        // To minimize the number of tests, the loop on pixels is split in three domains 

        // initial value is sum(-1), computed with the left extension of the line :
        // (hrange + 2) times pixel 0, plus pixels 1 to (hrange - 1)
        int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
        for (z = 1; z < hrange; z++)
        {
            sum_p = sum_p + TA(src_c, src_l, z);
        }

        // first domain : from 0 to hrange
        for (p = 0; p < hrange + 1; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }
        // second domain : from (hrange+1) to (NP-hrange-1)
        for (p = hrange + 1; p < NP - hrange; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) 
                          - (int) TA(src_c, src_l, p - hrange - 1);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }
        // third domain : from (NP-hrange) to (NP-1)
        for (p = NP - hrange; p < NP; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) 
                          - (int) TA(src_c, src_l, p - hrange - 1);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }

#if SUPER_VERBOSE
get_cycle( &date );
printf(" - line %d computed at cycle %d\n", l, (unsigned int)date );
#endif    

    }

    get_cycle( &date );
    H_END[cluster_id][lid] = (unsigned int)date;

#if VERBOSE 
printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n",
cxy , lid, (unsigned int)date );
#else
if ( (cxy == cxy_main) && (lid == lid_main) ) 
printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n",
cxy , lid, (unsigned int)date );
#endif

    // the vertical filter reads B values written by the other threads
    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    ///////////////////////////////////////////////////////////////
    // parallel vertical filter : 
    // C <= transpose(FV(B))
    // Each thread computes (NP/nthreads) columns
    // The image must be extended :
    // if (l<0)    TB(cluster_id,p,l) == TB(cluster_id,p,0)
    // if (l>NL-1)   TB(cluster_id,p,l) == TB(cluster_id,p,NL-1)
    ///////////////////////////////////////////////////////////////

    get_cycle( &date );
    V_BEG[cluster_id][lid] = (unsigned int)date;

#if VERBOSE 
printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n",
cxy , lid , (unsigned int)date );
#else
if ( (cxy == cxy_main) && (lid == lid_main) ) 
printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n",
cxy , lid, (unsigned int)date );
#endif

    // l = absolute line index / p = absolute pixel index
    // first & last define which pixels are handled by a given thread

    first = thread_id * pixels_per_thread;
    last  = first + pixels_per_thread;

    for (p = first; p < last; p++)
    {
        // src_c and src_p are the cluster index and the pixel index for B
        int src_c = p / pixels_per_cluster;
        int src_p = p % pixels_per_cluster;

        int sum_l;

        // We use the specific values of the vertical ep-filter
        // To minimize the number of tests, the NL lines are split in three domains 

        // first domain : explicit computation for the first 18 values
        for (l = 0; l < 18; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            for (z = 0, sum_l = 0; z < 35; z++)
            {
                // The source line index must be computed with signed values :
                // with unsigned values, (l - 17 + z) wraps to a huge value when it
                // should be negative, and it is never clamped to the first line.
                int src_l = (int)l - 17 + (int)z;
                if ( src_l < 0 ) src_l = 0;

                sum_l = sum_l + vf[z] * TB(src_c, src_p, src_l);
            }
            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }
        // second domain : sum(l) is incrementally computed from sum(l-1),
        // using the lines where two consecutive filter coefficients differ
        for (l = 18; l < NL - 17; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, l + 4)
                  + TB(src_c, src_p, l + 8)
                  + TB(src_c, src_p, l + 11)
                  + TB(src_c, src_p, l + 15)
                  + TB(src_c, src_p, l + 17)
                  - TB(src_c, src_p, l - 5)
                  - TB(src_c, src_p, l - 9)
                  - TB(src_c, src_p, l - 12)
                  - TB(src_c, src_p, l - 16)
                  - TB(src_c, src_p, l - 18);

            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }
        // third domain : same incremental computation, but the lines
        // beyond the last one are replaced by line (NL - 1)
        for (l = NL - 17; l < NL; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1))
                  + TB(src_c, src_p, min(l + 8, NL - 1))
                  + TB(src_c, src_p, min(l + 11, NL - 1))
                  + TB(src_c, src_p, min(l + 15, NL - 1))
                  + TB(src_c, src_p, min(l + 17, NL - 1))
                  - TB(src_c, src_p, l - 5)
                  - TB(src_c, src_p, l - 9)
                  - TB(src_c, src_p, l - 12)
                  - TB(src_c, src_p, l - 16)
                  - TB(src_c, src_p, l - 18);

            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }

#if SUPER_VERBOSE
get_cycle( &date );
printf(" - column %d computed at cycle %d\n", p, (unsigned int)date );
#endif 

    }

    get_cycle( &date );
    V_END[cluster_id][lid] = (unsigned int)date;

#if VERBOSE 
printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n",
cxy , lid , (unsigned int)date );
#else
if ( (cxy == cxy_main) && (lid == lid_main) ) 
printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n",
cxy , lid, (unsigned int)date );
#endif

    // the final display reads C values written by the other threads
    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    // Optional parallel display of the final image Z <= D + C
    // Each thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).

    if ( FINAL_DISPLAY_ENABLE )
    {
        get_cycle( &date );
        D_BEG[cluster_id][lid] = (unsigned int)date;

#if VERBOSE
printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n",
cxy , lid , (unsigned int)date );
#else
if ( (cxy == cxy_main) && (lid == lid_main) ) 
printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n",
cxy , lid, (unsigned int)date );
#endif

        unsigned int line;
        unsigned int offset = lines_per_thread * lid;   // first line in local buffer

        for ( l = 0 ; l < lines_per_thread ; l++ )
        {
            line = offset + l;

            for ( p = 0 ; p < NP ; p++ )
            {
                TZ(cluster_id, line, p) = 
                   (unsigned char)( (TD(cluster_id, line, p) + 
                                     TC(cluster_id, line, p) ) >> 8 );
            }

            if (fbf_write( &TZ(cluster_id, line, 0),                  // first pixel in TZ
                           NP,                                        // number of bytes
                           NP*(l + (thread_id * lines_per_thread))))  // offset in FBF
            {
                printf("\n[convol error] in %s : thread[%d,%d,%d] cannot access FBF\n",
                __FUNCTION__ , x , y , lid );
                pthread_exit( &THREAD_EXIT_FAILURE );
            }
        }

        get_cycle( &date );
        D_END[cluster_id][lid] = (unsigned int)date;

#if VERBOSE
printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n",
cxy , lid , (unsigned int)date );
#else
if ( (cxy == cxy_main) && (lid == lid_main) ) 
printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n",
cxy , lid , (unsigned int)date );
#endif
     
        ////////////////////////////////
        pthread_barrier_wait( &barrier );
    }

    // all threads (but the one executing main) exit
    if ( (cxy != cxy_main) || (lid != lid_main) )
    {
        pthread_exit( &THREAD_EXIT_SUCCESS );
    }

} // end execute()



/////////////////////////////////////////
void instrument( unsigned int nclusters,
                 unsigned int ncores )
{
        unsigned int cc, pp;

        unsigned int min_start = 0xFFFFFFFF;
        unsigned int max_start = 0;

        unsigned int min_h_beg = 0xFFFFFFFF;
        unsigned int max_h_beg = 0;

        unsigned int min_h_end = 0xFFFFFFFF;
        unsigned int max_h_end = 0;

        unsigned int min_v_beg = 0xFFFFFFFF;
        unsigned int max_v_beg = 0;

        unsigned int min_v_end = 0xFFFFFFFF;
        unsigned int max_v_end = 0;

        unsigned int min_d_beg = 0xFFFFFFFF;
        unsigned int max_d_beg = 0;

        unsigned int min_d_end = 0xFFFFFFFF;
        unsigned int max_d_end = 0;

        for (cc = 0; cc < nclusters; cc++)
        {
            for (pp = 0; pp < ncores; pp++ )
            {
                if (START[cc][pp] < min_start) min_start = START[cc][pp];
                if (START[cc][pp] > max_start) max_start = START[cc][pp];

                if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
                if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];

                if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
                if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];

                if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
                if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];

                if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
                if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];

                if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp];
                if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp];

                if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp];
                if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp];
            }
        }

        printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
               min_start, max_start, (min_start+max_start)/2, max_start-min_start);

        printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
               min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);

        printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
               min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);

        printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
               min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);

        printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
               min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);

        printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
               min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);

        printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
               min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);

        printf( "\n General Scenario (Kcycles for each step)\n" );
        printf( " - BOOT OS           = %d\n", (min_start            )/1000 );
        printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
        printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
        printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
        printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
        printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
        printf( " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );

        // TODO save these results on f_instrum

} // end instrument()





// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3


