//////////////////////////////////////////////////////////////////////////////////////////
// File   : transpose.c   
// Date   : september 2019
// author : Alain Greiner
//////////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application reads a raw image (one byte per pixel)
// stored on disk, transposes it, displays the result on the frame buffer,
// and stores the transposed image on disk.
// It can run on a multi-core, multi-cluster architecture, with one thread
// per core, and uses the POSIX threads API.
// It uses the mmap() syscall to directly access the input and output files
// and the fbf_write() syscall to display the images.
//
// The main() function can be launched on any core[cxy,l].
// It performs the initialisations, launches (N-1) threads to run the execute() function
// on the (N-1) other cores, calls the execute() function itself, and finally calls
// the instrument() function to display instrumentation results when the parallel
// execution is completed. The placement of threads on the cores can be done
// automatically by the operating system, or can be done explicitly by the main thread
// (when the EXPLICIT_PLACEMENT global parameter is set).
//
// The buf_in[x,y] and buf_out[x,y] buffers containing the direct and transposed images
// are distributed in clusters: in each cluster[cxy], the thread running on core[cxy,0]
// maps the buf_in[cxy] and buf_out[cxy] buffers containing a subset of lines.
// Then, all threads in cluster[cxy] read pixels from the local buf_in[cxy] buffer, and
// write the pixels to all remote buf_out[cxy] buffers. Finally, each thread displays
// a part of the transposed image to the frame buffer.
//
// - The image must fit the frame buffer size, which must be a power of 2.
// - The number of clusters must be a power of 2 no larger than 256.
// - The number of cores per cluster must be a power of 2 no larger than 4.
// - The number of clusters cannot be larger than (IMAGE_SIZE * IMAGE_SIZE) / 4096,
//   because the size of buf_in[x,y] and buf_out[x,y] must be a multiple of 4096.
//
//////////////////////////////////////////////////////////////////////////////////////////

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <almosmkh.h>
#include <fcntl.h>
#include <hal_macros.h>

// Compile-time limits, used to size the per-cluster / per-core global arrays below.
#define X_MAX                 16                           // max number of clusters in a row
#define Y_MAX                 16                           // max number of clusters in a column
#define CORES_MAX             4                            // max number of cores per cluster
#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters

// Image description: main() checks the frame buffer width, height and type against
// IMAGE_SIZE and IMAGE_TYPE before starting.
#define IMAGE_SIZE            256                          // image width and height (pixels)
#define IMAGE_TYPE            420                          // expected frame buffer pixel encoding type
#define INPUT_FILE_PATH       "/misc/lena_256.raw"         // input file pathname (opened read-only)
#define OUTPUT_FILE_PATH      "/home/trsp_256.raw"         // output file pathname (created if required)

// Build options.
#define USE_DQT_BARRIER       1                            // non zero: barrier gets x_size/y_size/ncores attributes (quad-tree)
#define EXPLICIT_PLACEMENT    1                            // non zero: main() places one thread per core with attributes
#define VERBOSE               1                            // non zero: print progress traces on TTY


///////////////////////////////////////////////////////
//                global variables 
///////////////////////////////////////////////////////

// Instrumentation counters, one entry per core, indexed by [cxy][lid].
// Each entry is written by execute() with the current cycle (get_cycle() value
// truncated to unsigned int) at the start / end of the three phases
// (file mapping, transposition, display), and read by instrument().
unsigned int MMAP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int MMAP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int DISP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int DISP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};

// Arrays of pointers on distributed buffers:
// one input buffer & one output buffer per cluster.
// Unlike the counters above, these arrays are indexed by the continuous
// cluster index cid (cid == x * y_size + y), not by cxy.
// Entry [cid] is set by the thread with local index 0 in that cluster.
unsigned char *  buf_in [CLUSTERS_MAX];
unsigned char *  buf_out[CLUSTERS_MAX];

// synchronisation barrier (all threads), initialised by main()
pthread_barrier_t   barrier;

// platform parameters, set by get_config() in main()
unsigned int  x_size;                       // number of clusters in a row
unsigned int  y_size;                       // number of clusters in a column
unsigned int  ncores;                       // number of processors per cluster

// cluster identifier & local index of core running the main thread,
// set by get_core_id() in main()
unsigned int  cxy_main;
unsigned int  lid_main;

// input & output file descriptors, opened by main() and used by execute() for mmap()
int  fd_in;
int  fd_out;

#if EXPLICIT_PLACEMENT

// thread identifier returned by pthread_create(), indexed by [cxy][lid]
pthread_t        trdid[CLUSTERS_MAX][CORES_MAX];   

// user defined continuous thread index, indexed by [cxy][lid] :
// tid == ((x * y_size) + y) * ncores + lid 
unsigned int     tid[CLUSTERS_MAX][CORES_MAX];

// thread attributes only used if explicit placement (target cluster and core)
pthread_attr_t   attr[CLUSTERS_MAX][CORES_MAX];

#else 

// thread identifier returned by pthread_create(), indexed by continuous thread index
pthread_t        trdid[CLUSTERS_MAX * CORES_MAX];   

// user defined continuous thread index (tid[n] == n)
unsigned int     tid[CLUSTERS_MAX * CORES_MAX];

#endif

// Return values at thread exit: execute() passes the address of one of these
// variables to pthread_exit(), and main() compares the value obtained from
// pthread_join() with THREAD_EXIT_SUCCESS.
unsigned int THREAD_EXIT_SUCCESS = 0;
unsigned int THREAD_EXIT_FAILURE = 1;

////////////////////////////////////////////////////////////////
//             functions declaration
////////////////////////////////////////////////////////////////

// Function executed by every thread (including the main thread) :
// maps the files, transposes a slice of the image and displays it.
// The <ptid> argument points on the continuous index of the calling thread.
void execute( unsigned int * ptid );

// Called by the main thread when all threads completed : prints the min / max
// values of the instrumentation counters on the TTY and in a file.
void instrument( void );

///////////
void main()
{
    unsigned long long date;

    int error;

printf("\n bloup 0\n");

    // get identifiers for core executing main
    get_core_id( &cxy_main , &lid_main );

printf("\n bloup 1\n");

    // get & check plat-form parameters
    get_config( &x_size , &y_size , &ncores );

printf("\n bloup 2\n");

    if((ncores != 1) && (ncores != 2) && (ncores == 4))
    {
        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
        exit( 0 );
    }

    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
        (x_size != 8) && (x_size != 16) )
    {
        printf("\n[transpose error] x_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
        (y_size != 8) && (y_size != 16) )
    {
        printf("\n[transpose error] y_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
printf("\n bloup 3\n");

    // compute number of threads
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

printf("\n bloup 4\n");

    // get FBF ownership and FBF size
    unsigned int   fbf_width;
    unsigned int   fbf_height;
    unsigned int   fbf_type;
    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );

printf("\n bloup 5\n");

    if( (fbf_width != IMAGE_SIZE) || (fbf_height != IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) )
    {
        printf("\n[transpose error] image does not fit FBF size or type\n");
        exit( 0 );
    }

    get_cycle( &date );
    printf("\n[transpose] starts at cycle %d on %d cores / FBF = %d * %d pixels\n",
    (unsigned int)date , nthreads , fbf_width , fbf_height );

    // open input file
    fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 );    // read-only
    if ( fd_in < 0 ) 
    { 
        printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
        exit( 0 );
    } 

#if VERBOSE
printf("\n[transpose] main open file %s / fd = %d\n", INPUT_FILE_PATH , fd_in );
#endif

    // open output file
    fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 );   // create if required
    if ( fd_out < 0 ) 
    { 
        printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
        exit( 0 );
    }

#if  VERBOSE
printf("\n[transpose] main open file %s / fd = %d\n", OUTPUT_FILE_PATH , fd_out );
#endif

    // initialise barrier 
    if( USE_DQT_BARRIER )
    {
        pthread_barrierattr_t attr;
        attr.x_size   = x_size;
        attr.y_size   = y_size;
        attr.nthreads = ncores;
        error = pthread_barrier_init( &barrier, &attr , nthreads );
    }
    else
    {
        error = pthread_barrier_init( &barrier, NULL , nthreads );
    }

    if( error )
    { 
        printf("\n[transpose error] main cannot initialize barrier\n" );
        exit( 0 );
    }

    get_cycle( &date );
    printf("\n[transpose] main on core[%x,%d] completes initialisation at cycle %d\n" 
           "- CLUSTERS     = %d\n"
           "- PROCS        = %d\n" 
           "- THREADS      = %d\n",
           cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads );

//////////////////////
#if EXPLICIT_PLACEMENT

    // main thread launch other threads
    unsigned int x;
    unsigned int y;
    unsigned int l;
    unsigned int cxy;
    for( x = 0 ; x < x_size ; x++ )
    {
        for( y = 0 ; y < y_size ; y++ )
        {
            cxy = HAL_CXY_FROM_XY( x , y );
            for( l = 0 ; l < ncores ; l++ )
            {
                // no other thread on the core running the main
                if( (cxy != cxy_main) || (l != lid_main) )
                {
                    // define thread attributes
                    attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
                    attr[cxy][l].cxy        = cxy;
                    attr[cxy][l].lid        = l;

                    tid[cxy][l] = (((x  * y_size) + y) * ncores) + l;
  
                    // create thread on core[cxy,l]
                    if (pthread_create( &trdid[cxy][l],    
                                        &attr[cxy][l],    
                                        &execute,
                                        &tid[cxy][l] ) )       
                    {
                        printf("\n[convol error] created thread %x on core[%x][%d]\n",
                        trdid[cxy][l] , cxy , l );
                        exit( 0 );
                    }
#if VERBOSE 
printf("\n[transpose] main created thread[%x,%d]\n", cxy, l );
#endif
                }
            }
        }
    }    

    // main thread calls itself the execute() function
    execute( &tid[cxy_main][lid_main] );

    // main thread wait other threads completion
    for( x = 0 ; x < x_size ; x++ )
    {
        for( y = 0 ; y < y_size ; y++ )
        {
            cxy = HAL_CXY_FROM_XY( x , y );
            for( l = 0 ; l < ncores ; l++ )
            {
                // no other thread on the core running the main
                if( (cxy != cxy_main) || (l != lid_main) )
                {
                    unsigned int * status;

                    // wait thread[cxy][l]
                    if( pthread_join( trdid[cxy][l] , (void*)(&status) ) )
                    {
                        printf("\n[transpose error] main cannot join thread[%x,%d]\n", cxy, l );
                        exit( 0 );
                    }
       
                    // check status
                    if( *status != THREAD_EXIT_SUCCESS )
                    {
                        printf("\n[transpose error] thread[%x,%d] returned failure\n", cxy, l );
                        exit( 0 );
                    }
#if VERBOSE 
printf("\n[transpose] main joined thread[%x,%d]\n", cxy, l );
#endif
                }
            }
        }
    }

///////////////////////////////
#else  // no explicit placement

    // main thread launch other threads 
    unsigned int n;
    for ( n = 1 ; n < nthreads ; n++ )
    {
        tid[n] = n;
        if ( pthread_create( &trdid[n], 
                             NULL,                  // no attribute
                             &execute,
                             &tid[n] ) ) 
        {
            printf("\n[transpose error] cannot create thread %d\n", n );
            exit( 0 );
        }

#if VERBOSE
printf("\n[transpose] main created thread %d\n", tid[n] );
#endif

    }

    // main thread calls itself the execute() function
    execute( &tid[0] );

    // main thread wait other threads completion
    for ( n = 1 ; n < nthreads ; n++ )
    {
        unsigned int * status;

        // main wait thread[n] status
        if ( pthread_join( trdid[n], (void*)(&status)) )
        {
            printf("\n[transpose error] main cannot join thread %d\n", n );
            exit( 0 );
        }
       
        // check status
        if( *status != THREAD_EXIT_SUCCESS )
        {
            printf("\n[transpose error] thread %x returned failure\n", n );
            exit( 0 );
        }

#if VERBOSE 
printf("\n[transpose] main successfully joined thread %x\n", tid[n] );
#endif
        
    }

#endif

    // instrumentation
    instrument();

    // close input and output files 
    close( fd_in );
    close( fd_out );

    // suicide
    exit( 0 );
    
} // end main()



///////////////////////////////////
void execute( unsigned int * ptid )
{
    unsigned long long   date;
 
    unsigned int l;                         // line index for loops
    unsigned int p;                         // pixel index for loops

    // get thread continuous index 
    unsigned int my_tid = *ptid;

    // build total number of pixels per image
    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;      

    // nuild total number of threads and clusters
    unsigned int nthreads  = x_size * y_size * ncores;
    unsigned int nclusters = x_size * y_size;

    // get cluster continuous index and core index from tid
    // we use (tid == cid * ncores + lid) 
    unsigned int cid = my_tid / ncores;     // continuous index   
    unsigned int lid = my_tid % ncores;     // core local index

    // get cluster identifier from cid
    // we use (cid == x * y_size + y)
    unsigned int x   = cid / y_size;        // X cluster coordinate
    unsigned int y   = cid % y_size;        // Y cluster coordinate
    unsigned int cxy = HAL_CXY_FROM_XY(x,y);
    
#if VERBOSE
printf("\n[transpose] thread[%d] start on core[%x,%d]\n", my_tid , cxy , lid );
#endif

    // In each cluster cxy,  thread[cxy,0] map input file
    // to buf_in[cxy] and map output file to buf_in[cxy] 

    get_cycle( &date );
    MMAP_START[cxy][lid] = (unsigned int)date;

    if ( lid == 0 ) 
    {
        unsigned int length = npixels / nclusters;
        unsigned int offset = length * cid;
        
        // map buf_in
        buf_in[cid] =  mmap( NULL,
                             length,
                             PROT_READ,
                             MAP_SHARED,
                             fd_in,
                             offset );

        if ( buf_in[cid] == NULL )
        {
            printf("\n[transpose error] thread[%x,%d] cannot map input file\n", cxy, lid);
            pthread_exit( &THREAD_EXIT_FAILURE );
        }
                 
#if VERBOSE
printf("\n[transpose] thread[%x,%d] map input file / length %x / offset %x / buf_in %x\n",
cxy, lid, length, offset, buf_in[cid] );
#endif

        // map buf_out           
        buf_out[cid] = mmap( NULL,
                             length,
                             PROT_WRITE,
                             MAP_SHARED,
                             fd_out,
                             offset );

        if ( buf_out[cid] == NULL )
        {
            printf("\n[transpose error] thread[%x,%d] cannot map output file\n", cxy, lid);
            pthread_exit( &THREAD_EXIT_FAILURE );
        }
                   
#if VERBOSE
printf("\n[transpose] thread[%x,%d] map output file / length %x / offset %x / buf_out %x\n",
cxy, lid, length, offset, buf_out[cid] );
#endif

    }

    get_cycle( &date );
    MMAP_END[cxy][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );

    // parallel transpose from buf_in to buf_out
    // each thread makes the transposition for nlt lines (nlt = IMAGE_SIZE/nthreads)
    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
    // (p,l) are the absolute pixel coordinates in the source image

    get_cycle( &date );
    TRSP_START[cxy][lid] = (unsigned int)date;

    unsigned int nlt   = IMAGE_SIZE / nthreads;    // number of lines per thread
    unsigned int nlc   = IMAGE_SIZE / nclusters;   // number of lines per cluster

    unsigned int src_cluster;
    unsigned int src_index;
    unsigned int dst_cluster;
    unsigned int dst_index;

    unsigned char byte;

    unsigned int first = my_tid * nlt;     // first line index for a given thread
    unsigned int last  = first + nlt;      // last line index for a given thread

    for ( l = first ; l < last ; l++ )
    {
        // in each iteration we transfer one byte
        for ( p = 0 ; p < IMAGE_SIZE ; p++ )
        {
            // read one byte from local buf_in
            src_cluster = l / nlc;
            src_index   = (l % nlc) * IMAGE_SIZE + p;
            byte        = buf_in[src_cluster][src_index];

            // write one byte to remote buf_out
            dst_cluster = p / nlc; 
            dst_index   = (p % nlc) * IMAGE_SIZE + l;

            buf_out[dst_cluster][dst_index] = byte;
        }
    }

#if VERBOSE
printf("\n[transpose] thread[%x,%d] completes transposed\n", cxy, lid );
#endif

    get_cycle( &date );
    TRSP_END[cxy][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );

    // parallel display from local buf_out to frame buffer
    // all threads contribute to display 

    get_cycle( &date );
    DISP_START[cxy][lid] = (unsigned int)date;

    unsigned int  npt   = npixels / nthreads;   // number of pixels per thread

    if( fbf_write( &buf_out[cid][lid * npt], 
                   npt,
                   npt * my_tid ) )
    {
        printf("\n[transpose error] thread[%x,%d] cannot access FBF\n", cxy, lid );
        pthread_exit( &THREAD_EXIT_FAILURE );
    }

#if VERBOSE
printf("\n[transpose] thread[%x,%d] completes display\n", cxy, lid );
#endif

    get_cycle( &date );
    DISP_END[cxy][lid] = (unsigned int)date;

    /////////////////////////////////
    pthread_barrier_wait( &barrier );

    // all threads, but thread[0,0,0], suicide
    if ( (cxy != cxy_main) || (lid !=  lid_main) )
    { 
        pthread_exit( &THREAD_EXIT_SUCCESS );
    }

} // end execute()



//////////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument() : update the <min> and <max> bounds with one <value>.
//////////////////////////////////////////////////////////////////////////////////////////
static void instrument_update( unsigned int   value,
                               unsigned int * min,
                               unsigned int * max )
{
    if( value < *min ) *min = value;
    if( value > *max ) *max = value;
}

//////////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument() : print one result line, on the TTY and in file <f>.
// The <label> string is padded by the caller to keep the columns aligned.
// The median is computed as min + (max - min) / 2, that gives the same value
// as (min + max) / 2 but cannot wrap when the two counters are large.
//////////////////////////////////////////////////////////////////////////////////////////
static void instrument_report( FILE       * f,
                               const char * label,
                               unsigned int min,
                               unsigned int max )
{
    unsigned int delta = max - min;
    unsigned int med   = min + (delta / 2);

    printf( " - %s : min = %d / max = %d / med = %d / delta = %d\n",
            label, min, max, med, delta ); 

    fprintf( f , " - %s : min = %d / max = %d / med = %d / delta = %d\n",
            label, min, max, med, delta ); 
}

//////////////////////////////////////////////////////////////////////////////////////////
// instrument() : called by the main thread when all threads completed execute().
// It scans the six instrumentation counters of all cores in all clusters,
// and reports - for each counter - the min value, the max value, the median
// and the (max - min) difference, both on the TTY and in the file
// "/home/transpose_<x_size>_<y_size>_<ncores>".
// The process is terminated if the instrumentation file cannot be opened.
//////////////////////////////////////////////////////////////////////////////////////////
void instrument( void )
{
    unsigned int x, y, l;

    unsigned int min_load_start = 0xFFFFFFFF;
    unsigned int max_load_start = 0;
    unsigned int min_load_ended = 0xFFFFFFFF;
    unsigned int max_load_ended = 0;
    unsigned int min_trsp_start = 0xFFFFFFFF;
    unsigned int max_trsp_start = 0;
    unsigned int min_trsp_ended = 0xFFFFFFFF;
    unsigned int max_trsp_ended = 0;
    unsigned int min_disp_start = 0xFFFFFFFF;
    unsigned int max_disp_start = 0;
    unsigned int min_disp_ended = 0xFFFFFFFF;
    unsigned int max_disp_ended = 0;
 
    char string[64];

    // the file name identifies the platform configuration
    snprintf( string , 64 , "/home/transpose_%d_%d_%d" , x_size , y_size , ncores );

    // open instrumentation file
    FILE * f = fopen( string , NULL );
    if ( f == NULL ) 
    { 
        printf("\n[transpose error] cannot open instrumentation file %s\n", string );
        exit( 0 );
    }

    // compute min / max of each counter on all cores of all clusters
    for (x = 0; x < x_size; x++)
    {
        for (y = 0; y < y_size; y++)
        {
            unsigned int cxy = HAL_CXY_FROM_XY( x , y );

            for ( l = 0 ; l < ncores ; l++ )
            {
                instrument_update( MMAP_START[cxy][l] , &min_load_start , &max_load_start );
                instrument_update( MMAP_END[cxy][l]   , &min_load_ended , &max_load_ended );
                instrument_update( TRSP_START[cxy][l] , &min_trsp_start , &max_trsp_start );
                instrument_update( TRSP_END[cxy][l]   , &min_trsp_ended , &max_trsp_ended );
                instrument_update( DISP_START[cxy][l] , &min_disp_start , &max_disp_start );
                instrument_update( DISP_END[cxy][l]   , &min_disp_ended , &max_disp_ended );
            }
        }
    }

    printf( "\n ------ %s ------\n" , string );
    fprintf( f , "\n ------ %s ------\n" , string );

    // one line per counter : labels are padded to the same width
    instrument_report( f , "MMAP_START" , min_load_start , max_load_start );
    instrument_report( f , "MMAP_END  " , min_load_ended , max_load_ended );
    instrument_report( f , "TRSP_START" , min_trsp_start , max_trsp_start );
    instrument_report( f , "TRSP_END  " , min_trsp_ended , max_trsp_ended );
    instrument_report( f , "DISP_START" , min_disp_start , max_disp_start );
    instrument_report( f , "DISP_END  " , min_disp_ended , max_disp_ended );

    fclose( f );

}  // end instrument()




