//////////////////////////////////////////////////////////////////////////////////////////
// File   : transpose.c   
// Date   : september 2015
// author : Alain Greiner
//////////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application reads a raw image (one byte per pixel)
// stored on disk, transposes it, displays the result on the frame buffer,
// and stores the transposed image on disk.
// The input image can be interactively selected if the INTERACTIVE flag is set.
// It can run on a multi-processors, multi-clusters architecture, with one thread
// per processor, and uses the POSIX threads API. 
// It uses the giet_fat_mmap() to directly access the input and output files
// in the kernel files cache. It does not use the CMA to display the result image.
//
// The main() function can be launched on any processor P[x,y,l].
// It makes the initialisations, launches (N-1) threads to run the execute() function
// on the (N-1) processors other than P[x,y,l], calls the execute() function itself,
// and finally calls the instrument() function to display instrumentation results
// when the parallel execution is completed.
//
// The buf_in[x,y] and buf_out[x,y] buffers containing the direct and transposed images
// are distributed in clusters:
// In each cluster[x,y], the thread running on processor P[x,y,0] uses the giet_fat_mmap()
// function to map the buf_in[x,y] and buf_out[x,y] buffers containing a set of lines.
// Then, all threads in cluster[x,y] read pixels from the local buf_in[x,y] buffer, and
// write the pixels to the remote buf_out[x,y] buffers. Finally, each thread displays
// a part of the transposed image on the frame buffer.
//
// - The image size must fit the frame buffer width and height, that must be power of 2.
// - The number of clusters  must be a power of 2 no larger than 256.
// - The number of processors per cluster must be a power of 2 no larger than 4.
// - The number of clusters cannot be larger than (image_size * image_size) / 4096,
//   because the size of buf_in[x,y] and buf_out[x,y] must be multiple of 4096.
//
// The transpose_rw.c file contains a variant that uses the giet_fat_read()
// and giet_fat_write() system calls to access the files.
//////////////////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "stdlib.h"
#include "user_barrier.h"
#include "malloc.h"

// Compile-time configuration.
// X_MAX, Y_MAX and PROCS_MAX dimension the per-processor instrumentation arrays,
// and CLUSTER_MAX dimensions the buf_in[] / buf_out[] pointer arrays.
// The *_PATH and IMAGE_SIZE values are the defaults copied by main() when
// INTERACTIVE is 0, or when the user enters an empty string / a zero size.
#define BLOCK_SIZE            512                          // block size on disk (not referenced in this file)
#define X_MAX                 16                           // max number of clusters in a row
#define Y_MAX                 16                           // max number of clusters in a column
#define PROCS_MAX             4                            // max number of procs per cluster
#define CLUSTER_MAX           (X_MAX * Y_MAX)              // max number of clusters
#define IMAGE_SIZE            256                          // default image size (pixels per line = lines per image)
#define INPUT_FILE_PATH       "/misc/lena_256.raw"         // default input file pathname
#define OUTPUT_FILE_PATH      "/home/lena_transposed.raw"  // default output file pathname
#define INTERACTIVE           0                            // non-zero: main() asks for pathnames and size on the TTY
#define VERBOSE               1                            // non-zero: execute() traces its mmap calls on the TTY


// macro to use a shared TTY
#define printf(...);    { lock_acquire( &tty_lock ); \
                          giet_tty_printf(__VA_ARGS__);  \
                          lock_release( &tty_lock ); }

///////////////////////////////////////////////////////
// global variables stored in seg_data in cluster(0,0)
// They are shared by main() and by all threads
// running the execute() function.
///////////////////////////////////////////////////////

// instrumentation counters for each processor in each cluster:
// each entry [x][y][p] is written by the thread running execute() on processor
// P[x,y,p] with the giet_proctime() value sampled at the start / end of the
// mmap, transpose and display phases, and is read by instrument().
unsigned int MMAP_START[X_MAX][Y_MAX][PROCS_MAX] = {{{ 0 }}};
unsigned int MMAP_END  [X_MAX][Y_MAX][PROCS_MAX] = {{{ 0 }}};
unsigned int TRSP_START[X_MAX][Y_MAX][PROCS_MAX] = {{{ 0 }}};
unsigned int TRSP_END  [X_MAX][Y_MAX][PROCS_MAX] = {{{ 0 }}};
unsigned int DISP_START[X_MAX][Y_MAX][PROCS_MAX] = {{{ 0 }}};
unsigned int DISP_END  [X_MAX][Y_MAX][PROCS_MAX] = {{{ 0 }}};

// arrays of pointers on distributed buffers
// one input buffer & one output buffer per cluster,
// indexed by cluster_id = (x_id * y_size) + y_id.
// Each entry is set in execute() by the thread with local index 0 in that
// cluster, from the value returned by giet_fat_mmap().
unsigned char*  buf_in [CLUSTER_MAX];
unsigned char*  buf_out[CLUSTER_MAX];

// lock protecting shared TTY (taken by the printf macro, initialised in main())
user_lock_t  tty_lock;

// synchronisation barrier (all threads):
// initialised in main() for x_size * y_size * nprocs participants
giet_sqt_barrier_t barrier;

// input & output files pathname and size
// (image_size is the number of pixels per line and the number of lines;
// all three are set by main() before the threads are created)
char          input_file_name[256];
char          output_file_name[256];
unsigned int  image_size;

// input & output file descriptors:
// opened and closed by main(), passed to giet_fat_mmap() in execute()
int  fd_in;
int  fd_out;

//////////////////////////////////////////////////////////////////////////////////////////
// This function is executed by every thread, including the thread running main().
// It runs three phases, separated by a barrier, and records the start / end date
// of each phase in the instrumentation arrays:
// 1) mmap      : the thread with local index 0 in each cluster maps the slice of the
//                input and output files owned by this cluster.
// 2) transpose : each thread copies its lines of the source image to the
//                corresponding columns of the transposed image.
// 3) display   : each thread writes its part of buf_out to the frame buffer.
// All threads but thread[0,0,0] then call giet_pthread_exit(); thread[0,0,0] returns.
// It reads the image_size, fd_in and fd_out global variables set by main().
//////////////////////////////////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void execute()
{
    unsigned int l;                            // line index for loops
    unsigned int p;                            // pixel index for loops

    // get processor identifiers 
    unsigned int x_id;                         // x cluster coordinate
    unsigned int y_id;                         // y cluster coordinate
    unsigned int p_id;                         // local processor index

    giet_proc_xyp( &x_id , &y_id , &p_id );

    // get plat-form parameters (they are checked by main)
    unsigned int x_size;                       // number of clusters in a row
    unsigned int y_size;                       // number of clusters in a column
    unsigned int nprocs;                       // number of processors per cluster

    giet_procs_number( &x_size , &y_size , &nprocs );

    unsigned int nclusters  = x_size * y_size;                 // number of clusters
    unsigned int nthreads   = x_size * y_size * nprocs;        // number of threads
    unsigned int npixels    = image_size * image_size;         // pixels per image
    unsigned int cluster_id = (x_id * y_size) + y_id;          // "continuous" index
    unsigned int thread_id  = (cluster_id * nprocs) + p_id;    // "continuous" index

    // parallel load of image:
    // thread running on processor[x,y,0] 
    // maps input & output files in buf_in & buf_out buffers.
    // NOTE(review): on a mapping failure the thread exits before the first barrier;
    // the barrier was initialised for all threads, so the other threads presumably
    // wait forever - confirm against the sqt_barrier semantics.

    MMAP_START[x_id][y_id][p_id] = giet_proctime();

    if ( p_id == 0 ) 
    {
        // each cluster owns one contiguous slice of the file
        unsigned int length = npixels / nclusters;
        unsigned int offset = length * cluster_id;

        buf_in[cluster_id] = giet_fat_mmap( NULL,
                                            length,
                                            MAP_PROT_READ,
                                            MAP_SHARED,
                                            fd_in,
                                            offset );
        if ( buf_in[cluster_id] == NULL )
        {
            printf("\n[TRANSPOSE ERROR] Thread[%d,%d,%d] cannot map input file\n",
                   x_id , y_id , p_id );
            giet_pthread_exit( NULL );
        }

        // braces are required here: they keep the trace conditional
        // whatever the expansion of the printf macro is
        if ( VERBOSE )
        {
            printf("\n@@@ Thread[%d,%d,%d] call mmap for input file\n"
                   " length = %x / offset = %x / buf_in = %x\n",
                   x_id , y_id , p_id , length , offset , buf_in[cluster_id] );
        }

        buf_out[cluster_id] = giet_fat_mmap( NULL,
                                             length,
                                             MAP_PROT_WRITE,
                                             MAP_SHARED,
                                             fd_out,
                                             offset );
        if ( buf_out[cluster_id] == NULL )
        {
            printf("\n[TRANSPOSE ERROR] Thread[%d,%d,%d] cannot map output file\n",
                   x_id , y_id , p_id );
            giet_pthread_exit( NULL );
        }

        if ( VERBOSE )
        {
            printf("\n@@@ Thread[%d,%d,%d] call mmap for output file\n"
                   " length = %x / offset = %x / buf_out = %x\n",
                   x_id , y_id , p_id , length , offset , buf_out[cluster_id] );
        }
    }

    MMAP_END[x_id][y_id][p_id] = giet_proctime();

    /////////////////////////////
    sqt_barrier_wait( &barrier );
    /////////////////////////////

    // parallel transpose from buf_in to buf_out
    // each thread makes the transposition for nlt lines (nlt = image_size/nthreads)
    // from line [thread_id*nlt] to line [(thread_id + 1)*nlt - 1]
    // (p,l) are the absolute pixel coordinates in the source image

    TRSP_START[x_id][y_id][p_id] = giet_proctime();

    unsigned int nlt   = image_size / nthreads;    // number of lines per thread
    unsigned int nlc   = image_size / nclusters;   // number of lines per cluster

    unsigned int first = thread_id * nlt;          // first line index for this thread
    unsigned int last  = first + nlt;              // last line index (excluded)

    for ( l = first ; l < last ; l++ )
    {
        // the source cluster and the line offset in buf_in do not depend on p:
        // they are computed once per line instead of once per pixel
        unsigned char* src_line = &buf_in[l / nlc][(l % nlc) * image_size];

        // in each iteration we transfer one byte
        for ( p = 0 ; p < image_size ; p++ )
        {
            // pixel (p,l) of the source image is pixel (l,p) of the transposed
            // image: line p is stored in the buf_out of cluster (p / nlc)
            unsigned int dst_cluster = p / nlc;
            unsigned int dst_index   = (p % nlc) * image_size + l;

            buf_out[dst_cluster][dst_index] = src_line[p];
        }
    }

    if ( (p_id == 0) && (x_id == 0) && (y_id == 0) )
    {
        printf("\n[TRANSPOSE] Thread[%d,%d,%d] completes transpose at cycle %d\n", 
               x_id, y_id, p_id, giet_proctime() );
    }

    TRSP_END[x_id][y_id][p_id] = giet_proctime();

    /////////////////////////////
    sqt_barrier_wait( &barrier );
    /////////////////////////////

    // parallel display from local buf_out to frame buffer
    // each thread writes npt consecutive pixels, taken in the buf_out of its cluster

    DISP_START[x_id][y_id][p_id] = giet_proctime();

    unsigned int npt = npixels / nthreads;         // number of pixels per thread

    giet_fbf_sync_write( npt * thread_id, 
                         &buf_out[cluster_id][p_id * npt], 
                         npt );

    if ( (x_id == 0) && (y_id == 0) && (p_id == 0) )
    {
        printf("\n[TRANSPOSE] Thread[%d,%d,%d] completes display at cycle %d\n",
               x_id, y_id, p_id, giet_proctime() );
    }

    DISP_END[x_id][y_id][p_id] = giet_proctime();

    /////////////////////////////
    sqt_barrier_wait( &barrier );
    /////////////////////////////

    // all threads, but thread[0,0,0], suicide
    if ( (x_id != 0) || (y_id != 0) || (p_id != 0) ) 
    {
        giet_pthread_exit( "completed" );
    }

} // end execute()



//////////////////////////////////////////////////////////////////////////////////////////
// Instrumentation helpers (private to this file)
//////////////////////////////////////////////////////////////////////////////////////////

// [min,max] window over the dates recorded by all threads for one event
typedef struct
{
    unsigned int min;
    unsigned int max;
} instr_range_t;

// Widens the <range> so that it contains the <cycle> value.
static void instr_range_add( instr_range_t* range,
                             unsigned int   cycle )
{
    if ( cycle < range->min ) { range->min = cycle; }
    if ( cycle > range->max ) { range->max = cycle; }
}

// Prints one result line on the shared TTY, and writes the same line to file <fd>.
// The <format> string must contain four numerical conversions, in this order:
// min, max, med and delta.
static void instr_range_report( int                  fd,
                                char*                format,
                                const instr_range_t* range )
{
    unsigned int delta = range->max - range->min;

    // same value as (min + max) / 2, but the sum cannot wrap on 32 bits
    unsigned int med   = range->min + (delta / 2);

    printf( format , range->min , range->max , med , delta );
    giet_fat_fprintf( fd , format , range->min , range->max , med , delta );
}

//////////////////////////////////////////////////////////////////////////////////////////
// This function computes, for each of the six instrumentation events, the min and max
// dates recorded by the (x_size * y_size * nprocs) threads. For each event it displays
// the min / max / median / delta values on the shared TTY, and writes the same lines
// to the "/home/transpose.inst" file.
// The calling thread exits if this file cannot be opened.
// - x_size : number of clusters in a row
// - y_size : number of clusters in a column
// - nprocs : number of processors per cluster
//////////////////////////////////////////////////////////////////////////////////////////
void instrument( unsigned int x_size,
                 unsigned int y_size,
                 unsigned int nprocs )
{
    unsigned int x, y, l;

    instr_range_t load_start = { 0xFFFFFFFF , 0 };
    instr_range_t load_ended = { 0xFFFFFFFF , 0 };
    instr_range_t trsp_start = { 0xFFFFFFFF , 0 };
    instr_range_t trsp_ended = { 0xFFFFFFFF , 0 };
    instr_range_t disp_start = { 0xFFFFFFFF , 0 };
    instr_range_t disp_ended = { 0xFFFFFFFF , 0 };

    // open instrumentation file
    // the descriptor must be signed: with an unsigned type the (fd < 0) test
    // is always false, and an open failure is never detected
    int fd = giet_fat_open( "/home/transpose.inst" , O_CREAT);
    if ( fd < 0 ) 
    { 
        printf("\n[TRANSPOSE ERROR] main cannot open file transpose.inst\n");
        giet_pthread_exit( NULL );
    }

    // scan the counters of all processors in all clusters
    for ( x = 0 ; x < x_size ; x++ )
    {
        for ( y = 0 ; y < y_size ; y++ )
        {
            for ( l = 0 ; l < nprocs ; l++ )
            {
                instr_range_add( &load_start , MMAP_START[x][y][l] );
                instr_range_add( &load_ended , MMAP_END[x][y][l]   );
                instr_range_add( &trsp_start , TRSP_START[x][y][l] );
                instr_range_add( &trsp_ended , TRSP_END[x][y][l]   );
                instr_range_add( &disp_start , DISP_START[x][y][l] );
                instr_range_add( &disp_ended , DISP_END[x][y][l]   );
            }
        }
    }

    printf("\n   ---------------- Instrumentation Results ---------------------\n");

    instr_range_report( fd ,
                        " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n",
                        &load_start );

    instr_range_report( fd ,
                        " - MMAP_END   : min = %d / max = %d / med = %d / delta = %d\n",
                        &load_ended );

    instr_range_report( fd ,
                        " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
                        &trsp_start );

    instr_range_report( fd ,
                        " - TRSP_END   : min = %d / max = %d / med = %d / delta = %d\n",
                        &trsp_ended );

    instr_range_report( fd ,
                        " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
                        &disp_start );

    instr_range_report( fd ,
                        " - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
                        &disp_ended );

    giet_fat_close( fd );

}  // end instrument()



//////////////////////////////////////////////////////////////////////////////////////////
// Application entry point. It
// - checks the plat-form parameters (procs per cluster, clusters per row / column),
// - allocates the shared TTY and the frame buffer,
// - gets the file pathnames and the image size (interactively if INTERACTIVE is set),
// - initialises the distributed heap, opens the input and output files,
// - initialises the barrier, and creates (nthreads - 1) threads running execute(),
// - runs execute() itself, then joins the other threads,
// - calls instrument(), closes the two files and exits.
// On any error, a message is displayed on the TTY and the main thread exits.
// NOTE(review): the file header requires the image size to fit the frame buffer,
// but the FBF width and height are only displayed here, not compared to image_size.
//////////////////////////////////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void main()
{
    // indexes for loops
    unsigned int x , y , n;

    // get identifiers for proc executing main
    unsigned int x_id;                          // x cluster coordinate
    unsigned int y_id;                          // y cluster coordinate
    unsigned int p_id;                          // local processor index
    giet_proc_xyp( &x_id , &y_id , &p_id );

    // get & check plat-form parameters
    unsigned int x_size;                       // number of clusters in a row
    unsigned int y_size;                       // number of clusters in a column
    unsigned int nprocs;                       // number of processors per cluster
    giet_procs_number( &x_size , &y_size , &nprocs );

    giet_pthread_assert( ((nprocs == 1) || (nprocs == 2) || (nprocs == 4)),
                         "[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");

    giet_pthread_assert( ((x_size == 1) || (x_size == 2) || (x_size == 4) || 
                          (x_size == 8) || (x_size == 16)),
                         "[TRANSPOSE ERROR] x_size must be 1,2,4,8,16");

    giet_pthread_assert( ((y_size == 1) || (y_size == 2) || (y_size == 4) || 
                          (y_size == 8) || (y_size == 16)),
                         "[TRANSPOSE ERROR] y_size must be 1,2,4,8,16");

    // compute number of threads
    unsigned int nthreads = x_size * y_size * nprocs;

    // shared TTY allocation
    giet_tty_alloc( 1 );     
    lock_init( &tty_lock);

    // get FBF ownership and FBF size
    unsigned int   width;
    unsigned int   height;
    giet_fbf_alloc();
    giet_fbf_size( &width , &height );

    printf("\n[TRANSPOSE] start at cycle %d on %d cores / FBF = %d * %d pixels\n",
           giet_proctime(), nthreads , width , height );

    if ( INTERACTIVE ) // input_file_name, output_file_name, and size  acquisition 
    {
        // an empty string (or a zero size) selects the default value
        printf("\n[TRANSPOSE] enter path for input file / default is : %s\n> ", INPUT_FILE_PATH );  
        giet_tty_gets( input_file_name , 256 );
        printf("\n");
        if ( strcmp( input_file_name , "" ) == 0 )
        {
            strcpy( input_file_name , INPUT_FILE_PATH );
        }

        printf("\n[TRANSPOSE] enter path for output file / default is : %s\n> ", OUTPUT_FILE_PATH );  
        giet_tty_gets( output_file_name , 256 );
        printf("\n");
        if ( strcmp( output_file_name , "" ) == 0 )
        {
            strcpy( output_file_name , OUTPUT_FILE_PATH );
        }

        printf("\n[TRANSPOSE] enter image size / default is : %d\n> ", IMAGE_SIZE );  
        giet_tty_getw( &image_size );
        printf("\n");
        if ( image_size == 0 )
        {
            image_size = IMAGE_SIZE;
        }
    }
    else
    {
        strcpy( input_file_name , INPUT_FILE_PATH );
        strcpy( output_file_name , OUTPUT_FILE_PATH );
        image_size = IMAGE_SIZE;
    }

    // check image size / number of clusters:
    // each cluster maps (image_size * image_size) / nclusters bytes of the files
    giet_pthread_assert( ((((image_size * image_size) / (x_size * y_size)) & 0xFFF) == 0) ,
                         "[TRANSPOSE ERROR] pixels per cluster must be multiple of 4096");

    printf("\n[TRANSPOSE] input = %s / output = %s / size = %d\n",
           input_file_name, output_file_name, image_size );

    // distributed heap initialisation
    for ( x = 0 ; x < x_size ; x++ ) 
    {
        for ( y = 0 ; y < y_size ; y++ ) 
        {
            heap_init( x , y );
        }
    }

    // open input and output files
    fd_in = giet_fat_open( input_file_name , O_RDONLY );  // read_only
    if ( fd_in < 0 ) 
    { 
        printf("\n[TRANSPOSE ERROR] main cannot open file %s\n", input_file_name );
        giet_pthread_exit( NULL );
    }
    else 
    {
        printf("\n[TRANSPOSE] main open file %s / fd = %d\n", input_file_name , fd_in );
    }

    fd_out = giet_fat_open( output_file_name , O_CREAT);   // create if required
    if ( fd_out < 0 ) 
    { 
        printf("\n[TRANSPOSE ERROR] main cannot open file %s\n", output_file_name );

        // the input file is already open: release it before leaving
        giet_fat_close( fd_in );
        giet_pthread_exit(" open() failure");
    }
    else
    {
        printf("\n[TRANSPOSE] main open file %s / fd = %d\n", output_file_name , fd_out );
    }

    // allocate thread[] array (entry 0 is not used: main runs execute() itself)
    pthread_t* thread = malloc( nthreads * sizeof(pthread_t) );
    if ( thread == NULL )
    {
        // without this check, the failure would only show up as a write
        // through a NULL pointer in the thread creation loop below
        printf("\n[TRANSPOSE ERROR] main cannot allocate thread array\n");
        giet_fat_close( fd_in );
        giet_fat_close( fd_out );
        giet_pthread_exit( NULL );
    }

    // barrier initialisation
    sqt_barrier_init( &barrier, x_size , y_size , nprocs );

    // Initialisation completed
    printf("\n[TRANSPOSE] main completes initialisation\n");

    // launch other threads to run execute() function
    for ( n = 1 ; n < nthreads ; n++ )
    {
        if ( giet_pthread_create( &thread[n],
                                  NULL,                  // no attribute
                                  &execute,
                                  NULL ) )               // no argument
        {
            printf("\n[TRANSPOSE ERROR] creating thread %x\n", thread[n] );
            giet_pthread_exit( NULL );
        }
    }

    // run the execute() function
    execute();

    // wait other threads completion
    for ( n = 1 ; n < nthreads ; n++ )
    {
        if ( giet_pthread_join( thread[n], NULL ) )
        {
            printf("\n[TRANSPOSE ERROR] joining thread %x\n", thread[n] );
            giet_pthread_exit( NULL );
        }
        else
        {
            printf("\n[TRANSPOSE] thread %x joined at cycle %d\n",
                   thread[n] , giet_proctime() );
        }
    }

    // call the instrument() function
    instrument( x_size , y_size , nprocs );

    // close input and output files 
    giet_fat_close( fd_in );
    giet_fat_close( fd_out );

    // suicide
    giet_pthread_exit( "completed" );

} // end main()

