///////////////////////////////////////////////////////////////////////////////////////
// File   : main.c   (for transpose application)
// Date   : february 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application transposes a sequence of images
// of NN*NN pixels.
// It can run on a multi-processor, multi-cluster architecture, with one thread
// per processor.
//
// The image sequence is read from a file (one byte per pixel).
// The input and output buffers containing the image are distributed in all clusters.
//
// - The image size NN must fit the frame buffer size: 128 bytes
// - The block size in block device must be 512 bytes.
// - The number of clusters  must be a power of 2 no larger than 32
// - The number of processors per cluster must be a power of 2 no larger than 4
//
// For each image the application makes a self test (checksum for each line).
// The actual display on the frame buffer depends on frame buffer availability.
///////////////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "user_barrier.h"
#include "malloc.h"

// Compile-time configuration of the transpose application.
#define BLOCK_SIZE          512                 // size in bytes of one block on disk
#define CLUSTERS_MAX        32                  // max number of clusters (dimension of the per-cluster arrays)
#define PROCS_MAX           4                   // max number of processors per cluster
#define NN                  256                 // image size : NN lines of NN pixels (one byte per pixel)
#define NB_IMAGES           1                   // number of images handled by the main loop
#define FILE_PATHNAME       "misc/lena.raw"     // pathname of the image file on virtual disk
#define INSTRUMENTATION_OK  0                   // display statistics on TTY when non zero

///////////////////////////////////////////////////////
// global variables stored in seg_data in cluster(0,0)
///////////////////////////////////////////////////////

// Instrumentation time stamps, indexed by [cluster index][local processor index].
// Each task records the date at the beginning and at the end of the
// load, transpose and display phases.
unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX];
unsigned int LOAD_END[CLUSTERS_MAX][PROCS_MAX];
unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX];
unsigned int TRSP_END[CLUSTERS_MAX][PROCS_MAX];
unsigned int DISP_START[CLUSTERS_MAX][PROCS_MAX];
unsigned int DISP_END[CLUSTERS_MAX][PROCS_MAX];

// Pointers on the distributed image buffers, indexed by cluster index:
// each cluster owns one input buffer and one output buffer.
unsigned char *buf_in[CLUSTERS_MAX];
unsigned char *buf_out[CLUSTERS_MAX];

// Per-line checksums, indexed by line index:
// computed on the source image (before) and on the transposed image (after).
unsigned int check_line_before[NN];
unsigned int check_line_after[NN];

// Barrier used by all tasks to separate the processing phases
giet_sqt_barrier_t barrier;

// Set to 1 by processor [0,0,0] when the initialisation is completed;
// the other processors poll it before entering the main loop.
volatile unsigned int init_ok = 0;

//////////////////////////////////////////
// Entry point of the transpose application, executed by every task
// (one task per processor).
//
// Processor [0,0,0] checks the platform parameters, allocates one input buffer
// and one output buffer per cluster, initialises the barrier, opens the image
// file and finally sets init_ok. All other processors poll init_ok meanwhile.
//
// Then, for each of the NB_IMAGES images:
//  1) load      : the task with (lpid == 0) in each cluster reads its part
//                 of the image from the file into its buf_in buffer.
//  2) transpose : each task copies NN/ntasks lines from buf_in to buf_out,
//                 and computes one checksum per source line.
//  3) display   : when USE_FBF is non zero, each task writes its part of
//                 buf_out to the frame buffer; otherwise the task with
//                 (lpid == 0) in each cluster checks the line checksums.
//  4) stats     : processor [0,0,0] prints the instrumentation counters
//                 when INSTRUMENTATION_OK is non zero.
// These phases are separated by sqt_barrier_wait() calls on the global barrier.
//
// Finally processor [0,0,0] releases the distributed buffers, and every task
// calls giet_exit().
__attribute__ ((constructor)) void main()
//////////////////////////////////////////
{

    unsigned int l;                  // line index for loops
    unsigned int p;                  // pixel index for loops
    unsigned int c;                  // cluster index for loops

    // processor identifiers
    unsigned int x;                  // x cluster coordinate
    unsigned int y;                  // y cluster coordinate
    unsigned int lpid;               // local processor index

    // platform parameters
    unsigned int x_size;             // number of clusters in a row
    unsigned int y_size;             // number of clusters in a column
    unsigned int nprocs;             // number of processors per cluster
    
    giet_proc_xyp( &x, &y, &lpid);             

    giet_procs_number( &x_size , &y_size , &nprocs );

    unsigned int nclusters  = x_size * y_size;               // number of clusters
    unsigned int ntasks     = x_size * y_size * nprocs;      // number of tasks
    unsigned int npixels    = NN * NN;                       // pixels per image
    unsigned int nblocks    = npixels / BLOCK_SIZE;          // blocks per image
    unsigned int image      = 0;                             // image counter
    int          file       = 0;                             // file descriptor
    unsigned int cluster_id = (x * y_size) + y;              // "continuous" index   
    unsigned int task_id    = (cluster_id * nprocs) + lpid;  // "continuous" task index

    // Processor [0,0,0] makes initialisation
    // It includes parameters checking, barrier initialization,
    // distributed buffers allocation, and file open
    if ( (x==0) && (y==0) && (lpid==0) )
    {
        if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))
        { 
            giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4");
        }
        if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) && 
            (nclusters != 8) && (nclusters != 16) && (nclusters != 32) )
        {
            giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32");
        }
        if ( ntasks > NN )
        {
            giet_exit("[TRANSPOSE ERROR] number of tasks larger than number of lines");
        }

        // Distributed buffers allocation
        // The buffers containing one image are distributed in the user 
        // heap (one buf_in and one buf_out per cluster).
        // Each buffer contains (NN*NN / nclusters) bytes.
        for ( c = 0 ; c < nclusters ; c++ )
        {
            // inverse of the cluster_id computation: c = (rx * y_size) + ry
            unsigned int rx = c / y_size;
            unsigned int ry = c % y_size;

            heap_init( rx, ry );
            buf_in[c]  = remote_malloc( npixels/nclusters, rx, ry );
            buf_out[c] = remote_malloc( npixels/nclusters, rx, ry );

            giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation"
                            " for cluster[%d,%d] at cycle %d\n"
                            " - buf_in  = %x\n"
                            " - buf_out = %x\n",
                            rx, ry, giet_proctime(), 
                            (unsigned int)buf_in[c], 
                            (unsigned int)buf_out[c] );
        }

        // Barrier initialisation
        sqt_barrier_init( &barrier, x_size , y_size , nprocs );

        giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n",
                        giet_proctime() );

        // open file containing images
        file = giet_fat_open( FILE_PATHNAME , 0 );

        if (file < 0)
        { 
            giet_shr_printf("\n[TRANSPOSE ERROR] Proc [%d,%d,%d]"
                            " cannot open file %s",
                            x , y , lpid , FILE_PATHNAME );
            giet_exit(" open() failure");
        }
        else
        {
            // report the pathname that was actually opened
            giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] open file %s\n",
                            FILE_PATHNAME );
        }

        // release the other processors (must be the last initialisation step)
        init_ok = 1;
    }
    else   // others processors wait initialisation completion
    {
        while ( init_ok == 0 );
    }
    
    /////////////////////////
    // Main loop (on images)
    while (image < NB_IMAGES)
    {
        // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks
        // only task running on processor with (lpid == 0) does it

        LOAD_START[cluster_id][lpid] = giet_proctime();

        if (lpid == 0)
        {
            // NOTE(review): "file" is only assigned by processor [0,0,0];
            // the loaders of the other clusters pass its initial value (0).
            // Confirm this matches the descriptor returned by giet_fat_open().
            giet_fat_read( file,
                           buf_in[cluster_id],
                           (nblocks / nclusters),
                           ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) );

            if ( (x==0) && (y==0) )
            {
                giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load"
                                "  for image %d at cycle %d\n",
                                x, y, lpid, image, giet_proctime() );
            }
        }

        LOAD_END[cluster_id][lpid] = giet_proctime();

        /////////////////////////////
        sqt_barrier_wait( &barrier );

        // parallel transpose from buf_in to buf_out
        // each task makes the transposition for nlt lines (nlt = NN/ntasks)
        // from line [task_id*nlt] to line [(task_id + 1)*nlt - 1]
        // (p,l) are the absolute pixel coordinates in the source image

        TRSP_START[cluster_id][lpid] = giet_proctime();

        unsigned int nlt   = NN / ntasks;      // number of lines per task
        unsigned int nlc   = NN / nclusters;   // number of lines per cluster

        unsigned int src_cluster;
        unsigned int src_index;
        unsigned int dst_cluster;
        unsigned int dst_index;

        unsigned char byte;

        unsigned int first = task_id * nlt;    // first line index for a given task
        unsigned int last  = first + nlt;      // last line index (excluded)

        for ( l = first ; l < last ; l++ )
        {
            check_line_before[l] = 0;
         
            // in each iteration we transfer one byte
            for ( p = 0 ; p < NN ; p++ )
            {
                // read one byte from local buf_in
                src_cluster = l / nlc;
                src_index   = (l % nlc)*NN + p;
                byte        = buf_in[src_cluster][src_index];

                // compute checksum
                check_line_before[l] = check_line_before[l] + byte;

                // write one byte to remote buf_out
                dst_cluster = p / nlc; 
                dst_index   = (p % nlc)*NN + l;
                buf_out[dst_cluster][dst_index] = byte;
            }
        }

        if ( (lpid == 0) && (x==0) && (y==0) )
        {
            giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose"
                            " for image %d at cycle %d\n", 
                            x, y, image, giet_proctime() );
        }
        TRSP_END[cluster_id][lpid] = giet_proctime();

        /////////////////////////////
        sqt_barrier_wait( &barrier );

        if ( USE_FBF )  // external frame buffer available
        {
            // parallel display from local buf_out to frame buffer
            // all processors contribute to display using memcpy...

            DISP_START[cluster_id][lpid] = giet_proctime();

            unsigned int  npt   = npixels / ntasks;   // number of pixels per task

            giet_fbf_sync_write( npt * task_id, 
                                 &buf_out[cluster_id][lpid*npt], 
                                 npt );

            if ( (x==0) && (y==0) && (lpid==0) )
            {
                giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display"
                                " for image %d at cycle %d\n",
                                x, y, lpid, image, giet_proctime() );
            }

            DISP_END[cluster_id][lpid] = giet_proctime();

            /////////////////////////////
            sqt_barrier_wait( &barrier );
        }
        else         // checksum by processor(x,y,0) in each cluster
        {
            if ( lpid == 0 )
            {
                unsigned int success = 1;
                unsigned int start   = cluster_id * nlc;
                unsigned int stop    = start + nlc;

                for ( l = start ; l < stop ; l++ )
                {
                    check_line_after[l] = 0;

                    // line l of the source image is column l of the
                    // transposed image, spread over all buf_out buffers
                    for ( p = 0 ; p < NN ; p++ )
                    {
                        // read one byte in remote buffer
                        src_cluster = p / nlc;
                        src_index   = (p % nlc)*NN + l;
                        byte        = buf_out[src_cluster][src_index];

                        check_line_after[l] = check_line_after[l] + byte;
                    }

                    if ( check_line_before[l] != check_line_after[l] )
                    {
                        success = 0;
                    }
                }

                if ( success ) 
                {
                    giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] checksum OK"
                                    " for image %d at cycle %d\n",
                                    x, y, image, giet_proctime() );
                }
                else
                {
                    giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] checksum KO"
                                    " for image %d at cycle %d\n",
                                    x, y, image, giet_proctime() );
                }
            } 
        }

        /////////////////////////////
        sqt_barrier_wait( &barrier );

        // instrumentation done by processor [0,0,0] 
        if ( (x==0) && (y==0) && (lpid==0) && INSTRUMENTATION_OK )
        {
            unsigned int cc;
            unsigned int pp;
            unsigned int min_load_start = 0xFFFFFFFF;
            unsigned int max_load_start = 0;
            unsigned int min_load_ended = 0xFFFFFFFF;
            unsigned int max_load_ended = 0;
            unsigned int min_trsp_start = 0xFFFFFFFF;
            unsigned int max_trsp_start = 0;
            unsigned int min_trsp_ended = 0xFFFFFFFF;
            unsigned int max_trsp_ended = 0;
            unsigned int min_disp_start = 0xFFFFFFFF;
            unsigned int max_disp_start = 0;
            unsigned int min_disp_ended = 0xFFFFFFFF;
            unsigned int max_disp_ended = 0;

            // Only the entries actually written by the running tasks are scanned:
            // nprocs has been checked to be 1, 2 or 4 (no larger than PROCS_MAX),
            // and nclusters to be no larger than CLUSTERS_MAX.
            // The DISP_* counters are only written when USE_FBF is non zero.
            for (cc = 0; cc < nclusters; cc++)
            {
                for (pp = 0; pp < nprocs; pp++)
                {
                    if (LOAD_START[cc][pp] < min_load_start)  min_load_start = LOAD_START[cc][pp];
                    if (LOAD_START[cc][pp] > max_load_start)  max_load_start = LOAD_START[cc][pp];
                    if (LOAD_END[cc][pp]   < min_load_ended)  min_load_ended = LOAD_END[cc][pp]; 
                    if (LOAD_END[cc][pp]   > max_load_ended)  max_load_ended = LOAD_END[cc][pp];
                    if (TRSP_START[cc][pp] < min_trsp_start)  min_trsp_start = TRSP_START[cc][pp];
                    if (TRSP_START[cc][pp] > max_trsp_start)  max_trsp_start = TRSP_START[cc][pp];
                    if (TRSP_END[cc][pp]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cc][pp];
                    if (TRSP_END[cc][pp]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cc][pp];
                    if (DISP_START[cc][pp] < min_disp_start)  min_disp_start = DISP_START[cc][pp];
                    if (DISP_START[cc][pp] > max_disp_start)  max_disp_start = DISP_START[cc][pp];
                    if (DISP_END[cc][pp]   < min_disp_ended)  min_disp_ended = DISP_END[cc][pp];
                    if (DISP_END[cc][pp]   > max_disp_ended)  max_disp_ended = DISP_END[cc][pp];
                }
            }

            giet_shr_printf(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n",
                            min_load_start, max_load_start, (min_load_start+max_load_start)/2, 
                            max_load_start-min_load_start); 

            giet_shr_printf(" - LOAD_END   : min = %d / max = %d / med = %d / delta = %d\n",
                            min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, 
                            max_load_ended-min_load_ended); 

            giet_shr_printf(" - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
                            min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2, 
                            max_trsp_start-min_trsp_start); 

            giet_shr_printf(" - TRSP_END   : min = %d / max = %d / med = %d / delta = %d\n",
                            min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2, 
                            max_trsp_ended-min_trsp_ended); 

            giet_shr_printf(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
                            min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2, 
                            max_disp_start-min_disp_start); 

            giet_shr_printf(" - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
                            min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2, 
                            max_disp_ended-min_disp_ended); 
        }

        image++;

        /////////////////////////////
        sqt_barrier_wait( &barrier );

    } // end while image      

    // Processor[0,0,0] releases the Distributed buffers
    // (both the input and the output buffer of each cluster)
    if ( (x==0) && (y==0) && (lpid==0) )
    {
        for ( c = 0 ; c < nclusters ; c++ )
        {
            free( buf_in[c] );
            free( buf_out[c] );
        }
    }

    giet_exit("Completed");

} // end main()

// Local Variables:
// tab-width: 3
// c-basic-offset: 
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3



