#include "stdio.h"

// Number of lines in one image.
#define NL		512
// Number of pixels in one line (pixels are stored as one unsigned char each).
#define NP		512
// Number of images processed by the main loop.
#define NB_IMAGES	1
// Divisor used to convert the image size (NP*NL) into the number of blocks
// passed to ioc_read().
#define BLOCK_SIZE	512 

// Prints through tty_printf only on the task whose local_id is 0.
// The macro expands to an unbraced `if`, so it requires a variable named
// `local_id` in scope and must not be directly followed by an `else`.
#define PRINTF		if(local_id == 0) tty_printf

//////////////////////////////////////////////////////////////////////////
// Parameters read from the ldscript.
// Each parameter is declared as an external symbol of an incomplete type:
// no object is ever accessed through these symbols, only their ADDRESS is
// taken (and cast to unsigned int in main) to obtain the parameter value.
//////////////////////////////////////////////////////////////////////////

// Incomplete type, used only so the symbols below can be declared.
struct plaf;

// address = base address for the shared buffers
extern struct plaf seg_heap_base;
// address = number of processors per cluster
extern struct plaf NB_PROCS;
// address = number of clusters
extern struct plaf NB_CLUSTERS;

/////////////
// Computes the minimum and maximum of the time stamps stored in one
// instrumentation table (one array of nprocs values for each cluster).
// - table     : array of per-cluster pointers, indexed by cluster index
// - nclusters : number of clusters to scan
// - nprocs    : number of values to scan in each cluster
// - min / max : receive the smallest / largest value found
static void stamp_min_max(unsigned int *table[],
                          unsigned int nclusters,
                          unsigned int nprocs,
                          unsigned int *min,
                          unsigned int *max)
{
    unsigned int lo = 0xFFFFFFFF;   // largest value: any stamp replaces it
    unsigned int hi = 0;
    unsigned int c;
    unsigned int p;

    for ( c=0 ; c<nclusters ; c++ )
    {
        for ( p=0 ; p<nprocs ; p++ )
        {
            unsigned int stamp = table[c][p];
            if ( stamp < lo ) lo = stamp;
            if ( stamp > hi ) hi = stamp;
        }
    }
    *min = lo;
    *max = hi;
}

// Entry point, executed by every task (one task per processor).
// For each image, three phases separated by barriers are executed:
//   1) load      : the task with local_id == 0 in each cluster reads its
//                  share of the image from disk into A[cluster_id]
//   2) transpose : every task transposes NL/ntasks lines from A to B
//   3) display   : every task writes NL*NP/ntasks pixels of B to the
//                  frame buffer
// Time stamps of each phase are recorded and summarised after the display.
// This function never returns: it spins forever once all images are done.
void main()
{
    unsigned int image = 0;
    unsigned int date  = 0;

    unsigned int c;     // cluster index for loops
    unsigned int l;     // line index for loops
    unsigned int p;     // pixel / processor index for loops

    // NOTE(review): if NB_PROCS or NB_CLUSTERS were 0, the divisions below
    // would fault before the parameter checks are reached.
    unsigned int proc_id    = procid();                         // processor id
    unsigned int nprocs     = (unsigned int)&NB_PROCS;          // number of processors per cluster
    unsigned int nclusters  = (unsigned int)&NB_CLUSTERS;       // number of clusters
    unsigned int local_id   = proc_id%nprocs;                   // local processor id
    unsigned int cluster_id = proc_id/nprocs;                   // cluster id
    unsigned int base       = (unsigned int)&seg_heap_base;     // base address for shared buffers
    unsigned int increment  = (0x80000000 / nclusters) * 2;     // cluster increment (2^32 / nclusters)
    unsigned int ntasks     = nclusters * nprocs;               // number of tasks
    unsigned int nblocks    = (NP*NL) / BLOCK_SIZE;             // number of blocks per image

    PRINTF("\n *** Entering main at cycle %d ***\n\n", proctime());

    // Parameters checking.
    // The message is printed by the task with local_id == 0 only, but every
    // task failing a check stops: continuing would use an inconsistent
    // geometry (or, for an out-of-range proc_id, uninitialised pointers).
    if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) )
    {
        PRINTF("NB_PROCS must be 1, 2 or 4\n");
        exit();
    }
    if( (nclusters !=  1) && (nclusters !=  2) && (nclusters !=  4) && (nclusters !=  8) &&
        (nclusters != 16) && (nclusters != 32) && (nclusters != 64) && (nclusters !=128) &&
        (nclusters != 256) )
    {
        PRINTF("NB_CLUSTERS must be a power of 2 between 1 and 256\n");
        exit();
    }
    if( ntasks > 1024 )
    {
        PRINTF("NB_PROCS * NB_CLUSTERS cannot be larger than 1024\n");
        exit();
    }
    if( proc_id >= ntasks )
    {
        PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id);
        exit();
    }

    // Arrays of pointers on the shared, distributed buffers containing the images.
    // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters)
    unsigned char*  A[256];
    unsigned char*  B[256];

    // Arrays of pointers on the instrumentation arrays.
    // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters);
    // each pointer points on the base address of an array of nprocs unsigned int.
    unsigned int*   LOAD_START[256];
    unsigned int*   LOAD_ENDED[256];
    unsigned int*   TRSP_START[256];
    unsigned int*   TRSP_ENDED[256];
    unsigned int*   DISP_START[256];
    unsigned int*   DISP_ENDED[256];

    // Shared buffers address definition, from seg_heap_base and an
    // increment depending on the cluster index.
    // Each task builds its own identical copy of these arrays of pointers.
    for( c=0 ; c<nclusters ; c++)
    {
        unsigned int cluster_base = base + increment*c;

        A[c]          = (unsigned char*)(cluster_base);
        B[c]          = (unsigned char*)(cluster_base +   NL*NP);
        LOAD_START[c] = (unsigned int*) (cluster_base + 2*NL*NP);
        LOAD_ENDED[c] = (unsigned int*) (cluster_base + 3*NL*NP);
        TRSP_START[c] = (unsigned int*) (cluster_base + 4*NL*NP);
        TRSP_ENDED[c] = (unsigned int*) (cluster_base + 5*NL*NP);
        DISP_START[c] = (unsigned int*) (cluster_base + 6*NL*NP);
        DISP_ENDED[c] = (unsigned int*) (cluster_base + 7*NL*NP);
    }

    PRINTF("NB_CLUSTERS = %d\n", nclusters);
    PRINTF("NB_PROCS    = %d\n\n", nprocs);

    PRINTF("*** starting barrier init at cycle %d ***\n", proctime());

    // Barriers initialization.
    // NOTE(review): every task initialises the three barriers itself; whether
    // a late barrier_init can disturb a task already waiting depends on the
    // barrier implementation -- to be confirmed.
    barrier_init(0, ntasks);
    barrier_init(1, ntasks);
    barrier_init(2, ntasks);

    PRINTF("*** completing barrier init at cycle %d ***\n", proctime());

    // Main loop (on images)
    while(image < NB_IMAGES)
    {
        // Pseudo parallel load from disk to A[c] buffer : nblocks/nclusters blocks.
        // Only the task with (local_id == 0) in each cluster does it, and it
        // records the load time stamps on behalf of all tasks of its cluster.
        if ( local_id == 0 )
        {
            date = proctime();
            PRINTF("\n*** Starting load for image %d at cycle %d\n", image, date);
            for ( p=0 ; p<nprocs ; p++ ) LOAD_START[cluster_id][p] = date;

            if( ioc_read(image*nblocks + nblocks*cluster_id/nclusters , A[cluster_id], nblocks/nclusters) )
            {
                tty_printf("echec ioc_read\n");
                exit();
            }
            if ( ioc_completed() )
            {
                tty_printf("echec ioc_completed\n");
                exit();
            }

            date = proctime();
            PRINTF("*** Completing load for image %d at cycle %d\n", image, date);
            for ( p=0 ; p<nprocs ; p++ ) LOAD_ENDED[cluster_id][p] = date;
        }

        barrier_wait(0);

        // Parallel transpose from A to B buffers.
        // Each task makes the transposition for (NL/ntasks) lines;
        // (p,l) are the (x,y) pixel coordinates in the source image.
        date = proctime();
        PRINTF("\n*** Starting transpose for image %d at cycle %d\n", image, date);
        TRSP_START[cluster_id][local_id] = date;

        unsigned int nlt   = NL/ntasks;                             // lines per task
        unsigned int first = (cluster_id*nprocs + local_id)*nlt;    // first line of this task
        unsigned int last  = first + nlt;                           // one past the last line
        unsigned int lpc   = NL/nclusters;                          // source lines held by each A[c]
        unsigned int ppc   = NP/nclusters;                          // source columns held by each B[c]

        for ( l=first ; l<last ; l++)
        {
            PRINTF( "    - processing line %d\n", l);

            // The source line does not depend on p: locate it once.
            unsigned char* src = A[l/lpc] + (l%lpc)*NP;

            for ( p=0 ; p<NP ; p++)
            {
                // pixel (p,l) of the source becomes pixel (l,p) of the destination
                B[p/ppc][(p%ppc)*NL + l] = src[p];
            }
        }

        date = proctime();
        PRINTF("*** Completing transpose for image %d at cycle %d\n", image, date);
        TRSP_ENDED[cluster_id][local_id] = date;

        barrier_wait(1);

        // Parallel display from B[c] to frame buffer:
        // each task displays NL*NP/ntasks pixels.
        date = proctime();
        PRINTF("\n*** Starting display for image %d at cycle %d\n", image, date);
        DISP_START[cluster_id][local_id] = date;

        unsigned int npxt = NL*NP/ntasks;   // number of pixels per task

        if ( fb_write(npxt*proc_id, B[cluster_id] + npxt*local_id, npxt) )
        {
            PRINTF("echec fb_sync_write\n");
            exit();
        }
        if ( fb_completed() )
        {
            PRINTF("echec fb_completed\n");
            exit();
        }

        date = proctime();
        PRINTF("*** Completing display for image %d at cycle %d\n", image, date);
        DISP_ENDED[cluster_id][local_id] = date;

        barrier_wait(2);

        // Instrumentation: done by the task with local_id == 0 in each
        // cluster, each of them scanning the stamps of all clusters.
        if ( local_id == 0 )
        {
            date = proctime();
            PRINTF("\n*** Starting Instrumentation for image %d at cycle %d\n\n", image, date);

            unsigned int min_load_start, max_load_start;
            unsigned int min_load_ended, max_load_ended;
            unsigned int min_trsp_start, max_trsp_start;
            unsigned int min_trsp_ended, max_trsp_ended;
            unsigned int min_disp_start, max_disp_start;
            unsigned int min_disp_ended, max_disp_ended;

            stamp_min_max(LOAD_START, nclusters, nprocs, &min_load_start, &max_load_start);
            stamp_min_max(LOAD_ENDED, nclusters, nprocs, &min_load_ended, &max_load_ended);
            stamp_min_max(TRSP_START, nclusters, nprocs, &min_trsp_start, &max_trsp_start);
            stamp_min_max(TRSP_ENDED, nclusters, nprocs, &min_trsp_ended, &max_trsp_ended);
            stamp_min_max(DISP_START, nclusters, nprocs, &min_disp_start, &max_disp_start);
            stamp_min_max(DISP_ENDED, nclusters, nprocs, &min_disp_ended, &max_disp_ended);

            // "med" is written as min + delta/2 so that it cannot overflow
            // when the cycle counts are large.
            PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n",
            min_load_start, max_load_start,
            min_load_start + (max_load_start-min_load_start)/2, max_load_start-min_load_start);
            PRINTF(" - LOAD_END   : min = %d / max = %d / med = %d / delta = %d\n",
            min_load_ended, max_load_ended,
            min_load_ended + (max_load_ended-min_load_ended)/2, max_load_ended-min_load_ended);

            PRINTF(" - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
            min_trsp_start, max_trsp_start,
            min_trsp_start + (max_trsp_start-min_trsp_start)/2, max_trsp_start-min_trsp_start);
            PRINTF(" - TRSP_END   : min = %d / max = %d / med = %d / delta = %d\n",
            min_trsp_ended, max_trsp_ended,
            min_trsp_ended + (max_trsp_ended-min_trsp_ended)/2, max_trsp_ended-min_trsp_ended);

            PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
            min_disp_start, max_disp_start,
            min_disp_start + (max_disp_start-min_disp_start)/2, max_disp_start-min_disp_start);
            PRINTF(" - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
            min_disp_ended, max_disp_ended,
            min_disp_ended + (max_disp_ended-min_disp_ended)/2, max_disp_ended-min_disp_ended);

            PRINTF(" - BARRIER TRSP/DISP = %d\n", min_disp_start - max_trsp_ended);
        }

        // next image
        image++;

    } // end while image

    // All images processed: spin forever (main never returns).
    while(1);
} // end main()

