#include "stdio.h"

// Number of lines per image (bound of the line loops in main).
#define NL		128
// Number of pixels per line (bound of the pixel loop in main).
#define NP		128
// Number of images processed by the main loop before spinning forever.
#define NB_IMAGES	2
// Divisor used to convert an image size (NP*NL) into a number of blocks
// for ioc_read(); presumably the block-device block size in bytes
// (one pixel is one unsigned char) -- TODO confirm against the IOC driver.
#define BLOCK_SIZE	128

// Prints only on the processor whose local index is 0.
// NOTE: this expands to an unbraced `if`, and it requires a variable named
// `local_id` to be in scope at the call site. Do not use it as the body of
// an unbraced if/else: a following `else` would bind to the hidden `if`.
#define PRINTF		if(local_id == 0) tty_printf

///////////////////////////////////////////
// tricks to read parameters from ldscript
///////////////////////////////////////////

// Incomplete (never defined) type: it only exists so that the symbols
// below can be declared. Their contents are never accessed.
struct plaf;

// Only the ADDRESS of each symbol is meaningful: main() casts
// &seg_heap_base, &NB_PROCS and &NB_CLUSTERS to unsigned int and uses the
// result as a plain integer value (heap base address, number of processors
// per cluster, number of clusters). The symbols are expected to be defined
// by the linker script.
extern struct plaf seg_heap_base;
extern struct plaf NB_PROCS;
extern struct plaf NB_CLUSTERS;

/////////////
// main() : entry point executed by every processor of the platform.
//
// For each of the NB_IMAGES images, three phases are executed, each one
// closed by a barrier shared by all (nclusters * nprocs) tasks:
//   1) load    : in each cluster, the processor with local_id == 0 reads
//                nblocks/nclusters blocks into the local A buffer,
//   2) copy    : each task copies NL/ntasks lines from A to B in its own
//                cluster (the real transpose is currently disabled, see the
//                commented-out code in the inner loop),
//   3) display : each task writes NL*NP/ntasks pixels from its local B
//                buffer to the frame buffer.
// Timestamps (date/delta) are taken with proctime() around each phase and
// printed by local processor 0 of each cluster.
//
// The function never returns: it spins forever once all images are done.
// It relies on exit() not returning to the caller.
void main()
{
    unsigned int image = 0;   // index of the image being processed
    unsigned int date  = 0;   // cycle count at the last timestamp
    unsigned int delta = 0;   // cycles elapsed since the previous timestamp

    unsigned int c;           // cluster index for loops
    unsigned int l;           // line index for loops
    unsigned int p;           // pixel index for loops

    unsigned int proc_id   = procid();                     // global processor id
    unsigned int nprocs    = (unsigned int)&NB_PROCS;      // number of processors per cluster
    unsigned int nclusters = (unsigned int)&NB_CLUSTERS;   // number of clusters

    // local_id and cluster_id are needed before the parameters are checked
    // (PRINTF tests local_id), so the division is guarded here: a zero
    // NB_PROCS must be reported by the check below, not fault on a divide.
    unsigned int local_id   = (nprocs != 0) ? (proc_id % nprocs) : 0;   // processor index inside its cluster
    unsigned int cluster_id = (nprocs != 0) ? (proc_id / nprocs) : 0;   // index of the cluster owning this processor

    unsigned int base    = (unsigned int)&seg_heap_base;   // base address for shared buffers
    unsigned int ntasks  = nclusters * nprocs;             // number of tasks
    unsigned int nblocks = (NP*NL) / BLOCK_SIZE;           // number of blocks per image

    PRINTF("\n *** Entering main at cycle %d ***\n\n", proctime());

    //  parameters checking
    if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) )
    {
        PRINTF("NB_PROCS must be 1, 2 or 4\n");
        exit();
    }
    if( (nclusters !=  1) && (nclusters !=  2) && (nclusters !=  4) && (nclusters !=  8) &&
        (nclusters != 16) && (nclusters != 32) && (nclusters != 64) && (nclusters != 128) )
    {
        PRINTF("NB_CLUSTERS must be a power of 2 between 1 and 128\n");
        exit();
    }
    if( ntasks > 128 )
    {
        PRINTF("NB_PROCS * NB_CLUSTERS cannot be larger than 128\n");
        exit();
    }
    if( proc_id >= ntasks )
    {
        // Such a processor would get cluster_id >= nclusters, i.e. an
        // A[]/B[] entry that is never initialized, and would be one task
        // too many on barriers sized for ntasks: it must stop here.
        PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id);
        exit();
    }

    // Address distance between the buffers of two consecutive clusters.
    // Computed only now that nclusters is known to be non-zero.
    // Evaluates to 2^32 / nclusters without overflowing 32 bits (for
    // nclusters == 1 the product wraps to 0, which is harmless since only
    // c == 0 is used in that case).
    unsigned int increment = (0x80000000 / nclusters) * 2;

    // Arrays of pointers on the shared, distributed buffers
    // containing the images (sized for the worst case : 128 clusters)
    unsigned char* A[128];
    unsigned char* B[128];

    // Shared buffers address definition, from the heap base address and
    // the cluster increment. In each cluster, the B buffer is placed
    // NL*NP bytes after the A buffer.
    // These arrays of pointers are identical and replicated in the stack
    // of each task.
    for( c = 0 ; c < nclusters ; c++ )
    {
        A[c] = (unsigned char*)(base + increment*c);
        B[c] = (unsigned char*)(base + NL*NP + increment*c);
    }

    PRINTF("NB_CLUSTERS = %d\n", nclusters);
    PRINTF("NB_PROCS    = %d\n\n", nprocs);

    PRINTF("*** starting barrier init at cycle %d ***\n", proctime());

    //  barriers initialization : one barrier per phase, each expecting ntasks arrivals
    barrier_init(0, ntasks);
    barrier_init(1, ntasks);
    barrier_init(2, ntasks);

    PRINTF("*** completing barrier init at cycle %d ***\n", proctime());

    // Main loop (on images)
    while( image < NB_IMAGES )
    {
        ///////////////////////////////////////////////////////////////////
        // Phase 1 : load from disk to the A[cluster_id] buffer.
        // Only the task with (local_id == 0) in each cluster does it,
        // for nblocks/nclusters blocks.
        ///////////////////////////////////////////////////////////////////

        delta = proctime() - date;
        date  = date + delta;

        if( local_id == 0 )
        {
            PRINTF("\n*** Starting load for image %d *** at cycle %d (%d)\n", image, date, delta);

            // first block = start of this image + share of this cluster;
            // a non-zero return value is handled as a failure
            if( ioc_read(image*nblocks + nblocks*cluster_id/nclusters, A[cluster_id], nblocks/nclusters) )
            {
                tty_printf("echec ioc_read\n");
                exit();
            }
            if( ioc_completed() )
            {
                tty_printf("echec ioc_completed\n");
                exit();
            }

            delta = proctime() - date;
            date  = date + delta;
            PRINTF("*** Completing load for image %d *** at cycle %d (%d)\n", image, date, delta);
        }

        barrier_wait(0);

        ///////////////////////////////////////////////////////////////////
        // Phase 2 : A to B processing, nlt = NL/ntasks lines per task.
        // NOTE: despite the "transpose" wording in the messages, the code
        // currently performs a plain copy inside the local cluster; the
        // real distributed transpose is kept below as commented-out code.
        // (p,l) are the (x,y) pixel coordinates in the source image.
        ///////////////////////////////////////////////////////////////////

        delta = proctime() - date;
        date  = date + delta;

        PRINTF("\n*** starting transpose for image %d at cycle %d (%d)\n", image, date, delta);

        unsigned int nlt = NL/ntasks;   // number of lines per task

        // l is a line index local to the cluster buffer
        for( l = nlt*local_id ; l < nlt*(local_id+1) ; l++ )
        {
            // the printed value is the line index in the whole image
            PRINTF( "    - processing line %d at cycle %d\n", l + NL*cluster_id/nclusters, proctime() );

            for( p = 0 ; p < NP ; p++ )
            {
//                unsigned int source_cluster = l/(NL/nclusters);
//                unsigned int source_index   = (l%(NL/nclusters))*NP + p;
//                unsigned int dest_cluster   = p / (NP/nclusters);
//                unsigned int dest_index     = (p%(NP/nclusters))*NL + l;
//                B[dest_cluster][dest_index] = A[source_cluster][source_index];

                B[cluster_id][l*NP+p] = A[cluster_id][l*NP+p];
            }
        }

        delta = proctime() - date;
        date  = date + delta;
        PRINTF("*** Completing transpose for image %d *** at cycle %d (%d)\n", image, date, delta);

        barrier_wait(1);

        ///////////////////////////////////////////////////////////////////
        // Phase 3 : display from the B[cluster_id] buffer to the frame
        // buffer. Each task displays npxt = NL*NP/ntasks pixels.
        ///////////////////////////////////////////////////////////////////

        delta = proctime() - date;
        date  = date + delta;

        PRINTF("\n *** starting display for image %d at cycle %d (%d)\n", image, date, delta);

        unsigned int npxt = NL*NP/ntasks;   // number of pixels per task

        // frame buffer offset is global (proc_id), source offset is local (local_id)
        if( fb_write(npxt*proc_id, B[cluster_id] + npxt*local_id, npxt) )
        {
            PRINTF("echec fb_sync_write\n");
            exit();
        }
        if( fb_completed() )
        {
            PRINTF("echec fb_completed\n");
            exit();
        }

        delta = proctime() - date;
        date  = date + delta;
        PRINTF(" *** completing display for image %d at cycle %d (%d)\n", image, date, delta);

        barrier_wait(2);

        // next image
        image++;
    } // end while image

    // all images processed : nothing to return to, spin forever
    while(1);
} // end main()

