#include "stdio.h"

////////////////////////////////////
// Image parameters
//
// NL and NP are the number of lines and the number of pixels per line of
// the image. PIXEL_SIZE and BLOCK_SIZE are only used by main() to compute
// the number of disk blocks per image: (NP*NL*PIXEL_SIZE)/BLOCK_SIZE.

#define PIXEL_SIZE	2
#define NL		1024
#define NP		1024
#define BLOCK_SIZE	1024

// Prints only when the local variable `lid` is 0.
// The macro expands to a bare `if`, so it requires a variable named `lid`
// in scope and must never be directly followed by an `else`.
#define PRINTF		if(lid==0) tty_printf

// Element accessors for the distributed buffers. They require arrays of
// pointers named A, B, C, D and Z in scope; `c` selects the per-cluster
// buffer. A, C, D and Z are addressed as (line, pixel) with NP elements
// per line. B is addressed as (pixel, line) with NL elements per pixel,
// i.e. it holds a transposed layout.
#define TA(c,l,p)  (A[c][((NP)*(l))+(p)])
#define TB(c,p,l)  (B[c][((NL)*(p))+(l)])
#define TC(c,l,p)  (C[c][((NP)*(l))+(p)])
#define TD(c,l,p)  (D[c][((NP)*(l))+(p)])
#define TZ(c,l,p)  (Z[c][((NP)*(l))+(p)])

// Classic function-like macros: each argument may be evaluated twice,
// so do not pass expressions with side effects.
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

///////////////////////////////////////////
// tricks to read parameters from ldscript
///////////////////////////////////////////
//
// The symbols below are declared with an incomplete type that is never
// defined in this file: their contents are never read. main() only takes
// their ADDRESSES and casts them to integers, so the value of each
// parameter is the address assigned to the symbol.

struct plaf;

// base address used by main() to locate the shared buffers
extern struct plaf seg_heap_base;
// number of processors per cluster
extern struct plaf NB_PROCS;
// number of clusters
extern struct plaf NB_CLUSTERS;

/////////////
// Entry point, executed by every processor.
//
// Each processor derives its cluster index (cid) and local index (lid)
// from procid() and runs the following pipeline, the stages being
// separated by barriers involving all ntasks = NB_CLUSTERS*NB_PROCS tasks:
//   1. load       : in each cluster, the task with lid==0 reads its share
//                   of the image from disk into A[cid]
//   2. horizontal : B <= transpose(FH(A))  and  D <= A - FH(A)
//   3. vertical   : C <= transpose(FV(B))
//   4. display    : Z <= (D + C) >> 8, written line by line with fb_write()
//
// The function never returns: it spins in while(1) after the display
// stage, and also on any parameter or I/O error.
void main()
{

//////////////////////////////////
// convolution kernel parameters
// The content of this section is
// Philips proprietary information.
///////////////////////////////////

    // vertical filter : 35 taps, vnorm is the sum of the taps
    int vnorm  = 115;
    int vf[35];
    vf[0]  = 1;
    vf[1]  = 1;
    vf[2]  = 2;
    vf[3]  = 2;
    vf[4]  = 2;
    vf[5]  = 2;
    vf[6]  = 3;
    vf[7]  = 3;
    vf[8]  = 3;
    vf[9]  = 4;
    vf[10] = 4;
    vf[11] = 4;
    vf[12] = 4;
    vf[13] = 5;
    vf[14] = 5;
    vf[15] = 5;
    vf[16] = 5;
    vf[17] = 5;
    vf[18] = 5;
    vf[19] = 5;
    vf[20] = 5;
    vf[21] = 5;
    vf[22] = 4;
    vf[23] = 4;
    vf[24] = 4;
    vf[25] = 4;
    vf[26] = 3;
    vf[27] = 3;
    vf[28] = 3;
    vf[29] = 2;
    vf[30] = 2;
    vf[31] = 2;
    vf[32] = 2;
    vf[33] = 1;
    vf[34] = 1;

    // horizontal filter : box filter over [p-hrange, p+hrange],
    // hnorm = 2*hrange+1 is the number of pixels in the window
    int hrange = 100;
    int hnorm  = 201;

    // date  : cycle of the last time stamp taken by this task
    // delta : cycles elapsed since the previous time stamp
    unsigned int date      = 0;
    unsigned int delta     = 0;

    int c;                                                      // cluster index for loops
    int l;                                                      // line index for loops
    int p;                                                      // pixel index for loops
    int x;                                                      // filter index for loops

    int pid                 = procid();                         // processor id
    int nprocs              = (int)&NB_PROCS;                   // number of processors per cluster
    int nclusters           = (int)&NB_CLUSTERS;                // number of clusters
    int lid                 = pid%nprocs;                       // local task id
    int cid                 = pid/nprocs;                       // cluster task id

    // Addresses are handled as unsigned values: (base + increment*c) goes
    // beyond INT_MAX for the upper clusters, and signed overflow is
    // undefined behaviour while unsigned arithmetic wraps as intended.
    unsigned int base       = (unsigned int)&seg_heap_base;     // base address for shared buffers
    unsigned int increment  = (0x80000000 / nclusters) * 2;     // cluster increment
    int ntasks              = nclusters * nprocs;               // number of tasks
    int nblocks             = (NP*NL*PIXEL_SIZE)/BLOCK_SIZE;    // number of blocks per image

    int lines_per_task      = NL/ntasks;                        // number of lines per task
    int lines_per_cluster   = NL/nclusters;                     // number of lines per cluster
    int pixels_per_task     = NP/ntasks;                        // number of columns per task
    int pixels_per_cluster  = NP/nclusters;                     // number of columns per cluster

    int first, last;

    PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", pid, proctime());

    //////////////////////////
    //  parameters checking
    //  (a task that detects an error spins forever)
    if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) )
    {
        PRINTF("NB_PROCS must be 1, 2 or 4\n");
        while(1);
    }
    if( (nclusters !=  4) && (nclusters !=  8) && (nclusters != 16) &&
        (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256) )
    {
        PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n");
        while(1);
    }
    if( pid >= ntasks )
    {
        PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", pid);
        while(1);
    }
    if ( NL % nclusters != 0 )
    {
        PRINTF("NB_CLUSTERS must be a divider of NL");
        while(1);
    }
    if( NP % nclusters != 0 )
    {
        PRINTF("NB_CLUSTERS must be a divider of NP");
        while(1);
    }

    //////////////////////////////////////////////////////////////////
    // Arrays of pointers on the shared, distributed buffers
    // containing the images (sized for the worst case : 256 clusters)
    unsigned short*     A[256];
    int*                B[256];
    int*                C[256];
    int*                D[256];
    unsigned char*      Z[256];

    // The shared, distributed buffers addresses are computed
    // from the seg_heap_base value defined in the ldscript file
    // and from the cluster increment = 4Gbytes/nclusters.
    // These arrays of pointers are identical and
    // replicated in the stack of each task.
    // Byte offsets inside one cluster, in units of NP*NL/nclusters :
    // A at 0, Z at 2, B at 4, C at 8, D at 12.
    for( c=0 ; c<nclusters ; c++)
    {
        A[c] = (unsigned short*)(base                           + increment*c);
        Z[c] = (unsigned char*) (base + 2*NP*NL/nclusters       + increment*c);
        B[c] = (int*)           (base + 4*NP*NL/nclusters       + increment*c);
        C[c] = (int*)           (base + 8*NP*NL/nclusters       + increment*c);
        D[c] = (int*)           (base + 12*NP*NL/nclusters      + increment*c);
    }

    PRINTF("NCLUSTERS = %d\n", nclusters);
    PRINTF("NPROCS    = %d\n\n", nprocs);

    PRINTF("*** Starting barrier init at cycle %d ***\n", proctime());

    //  barriers initialization
    barrier_init(0, ntasks);
    barrier_init(1, ntasks);
    barrier_init(2, ntasks);

    PRINTF("*** Completing barrier init at cycle %d ***\n", proctime());

    ////////////////////////////////////////////////////////
    // pseudo parallel load from disk to A[c] buffers
    // only task running on processor with (lid==0) does it
    // nblocks/nclusters are loaded in each cluster

    if ( lid == 0 )
    {
        delta = proctime() - date;
        date  = date + delta;
        PRINTF("\n*** Starting load at cycle %d (%d)\n", date, delta);

        // a non-zero return value is treated as a failure
        if( ioc_read(nblocks*cid/nclusters,
                     A[cid] ,
                     nblocks/nclusters) )
        {
            PRINTF("echec ioc_read\n");
            while(1);
        }
        if ( ioc_completed() )
        {
            PRINTF("echec ioc_completed\n");
            while(1);
        }

        delta = proctime() - date;
        date  = date + delta;
        PRINTF("*** Completing load at cycle %d (%d)\n", date, delta);
    }

    barrier_wait(0);

    //////////////////////////////////////////////////////////
    // parallel horizontal filter :
    // B <= transpose(FH(A))
    // D <= A - FH(A)
    // Each task computes (NL/ntasks) lines
    // The image must be extended :
    // if (z<0)         TA(cid,l,z) == TA(cid,l,0)
    // if (z>NP-1)      TA(cid,l,z) == TA(cid,l,NP-1)

    delta = proctime() - date;
    date  = date + delta;
    PRINTF("\n*** Starting horizontal filter at cycle %d (%d)\n", date, delta);

    // l = absolute line index / p = absolute pixel index
    // first & last define which lines are handled by a given task(cid,lid)

    first = (cid*nprocs + lid)*lines_per_task;
    last  = first + lines_per_task;

    for ( l=first ; l<last ; l++)
    {
        // src_c and src_l are the cluster index and the line index for A & D
        int src_c = l/lines_per_cluster;
        int src_l = l%lines_per_cluster;

        // We use the specific values of the horizontal ep-filter for optimisation:
        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
        // To minimize the number of tests, the loop on pixels is split in three domains

        // Priming value : one extra copy of TA(..,0) is included because the
        // first iteration below (p==0) subtracts TA(..,0) and adds TA(..,hrange),
        // which yields the window of pixel 0 : (hrange+1) copies of TA(..,0)
        // plus TA(..,1) to TA(..,hrange).
        int sum_p = (hrange+2)*TA(src_c, src_l, 0);
        for ( x = 1 ; x < hrange ; x++) sum_p = sum_p + TA(src_c, src_l, x);

        // first domain : from 0 to hrange
        // (the pixel leaving the window is left of the image : use pixel 0)
        for ( p=0 ; p<hrange+1 ; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p/pixels_per_cluster;
            int dst_p = p%pixels_per_cluster;
            sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, 0);
            TB(dst_c, dst_p, l) = sum_p/hnorm;
            TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
        }
        // second domain : from (hrange+1) to (NP-hrange-1)
        // (the whole window is inside the image)
        for ( p = hrange+1 ; p < NP-hrange ; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p/pixels_per_cluster;
            int dst_p = p%pixels_per_cluster;
            sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, p-hrange-1);
            TB(dst_c, dst_p, l) = sum_p/hnorm;
            TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
        }
        // third domain : from (NP-hrange) to (NP-1)
        // (the pixel entering the window is right of the image : use pixel NP-1)
        for ( p = NP-hrange ; p < NP ; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p/pixels_per_cluster;
            int dst_p = p%pixels_per_cluster;
            sum_p = sum_p + (int)TA(src_c, src_l, NP-1) - (int)TA(src_c, src_l, p-hrange-1);
            TB(dst_c, dst_p, l) = sum_p/hnorm;
            TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
        }

        PRINTF(" - line %d computed at cycle %d\n", l, proctime());
    }

    delta = proctime() - date;
    date  = date + delta;
    PRINTF("*** Completing horizontal filter at cycle %d (%d)\n", date, delta);

    barrier_wait(1);

    //////////////////////////////////////////////////////////
    // parallel vertical filter :
    // C <= transpose(FV(B))
    // Each task computes (NP/ntasks) columns
    // The image must be extended :
    // if (x<0)         TB(cid,p,x) == TB(cid,p,0)
    // if (x>NL-1)      TB(cid,p,x) == TB(cid,p,NL-1)

    delta = proctime() - date;
    date  = date + delta;
    PRINTF("\n*** starting vertical filter at cycle %d (%d)\n", date, delta);

    // l = absolute line index / p = absolute pixel index
    // first & last define which pixels are handled by a given task(cid,lid)

    first = (cid*nprocs + lid)*pixels_per_task;
    last  = first + pixels_per_task;

    for ( p=first ; p<last ; p++)
    {
        // src_c and src_p are the cluster index and the pixel index for B
        int src_c = p/pixels_per_cluster;
        int src_p = p%pixels_per_cluster;

        // running weighted sum; it is (re)initialised by the first domain
        // and then updated incrementally by the second and third domains
        int sum_l;

        // We use the specific values of the vertical ep-filter
        // To minimize the number of tests, the NL lines are split in three domains

        // first domain : explicit computation for the first 18 values
        // (full 35 taps convolution, lines above the image replaced by line 0)
        for ( l=0 ; l<18 ; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l/lines_per_cluster;
            int dst_l = l%lines_per_cluster;

            for ( x=0, sum_l=0 ; x<35 ; x++ )
            {
                sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l-17+x,0) );
            }
            TC(dst_c, dst_l, p) = sum_l/vnorm;
        }
        // second domain : incremental update.
        // Consecutive taps of vf[] differ (by exactly 1) at ten positions only,
        // so sum(l) - sum(l-1) reduces to five added and five subtracted values.
        for ( l = 18 ; l < NL-17 ; l++ )
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l/lines_per_cluster;
            int dst_l = l%lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, l+4)
                          + TB(src_c, src_p, l+8)
                          + TB(src_c, src_p, l+11)
                          + TB(src_c, src_p, l+15)
                          + TB(src_c, src_p, l+17)
                          - TB(src_c, src_p, l-5)
                          - TB(src_c, src_p, l-9)
                          - TB(src_c, src_p, l-12)
                          - TB(src_c, src_p, l-16)
                          - TB(src_c, src_p, l-18);
            TC(dst_c, dst_l, p) = sum_l/vnorm;
        }
        // third domain : same incremental update, lines below the image
        // being replaced by line NL-1
        for ( l = NL-17 ; l < NL ; l++ )
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l/lines_per_cluster;
            int dst_l = l%lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, min(l+4,NL-1))
                          + TB(src_c, src_p, min(l+8,NL-1))
                          + TB(src_c, src_p, min(l+11,NL-1))
                          + TB(src_c, src_p, min(l+15,NL-1))
                          + TB(src_c, src_p, min(l+17,NL-1))
                          - TB(src_c, src_p, l-5)
                          - TB(src_c, src_p, l-9)
                          - TB(src_c, src_p, l-12)
                          - TB(src_c, src_p, l-16)
                          - TB(src_c, src_p, l-18);
            TC(dst_c, dst_l, p) = sum_l/vnorm;
        }
        PRINTF(" - column %d computed at cycle %d\n", p, proctime());
    }

    delta = proctime() - date;
    date  = date + delta;
    PRINTF("*** Completing vertical filter at cycle %d (%d)\n", date, delta);

    barrier_wait(2);

    ////////////////////////////////////////////////////////////////
    // final computation and parallel display
    // Z <= D + C
    // Each processor use its private DMA channel to display
    // the resulting image, line  per line (one byte per pixel).
    // Each processor computes & displays (NL/ntasks) lines.

    delta = proctime() - date;
    date  = date + delta;
    PRINTF("\n*** Starting display at cycle %d (%d)\n", date, delta);

    for ( l = 0 ; l < lines_per_task ; l++)
    {
        // Line index inside the buffers of cluster cid. Each task of the
        // cluster owns its own slice of lines_per_task lines, so the index
        // must include the lid offset: it then matches the frame buffer
        // line written by fb_write() below.
        int line = lid*lines_per_task + l;

        for ( p = 0 ; p < NP ; p++)
        {
           TZ(cid,line,p) = (unsigned char)(((TD(cid,line,p) + TC(cid,line,p))>>8) & 0xFF);
        }
        fb_write(NP*(cid*lines_per_cluster + line), &TZ(cid,line,0), NP);
    }

    delta = proctime() - date;
    date  = date + delta;
    PRINTF("*** Completing display at cycle %d (%d)\n", date, delta);

    while(1);

} // end main()

