
#include "stdio.h"
#include "limits.h"
#include "../giet_tsar/block_device.h"

// Frame geometry: NL lines of NP pixels each; main() handles one byte per pixel.
#define NL              512
#define NP              512
// Number of frames processed by the main loop before the program stops.
#define NB_IMAGES       1
// Worst-case number of clusters; sizes the per-cluster pointer tables in main().
#define NB_CLUSTER_MAX  256

// Trace helper: only the processor whose global id is 0 prints.
// Relies on a variable named `proc_id` being in scope at the call site,
// and on the GCC statement-expression extension `({ ... })`.
#define PRINTF(...)      ({ if (proc_id == 0) { tty_printf(__VA_ARGS__); } })

// When DISPLAY_ONLY is defined, main() skips the transpose phase and
// displays the loaded buffer directly (see the #ifndef sections in main()).
//#define DISPLAY_ONLY

///////////////////////////////////////////
// tricks to read parameters from ldscript
//
// The symbols below are never read as objects: main() only takes their
// ADDRESS and casts it to an unsigned int (or to a pointer). Per the note
// above, the values are expected to be provided by the linker script.
// The struct types are intentionally left incomplete since no object of
// these types is ever accessed.
///////////////////////////////////////////

struct plaf;

// NOTE(review): `struct plouf` is a different (equally undefined) tag than
// `struct plaf`; since only the address is used this looks like a harmless
// typo -- confirm and align the tag.
extern struct plouf seg_ioc_base;   // address = base of the block device registers
extern struct plaf seg_heap_base;   // address = base of the shared buffers
extern struct plaf NB_PROCS;        // address = number of processors per cluster
extern struct plaf NB_CLUSTERS;     // address = number of clusters

/////////////
/*
 * Image transpose benchmark: entry point executed by every processor.
 *
 * For each of the NB_IMAGES frames:
 *   1. load      : in each cluster, the processor with local_id == 0 reads
 *                  its share of the frame from the block device into A[cluster_id];
 *   2. transpose : every processor transposes NL / nglobal_procs lines from
 *                  the A buffers into the B buffers (skipped if DISPLAY_ONLY);
 *   3. display   : every processor sends npixels / nglobal_procs pixels to
 *                  the frame buffer;
 *   4. report    : processor 0 prints min / max / median / delta of the dates
 *                  recorded by all processors for each phase.
 * The phases are separated by barriers 0, 1 and 2.
 *
 * The function never returns: it exits on a configuration or I/O error,
 * or spins forever once all frames have been processed.
 */
void main(){
   unsigned int frame = 0;
   unsigned int date  = 0;

   unsigned int c; // cluster index for loops
   unsigned int l; // line index for loops
   unsigned int p; // pixel (or local processor) index for loops

   unsigned int proc_id       = procid();                      // processor id
   unsigned int nlocal_procs  = (unsigned int) &NB_PROCS;      // number of processors per cluster
   unsigned int nclusters     = (unsigned int) &NB_CLUSTERS;   // number of clusters
   unsigned int local_id      = proc_id % nlocal_procs;        // local processor id
   unsigned int cluster_id    = proc_id / nlocal_procs;        // cluster id
   unsigned int base          = (unsigned int) &seg_heap_base; // base address for shared buffers
   unsigned int increment     = 0x80000000 / nclusters * 2;    // cluster increment
   unsigned int nglobal_procs = nclusters * nlocal_procs;      // number of tasks
   unsigned int npixels       = NP * NL;                       // number of pixel per frame

   unsigned int * ioc_address = (unsigned int *) &seg_ioc_base;
   unsigned int block_size    = ioc_address[BLOCK_DEVICE_BLOCK_SIZE];
   unsigned int nblocks       = npixels / block_size;   // number of blocks per frame

   PRINTF("\n *** Entering main at cycle %d ***\n\n", proctime());

   //  parameters checking
   if ((nlocal_procs != 1) && (nlocal_procs != 2) && (nlocal_procs != 4)){
      PRINTF("NB_PROCS must be 1, 2 or 4\n");
      exit(1);
   }
   if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) && (nclusters != 8) &&
         (nclusters != 16) && (nclusters != 32) && (nclusters != 64) && (nclusters != 128) &&
         (nclusters != 256)){
      PRINTF("NB_CLUSTERS must be a power of 2 between 1 and 256\n");
      exit(1);
   }
   // NOTE(review): when nglobal_procs exceeds NL, NL / nglobal_procs is 0 and the
   // transpose loop below processes no line at all -- confirm whether 1024 is
   // really a supported configuration.
   if (nglobal_procs > 1024){
      PRINTF("NB_PROCS * NB_CLUSTERS cannot be larger than 1024\n");
      exit(1);
   }
   if (proc_id >= nglobal_procs){
      PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id);
      exit(1);
   }

   // Arrays of pointers on the shared, distributed buffers containing the frames
   // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters)
   unsigned char * A[NB_CLUSTER_MAX];
   unsigned char * B[NB_CLUSTER_MAX];

   // Arrays of pointers on the instrumentation arrays
   // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters)
   // each pointer points on the base address of an array of nlocal_procs unsigned int
   unsigned int * LOAD_START[NB_CLUSTER_MAX];
   unsigned int * LOAD_END[NB_CLUSTER_MAX];
   unsigned int * TRSP_START[NB_CLUSTER_MAX];
   unsigned int * TRSP_END[NB_CLUSTER_MAX];
   unsigned int * DISP_START[NB_CLUSTER_MAX];
   unsigned int * DISP_END[NB_CLUSTER_MAX];

   // shared buffers address definition
   // from the seg_heap_base and increment depending on the cluster index
   // These arrays of pointers are identical and replicated in the stack of each task
   //
   // Each instrumentation array holds nlocal_procs unsigned int, so consecutive
   // arrays must be spaced by that many WORDS (not bytes): otherwise they overlap
   // and, for 1 or 2 processors per cluster, are not even word aligned.
   unsigned int instr_bytes = nlocal_procs * (unsigned int) sizeof(unsigned int);

   for (c = 0; c < nclusters; c++){
      unsigned int cluster_base = base + increment * c;      // start of the buffers of cluster c
      unsigned int instr_base   = cluster_base + 2 * npixels; // right after the A and B buffers

      A[c]          = (unsigned char *) (cluster_base);
      B[c]          = (unsigned char *) (cluster_base + npixels);
      LOAD_START[c] = (unsigned int *)  (instr_base);
      LOAD_END[c]   = (unsigned int *)  (instr_base +     instr_bytes);
      TRSP_START[c] = (unsigned int *)  (instr_base + 2 * instr_bytes);
      TRSP_END[c]   = (unsigned int *)  (instr_base + 3 * instr_bytes);
      DISP_START[c] = (unsigned int *)  (instr_base + 4 * instr_bytes);
      DISP_END[c]   = (unsigned int *)  (instr_base + 5 * instr_bytes);
   }

   PRINTF("NB_CLUSTERS     = %d\n", nclusters); 
   PRINTF("NB_LOCAL_PROCS  = %d\n", nlocal_procs); 
   PRINTF("NB_GLOBAL_PROCS = %d\n", nglobal_procs);
   PRINTF("NB_PIXELS       = %d\n", npixels);
   PRINTF("BLOCK_SIZE      = %d\n", block_size);
   PRINTF("NB_BLOCKS       = %d\n\n", nblocks);


   PRINTF("*** Starting barrier init at cycle %d ***\n", proctime());

   //  barriers initialization
   barrier_init(0, nglobal_procs);
   barrier_init(1, nglobal_procs);
   barrier_init(2, nglobal_procs);

   PRINTF("*** Completing barrier init at cycle %d ***\n", proctime());

   // Main loop (on frames)
   while (frame < NB_IMAGES){
      // pseudo parallel load from disk to A[c] buffer : nblocks/nclusters blocks
      // only task running on processor with (local_id == 0) does it

      if (local_id == 0){
         date = proctime();
         PRINTF("\n*** Starting load for frame %d at cycle %d\n", frame, date);

         // the loading processor records the date on behalf of all processors of its cluster
         for (p = 0; p < nlocal_procs; p++){
            LOAD_START[cluster_id][p] = date;
         }
         if (ioc_read(frame * nblocks + nblocks * cluster_id / nclusters, A[cluster_id], nblocks / nclusters)){
            PRINTF("echec ioc_read\n");
            exit();
         }
         if (ioc_completed()){
            PRINTF("echec ioc_completed\n");
            exit();
         }

         date = proctime();
         PRINTF("*** Completing load for frame %d at cycle %d\n", frame, date);
         for (p = 0; p < nlocal_procs; p++){
            LOAD_END[cluster_id][p] = date;
         }
      }

      barrier_wait(0);

      // parallel transpose from A to B buffers
      // each processor makes the transposition for (NL/nglobal_procs) lines
      // (p,l) are the (x,y) pixel coordinates in the source frame

#ifndef DISPLAY_ONLY
      date = proctime();
      PRINTF("\n*** Starting transpose for frame %d at cycle %d\n", frame, date);
      TRSP_START[cluster_id][local_id] = date;

      unsigned int nlt   = NL / nglobal_procs; // number of lines handled by each processor
      unsigned int first = proc_id * nlt;      // index of the first line handled by the current processor
      unsigned int last  = first + nlt;        // index one past the last line handled
      unsigned int nlines_clusters = NL / nclusters; // number of source lines stored in each cluster
      unsigned int npix_clusters   = NP / nclusters; // number of pixels of a source line going to each destination cluster

      for (l = first; l < last; l++){
         PRINTF("    - processing line %d\n", l);
         for (p = 0; p < NP; p++){
            // source pixel (p,l) lives in the local A buffer; it lands at (l,p)
            // in the transposed frame, which is stored in cluster p / npix_clusters
            unsigned int source_index   = (l % nlines_clusters) * NP + p;
            unsigned int dest_cluster   = p / npix_clusters;
            unsigned int dest_index     = (p % npix_clusters) * NL + l;
            B[dest_cluster][dest_index] = A[cluster_id][source_index];
         }
      }

      date = proctime();
      PRINTF("*** Completing transpose for frame %d at cycle %d\n", frame, date);
      TRSP_END[cluster_id][local_id] = date;
      barrier_wait(1);
#endif

      // parallel display from B[c] to frame buffer 
      // each processor uses its private dma to display NL*NP/nglobal_procs pixels

      date = proctime();
      PRINTF("\n*** Starting display for frame %d at cycle %d\n", frame, date);
      DISP_START[cluster_id][local_id] = date;

      unsigned int npxt = npixels / nglobal_procs;   // number of pixels per proc

#ifndef DISPLAY_ONLY
      if (fb_write(npxt * proc_id, B[cluster_id] + npxt * local_id, npxt)){
         PRINTF("[%d]: echec fb_sync_write\n", proc_id);
         exit();
      }
#else
      if (fb_write(npxt * proc_id, A[cluster_id] + npxt * local_id, npxt)){
         PRINTF("[%d]: echec fb_sync_write\n", proc_id);
         exit();
      }
#endif

      if (fb_completed()){
         PRINTF("[%d]: echec fb_completed\n", proc_id);
         exit();
      }

      date = proctime();
      PRINTF("*** Completing display for frame %d at cycle %d\n", frame, date);
      DISP_END[cluster_id][local_id] = date;

      barrier_wait(2);

      // Instrumentation (done by processor 0 in cluster 0)
      // Only processor 0 can print (see PRINTF), so it is the only one that
      // needs to scan the instrumentation arrays of all clusters.
      if (proc_id == 0){
         date = proctime();
         PRINTF("\n*** Starting Instrumentation for frame %d at cycle %d\n\n", frame, date);

         unsigned int cc, pp;
         // dates are unsigned: seed the minima with the largest unsigned value
         const unsigned int date_max = ~0U;
         unsigned int min_load_start = date_max;
         unsigned int max_load_start = 0;
         unsigned int min_load_ended = date_max;
         unsigned int max_load_ended = 0;
         unsigned int min_trsp_start = date_max;
         unsigned int max_trsp_start = 0;
         unsigned int min_trsp_ended = date_max;
         unsigned int max_trsp_ended = 0;
         unsigned int min_disp_start = date_max;
         unsigned int max_disp_start = 0;
         unsigned int min_disp_ended = date_max;
         unsigned int max_disp_ended = 0;

         for (cc = 0; cc < nclusters; cc++){
            for (pp = 0; pp < nlocal_procs; pp++){
               unsigned int load_start = LOAD_START[cc][pp];
               unsigned int load_ended = LOAD_END[cc][pp];
               unsigned int trsp_start = TRSP_START[cc][pp];
               unsigned int trsp_ended = TRSP_END[cc][pp];
               unsigned int disp_start = DISP_START[cc][pp];
               unsigned int disp_ended = DISP_END[cc][pp];

               if (load_start < min_load_start){
                  min_load_start = load_start;
               }
               if (load_start > max_load_start){
                  max_load_start = load_start;
               }
               if (load_ended < min_load_ended){
                  min_load_ended = load_ended;
               }
               if (load_ended > max_load_ended){
                  max_load_ended = load_ended;
               }

               if (trsp_start < min_trsp_start){
                  min_trsp_start = trsp_start;
               }
               if (trsp_start > max_trsp_start){
                  max_trsp_start = trsp_start;
               }
               if (trsp_ended < min_trsp_ended){
                  min_trsp_ended = trsp_ended;
               }
               if (trsp_ended > max_trsp_ended){
                  max_trsp_ended = trsp_ended;
               }

               if (disp_start < min_disp_start){
                  min_disp_start = disp_start;
               }
               if (disp_start > max_disp_start){
                  max_disp_start = disp_start;
               }
               if (disp_ended < min_disp_ended){
                  min_disp_ended = disp_ended;
               }
               if (disp_ended > max_disp_ended){
                  max_disp_ended = disp_ended;
               }
            }
         }

         // "med" is the midpoint of [min, max], written as min + (max - min) / 2
         // so that the intermediate sum cannot wrap around.
         PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n",
               min_load_start, max_load_start,
               min_load_start + (max_load_start - min_load_start) / 2, max_load_start - min_load_start);
         PRINTF(" - LOAD_END   : min = %d / max = %d / med = %d / delta = %d\n",
               min_load_ended, max_load_ended,
               min_load_ended + (max_load_ended - min_load_ended) / 2, max_load_ended - min_load_ended);

         PRINTF(" - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
               min_trsp_start, max_trsp_start,
               min_trsp_start + (max_trsp_start - min_trsp_start) / 2, max_trsp_start - min_trsp_start);
         PRINTF(" - TRSP_END   : min = %d / max = %d / med = %d / delta = %d\n",
               min_trsp_ended, max_trsp_ended,
               min_trsp_ended + (max_trsp_ended - min_trsp_ended) / 2, max_trsp_ended - min_trsp_ended);

         PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
               min_disp_start, max_disp_start,
               min_disp_start + (max_disp_start - min_disp_start) / 2, max_disp_start - min_disp_start);
         PRINTF(" - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
               min_disp_ended, max_disp_ended,
               min_disp_ended + (max_disp_ended - min_disp_ended) / 2, max_disp_ended - min_disp_ended);

         PRINTF(" - BARRIER TRSP/DISP = %d\n", min_disp_start - max_trsp_ended);
      }
      frame++;

   } // end while frame      

   PRINTF("*** End of main ***\n");

   // nothing to return to: park the processor
   while(1);
} // end main()

// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3



