//////////////////////////////////////////////////////////////////////////////////
// File    : main.c  (for gameoflife)
// Date    : November 2013 / February 2015
// Authors :  Alexandre Joannou <alexandre.joannou@lip6.fr> november 2013
//            Alain Greiner <alain.greiner@lip6.fr> february 2015
//////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application is an emulation of the Game of Life automaton.
// The world size is defined by the Frame Buffer width and height.
//
// There is at most one thread per processor in the platform.
// - If the number of processors is larger than the number of lines,
//   the number of threads is equal to the number of lines, and
//   each thread processes one single line.
// - If the number of processors is not larger than the number of lines,
//   the number of threads is equal to the number of processors, and
//   each thread processes height/nthreads (or height/nthreads + 1) lines.
//
// The thread running on processor P(0,0,0) executes the main() function,
// which initialises the barrier, the TTY terminal and the CMA controller,
// and launches the other threads, before calling the execute() function.
// The other threads just run the execute() function.
// 
// The total number of processors cannot be larger than 1024 = 16 * 16 * 4
//////////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "limits.h"
#include "user_barrier.h"
#include "mapping_info.h"
#include "hard_config.h"
#include "malloc.h"

// Compile-time trace switch: when non-zero, the "#if VERBOSE" sections of this
// file are compiled in and print per-thread / per-step messages on the TTY.
#define VERBOSE         1

// Byte-sized unsigned type used for the cells and the displayed pixels.
// It is defined here rather than taken from <stdint.h>.
typedef unsigned char uint8_t;

// Work assignment of one thread: a contiguous slice of lines of the world.
// One entry per thread is filled by main() and handed to execute().
typedef struct
{
    unsigned int    index;    // index of first line to be processed
    unsigned int    lines;    // number of lines to be processed
}   arguments_t;

// Per-thread arguments, indexed by thread index; filled by main() before the
// threads are launched, and passed by address to execute().
arguments_t   args[1024];     // at most 1024 threads

// The two generations of the automaton, indexed as world[phase][y][x].
// compute_new_gen() writes world[phase] while reading world[1 - phase].
// main() asserts that width and height do not exceed 256.
uint8_t world[2][256][256] __attribute__((aligned(64)));

// The two images handed to the frame buffer, indexed as display[phase][y][x].
// copy_world() fills them from world[phase] (cell value multiplied by 255).
// Both are registered in main() with giet_fbf_cma_init_buf().
uint8_t display[2][256][256] __attribute__((aligned(64)));

// Status words passed to giet_fbf_cma_init_buf() together with display[0] and
// display[1]. NOTE(review): presumably one CMA buffer status per display
// buffer - confirm against the GIET-VM CMA documentation.
unsigned int status0[16] __attribute__((aligned(64)));
unsigned int status1[16] __attribute__((aligned(64)));

// Barrier shared by all threads: initialised in main(), waited on in execute()
// after each generation has been computed and copied.
giet_sqt_barrier_t barrier;

// World dimensions in cells, set in main() from giet_fbf_size().
unsigned int width;
unsigned int height;

////////////////////////////////////
// Randomly populates a slice of one generation of the world.
// - phase     : index of the world buffer to fill (first index of world[]).
// - base_line : first line of the slice.
// - nb_line   : number of lines in the slice.
// Each cell of lines [base_line, base_line + nb_line) receives 0 or 1: one bit
// of a fresh giet_rand() value, the bit position being the column modulo 8.
void init_world( unsigned int phase,
                 unsigned int base_line,
                 unsigned int nb_line )
{
   const unsigned int end_line = base_line + nb_line;
   unsigned int       row      = base_line;

   while ( row < end_line )
   {
      unsigned int col = 0;

      while ( col < width )
      {
         // one giet_rand() call per cell, keeping only bit (col % 8)
         unsigned int bit = col % 8;

         world[phase][row][col] = (giet_rand() >> bit) & 0x1;
         col++;
      }
      row++;
   }
}

//////////////////////////////////////////////////////
uint8_t number_of_alive_neighbour( unsigned int phase,
                                   unsigned int x, 
                                   unsigned int y )
{
   uint8_t nb = 0;

   nb += world[phase][(y - 1) % height][(x - 1) % width];
   nb += world[phase][ y              ][(x - 1) % width];
   nb += world[phase][(y + 1) % height][(x - 1) % width];
   nb += world[phase][(y - 1) % height][ x             ];
   nb += world[phase][(y + 1) % height][ x             ];
   nb += world[phase][(y - 1) % height][(x + 1) % width];
   nb += world[phase][ y              ][(x + 1) % width];
   nb += world[phase][(y + 1) % height][(x + 1) % width];

   return nb;
}

/////////////////////////////////////////
// Computes the next state of cell (x,y) from generation world[phase].
// - phase : index of the world buffer holding the current generation.
// - x, y  : column and line of the cell.
// Returns 1 when the cell has exactly three alive neighbours, or when it is
// alive (value 1) and has exactly two. An alive cell returns 0 in all other
// cases; a cell whose value is not 1 returns its current value unchanged.
uint8_t compute_cell( unsigned int phase,
                      unsigned int x, 
                      unsigned int y )
{
   const uint8_t current    = world[phase][y][x];
   const uint8_t neighbours = number_of_alive_neighbour( phase , x , y );

   // three neighbours: birth of a dead cell, or survival of an alive one
   if ( neighbours == 3 ) return 1;

   // an alive cell only survives with two neighbours
   if ( current == 1 ) return ( neighbours == 2 ) ? 1 : 0;

   // any other cell keeps its current value
   return current;
}

/////////////////////////////////////////
// Computes a slice of the next generation.
// - phase     : index of the world buffer to WRITE; the previous generation
//               is read from the other buffer, world[1 - phase].
// - base_line : first line of the slice.
// - nb_line   : number of lines in the slice.
void compute_new_gen( unsigned int phase,
                      unsigned int base_line, 
                      unsigned int nb_line )
{
   const unsigned int previous = 1 - phase;
   const unsigned int end_line = base_line + nb_line;
   unsigned int       row;
   unsigned int       col;

   for ( row = base_line ; row < end_line ; row++ )
   {
      for ( col = 0 ; col < width ; col++ )
      {
         world[phase][row][col] = compute_cell( previous , col , row );
      }
   }
}

////////////////////////////////////
// Converts a slice of world[phase] into pixels of display[phase].
// - phase     : index of both the world buffer read and the display buffer written.
// - base_line : first line of the slice.
// - nb_line   : number of lines in the slice.
// Each pixel is the cell value multiplied by 255 (0 stays 0, 1 becomes 255).
void copy_world( unsigned int phase,
                 unsigned int base_line,
                 unsigned int nb_line )
{
   const unsigned int end_line = base_line + nb_line;
   unsigned int       row      = base_line;

   while ( row < end_line )
   {
      unsigned int col;

      for ( col = 0 ; col < width ; col++ )
      {
         uint8_t cell = world[phase][row][col];

         display[phase][row][col] = cell * 255;
      }
      row++;
   }
}



///////////////////////////////////////////////////////////////
// Function run by every thread, on its own slice of lines.
// - pargs : slice assigned to the calling thread (first line, number of lines).
// The thread first fills and copies its slice of generation 0, then loops
// forever: compute its slice of the next generation in the other world buffer,
// copy it to the matching display buffer, and wait on the barrier.
// The thread whose slice starts at line 0 (args[0], run by main()) also
// requests the display of the completed buffer after each barrier and, when
// VERBOSE is set, prints the step number.
__attribute__((constructor)) void execute( arguments_t* pargs )
///////////////////////////////////////////////////////////////
{
   const unsigned int first_line = pargs->index;
   const unsigned int line_count = pargs->lines;

   // true only for the thread that drives the display and the trace
   const unsigned int is_leader  = ( first_line == 0 );

   ///////////// parallel world initialization (generation 0 in buffer 0)

   init_world( 0 , first_line , line_count );
   copy_world( 0 , first_line , line_count );

   // every slice of display[0] must be complete before it is shown
   sqt_barrier_wait( &barrier );

   if ( is_leader ) giet_fbf_cma_display ( 0 );

   //////////// evolution : the written buffer alternates 1, 0, 1, 0, ...

   unsigned int step;
   for ( step = 0 ; ; step++ )
   {
      // even steps write buffer 1 (from buffer 0), odd steps write buffer 0
      const unsigned int phase = ( step & 1 ) ? 0 : 1;

      compute_new_gen( phase , first_line , line_count );
      copy_world( phase , first_line , line_count );

      // every slice of display[phase] must be complete before it is shown
      sqt_barrier_wait( &barrier );

      if ( is_leader )
      {
         giet_fbf_cma_display ( phase );

#if VERBOSE
         giet_tty_printf(" - step %d\n", step );
#endif
      }
   } // end evolution loop

   // never reached: the evolution loop has no exit
   giet_pthread_exit("Completed");

} // end execute()



////////////////////////////////////////
// Application entry point.
// It checks the platform and frame buffer dimensions, allocates the TTY, the
// frame buffer and the CMA channel, initialises the distributed heap and the
// barrier, and splits the height lines of the world between the threads.
// It then launches threads 1 to nthreads-1 on execute(), and runs execute()
// itself for thread 0.
__attribute__((constructor)) void main()
////////////////////////////////////////
{
   // get processor identifier
   unsigned int x;
   unsigned int y;
   unsigned int p;
   giet_proc_xyp( &x, &y, &p );

   // get platform parameters
   unsigned int x_size;
   unsigned int y_size;
   unsigned int nprocs;
   giet_procs_number( &x_size, &y_size, &nprocs );

   // get a shared TTY 
   giet_tty_alloc( 1 );

   giet_pthread_assert( (x_size <= 16) , "x_size no larger than 16" );
   giet_pthread_assert( (y_size <= 16) , "y_size no larger than 16" );
   giet_pthread_assert( (nprocs <=  4) , "nprocs no larger than 4" );

   // get FBF width and height
   giet_fbf_size( &width , &height );

   // world[] and display[] are statically sized 256 * 256
   giet_pthread_assert( (width  <= 256)   , "FBF width larger than 256" );
   giet_pthread_assert( (height <= 256)   , "FBF height larger than 256" );
   giet_pthread_assert( (width  && height) , "FBF not available" );

   // compute number of threads and min number of lines per thread
   // extra is the number of threads that must process one extra line
   unsigned int total_procs = x_size * y_size * nprocs; 
   unsigned int nthreads;
   unsigned int nlines;
   unsigned int extra;
   if ( total_procs > height )
   {
      nthreads = height;
      nlines   = 1;
      extra    = 0;
   }
   else
   {
      nthreads = total_procs;
      nlines   = height / total_procs;
      extra    = height % total_procs;  
   }

   // get FBF ownership
   giet_fbf_alloc();

   // get a Chained Buffer DMA channel
   giet_fbf_cma_alloc();

   // initializes the source and destination buffers
   giet_fbf_cma_init_buf( &display[0][0][0] , 
                          &display[1][0][0] , 
                          status0 ,
                          status1 );

   // activates CMA channel
   giet_fbf_cma_start( height * width );

   // initializes distributed heap
   unsigned int cx;
   unsigned int cy;
   for ( cx = 0 ; cx < x_size ; cx++ )
   {
      for ( cy = 0 ; cy < y_size ; cy++ )
      {
         heap_init( cx , cy );
      }
   }

   // initialises barrier
   // NOTE(review): the barrier is sized from the platform dimensions, while only
   // nthreads threads call sqt_barrier_wait(). When total_procs > height,
   // nthreads is smaller than the number of processors - confirm that the
   // barrier can still be released in that configuration.
   sqt_barrier_init( &barrier , x_size , y_size , nprocs );

   giet_tty_printf("\n[GAMEOFLIFE] P[%d,%d,%d] completes initialisation at cycle %d\n"
                   " nprocs = %d / nlines = %d / nthreads = %d\n", 
                   x, y, p, giet_proctime() , total_procs , height , nthreads );

   // compute arguments (index, nlines) for all threads:
   // only the first "extra" threads get one line more than the others, so that
   // the slices are contiguous and cover exactly the height lines of the world
   unsigned int n;                   // thread index
   unsigned int index;               // first line index 
   for ( n = 0 , index = 0 ; n < nthreads ; n++ )
   {
      unsigned int count = ( n < extra ) ? (nlines + 1) : nlines;

      args[n].index = index;
      args[n].lines = count;
      index         = index + count;

#if VERBOSE      
      giet_tty_printf("[GAMEOFLIFE] Thread %d : first = %d / nlines = %d\n",
                      n , args[n].index , args[n].lines );
#endif
   }

   // launch all other threads (thread 0 is the calling thread)
   pthread_t  trdid;                 // unused because no pthread_join()
   for ( n = 1 ; n < nthreads ; n++ )
   {
      if ( giet_pthread_create( &trdid,
                                NULL,                  // no attribute
                                &execute,
                                &args[n] ) )
      {
          giet_tty_printf("\n[GAMEOFLIFE ERROR] creating thread %x\n", n );
          giet_pthread_exit( NULL );
      }
   }

   // run execute function
   execute( &args[0] );

   giet_pthread_exit( "completed" );
    
} // end main()



// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3



