///////////////////////////////////////////////////////////////////////////////////////
// File   : classif.c
// Date   : november 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application takes a stream of Gigabit Ethernet packets,
// and makes packet analysis and classification, based on the source MAC address.
// It uses the NIC peripheral, and the distributed kernel chbufs accessed by the CMA 
// component to receive and send packets on the Gigabit Ethernet port. 
//
// It can run on architectures containing up to 16 * 16 clusters,
// and from 3 to 8 processors per cluster.
//
// It requires one shared TTY terminal.
//
// This application is described as a TCG (Thread and Communication Graph) 
// containing (N+2) threads per cluster.
// - one "load" thread
// - one "store" thread
// - N "analyse" threads
// The containers are distributed (N+2 containers per cluster):
// - one RX container (part of the kernel rx_chbuf), in the kernel heap.
// - one TX container (part of the kernel tx_chbuf), in the kernel heap.
// - N working containers (one per analysis thread), in the user heap.
// In each cluster, the "load", "analyse" and "store" threads communicate through
// three local MWMR fifos: 
// - fifo_l2a : transfer a full container from "load" to "analyse" thread.
// - fifo_a2s : transfer a full container from "analyse" to "store" thread.
// - fifo_s2l : transfer an empty container from "store" to "load" thread.
// For each fifo, one item is a 32 bits word defining the index of an
// available working container.
// The pointers on the working containers, and the pointers on the MWMR fifos
// are global arrays stored in cluster[0][0].
//
// The main thread exits after global initialisation, and launching the other threads: 
// It does not use the pthread_join() construct. It is executed on P[0,0,1],
// to avoid overload on P[0,0,0].
//
// Initialisation is made in two steps:
//
// 1) The global, shared, variables are initialised by the main thread:
//    - shared TTY
//    - distributed heap (one heap per cluster)
//    - distributed rx_barrier (between all "load" threads)
//    - distributed tx_barrier (between all "store" threads)
//    - RX kernel chbufs (used by "load" threads)
//    - TX kernel chbufs (used by "store" threads)
//    Then the main thread exits, after launching the "load", "store", and "analyse"
//    threads in all clusters.
//
// 2) Each "load" thread allocates containers[x][y][n] from local heap,
//    and register containers pointers in the local stack.
//    Each "load" thread allocates data buffers & mwmr fifo descriptors
//    from local heap, and register pointers in global arrays.
//    Each "load" thread initialises the containers as empty in fifo_s2l.
//    Then each "load" thread signals mwmr fifos initialisation completion
//    to other threads in same cluster, using the local_sync[x][y] variables.
//
// When initialisation is completed, all threads are running in parallel:
//
// 1) The "load" thread gets an empty working container from the fifo_s2l,
//    transfers one container from the kernel rx_chbuf to this user container,
//    and transfers ownership of this container to one "analyse" thread by writing
//    into the fifo_l2a.    
// 2) The "analyse" thread gets one working container from the fifo_l2a, analyses
//    each packet header, computes the packet type (depending on the SRC MAC address),
//    increments the corresponding classification counter, and transposes the SRC
//    and the DST MAC addresses for TX transmission.
// 3) The "store" thread gets a full working container from the fifo_a2s,
//    transfers this user container content to the kernel tx_chbuf,
//    and transfers ownership of this empty container to the "load" thread by writing
//    into the fifo_s2l.   
//     
// Instrumentation results display is done by the "store" thread in cluster[0][0]
// when all "store" threads have completed the number of containers specified by the
// CONTAINERS_MAX parameter.
///////////////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "user_barrier.h"
#include "malloc.h"
#include "user_lock.h"
#include "mwmr_channel.h"

#define X_SIZE_MAX        16
#define Y_SIZE_MAX        16
#define NPROCS_MAX        8
#define CONTAINERS_MAX    5000
#define VERBOSE_ANALYSE   0

// macro to use a shared TTY
#define printf(...);    { lock_acquire( &tty_lock ); \
                          giet_tty_printf(__VA_ARGS__);  \
                          lock_release( &tty_lock ); }


///////////////////////////////////////////////////////////////////////////////////////
//    Global variables
// The MWMR channels (descriptors and buffers), as well as the working containers 
// used by the "analysis" threads are distributed in clusters.
// But the pointers on these distributed structures are stored in cluster[0][0].
///////////////////////////////////////////////////////////////////////////////////////

// pointers on distributed containers
unsigned int*         container[X_SIZE_MAX][Y_SIZE_MAX][NPROCS_MAX-2];  

// pointers on distributed mwmr fifos containing container descriptors
mwmr_channel_t*       mwmr_l2a[X_SIZE_MAX][Y_SIZE_MAX];  
mwmr_channel_t*       mwmr_a2s[X_SIZE_MAX][Y_SIZE_MAX];
mwmr_channel_t*       mwmr_s2l[X_SIZE_MAX][Y_SIZE_MAX]; 

// local synchros signaling local MWMR fifos initialisation completion
volatile unsigned int local_sync[X_SIZE_MAX][Y_SIZE_MAX];  

// lock protecting shared TTY
user_lock_t           tty_lock;

// distributed barrier between "load" threads
giet_sqt_barrier_t    rx_barrier;

// distributed barrier between "store" threads
giet_sqt_barrier_t    tx_barrier;

// instrumentation counters
unsigned int          counter[16];

// threads arguments array
unsigned int thread_arg[16][16][4];

////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void load( unsigned int* arg )
////////////////////////////////////////////////////////////
{
    // get plat-form parameters
    unsigned int x_size;                       // number of clusters in a row
    unsigned int y_size;                       // number of clusters in a column
    unsigned int nprocs;                       // number of processors per cluster
    giet_procs_number( &x_size , &y_size , &nprocs );

    // each "load" thread get processor identifiers
    unsigned int    x;
    unsigned int    y;
    unsigned int    p;
    giet_proc_xyp( &x, &y, &p );

    // each "load" thread allocates containers[x][y][n] (from local heap)
    // and register pointers in the local stack
    unsigned int   n;
    unsigned int*  cont[NPROCS_MAX-2]; 

    for ( n = 0 ; n < (nprocs - 2) ; n++ )
    {
        container[x][y][n] = malloc( 4096 );
        cont[n]            = container[x][y][n];
    }
    
    // each "load" thread allocates data buffers for mwmr fifos (from local heap)
    unsigned int*  data_l2a = malloc( (nprocs - 2)<<2 );
    unsigned int*  data_a2s = malloc( (nprocs - 2)<<2 );
    unsigned int*  data_s2l = malloc( (nprocs - 2)<<2 );

    // each "load" thread allocates mwmr fifos descriptors (from local heap)
    mwmr_l2a[x][y] = malloc( sizeof(mwmr_channel_t) );
    mwmr_a2s[x][y] = malloc( sizeof(mwmr_channel_t) );
    mwmr_s2l[x][y] = malloc( sizeof(mwmr_channel_t) );

    // each "load" thread registers local pointers on mwmr fifos in local stack
    mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y];
    mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y];
    mwmr_channel_t* fifo_s2l = mwmr_s2l[x][y];

    // each "load" thread initialises local mwmr fifos descriptors
    // ( width = 4 bytes / depth = number of analysis threads )
    mwmr_init( fifo_l2a , data_l2a , 1 , (nprocs - 2) );
    mwmr_init( fifo_a2s , data_a2s , 1 , (nprocs - 2) );
    mwmr_init( fifo_s2l , data_s2l , 1 , (nprocs - 2) );

    
    // each "load" thread initialises local containers as empty in fifo_s2l
    for ( n = 0 ; n < (nprocs - 2) ; n++ ) mwmr_write( fifo_s2l , &n , 1 );

    // each "load" thread signals mwmr fifos initialisation completion
    // to other threads in same cluster.
    local_sync[x][y] = 1;

    // only "load" thread[0][0] displays status
    if ( (x==0) && (y==0) )
    {
        printf("\n[CLASSIF] load on P[%d,%d,%d] enters main loop at cycle %d\n"
               "      &mwmr_l2a  = %x / &data_l2a  = %x\n"
               "      &mwmr_a2s  = %x / &data_a2s  = %x\n"
               "      &mwmr_s2l  = %x / &data_s2l  = %x\n"
               "      &cont[0]   = %x\n"
               "      x_size = %d / y_size = %d / nprocs = %d\n",
               x , y , p , giet_proctime(), 
               (unsigned int)fifo_l2a, (unsigned int)data_l2a,
               (unsigned int)fifo_a2s, (unsigned int)data_a2s,
               (unsigned int)fifo_s2l, (unsigned int)data_s2l,
               (unsigned int)cont[0],
               x_size, y_size, nprocs );
    }

    /////////////////////////////////////////////////////////////
    // "load" thread enters the main loop (on containers)
    unsigned int  count = 0;     // loaded containers count
    unsigned int  index;         // available container index
    unsigned int* temp;          // pointer on available container

    while ( count < CONTAINERS_MAX ) 
    { 
        // get one empty container index from fifo_s2l
        mwmr_read( fifo_s2l , &index , 1 );
        temp = cont[index];

        // get one container from  kernel rx_chbuf
        giet_nic_rx_move( temp );

        // get packets number
        unsigned int npackets = temp[0] & 0x0000FFFF;
        unsigned int nwords   = temp[0] >> 16;

        if ( (x==0) && (y==0) )
        {
            printf("\n[CLASSIF] load on P[%d,%d,%d] get container %d at cycle %d"
                   " : %d packets / %d words\n",
                   x, y, p, index, giet_proctime(), npackets, nwords );
        }

        // put the full container index to fifo_l2a
        mwmr_write( fifo_l2a, &index , 1 );

        count++;
    }

    // all "load" threads synchronise before stats
    sqt_barrier_wait( &rx_barrier );

    // "load" thread[0][0] displays stats
    if ( (x==0) && (y==0) ) giet_nic_rx_stats();

    // all "load" thread exit
    giet_pthread_exit("completed");
 
} // end load()


//////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void store( unsigned int * arg )
//////////////////////////////////////////////////////////////
{
    // get plat-form parameters
    unsigned int x_size;                       // number of clusters in a row
    unsigned int y_size;                       // number of clusters in a column
    unsigned int nprocs;                       // number of processors per cluster
    giet_procs_number( &x_size , &y_size , &nprocs );

    // get processor identifiers
    unsigned int    x;
    unsigned int    y;
    unsigned int    p;
    giet_proc_xyp( &x, &y, &p );

    // each "store" thread wait mwmr channels initialisation
    while ( local_sync[x][y] == 0 ) asm volatile ("nop");

    // each "store" thread registers pointers on working containers in local stack
    unsigned int   n;
    unsigned int*  cont[NPROCS_MAX-2]; 

    for ( n = 0 ; n < (nprocs - 2) ; n++ )
    {
        cont[n] = container[x][y][n];
    }
    
    // each "store" thread registers pointers on mwmr fifos in local stack
    mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y];
    mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y];
    mwmr_channel_t* fifo_s2l = mwmr_s2l[x][y];

    // only "store" thread[0][0] displays status
    if ( (x==0) && (y==0) )
    {
        printf("\n[CLASSIF] store on P[%d,%d,%d] enters main loop at cycle %d\n"
               "      &mwmr_l2a  = %x\n"
               "      &mwmr_a2s  = %x\n"
               "      &mwmr_s2l  = %x\n"
               "      &cont[0]   = %x\n",
               x , y , p , giet_proctime(), 
               (unsigned int)fifo_l2a,
               (unsigned int)fifo_a2s,
               (unsigned int)fifo_s2l,
               (unsigned int)cont[0] );
    }

    /////////////////////////////////////////////////////////////
    // "store" thread enter the main loop (on containers)
    unsigned int count = 0;     // stored containers count
    unsigned int index;         // empty container index
    unsigned int* temp;         // pointer on empty container

    while ( count < CONTAINERS_MAX ) 
    { 
        // get one working container index from fifo_a2s
        mwmr_read( fifo_a2s , &index , 1 );
        temp = cont[index];

        // put one container to kernel tx_chbuf
        giet_nic_tx_move( temp );
 
        // get packets number
        unsigned int npackets = temp[0] & 0x0000FFFF;
        unsigned int nwords   = temp[0] >> 16;

        if ( (x==0) && (y==0) )
        {
            printf("\n[CLASSIF] store on P[%d,%d,%d] get container %d at cycle %d"
                   " : %d packets / %d words\n",
                   x, y, p, index, giet_proctime(), npackets, nwords );
        }

        // put the working container index to fifo_s2l
        mwmr_write( fifo_s2l, &index , 1 );

        count++;
    }

    // all "store" threads synchronise before result display
    sqt_barrier_wait( &tx_barrier );

    // "store" thread[0,0] and displays results 
    if ( (x==0) && (y==0) )
    {
        printf("\nClassification Results\n"
               " - TYPE 0 : %d packets\n"
               " - TYPE 1 : %d packets\n"
               " - TYPE 2 : %d packets\n"
               " - TYPE 3 : %d packets\n"
               " - TYPE 4 : %d packets\n"
               " - TYPE 5 : %d packets\n"
               " - TYPE 6 : %d packets\n"
               " - TYPE 7 : %d packets\n"
               " - TYPE 8 : %d packets\n"
               " - TYPE 9 : %d packets\n"
               " - TYPE A : %d packets\n"
               " - TYPE B : %d packets\n"
               " - TYPE C : %d packets\n"
               " - TYPE D : %d packets\n"
               " - TYPE E : %d packets\n"
               " - TYPE F : %d packets\n"
               "    TOTAL = %d packets\n",
               counter[0x0], counter[0x1], counter[0x2], counter[0x3],
               counter[0x4], counter[0x5], counter[0x6], counter[0x7],
               counter[0x8], counter[0x9], counter[0xA], counter[0xB],
               counter[0xC], counter[0xD], counter[0xE], counter[0xF],
               counter[0x0]+ counter[0x1]+ counter[0x2]+ counter[0x3]+
               counter[0x4]+ counter[0x5]+ counter[0x6]+ counter[0x7]+
               counter[0x8]+ counter[0x9]+ counter[0xA]+ counter[0xB]+
               counter[0xC]+ counter[0xD]+ counter[0xE]+ counter[0xF] );

        giet_nic_tx_stats();
    }

    // all "store" thread exit
    giet_pthread_exit("Thread completed");

} // end store()


///////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void analyse( unsigned int* arg )
///////////////////////////////////////////////////////////////
{
    // get platform parameters
    unsigned int    x_size;						// number of clusters in row
    unsigned int    y_size;                     // number of clusters in a column
    unsigned int    nprocs;                     // number of processors per cluster
    giet_procs_number( &x_size, &y_size, &nprocs );

    // get processor identifiers
    unsigned int    x;
    unsigned int    y;
    unsigned int    p;
    giet_proc_xyp( &x, &y, &p );

    // each "analyse" thread wait mwmr channels initialisation
    while ( local_sync[x][y] == 0 ) asm volatile ("nop");

    // each "analyse" threads register pointers on working containers in local stack
    unsigned int   n;
    unsigned int*  cont[NPROCS_MAX-2]; 
    for ( n = 0 ; n < (nprocs - 2) ; n++ )
    {
        cont[n] = container[x][y][n];
    }

    // each "analyse" threads register pointers on mwmr fifos in local stack
    mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y];
    mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y];

    // only "analyse" thread[0][0] display status
    if ( (x==0) && (y==0) )
    {
        printf("\n[CLASSIF] analyse on P[%d,%d,%d] enters main loop at cycle %d\n"
               "       &mwmr_l2a = %x\n"
               "       &mwmr_a2s = %x\n"
               "       &cont[0]  = %x\n",
               x, y, p, giet_proctime(), 
               (unsigned int)fifo_l2a,
               (unsigned int)fifo_a2s,
               (unsigned int)cont[0] );
    }
      
    //////////////////////////////////////////////////////////////////////
    // all "analyse" threads enter the main infinite loop (on containers)
    unsigned int  index;           // available container index
    unsigned int* temp;            // pointer on available container
    unsigned int  nwords;          // number of words in container
    unsigned int  npackets;        // number of packets in container
    unsigned int  length;          // number of bytes in current packet
    unsigned int  first;           // current packet first word in container
    unsigned int  type;            // current packet type
    unsigned int  pid;             // current packet index

#if VERBOSE_ANALYSE
    unsigned int       verbose_len[10]; // save length for 10 packets in one container
    unsigned long long verbose_dst[10]; // save dest   for 10 packets in one container
    unsigned long long verbose_src[10]; // save source for 10 packets in one container
#endif

    while ( 1 )
    { 

#if VERBOSE_ANALYSE 
            for( pid = 0 ; pid < 10 ; pid++ )
            {
                verbose_len[pid] = 0;
                verbose_dst[pid] = 0;
                verbose_src[pid] = 0;
            }
#endif
        // get one working container index from fifo_l2a
        mwmr_read( fifo_l2a , &index , 1 );
        temp = cont[index];

        // get packets number and words number
        npackets = temp[0] & 0x0000FFFF;
        nwords   = temp[0] >> 16;

        if ( (x==0) && (y==0) )
        {
            printf("\n[CLASSIF] analyse on P[%d,%d,%d] get container at cycle %d"
                   " : %d packets / %d words\n",
                   x, y, p, giet_proctime(), npackets, nwords );
        }

        // initialize word index in container
        first = 34;

        // loop on packets
        for( pid = 0 ; pid < npackets ; pid++ )
        {
            // get packet length from container header
            if ( (pid & 0x1) == 0 )  length = temp[1+(pid>>1)] >> 16;
            else                     length = temp[1+(pid>>1)] & 0x0000FFFF;

            // compute packet DST and SRC MAC addresses
            unsigned int word0 = temp[first];
            unsigned int word1 = temp[first + 1];
            unsigned int word2 = temp[first + 2];

#if VERBOSE_ANALYSE 
            unsigned long long dst = ((unsigned long long)(word1 & 0xFFFF0000)>>16) |
                                     (((unsigned long long)word0)<<16);
            unsigned long long src = ((unsigned long long)(word1 & 0x0000FFFF)<<32) |
                                     ((unsigned long long)word2);
            if ( pid < 10 )
            {
                verbose_len[pid] = length;
                verbose_dst[pid] = dst;
                verbose_src[pid] = src;
            }
#endif
            // compute type from SRC MAC address and increment counter
            type = word1 & 0x0000000F;
            atomic_increment( &counter[type], 1 );

            // exchange SRC & DST MAC addresses for TX
            temp[first]     = ((word1 & 0x0000FFFF)<<16) | ((word2 & 0xFFFF0000)>>16);
            temp[first + 1] = ((word2 & 0x0000FFFF)<<16) | ((word0 & 0xFFFF0000)>>16);
            temp[first + 2] = ((word0 & 0x0000FFFF)<<16) | ((word1 & 0xFFFF0000)>>16);

            // update first word index 
            if ( length & 0x3 ) first += (length>>2)+1;
            else                first += (length>>2);
        }
        
#if VERBOSE_ANALYSE 
        if ( (x==0) && (y==0) )
        {
            printf("\n*** Thread analyse on P[%d,%d,%d] / container %d at cycle %d\n"
                   "   - Packet 0 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 1 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 2 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 3 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 4 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 5 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 6 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 7 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 8 : plen = %d / dst_mac = %l / src_mac = %l\n"
                   "   - Packet 9 : plen = %d / dst_mac = %l / src_mac = %l\n",
                   x , y , p , index , giet_proctime() ,  
                   verbose_len[0] , verbose_dst[0] , verbose_src[0] ,
                   verbose_len[1] , verbose_dst[1] , verbose_src[1] ,
                   verbose_len[2] , verbose_dst[2] , verbose_src[2] ,
                   verbose_len[3] , verbose_dst[3] , verbose_src[3] ,
                   verbose_len[4] , verbose_dst[4] , verbose_src[4] ,
                   verbose_len[5] , verbose_dst[5] , verbose_src[5] ,
                   verbose_len[6] , verbose_dst[6] , verbose_src[6] ,
                   verbose_len[7] , verbose_dst[7] , verbose_src[7] ,
                   verbose_len[8] , verbose_dst[8] , verbose_src[8] ,
                   verbose_len[9] , verbose_dst[9] , verbose_src[9] );
        }
#endif
            
        // pseudo-random delay
        unsigned int delay = giet_rand()>>3;
        unsigned int time;
        for( time = 0 ; time < delay ; time++ ) asm volatile ("nop");

        // put the working container index to fifo_a2s
        mwmr_write( fifo_a2s , &index , 1 );
    }
} // end analyse()

//////////////////////////////////////////
__attribute__ ((constructor)) void main()
//////////////////////////////////////////
{
    // indexes for loops
    unsigned int x , y , n;

    // get identifiers for proc executing main
    unsigned int x_id;                          // x cluster coordinate
    unsigned int y_id;                          // y cluster coordinate
    unsigned int p_id;                          // local processor index
    giet_proc_xyp( &x_id , &y_id , &p_id );

    // get plat-form parameters
    unsigned int x_size;                       // number of clusters in a row
    unsigned int y_size;                       // number of clusters in a column
    unsigned int nprocs;                       // number of processors per cluster
    giet_procs_number( &x_size , &y_size , &nprocs );

    // shared TTY allocation
    giet_tty_alloc( 1 );     
    lock_init( &tty_lock);

    // check plat-form parameters
    giet_pthread_assert( ((nprocs >= 3) && (nprocs <= 8)),
                         "[CLASSIF ERROR] number of procs per cluster must in [3...8]");

    giet_pthread_assert( ((x_size >= 1) && (x_size <= 16)),
                         "[CLASSIF ERROR] x_size must be in [1...16]");

    giet_pthread_assert( ((y_size >= 1) && (y_size <= 16)),
                         "[CLASSIF ERROR] y_size must be in [1...16]");

    // distributed heap initialisation
    for ( x = 0 ; x < x_size ; x++ ) 
    {
        for ( y = 0 ; y < y_size ; y++ ) 
        {
            heap_init( x , y );
        }
    }

    printf("\n[CLASSIF] start at cycle %d on %d cores\n", 
           giet_proctime(), (x_size * y_size * nprocs) );

    // thread index 
    // required by pthread_create()
    // unused in this appli because no pthread_join()
    pthread_t   trdid; 

    // rx_barrier initialisation
    sqt_barrier_init( &rx_barrier, x_size , y_size , 1 );

    // tx_barrier initialisation
    sqt_barrier_init( &tx_barrier, x_size , y_size , 1 );

    // allocate and start RX NIC and CMA channels
    giet_nic_rx_alloc( x_size , y_size );
    giet_nic_rx_start();

    // allocate and start TX NIC and CMA channels
    giet_nic_tx_alloc( x_size , y_size );
    giet_nic_tx_start();

    // Initialisation completed
    printf("\n[CLASSIF] initialisation completed at cycle %d\n", giet_proctime() );
    
    // launch load, store and analyse threads
    for ( x = 0 ; x < x_size ; x++ )
    {
        for ( y = 0 ; y < y_size ; y++ )
        {
            for ( n = 0 ; n < nprocs ; n++ )
            {
                // compute argument value 
                thread_arg[x][y][n] = (x<<8) | (y<<4) | n;

                if ( n == 0 )       // "load" thread
                {
                    if ( giet_pthread_create( &trdid,
                                              NULL,                  // no attribute
                                              &load,
                                              &thread_arg[x][y][n] ) )
                    {
                        printf("\n[CLASSIF ERROR] launching thread load\n" );
                        giet_pthread_exit( NULL );
                    }
                    else
                    { 
                        printf("\n[CLASSIF] thread load activated : trdid = %x\n", trdid );
                    }
                }
                else if ( n == 1 )  // "store" thread
                {
                    if ( giet_pthread_create( &trdid,
                                              NULL,                  // no attribute
                                              &store,
                                              &thread_arg[x][y][n] ) )
                    {
                        printf("\n[CLASSIF ERROR] launching thread store\n" );
                        giet_pthread_exit( NULL );
                    }
                    else
                    { 
                        printf("\n[CLASSIF] thread store activated : trdid = %x\n", trdid );
                    }
                }
                else              // "analyse" threads            
                {
                    if ( giet_pthread_create( &trdid,
                                              NULL,              // no attribute
                                              &analyse,
                                              &thread_arg[x][y][n] ) )
                    {
                        printf("\n[CLASSIF ERROR] launching thread analyse\n" );
                        giet_pthread_exit( NULL );
                    }
                    else
                    { 
                        printf("\n[CLASSIF] thread analyse activated : trdid = %x\n", trdid );
                    }  
                }
            }
        }
    }

    giet_pthread_exit( "completed" );
    
} // end main()