///////////////////////////////////////////////////////////////////////////////
// File   :  sort.c
// Date   :  November 2013
// Author :  Cesar Fuguet Tortolero <cesar.fuguet-tortolero@lip6.fr>
///////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implement a multi-stage sort application.
// The various stages are separated by synchronisation barriers.
// There is one thread per physical processors. Computation is organised as
// a binary tree: All threads contribute to the first stage of parallel sort
// but, the number of participating threads is divided by 2 at each next stage.
//       Number_of_stages = number of barriers = log2(Number_of_threads)
//
// Constraints :
// - It supports up to 1024 processors and the number of processors
//   must be a power of 2.
// _ The array of values to be sorted (ARRAY_LENGTH) must be power of 2 
//   larger than the number of processors.
// - This application uses a single TTY terminal, shared by all threads,
//   that is protectted by an user-level SQT lock.
///////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "mapping_info.h"
#include "user_barrier.h"
#include "user_lock.h"

#define ARRAY_LENGTH    0x400
#define VERBOSE         0

// macro to use a shared TTY
#define printf(...);  { lock_acquire( &tty_lock ); \
                        giet_tty_printf(__VA_ARGS__);  \
                        lock_release( &tty_lock ); }

// argument for the sort() function
typedef struct
{
    unsigned int threads;     // number of threads (one per core
    unsigned int index;       // user defined thread index
}   args_t;

//////////////////////////////////////////
//  Global variables
//////////////////////////////////////////

int              array0[ARRAY_LENGTH];
int              array1[ARRAY_LENGTH];

giet_barrier_t   barrier[10];

user_lock_t      tty_lock;   


////////////////////////////////////
void bubbleSort( int *        array,
                 unsigned int length,
                 unsigned int init_pos )
{
    int i;
    int j;
    int aux;

    for(i = 0; i < length; i++)
    {
        for(j = init_pos; j < (init_pos + length - i - 1); j++)
        {
            if(array[j] > array[j + 1])
            {
                aux          = array[j + 1];
                array[j + 1] = array[j];
                array[j]     = aux;
            }
        }
    }
}  // end bubbleSort()


/////////////////////////
void merge( int * array,
            int * result,
            int length,
            int init_pos_a,
            int init_pos_b,
            int init_pos_result )
{
    int i;
    int j;
    int k;

    i = 0;
    j = 0;
    k = init_pos_result;

    while((i < length) || (j < length))
    {
        if((i < length) && (j < length))
        {
            if(array[init_pos_a + i] < array[init_pos_b + j])
            {
                result[k++] = array[init_pos_a + i];
                i++;
            }
            else
            {
                result[k++] = array[init_pos_b + j];
                j++;
            }
        }
        else if(i < length)
        {
            result[k++] = array[init_pos_a + i];
            i++;
        }
        else
        {
            result[k++] = array[init_pos_b + j];
            j++;
        }
    }
}  // end merge()


///////////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void sort( args_t* ptr )
///////////////////////////////////////////////////////////////////
{
    int * src_array = NULL;
    int * dst_array = NULL;

    unsigned int  thread_id = ptr->index;
    unsigned int  threads   = ptr->threads;
    unsigned int  items     = ARRAY_LENGTH / threads;
    unsigned int  stages    = __builtin_ctz( threads );
    unsigned int  i;

    // all threads contribute to the first stage of parallel sort
    printf("[SORT] Thread %d / Stage 0: Sorting...\n\r", thread_id );

    bubbleSort( array0, items, items * thread_id );

    printf("[SORT] Thread %d / Stage 0: Completed\n\r", thread_id);

    // the number of threads is divided by 2 at each next stage
    for ( i = 0 ; i < stages ; i++ )
    {
        barrier_wait( &barrier[i] );

        if((thread_id % (2 << i)) != 0)  giet_pthread_exit("Completed");

        printf("[SORT] Thread %d / Stage %d: Sorting...\n\r", thread_id, i+1);

        if((i % 2) == 0)               // even stage 
        {
            src_array = &array0[0];
            dst_array = &array1[0];
        }
        else                           // odd stage
        {
            src_array = &array1[0];
            dst_array = &array0[0];
        }

        merge( src_array, 
               dst_array,
               items << i,
               items * thread_id,
               items * (thread_id + (1 << i)),
               items * thread_id );

        printf("[SORT] Thread %d / Stage %d: Completed\n\r", thread_id, i+1);
    }


} // end sort()


//////////////////////////////////////////
__attribute__ ((constructor)) void main()
//////////////////////////////////////////
{
    unsigned int x_size;      // number of rows
    unsigned int y_size;      // number of columns
    unsigned int nprocs;      // number of procs per cluster
    unsigned int threads;     // total number of threads
    unsigned int n;           // index for loops

    args_t       arg[1024];   // array of arguments for sort() 

    // compute number of threads (one thread per proc)
    giet_procs_number( &x_size , &y_size , &nprocs );
    threads = x_size * y_size * nprocs;

    // alloc a shared TTY used by all threads
    giet_tty_alloc(1);
    lock_init( &tty_lock );

    // checks number of threads
    if ( (threads != 1)   && (threads != 2)   && (threads != 4)   && 
         (threads != 8)   && (threads != 16 ) && (threads != 32)  && 
         (threads != 64)  && (threads != 128) && (threads != 256) && 
         (threads != 512) && (threads != 1024) )
    {
        giet_pthread_exit("[SORT ERROR] : number of cores must be power of 2\n");
    }

    // check array size
    if ( ARRAY_LENGTH % threads) 
    {
        giet_pthread_exit("[SORT ERROR] : array size must be multiple of number of cores\n");
    }

    // Barriers initialization (number of participants divided by 2 at each stage)
    for (n = 0; n < __builtin_ctz( threads ); n++)
    {
        barrier_init( &barrier[n], threads >> n );
    }

    // Array to sort initialization
    for ( n = 0 ; n < ARRAY_LENGTH ; n++ )
    {
        array0[n] = giet_rand();

#if VERBOSE
        printf("array[%d] = %d\n", n , array0[n] );
#endif

    }

    printf("\n[SORT] main completes initialisation at cycle %d / %d threads\n",
           giet_proctime() , threads );

    // launch other threads to run sort() function
    pthread_t   trdid;
    for ( n = 1 ; n < threads ; n++ )
    {
        arg[n].index   = n;
        arg[n].threads = threads;
        if ( giet_pthread_create( &trdid,        // not used because no join
                                  NULL,          // no attribute
                                  &sort,
                                  &arg[n] ) )    // pointer on sort arguments
        {
            printf("\n[SORT ERROR] creating thread %d\n", n );
            giet_pthread_exit( NULL );
        }
    }

    // main run also the sort() function
    arg[0].index   = 0;
    arg[0].threads = threads;
    sort( &arg[0] );

    // Check result
    int    success = 1;
    int*   res_array = ( (threads==  2) ||
                         (threads==  8) || 
                         (threads== 32) || 
                         (threads==128) || 
                         (threads==512) ) ? array1 : array0;
    
    for( n=0 ; n<(ARRAY_LENGTH-1) ; n++ )
    {
        if ( res_array[n] > res_array[n+1] )
        {
            success = 0;
            break;
        }
    }

#if VERBOSE
    for( n=0; n<ARRAY_LENGTH; n++)
    {
        printf("array[%d] = %d\n", n , res_array[n] );
    }
#endif

    if ( success )
    {
        printf("[SORT] Main completes at cycle %d : success\n", giet_proctime() );
        giet_pthread_exit("Success");
    }
    else
    {
        printf("[SORT] Main completes at cycle %d : failure\n", giet_proctime() );
        giet_pthread_exit("Failure");
    }

}  // end main()


/* vim: tabstop=4 : shiftwidth=4 : expandtab
*/
