///////////////////////////////////////////////////////////////////////////////
// File   :  sort.c
// Date   :  November 2013
// Author :  Cesar Fuguet Tortolero <cesar.fuguet-tortolero@lip6.fr>
///////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a multi-stage sort.
// The various stages are separated by synchronisation barriers.
// There is one thread per physical core.
// Computation is organised as a binary tree:
// - All threads execute in parallel a bubble sort on a sub-array during
//   the first stage of the parallel sort,
// - The number of participating threads is divided by 2 at each next stage,
//   to make a merge sort on two subsets produced by the previous stage.
//
//       Number_of_merge_stages = log2(Number_of_threads),
//       each stage being closed by a synchronisation barrier
//
// Constraints :
// - It supports up to 1024 cores: x_size, y_size, and ncores must be
//   powers of 2 (max 16*16 clusters / max 4 cores per cluster).
// - The number of values to be sorted (ARRAY_LENGTH) must be a power of 2
//   and a multiple of the number of threads (one thread per core).
///////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <pthread.h>

// Number of values to sort. main() rejects configurations where it is not
// a multiple of the number of threads.
#define ARRAY_LENGTH    0x100    // 256 values
// When non-zero, main() prints the array content before and after the sort.
#define VERBOSE         0

///////////////////////////////////////////////////////
// macros for fixed format cxy <=> (x,y) translation
// The cluster identifier cxy packs x in bits [7:4]
// and y in bits [3:0].
// Every argument is parenthesised, so that an
// expression such as (a | b) can be safely passed.
///////////////////////////////////////////////////////

// build a cluster identifier from (x,y) coordinates
#define CXY_FROM_XY( x , y )  (((x) << 4) + (y))

// extract the x coordinate from a cluster identifier
#define X_FROM_CXY( cxy )     (((cxy) >> 4) & 0xF)

// extract the y coordinate from a cluster identifier
#define Y_FROM_CXY( cxy )     ((cxy) & 0xF)

/////////////////////////////////////////////////////////////
// argument for the sort() function (one thread per core)
/////////////////////////////////////////////////////////////

typedef struct
{
    // number of threads taking part in the sort
    unsigned int threads;

    // user index of the thread receiving this argument,
    // in the range [0 , threads - 1]
    unsigned int thread_uid;

    // user index of the main thread
    unsigned int main_uid;
}
args_t;

//////////////////////////////////////////
//      Global variables
//////////////////////////////////////////

// The two arrays are used alternately as source and destination by the
// merge stages of sort(): odd stages merge array0 into array1, even stages
// merge array1 into array0.
int                 array0[ARRAY_LENGTH];    // values to sort
int                 array1[ARRAY_LENGTH];    // alternate buffer for the merge stages

// barrier shared by all threads, initialised in main() and used by sort()
pthread_barrier_t   barrier;                 // synchronisation variables


////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Sorts in place, in ascending order, a window of an integer array.
// Only the items [init_pos , init_pos + length - 1] are read or written.
// @ array    : base of the array containing the window.
// @ length   : number of items in the window (0 and 1 are accepted).
// @ init_pos : index of the first item of the window.
///////////////////////////////////////////////////////////////////////////////
void bubbleSort( int *        array,
                 unsigned int length,
                 unsigned int init_pos )
{
    unsigned int pass;      // number of completed passes
    unsigned int j;         // index of the left item of the compared pair
    unsigned int limit;     // first index that must not be used as left item
    int          aux;
    int          swapped;   // non zero when the current pass moved an item

    // a window of 0 or 1 item is already sorted
    if( length < 2 ) return;

    for( pass = 0 ; pass < (length - 1) ; pass++ )
    {
        // the <pass> largest items already reached their final position
        limit   = init_pos + length - pass - 1;
        swapped = 0;

        for( j = init_pos ; j < limit ; j++ )
        {
            if( array[j] > array[j + 1] )
            {
                aux          = array[j + 1];
                array[j + 1] = array[j];
                array[j]     = aux;
                swapped      = 1;
            }
        }

        // a pass without any swap means the window is sorted
        if( swapped == 0 ) break;
    }
}  // end bubbleSort()


/////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Merges two sorted runs A and B of the <src> array into the <dst> array.
// Both runs contain <length> items, so (2 * length) items are written.
// When the two heads are equal, the item of run B is copied first.
// @ src            : array containing the two sorted runs.
// @ dst            : array receiving the merged values.
// @ length         : number of items in EACH run.
// @ init_pos_src_a : index in <src> of the first item of run A.
// @ init_pos_src_b : index in <src> of the first item of run B.
// @ init_pos_dst   : index in <dst> of the first written item.
///////////////////////////////////////////////////////////////////////////////
void merge( int * src,
            int * dst,
            int length,
            int init_pos_src_a,
            int init_pos_src_b,
            int init_pos_dst )
{
    int taken_a = 0;               // items already consumed in run A
    int taken_b = 0;               // items already consumed in run B
    int out     = init_pos_dst;    // next index to write in dst

    // both runs still hold items: copy the smallest head
    while( (taken_a < length) && (taken_b < length) )
    {
        int head_a = src[init_pos_src_a + taken_a];
        int head_b = src[init_pos_src_b + taken_b];

        if( head_a < head_b )
        {
            dst[out++] = head_a;
            taken_a++;
        }
        else
        {
            dst[out++] = head_b;
            taken_b++;
        }
    }

    // run B is exhausted: copy what remains in run A
    for( ; taken_a < length ; taken_a++ )
    {
        dst[out++] = src[init_pos_src_a + taken_a];
    }

    // run A is exhausted: copy what remains in run B
    for( ; taken_b < length ; taken_b++ )
    {
        dst[out++] = src[init_pos_src_b + taken_b];
    }
}  // end merge()

/////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Function executed by every thread, including the main thread.
// - stage 0 : the thread bubble-sorts its own slice of array0, made of
//   (ARRAY_LENGTH / threads) items starting at index (items * thread_uid).
// - stage i (i > 0) : only the threads whose thread_uid is a multiple of 2^i
//   are active. Each of them merges two adjacent sorted runs produced by
//   stage (i-1). Odd stages merge from array0 to array1, even stages merge
//   from array1 to array0.
// All threads, active or not, wait on the global barrier at each stage.
// All threads but the main thread call pthread_exit() when done.
// @ ptr : pointer on the thread arguments (threads, thread_uid, main_uid).
///////////////////////////////////////////////////////////////////////////////
void sort( args_t * ptr )
{
    unsigned int       i;
    unsigned int       run;      // number of items in each run to merge
    unsigned long long cycle;

    int         * src_array  = NULL;
    int         * dst_array  = NULL;

    unsigned int  thread_uid = ptr->thread_uid;
    unsigned int  threads    = ptr->threads;
    unsigned int  main_uid   = ptr->main_uid;

    // number of items sorted by each thread in stage 0
    unsigned int  items      = ARRAY_LENGTH / threads;

    // one bubble sort stage + log2( threads ) merge stages
    unsigned int  stages     = __builtin_ctz( threads ) + 1;
    
    get_cycle( &cycle );
    printf("\n[SORT] thread[%d] enter at cycle %d\n", thread_uid , (unsigned int)cycle );

    printf("\n[SORT] thread[%d] / stage 0 start\n", thread_uid );

    bubbleSort( array0, items, items * thread_uid );

    printf("\n[SORT] thread[%d] / stage 0 completed\n", thread_uid );

    /////////////////////////////////
    pthread_barrier_wait( &barrier ); 

    // the number of threads contributing to sort
    // is divided by 2 at each next stage
    for ( i = 1 ; i < stages ; i++ )
    {
        // NOTE(review): this wait directly follows another wait on the same
        // barrier (the one after stage 0, or the one closing the previous
        // iteration); it is kept to preserve the original synchronisation.
        pthread_barrier_wait( &barrier );

        if( (thread_uid & ((1<<i)-1)) == 0 )
        {
            printf("\n[SORT] thread[%d] / stage %d start\n", thread_uid , i );

            if((i % 2) == 1)               // odd stage 
            {
                src_array = array0;
                dst_array = array1;
            }
            else                           // even stage
            {
                src_array = array1;
                dst_array = array0;
            }

            // merge() expects the length of EACH input run, that is the
            // size of a run produced by stage (i-1), not the size of the
            // merged result. Passing (items << i) would make merge() read
            // and write twice the expected number of items.
            run = items << (i-1);

            merge( src_array, 
                   dst_array,
                   run,                             // items per run
                   items * thread_uid,              // first item of run A
                   items * thread_uid + run,        // first item of run B
                   items * thread_uid );            // first written item

            printf("\n[SORT] thread[%d] / stage %d completed\n", thread_uid , i );
        }

        /////////////////////////////////
        pthread_barrier_wait( &barrier );

    }

    // all threads but the main thread exit
    if( thread_uid != main_uid ) pthread_exit( NULL );

} // end sort()


///////////
///////////////////////////////////////////////////////////////////////////////
// Application entry point, executed by the main thread.
// - gets the (x_size, y_size, ncores) values from get_config() and derives
//   the number of threads (one per core),
// - checks that the number of threads is a power of 2 not larger than 1024,
//   and that ARRAY_LENGTH is a multiple of the number of threads,
// - initialises the barrier and fills array0 with rand() values,
// - creates one thread running sort() for each core but the one returned
//   by get_core(), and runs sort() itself,
// - checks that the result array is in ascending order.
// It never returns: it exits with status 0 on success, and with status 1 on
// a configuration error, a thread creation error, or an unsorted result.
///////////////////////////////////////////////////////////////////////////////
void main()
{
    unsigned int           x_size;             // number of rows
    unsigned int           y_size;             // number of columns
    unsigned int           ncores;             // number of cores per cluster
    unsigned int           threads;            // total number of threads
    unsigned int           thread_uid;         // user defined thread index
    unsigned int           main_cxy;           // cluster identifier for main
    unsigned int           main_x;             // X coordinate for main thread
    unsigned int           main_y;             // Y coordinate for main thread
    unsigned int           main_lid;           // core local index for main thread
    unsigned int           main_uid;           // thread user index for main thread
    unsigned int           x;                  // X coordinate for a thread
    unsigned int           y;                  // Y coordinate for a thread
    unsigned int           lid;                // core local index for a thread
    unsigned int           n;                  // index in array to sort
    unsigned long long     cycle;              // current date for log
    pthread_t              trdid;              // kernel allocated thread index (unused)
    pthread_barrierattr_t  barrier_attr;       // barrier attributes
    pthread_attr_t         attr[1024];         // thread attributes (one per thread)
    args_t                 arg[1024];          // sort function arguments (one per thread)

    // compute number of threads (one thread per proc)
    get_config( &x_size , &y_size , &ncores );
    threads = x_size * y_size * ncores;

    // get core coordinates and user index for the main thread
    get_core( &main_cxy , & main_lid );
    main_x   = X_FROM_CXY( main_cxy );
    main_y   = Y_FROM_CXY( main_cxy );
    main_uid = (((main_x * y_size) + main_y) * ncores) + main_lid; 

    // checks number of threads: must be a power of 2 in [1 , 1024]
    // (a power of 2 has one single bit set, cleared by the AND below)
    if ( (threads == 0) || (threads > 1024) || (threads & (threads - 1)) )
    {
        printf("\n[SORT ERROR] number of cores must be power of 2\n");
        exit( 1 );
    }

    // check array size
    if ( ARRAY_LENGTH % threads) 
    {
        printf("\n[SORT ERROR] array size must be multiple of number of threads\n");
        exit( 1 );
    }

    get_cycle( &cycle );
    printf("\n[SORT] starts : %d threads / %d values / cycle %d\n",
    threads, ARRAY_LENGTH , (unsigned int)cycle );

    // Barrier initialization
    barrier_attr.x_size   = x_size; 
    barrier_attr.y_size   = y_size;
    barrier_attr.nthreads = ncores;
    if( pthread_barrier_init( &barrier, &barrier_attr , threads ) )
    {
        printf("\n[SORT ERROR] cannot initialise barrier\n" );
        exit( 1 );
    }

    // wait for a character on the standard input before going on
    get_cycle( &cycle );
    printf("\n[SORT] completes barrier init at cycle %d continue ?\n", (unsigned int)cycle );
    getchar();

    // Array to sort initialization
    for ( n = 0 ; n < ARRAY_LENGTH ; n++ )
    {
        array0[n] = rand();
    }

#if VERBOSE
printf("\n*** array before sort\n");
for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
#endif

    get_cycle( &cycle );
    printf("\n[SORT] completes array init at cycle %d\n", (unsigned int)cycle );

    // launch other threads to execute sort() function
    // on cores other than the core running the main thread
    for ( x=0 ; x<x_size ; x++ )
    {
        for ( y=0 ; y<y_size ; y++ )
        {
            for ( lid=0 ; lid<ncores ; lid++ )
            {
                thread_uid = (((x * y_size) + y) * ncores) + lid;

                // set sort arguments for all threads
                arg[thread_uid].threads      = threads;
                arg[thread_uid].thread_uid   = thread_uid;
                arg[thread_uid].main_uid     = main_uid;

                // set thread attributes for all threads
                attr[thread_uid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
                attr[thread_uid].cxy        = CXY_FROM_XY( x , y );
                attr[thread_uid].lid        = lid;

                // the main thread already exists: it calls sort() below
                if( thread_uid != main_uid )
                {
                    if ( pthread_create( &trdid,              // not used because no join
                                         &attr[thread_uid],   // thread attributes 
                                         &sort,               // entry function 
                                         &arg[thread_uid] ) ) // sort arguments
                    {
                        printf("\n[SORT ERROR] creating thread %x\n", thread_uid );
                        exit( 1 );
                    }
                }
            }
        }
    }

    get_cycle( &cycle );
    printf("\n[SORT] completes threads create at cycle %d\n", (unsigned int)cycle );

    // main run also the sort() function
    sort( &arg[main_uid] );

    // Check result
    // sort() runs log2( threads ) merge stages, the first one writing into
    // array1: the final result is in array1 when this number is odd.
    int    success = 1;
    int*   res_array = ( __builtin_ctz( threads ) & 1 ) ? array1 : array0;
    
    for( n=0 ; n<(ARRAY_LENGTH-1) ; n++ )
    {
        if ( res_array[n] > res_array[n+1] )
        {
            success = 0;
            break;
        }
    }

#if VERBOSE
printf("\n*** array after sort\n");
for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , res_array[n] );
#endif

    get_cycle( &cycle );

    if ( success )
    {
        printf("\n[SORT] success at cycle %d\n", (unsigned int)cycle );
        exit( 0 );
    }
    else
    {
        // report the failure to the parent through a non zero status
        printf("\n[SORT] failure at cycle %d\n", (unsigned int)cycle );
        exit( 1 );
    }

}  // end main()


/* 
vim: tabstop=4 : shiftwidth=4 : expandtab
*/
