///////////////////////////////////////////////////////////////////////////////
// File   :  main.c
// Date   :  November 2013
// Author :  Cesar Fuguet Tortolero <cesar.fuguet-tortolero@lip6.fr>
//
// Description :
//
//      Sort application using the GIET-VM OS. This application uses the
//      barrier routines to apply a sort algorithm in several stages.
//
//      Constraints :
//
//      - It supports up to 1024 processors and the number of processors
//        must be a power of 2.
//
//      _ The array of values to be sorted (ARRAY_LENGTH) must be power of 2 
//        larger than the number of processors.
//
//      - This application must be executed on a cache coherent architecture. 
//
///////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "mapping_info.h"
#include "hard_config.h"
#include "user_barrier.h"

#define ARRAY_LENGTH    4096
#define IPT             (ARRAY_LENGTH / threads) // ITEMS PER THREAD

////////////////////////////////////////////////////////////////////////////////
// Processors other than 0 display algorithm state if VERBOSE non zero

#define VERBOSE         1

////////////////////////////////////////////////////////////////////////////////
// Define printf according to verbosity option and number of available TTY

#if (VERBOSE == 1)
#   define printf(...)     giet_shr_printf(__VA_ARGS__)
#else      
#   define printf(...)
#endif

#define task0_printf(...) if(thread_id == 0) giet_shr_printf(__VA_ARGS__)

int array0[ARRAY_LENGTH];
int array1[ARRAY_LENGTH];

volatile int init_ok = 0;

void bubbleSort(
        int * array,
        unsigned int length,
        unsigned int init_pos);

void merge(
        int * array,
        int * result,
        int length,
        int init_pos_a,
        int init_pos_b,
        int init_pos_result);

///////////////////////////////////////////////////////
// This application supports at most 1024 processors
// Number of barriers = log2(threads)

giet_barrier_t barrier[10];

//////////////////////////////////////////
__attribute__ ((constructor)) void sort()
{
    int thread_id = giet_thread_id();
    int * src_array = NULL;
    int * dst_array = NULL;
    int i;
    
    unsigned int time_start = giet_proctime();
    unsigned int time_end;   

    // compute number of threads (one thread per proc)
    unsigned int x_size;
    unsigned int y_size;
    unsigned int nprocs;
    unsigned int threads;
    giet_procs_number( &x_size , &y_size , &nprocs );
    threads = x_size * y_size * nprocs;

    if ( (threads != 1)   && (threads != 2)   && (threads != 4)   && 
         (threads != 8)   && (threads != 16 ) && (threads != 32)  && 
         (threads != 64)  && (threads != 128) && (threads != 256) && 
         (threads != 512) && (threads != 1024) )
    {
        task0_printf("[SORT ERROR] Number of processors must be power of 2\n"
                     "  x_size = %d / y_size = %d / nprocs = %d\n",
                     x_size , y_size , nprocs );
        giet_exit("error");
    }

    task0_printf("\n[ Thread 0 ] Starting sort application with %d threads "
                 "at cycle %d\n", threads, time_start);

    ///////////////////////////
    // Barriers Initialization

    if (thread_id == 0)
    {
        for (i = 0; i < __builtin_ctz( threads ); i++)
        {
            barrier_init(&barrier[i], threads >> i);
        }

        init_ok = 1;
    }
    else
    {
        while(!init_ok);
    }

    ////////////////////////
    // Array Initialization

    for (i = IPT * thread_id; i < IPT * (thread_id + 1); i++)
    {
        array0[i] = giet_rand();
    }

    ///////////////////////////////////
    // Parallel sort of array elements

    printf("[ Thread %d ] Stage 0: Sorting...\n\r", thread_id);

    bubbleSort(array0, IPT, IPT * thread_id);

    printf("[ Thread %d ] Finishing Stage 0\n\r", thread_id);

    for (i = 0; i < __builtin_ctz( threads ); i++)
    {
        barrier_wait(&barrier[i]);

        if((thread_id % (2 << i)) != 0)
        {
            printf("[ Thread %d ] Quit\n\r", thread_id );
            giet_exit("Completed");
        }

        printf("[ Thread %d ] Stage %d: Sorting...\n\r", thread_id, i+1);

        if((i % 2) == 0)
        {
            src_array = &array0[0];
            dst_array = &array1[0];
        }
        else
        {
            src_array = &array1[0];
            dst_array = &array0[0];
        }

        merge(src_array, dst_array
                , IPT << i
                , IPT * thread_id
                , IPT * (thread_id + (1 << i))
                , IPT * thread_id
                );

        printf("[ Thread %d ] Finishing Stage %d\n\r", thread_id, i + 1);
    }

    int success;
    int failure_index;

    //////////////////////////////
    // Verify the resulting array
    
    if(thread_id != 0)
    {
        giet_exit("error: only thread 0 should get here");
    }

    success = 1;
    for(i=0; i<(ARRAY_LENGTH-1); i++)
    {
        if(dst_array[i] > dst_array[i+1])
        {

            success = 0;
            failure_index = i;
            break;
        }
    }

    time_end = giet_proctime();

    printf("[ Thread 0 ] Finishing sort application at cycle %d\n"
           "[ Thread 0 ] Time elapsed = %d\n",
            time_end, (time_end - time_start) );

    if (success)
    {
        giet_exit("!!! Success !!!");
    }
    else
    {
        printf("[ Thread 0 ] Failure!! Incorrect element: %d\n\r", 
               failure_index);
        for(i=0; i<ARRAY_LENGTH; i++)
        {
            printf("array[%d] = %d\n", i, dst_array[i]);
        }
        giet_exit("!!!  Failure !!!");
    }

    giet_exit("Completed");
}

////////////////////////////////////
void bubbleSort( int *        array,
                 unsigned int length,
                 unsigned int init_pos )
{
    int i;
    int j;
    int aux;

    for(i = 0; i < length; i++)
    {
        for(j = init_pos; j < (init_pos + length - i - 1); j++)
        {
            if(array[j] > array[j + 1])
            {
                aux          = array[j + 1];
                array[j + 1] = array[j];
                array[j]     = aux;
            }
        }
    }
}

/////////////
void merge(
        int * array,
        int * result,
        int length,
        int init_pos_a,
        int init_pos_b,
        int init_pos_result)
{
    int i;
    int j;
    int k;

    i = 0;
    j = 0;
    k = init_pos_result;

    while((i < length) || (j < length))
    {
        if((i < length) && (j < length))
        {
            if(array[init_pos_a + i] < array[init_pos_b + j])
            {
                result[k++] = array[init_pos_a + i];
                i++;
            }
            else
            {
                result[k++] = array[init_pos_b + j];
                j++;
            }
        }
        else if(i < length)
        {
            result[k++] = array[init_pos_a + i];
            i++;
        }
        else
        {
            result[k++] = array[init_pos_b + j];
            j++;
        }
    }
}

/* vim: tabstop=4 : shiftwidth=4 : expandtab
*/
