///////////////////////////////////////////////////////////////////////////////
// File :
//      
//      main.c
//
// Date :
//      
//      November 2013
//
// Author :
//
//      Cesar Fuguet Tortolero <cesar.fuguet-tortolero@lip6.fr>
//
// Description :
//
//      Sort application using the GIET-VM OS. This application uses the
//      barrier routines to apply a sort algorithm in several stages.
//
//      Considerations :
//
//          - It supports up to 256 processors and the number of processors
//            must be a power of 2.
//
//          - If there is only one TTY available, this application uses a spin
//            lock to avoid several threads writting at the same time.
//
//          - This application must be executed on a cache coherent
//            architecture. Otherwise some modifications must be applied
//
//          - The processors executing this application must have a contiguous
//            processor id and the first processor must have id 0.
//
//      TODO: Replace processor id based identification mechanism by one based
//            on thread id
//
///////////////////////////////////////////////////////////////////////////////

#include "stdio.h"
#include "hard_config.h"
#include "barrier.h"
#include "spin_lock.h"

//////////////////////////////////////////////////////////////////////////
// The NPROCS constant must be modified depending on the desired number of
// threads

#define NPROCS          16
#define ARRAY_LENGTH    (NPROCS * 128)
#define IPP             (ARRAY_LENGTH / NPROCS) // ITEMS PER PROCESSOR

////////////////////////////////////////////////////////////////////////////////
// Processors other than 0 display algorithm state
// The processor 0 always displays some information so this does not affect him 

#define VERBOSE         1

///////////////////////////////////////////////////////////////////////
// Define printf according to verbosity option and number of available
// TTY

#if (VERBOSE == 1)
#   if (NB_TTY_CHANNELS > 1)
#       define printf(...)     giet_tty_printf(__VA_ARGS__)
#       define puts(...)       giet_tty_puts(__VA_ARGS__)

#   else    // NB_TTY_CHANNELS == 0
#       define printf(...) \
        lock_acquire(&tty_lock); \
        giet_tty_printf(__VA_ARGS__); \
        lock_release(&tty_lock);  
#   endif
#else       // VERBOSE == 0
#   define printf(...)
#   define puts(...)
#endif

#define task0_printf(...) if(procid() == 0) giet_tty_printf(__VA_ARGS__)

#define exit    giet_exit
#define procid  giet_procid
#define rand    giet_rand

int array0[ARRAY_LENGTH];
int array1[ARRAY_LENGTH];

volatile int init_ok = 0;

void bubbleSort(
        int * array,
        unsigned int length,
        unsigned int init_pos);

void merge(
        int * array,
        int * result,
        int length,
        int init_pos_a,
        int init_pos_b,
        int init_pos_result);

///////////////////////////////////////////////////
// This application support at most 256 processors
// Number of barriers = log2(NPROCS)

giet_barrier_t barrier[8];
giet_lock_t tty_lock;

__attribute__ ((constructor)) void sort()
{
    int proc_id = procid();
    int * src_array;
    int * dst_array;
    int i;

    task0_printf("Starting SORT application\n");

    ////////////////////////
    // Array Initialization

    for (i = IPP * proc_id; i < IPP * (proc_id + 1); i++)
    {
        array0[i] = rand();
    }

    ///////////////////////////
    // Barriers Initialization

    while((proc_id != 0) && (init_ok == 0));

    if (proc_id == 0)
    {
        for (i = 0; i < __builtin_ctz(NPROCS); i++)
        {
            task0_printf("Initializing barrier %d with %d\n", i, NPROCS >> i);
            barrier_init(&barrier[i], NPROCS >> i);
        }

        asm volatile ("sync");
        init_ok = 1;
    }

    asm volatile ("sync");
    barrier_wait(&barrier[0]);

    ///////////////////////////////////
    // Parallel sort of array elements

    printf("Proc %d Stage 0: Processor Sorting...\n\r", proc_id);

    bubbleSort(array0, IPP, IPP * proc_id);

    printf("Proc %d Finishing Stage 0...\n\r", proc_id);

    for (i = 0; i < __builtin_ctz(NPROCS); i++)
    {
        asm volatile ("sync");
        barrier_wait(&barrier[i]);

        printf("Proc %d Stage %d: Starting...\n\r", proc_id, i+1);

        if((proc_id % (2 << i)) != 0) exit();

        if((i % 2) == 0)
        {
            src_array = &array0[0];
            dst_array = &array1[0];
        }
        else
        {
            src_array = &array1[0];
            dst_array = &array0[0];
        }

        merge(src_array, dst_array
                , IPP << i
                , IPP * proc_id
                , IPP * (proc_id + (1 << i))
                , IPP * proc_id
                );

        printf("Proc %d Finishing Stage %d...\n\r", proc_id, i + 1);
    }

    int success;
    int failure_index;

    //////////////////////////////
    // Verify the resulting array

    if(proc_id == 0)
    {
        success = 1;

        for(i=0; i<(ARRAY_LENGTH-1); i++)
        {
            if(dst_array[i] > dst_array[i+1])
            {

                success = 0;
                failure_index = i;
                break;
            }
        }

        if (success)
        {
            printf("Success!!\n\r");
        }
        else
        {
            printf("Failure!! Incorrect element: %d\n\r", failure_index);


            for(i=0; i<ARRAY_LENGTH; i++)
            {
                printf("array[%d] = %d\n", i, dst_array[i]);
            }
        }
    }

    exit();
}

void bubbleSort(
        int * array,
        unsigned int length,
        unsigned int init_pos)
{
    int i;
    int j;
    int aux;

    for(i = 0; i < length; i++)
    {
        for(j = init_pos; j < (init_pos + length - i - 1); j++)
        {
            if(array[j] > array[j + 1])
            {
                aux          = array[j + 1];
                array[j + 1] = array[j];
                array[j]     = aux;
            }
        }
    }
}

void merge(
        int * array,
        int * result,
        int length,
        int init_pos_a,
        int init_pos_b,
        int init_pos_result)
{
    int i;
    int j;
    int k;

    i = 0;
    j = 0;
    k = init_pos_result;

    while((i < length) || (j < length))
    {
        if((i < length) && (j < length))
        {
            if(array[init_pos_a + i] < array[init_pos_b + j])
            {
                result[k++] = array[init_pos_a + i];
                i++;
            }
            else
            {
                result[k++] = array[init_pos_b + j];
                j++;
            }
        }
        else if(i < length)
        {
            result[k++] = array[init_pos_a + i];
            i++;
        }
        else
        {
            result[k++] = array[init_pos_b + j];
            j++;
        }
    }
}

/* vim: tabstop=4 : shiftwidth=4 : expandtab
*/
