
#ifndef _CLOCK_H_
#define _CLOCK_H_

#include <stdint.h>

#include "nrc_os_config.h"
#if TARGET_OS == LINUX
    #include <sys/time.h>
#endif

/**
 * The macros should be called in the following order:
 * - CLOCK_DEC;
 * - CLOCK_INIT(num_threads, num_steps);
 * - CLOCK_APP_START;
 * - CLOCK_APP_CREATE;
 * - CLOCK_THREAD_START(thread_id);
 * - CLOCK_THREAD_COMPUTE_START(thread_id);
 * - CLOCK_THREAD_START_STEP(thread_id, step_id)
 * - CLOCK_THREAD_END_STEP(thread_id, step_id)
 * - (repeat num_steps times)
 * - CLOCK_THREAD_COMPUTE_END(thread_id);
 * - CLOCK_THREAD_END(thread_id)
 * - CLOCK_APP_JOIN;
 * - CLOCK_APP_END;
 * - CLOCK_FINALIZE;
 * - PRINT_CLOCK;
 * - CLOCK_FREE;
 */


/**
 * Sorts the first `size` elements of `tab` in ascending order, in place,
 * with a selection sort.
 *
 * @param tab   array to sort; its elements are reordered in place
 * @param size  number of elements to sort; no element is touched when
 *              size <= 0
 */
static void local_sort_asc(uint32_t tab[], int size) {
    for (int i = 0; i < size; i++) {
        /* Index of the first smallest element in tab[i .. size-1]. */
        int jmin = i;
        for (int j = i + 1; j < size; j++) {
            if (tab[j] < tab[jmin]) {
                jmin = j;
            }
        }
        /* Swap through a uint32_t temporary: going through an int would
         * subject values >= 2^31 to a signed conversion. */
        uint32_t tmp = tab[i];
        tab[i] = tab[jmin];
        tab[jmin] = tmp;
    }
}



/*
 * CLOCK_DEC declares every variable the other CLOCK_* macros read or write:
 *  - app_*                      : application-level timestamps
 *  - thread_*                   : one timestamp per thread (arrays)
 *  - thread_*_step              : one array of per-thread timestamps per step
 *  - step_number, clock_thread_num : counts recorded by CLOCK_INIT
 *  - global_thread_*            : values computed by CLOCK_FINALIZE
 * The expansion already ends with a semicolon.
 */
#define CLOCK_DEC                                                       \
    uint32_t app_start, app_end, app_create, app_join;                  \
    uint32_t *thread_start, *thread_end;                                \
    uint32_t *thread_compute_start, *thread_compute_end;                \
    int32_t step_number, clock_thread_num;                              \
    uint32_t **thread_start_step, **thread_end_step;                    \
    uint32_t global_thread_start, global_thread_end;                    \
    uint32_t global_thread_compute_start, global_thread_compute_end;    \
    uint32_t *global_thread_start_step, *global_thread_end_step;

/*
 * CLOCK(x) stores the current timestamp into the lvalue x.
 *  - GIETVM: the value returned by giet_proctime().
 *  - LINUX : the gettimeofday() time converted to milliseconds; when x is a
 *            32-bit variable only the low 32 bits are kept.
 * No definition is provided for any other TARGET_OS value.
 */
#if TARGET_OS == GIETVM
    #define CLOCK(x)  ({ (x) = giet_proctime(); })
#elif TARGET_OS == LINUX
    #define CLOCK(x)  ({                                                          \
            struct timeval full_time;                                             \
            gettimeofday(&full_time, NULL);                                       \
            /* Widen to 64 bits before scaling: tv_sec * 1000000 overflows a   */ \
            /* 32-bit long. sec * 1000 + usec / 1000 equals the former         */ \
            /* (usec + sec * 1000000) / 1000 for non-negative fields.          */ \
            (x) = (unsigned long) ((uint64_t) full_time.tv_sec * 1000u            \
                                   + (uint64_t) full_time.tv_usec / 1000u);       \
            })
#endif

/*
 * CLOCK_INIT(x, y) -- x = number of threads, y = number of steps.
 *
 * Records both counts in clock_thread_num / step_number, resets the global
 * min/max accumulators (minima start at 0xFFFFFFFF, maxima at 0) and
 * allocates the timestamp arrays declared by CLOCK_DEC:
 *  - per-thread arrays only when the thread count is > 0;
 *  - per-step arrays only when, in addition, the step count is > 0.
 * Each argument is evaluated exactly once; the stored counts are used for
 * every later test and size computation.
 * NOTE: malloc() results are not checked here.
 */
#define CLOCK_INIT(x, y) ({                                                                                  \
    clock_thread_num = (x);                                                                                  \
    step_number = (y);                                                                                       \
    global_thread_start = 0xFFFFFFFFU;                                                                       \
    global_thread_end = 0;                                                                                   \
    global_thread_compute_start = 0xFFFFFFFFU;                                                               \
    global_thread_compute_end = 0;                                                                           \
    if (clock_thread_num > 0) {                                                                              \
        thread_start = (uint32_t *) malloc(sizeof(*thread_start) * clock_thread_num);                        \
        thread_end = (uint32_t *) malloc(sizeof(*thread_end) * clock_thread_num);                            \
        thread_compute_start = (uint32_t *) malloc(sizeof(*thread_compute_start) * clock_thread_num);        \
        thread_compute_end = (uint32_t *) malloc(sizeof(*thread_compute_end) * clock_thread_num);            \
        if (step_number > 0) {                                                                               \
            global_thread_start_step = (uint32_t *) malloc(sizeof(*global_thread_start_step) * step_number); \
            global_thread_end_step = (uint32_t *) malloc(sizeof(*global_thread_end_step) * step_number);     \
            thread_start_step = (uint32_t **) malloc(sizeof(*thread_start_step) * step_number);              \
            thread_end_step = (uint32_t **) malloc(sizeof(*thread_end_step) * step_number);                  \
            for (int j = 0; j < step_number; j++) {                                                          \
                global_thread_start_step[j] = 0xFFFFFFFFU;                                                   \
                global_thread_end_step[j] = 0;                                                               \
                /* one row of per-thread timestamps for step j */                                            \
                thread_start_step[j] = (uint32_t *) malloc(sizeof(**thread_start_step) * clock_thread_num);  \
                thread_end_step[j] = (uint32_t *) malloc(sizeof(**thread_end_step) * clock_thread_num);      \
            }                                                                                                \
        }                                                                                                    \
    }                                                                                                        \
})


/*
 * Timestamp-recording shortcuts: each one applies CLOCK() to one of the
 * variables declared by CLOCK_DEC.
 *  - tid  : index into the per-thread arrays
 *  - step : index of the step; step arrays are addressed as [step][tid]
 */

/* Application-level timestamps. */
#define CLOCK_APP_START                    ({ CLOCK(app_start); })
#define CLOCK_APP_CREATE                   ({ CLOCK(app_create); })
#define CLOCK_APP_JOIN                     ({ CLOCK(app_join); })
#define CLOCK_APP_END                      ({ CLOCK(app_end); })

/* Per-thread timestamps. */
#define CLOCK_THREAD_START(tid)            ({ CLOCK(thread_start[tid]); })
#define CLOCK_THREAD_COMPUTE_START(tid)    ({ CLOCK(thread_compute_start[tid]); })
#define CLOCK_THREAD_COMPUTE_END(tid)      ({ CLOCK(thread_compute_end[tid]); })
#define CLOCK_THREAD_END(tid)              ({ CLOCK(thread_end[tid]); })

/* Per-thread, per-step timestamps. */
#define CLOCK_THREAD_START_STEP(tid, step) ({ CLOCK(thread_start_step[step][tid]); })
#define CLOCK_THREAD_END_STEP(tid, step)   ({ CLOCK(thread_end_step[step][tid]); })


/*
 * CLOCK_FINALIZE takes no argument: it uses the thread and step counts
 * recorded in clock_thread_num and step_number.
 *
 * It folds the per-thread timestamps into the global_thread_* variables:
 * every "start" value becomes the minimum over all threads, every "end"
 * value the maximum, starting from whatever the global variables already
 * hold. The same is done per step for the *_step arrays.
 */
#define CLOCK_FINALIZE ({                                                                        \
    /* whole-thread and compute-phase bounds */                                                  \
    for (int clk_tid = 0; clk_tid < clock_thread_num; clk_tid++) {                               \
        if (global_thread_start > thread_start[clk_tid]) {                                       \
            global_thread_start = thread_start[clk_tid];                                         \
        }                                                                                        \
        if (global_thread_end < thread_end[clk_tid]) {                                           \
            global_thread_end = thread_end[clk_tid];                                             \
        }                                                                                        \
        if (global_thread_compute_start > thread_compute_start[clk_tid]) {                       \
            global_thread_compute_start = thread_compute_start[clk_tid];                         \
        }                                                                                        \
        if (global_thread_compute_end < thread_compute_end[clk_tid]) {                           \
            global_thread_compute_end = thread_compute_end[clk_tid];                             \
        }                                                                                        \
    }                                                                                            \
    /* per-step bounds; the step arrays are only touched inside the thread loop */               \
    for (int clk_step = 0; clk_step < step_number; clk_step++) {                                 \
        for (int clk_tid = 0; clk_tid < clock_thread_num; clk_tid++) {                           \
            if (global_thread_start_step[clk_step] > thread_start_step[clk_step][clk_tid]) {     \
                global_thread_start_step[clk_step] = thread_start_step[clk_step][clk_tid];       \
            }                                                                                    \
            if (global_thread_end_step[clk_step] < thread_end_step[clk_step][clk_tid]) {         \
                global_thread_end_step[clk_step] = thread_end_step[clk_step][clk_tid];           \
            }                                                                                    \
        }                                                                                        \
    }                                                                                            \
})

#define PRINT_CLOCK ({                                                                                         \
    printf("Timestamps:\n");                                                                                   \
    printf("[APP_START]            : %d\n", app_start);                                                        \
    printf("[APP_CREATE]           : %d\n", app_create);                                                       \
    printf("[THREAD_START]         : %d\n", global_thread_start);                                              \
    printf("[THREAD_COMPUTE_START] : %d\n", global_thread_compute_start);                                      \
    for (int j = 0; j < step_number; j++) {                                                                    \
        printf("[THREAD_START_STEP_%d]  : %d\n", j, global_thread_start_step[j]);                              \
        printf("[THREAD_END_STEP_%d]    : %d\n", j, global_thread_end_step[j]);                                \
    }                                                                                                          \
    printf("[THREAD_COMPUTE_END]   : %d\n", global_thread_compute_end);                                        \
    printf("[THREAD_END]           : %d\n", global_thread_end);                                                \
    printf("[APP_JOIN]             : %d\n", app_join);                                                         \
    printf("[APP_END]              : %d\n", app_end);                                                          \
    printf("Durations (in cycles):\n");                                                                        \
    printf("[TOTAL]                : %d\n", app_end - app_start);                                              \
    printf("[THREAD]               : %d\n", app_join - app_create);                                            \
    printf("[PARALLEL]             : %d\n", global_thread_end - global_thread_start);                          \
    printf("[PARALLEL_COMPUTE]     : %d\n", global_thread_compute_end - global_thread_compute_start);          \
    for (int j = 0; j < step_number; j++) {                                                                    \
        printf("[THREAD_STEP_%d]        : %d\n", j, global_thread_end_step[j] - global_thread_start_step[j]);  \
    }                                                                                                          \
    printf("\n");                                                                                              \
    printf("*** All threads times output in a gnuplot data-style ***\n");                                      \
    local_sort_asc(thread_start, clock_thread_num);                                                            \
    local_sort_asc(thread_compute_start, clock_thread_num);                                                    \
    local_sort_asc(thread_compute_end, clock_thread_num);                                                      \
    local_sort_asc(thread_end, clock_thread_num);                                                              \
    for (int j = 0; j < step_number; j++) {                                                                    \
        local_sort_asc(thread_start_step[j], clock_thread_num);                                                \
        local_sort_asc(thread_end_step[j], clock_thread_num);                                                  \
    }                                                                                                          \
    printf("# cycle     thread_id\n");                                                                         \
    for (int i = 0; i < clock_thread_num; i++) {                                                               \
        printf("%d\t%d\n", thread_start[i], i);                                                                \
        printf("%d\t%d\n", thread_compute_start[i], i);                                                        \
        for (int j = 0; j < step_number; j++) {                                                                \
            printf("%d\t%d\n", thread_start_step[j][i], i);                                                    \
            printf("%d\t%d\n", thread_end_step[j][i], i);                                                      \
        }                                                                                                      \
        printf("%d\t%d\n", thread_compute_end[i], i);                                                          \
        printf("%d\t%d\n", thread_end[i], i);                                                                  \
    }                                                                                                          \
})

                




/*
 * CLOCK_FREE releases the arrays allocated by CLOCK_INIT, using the same
 * conditions: per-thread arrays when clock_thread_num > 0, per-step arrays
 * when step_number > 0 as well.
 *
 * Every freed pointer is reset to a null pointer and the row tables are
 * only walked when non-null, so invoking CLOCK_FREE a second time frees
 * nothing twice. The counts themselves are left unchanged.
 * (The null pointer is written as 0 because this header does not itself
 * include a header that defines NULL.)
 */
#define CLOCK_FREE ({                                                \
    if (clock_thread_num > 0) {                                      \
        free(thread_start);                                          \
        thread_start = 0;                                            \
        free(thread_end);                                            \
        thread_end = 0;                                              \
        free(thread_compute_start);                                  \
        thread_compute_start = 0;                                    \
        free(thread_compute_end);                                    \
        thread_compute_end = 0;                                      \
        if (step_number > 0) {                                       \
            free(global_thread_start_step);                          \
            global_thread_start_step = 0;                            \
            free(global_thread_end_step);                            \
            global_thread_end_step = 0;                              \
            if (thread_start_step) {                                 \
                for (int j = 0; j < step_number; j++) {              \
                    free(thread_start_step[j]);                      \
                }                                                    \
                free(thread_start_step);                             \
                thread_start_step = 0;                               \
            }                                                        \
            if (thread_end_step) {                                   \
                for (int j = 0; j < step_number; j++) {              \
                    free(thread_end_step[j]);                        \
                }                                                    \
                free(thread_end_step);                               \
                thread_end_step = 0;                                 \
            }                                                        \
        }                                                            \
    }                                                                \
})




#endif

