
#ifndef _CLOCK_H_
#define _CLOCK_H_

#include <stdint.h>

#include "nrc_os_config.h"
#if TARGET_OS == LINUX
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>
    #include <x86intrin.h>

    /* cl_size_t: unsigned integer type used for every clock value (64 bits on this target). */
    typedef uint64_t cl_size_t;
    /* All-ones value of cl_size_t; CLOCK_INIT uses it to seed the "earliest start" minima. */
    #define MAX_CLOCK_VAL 0xFFFFFFFFFFFFFFFFLU
#elif TARGET_OS == GIETVM
    /* cl_size_t: unsigned integer type used for every clock value (32 bits on this target). */
    typedef uint32_t cl_size_t;
    /* All-ones value of cl_size_t; CLOCK_INIT uses it to seed the "earliest start" minima. */
    #define MAX_CLOCK_VAL 0xFFFFFFFF
#endif

/**
 * The macros should be called in the following order:
 * - CLOCK_DEC;
 * - CLOCK_INIT(num_threads, num_steps);
 * - CLOCK_APP_START;
 * - CLOCK_APP_CREATE;
 * - CLOCK_THREAD_START(thread_id);
 * - Repeat num_runs times:
 *     - CLOCK_THREAD_COMPUTE_START(thread_id);
 *     - Repeat num_steps times:
 *         - CLOCK_THREAD_START_STEP(thread_id, step_id);
 *         - CLOCK_THREAD_END_STEP(thread_id, step_id);
 *     - CLOCK_THREAD_COMPUTE_END(thread_id);
 *     - CLOCK_ACCUMULATE;
 * - CLOCK_THREAD_END(thread_id);
 * - CLOCK_APP_JOIN;
 * - CLOCK_APP_END;
 * - CLOCK_FINALIZE;   (object-like macro: it takes no argument)
 * - PRINT_CLOCK;
 * - CLOCK_FREE;
 * When there are several runs, the THREAD_COMPUTE time and all the THREAD_STEP times
 * are averaged over all the runs; the other times are mostly irrelevant in that case.
 * TODO: make a struct gathering all variables and change macros to functions
 */


/**
 * Sort the first `size` entries of `tab` in ascending order, in place.
 *
 * Plain selection sort: for every position, locate the smallest remaining
 * value and exchange it with the value currently stored at that position.
 * Nothing happens when `size` is zero or negative.
 *
 * @param tab  array of clock values to reorder
 * @param size number of entries of `tab` to sort
 */
static void local_sort_asc(cl_size_t tab[], int32_t size) {
    for (int32_t pos = 0; pos < size; pos++) {
        /* Index of the smallest value found in tab[pos .. size-1]. */
        int32_t lowest = pos;
        for (int32_t probe = pos + 1; probe < size; probe++) {
            if (tab[probe] < tab[lowest]) {
                lowest = probe;
            }
        }
        /* Exchange; a harmless self-assignment when lowest == pos. */
        cl_size_t held = tab[pos];
        tab[pos] = tab[lowest];
        tab[lowest] = held;
    }
}



/*
 * CLOCK_DEC declares every variable the other CLOCK_* macros read or write.
 * It only declares them: nothing is initialised or allocated here, that is
 * done by CLOCK_INIT.
 *
 *   app_start, app_end, app_create, app_join
 *       Single timestamps written by CLOCK_APP_START, CLOCK_APP_END,
 *       CLOCK_APP_CREATE and CLOCK_APP_JOIN.
 *   thread_start, thread_end, thread_compute_start, thread_compute_end
 *       Arrays indexed by thread id, written by the matching CLOCK_THREAD_*
 *       macros.
 *   step_number, clock_thread_num
 *       Number of steps and number of threads, as given to CLOCK_INIT.
 *   clock_num_runs
 *       Number of runs folded so far; incremented by CLOCK_ACCUMULATE.
 *   thread_start_step, thread_end_step
 *       Two-level arrays indexed [step][thread], written by
 *       CLOCK_THREAD_START_STEP and CLOCK_THREAD_END_STEP.
 *   global_thread_start, global_thread_end,
 *   global_thread_compute_start, global_thread_compute_end
 *       Minimum of the per-thread start values / maximum of the per-thread
 *       end values, computed by CLOCK_ACCUMULATE and CLOCK_FINALIZE.
 *   global_thread_start_step, global_thread_end_step
 *       Same minimum / maximum, kept per step (indexed by step).
 *   accumulated_thread_compute, accumulated_thread_step
 *       Sums over the runs of (global end - global start), for the compute
 *       section and for each step; PRINT_CLOCK divides them by
 *       clock_num_runs.
 */
#define CLOCK_DEC cl_size_t app_start;                   \
                  cl_size_t app_end;                     \
                  cl_size_t app_create;                  \
                  cl_size_t app_join;                    \
                  cl_size_t * thread_start;              \
                  cl_size_t * thread_end;                \
                  cl_size_t * thread_compute_start;      \
                  cl_size_t * thread_compute_end;        \
                  int32_t step_number;                  \
                  int32_t clock_thread_num;             \
                  int32_t clock_num_runs;               \
                  cl_size_t ** thread_start_step;        \
                  cl_size_t ** thread_end_step;          \
                  cl_size_t global_thread_start;         \
                  cl_size_t global_thread_end;           \
                  cl_size_t global_thread_compute_start; \
                  cl_size_t global_thread_compute_end;   \
                  cl_size_t accumulated_thread_compute;  \
                  cl_size_t * global_thread_start_step;  \
                  cl_size_t * global_thread_end_step;    \
                  cl_size_t * accumulated_thread_step;

/*
 * CLOCK(x): store the current value of the target's time source into the
 * lvalue x.
 *   - GIETVM: the value returned by giet_proctime().
 *   - LINUX : the value returned by __rdtsc().
 */
#if TARGET_OS == GIETVM
    #define CLOCK(x)  ({ (x) = giet_proctime(); })
#elif TARGET_OS == LINUX
    /*
     * Disabled alternative, kept for reference: fill a struct timeval with
     * gettimeofday(&full_time, NULL) and store
     *     (cl_size_t) (full_time.tv_usec + full_time.tv_sec * 1000000)
     * into x instead of the value of __rdtsc().
     */
    #define CLOCK(x) ({ (x) = __rdtsc(); })
#endif

/*
 * CLOCK_INIT(x, y): reset the clock state and allocate the timestamp arrays.
 *   x: number of threads -> stored in clock_thread_num
 *   y: number of steps   -> stored in step_number
 *
 * Both arguments are evaluated exactly once. Every later size and loop bound
 * uses the stored copies, so an argument with side effects (or a costly
 * function call) is not repeated, and an argument that mentions a variable
 * named like the macro's own loop index cannot be captured by that index.
 *
 * Scalars: clock_num_runs and accumulated_thread_compute are zeroed, the
 * global "start" values are set to MAX_CLOCK_VAL and the global "end" values
 * to 0, so that the min/max scans of CLOCK_ACCUMULATE and CLOCK_FINALIZE
 * start from neutral values.
 *
 * Arrays: the per-thread arrays are allocated only when x > 0, and the
 * per-step arrays only when both x > 0 and y > 0 (CLOCK_FREE applies the same
 * conditions). thread_start_step and thread_end_step are indexed
 * [step][thread]. The malloc() results are not checked, and the per-thread
 * timestamps are left uninitialised until the CLOCK_THREAD_* macros write
 * them.
 */
#define CLOCK_INIT(x, y) ({                                                                        \
    clock_thread_num = (x);                                                                        \
    step_number = (y);                                                                             \
    clock_num_runs = 0;                                                                            \
    global_thread_start = MAX_CLOCK_VAL;                                                           \
    global_thread_end = 0;                                                                         \
    global_thread_compute_start = MAX_CLOCK_VAL;                                                   \
    global_thread_compute_end = 0;                                                                 \
    accumulated_thread_compute = 0;                                                                \
    if (clock_thread_num > 0) {                                                                    \
        thread_start = (cl_size_t *) malloc(sizeof(cl_size_t) * clock_thread_num);                 \
        thread_end = (cl_size_t *) malloc(sizeof(cl_size_t) * clock_thread_num);                   \
        thread_compute_start = (cl_size_t *) malloc(sizeof(cl_size_t) * clock_thread_num);         \
        thread_compute_end = (cl_size_t *) malloc(sizeof(cl_size_t) * clock_thread_num);           \
        if (step_number > 0) {                                                                     \
            global_thread_start_step = (cl_size_t *) malloc(sizeof(cl_size_t) * step_number);      \
            global_thread_end_step = (cl_size_t *) malloc(sizeof(cl_size_t) * step_number);        \
            thread_start_step = (cl_size_t **) malloc(sizeof(cl_size_t *) * step_number);          \
            thread_end_step = (cl_size_t **) malloc(sizeof(cl_size_t *) * step_number);            \
            accumulated_thread_step = (cl_size_t *) malloc(sizeof(cl_size_t) * step_number);       \
            for (int32_t cl_step = 0; cl_step < step_number; cl_step++) {                          \
                global_thread_start_step[cl_step] = MAX_CLOCK_VAL;                                 \
                global_thread_end_step[cl_step] = 0;                                               \
                accumulated_thread_step[cl_step] = 0;                                              \
                thread_start_step[cl_step] =                                                       \
                    (cl_size_t *) malloc(sizeof(cl_size_t) * clock_thread_num);                    \
                thread_end_step[cl_step] =                                                         \
                    (cl_size_t *) malloc(sizeof(cl_size_t) * clock_thread_num);                    \
            }                                                                                      \
        }                                                                                          \
    }                                                                                              \
})


/*
 * Timestamp recorders: each macro stores the current CLOCK() value into one
 * of the variables declared by CLOCK_DEC.
 *   - CLOCK_APP_*            : single application-level timestamps.
 *   - CLOCK_THREAD_*(tid)    : slot `tid` of the matching per-thread array.
 *   - CLOCK_THREAD_*_STEP(tid, step)
 *                            : slot [step][tid] of the per-step arrays; note
 *                              that the thread id is the FIRST macro argument
 *                              but the SECOND array index.
 */
#define CLOCK_APP_START                      CLOCK(app_start)
#define CLOCK_APP_END                        CLOCK(app_end)
#define CLOCK_APP_CREATE                     CLOCK(app_create)
#define CLOCK_APP_JOIN                       CLOCK(app_join)
#define CLOCK_THREAD_START(tid)              CLOCK(thread_start[tid])
#define CLOCK_THREAD_END(tid)                CLOCK(thread_end[tid])
#define CLOCK_THREAD_COMPUTE_START(tid)      CLOCK(thread_compute_start[tid])
#define CLOCK_THREAD_COMPUTE_END(tid)        CLOCK(thread_compute_end[tid])
#define CLOCK_THREAD_START_STEP(tid, step)   CLOCK(thread_start_step[step][tid])
#define CLOCK_THREAD_END_STEP(tid, step)     CLOCK(thread_end_step[step][tid])

/*
 * CLOCK_ACCUMULATE: fold the timestamps of one run into the running totals.
 *
 * For the compute section and for every step, the span of the run is
 * (latest end over all threads) - (earliest start over all threads). Each
 * span is added to accumulated_thread_compute / accumulated_thread_step[],
 * then the global compute and per-step min/max trackers are put back to their
 * neutral values (MAX_CLOCK_VAL for starts, 0 for ends) so the next run
 * starts clean. Finally clock_num_runs is incremented.
 */
#define CLOCK_ACCUMULATE ({                                                                    \
    for (int32_t cl_tid = 0; cl_tid < clock_thread_num; cl_tid++) {                            \
        if (global_thread_compute_start > thread_compute_start[cl_tid]) {                      \
            global_thread_compute_start = thread_compute_start[cl_tid];                        \
        }                                                                                      \
        if (global_thread_compute_end < thread_compute_end[cl_tid]) {                          \
            global_thread_compute_end = thread_compute_end[cl_tid];                            \
        }                                                                                      \
    }                                                                                          \
    for (int32_t cl_step = 0; cl_step < step_number; cl_step++) {                              \
        for (int32_t cl_tid = 0; cl_tid < clock_thread_num; cl_tid++) {                        \
            if (global_thread_start_step[cl_step] > thread_start_step[cl_step][cl_tid]) {      \
                global_thread_start_step[cl_step] = thread_start_step[cl_step][cl_tid];        \
            }                                                                                  \
            if (global_thread_end_step[cl_step] < thread_end_step[cl_step][cl_tid]) {          \
                global_thread_end_step[cl_step] = thread_end_step[cl_step][cl_tid];            \
            }                                                                                  \
        }                                                                                      \
        accumulated_thread_step[cl_step] +=                                                    \
            global_thread_end_step[cl_step] - global_thread_start_step[cl_step];               \
        global_thread_start_step[cl_step] = MAX_CLOCK_VAL;                                     \
        global_thread_end_step[cl_step] = 0;                                                   \
    }                                                                                          \
    accumulated_thread_compute += global_thread_compute_end - global_thread_compute_start;     \
    global_thread_compute_start = MAX_CLOCK_VAL;                                               \
    global_thread_compute_end = 0;                                                             \
    clock_num_runs++;                                                                          \
})


/*
 * CLOCK_FINALIZE: compute the global timestamps that PRINT_CLOCK reports.
 *
 * If CLOCK_ACCUMULATE was never invoked (clock_num_runs == 0) it is invoked
 * once here, so the accumulators hold one run and clock_num_runs is no longer
 * zero. Then, over all threads, every global "start" value becomes the
 * minimum of itself and the per-thread starts, and every global "end" value
 * the maximum of itself and the per-thread ends; the same is done per step.
 */
#define CLOCK_FINALIZE ({                                                                      \
    if (clock_num_runs == 0) {                                                                 \
        CLOCK_ACCUMULATE;                                                                      \
    }                                                                                          \
    for (int32_t cl_tid = 0; cl_tid < clock_thread_num; cl_tid++) {                            \
        if (global_thread_start > thread_start[cl_tid]) {                                      \
            global_thread_start = thread_start[cl_tid];                                        \
        }                                                                                      \
        if (global_thread_end < thread_end[cl_tid]) {                                          \
            global_thread_end = thread_end[cl_tid];                                            \
        }                                                                                      \
        if (global_thread_compute_start > thread_compute_start[cl_tid]) {                      \
            global_thread_compute_start = thread_compute_start[cl_tid];                        \
        }                                                                                      \
        if (global_thread_compute_end < thread_compute_end[cl_tid]) {                          \
            global_thread_compute_end = thread_compute_end[cl_tid];                            \
        }                                                                                      \
    }                                                                                          \
    for (int32_t cl_step = 0; cl_step < step_number; cl_step++) {                              \
        for (int32_t cl_tid = 0; cl_tid < clock_thread_num; cl_tid++) {                        \
            if (global_thread_start_step[cl_step] > thread_start_step[cl_step][cl_tid]) {      \
                global_thread_start_step[cl_step] = thread_start_step[cl_step][cl_tid];        \
            }                                                                                  \
            if (global_thread_end_step[cl_step] < thread_end_step[cl_step][cl_tid]) {          \
                global_thread_end_step[cl_step] = thread_end_step[cl_step][cl_tid];            \
            }                                                                                  \
        }                                                                                      \
    }                                                                                          \
})


#if TARGET_OS == LINUX

#define PRINT_CLOCK ({                                                                                                        \
    MCA_VERBOSE1(printf("Timestamps:\n"));                                                                                    \
    if (clock_num_runs > 1) {                                                                                                 \
        MCA_VERBOSE1(printf("(THREAD_COMPUTE_START, THREAD_COMPUTE_END, THREAD_START_STEPs and THREAD_END_STEPs)\n"));        \
        MCA_VERBOSE1(printf("(are those of the last run)\n"));                                                                \
    }                                                                                                                         \
    MCA_VERBOSE1(printf("[APP_START]            : %llu\n", (long long unsigned int) app_start));                              \
    MCA_VERBOSE1(printf("[APP_CREATE]           : %llu\n", (long long unsigned int) app_create));                             \
    MCA_VERBOSE1(printf("[THREAD_START]         : %llu\n", (long long unsigned int) global_thread_start));                    \
    MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %llu\n", (long long unsigned int) global_thread_compute_start));            \
    for (int32_t j = 0; j < step_number; j++) {                                                                               \
        MCA_VERBOSE1(printf("[THREAD_START_STEP_%d]  : %llu\n", j, (long long unsigned int) global_thread_start_step[j]));    \
        MCA_VERBOSE1(printf("[THREAD_END_STEP_%d]    : %llu\n", j, (long long unsigned int) global_thread_end_step[j]));      \
    }                                                                                                                         \
    MCA_VERBOSE1(printf("[THREAD_COMPUTE_END]   : %llu\n", (long long unsigned int) global_thread_compute_end));              \
    MCA_VERBOSE1(printf("[THREAD_END]           : %llu\n", (long long unsigned int) global_thread_end));                      \
    MCA_VERBOSE1(printf("[APP_JOIN]             : %llu\n", (long long unsigned int) app_join));                               \
    MCA_VERBOSE1(printf("[APP_END]              : %llu\n", (long long unsigned int) app_end));                                \
    MCA_VERBOSE1(printf("Durations (in cycles):\n"));                                                                         \
    if (clock_num_runs > 1) {                                                                                                 \
        MCA_VERBOSE1(printf("(PARALLEL_COMPUTE and THREAD_STEPs are averaged over %d runs)\n", clock_num_runs));              \
    }                                                                                                                         \
    MCA_VERBOSE1(printf("[TOTAL]                : %llu\n", (long long unsigned int) app_end - app_start));                    \
    MCA_VERBOSE1(printf("[THREAD]               : %llu\n", (long long unsigned int) app_join - app_create));                  \
    MCA_VERBOSE1(printf("[PARALLEL]             : %llu\n", (long long unsigned int) global_thread_end - global_thread_start));\
    MCA_VERBOSE1(printf("[PARALLEL_COMPUTE]     : %llu\n", (long long unsigned int) accumulated_thread_compute / clock_num_runs)); \
    for (int32_t j = 0; j < step_number; j++) {                                                                               \
        MCA_VERBOSE1(printf("[THREAD_STEP_%d]        : %llu\n", j, (long long unsigned int) accumulated_thread_step[j] / clock_num_runs)); \
    }                                                                                                                         \
    MCA_VERBOSE1(printf("\n"));                                                                                               \
    MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n"));                                       \
    local_sort_asc(thread_start, clock_thread_num);                                                                           \
    local_sort_asc(thread_compute_start, clock_thread_num);                                                                   \
    local_sort_asc(thread_compute_end, clock_thread_num);                                                                     \
    local_sort_asc(thread_end, clock_thread_num);                                                                             \
    for (int32_t j = 0; j < step_number; j++) {                                                                               \
        local_sort_asc(thread_start_step[j], clock_thread_num);                                                               \
        local_sort_asc(thread_end_step[j], clock_thread_num);                                                                 \
    }                                                                                                                         \
    MCA_VERBOSE1(printf("# cycle     thread_id\n"));                                                                          \
    for (int32_t i = 0; i < clock_thread_num; i++) {                                                                          \
        MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_start[i] - app_start, i));                          \
        MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_compute_start[i] - app_start, i));                  \
        for (int32_t j = 0; j < step_number; j++) {                                                                           \
            MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_start_step[j][i] - app_start, i));              \
            MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_end_step[j][i] - app_start, i));                \
        }                                                                                                                     \
        MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_compute_end[i] - app_start, i));                    \
        MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_end[i] - app_start, i));                            \
    }                                                                                                                         \
})

#elif TARGET_OS == GIETVM

/*
 * PRINT_CLOCK (GIETVM flavour): report all recorded times through
 * MCA_VERBOSE1(printf(...)).
 *
 * Three sections are emitted:
 *   1. "Timestamps": the application timestamps and the global per-thread /
 *      per-step minima and maxima.
 *   2. "Durations (in cycles)": differences between those timestamps, plus
 *      the accumulated compute / step times divided by clock_num_runs
 *      (clock_num_runs is non-zero once CLOCK_FINALIZE has run).
 *   3. A gnuplot-style table of every per-thread timestamp relative to
 *      app_start.
 *
 * Clock values are unsigned 32-bit quantities on this target, so they are
 * printed with "%u" (after an explicit conversion to unsigned int) rather
 * than "%d": with "%d" any value above INT_MAX, including MAX_CLOCK_VAL, was
 * shown as a negative number. Step indices and clock_num_runs are signed and
 * keep "%d".
 * NOTE(review): assumes the target's printf implements "%u" - confirm.
 *
 * Before section 3 every per-thread array is sorted in place with
 * local_sort_asc(), outside of MCA_VERBOSE1. Each array is sorted on its own,
 * so the second column of the table is the position in the sorted array, not
 * the id of the thread that recorded the value.
 */
#define PRINT_CLOCK ({                                                                                                 \
    MCA_VERBOSE1(printf("Timestamps:\n"));                                                                             \
    if (clock_num_runs > 1) {                                                                                          \
        MCA_VERBOSE1(printf("(THREAD_COMPUTE_START, THREAD_COMPUTE_END, THREAD_START_STEPs and THREAD_END_STEPs)\n")); \
        MCA_VERBOSE1(printf("(are those of the last run)\n"));                                                         \
    }                                                                                                                  \
    MCA_VERBOSE1(printf("[APP_START]            : %u\n", (unsigned int) app_start));                                   \
    MCA_VERBOSE1(printf("[APP_CREATE]           : %u\n", (unsigned int) app_create));                                  \
    MCA_VERBOSE1(printf("[THREAD_START]         : %u\n", (unsigned int) global_thread_start));                         \
    MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %u\n", (unsigned int) global_thread_compute_start));                 \
    for (int32_t j = 0; j < step_number; j++) {                                                                        \
        MCA_VERBOSE1(printf("[THREAD_START_STEP_%d]  : %u\n", j, (unsigned int) global_thread_start_step[j]));         \
        MCA_VERBOSE1(printf("[THREAD_END_STEP_%d]    : %u\n", j, (unsigned int) global_thread_end_step[j]));           \
    }                                                                                                                  \
    MCA_VERBOSE1(printf("[THREAD_COMPUTE_END]   : %u\n", (unsigned int) global_thread_compute_end));                   \
    MCA_VERBOSE1(printf("[THREAD_END]           : %u\n", (unsigned int) global_thread_end));                           \
    MCA_VERBOSE1(printf("[APP_JOIN]             : %u\n", (unsigned int) app_join));                                    \
    MCA_VERBOSE1(printf("[APP_END]              : %u\n", (unsigned int) app_end));                                     \
    MCA_VERBOSE1(printf("Durations (in cycles):\n"));                                                                  \
    if (clock_num_runs > 1) {                                                                                          \
        MCA_VERBOSE1(printf("(PARALLEL_COMPUTE and THREAD_STEPs are averaged over %d runs)\n", clock_num_runs));       \
    }                                                                                                                  \
    MCA_VERBOSE1(printf("[TOTAL]                : %u\n", (unsigned int) (app_end - app_start)));                       \
    MCA_VERBOSE1(printf("[THREAD]               : %u\n", (unsigned int) (app_join - app_create)));                     \
    MCA_VERBOSE1(printf("[PARALLEL]             : %u\n", (unsigned int) (global_thread_end - global_thread_start)));   \
    MCA_VERBOSE1(printf("[PARALLEL_COMPUTE]     : %u\n", (unsigned int) (accumulated_thread_compute / clock_num_runs))); \
    for (int32_t j = 0; j < step_number; j++) {                                                                        \
        MCA_VERBOSE1(printf("[THREAD_STEP_%d]        : %u\n", j, (unsigned int) (accumulated_thread_step[j] / clock_num_runs))); \
    }                                                                                                                  \
    MCA_VERBOSE1(printf("\n"));                                                                                        \
    MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n"));                                \
    local_sort_asc(thread_start, clock_thread_num);                                                                    \
    local_sort_asc(thread_compute_start, clock_thread_num);                                                            \
    local_sort_asc(thread_compute_end, clock_thread_num);                                                              \
    local_sort_asc(thread_end, clock_thread_num);                                                                      \
    for (int32_t j = 0; j < step_number; j++) {                                                                        \
        local_sort_asc(thread_start_step[j], clock_thread_num);                                                        \
        local_sort_asc(thread_end_step[j], clock_thread_num);                                                          \
    }                                                                                                                  \
    MCA_VERBOSE1(printf("# cycle     thread_id\n"));                                                                   \
    for (int32_t i = 0; i < clock_thread_num; i++) {                                                                   \
        MCA_VERBOSE1(printf("%u\t%d\n", (unsigned int) (thread_start[i] - app_start), i));                             \
        MCA_VERBOSE1(printf("%u\t%d\n", (unsigned int) (thread_compute_start[i] - app_start), i));                     \
        for (int32_t j = 0; j < step_number; j++) {                                                                    \
            MCA_VERBOSE1(printf("%u\t%d\n", (unsigned int) (thread_start_step[j][i] - app_start), i));                 \
            MCA_VERBOSE1(printf("%u\t%d\n", (unsigned int) (thread_end_step[j][i] - app_start), i));                   \
        }                                                                                                              \
        MCA_VERBOSE1(printf("%u\t%d\n", (unsigned int) (thread_compute_end[i] - app_start), i));                       \
        MCA_VERBOSE1(printf("%u\t%d\n", (unsigned int) (thread_end[i] - app_start), i));                               \
    }                                                                                                                  \
})


#endif




/*
 * CLOCK_FREE: release every array allocated by CLOCK_INIT.
 *
 * The guards mirror CLOCK_INIT: the per-thread arrays exist only when
 * clock_thread_num > 0, and the per-step arrays only when step_number > 0 as
 * well.
 *
 * Every pointer is set to NULL right after it is freed, and the rows of the
 * two-level arrays are only visited while the outer array is non-NULL.
 * Invoking CLOCK_FREE a second time is therefore harmless (free(NULL) does
 * nothing) instead of freeing the same blocks twice, and no dangling pointer
 * is left behind.
 */
#define CLOCK_FREE ({                                                \
    if (clock_thread_num > 0) {                                      \
        free(thread_start);                                          \
        thread_start = NULL;                                         \
        free(thread_end);                                            \
        thread_end = NULL;                                           \
        free(thread_compute_start);                                  \
        thread_compute_start = NULL;                                 \
        free(thread_compute_end);                                    \
        thread_compute_end = NULL;                                   \
        if (step_number > 0) {                                       \
            free(global_thread_start_step);                          \
            global_thread_start_step = NULL;                         \
            free(global_thread_end_step);                            \
            global_thread_end_step = NULL;                           \
            free(accumulated_thread_step);                           \
            accumulated_thread_step = NULL;                          \
            for (int32_t j = 0; j < step_number; j++) {              \
                if (thread_start_step != NULL) {                     \
                    free(thread_start_step[j]);                      \
                }                                                    \
                if (thread_end_step != NULL) {                       \
                    free(thread_end_step[j]);                        \
                }                                                    \
            }                                                        \
            free(thread_start_step);                                 \
            thread_start_step = NULL;                                \
            free(thread_end_step);                                   \
            thread_end_step = NULL;                                  \
        }                                                            \
    }                                                                \
})




#endif

