#ifndef _CLOCK_H_ #define _CLOCK_H_ #include #include "nrc_os_config.h" #if TARGET_OS == LINUX #include #endif /** * The macros should be called in the following order: * - CLOCK_DEC; * - CLOCK_INIT(num_threads, num_steps); * - CLOCK_APP_START; * - CLOCK_APP_CREATE; * - CLOCK_THREAD_START(thread_id); * - CLOCK_THREAD_COMPUTE_START(thread_id; * - CLOCK_THREAD_START_STEP(thread_id, step_id) * - CLOCK_THREAD_END_STEP(thread_id, step_id) * - (repeat num_steps times) * - CLOCK_THREAD_COMPUTE_END(thread_id); * - CLOCK_THREAD_END(thread_id) * - CLOCK_APP_JOIN; * - CLOCK_APP_END; * - CLOCK_FINALIZE(num_threads); * - PRINT_CLOCK; * - CLOCK_FREE; */ static void local_sort_asc(uint64_t tab[], int32_t size) { int32_t tmp; int32_t i, j; for (i = 0; i < size; i++) { uint64_t min = tab[i]; int32_t jmin = i; for (j = i + 1; j < size; j++) { if (tab[j] < min) { jmin = j; min = tab[j]; } } tmp = tab[i]; tab[i] = min; tab[jmin] = tmp; } } #define CLOCK_DEC uint64_t app_start; \ uint64_t app_end; \ uint64_t app_create; \ uint64_t app_join; \ uint64_t * thread_start; \ uint64_t * thread_end; \ uint64_t * thread_compute_start; \ uint64_t * thread_compute_end; \ int32_t step_number; \ int32_t clock_thread_num; \ uint64_t ** thread_start_step; \ uint64_t ** thread_end_step; \ uint64_t global_thread_start; \ uint64_t global_thread_end; \ uint64_t global_thread_compute_start; \ uint64_t global_thread_compute_end; \ uint64_t * global_thread_start_step; \ uint64_t * global_thread_end_step; #if TARGET_OS == GIETVM #define CLOCK(x) ({ x = giet_proctime(); }) #elif TARGET_OS == LINUX #define CLOCK(x) ({ \ struct timeval full_time; \ gettimeofday(&full_time, NULL); \ x = (uint64_t) ((full_time.tv_usec + full_time.tv_sec * 1000000)); \ }) #endif // x = number of threads, y = number of steps #define CLOCK_INIT(x, y) ({ \ clock_thread_num = (x); \ step_number = (y); \ global_thread_start = 0xFFFFFFFFFFFFFFFFLLU; \ global_thread_end = 0; \ global_thread_compute_start = 0xFFFFFFFFFFFFFFFFLLU; \ global_thread_compute_end = 0; \ if ((x) > 0) { \ thread_start = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ thread_end = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ thread_compute_start = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ thread_compute_end = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ if ((y) > 0) { \ global_thread_start_step = (uint64_t *) malloc(sizeof(uint64_t) * (y)); \ global_thread_end_step = (uint64_t *) malloc(sizeof(uint64_t) * (y)); \ thread_start_step = (uint64_t **) malloc(sizeof(uint64_t *) * (y)); \ thread_end_step = (uint64_t **) malloc(sizeof(uint64_t *) * (y)); \ for (int32_t j = 0; j < (y); j++) { \ global_thread_start_step[j] = 0xFFFFFFFFFFFFFFFFLU; \ global_thread_end_step[j] = 0; \ thread_start_step[j] = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ thread_end_step[j] = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ } \ } \ } \ }) #define CLOCK_APP_START ({ CLOCK(app_start); }) #define CLOCK_APP_END ({ CLOCK(app_end); }) #define CLOCK_APP_CREATE ({ CLOCK(app_create); }) #define CLOCK_APP_JOIN ({ CLOCK(app_join); }) #define CLOCK_THREAD_START(x) ({ CLOCK(thread_start[x]); }) #define CLOCK_THREAD_END(x) ({ CLOCK(thread_end[x]); }) #define CLOCK_THREAD_COMPUTE_START(x) ({ CLOCK(thread_compute_start[x]); }) #define CLOCK_THREAD_COMPUTE_END(x) ({ CLOCK(thread_compute_end[x]); }) #define CLOCK_THREAD_START_STEP(x, y) ({ CLOCK(thread_start_step[y][x]); }) #define CLOCK_THREAD_END_STEP(x, y) ({ CLOCK(thread_end_step[y][x]); }) #define CLOCK_FINALIZE ({ \ for (int32_t i = 0; i < clock_thread_num; i++) { \ if (thread_start[i] < global_thread_start) { \ global_thread_start = thread_start[i]; \ } \ if (thread_compute_start[i] < global_thread_compute_start) { \ global_thread_compute_start = thread_compute_start[i]; \ } \ if (thread_end[i] > global_thread_end) { \ global_thread_end = thread_end[i]; \ } \ if (thread_compute_end[i] > global_thread_compute_end) { \ global_thread_compute_end = thread_compute_end[i]; \ } \ for (int32_t j = 0; j < step_number; j++) { \ if (thread_start_step[j][i] < global_thread_start_step[j]) { \ global_thread_start_step[j] = thread_start_step[j][i]; \ } \ if (thread_end_step[j][i] > global_thread_end_step[j]) { \ global_thread_end_step[j] = thread_end_step[j][i]; \ } \ } \ } \ }) #define PRINT_CLOCK ({ \ MCA_VERBOSE1(printf("Timestamps:\n")); \ MCA_VERBOSE1(printf("[APP_START] : %llu\n", app_start)); \ MCA_VERBOSE1(printf("[APP_CREATE] : %llu\n", app_create)); \ MCA_VERBOSE1(printf("[THREAD_START] : %llu\n", global_thread_start)); \ MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %llu\n", global_thread_compute_start)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("[THREAD_START_STEP_%d] : %llu\n", j, global_thread_start_step[j])); \ MCA_VERBOSE1(printf("[THREAD_END_STEP_%d] : %llu\n", j, global_thread_end_step[j])); \ } \ MCA_VERBOSE1(printf("[THREAD_COMPUTE_END] : %llu\n", global_thread_compute_end)); \ MCA_VERBOSE1(printf("[THREAD_END] : %llu\n", global_thread_end)); \ MCA_VERBOSE1(printf("[APP_JOIN] : %llu\n", app_join)); \ MCA_VERBOSE1(printf("[APP_END] : %llu\n", app_end)); \ MCA_VERBOSE1(printf("Durations (in cycles):\n")); \ MCA_VERBOSE1(printf("[TOTAL] : %llu\n", app_end - app_start)); \ MCA_VERBOSE1(printf("[THREAD] : %llu\n", app_join - app_create)); \ MCA_VERBOSE1(printf("[PARALLEL] : %llu\n", global_thread_end - global_thread_start)); \ MCA_VERBOSE1(printf("[PARALLEL_COMPUTE] : %llu\n", global_thread_compute_end - global_thread_compute_start)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("[THREAD_STEP_%d] : %llu\n", j, global_thread_end_step[j] - global_thread_start_step[j])); \ } \ MCA_VERBOSE1(printf("\n")); \ MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n")); \ local_sort_asc(thread_start, clock_thread_num); \ local_sort_asc(thread_compute_start, clock_thread_num); \ local_sort_asc(thread_compute_end, clock_thread_num); \ local_sort_asc(thread_end, clock_thread_num); \ for (int32_t j = 0; j < step_number; j++) { \ local_sort_asc(thread_start_step[j], clock_thread_num); \ local_sort_asc(thread_end_step[j], clock_thread_num); \ } \ MCA_VERBOSE1(printf("# cycle thread_id\n")); \ for (int32_t i = 0; i < clock_thread_num; i++) { \ MCA_VERBOSE1(printf("%llu\t%d\n", thread_start[i], i)); \ MCA_VERBOSE1(printf("%llu\t%d\n", thread_compute_start[i], i)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("%llu\t%d\n", thread_start_step[j][i], i)); \ MCA_VERBOSE1(printf("%llu\t%d\n", thread_end_step[j][i], i)); \ } \ MCA_VERBOSE1(printf("%llu\t%d\n", thread_compute_end[i], i)); \ MCA_VERBOSE1(printf("%llu\t%d\n", thread_end[i], i)); \ } \ }) #define CLOCK_FREE ({ \ if (clock_thread_num > 0) { \ free(thread_start); \ free(thread_end); \ free(thread_compute_start); \ free(thread_compute_end); \ if (step_number > 0) { \ free(global_thread_start_step); \ free(global_thread_end_step); \ for (int32_t j = 0; j < step_number; j++) { \ free(thread_start_step[j]); \ free(thread_end_step[j]); \ } \ free(thread_start_step); \ free(thread_end_step); \ } \ } \ }) #endif