#ifndef _CLOCK_H_ #define _CLOCK_H_ #include #include "nrc_os_config.h" #if TARGET_OS == LINUX #include #include typedef uint64_t cl_size_t; #define MAX_CLOCK_VAL 0xFFFFFFFFFFFFFFFFLU #elif TARGET_OS == GIETVM typedef uint32_t cl_size_t; #define MAX_CLOCK_VAL 0xFFFFFFFF #endif /** * The macros should be called in the following order: * - CLOCK_DEC; * - CLOCK_INIT(num_threads, num_steps); * - CLOCK_APP_START; * - CLOCK_APP_CREATE; * - CLOCK_THREAD_START(thread_id); * - Repeat num_runs times: * - CLOCK_THREAD_COMPUTE_START(thread_id; * - Repeat num_step times: * - CLOCK_THREAD_START_STEP(thread_id, step_id) * - CLOCK_THREAD_END_STEP(thread_id, step_id) * - CLOCK_THREAD_COMPUTE_END(thread_id); * - CLOCK_ACCUMULATE; * - CLOCK_THREAD_END(thread_id) * - CLOCK_APP_JOIN; * - CLOCK_APP_END; * - CLOCK_FINALIZE(num_threads); * - PRINT_CLOCK; * - CLOCK_FREE; * In case of several runs, the THREAD_COMPUTE and all the THREAD_STEP resulting times * are averaged over all the runs. The other times are kind of irrelevant. * TODO: make a struct gathering all variables and change macros to functions */ static void local_sort_asc(cl_size_t tab[], int32_t size) { cl_size_t tmp; int32_t i, j; for (i = 0; i < size; i++) { cl_size_t min = tab[i]; int32_t jmin = i; for (j = i + 1; j < size; j++) { if (tab[j] < min) { jmin = j; min = tab[j]; } } tmp = tab[i]; tab[i] = min; tab[jmin] = tmp; } } #define CLOCK_DEC cl_size_t app_start; \ cl_size_t app_end; \ cl_size_t app_create; \ cl_size_t app_join; \ cl_size_t * thread_start; \ cl_size_t * thread_end; \ cl_size_t * thread_compute_start; \ cl_size_t * thread_compute_end; \ int32_t step_number; \ int32_t clock_thread_num; \ int32_t clock_num_runs; \ cl_size_t ** thread_start_step; \ cl_size_t ** thread_end_step; \ cl_size_t global_thread_start; \ cl_size_t global_thread_end; \ cl_size_t global_thread_compute_start; \ cl_size_t global_thread_compute_end; \ cl_size_t accumulated_thread_compute; \ cl_size_t * global_thread_start_step; \ cl_size_t * global_thread_end_step; \ cl_size_t * accumulated_thread_step; #if TARGET_OS == GIETVM #define CLOCK(x) ({ x = giet_proctime(); }) #elif TARGET_OS == LINUX /*#define CLOCK(x) ({ \ struct timeval full_time; \ gettimeofday(&full_time, NULL); \ x = (cl_size_t) ((full_time.tv_usec + full_time.tv_sec * 1000000)); \ }) */ #define CLOCK(x) ({ x = __rdtsc(); }) #endif // x = number of threads, y = number of steps #define CLOCK_INIT(x, y) ({ \ clock_thread_num = (x); \ step_number = (y); \ clock_num_runs = 0; \ global_thread_start = MAX_CLOCK_VAL; \ global_thread_end = 0; \ global_thread_compute_start = MAX_CLOCK_VAL; \ global_thread_compute_end = 0; \ accumulated_thread_compute = 0; \ if ((x) > 0) { \ thread_start = (cl_size_t *) malloc(sizeof(cl_size_t) * (x)); \ thread_end = (cl_size_t *) malloc(sizeof(cl_size_t) * (x)); \ thread_compute_start = (cl_size_t *) malloc(sizeof(cl_size_t) * (x)); \ thread_compute_end = (cl_size_t *) malloc(sizeof(cl_size_t) * (x)); \ if ((y) > 0) { \ global_thread_start_step = (cl_size_t *) malloc(sizeof(cl_size_t) * (y)); \ global_thread_end_step = (cl_size_t *) malloc(sizeof(cl_size_t) * (y)); \ thread_start_step = (cl_size_t **) malloc(sizeof(cl_size_t *) * (y)); \ thread_end_step = (cl_size_t **) malloc(sizeof(cl_size_t *) * (y)); \ accumulated_thread_step = (cl_size_t *) malloc(sizeof(cl_size_t) * (y)); \ for (int32_t j = 0; j < (y); j++) { \ global_thread_start_step[j] = MAX_CLOCK_VAL; \ global_thread_end_step[j] = 0; \ accumulated_thread_step[j] = 0; \ thread_start_step[j] = (cl_size_t *) malloc(sizeof(cl_size_t) * (x)); \ thread_end_step[j] = (cl_size_t *) malloc(sizeof(cl_size_t) * (x)); \ } \ } \ } \ }) #define CLOCK_APP_START ({ CLOCK(app_start); }) #define CLOCK_APP_END ({ CLOCK(app_end); }) #define CLOCK_APP_CREATE ({ CLOCK(app_create); }) #define CLOCK_APP_JOIN ({ CLOCK(app_join); }) #define CLOCK_THREAD_START(x) ({ CLOCK(thread_start[x]); }) #define CLOCK_THREAD_END(x) ({ CLOCK(thread_end[x]); }) #define CLOCK_THREAD_COMPUTE_START(x) ({ CLOCK(thread_compute_start[x]); }) #define CLOCK_THREAD_COMPUTE_END(x) ({ CLOCK(thread_compute_end[x]); }) #define CLOCK_THREAD_START_STEP(x, y) ({ CLOCK(thread_start_step[y][x]); }) #define CLOCK_THREAD_END_STEP(x, y) ({ CLOCK(thread_end_step[y][x]); }) #define CLOCK_ACCUMULATE ({ \ for (int32_t i = 0; i < clock_thread_num; i++) { \ if (thread_compute_start[i] < global_thread_compute_start) { \ global_thread_compute_start = thread_compute_start[i]; \ } \ if (thread_compute_end[i] > global_thread_compute_end) { \ global_thread_compute_end = thread_compute_end[i]; \ } \ for (int32_t j = 0; j < step_number; j++) { \ if (thread_start_step[j][i] < global_thread_start_step[j]) { \ global_thread_start_step[j] = thread_start_step[j][i]; \ } \ if (thread_end_step[j][i] > global_thread_end_step[j]) { \ global_thread_end_step[j] = thread_end_step[j][i]; \ } \ } \ } \ for (int32_t j = 0; j < step_number; j++) { \ accumulated_thread_step[j] += (global_thread_end_step[j] - global_thread_start_step[j]); \ global_thread_start_step[j] = MAX_CLOCK_VAL; \ global_thread_end_step[j] = 0; \ } \ accumulated_thread_compute += (global_thread_compute_end - global_thread_compute_start); \ global_thread_compute_start = MAX_CLOCK_VAL; \ global_thread_compute_end = 0; \ clock_num_runs++; \ }) #define CLOCK_FINALIZE ({ \ if (clock_num_runs == 0) { \ CLOCK_ACCUMULATE; \ } \ for (int32_t i = 0; i < clock_thread_num; i++) { \ if (thread_start[i] < global_thread_start) { \ global_thread_start = thread_start[i]; \ } \ if (thread_compute_start[i] < global_thread_compute_start) { \ global_thread_compute_start = thread_compute_start[i]; \ } \ if (thread_end[i] > global_thread_end) { \ global_thread_end = thread_end[i]; \ } \ if (thread_compute_end[i] > global_thread_compute_end) { \ global_thread_compute_end = thread_compute_end[i]; \ } \ for (int32_t j = 0; j < step_number; j++) { \ if (thread_start_step[j][i] < global_thread_start_step[j]) { \ global_thread_start_step[j] = thread_start_step[j][i]; \ } \ if (thread_end_step[j][i] > global_thread_end_step[j]) { \ global_thread_end_step[j] = thread_end_step[j][i]; \ } \ } \ } \ }) #if TARGET_OS == LINUX #define PRINT_CLOCK ({ \ MCA_VERBOSE1(printf("Timestamps:\n")); \ if (clock_num_runs > 1) { \ MCA_VERBOSE1(printf("(THREAD_COMPUTE_START, THREAD_COMPUTE_END, THREAD_START_STEPs and THREAD_END_STEPs)\n")); \ MCA_VERBOSE1(printf("(are those of the last run)\n")); \ } \ MCA_VERBOSE1(printf("[APP_START] : %llu\n", (long long unsigned int) app_start)); \ MCA_VERBOSE1(printf("[APP_CREATE] : %llu\n", (long long unsigned int) app_create)); \ MCA_VERBOSE1(printf("[THREAD_START] : %llu\n", (long long unsigned int) global_thread_start)); \ MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %llu\n", (long long unsigned int) global_thread_compute_start)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("[THREAD_START_STEP_%d] : %llu\n", j, (long long unsigned int) global_thread_start_step[j])); \ MCA_VERBOSE1(printf("[THREAD_END_STEP_%d] : %llu\n", j, (long long unsigned int) global_thread_end_step[j])); \ } \ MCA_VERBOSE1(printf("[THREAD_COMPUTE_END] : %llu\n", (long long unsigned int) global_thread_compute_end)); \ MCA_VERBOSE1(printf("[THREAD_END] : %llu\n", (long long unsigned int) global_thread_end)); \ MCA_VERBOSE1(printf("[APP_JOIN] : %llu\n", (long long unsigned int) app_join)); \ MCA_VERBOSE1(printf("[APP_END] : %llu\n", (long long unsigned int) app_end)); \ MCA_VERBOSE1(printf("Durations (in cycles):\n")); \ if (clock_num_runs > 1) { \ MCA_VERBOSE1(printf("(PARALLEL_COMPUTE and THREAD_STEPs are averaged over %d runs)\n", clock_num_runs)); \ } \ MCA_VERBOSE1(printf("[TOTAL] : %llu\n", (long long unsigned int) app_end - app_start)); \ MCA_VERBOSE1(printf("[THREAD] : %llu\n", (long long unsigned int) app_join - app_create)); \ MCA_VERBOSE1(printf("[PARALLEL] : %llu\n", (long long unsigned int) global_thread_end - global_thread_start));\ MCA_VERBOSE1(printf("[PARALLEL_COMPUTE] : %llu\n", (long long unsigned int) accumulated_thread_compute / clock_num_runs)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("[THREAD_STEP_%d] : %llu\n", j, (long long unsigned int) accumulated_thread_step[j] / clock_num_runs)); \ } \ MCA_VERBOSE1(printf("\n")); \ MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n")); \ local_sort_asc(thread_start, clock_thread_num); \ local_sort_asc(thread_compute_start, clock_thread_num); \ local_sort_asc(thread_compute_end, clock_thread_num); \ local_sort_asc(thread_end, clock_thread_num); \ for (int32_t j = 0; j < step_number; j++) { \ local_sort_asc(thread_start_step[j], clock_thread_num); \ local_sort_asc(thread_end_step[j], clock_thread_num); \ } \ MCA_VERBOSE1(printf("# cycle thread_id\n")); \ for (int32_t i = 0; i < clock_thread_num; i++) { \ MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_start[i] - app_start, i)); \ MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_compute_start[i] - app_start, i)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_start_step[j][i] - app_start, i)); \ MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_end_step[j][i] - app_start, i)); \ } \ MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_compute_end[i] - app_start, i)); \ MCA_VERBOSE1(printf("%llu\t%d\n", (long long unsigned int) thread_end[i] - app_start, i)); \ } \ }) #elif TARGET_OS == GIETVM #define PRINT_CLOCK ({ \ MCA_VERBOSE1(printf("Timestamps:\n")); \ if (clock_num_runs > 1) { \ MCA_VERBOSE1(printf("(THREAD_COMPUTE_START, THREAD_COMPUTE_END, THREAD_START_STEPs and THREAD_END_STEPs)\n")); \ MCA_VERBOSE1(printf("(are those of the last run)\n")); \ } \ MCA_VERBOSE1(printf("[APP_START] : %d\n", app_start)); \ MCA_VERBOSE1(printf("[APP_CREATE] : %d\n", app_create)); \ MCA_VERBOSE1(printf("[THREAD_START] : %d\n", global_thread_start)); \ MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %d\n", global_thread_compute_start)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("[THREAD_START_STEP_%d] : %d\n", j, global_thread_start_step[j])); \ MCA_VERBOSE1(printf("[THREAD_END_STEP_%d] : %d\n", j, global_thread_end_step[j])); \ } \ MCA_VERBOSE1(printf("[THREAD_COMPUTE_END] : %d\n", global_thread_compute_end)); \ MCA_VERBOSE1(printf("[THREAD_END] : %d\n", global_thread_end)); \ MCA_VERBOSE1(printf("[APP_JOIN] : %d\n", app_join)); \ MCA_VERBOSE1(printf("[APP_END] : %d\n", app_end)); \ MCA_VERBOSE1(printf("Durations (in cycles):\n")); \ if (clock_num_runs > 1) { \ MCA_VERBOSE1(printf("(PARALLEL_COMPUTE and THREAD_STEPs are averaged over %d runs)\n", clock_num_runs)); \ } \ MCA_VERBOSE1(printf("[TOTAL] : %d\n", app_end - app_start)); \ MCA_VERBOSE1(printf("[THREAD] : %d\n", app_join - app_create)); \ MCA_VERBOSE1(printf("[PARALLEL] : %d\n", global_thread_end - global_thread_start)); \ MCA_VERBOSE1(printf("[PARALLEL_COMPUTE] : %d\n", accumulated_thread_compute / clock_num_runs)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("[THREAD_STEP_%d] : %d\n", j, accumulated_thread_step[j] / clock_num_runs)); \ } \ MCA_VERBOSE1(printf("\n")); \ MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n")); \ local_sort_asc(thread_start, clock_thread_num); \ local_sort_asc(thread_compute_start, clock_thread_num); \ local_sort_asc(thread_compute_end, clock_thread_num); \ local_sort_asc(thread_end, clock_thread_num); \ for (int32_t j = 0; j < step_number; j++) { \ local_sort_asc(thread_start_step[j], clock_thread_num); \ local_sort_asc(thread_end_step[j], clock_thread_num); \ } \ MCA_VERBOSE1(printf("# cycle thread_id\n")); \ for (int32_t i = 0; i < clock_thread_num; i++) { \ MCA_VERBOSE1(printf("%d\t%d\n", thread_start[i] - app_start, i)); \ MCA_VERBOSE1(printf("%d\t%d\n", thread_compute_start[i] - app_start, i)); \ for (int32_t j = 0; j < step_number; j++) { \ MCA_VERBOSE1(printf("%d\t%d\n", thread_start_step[j][i] - app_start, i)); \ MCA_VERBOSE1(printf("%d\t%d\n", thread_end_step[j][i] - app_start, i)); \ } \ MCA_VERBOSE1(printf("%d\t%d\n", thread_compute_end[i] - app_start, i)); \ MCA_VERBOSE1(printf("%d\t%d\n", thread_end[i] - app_start, i)); \ } \ }) #endif #define CLOCK_FREE ({ \ if (clock_thread_num > 0) { \ free(thread_start); \ free(thread_end); \ free(thread_compute_start); \ free(thread_compute_end); \ if (step_number > 0) { \ free(global_thread_start_step); \ free(global_thread_end_step); \ free(accumulated_thread_step); \ for (int32_t j = 0; j < step_number; j++) { \ free(thread_start_step[j]); \ free(thread_end_step[j]); \ } \ free(thread_start_step); \ free(thread_end_step); \ } \ } \ }) #endif