Changeset 823 for soft/giet_vm/applications/rosenfeld/include
- Timestamp:
- Jun 14, 2016, 5:23:56 PM (9 years ago)
- Location:
- soft/giet_vm/applications/rosenfeld/include
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
soft/giet_vm/applications/rosenfeld/include/clock.h
r822 r823 7 7 #include "nrc_os_config.h" 8 8 #if TARGET_OS == LINUX 9 #include <x86intrin.h> 9 10 #include <sys/time.h> 10 11 #endif … … 17 18 * - CLOCK_APP_CREATE; 18 19 * - CLOCK_THREAD_START(thread_id); 19 * - CLOCK_THREAD_COMPUTE_START(thread_id; 20 * - CLOCK_THREAD_START_STEP(thread_id, step_id) 21 * - CLOCK_THREAD_END_STEP(thread_id, step_id) 22 * - (repeat num_steps times) 23 * - CLOCK_THREAD_COMPUTE_END(thread_id); 20 * - Repeat num_runs times: 21 * - CLOCK_THREAD_COMPUTE_START(thread_id; 22 * - Repeat num_step times: 23 * - CLOCK_THREAD_START_STEP(thread_id, step_id) 24 * - CLOCK_THREAD_END_STEP(thread_id, step_id) 25 * - CLOCK_THREAD_COMPUTE_END(thread_id); 26 * - CLOCK_ACCUMULATE; 24 27 * - CLOCK_THREAD_END(thread_id) 25 28 * - CLOCK_APP_JOIN; … … 28 31 * - PRINT_CLOCK; 29 32 * - CLOCK_FREE; 33 * In case of several runs, the THREAD_COMPUTE and all the THREAD_STEP resulting times 34 * are averaged over all the runs. The other times are kind of irrelevant. 35 * TODO: make a struct gathering all variables and change macros to functions 30 36 */ 31 37 32 38 33 39 static void local_sort_asc(uint64_t tab[], int32_t size) { 34 int32_t tmp;40 uint64_t tmp; 35 41 int32_t i, j; 36 42 for (i = 0; i < size; i++) { … … 61 67 int32_t step_number; \ 62 68 int32_t clock_thread_num; \ 69 int32_t clock_num_runs; \ 63 70 uint64_t ** thread_start_step; \ 64 71 uint64_t ** thread_end_step; \ … … 67 74 uint64_t global_thread_compute_start; \ 68 75 uint64_t global_thread_compute_end; \ 76 uint64_t accumulated_thread_compute; \ 69 77 uint64_t * global_thread_start_step; \ 70 uint64_t * global_thread_end_step; 78 uint64_t * global_thread_end_step; \ 79 uint64_t * accumulated_thread_step; 71 80 72 81 #if TARGET_OS == GIETVM 73 82 #define CLOCK(x) ({ x = giet_proctime(); }) 74 83 #elif TARGET_OS == LINUX 75 #define CLOCK(x) ({ \84 /*#define CLOCK(x) ({ \ 76 85 struct timeval full_time; \ 77 86 gettimeofday(&full_time, NULL); \ 78 87 x = (uint64_t) ((full_time.tv_usec + full_time.tv_sec * 1000000)); \ 79 }) 88 }) */ 89 #define CLOCK(x) ({ x = __rdtsc(); }) 80 90 #endif 81 91 … … 84 94 clock_thread_num = (x); \ 85 95 step_number = (y); \ 96 clock_num_runs = 0; \ 86 97 global_thread_start = 0xFFFFFFFFFFFFFFFFLLU; \ 87 98 global_thread_end = 0; \ 88 99 global_thread_compute_start = 0xFFFFFFFFFFFFFFFFLLU; \ 89 100 global_thread_compute_end = 0; \ 101 accumulated_thread_compute = 0; \ 90 102 if ((x) > 0) { \ 91 103 thread_start = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ … … 98 110 thread_start_step = (uint64_t **) malloc(sizeof(uint64_t *) * (y)); \ 99 111 thread_end_step = (uint64_t **) malloc(sizeof(uint64_t *) * (y)); \ 112 accumulated_thread_step = (uint64_t *) malloc(sizeof(uint64_t) * (y)); \ 100 113 for (int32_t j = 0; j < (y); j++) { \ 101 global_thread_start_step[j] = 0xFFFFFFFFFFFFFFFFL U;\114 global_thread_start_step[j] = 0xFFFFFFFFFFFFFFFFLLU; \ 102 115 global_thread_end_step[j] = 0; \ 116 accumulated_thread_step[j] = 0; \ 103 117 thread_start_step[j] = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ 104 118 thread_end_step[j] = (uint64_t *) malloc(sizeof(uint64_t) * (x)); \ … … 120 134 #define CLOCK_THREAD_END_STEP(x, y) ({ CLOCK(thread_end_step[y][x]); }) 121 135 136 #define CLOCK_ACCUMULATE ({ \ 137 for (int32_t i = 0; i < clock_thread_num; i++) { \ 138 if (thread_compute_start[i] < global_thread_compute_start) { \ 139 global_thread_compute_start = thread_compute_start[i]; \ 140 } \ 141 if (thread_compute_end[i] > global_thread_compute_end) { \ 142 global_thread_compute_end = thread_compute_end[i]; \ 143 } \ 144 for (int32_t j = 0; j < step_number; j++) { \ 145 if (thread_start_step[j][i] < global_thread_start_step[j]) { \ 146 global_thread_start_step[j] = thread_start_step[j][i]; \ 147 } \ 148 if (thread_end_step[j][i] > global_thread_end_step[j]) { \ 149 global_thread_end_step[j] = thread_end_step[j][i]; \ 150 } \ 151 } \ 152 } \ 153 for (int32_t j = 0; j < step_number; j++) { \ 154 accumulated_thread_step[j] += (global_thread_end_step[j] - global_thread_start_step[j]); \ 155 global_thread_start_step[j] = 0xFFFFFFFFFFFFFFFFLLU; \ 156 global_thread_end_step[j] = 0; \ 157 } \ 158 accumulated_thread_compute += (global_thread_compute_end - global_thread_compute_start); \ 159 global_thread_compute_start = 0xFFFFFFFFFFFFFFFFLLU; \ 160 global_thread_compute_end = 0; \ 161 clock_num_runs++; \ 162 }) 163 122 164 123 165 #define CLOCK_FINALIZE ({ \ 166 if (clock_num_runs == 0) { \ 167 CLOCK_ACCUMULATE; \ 168 } \ 124 169 for (int32_t i = 0; i < clock_thread_num; i++) { \ 125 170 if (thread_start[i] < global_thread_start) { \ … … 146 191 }) 147 192 148 #define PRINT_CLOCK ({ \ 149 MCA_VERBOSE1(printf("Timestamps:\n")); \ 150 MCA_VERBOSE1(printf("[APP_START] : %llu\n", app_start)); \ 151 MCA_VERBOSE1(printf("[APP_CREATE] : %llu\n", app_create)); \ 152 MCA_VERBOSE1(printf("[THREAD_START] : %llu\n", global_thread_start)); \ 153 MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %llu\n", global_thread_compute_start)); \ 154 for (int32_t j = 0; j < step_number; j++) { \ 155 MCA_VERBOSE1(printf("[THREAD_START_STEP_%d] : %llu\n", j, global_thread_start_step[j])); \ 156 MCA_VERBOSE1(printf("[THREAD_END_STEP_%d] : %llu\n", j, global_thread_end_step[j])); \ 157 } \ 158 MCA_VERBOSE1(printf("[THREAD_COMPUTE_END] : %llu\n", global_thread_compute_end)); \ 159 MCA_VERBOSE1(printf("[THREAD_END] : %llu\n", global_thread_end)); \ 160 MCA_VERBOSE1(printf("[APP_JOIN] : %llu\n", app_join)); \ 161 MCA_VERBOSE1(printf("[APP_END] : %llu\n", app_end)); \ 162 MCA_VERBOSE1(printf("Durations (in cycles):\n")); \ 163 MCA_VERBOSE1(printf("[TOTAL] : %llu\n", app_end - app_start)); \ 164 MCA_VERBOSE1(printf("[THREAD] : %llu\n", app_join - app_create)); \ 165 MCA_VERBOSE1(printf("[PARALLEL] : %llu\n", global_thread_end - global_thread_start)); \ 166 MCA_VERBOSE1(printf("[PARALLEL_COMPUTE] : %llu\n", global_thread_compute_end - global_thread_compute_start)); \ 167 for (int32_t j = 0; j < step_number; j++) { \ 168 MCA_VERBOSE1(printf("[THREAD_STEP_%d] : %llu\n", j, global_thread_end_step[j] - global_thread_start_step[j])); \ 169 } \ 170 MCA_VERBOSE1(printf("\n")); \ 171 MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n")); \ 172 local_sort_asc(thread_start, clock_thread_num); \ 173 local_sort_asc(thread_compute_start, clock_thread_num); \ 174 local_sort_asc(thread_compute_end, clock_thread_num); \ 175 local_sort_asc(thread_end, clock_thread_num); \ 176 for (int32_t j = 0; j < step_number; j++) { \ 177 local_sort_asc(thread_start_step[j], clock_thread_num); \ 178 local_sort_asc(thread_end_step[j], clock_thread_num); \ 179 } \ 180 MCA_VERBOSE1(printf("# cycle thread_id\n")); \ 181 for (int32_t i = 0; i < clock_thread_num; i++) { \ 182 MCA_VERBOSE1(printf("%llu\t%d\n", thread_start[i], i)); \ 183 MCA_VERBOSE1(printf("%llu\t%d\n", thread_compute_start[i], i)); \ 184 for (int32_t j = 0; j < step_number; j++) { \ 185 MCA_VERBOSE1(printf("%llu\t%d\n", thread_start_step[j][i], i)); \ 186 MCA_VERBOSE1(printf("%llu\t%d\n", thread_end_step[j][i], i)); \ 187 } \ 188 MCA_VERBOSE1(printf("%llu\t%d\n", thread_compute_end[i], i)); \ 189 MCA_VERBOSE1(printf("%llu\t%d\n", thread_end[i], i)); \ 190 } \ 193 194 #define PRINT_CLOCK ({ \ 195 MCA_VERBOSE1(printf("Timestamps:\n")); \ 196 if (clock_num_runs > 1) { \ 197 MCA_VERBOSE1(printf("(THREAD_COMPUTE_START, THREAD_COMPUTE_END, THREAD_START_STEPs and THREAD_END_STEPs)\n")); \ 198 MCA_VERBOSE1(printf("(are those of the last run)\n")); \ 199 } \ 200 MCA_VERBOSE1(printf("[APP_START] : %llu\n", app_start)); \ 201 MCA_VERBOSE1(printf("[APP_CREATE] : %llu\n", app_create)); \ 202 MCA_VERBOSE1(printf("[THREAD_START] : %llu\n", global_thread_start)); \ 203 MCA_VERBOSE1(printf("[THREAD_COMPUTE_START] : %llu\n", global_thread_compute_start)); \ 204 for (int32_t j = 0; j < step_number; j++) { \ 205 MCA_VERBOSE1(printf("[THREAD_START_STEP_%d] : %llu\n", j, global_thread_start_step[j])); \ 206 MCA_VERBOSE1(printf("[THREAD_END_STEP_%d] : %llu\n", j, global_thread_end_step[j])); \ 207 } \ 208 MCA_VERBOSE1(printf("[THREAD_COMPUTE_END] : %llu\n", global_thread_compute_end)); \ 209 MCA_VERBOSE1(printf("[THREAD_END] : %llu\n", global_thread_end)); \ 210 MCA_VERBOSE1(printf("[APP_JOIN] : %llu\n", app_join)); \ 211 MCA_VERBOSE1(printf("[APP_END] : %llu\n", app_end)); \ 212 MCA_VERBOSE1(printf("Durations (in cycles):\n")); \ 213 if (clock_num_runs > 1) { \ 214 MCA_VERBOSE1(printf("(PARALLEL_COMPUTE and THREAD_STEPs are averaged over %d runs)\n", clock_num_runs)); \ 215 } \ 216 MCA_VERBOSE1(printf("[TOTAL] : %llu\n", app_end - app_start)); \ 217 MCA_VERBOSE1(printf("[THREAD] : %llu\n", app_join - app_create)); \ 218 MCA_VERBOSE1(printf("[PARALLEL] : %llu\n", global_thread_end - global_thread_start)); \ 219 MCA_VERBOSE1(printf("[PARALLEL_COMPUTE] : %llu\n", accumulated_thread_compute / clock_num_runs)); \ 220 for (int32_t j = 0; j < step_number; j++) { \ 221 MCA_VERBOSE1(printf("[THREAD_STEP_%d] : %llu\n", j, accumulated_thread_step[j] / clock_num_runs)); \ 222 } \ 223 MCA_VERBOSE1(printf("\n")); \ 224 MCA_VERBOSE1(printf("*** All threads times output in a gnuplot data-style ***\n")); \ 225 local_sort_asc(thread_start, clock_thread_num); \ 226 local_sort_asc(thread_compute_start, clock_thread_num); \ 227 local_sort_asc(thread_compute_end, clock_thread_num); \ 228 local_sort_asc(thread_end, clock_thread_num); \ 229 for (int32_t j = 0; j < step_number; j++) { \ 230 local_sort_asc(thread_start_step[j], clock_thread_num); \ 231 local_sort_asc(thread_end_step[j], clock_thread_num); \ 232 } \ 233 MCA_VERBOSE1(printf("# cycle thread_id\n")); \ 234 for (int32_t i = 0; i < clock_thread_num; i++) { \ 235 MCA_VERBOSE1(printf("%llu\t%d\n", thread_start[i] - app_start, i)); \ 236 MCA_VERBOSE1(printf("%llu\t%d\n", thread_compute_start[i] - app_start, i)); \ 237 for (int32_t j = 0; j < step_number; j++) { \ 238 MCA_VERBOSE1(printf("%llu\t%d\n", thread_start_step[j][i] - app_start, i)); \ 239 MCA_VERBOSE1(printf("%llu\t%d\n", thread_end_step[j][i] - app_start, i)); \ 240 } \ 241 MCA_VERBOSE1(printf("%llu\t%d\n", thread_compute_end[i] - app_start, i)); \ 242 MCA_VERBOSE1(printf("%llu\t%d\n", thread_end[i] - app_start, i)); \ 243 } \ 191 244 }) 192 245 … … 205 258 free(global_thread_start_step); \ 206 259 free(global_thread_end_step); \ 260 free(accumulated_thread_step); \ 207 261 for (int32_t j = 0; j < step_number; j++) { \ 208 262 free(thread_start_step[j]); \ -
soft/giet_vm/applications/rosenfeld/include/config.h
r822 r823 4 4 5 5 #define SLOW 0 6 #define FEATURES 16 #define FEATURES 0 7 7 #define FAST 1 8 8 #define PYR_BARRIERS 0 9 #define PARMERGE 09 #define PARMERGE 1 10 10 #define ARSP 0 11 11 … … 21 21 #if FAST 22 22 #if !FEATURES && !PARMERGE && !ARSP 23 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Rosenfeld_Dist(e, f, T, D, alpha)24 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Rosenfeld_Dist(e, f, g, T, D, alpha)23 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Rosenfeld_Dist(e, f, T, D, alpha) 24 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Rosenfeld_Dist(e, f, g, T, D, alpha) 25 25 #elif !FEATURES && !PARMERGE && ARSP 26 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Arsp_Rosenfeld_Dist(e, f, T, D, alpha)27 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha)26 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Arsp_Rosenfeld_Dist(e, f, T, D, alpha) 27 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha) 28 28 #error "Configuration Not implemented" 29 29 #elif !FEATURES && PARMERGE && !ARSP 30 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Rosenfeld_Dist(e, f, T, D, alpha, F)31 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Rosenfeld_Dist(e, f, g, T, D, alpha, F)32 # error "Configuration Not implemented"30 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Rosenfeld_Dist(e, f, T, D, alpha, F) 31 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 32 #define SetRoot_Parallel_FNF(D, rl, rd, alpha, F) SetRoot_Parallel_Rosenfeld_Dist(D, rl, rd, alpha, F) 33 33 #elif !FEATURES && PARMERGE && ARSP 34 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Arsp_Rosenfeld_Dist(e, f, T, D, alpha, F)35 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha, F)34 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Arsp_Rosenfeld_Dist(e, f, T, D, alpha, F) 35 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 36 36 #elif FEATURES && !PARMERGE && !ARSP 37 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Features_Rosenfeld_Dist(e, f, T, D, alpha, F)38 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Features_Rosenfeld_Dist(e, f, g, T, D, alpha, F)37 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Features_Rosenfeld_Dist(e, f, T, D, alpha, F) 38 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Features_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 39 39 #elif FEATURES && !PARMERGE && ARSP 40 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Features_Arsp_Rosenfeld_Dist(e, f, T, D, alpha, F)41 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Features_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha, F)40 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Features_Arsp_Rosenfeld_Dist(e, f, T, D, alpha, F) 41 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Features_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 42 42 #error "Configuration Not implemented" 43 43 #elif FEATURES && PARMERGE && !ARSP 44 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Features_Rosenfeld_Dist(e, f, T, D, alpha, F) 45 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Features_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 44 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Rosenfeld_Dist(e, f, T, D, alpha, F) 45 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 46 #define SetRoot_Parallel_FNF(D, rl, rd, alpha, F) SetRoot_Parallel_Features_Rosenfeld_Dist(D, rl, rd, alpha, F) 46 47 #elif FEATURES && PARMERGE && ARSP 47 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Features_Arsp_Rosenfeld_Dist(e, f, T, D, alpha, F)48 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Features_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha, F)48 #define vuse2_Rosenfeld(e, f, T, D, alpha, F) vuse2_Parallel_Features_Arsp_Rosenfeld_Dist(e, f, T, D, alpha, F) 49 #define vuse3_Rosenfeld(e, f, g, T, D, alpha, F) vuse3_Parallel_Features_Arsp_Rosenfeld_Dist(e, f, g, T, D, alpha, F) 49 50 #error "Configuration Not implemented" 50 51 #endif … … 72 73 // 2 : Standard level 73 74 // 3 : Maximum (debug) level 74 #define MCA_VERBOSE_LEVEL 2 75 76 #endif // __CONFIG_H__ 75 #define MCA_VERBOSE_LEVEL 1 77 76 78 77 78 #endif 79 -
soft/giet_vm/applications/rosenfeld/include/mca.h
r822 r823 67 67 68 68 typedef struct sMCA { 69 int p, np; // numero du processeur et nb total de processeurs 69 int p, np; // numero du processeur et nb total de processeurs 70 int nr; // nombre de runs successifs à mesurer 70 71 71 72 uint8 ** X; // image source … … 78 79 int j0, j1; 79 80 80 uint32 e0, e1; // indice pour chaque bande 81 uint32 ne; // indice max d'etiquettes utilise par bande 81 uint32 e0, e1; // indice pour chaque bande 82 uint32 ne; // indice max d'etiquettes utilise par bande 83 uint32 ne_prev; // ne de l'image précédente (pour le reset de T) 82 84 83 int alpha; // puissance de 2 >= a la taille d'un bloc84 uint32 * T; // table d'quivalence table (Rosenfeld) ou d'indices (Warp)85 uint32 ** D; // distributed table (instanciee dans chaque worker)85 int alpha; // puissance de 2 >= a la taille d'un bloc 86 uint32 * T; // table d'quivalence table (Rosenfeld) ou d'indices (Warp) 87 uint32 ** D; // distributed table (instanciee dans chaque worker) 86 88 87 89 RegionStats * stats; … … 106 108 void MCA_Set_Size(MCA * mca, int width, int height); 107 109 void MCA_Set_NP(MCA * mca, int np); 110 void MCA_Set_NR(MCA * mca, int nr); 108 111 109 112 uint32 MCA_CalcMaxLabels(int connection, uint32 height, uint32 width);
Note: See TracChangeset
for help on using the changeset viewer.