Changeset 652 for trunk/user/convol/convol.c
- Timestamp:
- Nov 14, 2019, 3:56:51 PM (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/user/convol/convol.c
r645 r652 5 5 /////////////////////////////////////////////////////////////////////////////////////// 6 6 // This multi-threaded application implements a 2D convolution product. 7 // It can run on a multi- processors, multi-clusters architecture, with one thread8 // per processor, and uses the POSIX threads API.7 // It can run on a multi-cores, multi-clusters architecture, with one thread 8 // per core, and uses the POSIX threads API. 9 9 // 10 10 // The main() function can be launched on any processor P[x,y,l]. … … 14 14 // when the parallel execution is completed. 15 15 // 16 // The convolution kernel is [201]*[35] pixels, but it can be factored in two17 // independant line and column convolution products.16 // The convolution kernel is defined in the execute() function. 17 // It can be factored in two independant line and column convolution products. 18 18 // The five buffers containing the image are distributed in clusters. 19 // For the philips image, it is a [201]*[35] pixels rectangle, and the. 19 20 // 20 21 // The (1024 * 1024) pixels image is read from a file (2 bytes per pixel). 21 22 // 22 23 // - number of clusters containing processors must be power of 2 no larger than 256. 23 // - number of processors per cluster must be power of 2 no larger than 8. 24 // - number of processors per cluster must be power of 2 no larger than 4. 25 // 26 // The number N of working threads is always defined by the number of cores availables 27 // in the architecture, but this application supports three placement modes. 28 // In all modes, the working threads are identified by the [tid] continuous index 29 // in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads. 30 // This continuous index can always be decomposed in two continuous sub-indexes: 31 // tid == cid * ncores + lid, where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1]. 32 // 33 // - NO_PLACEMENT: the main thread is itsef a working thread. The (N_1) other working 34 // threads are created by the main thread, but the placement is done by the OS, using 35 // the DQDT for load balancing, and two working threads can be placed on the same core. 36 // The [cid,lid] are only abstract identifiers, and cannot be associated to a physical 37 // cluster or a physical core. In this mode, the main thread run on any cluster, 38 // but has tid = 0 (i.e. cid = 0 & tid = 0). 39 // 40 // - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement of 41 // of the threads on the cores is explicitely controled by the main thread to have 42 // exactly one working thread per core, and the [cxy][lpid] core coordinates for a given 43 // thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the 44 // physical cluster identifier, and [lid] is the local core index. 45 // 46 // - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the 47 // non standard pthread_parallel_create() function to avoid the costly sequencial 48 // loops for pthread_create() and pthread_join(). It garanty one working thread 49 // per core, and the same relation between the thread[tid] and the core[cxy][lpid]. 50 // 51 // The [tid] continuous index defines how the work is shared amongst the threads: 52 // - each thread handles NL/nthreads lines for the horizontal filter. 53 // - each thread handles NP/nthreads columns for the vertical filter. 24 54 /////////////////////////////////////////////////////////////////////////////////////// 25 55 56 #include <sys/mman.h> 26 57 #include <stdio.h> 27 58 #include <stdlib.h> … … 29 60 #include <unistd.h> 30 61 #include <pthread.h> 62 #include <string.h> 31 63 #include <almosmkh.h> 32 64 #include <hal_macros.h> 33 65 34 #define IMAGE_IN_PATH "misc/philips_1024.raw" 35 36 #define USE_SQT_BARRIER 1 37 #define VERBOSE 1 38 #define SUPER_VERBOSE 0 39 40 #define USE_DQT_BARRIER 1 66 #define VERBOSE_MAIN 1 67 #define VERBOSE_EXEC 1 41 68 42 69 #define X_MAX 16 43 70 #define Y_MAX 16 44 #define PROCS_MAX 471 #define CORES_MAX 4 45 72 #define CLUSTERS_MAX (X_MAX * Y_MAX) 46 #define THREADS_MAX (X_MAX * Y_MAX * PROCS_MAX] 47 48 #define INITIAL_DISPLAY_ENABLE 1 49 #define FINAL_DISPLAY_ENABLE 1 50 51 #define PIXEL_SIZE 2 // input image has 2 bytes per pixel 52 #define FBF_TYPE 420 // output image has 1 byte per pixel 53 73 #define THREADS_MAX (X_MAX * Y_MAX * CORES_MAX) 74 75 #define IMAGE_IN_PATH "misc/philips_1024_2.raw" 76 #define IMAGE_IN_PIXEL_SIZE 2 // 2 bytes per pixel 77 78 #define IMAGE_OUT_PATH "misc/philips_after_1O24.raw" 79 #define IMAGE_OUT_PIXEL_SIZE 1 // 1 bytes per pixel 80 81 #define FBF_TYPE 420 54 82 #define NL 1024 55 83 #define NP 1024 56 84 #define NB_PIXELS (NP * NL) 57 #define FRAME_SIZE (NB_PIXELS * PIXEL_SIZE) 58 85 86 #define NO_PLACEMENT 0 87 #define EXPLICIT_PLACEMENT 0 88 #define PARALLEL_PLACEMENT 1 89 90 #define USE_DQT_BARRIER 1 91 #define INITIAL_DISPLAY_ENABLE 1 92 #define FINAL_DISPLAY_ENABLE 1 59 93 60 94 #define TA(c,l,p) (A[c][((NP) * (l)) + (p)]) … … 68 102 69 103 ////////////////////////////////////////////////////////// 70 // global variables stored in seg_data in cluster[0,0]104 // global variables 71 105 ////////////////////////////////////////////////////////// 72 106 73 // Instrumentation counters (cluster_id, lpid] 74 unsigned int START[CLUSTERS_MAX][PROCS_MAX]; 75 unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX]; 76 unsigned int H_END[CLUSTERS_MAX][PROCS_MAX]; 77 unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX]; 78 unsigned int V_END[CLUSTERS_MAX][PROCS_MAX]; 79 unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX]; 80 unsigned int D_END[CLUSTERS_MAX][PROCS_MAX]; 81 82 // file pointers on input image 83 FILE * f_image_in; 84 FILE * f_instrum; 107 // global instrumentation counters for the main thread 108 unsigned int SEQUENCIAL_TIME = 0; 109 unsigned int PARALLEL_TIME = 0; 110 111 // instrumentation counters for thread[tid] in cluster[cid] 112 unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 113 unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 114 unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 115 unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 116 unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 117 unsigned int D_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 118 unsigned int D_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 119 120 // pointer on buffer containing the input image, maped by the main to the input file 121 unsigned char * image_in; 122 123 // pointer on buffer containing the output image, maped by the main to the output file 124 unsigned char * image_out; 85 125 86 126 // return values at thread exit … … 91 131 pthread_barrier_t barrier; 92 132 93 // coordinates of core executing the main thread 94 unsigned int cxy_main; 95 unsigned int lid_main; 133 // platform parameters 134 unsigned int x_size; // number of clusters in a row 135 unsigned int y_size; // number of clusters in a column 136 unsigned int ncores; // number of processors per cluster 96 137 97 138 // arrays of pointers on distributed buffers in all clusters 98 139 unsigned short * GA[CLUSTERS_MAX]; 99 int * GB[CLUSTERS_MAX]; 100 int * GC[CLUSTERS_MAX]; 101 int * GD[CLUSTERS_MAX]; 102 unsigned char * GZ[CLUSTERS_MAX]; 103 104 // trdid[] array for execution threads 105 // 1D array if no explicit threads placement / 2D array if explicit placement 106 pthread_t trdid[CLUSTERS_MAX][PROCS_MAX]; 107 //pthread_t trdid[THREADS_MAX]; 108 109 // attr[] array for execution threads 110 // unused if no explicit threads placement 111 pthread_attr_t attr[CLUSTERS_MAX][PROCS_MAX]; 140 int * GB[CLUSTERS_MAX]; 141 int * GC[CLUSTERS_MAX]; 142 int * GD[CLUSTERS_MAX]; 143 unsigned char * GZ[CLUSTERS_MAX]; 144 145 // array of threads kernel identifiers / indexed by [tid] 146 pthread_t exec_trdid[THREADS_MAX]; 147 148 // array of threads attributes / indexed bi [tid] 149 pthread_attr_t exec_attr[THREADS_MAX]; 150 151 // array of execute() function arguments / indexed by [tid] 152 pthread_parallel_work_args_t exec_args[THREADS_MAX]; 153 154 // main thread continuous index 155 unsigned int tid_main; 112 156 113 157 ///////////////////////////////////////////////////////////////////////////////////// … … 115 159 ///////////////////////////////////////////////////////////////////////////////////// 116 160 117 void execute( void ); 118 119 void instrument( unsigned int nclusters, 120 unsigned int ncores ); 161 void execute( pthread_parallel_work_args_t * args ); 162 163 void instrument( FILE * f , char * filename ); 121 164 122 165 ///////////////// 123 166 void main( void ) 124 167 { 125 unsigned int x_size; // number of clusters in a row 126 unsigned int y_size; // number of clusters in a column 127 unsigned int ncores; // number of processors per cluster 128 129 unsigned long long date; 130 131 char name[64]; // instrumentation file name 132 char path[128]; // instrumentation path name 168 unsigned long long start_cycle; 169 unsigned long long end_sequencial_cycle; 170 unsigned long long end_parallel_cycle; 133 171 134 172 int error; 135 173 136 // get platform parameters 137 if ( get_config( &x_size , &y_size , &ncores ) ) 138 { 139 printf("\n[convol error] cannot get hardware configuration\n"); 174 char instru_name[32]; // instrumentation file name 175 char instru_path[64]; // instrumentation path name 176 177 ///////////////////////////////////////////////////////////////////////////////// 178 get_cycle( &start_cycle ); 179 ///////////////////////////////////////////////////////////////////////////////// 180 181 if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 ) 182 { 183 printf("\n[convol error] illegal placement\n"); 140 184 exit( 0 ); 141 185 } 142 186 143 // get core executing this main thread 144 // and register these coordinates in global variables 145 get_core_id( &cxy_main , &lid_main ); 146 147 // check ncores 148 if( (ncores != 1) && (ncores != 2) && (ncores != 4) ) 187 // get & check platform parameters 188 get_config( &x_size , &y_size , &ncores ); 189 190 if((ncores != 1) && (ncores != 2) && (ncores != 4)) 149 191 { 150 192 printf("\n[convol error] number of cores per cluster must be 1/2/4\n"); … … 152 194 } 153 195 154 // check x_size155 if( (x_size != 1) && (x_size != 2) && (x_size != 4) &&(x_size != 8) && (x_size != 16) )196 if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 197 (x_size != 8) && (x_size != 16) ) 156 198 { 157 199 printf("\n[convol error] x_size must be 1/2/4/8/16\n"); 158 200 exit( 0 ); 159 201 } 160 161 // check y_size162 if( (y_size != 1) && (y_size != 2) && (y_size != 4) &&(y_size != 8) && (y_size != 16) )202 203 if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 204 (y_size != 8) && (y_size != 16) ) 163 205 { 164 206 printf("\n[convol error] y_size must be 1/2/4/8/16\n"); 165 207 exit( 0 ); 166 208 } 209 210 // main thread get identifiers for core executing main 211 unsigned int cxy_main; 212 unsigned int lid_main; 213 get_core_id( &cxy_main , &lid_main ); 167 214 168 215 // compute nthreads and nclusters 169 unsigned int nthreads = x_size * y_size * ncores;170 216 unsigned int nclusters = x_size * y_size; 171 172 get_cycle( &date ); 173 printf("\n[convol] starts on core[%x,%d] / %d thread(s) / cycle %d\n", 174 cxy_main, lid_main, nthreads, (unsigned int)date ); 175 176 // build instrumentation file name 177 if( USE_DQT_BARRIER ) 178 snprintf( name , 64 , "p_convol_dqt_%d_%d", x_size * y_size , ncores ); 179 else 180 snprintf( name , 64 , "p_convol_smp_%d_%d", x_size * y_size , ncores ); 181 182 // build pathname 183 snprintf( path , 128 , "/home/%s", name ); 217 unsigned int nthreads = nclusters * ncores; 218 219 // main thread get FBF size and type 220 unsigned int fbf_width; 221 unsigned int fbf_height; 222 unsigned int fbf_type; 223 fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); 224 225 if( (fbf_width != NP) || (fbf_height != NL) || (fbf_type != FBF_TYPE) ) 226 { 227 printf("\n[convol error] image does not fit FBF size or type\n"); 228 exit( 0 ); 229 } 230 231 if( nthreads > NL ) 232 { 233 printf("\n[convol error] number of threads larger than number of lines\n"); 234 exit( 0 ); 235 } 236 237 // define instrumentation file name 238 if( NO_PLACEMENT ) 239 { 240 printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n", 241 nclusters, ncores, fbf_width, fbf_height, getpid() ); 242 243 // build instrumentation file name 244 if( USE_DQT_BARRIER ) 245 snprintf( instru_name , 32 , "conv_dqt_no_place_%d_%d", x_size * y_size , ncores ); 246 else 247 snprintf( instru_name , 32 , "conv_smp_no_place_%d_%d", x_size * y_size , ncores ); 248 } 249 250 if( EXPLICIT_PLACEMENT ) 251 { 252 printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n", 253 nclusters, ncores, fbf_width, fbf_height, getpid() ); 254 255 // build instrumentation file name 256 if( USE_DQT_BARRIER ) 257 snprintf( instru_name , 32 , "conv_dqt_explicit_%d_%d_%d", x_size * y_size , ncores ); 258 else 259 snprintf( instru_name , 32 , "conv_smp_explicit_%d_%d_%d", x_size * y_size , ncores ); 260 } 261 262 if( PARALLEL_PLACEMENT ) 263 { 264 printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n", 265 nclusters, ncores, fbf_width, fbf_height, getpid() ); 266 267 // build instrumentation file name 268 if( USE_DQT_BARRIER ) 269 snprintf( instru_name , 32 , "conv_dqt_parallel_%d_%d_%d", x_size * y_size , ncores ); 270 else 271 snprintf( instru_name , 32 , "conv_smp_parallel_%d_%d_%d", x_size * y_size , ncores ); 272 } 184 273 185 274 // open instrumentation file 186 f_instrum = fopen( path , NULL ); 187 if ( f_instrum == NULL ) 275 snprintf( instru_path , 64 , "/home/%s", instru_name ); 276 FILE * f_instru = fopen( instru_path , NULL ); 277 if ( f_instru == NULL ) 188 278 { 189 printf("\n[convol error] cannot open instrumentation file <%s>\n",path );279 printf("\n[convol error] cannot open instrumentation file %s\n", instru_path ); 190 280 exit( 0 ); 191 281 } 192 282 193 #if DEBUG_MAIN 194 get_cycle( &date ); 195 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n", 196 cxy_main, lid_main, path, (unsigned int)date ); 197 #endif 198 199 // open input file 200 f_image_in = fopen( IMAGE_IN_PATH , NULL ); 201 if ( f_image_in == NULL ) 202 { 203 printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH ); 204 exit( 0 ); 205 } 206 207 #if DEBUG_MAIN 208 get_cycle( &date ); 209 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n", 210 cxy_main, lid_main, path, (unsigned int)date ); 211 #endif 212 213 // get FBF config 214 unsigned int fbf_width; 215 unsigned int fbf_height; 216 unsigned int fbf_type; 217 fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); 218 219 // check FBF size 220 if ( (fbf_width != NP) || (fbf_height != NL) ) 221 { 222 printf("\n[convol error] bad FBF size\n"); 223 exit( 0 ); 224 } 225 226 // check FBF subsampling 227 if ( fbf_type != FBF_TYPE ) 228 { 229 printf("\n[convol error] bad FBF subsampling\n"); 230 exit( 0 ); 231 } 232 233 // initialise barrier 283 #if VERBOSE_MAIN 284 printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n", 285 cxy_main, lid_main, instru_path ); 286 #endif 287 288 // main initialise barrier 234 289 if( USE_DQT_BARRIER ) 235 290 { … … 251 306 } 252 307 253 get_cycle( &date ); 254 printf("\n[convol] main on core[%x,%d] completes initialisation at cycle %d\n" 255 "- CLUSTERS = %d\n" 256 "- PROCS = %d\n" 257 "- THREADS = %d\n", 258 cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads ); 259 260 // launch exec threads with explicit placement 261 unsigned int x; 262 unsigned int y; 263 unsigned int l; 264 unsigned int cxy; 265 266 for( x = 0 ; x < x_size ; x++ ) 267 { 268 for( y = 0 ; y < y_size ; y++ ) 269 { 270 cxy = HAL_CXY_FROM_XY(x,y); 271 for( l = 0 ; l < ncores ; l++ ) 272 { 273 // no other thread on the core running the main 274 if( (cxy != cxy_main) || (l != lid_main) ) 275 { 276 // define thread attributes 277 attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 278 attr[cxy][l].cxy = cxy; 279 attr[cxy][l].lid = l; 308 #if VERBOSE_MAIN 309 printf("\n[convol] main on core[%x,%d] completes barrier init\n", 310 cxy_main, lid_main ); 311 #endif 312 313 // main open input file 314 int fd_in = open( IMAGE_IN_PATH , O_RDONLY , 0 ); 315 316 if ( fd_in < 0 ) 317 { 318 printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH ); 319 exit( 0 ); 320 } 321 322 #if VERBOSE_MAIN 323 printf("\n[convol] main on core[%x,%d] open file <%s>\n", 324 cxy_main, lid_main, IMAGE_IN_PATH ); 325 #endif 326 327 // main thread map image_in buffer to input file 328 image_in = (unsigned char *)mmap( NULL, 329 NB_PIXELS * IMAGE_IN_PIXEL_SIZE, 330 PROT_READ, 331 MAP_FILE | MAP_SHARED, 332 fd_in, 333 0 ); // offset 334 if ( image_in == NULL ) 335 { 336 printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_IN_PATH ); 337 exit( 0 ); 338 } 339 340 #if VERBOSE_MAIN 341 printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n", 342 cxy_main, lid_main, IMAGE_IN_PATH ); 343 #endif 344 345 // main thread open output file 346 int fd_out = open( IMAGE_OUT_PATH , O_CREAT , 0 ); 347 348 if ( fd_out < 0 ) 349 { 350 printf("\n[convol error] main cannot open file %s\n", IMAGE_OUT_PATH ); 351 exit( 0 ); 352 } 353 354 #if VERBOSE_MAIN 355 printf("\n[convol] main on core[%x,%d] open file <%s>\n", 356 cxy_main, lid_main, IMAGE_OUT_PATH ); 357 #endif 358 359 // main thread map image_out buffer to output file 360 image_out = (unsigned char *)mmap( NULL, 361 NB_PIXELS + IMAGE_OUT_PIXEL_SIZE, 362 PROT_WRITE, 363 MAP_FILE | MAP_SHARED, 364 fd_out, 365 0 ); // offset 366 if ( image_out == NULL ) 367 { 368 printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_OUT_PATH ); 369 exit( 0 ); 370 } 371 372 #if VERBOSE_MAIN 373 printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n", 374 cxy_main, lid_main, IMAGE_OUT_PATH ); 375 #endif 376 377 ///////////////////////////////////////////////////////////////////////////////////// 378 get_cycle( &end_sequencial_cycle ); 379 SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle); 380 ///////////////////////////////////////////////////////////////////////////////////// 381 382 ////////////////// 383 if( NO_PLACEMENT ) 384 { 385 // the tid value for the main thread is always 0 386 // main thread creates new threads with tid in [1,nthreads-1] 387 unsigned int tid; 388 for ( tid = 0 ; tid < nthreads ; tid++ ) 389 { 390 // register tid value in exec_args[tid] array 391 exec_args[tid].tid = tid; 392 393 // create other threads 394 if( tid > 0 ) 395 { 396 if ( pthread_create( &exec_trdid[tid], 397 NULL, // no attribute 398 &execute, 399 &exec_args[tid] ) ) 400 { 401 printf("\n[convol error] cannot create thread %d\n", tid ); 402 exit( 0 ); 403 } 404 405 #if VERBOSE_MAIN 406 printf("\n[convol] main created thread %d\n", tid ); 407 #endif 408 409 } 410 else 411 { 412 tid_main = 0; 413 } 414 } // end for tid 415 416 // main thread calls itself the execute() function 417 execute( &exec_args[0] ); 418 419 // main thread wait other threads completion 420 for ( tid = 1 ; tid < nthreads ; tid++ ) 421 { 422 unsigned int * status; 423 424 // main wait thread[tid] status 425 if ( pthread_join( exec_trdid[tid], (void*)(&status)) ) 426 { 427 printf("\n[convol error] main cannot join thread %d\n", tid ); 428 exit( 0 ); 429 } 430 431 // check status 432 if( *status != THREAD_EXIT_SUCCESS ) 433 { 434 printf("\n[convol error] thread %x returned failure\n", tid ); 435 exit( 0 ); 436 } 437 438 #if VERBOSE_MAIN 439 printf("\n[convol] main successfully joined thread %x\n", tid ); 440 #endif 441 442 } // end for tid 443 444 } // end if no_placement 445 446 //////////////////////// 447 if( EXPLICIT_PLACEMENT ) 448 { 449 // main thread places each other threads on a specific core[cxy][lid] 450 // but the actual thread creation is sequencial 451 unsigned int x; 452 unsigned int y; 453 unsigned int l; 454 unsigned int cxy; // cluster identifier 455 unsigned int tid; // thread continuous index 456 457 for( x = 0 ; x < x_size ; x++ ) 458 { 459 for( y = 0 ; y < y_size ; y++ ) 460 { 461 cxy = HAL_CXY_FROM_XY( x , y ); 462 for( l = 0 ; l < ncores ; l++ ) 463 { 464 // compute thread continuous index 465 tid = (((x * y_size) + y) * ncores) + l; 466 467 // register tid value in exec_args[tid] array 468 exec_args[tid].tid = tid; 469 470 // no thread created on the core running the main 471 if( (cxy != cxy_main) || (l != lid_main) ) 472 { 473 // define thread attributes 474 exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | 475 PT_ATTR_CORE_DEFINED; 476 exec_attr[tid].cxy = cxy; 477 exec_attr[tid].lid = l; 280 478 281 // create thread on core[x,y,l] 282 if (pthread_create( &trdid[cxy][l], 283 &attr[cxy][l], 284 &execute, 285 NULL ) ) // execute has no argument 286 { 287 printf("\n[convol error] created thread %x on core[%x][%d]\n", 288 trdid[cxy][l] , cxy , l ); 289 exit( 0 ); 290 } 291 } 292 } 293 } 294 } 295 296 /* 297 // launch other threads without explicit placement 298 for ( n = 1 ; n < nthreads ; n++ ) 299 { 300 if ( giet_pthread_create( &trdid[n], 301 NULL, // no attribute 302 &execute, 303 NULL ) ) // no argument 304 { 305 printf("\n[convol error] creating thread %x\n", trdid[n] ); 306 exit( 0 ); 307 } 308 } 309 */ 310 311 // the main thread run itself the execute() function 312 execute(); 313 314 // wait other threads completions if explicit threads placement 315 for( x = 0 ; x < x_size ; x++ ) 316 { 317 for( y = 0 ; y < y_size ; y++ ) 318 { 319 unsigned int cxy = HAL_CXY_FROM_XY(x,y); 320 for( l = 0 ; l < ncores ; l++ ) 321 { 322 // no other thread on the core running the main 323 if( (cxy != cxy_main) || (l != lid_main) ) 324 { 325 unsigned int * exit_status; 326 327 // wait thread running on core[x,y,l] 328 if (pthread_join( trdid[cxy][l] , (void*)(&exit_status) ) ) 479 // create thread[tid] on core[cxy][l] 480 if ( pthread_create( &exec_trdid[tid], 481 &exec_attr[tid], 482 &execute, 483 &exec_args[tid] ) ) 484 { 485 printf("\n[convol error] cannot create thread %d\n", tid ); 486 exit( 0 ); 487 } 488 #if VERBOSE_MAIN 489 printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l ); 490 #endif 491 } 492 else 329 493 { 330 printf("\n[convol error] main cannot join thread[%x,%d]\n", cxy, l ); 331 exit( 0 ); 332 } 333 334 // check exit_status 335 if( *exit_status != 0 ) 336 { 337 printf("\n[convol error] thread[%x,%d]return failure\n", cxy, l ); 338 exit( 0 ); 494 tid_main = tid; 339 495 } 340 496 } 341 497 } 342 498 } 343 } 344 /* 345 // wait other threads completion when no explicit threads placement 346 for ( n = 1 ; n < nthreads ; n++ ) 347 { 348 if ( pthread_join( trdid[n], NULL ) ) 349 { 350 printf("\n[convol error] joining thread %x\n", trdid[n] ); 499 500 // main thread calls itself the execute() function 501 execute( &exec_args[tid_main] ); 502 503 // main thread wait other threads completion 504 for( tid = 0 ; tid < nthreads ; tid++ ) 505 { 506 // no other thread on the core running the main 507 if( tid != tid_main ) 508 { 509 unsigned int * status; 510 511 // wait thread[tid] 512 if( pthread_join( exec_trdid[tid] , (void*)(&status) ) ) 513 { 514 printf("\n[convol error] main cannot join thread %d\n", tid ); 515 exit( 0 ); 516 } 517 518 // check status 519 if( *status != THREAD_EXIT_SUCCESS ) 520 { 521 printf("\n[convol error] thread %d returned failure\n", tid ); 522 exit( 0 ); 523 } 524 #if VERBOSE_MAIN 525 printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l ); 526 #endif 527 } 528 } 529 } // end if explicit_placement 530 531 //////////////////////// 532 if( PARALLEL_PLACEMENT ) 533 { 534 // compute covering DQT size an level 535 unsigned int z = (x_size > y_size) ? x_size : y_size; 536 unsigned int root_level = ((z == 1) ? 0 : 537 ((z == 2) ? 1 : 538 ((z == 4) ? 2 : 539 ((z == 8) ? 3 : 4)))); 540 541 // create & execute the working threads 542 if( pthread_parallel_create( root_level , &execute ) ) 543 { 544 printf("\n[convol error] in %s\n", __FUNCTION__ ); 351 545 exit( 0 ); 352 546 } 353 } 354 */ 355 // call the instrument() function 356 instrument( nclusters , ncores ); 357 547 } // end if parallel_placement 548 549 ///////////////////////////////////////////////////////////////////////////// 550 get_cycle( &end_parallel_cycle ); 551 PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle); 552 ///////////////////////////////////////////////////////////////////////////// 553 554 // main thread register instrumentation results 555 instrument( f_instru , instru_name ); 556 557 // main thread close input file 558 close( fd_in ); 559 560 // main thread close output file 561 close( fd_out ); 562 563 // main thread close instrumentation file 564 fclose( f_instru ); 565 566 // main thread suicide 358 567 exit( 0 ); 359 568 … … 362 571 363 572 364 ////////////// 365 void execute() 573 574 575 576 /////////////////////////////////////////////////// 577 void execute( pthread_parallel_work_args_t * args ) 366 578 { 367 579 unsigned long long date; 368 580 369 // Each thread [x,y,p]initialises the convolution kernel parameters in local stack.581 // Each thread initialises the convolution kernel parameters in local stack. 370 582 // The values defined in the next 12 lines are Philips proprietary information. 371 583 … … 382 594 unsigned int hnorm = 201; 383 595 384 // get plat-form config 385 unsigned int x_size; // number of clusters in a row 386 unsigned int y_size; // number of clusters in a column 387 unsigned int ncores; // number of processors per cluster 388 get_config( &x_size , &y_size , &ncores ); 389 390 // get cluster indentifier and core local index 391 unsigned int cxy; 392 unsigned int lid; 393 get_core_id( &cxy , &lid ); 394 unsigned int x = HAL_X_FROM_CXY( cxy ); 395 unsigned int y = HAL_Y_FROM_CXY( cxy ); 596 // WARNING 597 //A thread is identified by the tid index, defined in the "args" structure. 598 // This index being in range [0,nclusters*ncores-1] we can always write 599 // tid == cid * ncores + lid 600 // with cid in [0,nclusters-1] and lid in [0,ncores-1]. 601 // if NO_PLACEMENT, there is no relation between these 602 // thread [cid][lid] indexes, and the core coordinates [cxy][lpid] 603 604 // get thread abstract identifiers 605 unsigned int tid = args->tid; 606 unsigned int cid = tid / ncores; 607 unsigned int lid = tid % ncores; 608 609 #if VERBOSE_EXEC 610 unsigned int cxy; // core cluster identifier 611 unsigned int lpid; // core local identifier 612 get_core_id( &cxy , &lpid ); 613 printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec\n", 614 tid , cxy , lpid ); 615 #endif 616 617 // build total number of threads and clusters from global variables 618 unsigned int nclusters = x_size * y_size; 619 unsigned int nthreads = nclusters * ncores; 396 620 397 621 // indexes for loops … … 401 625 unsigned int z; // vertical filter index 402 626 403 unsigned int nclusters = x_size * y_size; // number of clusters 404 unsigned int cluster_id = (x * y_size) + y; // continuous cluster index 405 unsigned int thread_id = (cluster_id * ncores) + lid; // continuous thread index 406 unsigned int nthreads = nclusters * ncores; // number of threads 407 unsigned int frame_size = FRAME_SIZE; // total size (bytes) 408 unsigned int lines_per_thread = NL / nthreads; // lines per thread 409 unsigned int lines_per_cluster = NL / nclusters; // lines per cluster 410 unsigned int pixels_per_thread = NP / nthreads; // columns per thread 411 unsigned int pixels_per_cluster = NP / nclusters; // columns per cluster 627 unsigned int lines_per_thread = NL / nthreads; 628 unsigned int lines_per_cluster = NL / nclusters; 629 unsigned int pixels_per_thread = NP / nthreads; 630 unsigned int pixels_per_cluster = NP / nclusters; 631 632 // compute number of pixels stored in one abstract cluster cid 633 unsigned int local_pixels = NL * NP / nclusters; 412 634 413 635 unsigned int first, last; 414 636 415 637 get_cycle( &date ); 416 START[cluster_id][lid] = (unsigned int)date; 417 418 // Each thread[cxy][0] allocate the global buffers in cluster cxy 638 START[cid][lid] = (unsigned int)date; 639 640 // Each thread[cid][0] allocates 5 local buffers, 641 // shared by all threads that have the same cid 419 642 if ( lid == 0 ) 420 643 { 421 422 #if VERBOSE 423 printf( "\n[convol] thread[%x,%d] enters malloc at cycle %d\n", 424 cxy , lid , (unsigned int)date ); 425 #endif 426 427 GA[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters) , cxy ); 428 GB[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); 429 GC[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); 430 GD[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); 431 GZ[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)/2 , cxy ); 432 433 #if VERBOSE 434 printf( "\n[convol] Shared Buffer Virtual Addresses in cluster %x\n" 435 "### GA = %x\n" 436 "### GB = %x\n" 437 "### GC = %x\n" 438 "### GD = %x\n" 439 "### GZ = %x\n", 440 cxy, 441 GA[cluster_id], 442 GB[cluster_id], 443 GC[cluster_id], 444 GD[cluster_id], 445 GZ[cluster_id] ); 644 GA[cid] = malloc( local_pixels * sizeof( unsigned short ) ); 645 GB[cid] = malloc( local_pixels * sizeof( int ) ); 646 GC[cid] = malloc( local_pixels * sizeof( int ) ); 647 GD[cid] = malloc( local_pixels * sizeof( int ) ); 648 GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) ); 649 650 if( (GA[cid] == NULL) || (GB[cid] == NULL) || (GC[cid] == NULL) || 651 (GD[cid] == NULL) || (GZ[cid] == NULL) ) 652 { 653 printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid ); 654 pthread_exit( &THREAD_EXIT_FAILURE ); 655 } 656 657 #if VERBOSE_EXEC 658 printf( "\n[convol] exec[%d] on core[%x,%d] allocated shared buffers\n" 659 "### GA = %x\n" 660 "### GB = %x\n" 661 "### GC = %x\n" 662 "### GD = %x\n" 663 "### GZ = %x\n", 664 tid, cxy , lpid, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] ); 446 665 #endif 447 666 … … 451 670 pthread_barrier_wait( &barrier ); 452 671 453 // Each thread[c xy,p] initialise in its private stack a copy of the454 // a rrays of pointers on the shared,distributed buffers.672 // Each thread[cid,lid] allocate and initialise in its private stack 673 // a copy of the arrays of pointers on the distributed buffers. 455 674 unsigned short * A[CLUSTERS_MAX]; 456 675 int * B[CLUSTERS_MAX]; … … 468 687 } 469 688 470 // Each thread[x,y,0] access the file containing the input image, to load 471 // the local A[c] buffer (frame_size / nclusters loaded in each cluster). 472 // Other threads are waiting on the barrier. 689 // Each thread[cid,0] access the file containing the input image, to load 690 // the local A[cid] buffer. Other threads are waiting on the barrier. 473 691 if ( lid==0 ) 474 692 { 475 unsigned int offset = (frame_size/nclusters)*cluster_id; 476 unsigned int size = frame_size/nclusters; 477 478 // seek the pointer in file 479 if ( fseek( f_image_in, 480 offset, 481 SEEK_SET ) ) 482 { 483 printf("\n[convol error] in %s : thread[%x,%d] cannot seek input file\n", 484 __FUNCTION__ , cxy , lid ); 485 pthread_exit( &THREAD_EXIT_FAILURE ); 486 } 487 488 if ( fread( A[cluster_id], 489 1, 490 size, 491 f_image_in ) != size ) 492 { 493 printf("\n[convol error] in %s : thread[%x,%d] cannot read input file\n", 494 __FUNCTION__ , cxy , lid ); 495 pthread_exit( &THREAD_EXIT_FAILURE ); 496 } 693 unsigned int size = local_pixels * sizeof( unsigned short ); 694 unsigned int offset = size * cid; 695 696 memcpy( A[cid], 697 image_in + offset, 698 size ); 497 699 498 #if VERBOSE 700 #if VERBOSE_EXEC 499 701 get_cycle( &date ); 500 printf( "\n[convol] thread [%x,%d] load input file at cycle %d\n",501 cxy , lid , (unsigned int)date);702 printf( "\n[convol] thread %d on core[%x,%d] load input file in A[%d]\n", 703 tid , cxy , lpid , cid ); 502 704 #endif 503 705 … … 505 707 506 708 // Optionnal parallel display of the initial image stored in A[c] buffers. 507 // Eah thread[ x,y,p] displays (NL/nthreads) lines. (one byte per pixel).709 // Eah thread[cid,lid] displays (NL/nthreads) lines. 508 710 509 711 if ( INITIAL_DISPLAY_ENABLE ) … … 516 718 line = offset + l; 517 719 720 // copy TA[cid] to TZ[cid] 518 721 for ( p = 0 ; p < NP ; p++ ) 519 722 { 520 TZ(c luster_id, line, p) = (unsigned char)(TA(cluster_id, line, p) >> 8);723 TZ(cid, line, p) = (unsigned char)(TA(cid, line, p) >> 8); 521 724 } 522 725 523 if (fbf_write( &TZ(cluster_id, line, 0), // first pixel in TZ 524 NP, // number of bytes 525 NP*(l + (thread_id * lines_per_thread)))) // offset in FBF 726 // display one line to frame buffer 727 if (fbf_write( &TZ(cid, line, 0), // first pixel in TZ 728 NP, // number of bytes 729 NP*(l + (tid * lines_per_thread)))) // offset in FBF 526 730 { 527 731 printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n", … … 531 735 } 532 736 533 #if VERBOSE 737 #if VERBOSE_EXEC 534 738 get_cycle( &date ); 535 printf( "\n[convol] thread[% x,%d] completes initial display at cycle %d\n",536 cxy , lid , (unsigned int)date);739 printf( "\n[convol] thread[%d] on core[%x,%d] completes initial display\n", 740 tid , cxy , lpid ); 537 741 #endif 538 742 … … 543 747 //////////////////////////////////////////////////////////// 544 748 // parallel horizontal filter : 545 // B <= transpose(FH(A))749 // B <= convol(FH(A)) 546 750 // D <= A - FH(A) 547 // Each thread computes (NL/nthreads) lines 751 // Each thread computes (NL/nthreads) lines. 548 752 // The image must be extended : 549 // if (z<0) TA(c luster_id,l,z) == TA(cluster_id,l,0)550 // if (z>NP-1) TA(c luster_id,l,z) == TA(cluster_id,l,NP-1)753 // if (z<0) TA(cid,l,z) == TA(cid,l,0) 754 // if (z>NP-1) TA(cid,l,z) == TA(cid,l,NP-1) 551 755 //////////////////////////////////////////////////////////// 552 756 553 757 get_cycle( &date ); 554 H_BEG[c luster_id][lid] = (unsigned int)date;555 556 #if VERBOSE 557 printf( "\n[convol] thread[% x,%d] starts horizontal filter at cycle %d\n",558 cxy , lid , (unsigned int)date);758 H_BEG[cid][lid] = (unsigned int)date; 759 760 #if VERBOSE_EXEC 761 printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n", 762 tid , cxy , lpid ); 559 763 #else 560 if ( (cxy == cxy_main) && (lid == lid_main))561 printf( "\n[convol] thread[% x,%d] starts horizontal filter at cycle %d\n",562 cxy , lid , (unsigned int)date);764 if ( tid == tid_main ) 765 printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n", 766 tid , cxy , lpid ); 563 767 #endif 564 768 … … 566 770 // first & last define which lines are handled by a given thread 567 771 568 first = t hread_id * lines_per_thread;772 first = tid * lines_per_thread; 569 773 last = first + lines_per_thread; 570 774 … … 626 830 627 831 get_cycle( &date ); 628 H_END[c luster_id][lid] = (unsigned int)date;629 630 #if VERBOSE 631 printf( "\n[convol] thread[% x,%d] completes horizontal filter at cycle %d\n",632 cxy , lid, (unsigned int)date);832 H_END[cid][lid] = (unsigned int)date; 833 834 #if VERBOSE_EXEC 835 printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n", 836 tid , cxy , lpid ); 633 837 #else 634 if ( (cxy == cxy_main) && (lid == lid_main))635 printf( "\n[convol] thread[% x,%d] completes horizontal filter at cycle %d\n",636 cxy , lid, (unsigned int)date);838 if ( tid == tid_main ) 839 printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n", 840 tid , cxy , lpid ); 637 841 #endif 638 842 … … 645 849 // Each thread computes (NP/nthreads) columns 646 850 // The image must be extended : 647 // if (l<0) TB(c luster_id,p,l) == TB(cluster_id,p,0)648 // if (l>NL-1) TB(c luster_id,p,l) == TB(cluster_id,p,NL-1)851 // if (l<0) TB(cid,p,l) == TB(cid,p,0) 852 // if (l>NL-1) TB(cid,p,l) == TB(cid,p,NL-1) 649 853 /////////////////////////////////////////////////////////////// 650 854 651 855 get_cycle( &date ); 652 V_BEG[c luster_id][lid] = (unsigned int)date;653 654 #if VERBOSE 655 printf( "\n[convol] thread[% x,%d] starts vertical filter at cycle %d\n",656 cxy , lid , (unsigned int)date);856 V_BEG[cid][lid] = (unsigned int)date; 857 858 #if VERBOSE_EXEC 859 printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n", 860 tid , cxy , lpid ); 657 861 #else 658 if ( (cxy == cxy_main) && (lid == lid_main))659 printf( "\n[convol] thread[% x,%d] starts vertical filter at cycle %d\n",660 cxy , lid, (unsigned int)date);862 if ( tid == tid_main ) 863 printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n", 864 tid , cxy , lpid ); 661 865 #endif 662 866 … … 664 868 // first & last define which pixels are handled by a given thread 665 869 666 first = t hread_id * pixels_per_thread;870 first = tid * pixels_per_thread; 667 871 last = first + pixels_per_thread; 668 872 … … 740 944 741 945 get_cycle( &date ); 742 V_END[c luster_id][lid] = (unsigned int)date;743 744 #if VERBOSE 745 printf( "\n[convol] thread[% x,%d] completes vertical filter at cycle %d\n",746 cxy , lid , (unsigned int)date);946 V_END[cid][lid] = (unsigned int)date; 947 948 #if VERBOSE_EXEC 949 printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n", 950 tid , cxy , lid ); 747 951 #else 748 if ( (cxy == cxy_main) && (lid == lid_main))749 printf( "\n[convol] thread[% x,%d] completes vertical filter at cycle %d\n",750 cxy , lid, (unsigned int)date);952 if ( tid == tid_main ) 953 printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n", 954 tid , cxy , lid ); 751 955 #endif 752 956 … … 755 959 756 960 // Optional parallel display of the final image Z <= D + C 757 // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).961 // Eah thread[x,y,p] displays (NL/nthreads) lines. 758 962 759 963 if ( FINAL_DISPLAY_ENABLE ) 760 964 { 761 965 get_cycle( &date ); 762 D_BEG[c luster_id][lid] = (unsigned int)date;763 764 #if VERBOSE 765 printf( "\n[convol] thread[% x,%d] starts final display at cycle %d\n",766 cxy , lid , (unsigned int)date);966 D_BEG[cid][lid] = (unsigned int)date; 967 968 #if VERBOSE_EXEC 969 printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n", 970 tid , cxy , lid ); 767 971 #else 768 if ( (cxy == cxy_main) && (lid == lid_main))769 printf( "\n[convol] thread[% x,%d] starts final display at cycle %d\n",770 cxy , lid, (unsigned int)date);972 if ( tid == tid_main ) 973 printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n", 974 tid , cxy , lid ); 771 975 #endif 772 976 … … 780 984 for ( p = 0 ; p < NP ; p++ ) 781 985 { 782 TZ(c luster_id, line, p) =783 (unsigned char)( (TD(c luster_id, line, p) +784 TC(c luster_id, line, p) ) >> 8 );986 TZ(cid, line, p) = 987 (unsigned char)( (TD(cid, line, p) + 988 TC(cid, line, p) ) >> 8 ); 785 989 } 786 990 787 if (fbf_write( &TZ(c luster_id, line, 0),// first pixel in TZ788 NP, 789 NP*(l + (t hread_id * lines_per_thread)))) // offset in FBF991 if (fbf_write( &TZ(cid, line, 0), // first pixel in TZ 992 NP, // number of bytes 993 NP*(l + (tid * lines_per_thread)))) // offset in FBF 790 994 { 791 printf("\n[convol error] in %s : thread[%d,%d,%d] cannot access FBF\n", 792 __FUNCTION__ , x , y , lid ); 995 printf("\n[convol error] thread[%d] cannot access FBF\n", tid ); 793 996 pthread_exit( &THREAD_EXIT_FAILURE ); 794 997 } … … 796 999 797 1000 get_cycle( &date ); 798 D_END[c luster_id][lid] = (unsigned int)date;799 800 #if VERBOSE 801 printf( "\n[convol] thread[% x,%d] completes final display at cycle %d\n",802 cxy , lid , (unsigned int)date);1001 D_END[cid][lid] = (unsigned int)date; 1002 1003 #if VERBOSE_EXEC 1004 printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n", 1005 tid , cxy , lid ); 803 1006 #else 804 if ( (cxy == cxy_main) && (lid == lid_main) ) 805 printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n", 806 cxy , lid , (unsigned int)date ); 807 #endif 808 809 //////////////////////////////// 810 pthread_barrier_wait( &barrier ); 1007 if ( tid == tid_main ) 1008 printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n", 1009 tid , cxy , lid ); 1010 #endif 1011 811 1012 } 812 1013 813 1014 // all threads (but the one executing main) exit 814 if ( (cxy != cxy_main) || (lid != lid_main))1015 if ( tid != tid_main ) 815 1016 { 816 1017 pthread_exit( &THREAD_EXIT_SUCCESS ); … … 821 1022 822 1023 823 ////////////////////////// ///////////////824 void instrument( unsigned int nclusters,825 unsigned int ncores)1024 ////////////////////////// 1025 void instrument( FILE * f, 1026 char * filename ) 826 1027 { 827 unsigned int cc, pp; 828 829 unsigned int min_start = 0xFFFFFFFF; 830 unsigned int max_start = 0; 831 832 unsigned int min_h_beg = 0xFFFFFFFF; 833 unsigned int max_h_beg = 0; 834 835 unsigned int min_h_end = 0xFFFFFFFF; 836 unsigned int max_h_end = 0; 837 838 unsigned int min_v_beg = 0xFFFFFFFF; 839 unsigned int max_v_beg = 0; 840 841 unsigned int min_v_end = 0xFFFFFFFF; 842 unsigned int max_v_end = 0; 843 844 unsigned int min_d_beg = 0xFFFFFFFF; 845 unsigned int max_d_beg = 0; 846 847 unsigned int min_d_end = 0xFFFFFFFF; 848 unsigned int max_d_end = 0; 849 850 for (cc = 0; cc < nclusters; cc++) 851 { 852 for (pp = 0; pp < ncores; pp++ ) 853 { 854 if (START[cc][pp] < min_start) min_start = START[cc][pp]; 855 if (START[cc][pp] > max_start) max_start = START[cc][pp]; 856 857 if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp]; 858 if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp]; 859 860 if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp]; 861 if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp]; 862 863 if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp]; 864 if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp]; 865 866 if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp]; 867 if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp]; 868 869 if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp]; 870 if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp]; 871 872 if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp]; 873 if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp]; 874 } 875 } 876 877 printf(" - START : min = %d / max = %d / med = %d / delta = %d\n", 878 min_start, max_start, (min_start+max_start)/2, max_start-min_start); 879 880 printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", 881 min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); 882 883 printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n", 884 min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); 885 886 printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", 887 min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); 888 889 printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n", 890 min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); 891 892 printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", 893 min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); 894 895 printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n", 896 min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); 897 898 printf( "\n General Scenario (Kcycles for each step)\n" ); 899 printf( " - BOOT OS = %d\n", (min_start )/1000 ); 900 printf( " - LOAD IMAGE = %d\n", (min_h_beg - min_start)/1000 ); 901 printf( " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); 902 printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); 903 printf( " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); 904 printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); 905 printf( " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); 906 907 // TODO save these results on f_instrum 1028 unsigned int nclusters = x_size * y_size; 1029 1030 unsigned int cc, pp; 1031 1032 unsigned int min_start = 0xFFFFFFFF; 1033 unsigned int max_start = 0; 1034 1035 unsigned int min_h_beg = 0xFFFFFFFF; 1036 unsigned int max_h_beg = 0; 1037 1038 unsigned int min_h_end = 0xFFFFFFFF; 1039 unsigned int max_h_end = 0; 1040 1041 unsigned int min_v_beg = 0xFFFFFFFF; 1042 unsigned int max_v_beg = 0; 1043 1044 unsigned int min_v_end = 0xFFFFFFFF; 1045 unsigned int max_v_end = 0; 1046 1047 unsigned int min_d_beg = 0xFFFFFFFF; 1048 unsigned int max_d_beg = 0; 1049 1050 unsigned int min_d_end = 0xFFFFFFFF; 1051 unsigned int max_d_end = 0; 1052 1053 for (cc = 0; cc < nclusters; cc++) 1054 { 1055 for (pp = 0; pp < ncores; pp++ ) 1056 { 1057 if (START[cc][pp] < min_start) min_start = START[cc][pp]; 1058 if (START[cc][pp] > max_start) max_start = START[cc][pp]; 1059 1060 if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp]; 1061 if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp]; 1062 1063 if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp]; 1064 if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp]; 1065 1066 if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp]; 1067 if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp]; 1068 1069 if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp]; 1070 if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp]; 1071 1072 if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp]; 1073 if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp]; 1074 1075 if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp]; 1076 if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp]; 1077 } 1078 } 1079 1080 // display on terminal 1081 printf( "\n ------ %s ------\n" , filename ); 1082 1083 printf(" - START : min = %d / max = %d / med = %d / delta = %d\n", 1084 min_start, max_start, (min_start+max_start)/2, max_start-min_start); 1085 1086 printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1087 min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); 1088 1089 printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n", 1090 min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); 1091 1092 printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1093 min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); 1094 1095 printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n", 1096 min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); 1097 1098 printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1099 min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); 1100 1101 printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n", 1102 min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); 1103 1104 printf( "\n General Scenario (Kcycles for each step)\n" ); 1105 printf( " - LOAD IMAGE = %d\n", (min_h_beg - min_start)/1000 ); 1106 printf( " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); 1107 printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); 1108 printf( " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); 1109 printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); 1110 printf( " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); 1111 printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME ); 1112 1113 // save on disk 1114 fprintf( f , "\n ------ %s ------\n" , filename ); 1115 1116 fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n", 1117 min_start, max_start, (min_start+max_start)/2, max_start-min_start); 1118 1119 fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1120 min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); 1121 1122 fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n", 1123 min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); 1124 1125 fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1126 min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); 1127 1128 fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n", 1129 min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); 1130 1131 fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1132 min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); 1133 1134 fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n", 1135 min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); 1136 1137 fprintf( f , "\n General Scenario (Kcycles)\n" ); 1138 fprintf( f , " - LOAD IMAGE = %d\n", (min_h_beg - min_start)/1000 ); 1139 fprintf( f , " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); 1140 fprintf( f , " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); 1141 fprintf( f , " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); 1142 fprintf( f , " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); 1143 fprintf( f , " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); 1144 fprintf( f , " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME ); 908 1145 909 1146 } // end instrument()
Note: See TracChangeset
for help on using the changeset viewer.