Changeset 652 for trunk/user
- Timestamp: Nov 14, 2019, 3:56:51 PM
- Location: trunk/user
- Files: 7 edited
trunk/user/convol/convol.c
r645 r652 5 5 /////////////////////////////////////////////////////////////////////////////////////// 6 6 // This multi-threaded application implements a 2D convolution product. 7 // It can run on a multi- processors, multi-clusters architecture, with one thread8 // per processor, and uses the POSIX threads API.7 // It can run on a multi-cores, multi-clusters architecture, with one thread 8 // per core, and uses the POSIX threads API. 9 9 // 10 10 // The main() function can be launched on any processor P[x,y,l]. … … 14 14 // when the parallel execution is completed. 15 15 // 16 // The convolution kernel is [201]*[35] pixels, but it can be factored in two17 // independant line and column convolution products.16 // The convolution kernel is defined in the execute() function. 17 // It can be factored in two independant line and column convolution products. 18 18 // The five buffers containing the image are distributed in clusters. 19 // For the philips image, it is a [201]*[35] pixels rectangle, and the. 19 20 // 20 21 // The (1024 * 1024) pixels image is read from a file (2 bytes per pixel). 21 22 // 22 23 // - number of clusters containing processors must be power of 2 no larger than 256. 23 // - number of processors per cluster must be power of 2 no larger than 8. 24 // - number of processors per cluster must be power of 2 no larger than 4. 25 // 26 // The number N of working threads is always defined by the number of cores availables 27 // in the architecture, but this application supports three placement modes. 28 // In all modes, the working threads are identified by the [tid] continuous index 29 // in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads. 30 // This continuous index can always be decomposed in two continuous sub-indexes: 31 // tid == cid * ncores + lid, where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1]. 32 // 33 // - NO_PLACEMENT: the main thread is itsef a working thread. The (N_1) other working 34 // threads are created by the main thread, but the placement is done by the OS, using 35 // the DQDT for load balancing, and two working threads can be placed on the same core. 36 // The [cid,lid] are only abstract identifiers, and cannot be associated to a physical 37 // cluster or a physical core. In this mode, the main thread run on any cluster, 38 // but has tid = 0 (i.e. cid = 0 & tid = 0). 39 // 40 // - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement of 41 // of the threads on the cores is explicitely controled by the main thread to have 42 // exactly one working thread per core, and the [cxy][lpid] core coordinates for a given 43 // thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the 44 // physical cluster identifier, and [lid] is the local core index. 45 // 46 // - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the 47 // non standard pthread_parallel_create() function to avoid the costly sequencial 48 // loops for pthread_create() and pthread_join(). It garanty one working thread 49 // per core, and the same relation between the thread[tid] and the core[cxy][lpid]. 50 // 51 // The [tid] continuous index defines how the work is shared amongst the threads: 52 // - each thread handles NL/nthreads lines for the horizontal filter. 53 // - each thread handles NP/nthreads columns for the vertical filter. 
24 54 /////////////////////////////////////////////////////////////////////////////////////// 25 55 56 #include <sys/mman.h> 26 57 #include <stdio.h> 27 58 #include <stdlib.h> … … 29 60 #include <unistd.h> 30 61 #include <pthread.h> 62 #include <string.h> 31 63 #include <almosmkh.h> 32 64 #include <hal_macros.h> 33 65 34 #define IMAGE_IN_PATH "misc/philips_1024.raw" 35 36 #define USE_SQT_BARRIER 1 37 #define VERBOSE 1 38 #define SUPER_VERBOSE 0 39 40 #define USE_DQT_BARRIER 1 66 #define VERBOSE_MAIN 1 67 #define VERBOSE_EXEC 1 41 68 42 69 #define X_MAX 16 43 70 #define Y_MAX 16 44 #define PROCS_MAX 471 #define CORES_MAX 4 45 72 #define CLUSTERS_MAX (X_MAX * Y_MAX) 46 #define THREADS_MAX (X_MAX * Y_MAX * PROCS_MAX] 47 48 #define INITIAL_DISPLAY_ENABLE 1 49 #define FINAL_DISPLAY_ENABLE 1 50 51 #define PIXEL_SIZE 2 // input image has 2 bytes per pixel 52 #define FBF_TYPE 420 // output image has 1 byte per pixel 53 73 #define THREADS_MAX (X_MAX * Y_MAX * CORES_MAX) 74 75 #define IMAGE_IN_PATH "misc/philips_1024_2.raw" 76 #define IMAGE_IN_PIXEL_SIZE 2 // 2 bytes per pixel 77 78 #define IMAGE_OUT_PATH "misc/philips_after_1O24.raw" 79 #define IMAGE_OUT_PIXEL_SIZE 1 // 1 bytes per pixel 80 81 #define FBF_TYPE 420 54 82 #define NL 1024 55 83 #define NP 1024 56 84 #define NB_PIXELS (NP * NL) 57 #define FRAME_SIZE (NB_PIXELS * PIXEL_SIZE) 58 85 86 #define NO_PLACEMENT 0 87 #define EXPLICIT_PLACEMENT 0 88 #define PARALLEL_PLACEMENT 1 89 90 #define USE_DQT_BARRIER 1 91 #define INITIAL_DISPLAY_ENABLE 1 92 #define FINAL_DISPLAY_ENABLE 1 59 93 60 94 #define TA(c,l,p) (A[c][((NP) * (l)) + (p)]) … … 68 102 69 103 ////////////////////////////////////////////////////////// 70 // global variables stored in seg_data in cluster[0,0]104 // global variables 71 105 ////////////////////////////////////////////////////////// 72 106 73 // Instrumentation counters (cluster_id, lpid] 74 unsigned int START[CLUSTERS_MAX][PROCS_MAX]; 75 unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX]; 76 unsigned int H_END[CLUSTERS_MAX][PROCS_MAX]; 77 unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX]; 78 unsigned int V_END[CLUSTERS_MAX][PROCS_MAX]; 79 unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX]; 80 unsigned int D_END[CLUSTERS_MAX][PROCS_MAX]; 81 82 // file pointers on input image 83 FILE * f_image_in; 84 FILE * f_instrum; 107 // global instrumentation counters for the main thread 108 unsigned int SEQUENCIAL_TIME = 0; 109 unsigned int PARALLEL_TIME = 0; 110 111 // instrumentation counters for thread[tid] in cluster[cid] 112 unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 113 unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 114 unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 115 unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 116 unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 117 unsigned int D_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 118 unsigned int D_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 119 120 // pointer on buffer containing the input image, maped by the main to the input file 121 unsigned char * image_in; 122 123 // pointer on buffer containing the output image, maped by the main to the output file 124 unsigned char * image_out; 85 125 86 126 // return values at thread exit … … 91 131 pthread_barrier_t barrier; 92 132 93 // coordinates of core executing the main thread 94 unsigned int cxy_main; 95 unsigned int lid_main; 133 // platform parameters 134 unsigned int x_size; // number of clusters in a row 135 unsigned int y_size; // number of clusters in a column 136 unsigned int ncores; // number of processors 
per cluster 96 137 97 138 // arrays of pointers on distributed buffers in all clusters 98 139 unsigned short * GA[CLUSTERS_MAX]; 99 int * GB[CLUSTERS_MAX]; 100 int * GC[CLUSTERS_MAX]; 101 int * GD[CLUSTERS_MAX]; 102 unsigned char * GZ[CLUSTERS_MAX]; 103 104 // trdid[] array for execution threads 105 // 1D array if no explicit threads placement / 2D array if explicit placement 106 pthread_t trdid[CLUSTERS_MAX][PROCS_MAX]; 107 //pthread_t trdid[THREADS_MAX]; 108 109 // attr[] array for execution threads 110 // unused if no explicit threads placement 111 pthread_attr_t attr[CLUSTERS_MAX][PROCS_MAX]; 140 int * GB[CLUSTERS_MAX]; 141 int * GC[CLUSTERS_MAX]; 142 int * GD[CLUSTERS_MAX]; 143 unsigned char * GZ[CLUSTERS_MAX]; 144 145 // array of threads kernel identifiers / indexed by [tid] 146 pthread_t exec_trdid[THREADS_MAX]; 147 148 // array of threads attributes / indexed bi [tid] 149 pthread_attr_t exec_attr[THREADS_MAX]; 150 151 // array of execute() function arguments / indexed by [tid] 152 pthread_parallel_work_args_t exec_args[THREADS_MAX]; 153 154 // main thread continuous index 155 unsigned int tid_main; 112 156 113 157 ///////////////////////////////////////////////////////////////////////////////////// … … 115 159 ///////////////////////////////////////////////////////////////////////////////////// 116 160 117 void execute( void ); 118 119 void instrument( unsigned int nclusters, 120 unsigned int ncores ); 161 void execute( pthread_parallel_work_args_t * args ); 162 163 void instrument( FILE * f , char * filename ); 121 164 122 165 ///////////////// 123 166 void main( void ) 124 167 { 125 unsigned int x_size; // number of clusters in a row 126 unsigned int y_size; // number of clusters in a column 127 unsigned int ncores; // number of processors per cluster 128 129 unsigned long long date; 130 131 char name[64]; // instrumentation file name 132 char path[128]; // instrumentation path name 168 unsigned long long start_cycle; 169 unsigned long long end_sequencial_cycle; 170 unsigned long long end_parallel_cycle; 133 171 134 172 int error; 135 173 136 // get platform parameters 137 if ( get_config( &x_size , &y_size , &ncores ) ) 138 { 139 printf("\n[convol error] cannot get hardware configuration\n"); 174 char instru_name[32]; // instrumentation file name 175 char instru_path[64]; // instrumentation path name 176 177 ///////////////////////////////////////////////////////////////////////////////// 178 get_cycle( &start_cycle ); 179 ///////////////////////////////////////////////////////////////////////////////// 180 181 if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 ) 182 { 183 printf("\n[convol error] illegal placement\n"); 140 184 exit( 0 ); 141 185 } 142 186 143 // get core executing this main thread 144 // and register these coordinates in global variables 145 get_core_id( &cxy_main , &lid_main ); 146 147 // check ncores 148 if( (ncores != 1) && (ncores != 2) && (ncores != 4) ) 187 // get & check platform parameters 188 get_config( &x_size , &y_size , &ncores ); 189 190 if((ncores != 1) && (ncores != 2) && (ncores != 4)) 149 191 { 150 192 printf("\n[convol error] number of cores per cluster must be 1/2/4\n"); … … 152 194 } 153 195 154 // check x_size155 if( (x_size != 1) && (x_size != 2) && (x_size != 4) &&(x_size != 8) && (x_size != 16) )196 if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 197 (x_size != 8) && (x_size != 16) ) 156 198 { 157 199 printf("\n[convol error] x_size must be 1/2/4/8/16\n"); 158 200 exit( 0 ); 159 201 } 160 161 // check y_size162 if( 
(y_size != 1) && (y_size != 2) && (y_size != 4) &&(y_size != 8) && (y_size != 16) )202 203 if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 204 (y_size != 8) && (y_size != 16) ) 163 205 { 164 206 printf("\n[convol error] y_size must be 1/2/4/8/16\n"); 165 207 exit( 0 ); 166 208 } 209 210 // main thread get identifiers for core executing main 211 unsigned int cxy_main; 212 unsigned int lid_main; 213 get_core_id( &cxy_main , &lid_main ); 167 214 168 215 // compute nthreads and nclusters 169 unsigned int nthreads = x_size * y_size * ncores;170 216 unsigned int nclusters = x_size * y_size; 171 172 get_cycle( &date ); 173 printf("\n[convol] starts on core[%x,%d] / %d thread(s) / cycle %d\n", 174 cxy_main, lid_main, nthreads, (unsigned int)date ); 175 176 // build instrumentation file name 177 if( USE_DQT_BARRIER ) 178 snprintf( name , 64 , "p_convol_dqt_%d_%d", x_size * y_size , ncores ); 179 else 180 snprintf( name , 64 , "p_convol_smp_%d_%d", x_size * y_size , ncores ); 181 182 // build pathname 183 snprintf( path , 128 , "/home/%s", name ); 217 unsigned int nthreads = nclusters * ncores; 218 219 // main thread get FBF size and type 220 unsigned int fbf_width; 221 unsigned int fbf_height; 222 unsigned int fbf_type; 223 fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); 224 225 if( (fbf_width != NP) || (fbf_height != NL) || (fbf_type != FBF_TYPE) ) 226 { 227 printf("\n[convol error] image does not fit FBF size or type\n"); 228 exit( 0 ); 229 } 230 231 if( nthreads > NL ) 232 { 233 printf("\n[convol error] number of threads larger than number of lines\n"); 234 exit( 0 ); 235 } 236 237 // define instrumentation file name 238 if( NO_PLACEMENT ) 239 { 240 printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n", 241 nclusters, ncores, fbf_width, fbf_height, getpid() ); 242 243 // build instrumentation file name 244 if( USE_DQT_BARRIER ) 245 snprintf( instru_name , 32 , "conv_dqt_no_place_%d_%d", x_size * y_size , ncores ); 246 else 247 snprintf( instru_name , 32 , "conv_smp_no_place_%d_%d", x_size * y_size , ncores ); 248 } 249 250 if( EXPLICIT_PLACEMENT ) 251 { 252 printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n", 253 nclusters, ncores, fbf_width, fbf_height, getpid() ); 254 255 // build instrumentation file name 256 if( USE_DQT_BARRIER ) 257 snprintf( instru_name , 32 , "conv_dqt_explicit_%d_%d_%d", x_size * y_size , ncores ); 258 else 259 snprintf( instru_name , 32 , "conv_smp_explicit_%d_%d_%d", x_size * y_size , ncores ); 260 } 261 262 if( PARALLEL_PLACEMENT ) 263 { 264 printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n", 265 nclusters, ncores, fbf_width, fbf_height, getpid() ); 266 267 // build instrumentation file name 268 if( USE_DQT_BARRIER ) 269 snprintf( instru_name , 32 , "conv_dqt_parallel_%d_%d_%d", x_size * y_size , ncores ); 270 else 271 snprintf( instru_name , 32 , "conv_smp_parallel_%d_%d_%d", x_size * y_size , ncores ); 272 } 184 273 185 274 // open instrumentation file 186 f_instrum = fopen( path , NULL ); 187 if ( f_instrum == NULL ) 275 snprintf( instru_path , 64 , "/home/%s", instru_name ); 276 FILE * f_instru = fopen( instru_path , NULL ); 277 if ( f_instru == NULL ) 188 278 { 189 printf("\n[convol error] cannot open instrumentation file <%s>\n",path );279 printf("\n[convol error] cannot open instrumentation file %s\n", instru_path ); 190 280 exit( 0 ); 191 281 } 192 282 193 #if DEBUG_MAIN 194 get_cycle( &date ); 195 printf("\n[convol] main on core[%x,%d] open file <%s> at 
cycle %d\n", 196 cxy_main, lid_main, path, (unsigned int)date ); 197 #endif 198 199 // open input file 200 f_image_in = fopen( IMAGE_IN_PATH , NULL ); 201 if ( f_image_in == NULL ) 202 { 203 printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH ); 204 exit( 0 ); 205 } 206 207 #if DEBUG_MAIN 208 get_cycle( &date ); 209 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n", 210 cxy_main, lid_main, path, (unsigned int)date ); 211 #endif 212 213 // get FBF config 214 unsigned int fbf_width; 215 unsigned int fbf_height; 216 unsigned int fbf_type; 217 fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); 218 219 // check FBF size 220 if ( (fbf_width != NP) || (fbf_height != NL) ) 221 { 222 printf("\n[convol error] bad FBF size\n"); 223 exit( 0 ); 224 } 225 226 // check FBF subsampling 227 if ( fbf_type != FBF_TYPE ) 228 { 229 printf("\n[convol error] bad FBF subsampling\n"); 230 exit( 0 ); 231 } 232 233 // initialise barrier 283 #if VERBOSE_MAIN 284 printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n", 285 cxy_main, lid_main, instru_path ); 286 #endif 287 288 // main initialise barrier 234 289 if( USE_DQT_BARRIER ) 235 290 { … … 251 306 } 252 307 253 get_cycle( &date ); 254 printf("\n[convol] main on core[%x,%d] completes initialisation at cycle %d\n" 255 "- CLUSTERS = %d\n" 256 "- PROCS = %d\n" 257 "- THREADS = %d\n", 258 cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads ); 259 260 // launch exec threads with explicit placement 261 unsigned int x; 262 unsigned int y; 263 unsigned int l; 264 unsigned int cxy; 265 266 for( x = 0 ; x < x_size ; x++ ) 267 { 268 for( y = 0 ; y < y_size ; y++ ) 269 { 270 cxy = HAL_CXY_FROM_XY(x,y); 271 for( l = 0 ; l < ncores ; l++ ) 272 { 273 // no other thread on the core running the main 274 if( (cxy != cxy_main) || (l != lid_main) ) 275 { 276 // define thread attributes 277 attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 278 attr[cxy][l].cxy = cxy; 279 attr[cxy][l].lid = l; 308 #if VERBOSE_MAIN 309 printf("\n[convol] main on core[%x,%d] completes barrier init\n", 310 cxy_main, lid_main ); 311 #endif 312 313 // main open input file 314 int fd_in = open( IMAGE_IN_PATH , O_RDONLY , 0 ); 315 316 if ( fd_in < 0 ) 317 { 318 printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH ); 319 exit( 0 ); 320 } 321 322 #if VERBOSE_MAIN 323 printf("\n[convol] main on core[%x,%d] open file <%s>\n", 324 cxy_main, lid_main, IMAGE_IN_PATH ); 325 #endif 326 327 // main thread map image_in buffer to input file 328 image_in = (unsigned char *)mmap( NULL, 329 NB_PIXELS * IMAGE_IN_PIXEL_SIZE, 330 PROT_READ, 331 MAP_FILE | MAP_SHARED, 332 fd_in, 333 0 ); // offset 334 if ( image_in == NULL ) 335 { 336 printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_IN_PATH ); 337 exit( 0 ); 338 } 339 340 #if VERBOSE_MAIN 341 printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n", 342 cxy_main, lid_main, IMAGE_IN_PATH ); 343 #endif 344 345 // main thread open output file 346 int fd_out = open( IMAGE_OUT_PATH , O_CREAT , 0 ); 347 348 if ( fd_out < 0 ) 349 { 350 printf("\n[convol error] main cannot open file %s\n", IMAGE_OUT_PATH ); 351 exit( 0 ); 352 } 353 354 #if VERBOSE_MAIN 355 printf("\n[convol] main on core[%x,%d] open file <%s>\n", 356 cxy_main, lid_main, IMAGE_OUT_PATH ); 357 #endif 358 359 // main thread map image_out buffer to output file 360 image_out = (unsigned char *)mmap( NULL, 361 NB_PIXELS + IMAGE_OUT_PIXEL_SIZE, 362 PROT_WRITE, 363 MAP_FILE | 
MAP_SHARED, 364 fd_out, 365 0 ); // offset 366 if ( image_out == NULL ) 367 { 368 printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_OUT_PATH ); 369 exit( 0 ); 370 } 371 372 #if VERBOSE_MAIN 373 printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n", 374 cxy_main, lid_main, IMAGE_OUT_PATH ); 375 #endif 376 377 ///////////////////////////////////////////////////////////////////////////////////// 378 get_cycle( &end_sequencial_cycle ); 379 SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle); 380 ///////////////////////////////////////////////////////////////////////////////////// 381 382 ////////////////// 383 if( NO_PLACEMENT ) 384 { 385 // the tid value for the main thread is always 0 386 // main thread creates new threads with tid in [1,nthreads-1] 387 unsigned int tid; 388 for ( tid = 0 ; tid < nthreads ; tid++ ) 389 { 390 // register tid value in exec_args[tid] array 391 exec_args[tid].tid = tid; 392 393 // create other threads 394 if( tid > 0 ) 395 { 396 if ( pthread_create( &exec_trdid[tid], 397 NULL, // no attribute 398 &execute, 399 &exec_args[tid] ) ) 400 { 401 printf("\n[convol error] cannot create thread %d\n", tid ); 402 exit( 0 ); 403 } 404 405 #if VERBOSE_MAIN 406 printf("\n[convol] main created thread %d\n", tid ); 407 #endif 408 409 } 410 else 411 { 412 tid_main = 0; 413 } 414 } // end for tid 415 416 // main thread calls itself the execute() function 417 execute( &exec_args[0] ); 418 419 // main thread wait other threads completion 420 for ( tid = 1 ; tid < nthreads ; tid++ ) 421 { 422 unsigned int * status; 423 424 // main wait thread[tid] status 425 if ( pthread_join( exec_trdid[tid], (void*)(&status)) ) 426 { 427 printf("\n[convol error] main cannot join thread %d\n", tid ); 428 exit( 0 ); 429 } 430 431 // check status 432 if( *status != THREAD_EXIT_SUCCESS ) 433 { 434 printf("\n[convol error] thread %x returned failure\n", tid ); 435 exit( 0 ); 436 } 437 438 #if VERBOSE_MAIN 439 printf("\n[convol] main successfully joined thread %x\n", tid ); 440 #endif 441 442 } // end for tid 443 444 } // end if no_placement 445 446 //////////////////////// 447 if( EXPLICIT_PLACEMENT ) 448 { 449 // main thread places each other threads on a specific core[cxy][lid] 450 // but the actual thread creation is sequencial 451 unsigned int x; 452 unsigned int y; 453 unsigned int l; 454 unsigned int cxy; // cluster identifier 455 unsigned int tid; // thread continuous index 456 457 for( x = 0 ; x < x_size ; x++ ) 458 { 459 for( y = 0 ; y < y_size ; y++ ) 460 { 461 cxy = HAL_CXY_FROM_XY( x , y ); 462 for( l = 0 ; l < ncores ; l++ ) 463 { 464 // compute thread continuous index 465 tid = (((x * y_size) + y) * ncores) + l; 466 467 // register tid value in exec_args[tid] array 468 exec_args[tid].tid = tid; 469 470 // no thread created on the core running the main 471 if( (cxy != cxy_main) || (l != lid_main) ) 472 { 473 // define thread attributes 474 exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | 475 PT_ATTR_CORE_DEFINED; 476 exec_attr[tid].cxy = cxy; 477 exec_attr[tid].lid = l; 280 478 281 // create thread on core[x,y,l] 282 if (pthread_create( &trdid[cxy][l], 283 &attr[cxy][l], 284 &execute, 285 NULL ) ) // execute has no argument 286 { 287 printf("\n[convol error] created thread %x on core[%x][%d]\n", 288 trdid[cxy][l] , cxy , l ); 289 exit( 0 ); 290 } 291 } 292 } 293 } 294 } 295 296 /* 297 // launch other threads without explicit placement 298 for ( n = 1 ; n < nthreads ; n++ ) 299 { 300 if ( giet_pthread_create( &trdid[n], 301 NULL, // no 
attribute 302 &execute, 303 NULL ) ) // no argument 304 { 305 printf("\n[convol error] creating thread %x\n", trdid[n] ); 306 exit( 0 ); 307 } 308 } 309 */ 310 311 // the main thread run itself the execute() function 312 execute(); 313 314 // wait other threads completions if explicit threads placement 315 for( x = 0 ; x < x_size ; x++ ) 316 { 317 for( y = 0 ; y < y_size ; y++ ) 318 { 319 unsigned int cxy = HAL_CXY_FROM_XY(x,y); 320 for( l = 0 ; l < ncores ; l++ ) 321 { 322 // no other thread on the core running the main 323 if( (cxy != cxy_main) || (l != lid_main) ) 324 { 325 unsigned int * exit_status; 326 327 // wait thread running on core[x,y,l] 328 if (pthread_join( trdid[cxy][l] , (void*)(&exit_status) ) ) 479 // create thread[tid] on core[cxy][l] 480 if ( pthread_create( &exec_trdid[tid], 481 &exec_attr[tid], 482 &execute, 483 &exec_args[tid] ) ) 484 { 485 printf("\n[convol error] cannot create thread %d\n", tid ); 486 exit( 0 ); 487 } 488 #if VERBOSE_MAIN 489 printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l ); 490 #endif 491 } 492 else 329 493 { 330 printf("\n[convol error] main cannot join thread[%x,%d]\n", cxy, l ); 331 exit( 0 ); 332 } 333 334 // check exit_status 335 if( *exit_status != 0 ) 336 { 337 printf("\n[convol error] thread[%x,%d]return failure\n", cxy, l ); 338 exit( 0 ); 494 tid_main = tid; 339 495 } 340 496 } 341 497 } 342 498 } 343 } 344 /* 345 // wait other threads completion when no explicit threads placement 346 for ( n = 1 ; n < nthreads ; n++ ) 347 { 348 if ( pthread_join( trdid[n], NULL ) ) 349 { 350 printf("\n[convol error] joining thread %x\n", trdid[n] ); 499 500 // main thread calls itself the execute() function 501 execute( &exec_args[tid_main] ); 502 503 // main thread wait other threads completion 504 for( tid = 0 ; tid < nthreads ; tid++ ) 505 { 506 // no other thread on the core running the main 507 if( tid != tid_main ) 508 { 509 unsigned int * status; 510 511 // wait thread[tid] 512 if( pthread_join( exec_trdid[tid] , (void*)(&status) ) ) 513 { 514 printf("\n[convol error] main cannot join thread %d\n", tid ); 515 exit( 0 ); 516 } 517 518 // check status 519 if( *status != THREAD_EXIT_SUCCESS ) 520 { 521 printf("\n[convol error] thread %d returned failure\n", tid ); 522 exit( 0 ); 523 } 524 #if VERBOSE_MAIN 525 printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l ); 526 #endif 527 } 528 } 529 } // end if explicit_placement 530 531 //////////////////////// 532 if( PARALLEL_PLACEMENT ) 533 { 534 // compute covering DQT size an level 535 unsigned int z = (x_size > y_size) ? x_size : y_size; 536 unsigned int root_level = ((z == 1) ? 0 : 537 ((z == 2) ? 1 : 538 ((z == 4) ? 2 : 539 ((z == 8) ? 
3 : 4)))); 540 541 // create & execute the working threads 542 if( pthread_parallel_create( root_level , &execute ) ) 543 { 544 printf("\n[convol error] in %s\n", __FUNCTION__ ); 351 545 exit( 0 ); 352 546 } 353 } 354 */ 355 // call the instrument() function 356 instrument( nclusters , ncores ); 357 547 } // end if parallel_placement 548 549 ///////////////////////////////////////////////////////////////////////////// 550 get_cycle( &end_parallel_cycle ); 551 PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle); 552 ///////////////////////////////////////////////////////////////////////////// 553 554 // main thread register instrumentation results 555 instrument( f_instru , instru_name ); 556 557 // main thread close input file 558 close( fd_in ); 559 560 // main thread close output file 561 close( fd_out ); 562 563 // main thread close instrumentation file 564 fclose( f_instru ); 565 566 // main thread suicide 358 567 exit( 0 ); 359 568 … … 362 571 363 572 364 ////////////// 365 void execute() 573 574 575 576 /////////////////////////////////////////////////// 577 void execute( pthread_parallel_work_args_t * args ) 366 578 { 367 579 unsigned long long date; 368 580 369 // Each thread [x,y,p]initialises the convolution kernel parameters in local stack.581 // Each thread initialises the convolution kernel parameters in local stack. 370 582 // The values defined in the next 12 lines are Philips proprietary information. 371 583 … … 382 594 unsigned int hnorm = 201; 383 595 384 // get plat-form config 385 unsigned int x_size; // number of clusters in a row 386 unsigned int y_size; // number of clusters in a column 387 unsigned int ncores; // number of processors per cluster 388 get_config( &x_size , &y_size , &ncores ); 389 390 // get cluster indentifier and core local index 391 unsigned int cxy; 392 unsigned int lid; 393 get_core_id( &cxy , &lid ); 394 unsigned int x = HAL_X_FROM_CXY( cxy ); 395 unsigned int y = HAL_Y_FROM_CXY( cxy ); 596 // WARNING 597 //A thread is identified by the tid index, defined in the "args" structure. 598 // This index being in range [0,nclusters*ncores-1] we can always write 599 // tid == cid * ncores + lid 600 // with cid in [0,nclusters-1] and lid in [0,ncores-1]. 
601 // if NO_PLACEMENT, there is no relation between these 602 // thread [cid][lid] indexes, and the core coordinates [cxy][lpid] 603 604 // get thread abstract identifiers 605 unsigned int tid = args->tid; 606 unsigned int cid = tid / ncores; 607 unsigned int lid = tid % ncores; 608 609 #if VERBOSE_EXEC 610 unsigned int cxy; // core cluster identifier 611 unsigned int lpid; // core local identifier 612 get_core_id( &cxy , &lpid ); 613 printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec\n", 614 tid , cxy , lpid ); 615 #endif 616 617 // build total number of threads and clusters from global variables 618 unsigned int nclusters = x_size * y_size; 619 unsigned int nthreads = nclusters * ncores; 396 620 397 621 // indexes for loops … … 401 625 unsigned int z; // vertical filter index 402 626 403 unsigned int nclusters = x_size * y_size; // number of clusters 404 unsigned int cluster_id = (x * y_size) + y; // continuous cluster index 405 unsigned int thread_id = (cluster_id * ncores) + lid; // continuous thread index 406 unsigned int nthreads = nclusters * ncores; // number of threads 407 unsigned int frame_size = FRAME_SIZE; // total size (bytes) 408 unsigned int lines_per_thread = NL / nthreads; // lines per thread 409 unsigned int lines_per_cluster = NL / nclusters; // lines per cluster 410 unsigned int pixels_per_thread = NP / nthreads; // columns per thread 411 unsigned int pixels_per_cluster = NP / nclusters; // columns per cluster 627 unsigned int lines_per_thread = NL / nthreads; 628 unsigned int lines_per_cluster = NL / nclusters; 629 unsigned int pixels_per_thread = NP / nthreads; 630 unsigned int pixels_per_cluster = NP / nclusters; 631 632 // compute number of pixels stored in one abstract cluster cid 633 unsigned int local_pixels = NL * NP / nclusters; 412 634 413 635 unsigned int first, last; 414 636 415 637 get_cycle( &date ); 416 START[cluster_id][lid] = (unsigned int)date; 417 418 // Each thread[cxy][0] allocate the global buffers in cluster cxy 638 START[cid][lid] = (unsigned int)date; 639 640 // Each thread[cid][0] allocates 5 local buffers, 641 // shared by all threads that have the same cid 419 642 if ( lid == 0 ) 420 643 { 421 422 #if VERBOSE 423 printf( "\n[convol] thread[%x,%d] enters malloc at cycle %d\n", 424 cxy , lid , (unsigned int)date ); 425 #endif 426 427 GA[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters) , cxy ); 428 GB[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); 429 GC[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); 430 GD[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); 431 GZ[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)/2 , cxy ); 432 433 #if VERBOSE 434 printf( "\n[convol] Shared Buffer Virtual Addresses in cluster %x\n" 435 "### GA = %x\n" 436 "### GB = %x\n" 437 "### GC = %x\n" 438 "### GD = %x\n" 439 "### GZ = %x\n", 440 cxy, 441 GA[cluster_id], 442 GB[cluster_id], 443 GC[cluster_id], 444 GD[cluster_id], 445 GZ[cluster_id] ); 644 GA[cid] = malloc( local_pixels * sizeof( unsigned short ) ); 645 GB[cid] = malloc( local_pixels * sizeof( int ) ); 646 GC[cid] = malloc( local_pixels * sizeof( int ) ); 647 GD[cid] = malloc( local_pixels * sizeof( int ) ); 648 GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) ); 649 650 if( (GA[cid] == NULL) || (GB[cid] == NULL) || (GC[cid] == NULL) || 651 (GD[cid] == NULL) || (GZ[cid] == NULL) ) 652 { 653 printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid ); 654 pthread_exit( &THREAD_EXIT_FAILURE ); 655 } 656 657 #if VERBOSE_EXEC 658 
printf( "\n[convol] exec[%d] on core[%x,%d] allocated shared buffers\n" 659 "### GA = %x\n" 660 "### GB = %x\n" 661 "### GC = %x\n" 662 "### GD = %x\n" 663 "### GZ = %x\n", 664 tid, cxy , lpid, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] ); 446 665 #endif 447 666 … … 451 670 pthread_barrier_wait( &barrier ); 452 671 453 // Each thread[c xy,p] initialise in its private stack a copy of the454 // a rrays of pointers on the shared,distributed buffers.672 // Each thread[cid,lid] allocate and initialise in its private stack 673 // a copy of the arrays of pointers on the distributed buffers. 455 674 unsigned short * A[CLUSTERS_MAX]; 456 675 int * B[CLUSTERS_MAX]; … … 468 687 } 469 688 470 // Each thread[x,y,0] access the file containing the input image, to load 471 // the local A[c] buffer (frame_size / nclusters loaded in each cluster). 472 // Other threads are waiting on the barrier. 689 // Each thread[cid,0] access the file containing the input image, to load 690 // the local A[cid] buffer. Other threads are waiting on the barrier. 473 691 if ( lid==0 ) 474 692 { 475 unsigned int offset = (frame_size/nclusters)*cluster_id; 476 unsigned int size = frame_size/nclusters; 477 478 // seek the pointer in file 479 if ( fseek( f_image_in, 480 offset, 481 SEEK_SET ) ) 482 { 483 printf("\n[convol error] in %s : thread[%x,%d] cannot seek input file\n", 484 __FUNCTION__ , cxy , lid ); 485 pthread_exit( &THREAD_EXIT_FAILURE ); 486 } 487 488 if ( fread( A[cluster_id], 489 1, 490 size, 491 f_image_in ) != size ) 492 { 493 printf("\n[convol error] in %s : thread[%x,%d] cannot read input file\n", 494 __FUNCTION__ , cxy , lid ); 495 pthread_exit( &THREAD_EXIT_FAILURE ); 496 } 693 unsigned int size = local_pixels * sizeof( unsigned short ); 694 unsigned int offset = size * cid; 695 696 memcpy( A[cid], 697 image_in + offset, 698 size ); 497 699 498 #if VERBOSE 700 #if VERBOSE_EXEC 499 701 get_cycle( &date ); 500 printf( "\n[convol] thread [%x,%d] load input file at cycle %d\n",501 cxy , lid , (unsigned int)date);702 printf( "\n[convol] thread %d on core[%x,%d] load input file in A[%d]\n", 703 tid , cxy , lpid , cid ); 502 704 #endif 503 705 … … 505 707 506 708 // Optionnal parallel display of the initial image stored in A[c] buffers. 507 // Eah thread[ x,y,p] displays (NL/nthreads) lines. (one byte per pixel).709 // Eah thread[cid,lid] displays (NL/nthreads) lines. 
508 710 509 711 if ( INITIAL_DISPLAY_ENABLE ) … … 516 718 line = offset + l; 517 719 720 // copy TA[cid] to TZ[cid] 518 721 for ( p = 0 ; p < NP ; p++ ) 519 722 { 520 TZ(c luster_id, line, p) = (unsigned char)(TA(cluster_id, line, p) >> 8);723 TZ(cid, line, p) = (unsigned char)(TA(cid, line, p) >> 8); 521 724 } 522 725 523 if (fbf_write( &TZ(cluster_id, line, 0), // first pixel in TZ 524 NP, // number of bytes 525 NP*(l + (thread_id * lines_per_thread)))) // offset in FBF 726 // display one line to frame buffer 727 if (fbf_write( &TZ(cid, line, 0), // first pixel in TZ 728 NP, // number of bytes 729 NP*(l + (tid * lines_per_thread)))) // offset in FBF 526 730 { 527 731 printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n", … … 531 735 } 532 736 533 #if VERBOSE 737 #if VERBOSE_EXEC 534 738 get_cycle( &date ); 535 printf( "\n[convol] thread[% x,%d] completes initial display at cycle %d\n",536 cxy , lid , (unsigned int)date);739 printf( "\n[convol] thread[%d] on core[%x,%d] completes initial display\n", 740 tid , cxy , lpid ); 537 741 #endif 538 742 … … 543 747 //////////////////////////////////////////////////////////// 544 748 // parallel horizontal filter : 545 // B <= transpose(FH(A))749 // B <= convol(FH(A)) 546 750 // D <= A - FH(A) 547 // Each thread computes (NL/nthreads) lines 751 // Each thread computes (NL/nthreads) lines. 548 752 // The image must be extended : 549 // if (z<0) TA(c luster_id,l,z) == TA(cluster_id,l,0)550 // if (z>NP-1) TA(c luster_id,l,z) == TA(cluster_id,l,NP-1)753 // if (z<0) TA(cid,l,z) == TA(cid,l,0) 754 // if (z>NP-1) TA(cid,l,z) == TA(cid,l,NP-1) 551 755 //////////////////////////////////////////////////////////// 552 756 553 757 get_cycle( &date ); 554 H_BEG[c luster_id][lid] = (unsigned int)date;555 556 #if VERBOSE 557 printf( "\n[convol] thread[% x,%d] starts horizontal filter at cycle %d\n",558 cxy , lid , (unsigned int)date);758 H_BEG[cid][lid] = (unsigned int)date; 759 760 #if VERBOSE_EXEC 761 printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n", 762 tid , cxy , lpid ); 559 763 #else 560 if ( (cxy == cxy_main) && (lid == lid_main))561 printf( "\n[convol] thread[% x,%d] starts horizontal filter at cycle %d\n",562 cxy , lid , (unsigned int)date);764 if ( tid == tid_main ) 765 printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n", 766 tid , cxy , lpid ); 563 767 #endif 564 768 … … 566 770 // first & last define which lines are handled by a given thread 567 771 568 first = t hread_id * lines_per_thread;772 first = tid * lines_per_thread; 569 773 last = first + lines_per_thread; 570 774 … … 626 830 627 831 get_cycle( &date ); 628 H_END[c luster_id][lid] = (unsigned int)date;629 630 #if VERBOSE 631 printf( "\n[convol] thread[% x,%d] completes horizontal filter at cycle %d\n",632 cxy , lid, (unsigned int)date);832 H_END[cid][lid] = (unsigned int)date; 833 834 #if VERBOSE_EXEC 835 printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n", 836 tid , cxy , lpid ); 633 837 #else 634 if ( (cxy == cxy_main) && (lid == lid_main))635 printf( "\n[convol] thread[% x,%d] completes horizontal filter at cycle %d\n",636 cxy , lid, (unsigned int)date);838 if ( tid == tid_main ) 839 printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n", 840 tid , cxy , lpid ); 637 841 #endif 638 842 … … 645 849 // Each thread computes (NP/nthreads) columns 646 850 // The image must be extended : 647 // if (l<0) TB(c luster_id,p,l) == TB(cluster_id,p,0)648 // if (l>NL-1) TB(c luster_id,p,l) == 
TB(cluster_id,p,NL-1)851 // if (l<0) TB(cid,p,l) == TB(cid,p,0) 852 // if (l>NL-1) TB(cid,p,l) == TB(cid,p,NL-1) 649 853 /////////////////////////////////////////////////////////////// 650 854 651 855 get_cycle( &date ); 652 V_BEG[c luster_id][lid] = (unsigned int)date;653 654 #if VERBOSE 655 printf( "\n[convol] thread[% x,%d] starts vertical filter at cycle %d\n",656 cxy , lid , (unsigned int)date);856 V_BEG[cid][lid] = (unsigned int)date; 857 858 #if VERBOSE_EXEC 859 printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n", 860 tid , cxy , lpid ); 657 861 #else 658 if ( (cxy == cxy_main) && (lid == lid_main))659 printf( "\n[convol] thread[% x,%d] starts vertical filter at cycle %d\n",660 cxy , lid, (unsigned int)date);862 if ( tid == tid_main ) 863 printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n", 864 tid , cxy , lpid ); 661 865 #endif 662 866 … … 664 868 // first & last define which pixels are handled by a given thread 665 869 666 first = t hread_id * pixels_per_thread;870 first = tid * pixels_per_thread; 667 871 last = first + pixels_per_thread; 668 872 … … 740 944 741 945 get_cycle( &date ); 742 V_END[c luster_id][lid] = (unsigned int)date;743 744 #if VERBOSE 745 printf( "\n[convol] thread[% x,%d] completes vertical filter at cycle %d\n",746 cxy , lid , (unsigned int)date);946 V_END[cid][lid] = (unsigned int)date; 947 948 #if VERBOSE_EXEC 949 printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n", 950 tid , cxy , lid ); 747 951 #else 748 if ( (cxy == cxy_main) && (lid == lid_main))749 printf( "\n[convol] thread[% x,%d] completes vertical filter at cycle %d\n",750 cxy , lid, (unsigned int)date);952 if ( tid == tid_main ) 953 printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n", 954 tid , cxy , lid ); 751 955 #endif 752 956 … … 755 959 756 960 // Optional parallel display of the final image Z <= D + C 757 // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).961 // Eah thread[x,y,p] displays (NL/nthreads) lines. 
758 962 759 963 if ( FINAL_DISPLAY_ENABLE ) 760 964 { 761 965 get_cycle( &date ); 762 D_BEG[c luster_id][lid] = (unsigned int)date;763 764 #if VERBOSE 765 printf( "\n[convol] thread[% x,%d] starts final display at cycle %d\n",766 cxy , lid , (unsigned int)date);966 D_BEG[cid][lid] = (unsigned int)date; 967 968 #if VERBOSE_EXEC 969 printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n", 970 tid , cxy , lid ); 767 971 #else 768 if ( (cxy == cxy_main) && (lid == lid_main))769 printf( "\n[convol] thread[% x,%d] starts final display at cycle %d\n",770 cxy , lid, (unsigned int)date);972 if ( tid == tid_main ) 973 printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n", 974 tid , cxy , lid ); 771 975 #endif 772 976 … … 780 984 for ( p = 0 ; p < NP ; p++ ) 781 985 { 782 TZ(c luster_id, line, p) =783 (unsigned char)( (TD(c luster_id, line, p) +784 TC(c luster_id, line, p) ) >> 8 );986 TZ(cid, line, p) = 987 (unsigned char)( (TD(cid, line, p) + 988 TC(cid, line, p) ) >> 8 ); 785 989 } 786 990 787 if (fbf_write( &TZ(c luster_id, line, 0),// first pixel in TZ788 NP, 789 NP*(l + (t hread_id * lines_per_thread)))) // offset in FBF991 if (fbf_write( &TZ(cid, line, 0), // first pixel in TZ 992 NP, // number of bytes 993 NP*(l + (tid * lines_per_thread)))) // offset in FBF 790 994 { 791 printf("\n[convol error] in %s : thread[%d,%d,%d] cannot access FBF\n", 792 __FUNCTION__ , x , y , lid ); 995 printf("\n[convol error] thread[%d] cannot access FBF\n", tid ); 793 996 pthread_exit( &THREAD_EXIT_FAILURE ); 794 997 } … … 796 999 797 1000 get_cycle( &date ); 798 D_END[c luster_id][lid] = (unsigned int)date;799 800 #if VERBOSE 801 printf( "\n[convol] thread[% x,%d] completes final display at cycle %d\n",802 cxy , lid , (unsigned int)date);1001 D_END[cid][lid] = (unsigned int)date; 1002 1003 #if VERBOSE_EXEC 1004 printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n", 1005 tid , cxy , lid ); 803 1006 #else 804 if ( (cxy == cxy_main) && (lid == lid_main) ) 805 printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n", 806 cxy , lid , (unsigned int)date ); 807 #endif 808 809 //////////////////////////////// 810 pthread_barrier_wait( &barrier ); 1007 if ( tid == tid_main ) 1008 printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n", 1009 tid , cxy , lid ); 1010 #endif 1011 811 1012 } 812 1013 813 1014 // all threads (but the one executing main) exit 814 if ( (cxy != cxy_main) || (lid != lid_main))1015 if ( tid != tid_main ) 815 1016 { 816 1017 pthread_exit( &THREAD_EXIT_SUCCESS ); … … 821 1022 822 1023 823 ////////////////////////// ///////////////824 void instrument( unsigned int nclusters,825 unsigned int ncores)1024 ////////////////////////// 1025 void instrument( FILE * f, 1026 char * filename ) 826 1027 { 827 unsigned int cc, pp; 828 829 unsigned int min_start = 0xFFFFFFFF; 830 unsigned int max_start = 0; 831 832 unsigned int min_h_beg = 0xFFFFFFFF; 833 unsigned int max_h_beg = 0; 834 835 unsigned int min_h_end = 0xFFFFFFFF; 836 unsigned int max_h_end = 0; 837 838 unsigned int min_v_beg = 0xFFFFFFFF; 839 unsigned int max_v_beg = 0; 840 841 unsigned int min_v_end = 0xFFFFFFFF; 842 unsigned int max_v_end = 0; 843 844 unsigned int min_d_beg = 0xFFFFFFFF; 845 unsigned int max_d_beg = 0; 846 847 unsigned int min_d_end = 0xFFFFFFFF; 848 unsigned int max_d_end = 0; 849 850 for (cc = 0; cc < nclusters; cc++) 851 { 852 for (pp = 0; pp < ncores; pp++ ) 853 { 854 if (START[cc][pp] < min_start) min_start = START[cc][pp]; 855 if (START[cc][pp] > 
max_start) max_start = START[cc][pp]; 856 857 if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp]; 858 if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp]; 859 860 if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp]; 861 if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp]; 862 863 if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp]; 864 if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp]; 865 866 if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp]; 867 if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp]; 868 869 if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp]; 870 if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp]; 871 872 if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp]; 873 if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp]; 874 } 875 } 876 877 printf(" - START : min = %d / max = %d / med = %d / delta = %d\n", 878 min_start, max_start, (min_start+max_start)/2, max_start-min_start); 879 880 printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", 881 min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); 882 883 printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n", 884 min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); 885 886 printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", 887 min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); 888 889 printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n", 890 min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); 891 892 printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", 893 min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); 894 895 printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n", 896 min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); 897 898 printf( "\n General Scenario (Kcycles for each step)\n" ); 899 printf( " - BOOT OS = %d\n", (min_start )/1000 ); 900 printf( " - LOAD IMAGE = %d\n", (min_h_beg - min_start)/1000 ); 901 printf( " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); 902 printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); 903 printf( " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); 904 printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); 905 printf( " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); 906 907 // TODO save these results on f_instrum 1028 unsigned int nclusters = x_size * y_size; 1029 1030 unsigned int cc, pp; 1031 1032 unsigned int min_start = 0xFFFFFFFF; 1033 unsigned int max_start = 0; 1034 1035 unsigned int min_h_beg = 0xFFFFFFFF; 1036 unsigned int max_h_beg = 0; 1037 1038 unsigned int min_h_end = 0xFFFFFFFF; 1039 unsigned int max_h_end = 0; 1040 1041 unsigned int min_v_beg = 0xFFFFFFFF; 1042 unsigned int max_v_beg = 0; 1043 1044 unsigned int min_v_end = 0xFFFFFFFF; 1045 unsigned int max_v_end = 0; 1046 1047 unsigned int min_d_beg = 0xFFFFFFFF; 1048 unsigned int max_d_beg = 0; 1049 1050 unsigned int min_d_end = 0xFFFFFFFF; 1051 unsigned int max_d_end = 0; 1052 1053 for (cc = 0; cc < nclusters; cc++) 1054 { 1055 for (pp = 0; pp < ncores; pp++ ) 1056 { 1057 if (START[cc][pp] < min_start) min_start = START[cc][pp]; 1058 if (START[cc][pp] > max_start) max_start = START[cc][pp]; 1059 1060 if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp]; 1061 if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp]; 1062 1063 if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp]; 1064 if (H_END[cc][pp] > max_h_end) max_h_end = 
H_END[cc][pp]; 1065 1066 if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp]; 1067 if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp]; 1068 1069 if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp]; 1070 if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp]; 1071 1072 if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp]; 1073 if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp]; 1074 1075 if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp]; 1076 if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp]; 1077 } 1078 } 1079 1080 // display on terminal 1081 printf( "\n ------ %s ------\n" , filename ); 1082 1083 printf(" - START : min = %d / max = %d / med = %d / delta = %d\n", 1084 min_start, max_start, (min_start+max_start)/2, max_start-min_start); 1085 1086 printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1087 min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); 1088 1089 printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n", 1090 min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); 1091 1092 printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1093 min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); 1094 1095 printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n", 1096 min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); 1097 1098 printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1099 min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); 1100 1101 printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n", 1102 min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); 1103 1104 printf( "\n General Scenario (Kcycles for each step)\n" ); 1105 printf( " - LOAD IMAGE = %d\n", (min_h_beg - min_start)/1000 ); 1106 printf( " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); 1107 printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); 1108 printf( " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); 1109 printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); 1110 printf( " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); 1111 printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME ); 1112 1113 // save on disk 1114 fprintf( f , "\n ------ %s ------\n" , filename ); 1115 1116 fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n", 1117 min_start, max_start, (min_start+max_start)/2, max_start-min_start); 1118 1119 fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1120 min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); 1121 1122 fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n", 1123 min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); 1124 1125 fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1126 min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); 1127 1128 fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n", 1129 min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); 1130 1131 fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", 1132 min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); 1133 1134 fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n", 1135 min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); 1136 1137 fprintf( f , "\n General Scenario (Kcycles)\n" ); 1138 fprintf( f , " - LOAD IMAGE = %d\n", 
(min_h_beg - min_start)/1000 ); 1139 fprintf( f , " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); 1140 fprintf( f , " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); 1141 fprintf( f , " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); 1142 fprintf( f , " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); 1143 fprintf( f , " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); 1144 fprintf( f , " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME ); 908 1145 909 1146 } // end instrument() -
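The new convol.c header comment above defines every working thread by a continuous index tid = cid * ncores + lid, and splits the work as NL/nthreads lines for the horizontal filter and NP/nthreads columns for the vertical filter. The stand-alone sketch below is not part of the changeset: it only illustrates that decomposition and the resulting per-thread ranges, with example nclusters and ncores values chosen for the demonstration.

/* stand-alone sketch : tid decomposition and work share (illustration only) */
#include <stdio.h>

#define NL 1024                              /* number of lines   (as in convol.c) */
#define NP 1024                              /* number of columns (as in convol.c) */

int main( void )
{
    unsigned int nclusters = 4;              /* example : 4 abstract clusters  */
    unsigned int ncores    = 2;              /* example : 2 cores per cluster  */
    unsigned int nthreads  = nclusters * ncores;
    unsigned int tid;

    for( tid = 0 ; tid < nthreads ; tid++ )
    {
        unsigned int cid        = tid / ncores;           /* abstract cluster index */
        unsigned int lid        = tid % ncores;           /* abstract local index   */
        unsigned int first_line = tid * (NL / nthreads);  /* horizontal filter      */
        unsigned int first_col  = tid * (NP / nthreads);  /* vertical filter        */

        printf("tid %u = cid %u * ncores + lid %u : lines [%u..%u[ : columns [%u..%u[\n",
               tid, cid, lid,
               first_line, first_line + NL / nthreads,
               first_col , first_col  + NP / nthreads );
    }
    return 0;
}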
trunk/user/display/display.ld
r644 r652 1 /*************************************************************************** *1 /*************************************************************************** 2 2 * Define the base address for user code (both .text and .data) 3 *************************************************************************** **/3 ***************************************************************************/ 4 4 5 5 seg_code_base = 0x400000; -
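The script only pins the base address of the user segment. As an aside, not part of the changeset, a link-script symbol such as seg_code_base can be inspected from C by declaring it as an external object and taking its address; this assumes the program is actually linked with display.ld, otherwise the symbol is undefined.

/* illustration only : reading a link-script symbol from C */
#include <stdio.h>

extern char seg_code_base;   /* defined by display.ld, no storage allocated in C */

int main( void )
{
    printf("user code segment base = %p (expected 0x400000)\n",
           (void *)&seg_code_base );
    return 0;
}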
trunk/user/fft/fft.c
r649 r652 15 15 /*************************************************************************/ 16 16 17 /////////////////////////////////////////////////////////////////////////// 17 //////////////////////////////////////////////////////////////////////////////////////// 18 18 // This port of the SPLASH FFT benchmark on the ALMOS-MKH OS has been 19 19 // done by Alain Greiner (august 2018). … … 45 45 // that contains all coefs required for a rootN points FFT. 46 46 // 47 // There is one working thread per core.48 47 // The actual number of cores and cluster in a given hardware architecture 49 48 // is obtained by the get_config() syscall (x_size, y_size, ncores). … … 51 50 // The max number of cores per cluster is bounded by CORES_MAX. 52 51 // 53 // Several configuration parameters can be defined below: 54 // - PRINT_ARRAY : Print out complex data points arrays. 55 // - CHECK : Perform both FFT and inverse FFT to check output/input. 56 // - DEBUG_MAIN : Display intermediate results in main() 57 // - DEBUG_FFT1D : Display intermediate results in FFT1D() 58 // - DEBUG_ROW : Display intermedite results in FFTrow() 52 // The number N of working threads is always defined by the number of cores availables 53 // in the architecture, but this application supports three placement modes. 54 // In all modes, the working threads are identified by the [tid] continuous index 55 // in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads. 56 // This continuous index can always be decomposed in two continuous sub-indexes: 57 // tid == cid * ncores + lid, where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1]. 58 // 59 // - NO_PLACEMENT: the main thread is itsef a working thread. The (N_1) other working 60 // threads are created by the main thread, but the placement is done by the OS, using 61 // the DQDT for load balancing, and two working threads can be placed on the same core. 62 // The [cid,lid] are only abstract identifiers, and cannot be associated to a physical 63 // cluster or a physical core. In this mode, the main thread run on any cluster, 64 // but has tid = 0 (i.e. cid = 0 & tid = 0). 65 // 66 // - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement of 67 // of the threads on the cores is explicitely controled by the main thread to have 68 // exactly one working thread per core, and the [cxy][lpid] core coordinates for a given 69 // thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the 70 // physical cluster identifier, and [lid] is the local core index. 71 // 72 // - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the 73 // non standard pthread_parallel_create() function to avoid the costly sequencial 74 // loops for pthread_create() and pthread_join(). It garanty one working thread 75 // per core, and the same relation between the thread[tid] and the core[cxy][lpid]. 76 // 77 // Several others configuration parameters can be defined below: 78 // - USE_DQT_BARRIER : use a hierarchical barrier for working threads synchro 79 // - PRINT_ARRAY : Print out complex data points arrays. 80 // - CHECK : Perform both FFT and inverse FFT to check output/input. 81 // - DEBUG_MAIN : Display intermediate results in main() 82 // - DEBUG_FFT1D : Display intermediate results in FFT1D() 83 // - DEBUG_ROW : Display intermedite results in FFTrow() 59 84 // 60 85 // Regarding final instrumentation: … … 66 91 // is computed by each thread(i) in the work() function. 
67 92 // The results are displayed on the TXT terminal, and registered on disk. 68 /////////////////////////////////////////////////////////////////////////// 93 /////////////////////////////////////////////////////////////////////////////////////// 69 94 70 95 #include <math.h> … … 92 117 // parameters 93 118 119 #define NO_PLACEMENT 1 120 #define EXPLICIT_PLACEMENT 0 121 #define PARALLEL_PLACEMENT 0 122 94 123 #define DEFAULT_M 18 // 256 K complex points 95 124 #define USE_DQT_BARRIER 1 // use DDT barrier if non zero … … 110 139 ///////////////////////////////////////////////////////////////////////////////////// 111 140 112 // work function arguments 113 typedef struct work_args_s 114 { 115 unsigned int tid; // thread continuous index 116 unsigned int lid; // core local index 117 unsigned int cid; // cluster continuous index 118 pthread_barrier_t * parent_barrier; // parent barrier to signal completion 119 } 120 work_args_t; 141 unsigned int x_size; // platform global parameter 142 unsigned int y_size; // platform global parameter 143 unsigned int ncores; // platform global parameter 121 144 122 145 unsigned int nthreads; // total number of threads (one thread per core) … … 130 153 // arrays of pointers on distributed buffers (one sub-buffer per cluster) 131 154 double * data[CLUSTERS_MAX]; // original time-domain data 132 double * trans[CLUSTERS_MAX]; // used as auxiliary space for transpose155 double * trans[CLUSTERS_MAX]; // used as auxiliary space for fft 133 156 double * twid[CLUSTERS_MAX]; // twiddle factor : exp(-2iPI*k*n/N) 134 157 double * bloup[CLUSTERS_MAX]; // used as auxiliary space for DFT … … 146 169 pthread_barrierattr_t barrier_attr; 147 170 148 ///////////////////////////////////////////////////////////////////////////////////// 149 // Global variables required by parallel_pthread_create() 150 ///////////////////////////////////////////////////////////////////////////////////// 151 152 // 2D arrays of input arguments for the <work> threads 153 // These arrays are initialised by the application main thread 154 155 work_args_t work_args[CLUSTERS_MAX][CORES_MAX]; // work function arguments 156 work_args_t * work_ptrs[CLUSTERS_MAX][CORES_MAX]; // pointers on arguments 157 158 // 1D array of barriers to allow the <work> threads to signal termination 159 // this array is initialised in each cluster by the <build[cxy][0]> thread 160 161 pthread_barrier_t parent_barriers[CLUSTERS_MAX]; // termination barrier 171 //return values at thread exit 172 unsigned int THREAD_EXIT_SUCCESS = 0; 173 unsigned int THREAD_EXIT_FAILURE = 1; 174 175 // main thread continuous index 176 unsigned int tid_main; 177 178 // array of kernel thread identifiers / indexed by [tid] 179 pthread_t work_trdid[CLUSTERS_MAX * CORES_MAX]; 180 181 // array of thread attributes / indexed by [tid] 182 pthread_attr_t work_attr[CLUSTERS_MAX * CORES_MAX]; 183 184 // array of work function arguments / indexed by [tid] 185 pthread_parallel_work_args_t work_args[CLUSTERS_MAX * CORES_MAX]; 162 186 163 187 ///////////////////////////////////////////////////////////////////////////////////// … … 165 189 ///////////////////////////////////////////////////////////////////////////////////// 166 190 167 void work( work_args_t * args );191 void work( pthread_parallel_work_args_t * args ); 168 192 169 193 double CheckSum( void ); … … 234 258 int error; 235 259 236 unsigned int x_size; // number of clusters per row237 unsigned int y_size; // number of clusters per column238 unsigned int ncores; // max number of cores per 
cluster239 240 241 unsigned int x; // current index for cluster X coordinate242 unsigned int y; // current index for cluster Y coordinate243 unsigned int lid; // current index for core in a cluster244 260 unsigned int tid; // continuous thread index 245 unsigned int cid; // cluster continuous index246 unsigned int cxy; // hardware specific cluster identifier247 261 248 262 char name[64]; // instrumentation file name … … 265 279 int pid = getpid(); 266 280 281 // check placement mode 282 if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 ) 283 { 284 printf("\n[fft error] illegal placement mode\n"); 285 exit( 0 ); 286 } 287 267 288 // get FFT application start cycle 268 289 get_cycle( &start_init_cycle ); … … 295 316 exit( 0 ); 296 317 } 318 319 // get identifiers for core executing main 320 unsigned int cxy_main; 321 unsigned int lid_main; 322 get_core_id( &cxy_main , &lid_main ); 297 323 298 324 // compute nthreads and nclusters … … 317 343 } 318 344 319 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n", 320 N, nthreads, pid, (unsigned int)start_init_cycle ); 321 322 // build instrumentation file name 323 if( USE_DQT_BARRIER ) 324 snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores ); 325 else 326 snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores ); 327 328 // build pathname 345 // define instrumentation file name 346 if( NO_PLACEMENT ) 347 { 348 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / NO_PLACE\n", 349 N, nthreads, pid ); 350 351 // build instrumentation file name 352 if( USE_DQT_BARRIER ) 353 snprintf( name , 64 , "fft_dqt_no_place_%d_%d_%d", M , x_size * y_size , ncores ); 354 else 355 snprintf( name , 64 , "fft_smp_no_place_%d_%d_%d", M , x_size * y_size , ncores ); 356 } 357 358 if( EXPLICIT_PLACEMENT ) 359 { 360 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / EXPLICIT\n", 361 N, nthreads, pid ); 362 363 // build instrumentation file name 364 if( USE_DQT_BARRIER ) 365 snprintf( name , 64 , "fft_dqt_explicit_%d_%d_%d", M , x_size * y_size , ncores ); 366 else 367 snprintf( name , 64 , "fft_smp_explicit_%d_%d_%d", M , x_size * y_size , ncores ); 368 } 369 370 if( PARALLEL_PLACEMENT ) 371 { 372 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / PARALLEL\n", 373 N, nthreads, pid ); 374 375 // build instrumentation file name 376 if( USE_DQT_BARRIER ) 377 snprintf( name , 64 , "fft_dqt_parallel_%d_%d_%d", M , x_size * y_size , ncores ); 378 else 379 snprintf( name , 64 , "fft_smp_parallel_%d_%d_%d", M , x_size * y_size , ncores ); 380 } 381 382 // build instrumentation file pathname 329 383 snprintf( path , 128 , "/home/%s", name ); 330 384 … … 339 393 #if DEBUG_MAIN 340 394 get_cycle( &debug_cycle ); 341 printf("\n[fft] main open file <%s> at cycle %d\n",395 printf("\n[fft] main open instrumentation file <%s> at cycle %d\n", 342 396 path, (unsigned int)debug_cycle ); 343 397 #endif … … 381 435 #if DEBUG_MAIN 382 436 get_cycle( &debug_cycle ); 383 printf("\n[fft] main completes barrier initat cycle %d\n",437 printf("\n[fft] main completes sequencial initialisation at cycle %d\n", 384 438 (unsigned int)debug_cycle ); 385 439 #endif 386 387 // build array of arguments for the <work> threads388 for (x = 0 ; x < x_size ; x++)389 {390 for (y = 0 ; y < y_size ; y++)391 {392 // compute cluster identifier393 cxy = HAL_CXY_FROM_XY( x , y );394 395 for ( lid = 0 ; lid < ncores ; lid++ )396 {397 // compute cluster continuous index398 cid = (x * y_size) + y;399 400 // compute 
work thread continuous index401 tid = (cid * ncores) + lid;402 403 // initialize 2D array of arguments404 work_args[cxy][lid].tid = tid;405 work_args[cxy][lid].lid = lid;406 work_args[cxy][lid].cid = cid;407 work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];408 409 // initialize 2D array of pointers410 work_ptrs[cxy][lid] = &work_args[cxy][lid];411 }412 }413 }414 440 415 441 // register sequencial time … … 417 443 init_time = (unsigned int)(end_init_cycle - start_init_cycle); 418 444 445 ////////////////// 446 if( NO_PLACEMENT ) 447 { 448 // the tid value for the main thread is always 0 449 // main thread creates new threads with tid in [1,nthreads-1] 450 unsigned int tid; 451 for ( tid = 0 ; tid < nthreads ; tid++ ) 452 { 453 // register tid value in work_args[tid] array 454 work_args[tid].tid = tid; 455 456 // create other threads 457 if( tid > 0 ) 458 { 459 if ( pthread_create( &work_trdid[tid], 460 NULL, // no attribute 461 &work, 462 &work_args[tid] ) ) 463 { 464 printf("\n[fft error] cannot create thread %d\n", tid ); 465 exit( 0 ); 466 } 467 419 468 #if DEBUG_MAIN 420 printf("\n[fft] main completes <work> threads arguments at cycle %d\n", 421 (unsigned int)end_init_cycle ); 422 #endif 423 424 // create and execute the working threads 425 if( pthread_parallel_create( root_level, 426 &work, 427 &work_ptrs[0][0], 428 &parent_barriers[0] ) ) 429 { 430 printf("\n[fft error] creating threads\n"); 431 exit( 0 ); 469 printf("\n[fft] main created thread %d\n", tid ); 470 #endif 471 472 } 473 else 474 { 475 tid_main = 0; 476 } 477 } // end for tid 478 479 // main thread calls itself the execute() function 480 work( &work_args[0] ); 481 482 // main thread wait other threads completion 483 for ( tid = 1 ; tid < nthreads ; tid++ ) 484 { 485 unsigned int * status; 486 487 // main wait thread[tid] status 488 if ( pthread_join( work_trdid[tid], (void*)(&status)) ) 489 { 490 printf("\n[fft error] main cannot join thread %d\n", tid ); 491 exit( 0 ); 492 } 493 494 // check status 495 if( *status != THREAD_EXIT_SUCCESS ) 496 { 497 printf("\n[fft error] thread %x returned failure\n", tid ); 498 exit( 0 ); 499 } 500 501 #if DEBUG_MAIN 502 printf("\n[fft] main successfully joined thread %x\n", tid ); 503 #endif 504 505 } // end for tid 506 507 } // end if no_placement 508 509 //////////////////////// 510 if( EXPLICIT_PLACEMENT ) 511 { 512 // main thread places each thread[tid] on a specific core[cxy][lid] 513 // but the actual thread creation is sequencial 514 unsigned int x; 515 unsigned int y; 516 unsigned int l; 517 unsigned int cxy; // cluster identifier 518 unsigned int tid; // thread continuous index 519 520 for( x = 0 ; x < x_size ; x++ ) 521 { 522 for( y = 0 ; y < y_size ; y++ ) 523 { 524 cxy = HAL_CXY_FROM_XY( x , y ); 525 for( l = 0 ; l < ncores ; l++ ) 526 { 527 // compute thread continuous index 528 tid = (((x * y_size) + y) * ncores) + l; 529 530 // register tid value in work_args[tid] array 531 work_args[tid].tid = tid; 532 533 // no thread created on the core running the main 534 if( (cxy != cxy_main) || (l != lid_main) ) 535 { 536 // define thread attributes 537 work_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | 538 PT_ATTR_CORE_DEFINED; 539 work_attr[tid].cxy = cxy; 540 work_attr[tid].lid = l; 541 542 // create thread[tid] on core[cxy][l] 543 if ( pthread_create( &work_trdid[tid], 544 &work_attr[tid], 545 &work, 546 &work_args[tid] ) ) 547 { 548 printf("\n[fft error] cannot create thread %d\n", tid ); 549 exit( 0 ); 550 } 551 #if DEBUG_MAIN 552 printf("\n[fft] main created 
thread[%d] on core[%x,%d]\n", tid, cxy, l ); 553 #endif 554 } 555 else 556 { 557 tid_main = tid; 558 } 559 } 560 } 561 } 562 563 // main thread calls itself the execute() function 564 work( &work_args[tid_main] ); 565 566 // main thread wait other threads completion 567 for( tid = 0 ; tid < nthreads ; tid++ ) 568 { 569 // no other thread on the core running the main 570 if( tid != tid_main ) 571 { 572 unsigned int * status; 573 574 // wait thread[tid] 575 if( pthread_join( work_trdid[tid] , (void*)(&status) ) ) 576 { 577 printf("\n[fft error] main cannot join thread %d\n", tid ); 578 exit( 0 ); 579 } 580 581 // check status 582 if( *status != THREAD_EXIT_SUCCESS ) 583 { 584 printf("\n[fft error] thread %d returned failure\n", tid ); 585 exit( 0 ); 586 } 587 #if DEBUG_MAIN 588 printf("\n[fft] main joined thread %d on core[%x,%d]\n", tid , cxy , l ); 589 #endif 590 } 591 } 592 } // end if explicit_placement 593 594 //////////////////////// 595 if( PARALLEL_PLACEMENT ) 596 { 597 // create and execute the working threads 598 if( pthread_parallel_create( root_level , &work ) ) 599 { 600 printf("\n[fft error] cannot create threads\n"); 601 exit( 0 ); 602 } 432 603 } 433 604 … … 533 704 // This function is executed in parallel by all <work> threads. 534 705 ///////////////////////////////////////////////////////////////// 535 void work( work_args_t * args )706 void work( pthread_parallel_work_args_t * args ) 536 707 { 537 708 unsigned int tid; // this thread continuous index … … 549 720 unsigned long long barrier_stop; 550 721 722 get_cycle( ¶llel_start ); 723 551 724 // get thread arguments 552 725 tid = args->tid; 553 lid = args->lid;554 cid = args->cid; 555 parent_barrier = args->parent_barrier;556 557 get_cycle( ¶llel_start );558 726 parent_barrier = args->barrier; 727 728 // compute lid and cid from tid 729 lid = tid % ncores; 730 cid = tid / ncores; 731 559 732 #if DEBUG_WORK 560 733 printf("\n[fft] %s : thread %d enter / cycle %d\n", … … 602 775 printf("\n[fft] %s : thread %d exit barrier for buffer allocation / cycle %d\n", 603 776 __FUNCTION__, tid, (unsigned int)barrier_stop ); 604 #endif605 606 #if DISPLAY_SCHED_AND_VMM607 unsigned int x_size;608 unsigned int y_size;609 unsigned int ncores;610 get_config( &x_size , &y_size , &ncores );611 unsigned int x = cid / y_size;612 unsigned int y = cid % y_size;613 unsigned int cxy = HAL_CXY_FROM_XY( x , y );614 display_sched( cxy , lid );615 if( lid == 0 ) display_vmm( cxy , getpid() , 0 );616 777 #endif 617 778 … … 919 1080 // contained in the distributed buffers x[nclusters][points_per_cluster]. 920 1081 // It handles the (N) points 1D array as a (rootN*rootN) points 2D array. 921 // 1) it transpose(rootN/nthreads ) rows from x to tmp.1082 // 1) it fft (rootN/nthreads ) rows from x to tmp. 922 1083 // 2) it make (rootN/nthreads) FFT on the tmp rows and apply the twiddle factor. 923 // 3) it transpose(rootN/nthreads) columns from tmp to x.1084 // 3) it fft (rootN/nthreads) columns from tmp to x. 924 1085 // 4) it make (rootN/nthreads) FFT on the x rows. 
925 1086 // It calls the FFTRow() 2*(rootN/nthreads) times to perform the in place FFT … … 946 1107 #endif 947 1108 948 // transpose(rootN/nthreads) rows from x to tmp1109 // fft (rootN/nthreads) rows from x to tmp 949 1110 Transpose( x , tmp , MyFirst , MyLast ); 950 1111 951 1112 #if( DEBUG_FFT1D & 1 ) 952 1113 get_cycle( &cycle ); 953 printf("\n[fft] %s : thread %d after first transpose/ cycle %d\n",1114 printf("\n[fft] %s : thread %d after first fft / cycle %d\n", 954 1115 __FUNCTION__, tid, (unsigned int)cycle ); 955 1116 if( PRINT_ARRAY ) PrintArray( tmp , N ); … … 964 1125 #if( DEBUG_FFT1D & 1 ) 965 1126 get_cycle( &cycle ); 966 printf("\n[fft] %s : thread %d exit barrier after first transpose/ cycle %d\n",1127 printf("\n[fft] %s : thread %d exit barrier after first fft / cycle %d\n", 967 1128 __FUNCTION__, tid, (unsigned int)cycle ); 968 1129 #endif … … 992 1153 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 993 1154 994 // transposetmp to x1155 // fft tmp to x 995 1156 Transpose( tmp , x , MyFirst , MyLast ); 996 1157 997 1158 #if( DEBUG_FFT1D & 1 ) 998 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);1159 printf("\n[fft] %s : thread %d after second fft\n", __FUNCTION__, tid); 999 1160 if( PRINT_ARRAY ) PrintArray( x , N ); 1000 1161 #endif … … 1006 1167 1007 1168 #if( DEBUG_FFT1D & 1 ) 1008 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);1169 printf("\n[fft] %s : thread %d exit barrier after second fft\n", __FUNCTION__, tid); 1009 1170 #endif 1010 1171 … … 1033 1194 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 1034 1195 1035 // transposex to tmp1196 // fft x to tmp 1036 1197 Transpose( x , tmp , MyFirst , MyLast ); 1037 1198 1038 1199 #if( DEBUG_FFT1D & 1 ) 1039 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);1200 printf("\n[fft] %s : thread %x after third fft\n", __FUNCTION__, tid); 1040 1201 if( PRINT_ARRAY ) PrintArray( x , N ); 1041 1202 #endif … … 1047 1208 1048 1209 #if( DEBUG_FFT1D & 1 ) 1049 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);1210 printf("\n[fft] %s : thread %d exit barrier after third fft\n", __FUNCTION__, tid); 1050 1211 #endif 1051 1212 -
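The fft changes above replace the per-cluster argument arrays by a single work_args[] array indexed by the continuous thread index, and the NO_PLACEMENT branch now follows the classic create/join pattern. The sketch below, using only standard POSIX calls and placeholder values for the cluster and core counts, shows that pattern in isolation: the main thread is itself worker tid 0, and each worker derives its abstract [cid,lid] pair from tid, as in the new work() function. It is a minimal illustration, not the application code.

// Minimal sketch (standard POSIX only) of the NO_PLACEMENT create/join pattern:
// the main thread is worker tid 0, creates the (N-1) other workers, and each
// worker derives its abstract [cid,lid] indexes from tid.
// NCLUSTERS and NCORES are placeholders, not a get_config() query.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NCLUSTERS  4                          // placeholder for x_size * y_size
#define NCORES     2                          // placeholder for cores per cluster
#define NTHREADS   (NCLUSTERS * NCORES)

typedef struct { unsigned int tid; } work_args_t;

static unsigned int exit_success = 0;

static void * work( void * arg )
{
    unsigned int tid = ((work_args_t *)arg)->tid;
    unsigned int cid = tid / NCORES;          // abstract cluster continuous index
    unsigned int lid = tid % NCORES;          // abstract local core index

    printf("worker tid %u -> cid %u / lid %u\n", tid, cid, lid);

    // a real worker would do its share of the FFT here, then synchronise
    return (void *)&exit_success;
}

int main( void )
{
    pthread_t    trdid[NTHREADS];
    work_args_t  args[NTHREADS];
    unsigned int tid;

    // create workers 1..N-1 : placement is left to the OS in this mode
    for( tid = 1 ; tid < NTHREADS ; tid++ )
    {
        args[tid].tid = tid;
        if( pthread_create( &trdid[tid], NULL, work, &args[tid] ) )
        { printf("cannot create thread %u\n", tid); exit(1); }
    }

    // the main thread is itself worker tid 0
    args[0].tid = 0;
    work( &args[0] );

    // join the other workers and check their exit status
    for( tid = 1 ; tid < NTHREADS ; tid++ )
    {
        unsigned int * status;
        if( pthread_join( trdid[tid], (void **)&status ) )
        { printf("cannot join thread %u\n", tid); exit(1); }
        if( *status != exit_success )
        { printf("thread %u returned failure\n", tid); exit(1); }
    }
    return 0;
}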
trunk/user/ksh/ksh.c
r647 r652 61 61 #define DEBUG_CMD_CAT 0 62 62 #define DEBUG_CMD_CP 0 63 #define DEBUG_CMD_LOAD 163 #define DEBUG_CMD_LOAD 0 64 64 #define DEBUG_CMD_LS 0 65 65 #define DEBUG_CMD_PS 0 … … 1226 1226 1227 1227 1228 / *1. first direct command1228 // 1. first direct command 1229 1229 if( sem_wait( &semaphore ) ) 1230 1230 { … … 1238 1238 execute( cmd ); 1239 1239 } 1240 */1240 // 1241 1241 1242 1242 -
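The ksh change turns the DEBUG_CMD_LOAD flag off and re-enables the block that runs a first direct command under the protection of a semaphore. As a reminder of that pattern only, here is a minimal standalone sketch using standard POSIX semaphores; the command string and the execute() body are placeholders, and the sem_post() is added only to keep this standalone example balanced, since the real ksh handles its semaphore as part of its own interactive loop, not shown in this excerpt.

// Minimal sketch of a semaphore-gated "direct command" (standard POSIX semaphores).
#include <stdio.h>
#include <semaphore.h>

static sem_t semaphore;

static void execute( const char * cmd )     // placeholder for the ksh execute()
{
    printf("executing direct command <%s>\n", cmd );
}

int main( void )
{
    char cmd[] = "ls /home";                 // placeholder command string

    sem_init( &semaphore, 0, 1 );            // binary semaphore, initially free

    if( sem_wait( &semaphore ) )             // take the token before executing
    {
        printf("error: cannot take semaphore\n");
    }
    else
    {
        execute( cmd );
        sem_post( &semaphore );              // release the token (example only)
    }
    return 0;
}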
trunk/user/sort/sort.c
r637 r652 69 69 #define INSTRUMENTATION 1 // register computation times on file 70 70 71 ///////////////////////////////////////////////////////////////////////////////////72 // Arguments for the sort() function73 ///////////////////////////////////////////////////////////////////////////////////74 75 typedef struct76 {77 unsigned int tid; // continuous thread index78 unsigned int threads; // total number of threads79 pthread_barrier_t * parent_barrier; // pointer on termination barrier80 }81 sort_args_t;82 83 71 //////////////////////////////////////////////////////////////////////////////////// 84 72 // Sort specific global variables … … 88 76 int array1[ARRAY_LENGTH]; 89 77 78 unsigned int threads; // total number of working threads 79 90 80 pthread_barrier_t barrier; // synchronisation variables 91 81 … … 93 83 // Global variables required by parallel_pthread_create() 94 84 ///////////////////////////////////////////////////////////////////////////////////// 95 96 // 2D arrays of input arguments for the <sort> threads97 // These arrays are initialised by the application main thread98 99 sort_args_t sort_args[CLUSTERS_MAX][CORES_MAX]; // sort function arguments100 sort_args_t * sort_ptrs[CLUSTERS_MAX][CORES_MAX]; // pointers on arguments101 102 // 1D array of barriers to allow the <sort> threads to signal termination103 // this array is initialised by the pthread_parallel_create() function104 105 pthread_barrier_t parent_barriers[CLUSTERS_MAX]; // termination barrier106 85 107 86 … … 174 153 } // end merge() 175 154 176 ////////////////////////////// 177 void sort( sort_args_t * ptr )155 /////////////////////////////////////////////// 156 void sort( pthread_parallel_work_args_t * ptr ) 178 157 { 179 158 unsigned int i; … … 183 162 // get arguments 184 163 unsigned int tid = ptr->tid; 185 unsigned int threads = ptr->threads; 186 pthread_barrier_t * parent_barrier = ptr->parent_barrier; 164 pthread_barrier_t * parent_barrier = ptr->barrier; 187 165 188 166 unsigned int items = ARRAY_LENGTH / threads; … … 190 168 191 169 #if DEBUG_SORT 192 printf("\n[sort] start : ptr %x / tid %d / threads %d / barrier %x\n",170 printf("\n[sort] start : ptr %x / tid %d / threads %d / parent_barrier %x\n", 193 171 ptr, tid, threads, parent_barrier ); 194 172 #endif … … 249 227 } // en for stages 250 228 251 // sort thread signal completion to main thread229 // sort thread signal completion to pthtread_parallel_create() 252 230 pthread_barrier_wait( parent_barrier ); 253 231 … … 269 247 unsigned int y_size; // number of columns 270 248 unsigned int ncores; // number of cores per cluster 271 unsigned int total_threads; // total number of threads272 unsigned int x; // X coordinate for a sort thread273 unsigned int y; // Y coordinate for a sort thread274 unsigned int cxy; // cluster identifier for a sort thead275 unsigned int lid; // core local index for a thread276 unsigned int tid; // sort thread continuous index277 249 pthread_barrierattr_t barrier_attr; // barrier attributes (used for DQT) 278 250 unsigned int n; // index in array to sort … … 285 257 get_cycle( &start_cycle ); 286 258 287 // compute number of threads (one thread per core)259 // compute number of working threads (one thread per core) 288 260 get_config( &x_size , &y_size , &ncores ); 289 t otal_threads = x_size * y_size * ncores;261 threads = x_size * y_size * ncores; 290 262 291 263 // compute covering DQT size an level … … 294 266 295 267 // checks number of threads 296 if ( (t otal_threads != 1) && (total_threads != 2) && (total_threads 
!= 4) &&297 (t otal_threads != 8) && (total_threads != 16 ) && (total_threads != 32) &&298 (t otal_threads != 64) && (total_threads != 128) && (total_threads != 256) &&299 (t otal_threads != 512) && (total_threads != 1024) )268 if ( (threads != 1) && (threads != 2) && (threads != 4) && 269 (threads != 8) && (threads != 16 ) && (threads != 32) && 270 (threads != 64) && (threads != 128) && (threads != 256) && 271 (threads != 512) && (threads != 1024) ) 300 272 { 301 273 printf("\n[sort] ERROR : number of cores must be power of 2\n"); … … 304 276 305 277 // check array size 306 if ( ARRAY_LENGTH % t otal_threads)278 if ( ARRAY_LENGTH % threads) 307 279 { 308 280 printf("\n[sort] ERROR : array size must be multiple of number of threads\n"); … … 311 283 312 284 printf("\n[sort] main starts / %d threads / %d items / pid %x / cycle %d\n", 313 t otal_threads, ARRAY_LENGTH, getpid(), (unsigned int)start_cycle );285 threads, ARRAY_LENGTH, getpid(), (unsigned int)start_cycle ); 314 286 315 287 // initialize barrier … … 319 291 barrier_attr.y_size = y_size; 320 292 barrier_attr.nthreads = ncores; 321 error = pthread_barrier_init( &barrier, &barrier_attr , t otal_threads );293 error = pthread_barrier_init( &barrier, &barrier_attr , threads ); 322 294 } 323 295 else // use SIMPLE_BARRIER 324 296 { 325 error = pthread_barrier_init( &barrier, NULL , t otal_threads );297 error = pthread_barrier_init( &barrier, NULL , threads ); 326 298 } 327 299 … … 352 324 #endif 353 325 354 // build array of arguments for the <sort> threads355 for (x = 0 ; x < x_size ; x++)356 {357 for (y = 0 ; y < y_size ; y++)358 {359 // compute cluster identifier360 cxy = HAL_CXY_FROM_XY( x , y );361 362 for ( lid = 0 ; lid < ncores ; lid++ )363 {364 // compute thread continuous index365 tid = (((x * y_size) + y) * ncores) + lid;366 367 // initialize 2D array of arguments368 sort_args[cxy][lid].tid = tid;369 sort_args[cxy][lid].threads = total_threads;370 sort_args[cxy][lid].parent_barrier = &parent_barriers[cxy];371 372 // initialize 2D array of pointers373 sort_ptrs[cxy][lid] = &sort_args[cxy][lid];374 }375 }376 }377 378 326 /////////////////////////// 379 327 get_cycle( &seq_end_cycle ); … … 386 334 // create and execute the working threads 387 335 if( pthread_parallel_create( root_level, 388 &sort, 389 &sort_ptrs[0][0], 390 &parent_barriers[0] ) ) 336 &sort ) ) 391 337 { 392 338 printf("\n[sort] ERROR : cannot create threads\n"); … … 412 358 #if CHECK_RESULT 413 359 int success = 1; 414 int * res_array = ( (t otal_threads == 2) ||415 (t otal_threads == 8) ||416 (t otal_threads == 32) ||417 (t otal_threads == 128) ||418 (t otal_threads == 512) ) ? array1 : array0;360 int * res_array = ( (threads == 2) || 361 (threads == 8) || 362 (threads == 32) || 363 (threads == 128) || 364 (threads == 512) ) ? array1 : array0; 419 365 420 366 for( n=0 ; n<(ARRAY_LENGTH-2) ; n++ ) -
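In sort.c the per-thread argument arrays and the parent_barriers[] array disappear: the work arguments are now built by pthread_parallel_create() itself, and main() only computes the covering DQT root_level before calling it. The snippet below is a small, runnable reformulation of that root_level computation (the cascade of ternary tests in the diff), with placeholder mesh sizes instead of a get_config() call; the real main() then passes the result to the non-standard pthread_parallel_create().

// Runnable sketch of the covering-DQT root_level computation used by the sort
// and transpose main() functions: root_level is the depth of the smallest
// quad-tree covering a (x_size * y_size) mesh, i.e. ceil(log2(max(x,y))).
#include <stdio.h>
#include <assert.h>

static unsigned int covering_dqt_level( unsigned int x_size, unsigned int y_size )
{
    unsigned int z     = (x_size > y_size) ? x_size : y_size;
    unsigned int level = 0;

    assert( (z != 0) && ((z & (z - 1)) == 0) );   // mesh sides must be powers of 2

    while( (1u << level) < z ) level++;           // smallest level with 2^level >= z
    return level;
}

int main( void )
{
    // examples: a 1x1 mesh needs level 0, a 4x2 mesh level 2, a 16x16 mesh level 4
    printf("1x1   -> %u\n", covering_dqt_level(  1,  1 ) );
    printf("4x2   -> %u\n", covering_dqt_level(  4,  2 ) );
    printf("16x16 -> %u\n", covering_dqt_level( 16, 16 ) );
    return 0;
}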
trunk/user/transpose/transpose.c
r646 r652 5 5 ////////////////////////////////////////////////////////////////////////////////////////// 6 6 // This multi-threaded aplication read a raw image (one byte per pixel) 7 // stored on disk, transpose it, display the result on the frame buffer, 8 // and store the transposed image on disk. 9 // It can run on a multi-cores, multi-clusters architecture, with one thread 7 // stored on disk, transposes it, displays the result on the frame buffer, 8 // and stores the transposed image on disk. 10 9 // 11 // per core, and uses the POSIX threads API. 12 // It uses the mmap() syscall to directly access the input and output files 13 // and the fbf_write() syscall to display the images. 10 // The image size and the pixel encoding type are defined by the IMAGE_SIZE and 11 // IMAGE_TYPE global parameters. 14 12 // 15 // The main() function can be launched on any core[cxy,l]. 16 // It makes the initialisations, launch (N-1) threads to run the execute() function 17 // on the (N-1) other cores, calls himself the execute() function, and finally calls 18 // the instrument() function to display instrumentation results when the parallel 19 // execution is completed. The placement of threads on the cores can be done 20 // automatically by the operating system, or can be done explicitely by the main thread 21 // (when the EXPLICIT_PLACEMENT global parameter is set). 13 // It can run on a multi-cores, multi-clusters architecture, where (X_SIZE * Y_SIZE) 14 // is the number of clusters and NCORES the number of cores per cluster. 15 // A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier, 16 // (that is NOT required to be a continuous index), and lid is the local core index, 17 // (that must be in the [Ø,NCORES-1] range). 22 18 // 23 // The buf_in[x,y] and buf_out[put buffers containing the direct ans transposed images 24 // are distributed in clusters: In each cluster[cxy], the thread running on core[cxy,0] 25 // map the buf_in[cxy] and // buf_out[cxy] buffers containing a subset of lines. 26 // Then, all threads in cluster[xy] read pixels from the local buf_in[cxy] buffer, and 27 // write the pixels to all remote buf_out[cxy] buffers. Finally, each thread display 28 // a part of the transposed image to the frame buffer. 19 // The main() function can run on any core in any cluster. This main thread 20 // makes the initialisations, uses the pthread_create() syscall to launch (NTHREADS-1) 21 // other threads in "attached" mode running in parallel the execute() function, calls 22 // himself the execute() function, wait completion of the (NTHREADS-1) other threads 23 // with a pthread_join(), and finally calls the instrument() function to display 24 // and register the instrumentation results when execution is completed. 25 // All threads run the execute() function, but each thread transposes only 26 // (NLINES / NTHREADS) lines. This requires that NLINES == k * NTHREADS. 27 // 28 // The number N of working threads is always defined by the number of cores availables 29 // in the architecture, but this application supports three placement modes. 30 // In all modes, the working threads are identified by the [tid] continuous index 31 // in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads. 32 // This continuous index can always be decomposed in two continuous sub-indexes: 33 // tid == cid * ncores + lid, where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1]. 34 // 35 // - NO_PLACEMENT: the main thread is itsef a working thread. 
The (N_1) other working 36 // threads are created by the main thread, but the placement is done by the OS, using 37 // the DQDT for load balancing, and two working threads can be placed on the same core. 38 // The [cid,lid] are only abstract identifiers, and cannot be associated to a physical 39 // cluster or a physical core. In this mode, the main thread run on any cluster, 40 // but has tid = 0 (i.e. cid = 0 & tid = 0). 41 // 42 // - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement of 43 // of the threads on the cores is explicitely controled by the main thread to have 44 // exactly one working thread per core, and the [cxy][lpid] core coordinates for a given 45 // thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the 46 // physical cluster identifier, and [lid] is the local core index. 47 // 48 // - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the 49 // non standard pthread_parallel_create() function to avoid the costly sequencial 50 // loops for pthread_create() and pthread_join(). It garanty one working thread 51 // per core, and the same relation between the thread[tid] and the core[cxy][lpid]. 52 // 53 // The buf_in[x,y] and buf_out[put buffers containing the direct and transposed images 54 // are distributed in clusters: each thread[cid][0] allocate a local input buffer 55 // and load in this buffer all lines that must be handled by the threads sharing the 56 // same cid, from the mapper of the input image file. 57 // In the execute function, all threads in the group defined by the cid index read pixels 58 // from the local buf_in[cid] buffer, and write pixels to all remote buf_out[cid] buffers. 59 // Finally, each thread displays a part of the transposed image to the frame buffer. 29 60 // 30 61 // - The image must fit the frame buffer size, that must be power of 2. 31 62 // - The number of clusters must be a power of 2 no larger than 256. 32 63 // - The number of cores per cluster must be a power of 2 no larger than 4. 33 // - The number of clusters cannot be larger than (IMAGE_SIZE * IMAGE_SIZE) / 4096, 34 // because the size of buf_in[x,y] and buf_out[x,y] must be multiple of 4096. 64 // - The number of threads cannot be larger than IMAGE_SIZE. 
35 65 // 36 66 ////////////////////////////////////////////////////////////////////////////////////////// … … 50 80 #define CORES_MAX 4 // max number of cores per cluster 51 81 #define CLUSTERS_MAX (X_MAX * Y_MAX) // max number of clusters 52 53 #define IMAGE_SIZE 256 // image size 82 #define THREADS_MAX (X_MAX * Y_MAX * CORES_MAX) // max number of threads 83 84 #define IMAGE_SIZE 512 // image size 54 85 #define IMAGE_TYPE 420 // pixel encoding type 55 #define INPUT_FILE_PATH "/misc/lena_256.raw" // input file pathname 56 #define OUTPUT_FILE_PATH "/home/trsp_256.raw" // output file pathname 57 86 #define INPUT_FILE_PATH "/misc/couple_512.raw" // input file pathname 87 #define OUTPUT_FILE_PATH "/misc/transposed_512.raw" // output file pathname 88 89 #define SAVE_RESULT_FILE 0 // save result image on disk 58 90 #define USE_DQT_BARRIER 1 // quad-tree barrier if non zero 59 #define EXPLICIT_PLACEMENT 1 // explicit thread placement 60 #define VERBOSE 1 // print comments on TTY 91 92 #define NO_PLACEMENT 0 // uncontrolefdthread placement 93 #define EXPLICIT_PLACEMENT 0 // explicit threads placement 94 #define PARALLEL_PLACEMENT 1 // parallel threads placement 95 96 #define VERBOSE_MAIN 0 // main function print comments 97 #define VERBOSE_EXEC 0 // exec function print comments 98 #define VERBOSE_INSTRU 0 // instru function print comments 61 99 62 100 … … 65 103 /////////////////////////////////////////////////////// 66 104 67 // instrumentation counters for each processor in each cluster 68 unsigned int MMAP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 69 unsigned int MMAP_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 105 // global instrumentation counters for the main thread 106 unsigned int SEQUENCIAL_TIME = 0; 107 unsigned int PARALLEL_TIME = 0; 108 109 // instrumentation counters for each thread in each cluster 110 // indexed by [cid][lid] : cluster continuous index / thread local index 111 unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 112 unsigned int LOAD_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 70 113 unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 71 114 unsigned int TRSP_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; … … 73 116 unsigned int DISP_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; 74 117 75 // arrays of pointers on distributed buffers 76 // one input buffer & one output buffer per cluster 77 unsigned char * buf_in [CLUSTERS_MAX]; 78 unsigned char * buf_out[CLUSTERS_MAX]; 79 80 // synchronisation barrier (all threads) 118 // pointer on buffer containing the input image, maped by the main to the input file 119 unsigned char * image_in; 120 121 // pointer on buffer containing the output image, maped by the main to the output file 122 unsigned char * image_out; 123 124 // arrays of pointers on distributed buffers indexed by [cid] : cluster continuous index 125 unsigned char * buf_in_ptr [CLUSTERS_MAX]; 126 unsigned char * buf_out_ptr[CLUSTERS_MAX]; 127 128 // synchronisation barrier (all working threads) 81 129 pthread_barrier_t barrier; 82 130 83 131 // platform parameters 84 unsigned int x_size; // number of clusters in a row 85 unsigned int y_size; // number of clusters in a column 86 unsigned int ncores; // number of processors per cluster 87 88 // cluster identifier & local index of core running the main thread 89 unsigned int cxy_main; 90 unsigned int lid_main; 91 92 // input & output file descriptors 93 int fd_in; 94 int fd_out; 95 96 #if EXPLICIT_PLACEMENT 97 98 // thread index allocated by the kernel 99 pthread_t trdid[CLUSTERS_MAX][CORES_MAX]; 100 101 // user 
defined continuous thread index 102 unsigned int tid[CLUSTERS_MAX][CORES_MAX]; 103 104 // thread attributes only used if explicit placement 105 pthread_attr_t attr[CLUSTERS_MAX][CORES_MAX]; 106 107 #else 108 109 // thread index allocated by the kernel 110 pthread_t trdid[CLUSTERS_MAX * CORES_MAX]; 111 112 // user defined continuous thread index 113 unsigned int tid[CLUSTERS_MAX * CORES_MAX]; 114 115 #endif 132 unsigned int x_size; // number of clusters in a row 133 unsigned int y_size; // number of clusters in a column 134 unsigned int ncores; // number of cores per cluster 135 136 // main thread continuous index 137 unsigned int tid_main; 116 138 117 139 //return values at thread exit … … 119 141 unsigned int THREAD_EXIT_FAILURE = 1; 120 142 143 // array of kernel thread identifiers / indexed by [tid] 144 pthread_t exec_trdid[THREADS_MAX]; 145 146 // array of execute function arguments / indexed by [tid] 147 pthread_parallel_work_args_t exec_args[THREADS_MAX]; 148 149 // array of thread attributes / indexed by [tid] 150 pthread_attr_t exec_attr[THREADS_MAX]; 151 121 152 //////////////////////////////////////////////////////////////// 122 153 // functions declaration 123 154 //////////////////////////////////////////////////////////////// 124 155 125 void execute( unsigned int * ptid);126 127 void instrument( void);128 129 /////////// 130 void main( )156 void execute( pthread_parallel_work_args_t * args ); 157 158 void instrument( FILE * f , char * filename ); 159 160 ///////////////// 161 void main( void ) 131 162 { 132 unsigned long long date; 163 unsigned long long start_cycle; 164 unsigned long long end_sequencial_cycle; 165 unsigned long long end_parallel_cycle; 166 167 char filename[32]; // instrumentation file name 168 char pathname[64]; // instrumentation file pathname 133 169 134 170 int error; 135 171 136 printf("\n bloup 0\n"); 137 138 // get identifiers for core executing main 139 get_core_id( &cxy_main , &lid_main ); 140 141 printf("\n bloup 1\n"); 172 ///////////////////////////////////////////////////////////////////////////////// 173 get_cycle( &start_cycle ); 174 ///////////////////////////////////////////////////////////////////////////////// 175 176 if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 ) 177 { 178 printf("\n[transpose error] illegal placement\n"); 179 exit( 0 ); 180 } 142 181 143 182 // get & check plat-form parameters 144 get_config( &x_size , &y_size , &ncores );145 146 printf("\n bloup 2\n");147 148 if((ncores != 1) && (ncores != 2) && (ncores == 4))183 get_config( &x_size, 184 &y_size, 185 &ncores ); 186 187 if((ncores != 1) && (ncores != 2) && (ncores != 4)) 149 188 { 150 189 printf("\n[transpose error] number of cores per cluster must be 1/2/4\n"); … … 166 205 } 167 206 168 printf("\n bloup 3\n"); 207 // main thread get identifiers for core executing main 208 unsigned int cxy_main; 209 unsigned int lid_main; 210 get_core_id( &cxy_main , &lid_main ); 169 211 170 212 // compute number of threads … … 172 214 unsigned int nthreads = nclusters * ncores; 173 215 174 printf("\n bloup 4\n"); 175 176 // get FBF ownership and FBF size 216 // main thread get FBF size and type 177 217 unsigned int fbf_width; 178 218 unsigned int fbf_height; … … 180 220 fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); 181 221 182 printf("\n bloup 5\n");183 184 222 if( (fbf_width != IMAGE_SIZE) || (fbf_height != IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) ) 185 223 { … … 188 226 } 189 227 190 get_cycle( &date ); 191 printf("\n[transpose] starts at cycle %d on %d 
cores / FBF = %d * %d pixels\n", 192 (unsigned int)date , nthreads , fbf_width , fbf_height ); 193 194 // open input file 195 fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); // read-only 196 if ( fd_in < 0 ) 228 if( nthreads > IMAGE_SIZE ) 229 { 230 printf("\n[transpose error] number of threads larger than number of lines\n"); 231 exit( 0 ); 232 } 233 234 unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE; 235 236 // define instrumentation file name 237 if( NO_PLACEMENT ) 238 { 239 printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n", 240 nclusters, ncores, fbf_width, fbf_height, getpid() ); 241 242 // build instrumentation file name 243 if( USE_DQT_BARRIER ) 244 snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d", 245 IMAGE_SIZE , x_size * y_size , ncores ); 246 else 247 snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d", 248 IMAGE_SIZE , x_size * y_size , ncores ); 249 } 250 251 if( EXPLICIT_PLACEMENT ) 252 { 253 printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n", 254 nclusters, ncores, fbf_width, fbf_height, getpid() ); 255 256 // build instrumentation file name 257 if( USE_DQT_BARRIER ) 258 snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d", 259 IMAGE_SIZE , x_size * y_size , ncores ); 260 else 261 snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d", 262 IMAGE_SIZE , x_size * y_size , ncores ); 263 } 264 265 if( PARALLEL_PLACEMENT ) 266 { 267 printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n", 268 nclusters, ncores, fbf_width, fbf_height, getpid() ); 269 270 // build instrumentation file name 271 if( USE_DQT_BARRIER ) 272 snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d", 273 IMAGE_SIZE , x_size * y_size , ncores ); 274 else 275 snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d", 276 IMAGE_SIZE , x_size * y_size , ncores ); 277 } 278 279 // open instrumentation file 280 snprintf( pathname , 64 , "/home/%s", filename ); 281 FILE * f = fopen( pathname , NULL ); 282 if ( f == NULL ) 197 283 { 198 printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH ); 199 exit( 0 ); 200 } 201 202 #if VERBOSE 203 printf("\n[transpose] main open file %s / fd = %d\n", INPUT_FILE_PATH , fd_in ); 204 #endif 205 206 // open output file 207 fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); // create if required 208 if ( fd_out < 0 ) 209 { 210 printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH ); 211 exit( 0 ); 212 } 213 214 #if VERBOSE 215 printf("\n[transpose] main open file %s / fd = %d\n", OUTPUT_FILE_PATH , fd_out ); 216 #endif 217 218 // initialise barrier 284 printf("\n[transpose error] cannot open instrumentation file %s\n", pathname ); 285 exit( 0 ); 286 } 287 288 #if VERBOSE_MAIN 289 printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n", 290 cxy_main, lid_main, pathname ); 291 #endif 292 293 // main thread initializes barrier 219 294 if( USE_DQT_BARRIER ) 220 295 { … … 236 311 } 237 312 238 get_cycle( &date ); 239 printf("\n[transpose] main on core[%x,%d] completes initialisation at cycle %d\n" 240 "- CLUSTERS = %d\n" 241 "- PROCS = %d\n" 242 "- THREADS = %d\n", 243 cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads ); 244 245 ////////////////////// 246 #if EXPLICIT_PLACEMENT 247 248 // main thread launch other threads 249 unsigned int x; 250 unsigned int y; 251 unsigned int l; 252 unsigned int cxy; 253 for( x = 0 ; x < x_size ; x++ ) 254 { 255 for( y = 0 ; y < y_size ; y++ ) 313 #if VERBOSE_MAIN 314 
printf("\n[transpose] main on core[%x,%d] completes barrier initialisation\n", 315 cxy_main, lid_main ); 316 #endif 317 318 // main thread open input file 319 int fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); 320 321 if ( fd_in < 0 ) 322 { 323 printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH ); 324 exit( 0 ); 325 } 326 327 #if VERBOSE_MAIN 328 printf("\n[transpose] main open file <%s> / fd = %d\n", INPUT_FILE_PATH , fd_in ); 329 #endif 330 331 // main thread map image_in buffer to input image file 332 image_in = (unsigned char *)mmap( NULL, 333 npixels, 334 PROT_READ, 335 MAP_FILE | MAP_SHARED, 336 fd_in, 337 0 ); // offset 338 if ( image_in == NULL ) 339 { 340 printf("\n[transpose error] main cannot map buffer to file %s\n", INPUT_FILE_PATH ); 341 exit( 0 ); 342 } 343 344 #if VERBOSE_MAIN 345 printf("\n[transpose] main map buffer to file <%s>\n", INPUT_FILE_PATH ); 346 #endif 347 348 // main thread display input image on FBF 349 if( fbf_write( image_in, 350 npixels, 351 0 ) ) 352 { 353 printf("\n[transpose error] main cannot access FBF\n"); 354 exit( 0 ); 355 } 356 357 #if SAVE_RESULT_IMAGE 358 359 // main thread open output file 360 int fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); 361 362 if ( fd_out < 0 ) 363 { 364 printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH ); 365 exit( 0 ); 366 } 367 368 #if VERBOSE_MAIN 369 printf("\n[transpose] main open file <%s> / fd = %d\n", OUTPUT_FILE_PATH , fd_out ); 370 #endif 371 372 // main thread map image_out buffer to output image file 373 image_out = (unsigned char *)mmap( NULL, 374 npixels, 375 PROT_WRITE, 376 MAP_FILE | MAP_SHARED, 377 fd_out, 378 0 ); // offset 379 if ( image_out == NULL ) 380 { 381 printf("\n[transpose error] main cannot map buf_out to file %s\n", OUTPUT_FILE_PATH ); 382 exit( 0 ); 383 } 384 385 #if VERBOSE_MAIN 386 printf("\n[transpose] main map buffer to file <%s>\n", OUTPUT_FILE_PATH ); 387 #endif 388 389 #endif // SAVE_RESULT_IMAGE 390 391 ///////////////////////////////////////////////////////////////////////////////////// 392 get_cycle( &end_sequencial_cycle ); 393 SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle); 394 ///////////////////////////////////////////////////////////////////////////////////// 395 396 ////////////////// 397 if( NO_PLACEMENT ) 398 { 399 // the tid value for the main thread is always 0 400 // main thread creates new threads with tid in [1,nthreads-1] 401 unsigned int tid; 402 for ( tid = 0 ; tid < nthreads ; tid++ ) 256 403 { 257 cxy = HAL_CXY_FROM_XY( x , y ); 258 for( l = 0 ; l < ncores ; l++ ) 404 // register tid value in exec_args[tid] array 405 exec_args[tid].tid = tid; 406 407 // create other threads 408 if( tid > 0 ) 259 409 { 260 // no other thread on the core running the main 261 if( (cxy != cxy_main) || (l != lid_main) ) 410 if ( pthread_create( &exec_trdid[tid], 411 NULL, // no attribute 412 &execute, 413 &exec_args[tid] ) ) 262 414 { 263 // define thread attributes 264 attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 265 attr[cxy][l].cxy = cxy; 266 attr[cxy][l].lid = l; 267 268 tid[cxy][l] = (((x * y_size) + y) * ncores) + l; 415 printf("\n[transpose error] cannot create thread %d\n", tid ); 416 exit( 0 ); 417 } 418 419 #if VERBOSE_MAIN 420 printf("\n[transpose] main created thread %d\n", tid ); 421 #endif 422 423 } 424 else 425 { 426 tid_main = 0; 427 } 428 } // end for tid 429 430 // main thread calls itself the execute() function 431 execute( &exec_args[0] ); 432 433 // main thread 
wait other threads completion 434 for ( tid = 1 ; tid < nthreads ; tid++ ) 435 { 436 unsigned int * status; 437 438 // main wait thread[tid] status 439 if ( pthread_join( exec_trdid[tid], (void*)(&status)) ) 440 { 441 printf("\n[transpose error] main cannot join thread %d\n", tid ); 442 exit( 0 ); 443 } 444 445 // check status 446 if( *status != THREAD_EXIT_SUCCESS ) 447 { 448 printf("\n[transpose error] thread %x returned failure\n", tid ); 449 exit( 0 ); 450 } 451 452 #if VERBOSE_MAIN 453 printf("\n[transpose] main successfully joined thread %x\n", tid ); 454 #endif 455 456 } // end for tid 457 458 } // end if no_placement 459 460 //////////////////////// 461 if( EXPLICIT_PLACEMENT ) 462 { 463 // main thread places each other threads on a specific core[cxy][lid] 464 // but the actual thread creation is sequencial 465 unsigned int x; 466 unsigned int y; 467 unsigned int l; 468 unsigned int cxy; // cluster identifier 469 unsigned int tid; // thread continuous index 470 471 for( x = 0 ; x < x_size ; x++ ) 472 { 473 for( y = 0 ; y < y_size ; y++ ) 474 { 475 cxy = HAL_CXY_FROM_XY( x , y ); 476 for( l = 0 ; l < ncores ; l++ ) 477 { 478 // compute thread continuous index 479 tid = (((x * y_size) + y) * ncores) + l; 480 481 // register tid value in exec_args[tid] array 482 exec_args[tid].tid = tid; 483 484 // no thread created on the core running the main 485 if( (cxy != cxy_main) || (l != lid_main) ) 486 { 487 // define thread attributes 488 exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | 489 PT_ATTR_CORE_DEFINED; 490 exec_attr[tid].cxy = cxy; 491 exec_attr[tid].lid = l; 269 492 270 // create thread on core[cxy,l] 271 if (pthread_create( &trdid[cxy][l], 272 &attr[cxy][l], 273 &execute, 274 &tid[cxy][l] ) ) 493 // create thread[tid] on core[cxy][l] 494 if ( pthread_create( &exec_trdid[tid], 495 &exec_attr[tid], 496 &execute, 497 &exec_args[tid] ) ) 498 { 499 printf("\n[transpose error] cannot create thread %d\n", tid ); 500 exit( 0 ); 501 } 502 #if VERBOSE_MAIN 503 printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l ); 504 #endif 505 } 506 else 275 507 { 276 printf("\n[convol error] created thread %x on core[%x][%d]\n", 277 trdid[cxy][l] , cxy , l ); 278 exit( 0 ); 508 tid_main = tid; 279 509 } 280 #if VERBOSE281 printf("\n[transpose] main created thread[%x,%d]\n", cxy, l );282 #endif283 510 } 284 511 } 285 512 } 286 } 287 288 // main thread calls itself the execute() function 289 execute( &tid[cxy_main][lid_main] ); 290 291 // main thread wait other threads completion 292 for( x = 0 ; x < x_size ; x++ ) 293 { 294 for( y = 0 ; y < y_size ; y++ ) 513 514 // main thread calls itself the execute() function 515 execute( &exec_args[tid_main] ); 516 517 // main thread wait other threads completion 518 for( tid = 0 ; tid < nthreads ; tid++ ) 295 519 { 296 cxy = HAL_CXY_FROM_XY( x , y );297 for( l = 0 ; l < ncores ; l++)520 // no other thread on the core running the main 521 if( tid != tid_main ) 298 522 { 299 // no other thread on the core running the main 300 if( (cxy != cxy_main) || (l != lid_main) ) 523 unsigned int * status; 524 525 // wait thread[tid] 526 if( pthread_join( exec_trdid[tid] , (void*)(&status) ) ) 301 527 { 302 unsigned int * status; 303 304 // wait thread[cxy][l] 305 if( pthread_join( trdid[cxy][l] , (void*)(&status) ) ) 306 { 307 printf("\n[transpose error] main cannot join thread[%x,%d]\n", cxy, l ); 308 exit( 0 ); 309 } 528 printf("\n[transpose error] main cannot join thread %d\n", tid ); 529 exit( 0 ); 530 } 310 531 311 // check status 312 if( 
*status != THREAD_EXIT_SUCCESS ) 313 { 314 printf("\n[transpose error] thread[%x,%d] returned failure\n", cxy, l ); 315 exit( 0 ); 316 } 317 #if VERBOSE 318 printf("\n[transpose] main joined thread[%x,%d]\n", cxy, l ); 319 #endif 532 // check status 533 if( *status != THREAD_EXIT_SUCCESS ) 534 { 535 printf("\n[transpose error] thread %d returned failure\n", tid ); 536 exit( 0 ); 320 537 } 538 #if VERBOSE_MAIN 539 printf("\n[transpose] main joined thread %d on core[%x,%d]\n", tid , cxy , l ); 540 #endif 321 541 } 322 542 } 323 } 324 325 ///////////////////////////////326 #else // no explicit placement 327 328 // main thread launch other threads329 unsigned int n;330 for ( n = 1 ; n < nthreads ; n++ )331 {332 tid[n] = n;333 if ( pthread_create( &trdid[n],334 NULL, // no attribute 335 &execute,336 &tid[n] ) )543 } // end if explicit_placement 544 545 //////////////////////// 546 if( PARALLEL_PLACEMENT ) 547 { 548 // compute covering DQT size an level 549 unsigned int z = (x_size > y_size) ? x_size : y_size; 550 unsigned int root_level = ((z == 1) ? 0 : 551 ((z == 2) ? 1 : 552 ((z == 4) ? 2 : 553 ((z == 8) ? 3 : 4)))); 554 555 // create & execute the working threads 556 if( pthread_parallel_create( root_level , &execute ) ) 337 557 { 338 printf("\n[transpose error] cannot create thread %d\n", n);558 printf("\n[transpose error] in %s\n", __FUNCTION__ ); 339 559 exit( 0 ); 340 560 } 341 342 #if VERBOSE 343 printf("\n[transpose] main created thread %d\n", tid[n] ); 344 #endif 345 346 } 347 348 // main thread calls itself the execute() function 349 execute( &tid[0] ); 350 351 // main thread wait other threads completion 352 for ( n = 1 ; n < nthreads ; n++ ) 353 { 354 unsigned int * status; 355 356 // main wait thread[n] status 357 if ( pthread_join( trdid[n], (void*)(&status)) ) 358 { 359 printf("\n[transpose error] main cannot join thread %d\n", n ); 360 exit( 0 ); 361 } 362 363 // check status 364 if( *status != THREAD_EXIT_SUCCESS ) 365 { 366 printf("\n[transpose error] thread %x returned failure\n", n ); 367 exit( 0 ); 368 } 369 370 #if VERBOSE 371 printf("\n[transpose] main successfully joined thread %x\n", tid[n] ); 372 #endif 373 374 } 375 376 #endif 377 378 // instrumentation 379 instrument(); 380 381 // close input and output files 561 } // end if parallel_placement 562 563 564 ///////////////////////////////////////////////////////////////////////////// 565 get_cycle( &end_parallel_cycle ); 566 PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle); 567 ///////////////////////////////////////////////////////////////////////////// 568 569 // main thread register instrumentation results 570 instrument( f , filename ); 571 572 // main thread close input file 382 573 close( fd_in ); 574 575 #if SAVE_RESULT_IMAGE 576 577 // main thread close output file 383 578 close( fd_out ); 384 579 385 // suicide 580 #endif 581 582 // main close instrumentation file 583 fclose( f ); 584 585 // main thread suicide 386 586 exit( 0 ); 387 587 … … 390 590 391 591 392 /////////////////////////////////// 393 void execute( unsigned int * ptid ) 592 593 /////////////////////////////////////////////////// 594 void execute( pthread_parallel_work_args_t * args ) 394 595 { 395 596 unsigned long long date; 396 597 397 unsigned int l; // line index for loops 398 unsigned int p; // pixel index for loops 399 400 // get thread continuous index 401 unsigned int my_tid = *ptid; 598 unsigned int l; // line index for loop 599 unsigned int p; // pixel index for loop 600 601 // WARNING 602 //A thread is 
identified by the tid index, defined in the "args" structure. 603 // This index being in range [0,nclusters*ncores-1] we can always write 604 // tid == cid * ncores + lid 605 // with cid in [0,nclusters-1] and lid in [0,ncores-1]. 606 // if NO_PLACEMENT, there is no relation between these 607 // thread [cid][lid] indexes, and the core coordinates [cxy][lpid] 608 609 // get thread abstract identifiers 610 unsigned int tid = args->tid; 611 unsigned int cid = tid / ncores; 612 unsigned int lid = tid % ncores; 613 614 #if VERBOSE_EXEC 615 unsigned int cxy; 616 unsigned int lpid; 617 get_core_id( &cxy , &lpid ); // get core physical identifiers 618 printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n", 619 tid , cxy , lpid ); 620 #endif 621 622 get_cycle( &date ); 623 LOAD_START[cid][lid] = (unsigned int)date; 402 624 403 625 // build total number of pixels per image 404 626 unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE; 405 627 406 // nuild total number of threads and clusters 407 unsigned int nthreads = x_size * y_size * ncores; 628 // build total number of threads and clusters 408 629 unsigned int nclusters = x_size * y_size; 409 410 // get cluster continuous index and core index from tid 411 // we use (tid == cid * ncores + lid) 412 unsigned int cid = my_tid / ncores; // continuous index 413 unsigned int lid = my_tid % ncores; // core local index 414 415 // get cluster identifier from cid 416 // we use (cid == x * y_size + y) 417 unsigned int x = cid / y_size; // X cluster coordinate 418 unsigned int y = cid % y_size; // Y cluster coordinate 419 unsigned int cxy = HAL_CXY_FROM_XY(x,y); 420 421 #if VERBOSE 422 printf("\n[transpose] thread[%d] start on core[%x,%d]\n", my_tid , cxy , lid ); 423 #endif 424 425 // In each cluster cxy, thread[cxy,0] map input file 426 // to buf_in[cxy] and map output file to buf_in[cxy] 427 428 get_cycle( &date ); 429 MMAP_START[cxy][lid] = (unsigned int)date; 430 431 if ( lid == 0 ) 432 { 433 unsigned int length = npixels / nclusters; 434 unsigned int offset = length * cid; 435 436 // map buf_in 437 buf_in[cid] = mmap( NULL, 438 length, 439 PROT_READ, 440 MAP_SHARED, 441 fd_in, 442 offset ); 443 444 if ( buf_in[cid] == NULL ) 630 unsigned int nthreads = nclusters * ncores; 631 632 unsigned int buf_size = npixels / nclusters; // number of bytes in buf_in & buf_out 633 unsigned int offset = cid * buf_size; // offset in file (bytes) 634 635 unsigned char * buf_in = NULL; // private pointer on local input buffer 636 unsigned char * buf_out = NULL; // private pointer on local output buffer 637 638 // Each thread[cid,0] allocate a local buffer buf_in, and register 639 // the base adress in the global variable buf_in_ptr[cid] 640 // this local buffer is shared by all threads with the same cid 641 if( lid == 0 ) 642 { 643 // allocate buf_in 644 buf_in = (unsigned char *)malloc( buf_size ); 645 646 if( buf_in == NULL ) 445 647 { 446 printf("\n[transpose error] thread[% x,%d] cannot map input file\n", cxy, lid);648 printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid ); 447 649 pthread_exit( &THREAD_EXIT_FAILURE ); 448 650 } 449 450 #if VERBOSE 451 printf("\n[transpose] thread[%x,%d] map input file / length %x / offset %x / buf_in %x\n", 452 cxy, lid, length, offset, buf_in[cid] ); 453 #endif 454 455 // map buf_out 456 buf_out[cid] = mmap( NULL, 457 length, 458 PROT_WRITE, 459 MAP_SHARED, 460 fd_out, 461 offset ); 462 463 if ( buf_out[cid] == NULL ) 651 652 // register buf_in buffer in global array of pointers 653 buf_in_ptr[cid] = 
buf_in; 654 655 #if VERBOSE_EXEC 656 printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n", 657 tid , cxy , lpid , buf_in ); 658 #endif 659 660 } 661 662 // Each thread[cid,0] copy relevant part of the image_in to buf_in 663 if( lid == 0 ) 664 { 665 memcpy( buf_in, 666 image_in + offset, 667 buf_size ); 668 } 669 670 #if VERBOSE_EXEC 671 printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n", 672 tid , cxy , lpid , cid ); 673 #endif 674 675 // Each thread[cid,0] allocate a local buffer buf_out, and register 676 // the base adress in the global variable buf_out_ptr[cid] 677 if( lid == 0 ) 678 { 679 // allocate buf_out 680 buf_out = (unsigned char *)malloc( buf_size ); 681 682 if( buf_out == NULL ) 464 683 { 465 printf("\n[transpose error] thread[% x,%d] cannot map output file\n", cxy, lid);684 printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid ); 466 685 pthread_exit( &THREAD_EXIT_FAILURE ); 467 686 } 468 469 #if VERBOSE 470 printf("\n[transpose] thread[%x,%d] map output file / length %x / offset %x / buf_out %x\n", 471 cxy, lid, length, offset, buf_out[cid] ); 472 #endif 473 474 } 475 687 688 // register buf_in buffer in global array of pointers 689 buf_out_ptr[cid] = buf_out; 690 691 #if VERBOSE_EXEC 692 printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n", 693 tid , cxy , lpid , buf_out ); 694 #endif 695 696 } 697 476 698 get_cycle( &date ); 477 MMAP_END[cxy][lid] = (unsigned int)date;699 LOAD_END[cid][lid] = (unsigned int)date; 478 700 479 701 ///////////////////////////////// 480 702 pthread_barrier_wait( &barrier ); 481 703 482 // parallel transpose from buf_in to buf_out 483 // each thread makes the transposition for nlt lines (nlt = IMAGE_SIZE/nthreads) 704 get_cycle( &date ); 705 TRSP_START[cid][lid] = (unsigned int)date; 706 707 // All threads contribute to parallel transpose from buf_in to buf_out 708 // each thread makes the transposition for nlt lines (nlt = npixels/nthreads) 484 709 // from line [tid*nlt] to line [(tid + 1)*nlt - 1] 485 710 // (p,l) are the absolute pixel coordinates in the source image 711 // (l,p) are the absolute pixel coordinates in the source image 712 // (p,l) are the absolute pixel coordinates in the dest image 486 713 487 714 get_cycle( &date ); 488 TRSP_START[c xy][lid] = (unsigned int)date;715 TRSP_START[cid][lid] = (unsigned int)date; 489 716 490 717 unsigned int nlt = IMAGE_SIZE / nthreads; // number of lines per thread 491 718 unsigned int nlc = IMAGE_SIZE / nclusters; // number of lines per cluster 492 719 493 unsigned int src_c luster;720 unsigned int src_cid; 494 721 unsigned int src_index; 495 unsigned int dst_c luster;722 unsigned int dst_cid; 496 723 unsigned int dst_index; 497 724 498 725 unsigned char byte; 499 726 500 unsigned int first = my_tid * nlt; // first line index for a given thread727 unsigned int first = tid * nlt; // first line index for a given thread 501 728 unsigned int last = first + nlt; // last line index for a given thread 502 729 730 // loop on lines handled by this thread 503 731 for ( l = first ; l < last ; l++ ) 504 732 { 505 // in each iteration we transfer one byte733 // loop on pixels in one line (one pixel per iteration) 506 734 for ( p = 0 ; p < IMAGE_SIZE ; p++ ) 507 735 { 508 736 // read one byte from local buf_in 509 src_cluster = l / nlc; 510 src_index = (l % nlc) * IMAGE_SIZE + p; 511 byte = buf_in[src_cluster][src_index]; 737 src_cid = l / nlc; 738 src_index = (l % nlc) * IMAGE_SIZE + p; 739 740 byte = buf_in_ptr[src_cid][src_index]; 512 
741 513 742 // write one byte to remote buf_out 514 dst_c luster= p / nlc;515 dst_index 516 517 buf_out [dst_cluster][dst_index] = byte;743 dst_cid = p / nlc; 744 dst_index = (p % nlc) * IMAGE_SIZE + l; 745 746 buf_out_ptr[dst_cid][dst_index] = byte; 518 747 } 519 748 } 520 749 521 #if VERBOSE 522 printf("\n[transpose] thread[%x,%d] completes transposed\n", cxy, lid ); 750 #if VERBOSE_EXEC 751 printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n", 752 tid , cxy , lpid ); 523 753 #endif 524 754 525 755 get_cycle( &date ); 526 TRSP_END[c xy][lid] = (unsigned int)date;756 TRSP_END[cid][lid] = (unsigned int)date; 527 757 528 758 ///////////////////////////////// 529 759 pthread_barrier_wait( &barrier ); 530 760 531 // parallel display from local buf_out to frame buffer532 // all threads contribute to display533 534 761 get_cycle( &date ); 535 DISP_START[cxy][lid] = (unsigned int)date; 536 762 DISP_START[cid][lid] = (unsigned int)date; 763 764 // All threads contribute to parallel display 765 // from local buf_out to frame buffer 537 766 unsigned int npt = npixels / nthreads; // number of pixels per thread 538 767 539 if( fbf_write( &buf_out [cid][lid * npt],768 if( fbf_write( &buf_out_ptr[cid][lid * npt], 540 769 npt, 541 npt * my_tid ) )542 { 543 printf("\n[transpose error] thread[% x,%d] cannot access FBF\n", cxy, lid );770 npt * tid ) ) 771 { 772 printf("\n[transpose error] thread[%d] cannot access FBF\n", tid ); 544 773 pthread_exit( &THREAD_EXIT_FAILURE ); 545 774 } 546 775 547 #if VERBOSE 548 printf("\n[transpose] thread[%x,%d] completes display\n", cxy, lid ); 776 #if VERBOSE_EXEC 777 printf("\n[transpose] exec[%d] on core [%x,%d] completes display\n", 778 tid, cxy , lpid ); 549 779 #endif 550 780 551 781 get_cycle( &date ); 552 DISP_END[c xy][lid] = (unsigned int)date;782 DISP_END[cid][lid] = (unsigned int)date; 553 783 554 784 ///////////////////////////////// 555 785 pthread_barrier_wait( &barrier ); 556 786 557 // all threads, but thread[0,0,0], suicide 558 if ( (cxy != cxy_main) || (lid != lid_main) ) 559 { 787 #if SAVE_RESULT_IMAGE 788 789 // Each thread[cid,0] copy buf_out to relevant part of image_out 790 if( lid == 0 ) 791 { 792 memcpy( image_out + offset, 793 buf_out, 794 buf_size ); 795 } 796 797 #if VERBOSE_EXEC 798 printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n", 799 tid , cxy , lpid , cid ); 800 #endif 801 802 #endif 803 804 // Each thread[cid,0] releases local buffer buf_out 805 if( lid == 0 ) 806 { 807 // release buf_out 808 free( buf_in ); 809 free( buf_out ); 810 } 811 812 // thread termination depends on the placement policy 813 if( PARALLEL_PLACEMENT ) 814 { 815 // <work> threads are runing in detached mode 816 // each thread must signal completion by calling barrier 817 // passed in arguments before exit 818 819 pthread_barrier_wait( args->barrier ); 820 560 821 pthread_exit( &THREAD_EXIT_SUCCESS ); 561 822 } 823 else 824 { 825 // <work> threads are running in attached mode 826 // each thread, but de main, simply exit 827 if ( tid != tid_main ) pthread_exit( &THREAD_EXIT_SUCCESS ); 828 } 562 829 563 830 } // end execute() … … 565 832 566 833 567 /////////////////////// 568 void instrument( void ) 834 /////////////////////////// 835 void instrument( FILE * f, 836 char * filename ) 569 837 { 570 838 unsigned int x, y, l; 839 840 #if VERBOSE_EXEC 841 printf("\n[transpose] main enters instrument\n" ); 842 #endif 571 843 572 844 unsigned int min_load_start = 0xFFFFFFFF; … … 583 855 unsigned int max_disp_ended = 0; 584 856 585 char 
string[64];586 587 snprintf( string , 64 , "/home/transpose_%d_%d_%d" , x_size , y_size , ncores );588 589 // open instrumentation file590 FILE * f = fopen( string , NULL );591 if ( f == NULL )592 {593 printf("\n[transpose error] cannot open instrumentation file %s\n", string );594 exit( 0 );595 }596 597 857 for (x = 0; x < x_size; x++) 598 858 { 599 859 for (y = 0; y < y_size; y++) 600 860 { 601 unsigned int c xy = HAL_CXY_FROM_XY( x , y );861 unsigned int cid = y_size * x + y; 602 862 603 863 for ( l = 0 ; l < ncores ; l++ ) 604 864 { 605 if ( MMAP_START[cxy][l] < min_load_start) min_load_start = MMAP_START[cxy][l];606 if ( MMAP_START[cxy][l] > max_load_start) max_load_start = MMAP_START[cxy][l];607 if ( MMAP_END[cxy][l] < min_load_ended) min_load_ended = MMAP_END[cxy][l];608 if ( MMAP_END[cxy][l] > max_load_ended) max_load_ended = MMAP_END[cxy][l];609 if (TRSP_START[c xy][l] < min_trsp_start) min_trsp_start = TRSP_START[cxy][l];610 if (TRSP_START[c xy][l] > max_trsp_start) max_trsp_start = TRSP_START[cxy][l];611 if (TRSP_END[c xy][l] < min_trsp_ended) min_trsp_ended = TRSP_END[cxy][l];612 if (TRSP_END[c xy][l] > max_trsp_ended) max_trsp_ended = TRSP_END[cxy][l];613 if (DISP_START[c xy][l] < min_disp_start) min_disp_start = DISP_START[cxy][l];614 if (DISP_START[c xy][l] > max_disp_start) max_disp_start = DISP_START[cxy][l];615 if (DISP_END[c xy][l] < min_disp_ended) min_disp_ended = DISP_END[cxy][l];616 if (DISP_END[c xy][l] > max_disp_ended) max_disp_ended = DISP_END[cxy][l];865 if (LOAD_START[cid][l] < min_load_start) min_load_start = LOAD_START[cid][l]; 866 if (LOAD_START[cid][l] > max_load_start) max_load_start = LOAD_START[cid][l]; 867 if (LOAD_END[cid][l] < min_load_ended) min_load_ended = LOAD_END[cid][l]; 868 if (LOAD_END[cid][l] > max_load_ended) max_load_ended = LOAD_END[cid][l]; 869 if (TRSP_START[cid][l] < min_trsp_start) min_trsp_start = TRSP_START[cid][l]; 870 if (TRSP_START[cid][l] > max_trsp_start) max_trsp_start = TRSP_START[cid][l]; 871 if (TRSP_END[cid][l] < min_trsp_ended) min_trsp_ended = TRSP_END[cid][l]; 872 if (TRSP_END[cid][l] > max_trsp_ended) max_trsp_ended = TRSP_END[cid][l]; 873 if (DISP_START[cid][l] < min_disp_start) min_disp_start = DISP_START[cid][l]; 874 if (DISP_START[cid][l] > max_disp_start) max_disp_start = DISP_START[cid][l]; 875 if (DISP_END[cid][l] < min_disp_ended) min_disp_ended = DISP_END[cid][l]; 876 if (DISP_END[cid][l] > max_disp_ended) max_disp_ended = DISP_END[cid][l]; 617 877 } 618 878 } 619 879 } 620 880 621 printf( "\n ------ %s ------\n" , string ); 622 fprintf( f , "\n ------ %s ------\n" , string ); 623 624 printf( " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n", 625 min_load_start, max_load_start, (min_load_start+max_load_start)/2, 626 max_load_start-min_load_start ); 627 628 fprintf( f , " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n", 629 min_load_start, max_load_start, (min_load_start+max_load_start)/2, 630 max_load_start-min_load_start ); 631 632 printf( " - MMAP_END : min = %d / max = %d / med = %d / delta = %d\n", 633 min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, 634 max_load_ended-min_load_ended ); 635 636 fprintf( f , " - MMAP_END : min = %d / max = %d / med = %d / delta = %d\n", 637 min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, 638 max_load_ended-min_load_ended ); 639 640 printf( " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n", 641 min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2, 642 max_trsp_start-min_trsp_start ); 
643
644         fprintf( f , " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n",
645                 min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2,
646                 max_trsp_start-min_trsp_start );
647
648         printf( " - TRSP_END : min = %d / max = %d / med = %d / delta = %d\n",
649                 min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2,
650                 max_trsp_ended-min_trsp_ended );
651
652         fprintf( f , " - TRSP_END : min = %d / max = %d / med = %d / delta = %d\n",
653                 min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2,
654                 max_trsp_ended-min_trsp_ended );
655
656         printf( " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
657                 min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2,
658                 max_disp_start-min_disp_start );
659
660         fprintf( f , " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
661                 min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2,
662                 max_disp_start-min_disp_start );
663
664         printf( " - DISP_END : min = %d / max = %d / med = %d / delta = %d\n",
665                 min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2,
666                 max_disp_ended-min_disp_ended );
667
668         fprintf( f , " - DISP_END : min = %d / max = %d / med = %d / delta = %d\n",
669                 min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2,
670                 max_disp_ended-min_disp_ended );
671
672         fclose( f );
881     printf( "\n ------ %s ------\n" , filename );
882     fprintf( f , "\n ------ %s ------\n" , filename );
883
884     printf( " - LOAD_START : min = %d / max = %d / delta = %d\n",
885             min_load_start, max_load_start, max_load_start-min_load_start );
886     fprintf( f , " - LOAD_START : min = %d / max = %d / delta = %d\n",
887             min_load_start, max_load_start, max_load_start-min_load_start );
888
889     printf( " - LOAD_END : min = %d / max = %d / delta = %d\n",
890             min_load_ended, max_load_ended, max_load_ended-min_load_ended );
891     fprintf( f , " - LOAD_END : min = %d / max = %d / delta = %d\n",
892             min_load_ended, max_load_ended, max_load_ended-min_load_ended );
893
894     printf( " - TRSP_START : min = %d / max = %d / delta = %d\n",
895             min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start );
896     fprintf( f , " - TRSP_START : min = %d / max = %d / delta = %d\n",
897             min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start );
898
899     printf( " - TRSP_END : min = %d / max = %d / delta = %d\n",
900             min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended );
901     fprintf( f , " - TRSP_END : min = %d / max = %d / delta = %d\n",
902             min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended );
903
904     printf( " - DISP_START : min = %d / max = %d / delta = %d\n",
905             min_disp_start, max_disp_start, max_disp_start-min_disp_start );
906     fprintf( f , " - DISP_START : min = %d / max = %d / delta = %d\n",
907             min_disp_start, max_disp_start, max_disp_start-min_disp_start );
908
909     printf( " - DISP_END : min = %d / max = %d / delta = %d\n",
910             min_disp_ended, max_disp_ended, max_disp_ended-min_disp_ended );
911     fprintf( f , " - DISP_END : min = %d / max = %d / delta = %d\n",
912             min_disp_ended, max_disp_ended, max_disp_ended-min_disp_ended );
913
914     printf( "\n Sequencial = %d / Parallel = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
915     fprintf( f , "\n Sequencial = %d / Parallel = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
673 916
674 917 }  // end instrument()
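The termination policy added in this hunk (detached <work> threads that signal completion through a barrier passed in their arguments, instead of being joined by the main thread) can be illustrated with plain POSIX threads. The sketch below is only an assumption-level illustration, not part of the changeset: it uses none of the almos-mkh specific calls (no pthread_parallel_create(), no get_cycle()), and the NWORKERS constant and worker() function are hypothetical names.

/* Minimal sketch, standard POSIX threads only: detached workers signal
 * completion through a shared barrier instead of being joined. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NWORKERS  4

static pthread_barrier_t done;               /* counts NWORKERS + the main thread */

static void * worker( void * arg )
{
    unsigned int tid = (unsigned int)(unsigned long)arg;
    printf("worker[%u] done\n", tid);        /* placeholder for the real work     */
    pthread_barrier_wait( &done );           /* signal completion to main         */
    return NULL;                             /* detached: nobody calls join on us */
}

int main( void )
{
    pthread_t      trdid;
    pthread_attr_t attr;

    pthread_barrier_init( &done, NULL, NWORKERS + 1 );
    pthread_attr_init( &attr );
    pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_DETACHED );

    for( unsigned long i = 0 ; i < NWORKERS ; i++ )
    {
        if( pthread_create( &trdid, &attr, worker, (void *)i ) )
        {
            printf("cannot create worker[%lu]\n", i);
            exit( EXIT_FAILURE );
        }
    }

    pthread_barrier_wait( &done );           /* wait for all detached workers */
    printf("all workers completed\n");
    return 0;
}

Sizing the barrier to NWORKERS + 1 lets the main thread block on the same rendez-vous the workers use to signal completion, which is what makes pthread_join() unnecessary for detached threads.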
-
trunk/user/transpose/transpose.ld
r646 r652
1   /****************************************************************************
1   /***************************************************************************
2 2  * Definition of the base address for all virtual segments
3    ****************************************************************************/
3    ***************************************************************************/
4 4
5 5 seg_code_base = 0x400000;
6
7   /***************************************************************************
8    * Define code entry point (e_entry field in .elf file)
9    ***************************************************************************/
10
11  ENTRY( main )
6 12
7 13 /***************************************************************************
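The new ENTRY( main ) directive fills the e_entry field of the generated .elf, as the added comment says. As a side note, that field can be inspected from C with the standard <elf.h> header; the sketch below is only illustrative, and the "transpose.elf" file name and the 32-bit ELF assumption are mine, not part of the changeset.

/* Illustrative only: print the e_entry field of a 32-bit ELF file.
 * With ENTRY( main ), this should be the address of the main symbol. */
#include <stdio.h>
#include <elf.h>

int main( void )
{
    Elf32_Ehdr hdr;
    FILE * f = fopen( "transpose.elf" , "rb" );   /* hypothetical file name */
    if( f == NULL )
    {
        printf("cannot open transpose.elf\n");
        return 1;
    }
    if( fread( &hdr , sizeof(hdr) , 1 , f ) != 1 )
    {
        printf("cannot read ELF header\n");
        fclose( f );
        return 1;
    }
    printf("entry point = 0x%x\n", (unsigned int)hdr.e_entry );
    fclose( f );
    return 0;
}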
Note: See TracChangeset for help on using the changeset viewer.