Changeset 652 for trunk/user/fft
- Timestamp:
- Nov 14, 2019, 3:56:51 PM (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/user/fft/fft.c
r649 r652 15 15 /*************************************************************************/ 16 16 17 /////////////////////////////////////////////////////////////////////////// 17 //////////////////////////////////////////////////////////////////////////////////////// 18 18 // This port of the SPLASH FFT benchmark on the ALMOS-MKH OS has been 19 19 // done by Alain Greiner (august 2018). … … 45 45 // that contains all coefs required for a rootN points FFT. 46 46 // 47 // There is one working thread per core.48 47 // The actual number of cores and cluster in a given hardware architecture 49 48 // is obtained by the get_config() syscall (x_size, y_size, ncores). … … 51 50 // The max number of cores per cluster is bounded by CORES_MAX. 52 51 // 53 // Several configuration parameters can be defined below: 54 // - PRINT_ARRAY : Print out complex data points arrays. 55 // - CHECK : Perform both FFT and inverse FFT to check output/input. 56 // - DEBUG_MAIN : Display intermediate results in main() 57 // - DEBUG_FFT1D : Display intermediate results in FFT1D() 58 // - DEBUG_ROW : Display intermedite results in FFTrow() 52 // The number N of working threads is always defined by the number of cores availables 53 // in the architecture, but this application supports three placement modes. 54 // In all modes, the working threads are identified by the [tid] continuous index 55 // in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads. 56 // This continuous index can always be decomposed in two continuous sub-indexes: 57 // tid == cid * ncores + lid, where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1]. 58 // 59 // - NO_PLACEMENT: the main thread is itsef a working thread. The (N_1) other working 60 // threads are created by the main thread, but the placement is done by the OS, using 61 // the DQDT for load balancing, and two working threads can be placed on the same core. 
62 // The [cid,lid] are only abstract identifiers, and cannot be associated to a physical 63 // cluster or a physical core. In this mode, the main thread run on any cluster, 64 // but has tid = 0 (i.e. cid = 0 & tid = 0). 65 // 66 // - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement of 67 // of the threads on the cores is explicitely controled by the main thread to have 68 // exactly one working thread per core, and the [cxy][lpid] core coordinates for a given 69 // thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the 70 // physical cluster identifier, and [lid] is the local core index. 71 // 72 // - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the 73 // non standard pthread_parallel_create() function to avoid the costly sequencial 74 // loops for pthread_create() and pthread_join(). It garanty one working thread 75 // per core, and the same relation between the thread[tid] and the core[cxy][lpid]. 76 // 77 // Several others configuration parameters can be defined below: 78 // - USE_DQT_BARRIER : use a hierarchical barrier for working threads synchro 79 // - PRINT_ARRAY : Print out complex data points arrays. 80 // - CHECK : Perform both FFT and inverse FFT to check output/input. 81 // - DEBUG_MAIN : Display intermediate results in main() 82 // - DEBUG_FFT1D : Display intermediate results in FFT1D() 83 // - DEBUG_ROW : Display intermedite results in FFTrow() 59 84 // 60 85 // Regarding final instrumentation: … … 66 91 // is computed by each thread(i) in the work() function. 67 92 // The results are displayed on the TXT terminal, and registered on disk. 
68 /////////////////////////////////////////////////////////////////////////// 93 /////////////////////////////////////////////////////////////////////////////////////// 69 94 70 95 #include <math.h> … … 92 117 // parameters 93 118 119 #define NO_PLACEMENT 1 120 #define EXPLICIT_PLACEMENT 0 121 #define PARALLEL_PLACEMENT 0 122 94 123 #define DEFAULT_M 18 // 256 K complex points 95 124 #define USE_DQT_BARRIER 1 // use DDT barrier if non zero … … 110 139 ///////////////////////////////////////////////////////////////////////////////////// 111 140 112 // work function arguments 113 typedef struct work_args_s 114 { 115 unsigned int tid; // thread continuous index 116 unsigned int lid; // core local index 117 unsigned int cid; // cluster continuous index 118 pthread_barrier_t * parent_barrier; // parent barrier to signal completion 119 } 120 work_args_t; 141 unsigned int x_size; // platform global parameter 142 unsigned int y_size; // platform global parameter 143 unsigned int ncores; // platform global parameter 121 144 122 145 unsigned int nthreads; // total number of threads (one thread per core) … … 130 153 // arrays of pointers on distributed buffers (one sub-buffer per cluster) 131 154 double * data[CLUSTERS_MAX]; // original time-domain data 132 double * trans[CLUSTERS_MAX]; // used as auxiliary space for transpose155 double * trans[CLUSTERS_MAX]; // used as auxiliary space for fft 133 156 double * twid[CLUSTERS_MAX]; // twiddle factor : exp(-2iPI*k*n/N) 134 157 double * bloup[CLUSTERS_MAX]; // used as auxiliary space for DFT … … 146 169 pthread_barrierattr_t barrier_attr; 147 170 148 ///////////////////////////////////////////////////////////////////////////////////// 149 // Global variables required by parallel_pthread_create() 150 ///////////////////////////////////////////////////////////////////////////////////// 151 152 // 2D arrays of input arguments for the <work> threads 153 // These arrays are initialised by the application main thread 154 155 
work_args_t work_args[CLUSTERS_MAX][CORES_MAX]; // work function arguments 156 work_args_t * work_ptrs[CLUSTERS_MAX][CORES_MAX]; // pointers on arguments 157 158 // 1D array of barriers to allow the <work> threads to signal termination 159 // this array is initialised in each cluster by the <build[cxy][0]> thread 160 161 pthread_barrier_t parent_barriers[CLUSTERS_MAX]; // termination barrier 171 //return values at thread exit 172 unsigned int THREAD_EXIT_SUCCESS = 0; 173 unsigned int THREAD_EXIT_FAILURE = 1; 174 175 // main thread continuous index 176 unsigned int tid_main; 177 178 // array of kernel thread identifiers / indexed by [tid] 179 pthread_t work_trdid[CLUSTERS_MAX * CORES_MAX]; 180 181 // array of thread attributes / indexed by [tid] 182 pthread_attr_t work_attr[CLUSTERS_MAX * CORES_MAX]; 183 184 // array of work function arguments / indexed by [tid] 185 pthread_parallel_work_args_t work_args[CLUSTERS_MAX * CORES_MAX]; 162 186 163 187 ///////////////////////////////////////////////////////////////////////////////////// … … 165 189 ///////////////////////////////////////////////////////////////////////////////////// 166 190 167 void work( work_args_t * args );191 void work( pthread_parallel_work_args_t * args ); 168 192 169 193 double CheckSum( void ); … … 234 258 int error; 235 259 236 unsigned int x_size; // number of clusters per row237 unsigned int y_size; // number of clusters per column238 unsigned int ncores; // max number of cores per cluster239 240 241 unsigned int x; // current index for cluster X coordinate242 unsigned int y; // current index for cluster Y coordinate243 unsigned int lid; // current index for core in a cluster244 260 unsigned int tid; // continuous thread index 245 unsigned int cid; // cluster continuous index246 unsigned int cxy; // hardware specific cluster identifier247 261 248 262 char name[64]; // instrumentation file name … … 265 279 int pid = getpid(); 266 280 281 // check placement mode 282 if( (NO_PLACEMENT + 
EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 ) 283 { 284 printf("\n[fft error] illegal placement mode\n"); 285 exit( 0 ); 286 } 287 267 288 // get FFT application start cycle 268 289 get_cycle( &start_init_cycle ); … … 295 316 exit( 0 ); 296 317 } 318 319 // get identifiers for core executing main 320 unsigned int cxy_main; 321 unsigned int lid_main; 322 get_core_id( &cxy_main , &lid_main ); 297 323 298 324 // compute nthreads and nclusters … … 317 343 } 318 344 319 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n", 320 N, nthreads, pid, (unsigned int)start_init_cycle ); 321 322 // build instrumentation file name 323 if( USE_DQT_BARRIER ) 324 snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores ); 325 else 326 snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores ); 327 328 // build pathname 345 // define instrumentation file name 346 if( NO_PLACEMENT ) 347 { 348 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / NO_PLACE\n", 349 N, nthreads, pid ); 350 351 // build instrumentation file name 352 if( USE_DQT_BARRIER ) 353 snprintf( name , 64 , "fft_dqt_no_place_%d_%d_%d", M , x_size * y_size , ncores ); 354 else 355 snprintf( name , 64 , "fft_smp_no_place_%d_%d_%d", M , x_size * y_size , ncores ); 356 } 357 358 if( EXPLICIT_PLACEMENT ) 359 { 360 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / EXPLICIT\n", 361 N, nthreads, pid ); 362 363 // build instrumentation file name 364 if( USE_DQT_BARRIER ) 365 snprintf( name , 64 , "fft_dqt_explicit_%d_%d_%d", M , x_size * y_size , ncores ); 366 else 367 snprintf( name , 64 , "fft_smp_explicit_%d_%d_%d", M , x_size * y_size , ncores ); 368 } 369 370 if( PARALLEL_PLACEMENT ) 371 { 372 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / PARALLEL\n", 373 N, nthreads, pid ); 374 375 // build instrumentation file name 376 if( USE_DQT_BARRIER ) 377 snprintf( name , 64 , "fft_dqt_parallel_%d_%d_%d", M , x_size * y_size , ncores ); 378 else 
379 snprintf( name , 64 , "fft_smp_parallel_%d_%d_%d", M , x_size * y_size , ncores ); 380 } 381 382 // build instrumentation file pathname 329 383 snprintf( path , 128 , "/home/%s", name ); 330 384 … … 339 393 #if DEBUG_MAIN 340 394 get_cycle( &debug_cycle ); 341 printf("\n[fft] main open file <%s> at cycle %d\n",395 printf("\n[fft] main open instrumentation file <%s> at cycle %d\n", 342 396 path, (unsigned int)debug_cycle ); 343 397 #endif … … 381 435 #if DEBUG_MAIN 382 436 get_cycle( &debug_cycle ); 383 printf("\n[fft] main completes barrier initat cycle %d\n",437 printf("\n[fft] main completes sequencial initialisation at cycle %d\n", 384 438 (unsigned int)debug_cycle ); 385 439 #endif 386 387 // build array of arguments for the <work> threads388 for (x = 0 ; x < x_size ; x++)389 {390 for (y = 0 ; y < y_size ; y++)391 {392 // compute cluster identifier393 cxy = HAL_CXY_FROM_XY( x , y );394 395 for ( lid = 0 ; lid < ncores ; lid++ )396 {397 // compute cluster continuous index398 cid = (x * y_size) + y;399 400 // compute work thread continuous index401 tid = (cid * ncores) + lid;402 403 // initialize 2D array of arguments404 work_args[cxy][lid].tid = tid;405 work_args[cxy][lid].lid = lid;406 work_args[cxy][lid].cid = cid;407 work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];408 409 // initialize 2D array of pointers410 work_ptrs[cxy][lid] = &work_args[cxy][lid];411 }412 }413 }414 440 415 441 // register sequencial time … … 417 443 init_time = (unsigned int)(end_init_cycle - start_init_cycle); 418 444 445 ////////////////// 446 if( NO_PLACEMENT ) 447 { 448 // the tid value for the main thread is always 0 449 // main thread creates new threads with tid in [1,nthreads-1] 450 unsigned int tid; 451 for ( tid = 0 ; tid < nthreads ; tid++ ) 452 { 453 // register tid value in work_args[tid] array 454 work_args[tid].tid = tid; 455 456 // create other threads 457 if( tid > 0 ) 458 { 459 if ( pthread_create( &work_trdid[tid], 460 NULL, // no attribute 461 &work, 
462 &work_args[tid] ) ) 463 { 464 printf("\n[fft error] cannot create thread %d\n", tid ); 465 exit( 0 ); 466 } 467 419 468 #if DEBUG_MAIN 420 printf("\n[fft] main completes <work> threads arguments at cycle %d\n", 421 (unsigned int)end_init_cycle ); 422 #endif 423 424 // create and execute the working threads 425 if( pthread_parallel_create( root_level, 426 &work, 427 &work_ptrs[0][0], 428 &parent_barriers[0] ) ) 429 { 430 printf("\n[fft error] creating threads\n"); 431 exit( 0 ); 469 printf("\n[fft] main created thread %d\n", tid ); 470 #endif 471 472 } 473 else 474 { 475 tid_main = 0; 476 } 477 } // end for tid 478 479 // main thread calls itself the execute() function 480 work( &work_args[0] ); 481 482 // main thread wait other threads completion 483 for ( tid = 1 ; tid < nthreads ; tid++ ) 484 { 485 unsigned int * status; 486 487 // main wait thread[tid] status 488 if ( pthread_join( work_trdid[tid], (void*)(&status)) ) 489 { 490 printf("\n[fft error] main cannot join thread %d\n", tid ); 491 exit( 0 ); 492 } 493 494 // check status 495 if( *status != THREAD_EXIT_SUCCESS ) 496 { 497 printf("\n[fft error] thread %x returned failure\n", tid ); 498 exit( 0 ); 499 } 500 501 #if DEBUG_MAIN 502 printf("\n[fft] main successfully joined thread %x\n", tid ); 503 #endif 504 505 } // end for tid 506 507 } // end if no_placement 508 509 //////////////////////// 510 if( EXPLICIT_PLACEMENT ) 511 { 512 // main thread places each thread[tid] on a specific core[cxy][lid] 513 // but the actual thread creation is sequencial 514 unsigned int x; 515 unsigned int y; 516 unsigned int l; 517 unsigned int cxy; // cluster identifier 518 unsigned int tid; // thread continuous index 519 520 for( x = 0 ; x < x_size ; x++ ) 521 { 522 for( y = 0 ; y < y_size ; y++ ) 523 { 524 cxy = HAL_CXY_FROM_XY( x , y ); 525 for( l = 0 ; l < ncores ; l++ ) 526 { 527 // compute thread continuous index 528 tid = (((x * y_size) + y) * ncores) + l; 529 530 // register tid value in work_args[tid] array 531 
work_args[tid].tid = tid; 532 533 // no thread created on the core running the main 534 if( (cxy != cxy_main) || (l != lid_main) ) 535 { 536 // define thread attributes 537 work_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | 538 PT_ATTR_CORE_DEFINED; 539 work_attr[tid].cxy = cxy; 540 work_attr[tid].lid = l; 541 542 // create thread[tid] on core[cxy][l] 543 if ( pthread_create( &work_trdid[tid], 544 &work_attr[tid], 545 &work, 546 &work_args[tid] ) ) 547 { 548 printf("\n[fft error] cannot create thread %d\n", tid ); 549 exit( 0 ); 550 } 551 #if DEBUG_MAIN 552 printf("\n[fft] main created thread[%d] on core[%x,%d]\n", tid, cxy, l ); 553 #endif 554 } 555 else 556 { 557 tid_main = tid; 558 } 559 } 560 } 561 } 562 563 // main thread calls itself the execute() function 564 work( &work_args[tid_main] ); 565 566 // main thread wait other threads completion 567 for( tid = 0 ; tid < nthreads ; tid++ ) 568 { 569 // no other thread on the core running the main 570 if( tid != tid_main ) 571 { 572 unsigned int * status; 573 574 // wait thread[tid] 575 if( pthread_join( work_trdid[tid] , (void*)(&status) ) ) 576 { 577 printf("\n[fft error] main cannot join thread %d\n", tid ); 578 exit( 0 ); 579 } 580 581 // check status 582 if( *status != THREAD_EXIT_SUCCESS ) 583 { 584 printf("\n[fft error] thread %d returned failure\n", tid ); 585 exit( 0 ); 586 } 587 #if DEBUG_MAIN 588 printf("\n[fft] main joined thread %d on core[%x,%d]\n", tid , cxy , l ); 589 #endif 590 } 591 } 592 } // end if explicit_placement 593 594 //////////////////////// 595 if( PARALLEL_PLACEMENT ) 596 { 597 // create and execute the working threads 598 if( pthread_parallel_create( root_level , &work ) ) 599 { 600 printf("\n[fft error] cannot create threads\n"); 601 exit( 0 ); 602 } 432 603 } 433 604 … … 533 704 // This function is executed in parallel by all <work> threads. 
534 705 ///////////////////////////////////////////////////////////////// 535 void work( work_args_t * args )706 void work( pthread_parallel_work_args_t * args ) 536 707 { 537 708 unsigned int tid; // this thread continuous index … … 549 720 unsigned long long barrier_stop; 550 721 722 get_cycle( &parallel_start ); 723 551 724 // get thread arguments 552 725 tid = args->tid; 553 lid = args->lid;554 cid = args->cid; 555 parent_barrier = args->parent_barrier;556 557 get_cycle( &parallel_start );558 726 parent_barrier = args->barrier; 727 728 // compute lid and cid from tid 729 lid = tid % ncores; 730 cid = tid / ncores; 731 559 732 #if DEBUG_WORK 560 733 printf("\n[fft] %s : thread %d enter / cycle %d\n", … … 602 775 printf("\n[fft] %s : thread %d exit barrier for buffer allocation / cycle %d\n", 603 776 __FUNCTION__, tid, (unsigned int)barrier_stop ); 604 #endif605 606 #if DISPLAY_SCHED_AND_VMM607 unsigned int x_size;608 unsigned int y_size;609 unsigned int ncores;610 get_config( &x_size , &y_size , &ncores );611 unsigned int x = cid / y_size;612 unsigned int y = cid % y_size;613 unsigned int cxy = HAL_CXY_FROM_XY( x , y );614 display_sched( cxy , lid );615 if( lid == 0 ) display_vmm( cxy , getpid() , 0 );616 777 #endif 617 778 … … 919 1080 // contained in the distributed buffers x[nclusters][points_per_cluster]. 920 1081 // It handles the (N) points 1D array as a (rootN*rootN) points 2D array. 921 // 1) it transpose(rootN/nthreads ) rows from x to tmp.1082 // 1) it fft (rootN/nthreads ) rows from x to tmp. 922 1083 // 2) it make (rootN/nthreads) FFT on the tmp rows and apply the twiddle factor. 923 // 3) it transpose(rootN/nthreads) columns from tmp to x.1084 // 3) it fft (rootN/nthreads) columns from tmp to x. 924 1085 // 4) it make (rootN/nthreads) FFT on the x rows. 
925 1086 // It calls the FFTRow() 2*(rootN/nthreads) times to perform the in place FFT … … 946 1107 #endif 947 1108 948 // transpose(rootN/nthreads) rows from x to tmp1109 // fft (rootN/nthreads) rows from x to tmp 949 1110 Transpose( x , tmp , MyFirst , MyLast ); 950 1111 951 1112 #if( DEBUG_FFT1D & 1 ) 952 1113 get_cycle( &cycle ); 953 printf("\n[fft] %s : thread %d after first transpose/ cycle %d\n",1114 printf("\n[fft] %s : thread %d after first fft / cycle %d\n", 954 1115 __FUNCTION__, tid, (unsigned int)cycle ); 955 1116 if( PRINT_ARRAY ) PrintArray( tmp , N ); … … 964 1125 #if( DEBUG_FFT1D & 1 ) 965 1126 get_cycle( &cycle ); 966 printf("\n[fft] %s : thread %d exit barrier after first transpose/ cycle %d\n",1127 printf("\n[fft] %s : thread %d exit barrier after first fft / cycle %d\n", 967 1128 __FUNCTION__, tid, (unsigned int)cycle ); 968 1129 #endif … … 992 1153 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 993 1154 994 // transposetmp to x1155 // fft tmp to x 995 1156 Transpose( tmp , x , MyFirst , MyLast ); 996 1157 997 1158 #if( DEBUG_FFT1D & 1 ) 998 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);1159 printf("\n[fft] %s : thread %d after second fft\n", __FUNCTION__, tid); 999 1160 if( PRINT_ARRAY ) PrintArray( x , N ); 1000 1161 #endif … … 1006 1167 1007 1168 #if( DEBUG_FFT1D & 1 ) 1008 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);1169 printf("\n[fft] %s : thread %d exit barrier after second fft\n", __FUNCTION__, tid); 1009 1170 #endif 1010 1171 … … 1033 1194 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 1034 1195 1035 // transposex to tmp1196 // fft x to tmp 1036 1197 Transpose( x , tmp , MyFirst , MyLast ); 1037 1198 1038 1199 #if( DEBUG_FFT1D & 1 ) 1039 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);1200 printf("\n[fft] %s : thread %x after third fft\n", __FUNCTION__, tid); 1040 1201 if( PRINT_ARRAY ) PrintArray( 
x , N ); 1041 1202 #endif … … 1047 1208 1048 1209 #if( DEBUG_FFT1D & 1 ) 1049 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);1210 printf("\n[fft] %s : thread %d exit barrier after third fft\n", __FUNCTION__, tid); 1050 1211 #endif 1051 1212
Note: See TracChangeset
for help on using the changeset viewer.