Changeset 637 for trunk/user/fft/fft.c
- Timestamp:
- Jul 18, 2019, 2:06:55 PM (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/user/fft/fft.c
r636 r637 22 22 // of N complex points, using the Cooley-Tuckey FFT method. 23 23 // The N data points are seen as a 2D array (rootN rows * rootN columns). 24 // Each thread handle (rootN / nthreads) rows. The N input data points25 // be initialised in three different modes:24 // Each thread handle (rootN / nthreads) rows. 25 // The N input data points can be initialised in three different modes: 26 26 // - CONSTANT : all data points have the same [1,0] value 27 27 // - COSIN : data point n has [cos(n/N) , sin(n/N)] values … … 31 31 // - M : N = 2**M = number of data points / M must be an even number. 32 32 // - T : nthreads = ncores defined by the hardware / must be power of 2. 33 // The number of threads cannot be larger than the number of rows. 33 34 // 34 // This application uses 4 shared data arrays, that are dynamically 35 // allocated an distributed, using the remote_malloc() function, with 36 // one sub-buffer per cluster: 37 // - data[N] contains N input data points, with 2 double per point. 38 // - trans[N] contains N intermediate data points, 2 double per point. 39 // - umain[rootN] contains rootN coefs required for a rootN points FFT. 40 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1]. 41 // For data, trans, twid, each sub-buffer contains (N/nclusters) points. 42 // For umain, each sub-buffer contains (rootN/nclusters) points. 35 // This application uses 3 shared data arrays, that are dynamically 36 // allocated and distributed in clusters, with one sub-buffer per cluster: 37 // - data[N] contains N input data points, 38 // - trans[N] contains N intermediate data points, 39 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1] 40 // Each sub-buffer contains (N/nclusters) entries, with 2 double per entry. 41 // These distributed buffers are allocated and initialised in parallel 42 // by the working threads running on core 0 in each cluster. 43 43 // 44 // There is one thread per core. 45 // The max number of clusters is defined by (X_MAX * Y_MAX). 46 // The max number of cores per cluster is defined by CORES_MAX. 44 // Each working thread allocates also a private coefs[rootN-1] buffer, 45 // that contains all coefs required for a rootN points FFT. 46 // 47 // There is one working thread per core. 48 // The actual number of cores and cluster in a given hardware architecture 49 // is obtained by the get_config() syscall (x_size, y_size, ncores). 50 // The max number of clusters is bounded by (X_MAX * Y_MAX). 51 // The max number of cores per cluster is bounded by CORES_MAX. 47 52 // 48 53 // Several configuration parameters can be defined below: … … 57 62 // by the main thread in the main() function. 58 63 // - The parallel execution time (parallel_time[i]) is computed by each 59 // thread(i) in the slave() function.64 // working thread(i) in the work() function. 60 65 // - The synchronisation time related to the barriers (sync_time[i]) 61 // is computed by each thread(i) in the slave() function.66 // is computed by each thread(i) in the work() function. 62 67 // The results are displayed on the TXT terminal, and registered on disk. 63 68 /////////////////////////////////////////////////////////////////////////// … … 87 92 // parameters 88 93 89 #define DEFAULT_M 1 2 // 4096data points90 #define USE_DQT_BARRIER 0// use DDT barrier if non zero94 #define DEFAULT_M 14 // 16384 data points 95 #define USE_DQT_BARRIER 1 // use DDT barrier if non zero 91 96 #define MODE COSIN // DATA array initialisation mode 92 97 #define CHECK 0 93 #define DEBUG_MAIN 0// trace main() function (detailed if odd)94 #define DEBUG_ SLAVE 0 // trace slave() function (detailed if odd)98 #define DEBUG_MAIN 1 // trace main() function (detailed if odd) 99 #define DEBUG_WORK 1 // trace work() function (detailed if odd) 95 100 #define DEBUG_FFT1D 0 // trace FFT1D() function (detailed if odd) 96 101 #define DEBUG_ROW 0 // trace FFTRow() function (detailed if odd) … … 101 106 102 107 ///////////////////////////////////////////////////////////////////////////////////// 103 // structure containing the arguments for the slave() function108 // FFT specific global variables 104 109 ///////////////////////////////////////////////////////////////////////////////////// 105 110 106 typedef struct args_s 107 { 108 unsigned int tid; // thread continuous index 109 unsigned int main_tid; // main thread continuous index 111 // work function arguments 112 typedef struct work_args_s 113 { 114 unsigned int tid; // thread continuous index 115 unsigned int lid; // core local index 116 unsigned int cid; // cluster continuous index 117 pthread_barrier_t * parent_barrier; // parent barrier to signal completion 110 118 } 111 args_t; 112 113 ///////////////////////////////////////////////////////////////////////////////////// 114 // global variables 115 ///////////////////////////////////////////////////////////////////////////////////// 116 117 unsigned int x_size; // number of clusters per row in the mesh 118 unsigned int y_size; // number of clusters per column in the mesh 119 unsigned int ncores; // number of cores per cluster 119 work_args_t; 120 120 121 unsigned int nthreads; // total number of threads (one thread per core) 121 122 unsigned int nclusters; // total number of clusters … … 129 130 double * data[CLUSTERS_MAX]; // original time-domain data 130 131 double * trans[CLUSTERS_MAX]; // used as auxiliary space for transpose 132 double * twid[CLUSTERS_MAX]; // twiddle factor : exp(-2iPI*k*n/N) 131 133 double * bloup[CLUSTERS_MAX]; // used as auxiliary space for DFT 132 double * umain[CLUSTERS_MAX]; // roots of unity used fo rootN points FFT133 double * twid[CLUSTERS_MAX]; // twiddle factor : exp(-2iPI*k*n/N)134 134 135 135 // instrumentation counters … … 142 142 pthread_barrierattr_t barrier_attr; 143 143 144 // threads identifiers, attributes, and arguments 145 pthread_t trdid[THREADS_MAX]; // kernel threads identifiers 146 pthread_attr_t attr[THREADS_MAX]; // POSIX thread attributes 147 args_t args[THREADS_MAX]; // slave function arguments 148 149 ///////////////////////////////////////////////////////////////////////////////// 144 ///////////////////////////////////////////////////////////////////////////////////// 145 // Global variables required by parallel_pthread_create() 146 ///////////////////////////////////////////////////////////////////////////////////// 147 148 // 2D arrays of input arguments for the <work> threads 149 // These arrays are initialised by the application main thread 150 151 work_args_t work_args[CLUSTERS_MAX][CORES_MAX]; // work function arguments 152 work_args_t * work_ptrs[CLUSTERS_MAX][CORES_MAX]; // pointers on arguments 153 154 // 1D array of barriers to allow the <work> threads to signal termination 155 // this array is initialised in each cluster by the <build[cxy][0]> thread 156 157 pthread_barrier_t parent_barriers[CLUSTERS_MAX]; // termination barrier 158 159 ///////////////////////////////////////////////////////////////////////////////////// 150 160 // functions declaration 151 ///////////////////////////////////////////////////////////////////////////////// 152 153 void slave(args_t * args );161 ///////////////////////////////////////////////////////////////////////////////////// 162 163 void work( work_args_t * args ); 154 164 155 165 double CheckSum( void ); 156 166 157 void InitX(double ** x , unsigned int mode); 158 159 void InitU(double ** u); 160 161 void InitT(double ** u); 167 void InitD( double ** data , 168 unsigned int mode, 169 unsigned int tid ); 170 171 void InitT( double ** twid, 172 unsigned int tid ); 173 174 void InitU( double * coefs ); 162 175 163 176 unsigned int BitReverse( unsigned int k ); … … 168 181 double * upriv, 169 182 double ** twid, 170 unsigned int MyNum,183 unsigned int tid, 171 184 unsigned int MyFirst, 172 185 unsigned int MyLast ); … … 217 230 int error; 218 231 219 unsigned int main_cxy; // main thread cluster 220 unsigned int main_x; // main thread X coordinate 221 unsigned int main_y; // main thread y coordinate 222 unsigned int main_lid; // main thread local core index 223 unsigned int main_tid; // main thread continuous index 232 unsigned int x_size; // number of clusters per row 233 unsigned int y_size; // number of clusters per column 234 unsigned int ncores; // max number of cores per cluster 224 235 225 236 unsigned int x; // current index for cluster X coordinate 226 237 unsigned int y; // current index for cluster Y coordinate 227 238 unsigned int lid; // current index for core in a cluster 228 unsigned int ci; // continuous cluster index (from x,y) 239 unsigned int tid; // continuous thread index 240 unsigned int cid; // cluster continuous index 229 241 unsigned int cxy; // hardware specific cluster identifier 230 unsigned int tid; // continuous thread index 242 243 char name[64]; // instrumentation file name 244 char path[128]; // instrumentation path name 245 char string[256]; 246 int ret; 231 247 232 248 unsigned long long start_init_cycle; 233 249 unsigned long long end_init_cycle; 234 250 251 #if DEBUG_MAIN 252 unsigned long long debug_cycle; 253 #endif 254 235 255 #if CHECK 236 double ck1;// for input/output checking237 double ck3;// for input/output checking256 double ck1; // for input/output checking 257 double ck3; // for input/output checking 238 258 #endif 239 259 … … 241 261 get_cycle( &start_init_cycle ); 242 262 243 // get platform parameters to compute nthreads & nclusters263 // get platform parameters 244 264 if( get_config( &x_size , &y_size , &ncores ) ) 245 265 { … … 269 289 } 270 290 291 // compute nthreads and nclusters 271 292 nthreads = x_size * y_size * ncores; 272 293 nclusters = x_size * y_size; 294 295 // compute covering DQT size an level 296 unsigned int z = (x_size > y_size) ? x_size : y_size; 297 unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4; 273 298 274 299 // compute various constants depending on N and T … … 285 310 } 286 311 287 // get main thread coordinates (main_x, main_y, main_lid) 288 get_core( &main_cxy , &main_lid ); 289 main_x = HAL_X_FROM_CXY( main_cxy ); 290 main_y = HAL_Y_FROM_CXY( main_cxy ); 291 main_tid = (((main_x * y_size) + main_y) * ncores) + main_lid; 292 293 printf("\n[fft] starts / core[%x,%d] / %d points / %d thread(s) / PID %x / cycle %d\n", 294 main_cxy, main_lid, N, nthreads, getpid(), (unsigned int)start_init_cycle ); 295 296 // allocate memory for the distributed data[i], trans[i], umain[i], twid[i] buffers 297 // the index (i) is a continuous cluster index 298 unsigned int data_size = (N / nclusters) * 2 * sizeof(double); 299 unsigned int coefs_size = (rootN / nclusters) * 2 * sizeof(double); 300 for (x = 0 ; x < x_size ; x++) 301 { 302 for (y = 0 ; y < y_size ; y++) 303 { 304 ci = x * y_size + y; 305 cxy = HAL_CXY_FROM_XY( x , y ); 306 data[ci] = (double *)remote_malloc( data_size , cxy ); 307 trans[ci] = (double *)remote_malloc( data_size , cxy ); 308 bloup[ci] = (double *)remote_malloc( data_size , cxy ); 309 umain[ci] = (double *)remote_malloc( coefs_size , cxy ); 310 twid[ci] = (double *)remote_malloc( data_size , cxy ); 311 } 312 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n", 313 N, nthreads, getpid(), (unsigned int)start_init_cycle ); 314 315 // build instrumentation file name 316 if( USE_DQT_BARRIER ) 317 snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores ); 318 else 319 snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores ); 320 321 // build pathname 322 snprintf( path , 128 , "/home/%s", name ); 323 324 // open instrumentation file 325 FILE * f = fopen( path , NULL ); 326 if ( f == NULL ) 327 { 328 printf("\n[fft error] cannot open instrumentation file <%s>\n", path ); 329 exit( 0 ); 312 330 } 313 331 314 332 #if DEBUG_MAIN 315 printf("\n[fft] main completes remote_malloc\n"); 316 #endif 317 318 // arrays initialisation 319 InitX( data , MODE ); 320 InitU( umain ); 321 InitT( twid ); 322 323 #if DEBUG_MAIN 324 printf("\n[fft] main completes arrays init\n"); 333 get_cycle( &debug_cycle ); 334 printf("\n[fft] main open file <%s> at cycle %d\n", 335 path, (unsigned int)debug_cycle ); 325 336 #endif 326 337 … … 342 353 #endif 343 354 344 // initialise barrier 355 // initialise barrier synchronizing all <work> threads 345 356 if( USE_DQT_BARRIER ) 346 357 { … … 362 373 363 374 #if DEBUG_MAIN 364 printf("\n[fft] main completes barrier init\n"); 365 #endif 366 367 // launch other threads to execute the slave() function 368 // on cores other than the core running the main thread 375 get_cycle( &debug_cycle ); 376 printf("\n[fft] main completes barrier init at cycle %d\n", 377 (unsigned int)debug_cycle ); 378 #endif 379 380 // build array of arguments for the <work> threads 369 381 for (x = 0 ; x < x_size ; x++) 370 382 { … … 376 388 for ( lid = 0 ; lid < ncores ; lid++ ) 377 389 { 378 // compute thread user index (continuous index) 379 tid = (((x * y_size) + y) * ncores) + lid; 380 381 // set thread attributes 382 attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 383 attr[tid].cxy = cxy; 384 attr[tid].lid = lid; 385 386 // set slave function argument 387 args[tid].tid = tid; 388 args[tid].main_tid = main_tid; 389 390 // create thread 391 if( tid != main_tid ) 392 { 393 if ( pthread_create( &trdid[tid], // pointer on kernel identifier 394 &attr[tid], // pointer on thread attributes 395 &slave, // pointer on function 396 &args[tid]) ) // pointer on function arguments 397 { 398 printf("\n[fft error] creating thread %x\n", tid ); 399 exit( 0 ); 400 } 401 402 #if (DEBUG_MAIN & 1) 403 unsigned long long debug_cycle; 404 get_cycle( &debug_cycle ); 405 printf("\n[fft] main created thread %d on core[%x,%d] / cycle %d\n", 406 tid, cxy, lid, (unsigned int)debug_cycle ); 407 #endif 408 } 390 // compute cluster continuous index 391 cid = (x * y_size) + y; 392 393 // compute work thread continuous index 394 tid = (cid * ncores) + lid; 395 396 // initialize 2D array of arguments 397 work_args[cxy][lid].tid = tid; 398 work_args[cxy][lid].lid = lid; 399 work_args[cxy][lid].cid = cid; 400 work_args[cxy][lid].parent_barrier = &parent_barriers[cxy]; 401 402 // initialize 2D array of pointers 403 work_ptrs[cxy][lid] = &work_args[cxy][lid]; 409 404 } 410 405 } 411 406 } 412 407 408 // register sequencial time 409 get_cycle( &end_init_cycle ); 410 init_time = (unsigned int)(end_init_cycle - start_init_cycle); 411 413 412 #if DEBUG_MAIN 414 printf("\n[fft] main completes threads creation\n"); 415 #endif 416 417 get_cycle( &end_init_cycle ); 418 419 // register sequencial time 420 init_time = (unsigned int)(end_init_cycle - start_init_cycle); 421 422 // main itself executes the slave() function 423 slave( &args[main_tid] ); 424 425 // wait other threads completion 426 for (x = 0 ; x < x_size ; x++) 427 { 428 for (y = 0 ; y < y_size ; y++) 429 { 430 for ( lid = 0 ; lid < ncores ; lid++ ) 431 { 432 // compute thread continuous index 433 tid = (((x * y_size) + y) * ncores) + lid; 434 435 if( tid != main_tid ) 436 { 437 if( pthread_join( trdid[tid] , NULL ) ) 438 { 439 printf("\n[fft error] in main thread joining thread %x\n", tid ); 440 exit( 0 ); 441 } 442 443 #if (DEBUG_MAIN & 1) 444 printf("\n[fft] main thread %d joined thread %d\n", main_tid, tid ); 445 #endif 446 447 } 448 } 449 } 450 } 413 printf("\n[fft] main completes <work> threads arguments at cycle %d\n", 414 (unsigned int)end_init_cycle ); 415 #endif 416 417 // create and execute the working threads 418 if( pthread_parallel_create( root_level, 419 &work, 420 &work_ptrs[0][0], 421 &parent_barriers[0] ) ) 422 { 423 printf("\n[fft error] creating threads\n"); 424 exit( 0 ); 425 } 426 427 #if DEBUG_MAIN 428 get_cycle( &debug_cycle ); 429 printf("\n[fft] main resume for instrumentation at cycle %d\n", 430 (unsigned int)debug_cycle) ; 431 #endif 451 432 452 433 #if PRINT_ARRAY … … 463 444 #endif 464 445 465 // instrumentation466 char name[64];467 char path[128];468 char string[256];469 int ret;470 471 // build file name472 if( USE_DQT_BARRIER )473 snprintf( name , 64 , "fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );474 else475 snprintf( name , 64 , "fft_smp_%d_%d_%d", N , x_size * y_size , ncores );476 477 // build pathname478 snprintf( path , 128 , "/home/%s", name );479 480 // open instrumentation file481 FILE * f = fopen( path , NULL );482 if ( f == NULL )483 {484 printf("\n[fft error] cannot open instrumentation file <%s>\n", path );485 exit( 0 );486 }487 printf("\n[fft] file <%s> open\n", path );488 489 446 // display header on terminal, and save to file 490 447 printf("\n----- %s -----\n", name ); … … 497 454 } 498 455 499 // display results for each thread on terminal, and save to file456 // get instrumentation results for each thread 500 457 for (tid = 0 ; tid < nthreads ; tid++) 501 458 { … … 503 460 tid, init_time, parallel_time[tid], sync_time[tid] ); 504 461 505 // display on terminal, and save to instrumentation file 506 printf("%s" , string ); 462 // save to instrumentation file 507 463 fprintf( f , "%s" , string ); 508 464 if( ret < 0 ) 509 465 { 510 466 printf("\n[fft error] cannot write thread %d to file <%s>\n", tid, path ); 467 printf("%s", string ); 511 468 exit(0); 512 469 } 513 470 } 514 471 515 // display MIN/MAX values on terminal and save to file472 // compute min/max values 516 473 unsigned int min_para = parallel_time[0]; 517 474 unsigned int max_para = parallel_time[0]; … … 527 484 } 528 485 486 // display MIN/MAX values on terminal and save to file 529 487 snprintf( string , 256 , "\n Sequencial Parallel Barrier\n" 530 488 "MIN : %d\t | %d\t | %d\t (cycles)\n" … … 547 505 exit(0); 548 506 } 549 printf("\n[fft] file <%s> closed\n", path ); 507 508 #if DEBUG_MAIN 509 get_cycle( &debug_cycle ); 510 printf("\n[fft] main close file <%s> at cycle %d\n", 511 path, (unsigned int)debug_cycle ); 512 #endif 550 513 551 514 exit( 0 ); … … 553 516 } // end main() 554 517 555 /////////////////////////////////////////////////////////////// 556 // This function is executed in parallel by all threads.557 /////////////////////////////////////////////////////////////// 558 void slave(args_t * args )559 { 560 unsigned int i;561 unsigned int MyNum; // this thread index562 unsigned int MainNum; // main threadindex563 unsigned int MyFirst; // index first row allocated to thread564 unsigned int MyLast; // index last row allocated to thread 565 double * upriv;566 unsigned int c_id;567 unsigned int c_offset;518 ///////////////////////////////////////////////////////////////// 519 // This function is executed in parallel by all <work> threads. 520 ///////////////////////////////////////////////////////////////// 521 void work( work_args_t * args ) 522 { 523 unsigned int tid; // this thread continuous index 524 unsigned int lid; // core local index 525 unsigned int cid; // cluster continuous index 526 pthread_barrier_t * parent_barrier; // pointer on parent barrier 527 528 unsigned int MyFirst; // index first row allocated to thread 529 unsigned int MyLast; // index last row allocated to thread 530 double * upriv; // private array of FFT coefs 568 531 569 532 unsigned long long parallel_start; … … 572 535 unsigned long long barrier_stop; 573 536 574 MyNum = args->tid; 575 MainNum = args->main_tid; 537 // get thread arguments 538 tid = args->tid; 539 lid = args->lid; 540 cid = args->cid; 541 parent_barrier = args->parent_barrier; 576 542 577 543 get_cycle( ¶llel_start ); 578 544 579 #if DEBUG_ SLAVE545 #if DEBUG_WORK 580 546 printf("\n[fft] %s : thread %d enter / cycle %d\n", 581 __FUNCTION__, MyNum, (unsigned int)parallel_start ); 582 #endif 547 __FUNCTION__, tid, (unsigned int)parallel_start ); 548 #endif 549 550 // core 0 allocate memory from the local cluster 551 // for the distributed data[], trans[], twid[] buffers 552 // and for the private upriv[] buffer 553 if( lid == 0 ) 554 { 555 unsigned int data_size = (N / nclusters) * 2 * sizeof(double); 556 unsigned int coefs_size = (rootN - 1) * 2 * sizeof(double); 557 558 data[cid] = (double *)malloc( data_size ); 559 trans[cid] = (double *)malloc( data_size ); 560 twid[cid] = (double *)malloc( data_size ); 561 562 upriv = (double *)malloc( coefs_size ); 563 } 583 564 584 565 // BARRIER … … 586 567 pthread_barrier_wait( &barrier ); 587 568 get_cycle( &barrier_stop ); 588 sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start); 589 590 #if DEBUG_SLAVE 591 printf("\n[@@@] %s : thread %d exit first barrier / cycle %d\n", 592 __FUNCTION__, MyNum, (unsigned int)barrier_stop ); 593 #endif 594 595 // allocate and initialise local array upriv[] 596 // that is a local copy of the rootN coefs defined in umain[] 597 upriv = (double *)malloc(2 * (rootN - 1) * sizeof(double)); 598 for ( i = 0 ; i < (rootN - 1) ; i++) 599 { 600 c_id = i / (rootN / nclusters); 601 c_offset = i % (rootN / nclusters); 602 upriv[2*i] = umain[c_id][2*c_offset]; 603 upriv[2*i+1] = umain[c_id][2*c_offset+1]; 604 } 569 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 570 571 #if DEBUG_WORK 572 printf("\n[fft] %s : thread %d exit first barrier / cycle %d\n", 573 __FUNCTION__, tid, (unsigned int)barrier_stop ); 574 #endif 575 576 // all threads initialize data[] local array 577 InitD( data , MODE , tid ); 578 579 // all threads initialize twid[] local array 580 InitT( twid , tid ); 581 582 // all threads initialise private upriv[] array 583 InitU( upriv ); 584 585 // BARRIER 586 get_cycle( &barrier_start ); 587 pthread_barrier_wait( &barrier ); 588 get_cycle( &barrier_stop ); 589 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 590 591 #if DEBUG_WORK 592 printf("\n[fft] %s : thread %d exit second barrier / cycle %d\n", 593 __FUNCTION__, tid, (unsigned int)barrier_stop ); 594 #endif 605 595 606 596 // compute first and last rows handled by the thread 607 MyFirst = rootN * MyNum/ nthreads;608 MyLast = rootN * ( MyNum+ 1) / nthreads;597 MyFirst = rootN * tid / nthreads; 598 MyLast = rootN * (tid + 1) / nthreads; 609 599 610 600 // perform forward FFT 611 FFT1D( 1 , data , trans , upriv , twid , MyNum, MyFirst , MyLast );601 FFT1D( 1 , data , trans , upriv , twid , tid , MyFirst , MyLast ); 612 602 613 603 #if CHECK … … 615 605 pthread_barrier_wait( &barrier ); 616 606 get_cycle( &barrier_stop ); 617 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);618 FFT1D( -1 , data , trans , upriv , twid , MyNum, MyFirst , MyLast );607 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 608 FFT1D( -1 , data , trans , upriv , twid , tid , MyFirst , MyLast ); 619 609 #endif 620 610 … … 622 612 623 613 // register parallel time 624 parallel_time[MyNum] = (unsigned int)(parallel_stop - parallel_start); 625 626 #if DEBUG_SLAVE 627 printf("\n[fft] %s : thread %x completes fft / p_start %d / p_stop %d\n", 628 __FUNCTION__, MyNum, (unsigned int)parallel_start, (unsigned int)parallel_stop ); 629 int tid; 630 for (tid = 0 ; tid < nthreads ; tid++) 631 { 632 printf("- tid %d : Sequencial %d / Parallel %d / Barrier %d\n", 633 tid , init_time, parallel_time[tid], sync_time[tid] ); 634 } 635 #endif 636 637 // exit only if MyNum != MainNum 638 if( MyNum != MainNum ) pthread_exit( NULL ); 639 640 } // end slave() 614 parallel_time[tid] = (unsigned int)(parallel_stop - parallel_start); 615 616 #if DEBUG_WORK 617 printf("\n[fft] %s : thread %d completes fft / p_start %d / p_stop %d\n", 618 __FUNCTION__, tid, (unsigned int)parallel_start, (unsigned int)parallel_stop ); 619 #endif 620 621 // work thread signals completion to main 622 pthread_barrier_wait( parent_barrier ); 623 624 #if DEBUG_WORK 625 printf("\n[fft] %s : thread %d exit\n", 626 __FUNCTION__, tid ); 627 #endif 628 629 // work thread exit 630 pthread_exit( NULL ); 631 632 } // end work() 641 633 642 634 //////////////////////////////////////////////////////////////////////////////////////// … … 724 716 } 725 717 726 727 //////////////////////////// 728 void InitX(double ** x, 729 unsigned int mode ) 718 ////////////////////////////////////////////////////////////////////////////////////// 719 // Each working thread <tid> contributes to initialize (rootN / nthreads) rows, 720 // in the shared - and distributed - <data> array. 721 ////////////////////////////////////////////////////////////////////////////////////// 722 void InitD(double ** data, 723 unsigned int mode, 724 unsigned int tid ) 730 725 { 731 726 unsigned int i , j; … … 734 729 unsigned int index; 735 730 736 for ( j = 0 ; j < rootN ; j++ ) // loop on row index 731 // compute row_min and row_max 732 unsigned int row_min = tid * rows_per_thread; 733 unsigned int row_max = row_min + rows_per_thread; 734 735 for ( j = row_min ; j < row_max ; j++ ) // loop on rows 737 736 { 738 for ( i = 0 ; i < rootN ; i++ ) // loop on pointin a row737 for ( i = 0 ; i < rootN ; i++ ) // loop on points in a row 739 738 { 740 739 index = j * rootN + i; … … 745 744 if ( mode == RANDOM ) 746 745 { 747 x[c_id][2*c_offset] = ( (double)rand() ) / 65536;748 x[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;746 data[c_id][2*c_offset] = ( (double)rand() ) / 65536; 747 data[c_id][2*c_offset+1] = ( (double)rand() ) / 65536; 749 748 } 750 749 … … 754 753 { 755 754 double phi = (double)( 2 * PI * index) / N; 756 x[c_id][2*c_offset] = cos( phi );757 x[c_id][2*c_offset+1] = sin( phi );755 data[c_id][2*c_offset] = cos( phi ); 756 data[c_id][2*c_offset+1] = sin( phi ); 758 757 } 759 758 … … 761 760 if ( mode == CONSTANT ) 762 761 { 763 x[c_id][2*c_offset] = 1.0;764 x[c_id][2*c_offset+1] = 0.0;762 data[c_id][2*c_offset] = 1.0; 763 data[c_id][2*c_offset+1] = 0.0; 765 764 } 766 765 } … … 768 767 } 769 768 770 ///////////////////////// 771 void InitU( double ** u ) 772 { 773 unsigned int q; 774 unsigned int j; 775 unsigned int base; 776 unsigned int n1; 777 unsigned int c_id; 778 unsigned int c_offset; 779 double phi; 780 unsigned int stop = 0; 781 782 for (q = 0 ; ((unsigned int)(1 << q) < N) && (stop == 0) ; q++) 783 { 784 n1 = 1 << q; 785 base = n1 - 1; 786 for (j = 0; (j < n1) && (stop == 0) ; j++) 787 { 788 if (base + j > rootN - 1) return; 789 790 c_id = (base + j) / (rootN / nclusters); 791 c_offset = (base + j) % (rootN / nclusters); 792 phi = (double)(2.0 * PI * j) / (2 * n1); 793 u[c_id][2*c_offset] = cos( phi ); 794 u[c_id][2*c_offset+1] = -sin( phi ); 795 } 796 } 797 } 798 799 ////////////////////////// 800 void InitT( double ** u ) 769 /////////////////////////////////////////////////////////////////////////////////////// 770 // Each working thread <tid> contributes to initialize (rootN / nthreads) rows, 771 // in the shared - and distributed - <twiddle> array. 772 /////////////////////////////////////////////////////////////////////////////////////// 773 void InitT( double ** twid, 774 unsigned int tid ) 801 775 { 802 776 unsigned int i, j; … … 806 780 double phi; 807 781 808 for ( j = 0 ; j < rootN ; j++ ) // loop on row index 782 // compute row_min and row_max 783 unsigned int row_min = tid * rows_per_thread; 784 unsigned int row_max = row_min + rows_per_thread; 785 786 for ( j = row_min ; j < row_max ; j++ ) // loop on rows 809 787 { 810 for ( i = 0 ; i < rootN ; i++ ) // loop on points in a row788 for ( i = 0 ; i < rootN ; i++ ) // loop on points in a row 811 789 { 812 790 index = j * rootN + i; … … 815 793 816 794 phi = (double)(2.0 * PI * i * j) / N; 817 u[c_id][2*c_offset] = cos( phi ); 818 u[c_id][2*c_offset+1] = -sin( phi ); 795 twid[c_id][2*c_offset] = cos( phi ); 796 twid[c_id][2*c_offset+1] = -sin( phi ); 797 } 798 } 799 } 800 801 /////////////////////////////////////////////////////////////////////////////////////// 802 // Each working thread initialize the private <upriv> array / (rootN - 1) entries. 803 /////////////////////////////////////////////////////////////////////////////////////// 804 void InitU( double * upriv ) 805 { 806 unsigned int q; 807 unsigned int j; 808 unsigned int base; 809 unsigned int n1; 810 double phi; 811 812 for (q = 0 ; ((unsigned int)(1 << q) < N) ; q++) 813 { 814 n1 = 1 << q; // n1 == 2**q 815 base = n1 - 1; 816 for (j = 0; (j < n1) ; j++) 817 { 818 if (base + j > rootN - 1) return; 819 820 phi = (double)(2.0 * PI * j) / (2 * n1); 821 upriv[2*(base+j)] = cos( phi ); 822 upriv[2*(base+j)+1] = -sin( phi ); 819 823 } 820 824 } … … 856 860 double * upriv, // local array containing coefs for rootN FFT 857 861 double ** twid, // distributed arrays containing N twiddle factors 858 unsigned int MyNum,// thread continuous index862 unsigned int tid, // thread continuous index 859 863 unsigned int MyFirst, 860 864 unsigned int MyLast ) … … 868 872 get_cycle( &cycle ); 869 873 printf("\n[fft] %s : thread %d enter / first %d / last %d / cycle %d\n", 870 __FUNCTION__, MyNum, MyFirst, MyLast, (unsigned int)cycle );874 __FUNCTION__, tid, MyFirst, MyLast, (unsigned int)cycle ); 871 875 #endif 872 876 … … 877 881 get_cycle( &cycle ); 878 882 printf("\n[fft] %s : thread %d after first transpose / cycle %d\n", 879 __FUNCTION__, MyNum, (unsigned int)cycle );883 __FUNCTION__, tid, (unsigned int)cycle ); 880 884 if( PRINT_ARRAY ) PrintArray( tmp , N ); 881 885 #endif … … 885 889 pthread_barrier_wait( &barrier ); 886 890 get_cycle( &barrier_stop ); 887 sync_time[ MyNum] = (unsigned int)(barrier_stop - barrier_start);891 sync_time[tid] = (unsigned int)(barrier_stop - barrier_start); 888 892 889 893 #if( DEBUG_FFT1D & 1 ) 890 894 get_cycle( &cycle ); 891 895 printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n", 892 __FUNCTION__, MyNum, (unsigned int)cycle );896 __FUNCTION__, tid, (unsigned int)cycle ); 893 897 #endif 894 898 … … 902 906 903 907 #if( DEBUG_FFT1D & 1 ) 904 printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, MyNum);908 printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, tid); 905 909 if( PRINT_ARRAY ) PrintArray( tmp , N ); 906 910 #endif … … 912 916 913 917 #if( DEBUG_FFT1D & 1 ) 914 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, MyNum);915 #endif 916 917 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);918 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, tid); 919 #endif 920 921 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 918 922 919 923 // transpose tmp to x … … 921 925 922 926 #if( DEBUG_FFT1D & 1 ) 923 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, MyNum);927 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid); 924 928 if( PRINT_ARRAY ) PrintArray( x , N ); 925 929 #endif … … 931 935 932 936 #if( DEBUG_FFT1D & 1 ) 933 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, MyNum);934 #endif 935 936 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);937 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid); 938 #endif 939 940 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 937 941 938 942 // do FFTs on rows of x and apply the scaling factor … … 944 948 945 949 #if( DEBUG_FFT1D & 1 ) 946 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, MyNum);950 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, tid); 947 951 if( PRINT_ARRAY ) PrintArray( x , N ); 948 952 #endif … … 954 958 955 959 #if( DEBUG_FFT1D & 1 ) 956 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, MyNum);957 #endif 958 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);960 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, tid); 961 #endif 962 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 959 963 960 964 // transpose x to tmp … … 962 966 963 967 #if( DEBUG_FFT1D & 1 ) 964 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, MyNum);968 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid); 965 969 if( PRINT_ARRAY ) PrintArray( x , N ); 966 970 #endif … … 972 976 973 977 #if( DEBUG_FFT1D & 1 ) 974 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, MyNum);975 #endif 976 977 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);978 sync_time[ MyNum] += (long)(barrier_stop - barrier_start);978 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid); 979 #endif 980 981 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 982 sync_time[tid] += (long)(barrier_stop - barrier_start); 979 983 980 984 // copy tmp to x … … 982 986 983 987 #if DEBUG_FFT1D 984 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, MyNum);988 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, tid); 985 989 if( PRINT_ARRAY ) PrintArray( x , N ); 986 990 #endif
Note: See TracChangeset
for help on using the changeset viewer.