Changeset 637 for trunk/user
- Timestamp:
- Jul 18, 2019, 2:06:55 PM (5 years ago)
- Location:
- trunk/user
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/user/fft/fft.c
r636 r637 22 22 // of N complex points, using the Cooley-Tuckey FFT method. 23 23 // The N data points are seen as a 2D array (rootN rows * rootN columns). 24 // Each thread handle (rootN / nthreads) rows. The N input data points25 // be initialised in three different modes:24 // Each thread handle (rootN / nthreads) rows. 25 // The N input data points can be initialised in three different modes: 26 26 // - CONSTANT : all data points have the same [1,0] value 27 27 // - COSIN : data point n has [cos(n/N) , sin(n/N)] values … … 31 31 // - M : N = 2**M = number of data points / M must be an even number. 32 32 // - T : nthreads = ncores defined by the hardware / must be power of 2. 33 // The number of threads cannot be larger than the number of rows. 33 34 // 34 // This application uses 4 shared data arrays, that are dynamically 35 // allocated an distributed, using the remote_malloc() function, with 36 // one sub-buffer per cluster: 37 // - data[N] contains N input data points, with 2 double per point. 38 // - trans[N] contains N intermediate data points, 2 double per point. 39 // - umain[rootN] contains rootN coefs required for a rootN points FFT. 40 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1]. 41 // For data, trans, twid, each sub-buffer contains (N/nclusters) points. 42 // For umain, each sub-buffer contains (rootN/nclusters) points. 35 // This application uses 3 shared data arrays, that are dynamically 36 // allocated and distributed in clusters, with one sub-buffer per cluster: 37 // - data[N] contains N input data points, 38 // - trans[N] contains N intermediate data points, 39 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1] 40 // Each sub-buffer contains (N/nclusters) entries, with 2 double per entry. 41 // These distributed buffers are allocated and initialised in parallel 42 // by the working threads running on core 0 in each cluster. 43 43 // 44 // There is one thread per core. 
45 // The max number of clusters is defined by (X_MAX * Y_MAX). 46 // The max number of cores per cluster is defined by CORES_MAX. 44 // Each working thread allocates also a private coefs[rootN-1] buffer, 45 // that contains all coefs required for a rootN points FFT. 46 // 47 // There is one working thread per core. 48 // The actual number of cores and cluster in a given hardware architecture 49 // is obtained by the get_config() syscall (x_size, y_size, ncores). 50 // The max number of clusters is bounded by (X_MAX * Y_MAX). 51 // The max number of cores per cluster is bounded by CORES_MAX. 47 52 // 48 53 // Several configuration parameters can be defined below: … … 57 62 // by the main thread in the main() function. 58 63 // - The parallel execution time (parallel_time[i]) is computed by each 59 // thread(i) in the slave() function.64 // working thread(i) in the work() function. 60 65 // - The synchronisation time related to the barriers (sync_time[i]) 61 // is computed by each thread(i) in the slave() function.66 // is computed by each thread(i) in the work() function. 62 67 // The results are displayed on the TXT terminal, and registered on disk. 
63 68 /////////////////////////////////////////////////////////////////////////// … … 87 92 // parameters 88 93 89 #define DEFAULT_M 1 2 // 4096data points90 #define USE_DQT_BARRIER 0// use DDT barrier if non zero94 #define DEFAULT_M 14 // 16384 data points 95 #define USE_DQT_BARRIER 1 // use DDT barrier if non zero 91 96 #define MODE COSIN // DATA array initialisation mode 92 97 #define CHECK 0 93 #define DEBUG_MAIN 0// trace main() function (detailed if odd)94 #define DEBUG_ SLAVE 0 // trace slave() function (detailed if odd)98 #define DEBUG_MAIN 1 // trace main() function (detailed if odd) 99 #define DEBUG_WORK 1 // trace work() function (detailed if odd) 95 100 #define DEBUG_FFT1D 0 // trace FFT1D() function (detailed if odd) 96 101 #define DEBUG_ROW 0 // trace FFTRow() function (detailed if odd) … … 101 106 102 107 ///////////////////////////////////////////////////////////////////////////////////// 103 // structure containing the arguments for the slave() function108 // FFT specific global variables 104 109 ///////////////////////////////////////////////////////////////////////////////////// 105 110 106 typedef struct args_s 107 { 108 unsigned int tid; // thread continuous index 109 unsigned int main_tid; // main thread continuous index 111 // work function arguments 112 typedef struct work_args_s 113 { 114 unsigned int tid; // thread continuous index 115 unsigned int lid; // core local index 116 unsigned int cid; // cluster continuous index 117 pthread_barrier_t * parent_barrier; // parent barrier to signal completion 110 118 } 111 args_t; 112 113 ///////////////////////////////////////////////////////////////////////////////////// 114 // global variables 115 ///////////////////////////////////////////////////////////////////////////////////// 116 117 unsigned int x_size; // number of clusters per row in the mesh 118 unsigned int y_size; // number of clusters per column in the mesh 119 unsigned int ncores; // number of cores per cluster 119 work_args_t; 120 
120 121 unsigned int nthreads; // total number of threads (one thread per core) 121 122 unsigned int nclusters; // total number of clusters … … 129 130 double * data[CLUSTERS_MAX]; // original time-domain data 130 131 double * trans[CLUSTERS_MAX]; // used as auxiliary space for transpose 132 double * twid[CLUSTERS_MAX]; // twiddle factor : exp(-2iPI*k*n/N) 131 133 double * bloup[CLUSTERS_MAX]; // used as auxiliary space for DFT 132 double * umain[CLUSTERS_MAX]; // roots of unity used fo rootN points FFT133 double * twid[CLUSTERS_MAX]; // twiddle factor : exp(-2iPI*k*n/N)134 134 135 135 // instrumentation counters … … 142 142 pthread_barrierattr_t barrier_attr; 143 143 144 // threads identifiers, attributes, and arguments 145 pthread_t trdid[THREADS_MAX]; // kernel threads identifiers 146 pthread_attr_t attr[THREADS_MAX]; // POSIX thread attributes 147 args_t args[THREADS_MAX]; // slave function arguments 148 149 ///////////////////////////////////////////////////////////////////////////////// 144 ///////////////////////////////////////////////////////////////////////////////////// 145 // Global variables required by parallel_pthread_create() 146 ///////////////////////////////////////////////////////////////////////////////////// 147 148 // 2D arrays of input arguments for the <work> threads 149 // These arrays are initialised by the application main thread 150 151 work_args_t work_args[CLUSTERS_MAX][CORES_MAX]; // work function arguments 152 work_args_t * work_ptrs[CLUSTERS_MAX][CORES_MAX]; // pointers on arguments 153 154 // 1D array of barriers to allow the <work> threads to signal termination 155 // this array is initialised in each cluster by the <build[cxy][0]> thread 156 157 pthread_barrier_t parent_barriers[CLUSTERS_MAX]; // termination barrier 158 159 ///////////////////////////////////////////////////////////////////////////////////// 150 160 // functions declaration 151 ///////////////////////////////////////////////////////////////////////////////// 152 
153 void slave(args_t * args );161 ///////////////////////////////////////////////////////////////////////////////////// 162 163 void work( work_args_t * args ); 154 164 155 165 double CheckSum( void ); 156 166 157 void InitX(double ** x , unsigned int mode); 158 159 void InitU(double ** u); 160 161 void InitT(double ** u); 167 void InitD( double ** data , 168 unsigned int mode, 169 unsigned int tid ); 170 171 void InitT( double ** twid, 172 unsigned int tid ); 173 174 void InitU( double * coefs ); 162 175 163 176 unsigned int BitReverse( unsigned int k ); … … 168 181 double * upriv, 169 182 double ** twid, 170 unsigned int MyNum,183 unsigned int tid, 171 184 unsigned int MyFirst, 172 185 unsigned int MyLast ); … … 217 230 int error; 218 231 219 unsigned int main_cxy; // main thread cluster 220 unsigned int main_x; // main thread X coordinate 221 unsigned int main_y; // main thread y coordinate 222 unsigned int main_lid; // main thread local core index 223 unsigned int main_tid; // main thread continuous index 232 unsigned int x_size; // number of clusters per row 233 unsigned int y_size; // number of clusters per column 234 unsigned int ncores; // max number of cores per cluster 224 235 225 236 unsigned int x; // current index for cluster X coordinate 226 237 unsigned int y; // current index for cluster Y coordinate 227 238 unsigned int lid; // current index for core in a cluster 228 unsigned int ci; // continuous cluster index (from x,y) 239 unsigned int tid; // continuous thread index 240 unsigned int cid; // cluster continuous index 229 241 unsigned int cxy; // hardware specific cluster identifier 230 unsigned int tid; // continuous thread index 242 243 char name[64]; // instrumentation file name 244 char path[128]; // instrumentation path name 245 char string[256]; 246 int ret; 231 247 232 248 unsigned long long start_init_cycle; 233 249 unsigned long long end_init_cycle; 234 250 251 #if DEBUG_MAIN 252 unsigned long long debug_cycle; 253 #endif 254 235 255 #if 
CHECK 236 double ck1;// for input/output checking237 double ck3;// for input/output checking256 double ck1; // for input/output checking 257 double ck3; // for input/output checking 238 258 #endif 239 259 … … 241 261 get_cycle( &start_init_cycle ); 242 262 243 // get platform parameters to compute nthreads & nclusters263 // get platform parameters 244 264 if( get_config( &x_size , &y_size , &ncores ) ) 245 265 { … … 269 289 } 270 290 291 // compute nthreads and nclusters 271 292 nthreads = x_size * y_size * ncores; 272 293 nclusters = x_size * y_size; 294 295 // compute covering DQT size an level 296 unsigned int z = (x_size > y_size) ? x_size : y_size; 297 unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4; 273 298 274 299 // compute various constants depending on N and T … … 285 310 } 286 311 287 // get main thread coordinates (main_x, main_y, main_lid) 288 get_core( &main_cxy , &main_lid ); 289 main_x = HAL_X_FROM_CXY( main_cxy ); 290 main_y = HAL_Y_FROM_CXY( main_cxy ); 291 main_tid = (((main_x * y_size) + main_y) * ncores) + main_lid; 292 293 printf("\n[fft] starts / core[%x,%d] / %d points / %d thread(s) / PID %x / cycle %d\n", 294 main_cxy, main_lid, N, nthreads, getpid(), (unsigned int)start_init_cycle ); 295 296 // allocate memory for the distributed data[i], trans[i], umain[i], twid[i] buffers 297 // the index (i) is a continuous cluster index 298 unsigned int data_size = (N / nclusters) * 2 * sizeof(double); 299 unsigned int coefs_size = (rootN / nclusters) * 2 * sizeof(double); 300 for (x = 0 ; x < x_size ; x++) 301 { 302 for (y = 0 ; y < y_size ; y++) 303 { 304 ci = x * y_size + y; 305 cxy = HAL_CXY_FROM_XY( x , y ); 306 data[ci] = (double *)remote_malloc( data_size , cxy ); 307 trans[ci] = (double *)remote_malloc( data_size , cxy ); 308 bloup[ci] = (double *)remote_malloc( data_size , cxy ); 309 umain[ci] = (double *)remote_malloc( coefs_size , cxy ); 310 twid[ci] = (double *)remote_malloc( data_size , cxy ); 311 } 
312 printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n", 313 N, nthreads, getpid(), (unsigned int)start_init_cycle ); 314 315 // build instrumentation file name 316 if( USE_DQT_BARRIER ) 317 snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores ); 318 else 319 snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores ); 320 321 // build pathname 322 snprintf( path , 128 , "/home/%s", name ); 323 324 // open instrumentation file 325 FILE * f = fopen( path , NULL ); 326 if ( f == NULL ) 327 { 328 printf("\n[fft error] cannot open instrumentation file <%s>\n", path ); 329 exit( 0 ); 312 330 } 313 331 314 332 #if DEBUG_MAIN 315 printf("\n[fft] main completes remote_malloc\n"); 316 #endif 317 318 // arrays initialisation 319 InitX( data , MODE ); 320 InitU( umain ); 321 InitT( twid ); 322 323 #if DEBUG_MAIN 324 printf("\n[fft] main completes arrays init\n"); 333 get_cycle( &debug_cycle ); 334 printf("\n[fft] main open file <%s> at cycle %d\n", 335 path, (unsigned int)debug_cycle ); 325 336 #endif 326 337 … … 342 353 #endif 343 354 344 // initialise barrier 355 // initialise barrier synchronizing all <work> threads 345 356 if( USE_DQT_BARRIER ) 346 357 { … … 362 373 363 374 #if DEBUG_MAIN 364 printf("\n[fft] main completes barrier init\n"); 365 #endif 366 367 // launch other threads to execute the slave() function 368 // on cores other than the core running the main thread 375 get_cycle( &debug_cycle ); 376 printf("\n[fft] main completes barrier init at cycle %d\n", 377 (unsigned int)debug_cycle ); 378 #endif 379 380 // build array of arguments for the <work> threads 369 381 for (x = 0 ; x < x_size ; x++) 370 382 { … … 376 388 for ( lid = 0 ; lid < ncores ; lid++ ) 377 389 { 378 // compute thread user index (continuous index) 379 tid = (((x * y_size) + y) * ncores) + lid; 380 381 // set thread attributes 382 attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 383 attr[tid].cxy = cxy; 384 
attr[tid].lid = lid; 385 386 // set slave function argument 387 args[tid].tid = tid; 388 args[tid].main_tid = main_tid; 389 390 // create thread 391 if( tid != main_tid ) 392 { 393 if ( pthread_create( &trdid[tid], // pointer on kernel identifier 394 &attr[tid], // pointer on thread attributes 395 &slave, // pointer on function 396 &args[tid]) ) // pointer on function arguments 397 { 398 printf("\n[fft error] creating thread %x\n", tid ); 399 exit( 0 ); 400 } 401 402 #if (DEBUG_MAIN & 1) 403 unsigned long long debug_cycle; 404 get_cycle( &debug_cycle ); 405 printf("\n[fft] main created thread %d on core[%x,%d] / cycle %d\n", 406 tid, cxy, lid, (unsigned int)debug_cycle ); 407 #endif 408 } 390 // compute cluster continuous index 391 cid = (x * y_size) + y; 392 393 // compute work thread continuous index 394 tid = (cid * ncores) + lid; 395 396 // initialize 2D array of arguments 397 work_args[cxy][lid].tid = tid; 398 work_args[cxy][lid].lid = lid; 399 work_args[cxy][lid].cid = cid; 400 work_args[cxy][lid].parent_barrier = &parent_barriers[cxy]; 401 402 // initialize 2D array of pointers 403 work_ptrs[cxy][lid] = &work_args[cxy][lid]; 409 404 } 410 405 } 411 406 } 412 407 408 // register sequencial time 409 get_cycle( &end_init_cycle ); 410 init_time = (unsigned int)(end_init_cycle - start_init_cycle); 411 413 412 #if DEBUG_MAIN 414 printf("\n[fft] main completes threads creation\n"); 415 #endif 416 417 get_cycle( &end_init_cycle ); 418 419 // register sequencial time 420 init_time = (unsigned int)(end_init_cycle - start_init_cycle); 421 422 // main itself executes the slave() function 423 slave( &args[main_tid] ); 424 425 // wait other threads completion 426 for (x = 0 ; x < x_size ; x++) 427 { 428 for (y = 0 ; y < y_size ; y++) 429 { 430 for ( lid = 0 ; lid < ncores ; lid++ ) 431 { 432 // compute thread continuous index 433 tid = (((x * y_size) + y) * ncores) + lid; 434 435 if( tid != main_tid ) 436 { 437 if( pthread_join( trdid[tid] , NULL ) ) 438 { 439 
printf("\n[fft error] in main thread joining thread %x\n", tid ); 440 exit( 0 ); 441 } 442 443 #if (DEBUG_MAIN & 1) 444 printf("\n[fft] main thread %d joined thread %d\n", main_tid, tid ); 445 #endif 446 447 } 448 } 449 } 450 } 413 printf("\n[fft] main completes <work> threads arguments at cycle %d\n", 414 (unsigned int)end_init_cycle ); 415 #endif 416 417 // create and execute the working threads 418 if( pthread_parallel_create( root_level, 419 &work, 420 &work_ptrs[0][0], 421 &parent_barriers[0] ) ) 422 { 423 printf("\n[fft error] creating threads\n"); 424 exit( 0 ); 425 } 426 427 #if DEBUG_MAIN 428 get_cycle( &debug_cycle ); 429 printf("\n[fft] main resume for instrumentation at cycle %d\n", 430 (unsigned int)debug_cycle) ; 431 #endif 451 432 452 433 #if PRINT_ARRAY … … 463 444 #endif 464 445 465 // instrumentation466 char name[64];467 char path[128];468 char string[256];469 int ret;470 471 // build file name472 if( USE_DQT_BARRIER )473 snprintf( name , 64 , "fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );474 else475 snprintf( name , 64 , "fft_smp_%d_%d_%d", N , x_size * y_size , ncores );476 477 // build pathname478 snprintf( path , 128 , "/home/%s", name );479 480 // open instrumentation file481 FILE * f = fopen( path , NULL );482 if ( f == NULL )483 {484 printf("\n[fft error] cannot open instrumentation file <%s>\n", path );485 exit( 0 );486 }487 printf("\n[fft] file <%s> open\n", path );488 489 446 // display header on terminal, and save to file 490 447 printf("\n----- %s -----\n", name ); … … 497 454 } 498 455 499 // display results for each thread on terminal, and save to file456 // get instrumentation results for each thread 500 457 for (tid = 0 ; tid < nthreads ; tid++) 501 458 { … … 503 460 tid, init_time, parallel_time[tid], sync_time[tid] ); 504 461 505 // display on terminal, and save to instrumentation file 506 printf("%s" , string ); 462 // save to instrumentation file 507 463 fprintf( f , "%s" , string ); 508 464 if( ret < 0 ) 509 465 { 510 466 
printf("\n[fft error] cannot write thread %d to file <%s>\n", tid, path ); 467 printf("%s", string ); 511 468 exit(0); 512 469 } 513 470 } 514 471 515 // display MIN/MAX values on terminal and save to file472 // compute min/max values 516 473 unsigned int min_para = parallel_time[0]; 517 474 unsigned int max_para = parallel_time[0]; … … 527 484 } 528 485 486 // display MIN/MAX values on terminal and save to file 529 487 snprintf( string , 256 , "\n Sequencial Parallel Barrier\n" 530 488 "MIN : %d\t | %d\t | %d\t (cycles)\n" … … 547 505 exit(0); 548 506 } 549 printf("\n[fft] file <%s> closed\n", path ); 507 508 #if DEBUG_MAIN 509 get_cycle( &debug_cycle ); 510 printf("\n[fft] main close file <%s> at cycle %d\n", 511 path, (unsigned int)debug_cycle ); 512 #endif 550 513 551 514 exit( 0 ); … … 553 516 } // end main() 554 517 555 /////////////////////////////////////////////////////////////// 556 // This function is executed in parallel by all threads.557 /////////////////////////////////////////////////////////////// 558 void slave(args_t * args )559 { 560 unsigned int i;561 unsigned int MyNum; // this thread index562 unsigned int MainNum; // main threadindex563 unsigned int MyFirst; // index first row allocated to thread564 unsigned int MyLast; // index last row allocated to thread 565 double * upriv;566 unsigned int c_id;567 unsigned int c_offset;518 ///////////////////////////////////////////////////////////////// 519 // This function is executed in parallel by all <work> threads. 
520 ///////////////////////////////////////////////////////////////// 521 void work( work_args_t * args ) 522 { 523 unsigned int tid; // this thread continuous index 524 unsigned int lid; // core local index 525 unsigned int cid; // cluster continuous index 526 pthread_barrier_t * parent_barrier; // pointer on parent barrier 527 528 unsigned int MyFirst; // index first row allocated to thread 529 unsigned int MyLast; // index last row allocated to thread 530 double * upriv; // private array of FFT coefs 568 531 569 532 unsigned long long parallel_start; … … 572 535 unsigned long long barrier_stop; 573 536 574 MyNum = args->tid; 575 MainNum = args->main_tid; 537 // get thread arguments 538 tid = args->tid; 539 lid = args->lid; 540 cid = args->cid; 541 parent_barrier = args->parent_barrier; 576 542 577 543 get_cycle( ¶llel_start ); 578 544 579 #if DEBUG_ SLAVE545 #if DEBUG_WORK 580 546 printf("\n[fft] %s : thread %d enter / cycle %d\n", 581 __FUNCTION__, MyNum, (unsigned int)parallel_start ); 582 #endif 547 __FUNCTION__, tid, (unsigned int)parallel_start ); 548 #endif 549 550 // core 0 allocate memory from the local cluster 551 // for the distributed data[], trans[], twid[] buffers 552 // and for the private upriv[] buffer 553 if( lid == 0 ) 554 { 555 unsigned int data_size = (N / nclusters) * 2 * sizeof(double); 556 unsigned int coefs_size = (rootN - 1) * 2 * sizeof(double); 557 558 data[cid] = (double *)malloc( data_size ); 559 trans[cid] = (double *)malloc( data_size ); 560 twid[cid] = (double *)malloc( data_size ); 561 562 upriv = (double *)malloc( coefs_size ); 563 } 583 564 584 565 // BARRIER … … 586 567 pthread_barrier_wait( &barrier ); 587 568 get_cycle( &barrier_stop ); 588 sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start); 589 590 #if DEBUG_SLAVE 591 printf("\n[@@@] %s : thread %d exit first barrier / cycle %d\n", 592 __FUNCTION__, MyNum, (unsigned int)barrier_stop ); 593 #endif 594 595 // allocate and initialise local array upriv[] 596 // 
that is a local copy of the rootN coefs defined in umain[] 597 upriv = (double *)malloc(2 * (rootN - 1) * sizeof(double)); 598 for ( i = 0 ; i < (rootN - 1) ; i++) 599 { 600 c_id = i / (rootN / nclusters); 601 c_offset = i % (rootN / nclusters); 602 upriv[2*i] = umain[c_id][2*c_offset]; 603 upriv[2*i+1] = umain[c_id][2*c_offset+1]; 604 } 569 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 570 571 #if DEBUG_WORK 572 printf("\n[fft] %s : thread %d exit first barrier / cycle %d\n", 573 __FUNCTION__, tid, (unsigned int)barrier_stop ); 574 #endif 575 576 // all threads initialize data[] local array 577 InitD( data , MODE , tid ); 578 579 // all threads initialize twid[] local array 580 InitT( twid , tid ); 581 582 // all threads initialise private upriv[] array 583 InitU( upriv ); 584 585 // BARRIER 586 get_cycle( &barrier_start ); 587 pthread_barrier_wait( &barrier ); 588 get_cycle( &barrier_stop ); 589 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 590 591 #if DEBUG_WORK 592 printf("\n[fft] %s : thread %d exit second barrier / cycle %d\n", 593 __FUNCTION__, tid, (unsigned int)barrier_stop ); 594 #endif 605 595 606 596 // compute first and last rows handled by the thread 607 MyFirst = rootN * MyNum/ nthreads;608 MyLast = rootN * ( MyNum+ 1) / nthreads;597 MyFirst = rootN * tid / nthreads; 598 MyLast = rootN * (tid + 1) / nthreads; 609 599 610 600 // perform forward FFT 611 FFT1D( 1 , data , trans , upriv , twid , MyNum, MyFirst , MyLast );601 FFT1D( 1 , data , trans , upriv , twid , tid , MyFirst , MyLast ); 612 602 613 603 #if CHECK … … 615 605 pthread_barrier_wait( &barrier ); 616 606 get_cycle( &barrier_stop ); 617 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);618 FFT1D( -1 , data , trans , upriv , twid , MyNum, MyFirst , MyLast );607 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 608 FFT1D( -1 , data , trans , upriv , twid , tid , MyFirst , MyLast ); 619 609 #endif 620 610 … … 622 612 623 613 // 
register parallel time 624 parallel_time[MyNum] = (unsigned int)(parallel_stop - parallel_start); 625 626 #if DEBUG_SLAVE 627 printf("\n[fft] %s : thread %x completes fft / p_start %d / p_stop %d\n", 628 __FUNCTION__, MyNum, (unsigned int)parallel_start, (unsigned int)parallel_stop ); 629 int tid; 630 for (tid = 0 ; tid < nthreads ; tid++) 631 { 632 printf("- tid %d : Sequencial %d / Parallel %d / Barrier %d\n", 633 tid , init_time, parallel_time[tid], sync_time[tid] ); 634 } 635 #endif 636 637 // exit only if MyNum != MainNum 638 if( MyNum != MainNum ) pthread_exit( NULL ); 639 640 } // end slave() 614 parallel_time[tid] = (unsigned int)(parallel_stop - parallel_start); 615 616 #if DEBUG_WORK 617 printf("\n[fft] %s : thread %d completes fft / p_start %d / p_stop %d\n", 618 __FUNCTION__, tid, (unsigned int)parallel_start, (unsigned int)parallel_stop ); 619 #endif 620 621 // work thread signals completion to main 622 pthread_barrier_wait( parent_barrier ); 623 624 #if DEBUG_WORK 625 printf("\n[fft] %s : thread %d exit\n", 626 __FUNCTION__, tid ); 627 #endif 628 629 // work thread exit 630 pthread_exit( NULL ); 631 632 } // end work() 641 633 642 634 //////////////////////////////////////////////////////////////////////////////////////// … … 724 716 } 725 717 726 727 //////////////////////////// 728 void InitX(double ** x, 729 unsigned int mode ) 718 ////////////////////////////////////////////////////////////////////////////////////// 719 // Each working thread <tid> contributes to initialize (rootN / nthreads) rows, 720 // in the shared - and distributed - <data> array. 
721 ////////////////////////////////////////////////////////////////////////////////////// 722 void InitD(double ** data, 723 unsigned int mode, 724 unsigned int tid ) 730 725 { 731 726 unsigned int i , j; … … 734 729 unsigned int index; 735 730 736 for ( j = 0 ; j < rootN ; j++ ) // loop on row index 731 // compute row_min and row_max 732 unsigned int row_min = tid * rows_per_thread; 733 unsigned int row_max = row_min + rows_per_thread; 734 735 for ( j = row_min ; j < row_max ; j++ ) // loop on rows 737 736 { 738 for ( i = 0 ; i < rootN ; i++ ) // loop on pointin a row737 for ( i = 0 ; i < rootN ; i++ ) // loop on points in a row 739 738 { 740 739 index = j * rootN + i; … … 745 744 if ( mode == RANDOM ) 746 745 { 747 x[c_id][2*c_offset] = ( (double)rand() ) / 65536;748 x[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;746 data[c_id][2*c_offset] = ( (double)rand() ) / 65536; 747 data[c_id][2*c_offset+1] = ( (double)rand() ) / 65536; 749 748 } 750 749 … … 754 753 { 755 754 double phi = (double)( 2 * PI * index) / N; 756 x[c_id][2*c_offset] = cos( phi );757 x[c_id][2*c_offset+1] = sin( phi );755 data[c_id][2*c_offset] = cos( phi ); 756 data[c_id][2*c_offset+1] = sin( phi ); 758 757 } 759 758 … … 761 760 if ( mode == CONSTANT ) 762 761 { 763 x[c_id][2*c_offset] = 1.0;764 x[c_id][2*c_offset+1] = 0.0;762 data[c_id][2*c_offset] = 1.0; 763 data[c_id][2*c_offset+1] = 0.0; 765 764 } 766 765 } … … 768 767 } 769 768 770 ///////////////////////// 771 void InitU( double ** u ) 772 { 773 unsigned int q; 774 unsigned int j; 775 unsigned int base; 776 unsigned int n1; 777 unsigned int c_id; 778 unsigned int c_offset; 779 double phi; 780 unsigned int stop = 0; 781 782 for (q = 0 ; ((unsigned int)(1 << q) < N) && (stop == 0) ; q++) 783 { 784 n1 = 1 << q; 785 base = n1 - 1; 786 for (j = 0; (j < n1) && (stop == 0) ; j++) 787 { 788 if (base + j > rootN - 1) return; 789 790 c_id = (base + j) / (rootN / nclusters); 791 c_offset = (base + j) % (rootN / nclusters); 792 phi = (double)(2.0 
* PI * j) / (2 * n1); 793 u[c_id][2*c_offset] = cos( phi ); 794 u[c_id][2*c_offset+1] = -sin( phi ); 795 } 796 } 797 } 798 799 ////////////////////////// 800 void InitT( double ** u ) 769 /////////////////////////////////////////////////////////////////////////////////////// 770 // Each working thread <tid> contributes to initialize (rootN / nthreads) rows, 771 // in the shared - and distributed - <twiddle> array. 772 /////////////////////////////////////////////////////////////////////////////////////// 773 void InitT( double ** twid, 774 unsigned int tid ) 801 775 { 802 776 unsigned int i, j; … … 806 780 double phi; 807 781 808 for ( j = 0 ; j < rootN ; j++ ) // loop on row index 782 // compute row_min and row_max 783 unsigned int row_min = tid * rows_per_thread; 784 unsigned int row_max = row_min + rows_per_thread; 785 786 for ( j = row_min ; j < row_max ; j++ ) // loop on rows 809 787 { 810 for ( i = 0 ; i < rootN ; i++ ) // loop on points in a row788 for ( i = 0 ; i < rootN ; i++ ) // loop on points in a row 811 789 { 812 790 index = j * rootN + i; … … 815 793 816 794 phi = (double)(2.0 * PI * i * j) / N; 817 u[c_id][2*c_offset] = cos( phi ); 818 u[c_id][2*c_offset+1] = -sin( phi ); 795 twid[c_id][2*c_offset] = cos( phi ); 796 twid[c_id][2*c_offset+1] = -sin( phi ); 797 } 798 } 799 } 800 801 /////////////////////////////////////////////////////////////////////////////////////// 802 // Each working thread initialize the private <upriv> array / (rootN - 1) entries. 
803 /////////////////////////////////////////////////////////////////////////////////////// 804 void InitU( double * upriv ) 805 { 806 unsigned int q; 807 unsigned int j; 808 unsigned int base; 809 unsigned int n1; 810 double phi; 811 812 for (q = 0 ; ((unsigned int)(1 << q) < N) ; q++) 813 { 814 n1 = 1 << q; // n1 == 2**q 815 base = n1 - 1; 816 for (j = 0; (j < n1) ; j++) 817 { 818 if (base + j > rootN - 1) return; 819 820 phi = (double)(2.0 * PI * j) / (2 * n1); 821 upriv[2*(base+j)] = cos( phi ); 822 upriv[2*(base+j)+1] = -sin( phi ); 819 823 } 820 824 } … … 856 860 double * upriv, // local array containing coefs for rootN FFT 857 861 double ** twid, // distributed arrays containing N twiddle factors 858 unsigned int MyNum,// thread continuous index862 unsigned int tid, // thread continuous index 859 863 unsigned int MyFirst, 860 864 unsigned int MyLast ) … … 868 872 get_cycle( &cycle ); 869 873 printf("\n[fft] %s : thread %d enter / first %d / last %d / cycle %d\n", 870 __FUNCTION__, MyNum, MyFirst, MyLast, (unsigned int)cycle );874 __FUNCTION__, tid, MyFirst, MyLast, (unsigned int)cycle ); 871 875 #endif 872 876 … … 877 881 get_cycle( &cycle ); 878 882 printf("\n[fft] %s : thread %d after first transpose / cycle %d\n", 879 __FUNCTION__, MyNum, (unsigned int)cycle );883 __FUNCTION__, tid, (unsigned int)cycle ); 880 884 if( PRINT_ARRAY ) PrintArray( tmp , N ); 881 885 #endif … … 885 889 pthread_barrier_wait( &barrier ); 886 890 get_cycle( &barrier_stop ); 887 sync_time[ MyNum] = (unsigned int)(barrier_stop - barrier_start);891 sync_time[tid] = (unsigned int)(barrier_stop - barrier_start); 888 892 889 893 #if( DEBUG_FFT1D & 1 ) 890 894 get_cycle( &cycle ); 891 895 printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n", 892 __FUNCTION__, MyNum, (unsigned int)cycle );896 __FUNCTION__, tid, (unsigned int)cycle ); 893 897 #endif 894 898 … … 902 906 903 907 #if( DEBUG_FFT1D & 1 ) 904 printf("\n[fft] %s : thread %d after first twiddle\n", 
__FUNCTION__, MyNum);908 printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, tid); 905 909 if( PRINT_ARRAY ) PrintArray( tmp , N ); 906 910 #endif … … 912 916 913 917 #if( DEBUG_FFT1D & 1 ) 914 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, MyNum);915 #endif 916 917 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);918 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, tid); 919 #endif 920 921 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 918 922 919 923 // transpose tmp to x … … 921 925 922 926 #if( DEBUG_FFT1D & 1 ) 923 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, MyNum);927 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid); 924 928 if( PRINT_ARRAY ) PrintArray( x , N ); 925 929 #endif … … 931 935 932 936 #if( DEBUG_FFT1D & 1 ) 933 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, MyNum);934 #endif 935 936 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);937 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid); 938 #endif 939 940 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 937 941 938 942 // do FFTs on rows of x and apply the scaling factor … … 944 948 945 949 #if( DEBUG_FFT1D & 1 ) 946 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, MyNum);950 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, tid); 947 951 if( PRINT_ARRAY ) PrintArray( x , N ); 948 952 #endif … … 954 958 955 959 #if( DEBUG_FFT1D & 1 ) 956 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, MyNum);957 #endif 958 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);960 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, tid); 961 #endif 962 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 959 963 960 964 
// transpose x to tmp … … 962 966 963 967 #if( DEBUG_FFT1D & 1 ) 964 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, MyNum);968 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid); 965 969 if( PRINT_ARRAY ) PrintArray( x , N ); 966 970 #endif … … 972 976 973 977 #if( DEBUG_FFT1D & 1 ) 974 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, MyNum);975 #endif 976 977 sync_time[ MyNum] += (unsigned int)(barrier_stop - barrier_start);978 sync_time[ MyNum] += (long)(barrier_stop - barrier_start);978 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid); 979 #endif 980 981 sync_time[tid] += (unsigned int)(barrier_stop - barrier_start); 982 sync_time[tid] += (long)(barrier_stop - barrier_start); 979 983 980 984 // copy tmp to x … … 982 986 983 987 #if DEBUG_FFT1D 984 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, MyNum);988 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, tid); 985 989 if( PRINT_ARRAY ) PrintArray( x , N ); 986 990 #endif -
trunk/user/idbg/idbg.c
r580 r637 20 20 21 21 get_cycle( &cycle ); 22 get_core ( &cxy , &lid );22 get_core_id( &cxy , &lid ); 23 23 24 24 printf( "\n[IDBG] starts on core[%x,%d] / cycle %d\n", -
trunk/user/ksh/ksh.c
r636 r637 1186 1186 char cmd[CMD_MAX_SIZE]; // buffer for one command 1187 1187 1188 / /1. first direct command1188 /* 1. first direct command 1189 1189 if( sem_wait( &semaphore ) ) 1190 1190 { … … 1199 1199 strcpy( cmd , "load bin/user/sort.elf" ); 1200 1200 execute( cmd ); 1201 //1202 1203 1204 1205 / /2. second direct command1201 */ 1202 1203 1204 1205 /* 2. second direct command 1206 1206 if( sem_wait( &semaphore ) ) 1207 1207 { … … 1216 1216 strcpy( cmd , "load bin/user/fft.elf" ); 1217 1217 execute( cmd ); 1218 //1218 */ 1219 1219 1220 1220 … … 1455 1455 // get KSH process pid and core 1456 1456 parent_pid = getpid(); 1457 get_core ( &cxy , &lid );1457 get_core_id( &cxy , &lid ); 1458 1458 1459 1459 #if DEBUG_MAIN -
trunk/user/pgcd/pgcd.c
r626 r637 27 27 28 28 get_cycle( &cycle ); 29 get_core ( &cxy , &lid );29 get_core_id( &cxy , &lid ); 30 30 31 31 printf( "\n[pgcd] starts on core[%x,%d] / cycle %d\n\n", -
trunk/user/sort/sort.c
r636 r637 54 54 #include <hal_macros.h> 55 55 56 #define ARRAY_LENGTH 2048 // number of items 57 #define MAX_THREADS 1024 // 16 * 16 * 4 58 59 #define USE_DQT_BARRIER 1 // use DQT barrier if non zero 60 #define DISPLAY_ARRAY 0 // display items values before and after 61 #define DEBUG_MAIN 0 // trace main function 62 #define DEBUG_SORT 0 // trace sort function 63 #define CHECK_RESULT 0 // for debug 64 #define INSTRUMENTATION 1 // register computation times on file 65 66 ///////////////////////////////////////////////////////////// 67 // argument for the sort() function (one thread per core) 68 ///////////////////////////////////////////////////////////// 56 #define ARRAY_LENGTH 2048 // number of items 57 #define MAX_THREADS 1024 // 16 * 16 * 4 58 59 #define X_MAX 16 // max number of clusters in a row 60 #define Y_MAX 16 // max number of clusters in a column 61 #define CORES_MAX 4 // max number of cores in a cluster 62 #define CLUSTERS_MAX X_MAX * Y_MAX 63 64 #define USE_DQT_BARRIER 1 // use DQT barrier if non zero 65 #define DISPLAY_ARRAY 0 // display items values before and after 66 #define DEBUG_MAIN 0 // trace main function 67 #define DEBUG_SORT 0 // trace sort function 68 #define CHECK_RESULT 0 // for debug 69 #define INSTRUMENTATION 1 // register computation times on file 70 71 /////////////////////////////////////////////////////////////////////////////////// 72 // Arguments for the sort() function 73 /////////////////////////////////////////////////////////////////////////////////// 69 74 70 75 typedef struct 71 76 { 72 unsigned int threads; // total number of threads73 unsigned int thread_uid; // thread user index (0 to threads -1)74 unsigned int main_uid; // main thread user index77 unsigned int tid; // continuous thread index 78 unsigned int threads; // total number of threads 79 pthread_barrier_t * parent_barrier; // pointer on termination barrier 75 80 } 76 args_t;77 78 ////////////////////////////////////////// 79 // Global variables80 
////////////////////////////////////////// 81 sort_args_t; 82 83 //////////////////////////////////////////////////////////////////////////////////// 84 // Sort specific global variables 85 //////////////////////////////////////////////////////////////////////////////////// 81 86 82 87 int array0[ARRAY_LENGTH]; // values to sort … … 85 90 pthread_barrier_t barrier; // synchronisation variables 86 91 87 pthread_t trdid[MAX_THREADS]; // kernel identifiers 88 pthread_attr_t attr[MAX_THREADS]; // thread attributes 89 args_t arg[MAX_THREADS]; // sort function arguments 92 ///////////////////////////////////////////////////////////////////////////////////// 93 // Global variables required by parallel_pthread_create() 94 ///////////////////////////////////////////////////////////////////////////////////// 95 96 // 2D arrays of input arguments for the <sort> threads 97 // These arrays are initialised by the application main thread 98 99 sort_args_t sort_args[CLUSTERS_MAX][CORES_MAX]; // sort function arguments 100 sort_args_t * sort_ptrs[CLUSTERS_MAX][CORES_MAX]; // pointers on arguments 101 102 // 1D array of barriers to allow the <sort> threads to signal termination 103 // this array is initialised by the pthread_parallel_create() function 104 105 pthread_barrier_t parent_barriers[CLUSTERS_MAX]; // termination barrier 106 90 107 91 108 //////////////////////////////////// … … 157 174 } // end merge() 158 175 159 ////////////////////////////// ////////160 static void sort( constargs_t * ptr )176 ////////////////////////////// 177 void sort( sort_args_t * ptr ) 161 178 { 162 unsigned int i; 163 unsigned long long cycle; 164 unsigned int cxy; 165 unsigned int lid; 166 167 int * src_array = NULL; 168 int * dst_array = NULL; 169 170 // get core coordinates an date 171 get_core( &cxy , &lid ); 172 get_cycle( &cycle ); 173 174 unsigned int thread_uid = ptr->thread_uid; 175 unsigned int threads = ptr->threads; 176 unsigned int main_uid = ptr->main_uid; 177 178 #if DISPLAY_ARRAY 
179 unsigned int n; 180 if( thread_uid == main_uid ) 181 { 182 printf("\n*** array before sort\n"); 183 for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] ); 184 } 179 unsigned int i; 180 int * src_array = NULL; 181 int * dst_array = NULL; 182 183 // get arguments 184 unsigned int tid = ptr->tid; 185 unsigned int threads = ptr->threads; 186 pthread_barrier_t * parent_barrier = ptr->parent_barrier; 187 188 unsigned int items = ARRAY_LENGTH / threads; 189 unsigned int stages = __builtin_ctz( threads ) + 1; 190 191 #if DEBUG_SORT 192 printf("\n[sort] start : ptr %x / tid %d / threads %d / barrier %x\n", 193 ptr, tid, threads, parent_barrier ); 194 #endif 195 196 bubbleSort( array0, items, items * tid ); 197 198 #if DEBUG_SORT 199 printf("\n[sort] thread[%d] : stage 0 completed\n", tid ); 185 200 #endif 186 201 … … 189 204 190 205 #if DEBUG_SORT 191 if( thread_uid == 0 ) 192 printf("\n[sort] thread[%d] exit barrier 0\n", thread_uid ); 193 #endif 194 195 unsigned int items = ARRAY_LENGTH / threads; 196 unsigned int stages = __builtin_ctz( threads ) + 1; 197 198 #if DEBUG_SORT 199 if( thread_uid == 0 ) 200 printf("\n[sort] thread[%d] : start\n", thread_uid ); 201 #endif 202 203 bubbleSort( array0, items, items * thread_uid ); 204 205 #if DEBUG_SORT 206 if( thread_uid == 0 ) 207 printf("\n[sort] thread[%d] : stage 0 completed\n", thread_uid ); 208 #endif 209 210 ///////////////////////////////// 211 pthread_barrier_wait( &barrier ); 212 213 #if DEBUG_SORT 214 if( thread_uid == 0 ) 215 printf("\n[sort] thread[%d] exit barrier 0\n", thread_uid ); 216 #endif 217 218 #if DISPLAY_ARRAY 219 if( thread_uid == main_uid ) 220 { 221 printf("\n*** array after bubble sort\n"); 222 for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] ); 223 } 206 printf("\n[sort] thread[%d] exit barrier 0\n", tid ); 224 207 #endif 225 208 … … 239 222 } 240 223 241 if( (thread_uid & ((1<<i)-1)) == 0 ) 242 { 243 244 #if DEBUG_SORT 245 if( thread_uid == 0 ) 246 
printf("\n[sort] thread[%d] : stage %d start\n", thread_uid , i ); 224 if( (tid & ((1<<i)-1)) == 0 ) 225 { 226 227 #if DEBUG_SORT 228 printf("\n[sort] thread[%d] : stage %d start\n", tid , i ); 247 229 #endif 248 230 merge( src_array, 249 231 dst_array, 250 232 items << (i-1), 251 items * thread_uid, 252 items * (thread_uid + (1 << (i-1))), 253 items * thread_uid ); 254 255 #if DEBUG_SORT 256 if( thread_uid == 0 ) 257 printf("\n[sort] thread[%d] : stage %d completed\n", thread_uid , i ); 233 items * tid, 234 items * (tid + (1 << (i-1))), 235 items * tid ); 236 237 #if DEBUG_SORT 238 printf("\n[sort] thread[%d] : stage %d completed\n", tid , i ); 258 239 #endif 259 240 } … … 263 244 264 245 #if DEBUG_SORT 265 if( thread_uid == 0 ) 266 printf("\n[sort] thread[%d] exit barrier %d\n", thread_uid , i ); 267 #endif 268 269 #if DISPLAY_ARRAY 270 if( thread_uid == main_uid ) 271 { 272 printf("\n*** array after merge %d\n", i ); 273 for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , dst_array[n] ); 274 } 246 printf("\n[sort] thread[%d] exit barrier %d\n", tid , i ); 275 247 #endif 276 248 277 249 } // en for stages 278 250 279 // all threads but the main thread exit 280 if( thread_uid != main_uid ) pthread_exit( NULL ); 251 // sort thread signal completion to main thread 252 pthread_barrier_wait( parent_barrier ); 253 254 #if DEBUG_SORT 255 printf("\n[sort] thread[%d] exit\n", tid ); 256 #endif 257 258 // sort thread exit 259 pthread_exit( NULL ); 281 260 282 261 } // end sort() … … 291 270 unsigned int ncores; // number of cores per cluster 292 271 unsigned int total_threads; // total number of threads 293 unsigned int thread_uid; // user defined thread index 294 unsigned int main_cxy; // cluster identifier for main 295 unsigned int main_x; // X coordinate for main thread 296 unsigned int main_y; // Y coordinate for main thread 297 unsigned int main_lid; // core local index for main thread 298 unsigned int main_uid; // thread user index for main thread 299 
unsigned int x; // X coordinate for a thread 300 unsigned int y; // Y coordinate for a thread 272 unsigned int x; // X coordinate for a sort thread 273 unsigned int y; // Y coordinate for a sort thread 274 unsigned int cxy; // cluster identifier for a sort thead 301 275 unsigned int lid; // core local index for a thread 276 unsigned int tid; // sort thread continuous index 277 pthread_barrierattr_t barrier_attr; // barrier attributes (used for DQT) 302 278 unsigned int n; // index in array to sort 303 pthread_barrierattr_t barrier_attr; // barrier attributes304 279 305 280 unsigned long long start_cycle; … … 314 289 total_threads = x_size * y_size * ncores; 315 290 316 // get core coordinates and user index for the main thread 317 get_core( &main_cxy , & main_lid ); 318 main_x = HAL_X_FROM_CXY( main_cxy ); 319 main_y = HAL_Y_FROM_CXY( main_cxy ); 320 main_uid = (((main_x * y_size) + main_y) * ncores) + main_lid; 291 // compute covering DQT size an level 292 unsigned int z = (x_size > y_size) ? x_size : y_size; 293 unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 
3 : 4; 321 294 322 295 // checks number of threads … … 326 299 (total_threads != 512) && (total_threads != 1024) ) 327 300 { 328 printf("\n[sort error]number of cores must be power of 2\n");301 printf("\n[sort] ERROR : number of cores must be power of 2\n"); 329 302 exit( 0 ); 330 303 } … … 333 306 if ( ARRAY_LENGTH % total_threads) 334 307 { 335 printf("\n[sort error]array size must be multiple of number of threads\n");308 printf("\n[sort] ERROR : array size must be multiple of number of threads\n"); 336 309 exit( 0 ); 337 310 } … … 355 328 if( error ) 356 329 { 357 printf("\n[sort error]cannot initialise barrier\n" );330 printf("\n[sort] ERROR : cannot initialise barrier\n" ); 358 331 exit( 0 ); 359 332 } … … 370 343 } 371 344 345 #if DISPLAY_ARRAY 346 printf("\n*** array before sort\n"); 347 for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] ); 348 #endif 349 372 350 #if DEBUG_MAIN 373 351 printf("\n[sort] main completes array init\n"); 374 352 #endif 375 353 376 // launch other threads to execute sort() function 377 // on cores other than the core running the main thread 378 for ( x = 0 ; x < x_size ; x++ ) 379 { 380 for ( y = 0 ; y < y_size ; y++ ) 381 { 354 // build array of arguments for the <sort> threads 355 for (x = 0 ; x < x_size ; x++) 356 { 357 for (y = 0 ; y < y_size ; y++) 358 { 359 // compute cluster identifier 360 cxy = HAL_CXY_FROM_XY( x , y ); 361 382 362 for ( lid = 0 ; lid < ncores ; lid++ ) 383 363 { 384 // compute thread user index (continuous index) 385 thread_uid = (((x * y_size) + y) * ncores) + lid; 386 387 // set arguments for all threads 388 arg[thread_uid].threads = total_threads; 389 arg[thread_uid].thread_uid = thread_uid; 390 arg[thread_uid].main_uid = main_uid; 391 392 // set thread attributes for all threads 393 attr[thread_uid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 394 attr[thread_uid].cxy = HAL_CXY_FROM_XY( x , y ); 395 attr[thread_uid].lid = lid; 396 397 if( thread_uid != main_uid ) 
398 { 399 if ( pthread_create( &trdid[thread_uid], // buffer for kernel identifier 400 &attr[thread_uid], // thread attributes 401 &sort, // entry function 402 &arg[thread_uid] ) ) // sort arguments 403 { 404 printf("\n[sort error] main cannot create thread %x \n", thread_uid ); 405 exit( 0 ); 406 } 407 408 #if (DEBUG_MAIN & 1) 409 printf("\n[sort] main created thread %x \n", thread_uid ); 410 #endif 411 } 364 // compute thread continuous index 365 tid = (((x * y_size) + y) * ncores) + lid; 366 367 // initialize 2D array of arguments 368 sort_args[cxy][lid].tid = tid; 369 sort_args[cxy][lid].threads = total_threads; 370 sort_args[cxy][lid].parent_barrier = &parent_barriers[cxy]; 371 372 // initialize 2D array of pointers 373 sort_ptrs[cxy][lid] = &sort_args[cxy][lid]; 412 374 } 413 375 } 414 376 } 415 377 416 378 /////////////////////////// 417 379 get_cycle( &seq_end_cycle ); … … 422 384 #endif 423 385 424 // the main thread run also the sort() function 425 sort( &arg[main_uid] ); 426 427 // wait other threads completion 428 for ( x = 0 ; x < x_size ; x++ ) 429 { 430 for ( y = 0 ; y < y_size ; y++ ) 431 { 432 for ( lid = 0 ; lid < ncores ; lid++ ) 433 { 434 // compute thread continuous index 435 thread_uid = (((x * y_size) + y) * ncores) + lid; 436 437 if( thread_uid != main_uid ) 438 { 439 if( pthread_join( trdid[thread_uid] , NULL ) ) 440 { 441 printf("\n[fft error] in main thread %d joining thread %d\n", 442 main_uid , thread_uid ); 443 exit( 0 ); 444 } 445 446 #if (DEBUG_MAIN & 1) 447 printf("\n[fft] main thread %d joined thread %d\n", main_uid, thread_uid ); 448 #endif 449 450 } 451 } 452 } 386 // create and execute the working threads 387 if( pthread_parallel_create( root_level, 388 &sort, 389 &sort_ptrs[0][0], 390 &parent_barriers[0] ) ) 391 { 392 printf("\n[sort] ERROR : cannot create threads\n"); 393 exit( 0 ); 453 394 } 454 395 … … 456 397 get_cycle( &para_end_cycle ); 457 398 458 printf("\n[sort] main completes parallel sort at cycle %d\n", 459 (unsigned 
int)para_end_cycle ); 399 #if DEBUG_main 400 printf("\n[sort] main completes parallel sort at cycle %d\n", 401 (unsigned int)para_end_cycle ); 402 #endif 460 403 461 404 // destroy barrier 462 405 pthread_barrier_destroy( &barrier ); 406 407 #if DISPLAY_ARRAY 408 printf("\n*** array after merge %d\n", i ); 409 for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , dst_array[n] ); 410 #endif 463 411 464 412 #if CHECK_RESULT … … 492 440 // build file name 493 441 if( USE_DQT_BARRIER ) 494 snprintf( name , 64 , " sort_dqt_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );442 snprintf( name , 64 , "p_sort_dqt_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores ); 495 443 else 496 snprintf( name , 64 , " sort_smp_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );444 snprintf( name , 64 , "p_sort_smp_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores ); 497 445 498 446 // build file pathname … … 515 463 if( stream == NULL ) 516 464 { 517 printf("\n[sort error]cannot open instrumentation file <%s>\n", path );465 printf("\n[sort] ERROR : cannot open instrumentation file <%s>\n", path ); 518 466 exit(0); 519 467 } … … 532 480 if( ret < 0 ) 533 481 { 534 printf("\n[sort error]cannot write to instrumentation file <%s>\n", path );482 printf("\n[sort] ERROR : cannot write to instrumentation file <%s>\n", path ); 535 483 exit(0); 536 484 } … … 548 496 if( ret ) 549 497 { 550 printf("\n[sort error]cannot close instrumentation file <%s>\n", path );498 printf("\n[sort] ERROR : cannot close instrumentation file <%s>\n", path ); 551 499 exit(0); 552 500 }
Note: See TracChangeset
for help on using the changeset viewer.