Changeset 637 for trunk/user


Ignore:
Timestamp:
Jul 18, 2019, 2:06:55 PM (5 years ago)
Author:
alain
Message:

Introduce the non-standard pthread_parallel_create() system call
and re-write the <fft> and <sort> applications to improve the
intrinsic parallelism in applications.

Location:
trunk/user
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/user/fft/fft.c

    r636 r637  
    2222// of N complex points, using the Cooley-Tukey FFT method.
    2323// The N data points are seen as a 2D array (rootN rows * rootN columns).
    24 // Each thread handle (rootN / nthreads) rows. The N input data points
    25 // be initialised in three different modes:
     24// Each thread handles (rootN / nthreads) rows.
     25// The N input data points can be initialised in three different modes:
    2626// - CONSTANT : all data points have the same [1,0] value
    2727// - COSIN    : data point n has [cos(n/N) , sin(n/N)] values
     
    3131//  - M : N = 2**M = number of data points / M must be an even number.
    3232//  - T : nthreads = ncores defined by the hardware / must be power of 2.
     33// The number of threads cannot be larger than the number of rows.
    3334//
    34 // This application uses 4 shared data arrays, that are dynamically
    35 // allocated an distributed, using the remote_malloc() function, with
    36 // one sub-buffer per cluster:
    37 // - data[N] contains N input data points, with 2 double per point.
    38 // - trans[N] contains N intermediate data points, 2 double per point.
    39 // - umain[rootN] contains rootN coefs required for a rootN points FFT.
    40 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1].
    41 // For data, trans, twid, each sub-buffer contains (N/nclusters) points.
    42 // For umain, each sub-buffer contains (rootN/nclusters) points.
     35// This application uses 3 shared data arrays, that are dynamically
     36// allocated and distributed in clusters, with one sub-buffer per cluster:
     37// - data[N] contains N input data points,
     38// - trans[N] contains N intermediate data points,
     39// - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1]
     40// Each sub-buffer contains (N/nclusters) entries, with 2 double per entry.
     41// These distributed buffers are allocated and initialised in parallel
     42// by the working threads running on core 0 in each cluster.
    4343//
    44 // There is one thread per core.
    45 // The max number of clusters is defined by (X_MAX * Y_MAX).
    46 // The max number of cores per cluster is defined by CORES_MAX.
     44// Each working thread also allocates a private coefs[rootN-1] buffer,
     45// that contains all coefs required for a rootN points FFT.
     46//
     47// There is one working thread per core.
     48// The actual number of cores and clusters in a given hardware architecture
     49// is obtained by the get_config() syscall (x_size, y_size, ncores).
     50// The max number of clusters is bounded by (X_MAX * Y_MAX).
     51// The max number of cores per cluster is bounded by CORES_MAX.
    4752//
    4853// Several configuration parameters can be defined below:
     
    5762//   by the main thread in the main() function.
    5863// - The parallel execution time (parallel_time[i]) is computed by each
    59 //   thread(i) in the slave() function.
     64//   working thread(i) in the work() function.
    6065// - The synchronisation time related to the barriers (sync_time[i])
    61 //   is computed by each thread(i) in the slave() function.
     66//   is computed by each thread(i) in the work() function.
    6267// The results are displayed on the TXT terminal, and registered on disk.
    6368///////////////////////////////////////////////////////////////////////////
     
    8792// parameters
    8893
    89 #define DEFAULT_M               12              // 4096 data points
    90 #define USE_DQT_BARRIER         0               // use DDT barrier if non zero
     94#define DEFAULT_M               14              // 16384 data points
     95#define USE_DQT_BARRIER         1               // use DQT barrier if non zero
    9196#define MODE                    COSIN           // DATA array initialisation mode
    9297#define CHECK                   0               
    93 #define DEBUG_MAIN              0               // trace main() function (detailed if odd)
    94 #define DEBUG_SLAVE             0               // trace slave() function (detailed if odd)
     98#define DEBUG_MAIN              1               // trace main() function (detailed if odd)
     99#define DEBUG_WORK              1               // trace work() function (detailed if odd)
    95100#define DEBUG_FFT1D             0               // trace FFT1D() function (detailed if odd)
    96101#define DEBUG_ROW               0               // trace FFTRow() function (detailed if odd)
     
    101106
    102107/////////////////////////////////////////////////////////////////////////////////////
    103 //             structure containing the arguments for the slave() function
     108//             FFT specific global variables
    104109/////////////////////////////////////////////////////////////////////////////////////
    105110
    106 typedef struct args_s
    107 {
    108     unsigned int   tid;                    // thread continuous index
    109     unsigned int   main_tid;               // main thread continuous index
     111// work function arguments
     112typedef struct work_args_s
     113{
     114    unsigned int        tid;               // thread continuous index
     115    unsigned int        lid;               // core local index
     116    unsigned int        cid;               // cluster continuous index
     117    pthread_barrier_t * parent_barrier;    // parent barrier to signal completion
    110118}
    111 args_t;
    112 
    113 /////////////////////////////////////////////////////////////////////////////////////
    114 //             global variables
    115 /////////////////////////////////////////////////////////////////////////////////////
    116 
    117 unsigned int   x_size;                     // number of clusters per row in the mesh
    118 unsigned int   y_size;                     // number of clusters per column in the mesh
    119 unsigned int   ncores;                     // number of cores per cluster
     119work_args_t;
     120
    120121unsigned int   nthreads;                   // total number of threads (one thread per core)
    121122unsigned int   nclusters;                  // total number of clusters
     
    129130double *       data[CLUSTERS_MAX];         // original time-domain data
    130131double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
     132double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    131133double *       bloup[CLUSTERS_MAX];        // used as auxiliary space for DFT
    132 double *       umain[CLUSTERS_MAX];        // roots of unity used fo rootN points FFT   
    133 double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    134134
    135135// instrumentation counters
     
    142142pthread_barrierattr_t  barrier_attr;
    143143
    144 // threads identifiers, attributes, and arguments
    145 pthread_t       trdid[THREADS_MAX];        // kernel threads identifiers
    146 pthread_attr_t  attr[THREADS_MAX];         // POSIX thread attributes
    147 args_t          args[THREADS_MAX];         // slave function arguments
    148 
    149 /////////////////////////////////////////////////////////////////////////////////
     144/////////////////////////////////////////////////////////////////////////////////////
     145//             Global variables required by pthread_parallel_create()
     146/////////////////////////////////////////////////////////////////////////////////////
     147
     148// 2D arrays of input arguments for the <work> threads
     149// These arrays are initialised by the application main thread
     150
     151work_args_t       work_args[CLUSTERS_MAX][CORES_MAX];  // work function arguments
     152work_args_t     * work_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
     153
     154// 1D array of barriers to allow the <work> threads to signal termination
     155// this array is initialised in each cluster by the <build[cxy][0]> thread
     156 
     157pthread_barrier_t parent_barriers[CLUSTERS_MAX];        // termination barrier
     158
     159/////////////////////////////////////////////////////////////////////////////////////
    150160//           functions declaration
    151 /////////////////////////////////////////////////////////////////////////////////
    152 
    153 void slave( args_t * args );
     161/////////////////////////////////////////////////////////////////////////////////////
     162
     163void work( work_args_t * args );
    154164
    155165double CheckSum( void );
    156166
    157 void InitX(double ** x , unsigned int mode);
    158 
    159 void InitU(double ** u);
    160 
    161 void InitT(double ** u);
     167void InitD( double    ** data ,
     168            unsigned int mode,
     169            unsigned int tid );
     170
     171void InitT( double    ** twid,
     172            unsigned int tid );
     173
     174void InitU( double * coefs );
    162175
    163176unsigned int BitReverse( unsigned int k );
     
    168181            double     * upriv,
    169182            double    ** twid,
    170             unsigned int MyNum,
     183            unsigned int tid,
    171184            unsigned int MyFirst,
    172185            unsigned int MyLast );
     
    217230    int                 error;
    218231
    219     unsigned int        main_cxy;          // main thread cluster
    220     unsigned int        main_x;            // main thread X coordinate
    221     unsigned int        main_y;            // main thread y coordinate
    222     unsigned int        main_lid;          // main thread local core index
    223     unsigned int        main_tid;          // main thread continuous index
     232    unsigned int        x_size;            // number of clusters per row
     233    unsigned int        y_size;            // number of clusters per column
     234    unsigned int        ncores;            // max number of cores per cluster
    224235
    225236    unsigned int        x;                 // current index for cluster X coordinate
    226237    unsigned int        y;                 // current index for cluster Y coordinate
    227238    unsigned int        lid;               // current index for core in a cluster
    228     unsigned int        ci;                // continuous cluster index (from x,y)
     239    unsigned int        tid;               // continuous thread index
     240    unsigned int        cid;               // cluster continuous index
    229241    unsigned int        cxy;               // hardware specific cluster identifier
    230     unsigned int        tid;               // continuous thread index
     242
     243    char                name[64];          // instrumentation file name
     244    char                path[128];         // instrumentation path name
     245    char                string[256];
     246    int                 ret;
    231247
    232248    unsigned long long  start_init_cycle;
    233249    unsigned long long  end_init_cycle;
    234250
     251#if DEBUG_MAIN
     252    unsigned long long  debug_cycle;
     253#endif
     254
    235255#if CHECK
    236 double     ck1;           // for input/output checking
    237 double     ck3;           // for input/output checking
     256    double              ck1;               // for input/output checking
     257    double              ck3;               // for input/output checking
    238258#endif
    239259   
     
    241261    get_cycle( &start_init_cycle );
    242262
    243     // get platform parameters to compute nthreads & nclusters
     263    // get platform parameters
    244264    if( get_config( &x_size , &y_size , &ncores ) )
    245265    {
     
    269289    }
    270290
     291    // compute nthreads and nclusters
    271292    nthreads  = x_size * y_size * ncores;
    272293    nclusters = x_size * y_size;
     294
     295    // compute covering DQT size and level
     296    unsigned int z = (x_size > y_size) ? x_size : y_size;
     297    unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4;
    273298
    274299    // compute various constants depending on N and T
     
    285310    }
    286311
    287     // get main thread coordinates (main_x, main_y, main_lid)
    288     get_core( &main_cxy , &main_lid );
    289     main_x   = HAL_X_FROM_CXY( main_cxy );
    290     main_y   = HAL_Y_FROM_CXY( main_cxy );
    291     main_tid = (((main_x * y_size) + main_y) * ncores) + main_lid;
    292 
    293     printf("\n[fft] starts / core[%x,%d] / %d points / %d thread(s) / PID %x / cycle %d\n",
    294     main_cxy, main_lid, N, nthreads, getpid(), (unsigned int)start_init_cycle );
    295 
    296     // allocate memory for the distributed data[i], trans[i], umain[i], twid[i] buffers
    297     // the index (i) is a continuous cluster index
    298     unsigned int data_size   = (N / nclusters) * 2 * sizeof(double);
    299     unsigned int coefs_size  = (rootN / nclusters) * 2 * sizeof(double);
    300     for (x = 0 ; x < x_size ; x++)
    301     {
    302         for (y = 0 ; y < y_size ; y++)
    303         {
    304             ci         = x * y_size + y;
    305             cxy        = HAL_CXY_FROM_XY( x , y );
    306             data[ci]   = (double *)remote_malloc( data_size  , cxy );
    307             trans[ci]  = (double *)remote_malloc( data_size  , cxy );
    308             bloup[ci]  = (double *)remote_malloc( data_size  , cxy );
    309             umain[ci]  = (double *)remote_malloc( coefs_size , cxy );
    310             twid[ci]   = (double *)remote_malloc( data_size  , cxy );
    311         }
     312    printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n",
     313    N, nthreads, getpid(), (unsigned int)start_init_cycle );
     314
     315    // build instrumentation file name
     316    if( USE_DQT_BARRIER )
     317    snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
     318    else
     319    snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
     320
     321    // build pathname
     322    snprintf( path , 128 , "/home/%s", name );
     323
     324    // open instrumentation file
     325    FILE * f = fopen( path , NULL );
     326    if ( f == NULL )
     327    {
     328        printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
     329        exit( 0 );
    312330    }
    313331
    314332#if DEBUG_MAIN
    315 printf("\n[fft] main completes remote_malloc\n");
    316 #endif
    317 
    318     // arrays initialisation
    319     InitX( data , MODE );
    320     InitU( umain );
    321     InitT( twid );
    322 
    323 #if DEBUG_MAIN
    324 printf("\n[fft] main completes arrays init\n");
     333get_cycle( &debug_cycle );
     334printf("\n[fft] main open file <%s> at cycle %d\n",
     335path, (unsigned int)debug_cycle );
    325336#endif
    326337
     
    342353#endif
    343354
    344     // initialise barrier
     355    // initialise barrier synchronizing all <work> threads
    345356    if( USE_DQT_BARRIER )
    346357    {
     
    362373
    363374#if DEBUG_MAIN
    364 printf("\n[fft] main completes barrier init\n");
    365 #endif
    366 
    367     // launch other threads to execute the slave() function
    368     // on cores other than the core running the main thread
     375get_cycle( &debug_cycle );
     376printf("\n[fft] main completes barrier init at cycle %d\n",
     377(unsigned int)debug_cycle );
     378#endif
     379
     380    // build array of arguments for the <work> threads
    369381    for (x = 0 ; x < x_size ; x++)
    370382    {
     
    376388            for ( lid = 0 ; lid < ncores ; lid++ )
    377389            {
    378                 // compute thread user index (continuous index)
    379                 tid = (((x * y_size) + y) * ncores) + lid;
    380 
    381                 // set thread attributes
    382                 attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    383                 attr[tid].cxy        = cxy;
    384                 attr[tid].lid        = lid;
    385 
    386                 // set slave function argument
    387                 args[tid].tid      = tid;
    388                 args[tid].main_tid = main_tid;
    389 
    390                 // create thread
    391                 if( tid != main_tid )
    392                 {
    393                     if ( pthread_create( &trdid[tid],  // pointer on kernel identifier
    394                                          &attr[tid],   // pointer on thread attributes
    395                                          &slave,       // pointer on function
    396                                          &args[tid]) ) // pointer on function arguments
    397                     {
    398                         printf("\n[fft error] creating thread %x\n", tid );
    399                         exit( 0 );
    400                     }
    401 
    402 #if (DEBUG_MAIN & 1)
    403 unsigned long long debug_cycle;
    404 get_cycle( &debug_cycle );
    405 printf("\n[fft] main created thread %d on core[%x,%d] / cycle %d\n",
    406 tid, cxy, lid, (unsigned int)debug_cycle );
    407 #endif
    408                 }
     390                // compute cluster continuous index
     391                cid = (x * y_size) + y;
     392
     393                // compute work thread continuous index
     394                tid = (cid * ncores) + lid;
     395               
     396                // initialize 2D array of arguments
     397                work_args[cxy][lid].tid            = tid;
     398                work_args[cxy][lid].lid            = lid;
     399                work_args[cxy][lid].cid            = cid;
     400                work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
     401
     402                // initialize 2D array of pointers
     403                work_ptrs[cxy][lid] = &work_args[cxy][lid];
    409404            }
    410405        }
    411406    }
    412407
     408    // register sequential time
     409    get_cycle( &end_init_cycle );
     410    init_time = (unsigned int)(end_init_cycle - start_init_cycle);
     411
    413412#if DEBUG_MAIN
    414 printf("\n[fft] main completes threads creation\n");
    415 #endif
    416 
    417     get_cycle( &end_init_cycle );
    418 
    419     // register sequencial time
    420     init_time = (unsigned int)(end_init_cycle - start_init_cycle);
    421    
    422     // main itself executes the slave() function
    423     slave( &args[main_tid] );
    424 
    425     // wait other threads completion
    426     for (x = 0 ; x < x_size ; x++)
    427     {
    428         for (y = 0 ; y < y_size ; y++)
    429         {
    430             for ( lid = 0 ; lid < ncores ; lid++ )
    431             {
    432                 // compute thread continuous index
    433                 tid = (((x * y_size) + y) * ncores) + lid;
    434 
    435                 if( tid != main_tid )
    436                 {
    437                     if( pthread_join( trdid[tid] , NULL ) )
    438                     {
    439                         printf("\n[fft error] in main thread joining thread %x\n", tid );
    440                         exit( 0 );
    441                     }
    442                    
    443 #if (DEBUG_MAIN & 1)
    444 printf("\n[fft] main thread %d joined thread %d\n", main_tid, tid );
    445 #endif
    446 
    447                 }
    448             }
    449         }
    450     }
     413printf("\n[fft] main completes <work> threads arguments at cycle %d\n",
     414(unsigned int)end_init_cycle );
     415#endif
     416
     417    // create and execute the working threads
     418    if( pthread_parallel_create( root_level,
     419                                 &work,
     420                                 &work_ptrs[0][0],
     421                                 &parent_barriers[0] ) )
     422    {
     423        printf("\n[fft error] creating threads\n");
     424        exit( 0 );
     425    }
     426
     427#if DEBUG_MAIN
     428get_cycle( &debug_cycle );
     429printf("\n[fft] main resume for instrumentation at cycle %d\n",
     430(unsigned int)debug_cycle) ;
     431#endif
    451432
    452433#if PRINT_ARRAY
     
    463444#endif
    464445
    465     // instrumentation
    466     char name[64];
    467     char path[128];
    468     char string[256];
    469     int  ret;
    470 
    471     // build file name
    472     if( USE_DQT_BARRIER )
    473     snprintf( name , 64 , "fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
    474     else
    475     snprintf( name , 64 , "fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
    476 
    477     // build pathname
    478     snprintf( path , 128 , "/home/%s", name );
    479 
    480     // open instrumentation file
    481     FILE * f = fopen( path , NULL );
    482     if ( f == NULL )
    483     {
    484         printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
    485         exit( 0 );
    486     }
    487     printf("\n[fft] file <%s> open\n", path );
    488 
    489446    // display header on terminal, and save to file
    490447    printf("\n----- %s -----\n", name );
     
    497454    }
    498455
    499     // display results for each thread on terminal, and save to file
     456    // get instrumentation results for each thread
    500457    for (tid = 0 ; tid < nthreads ; tid++)
    501458    {
     
    503460        tid, init_time, parallel_time[tid], sync_time[tid] );
    504461
    505         // display on terminal, and save to instrumentation file
    506         printf("%s" , string );
     462        // save  to instrumentation file
    507463        fprintf( f , "%s" , string );
    508464        if( ret < 0 )
    509465        {
    510466            printf("\n[fft error] cannot write thread %d to file <%s>\n", tid, path );
     467            printf("%s", string );
    511468            exit(0);
    512469        }
    513470    }
    514471
    515     // display MIN/MAX values on terminal and save to file
     472    // compute min/max values
    516473    unsigned int min_para = parallel_time[0];
    517474    unsigned int max_para = parallel_time[0];
     
    527484    }
    528485
     486    // display MIN/MAX values on terminal and save to file
    529487    snprintf( string , 256 , "\n      Sequencial  Parallel       Barrier\n"
    530488                             "MIN : %d\t | %d\t | %d\t   (cycles)\n"
     
    547505        exit(0);
    548506    }
    549     printf("\n[fft] file <%s> closed\n", path );
     507 
     508#if DEBUG_MAIN
     509get_cycle( &debug_cycle );
     510printf("\n[fft] main close file <%s> at cycle %d\n",
     511path, (unsigned int)debug_cycle );
     512#endif
    550513
    551514    exit( 0 );
     
    553516} // end main()
    554517
    555 ///////////////////////////////////////////////////////////////
    556 // This function is executed in parallel by all threads.
    557 ///////////////////////////////////////////////////////////////
    558 void slave( args_t * args )
    559 {
    560     unsigned int   i;
    561     unsigned int   MyNum;           // this thread index
    562     unsigned int   MainNum;         // main thread index
    563     unsigned int   MyFirst;         // index first row allocated to thread
    564     unsigned int   MyLast;          // index last row allocated to thread
    565     double       * upriv;
    566     unsigned int   c_id;
    567     unsigned int   c_offset;
     518/////////////////////////////////////////////////////////////////
     519// This function is executed in parallel by all <work> threads.
     520/////////////////////////////////////////////////////////////////
     521void work( work_args_t * args )
     522{
     523    unsigned int        tid;              // this thread continuous index
     524    unsigned int        lid;              // core local index
     525    unsigned int        cid;              // cluster continuous index
     526    pthread_barrier_t * parent_barrier;   // pointer on parent barrier
     527
     528    unsigned int        MyFirst;          // index first row allocated to thread
     529    unsigned int        MyLast;           // index last row allocated to thread
     530    double            * upriv;            // private array of FFT coefs
    568531
    569532    unsigned long long  parallel_start;
     
    572535    unsigned long long  barrier_stop;
    573536
    574     MyNum   = args->tid;
    575     MainNum = args->main_tid;
     537    // get thread arguments
     538    tid            = args->tid;
     539    lid            = args->lid;             
     540    cid            = args->cid;             
     541    parent_barrier = args->parent_barrier;
    576542
    577543    get_cycle( &parallel_start );
    578544
    579 #if DEBUG_SLAVE
     545#if DEBUG_WORK
    580546printf("\n[fft] %s : thread %d enter / cycle %d\n",
    581 __FUNCTION__, MyNum, (unsigned int)parallel_start );
    582 #endif
     547__FUNCTION__, tid, (unsigned int)parallel_start );
     548#endif
     549
     550    // core 0 allocates memory from the local cluster
     551    // for the distributed data[], trans[], twid[] buffers
     552    // and for the private upriv[] buffer
     553    if( lid == 0 )
     554    {
     555        unsigned int data_size  = (N / nclusters) * 2 * sizeof(double);
     556        unsigned int coefs_size = (rootN - 1) * 2 * sizeof(double); 
     557
     558        data[cid]   = (double *)malloc( data_size );
     559        trans[cid]  = (double *)malloc( data_size );
     560        twid[cid]   = (double *)malloc( data_size );
     561
     562        upriv       = (double *)malloc( coefs_size );
     563    }
    583564
    584565    // BARRIER
     
    586567    pthread_barrier_wait( &barrier );
    587568    get_cycle( &barrier_stop );
    588     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    589 
    590 #if DEBUG_SLAVE
    591 printf("\n[@@@] %s : thread %d exit first barrier / cycle %d\n",
    592 __FUNCTION__, MyNum, (unsigned int)barrier_stop );
    593 #endif
    594 
    595     // allocate and initialise local array upriv[]
    596     // that is a local copy of the rootN coefs defined in umain[]
    597     upriv = (double *)malloc(2 * (rootN - 1) * sizeof(double)); 
    598     for ( i = 0 ; i < (rootN - 1) ; i++)
    599     {
    600         c_id     = i / (rootN / nclusters);
    601         c_offset = i % (rootN / nclusters);
    602         upriv[2*i]   = umain[c_id][2*c_offset];
    603         upriv[2*i+1] = umain[c_id][2*c_offset+1];
    604     }
     569    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     570
     571#if DEBUG_WORK
     572printf("\n[fft] %s : thread %d exit first barrier / cycle %d\n",
     573__FUNCTION__, tid, (unsigned int)barrier_stop );
     574#endif
     575
     576    // all threads initialize data[] local array
     577    InitD( data , MODE , tid );
     578
     579    // all threads initialize twid[] local array
     580    InitT( twid , tid );
     581   
     582    // all threads initialise private upriv[] array
     583    InitU( upriv );
     584
     585    // BARRIER
     586    get_cycle( &barrier_start );
     587    pthread_barrier_wait( &barrier );
     588    get_cycle( &barrier_stop );
     589    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     590
     591#if DEBUG_WORK
     592printf("\n[fft] %s : thread %d exit second barrier / cycle %d\n",
     593__FUNCTION__, tid, (unsigned int)barrier_stop );
     594#endif
    605595
    606596    // compute first and last rows handled by the thread
    607     MyFirst = rootN * MyNum / nthreads;
    608     MyLast  = rootN * (MyNum + 1) / nthreads;
     597    MyFirst = rootN * tid / nthreads;
     598    MyLast  = rootN * (tid + 1) / nthreads;
    609599
    610600    // perform forward FFT
    611     FFT1D( 1 , data , trans , upriv , twid , MyNum , MyFirst , MyLast );
     601    FFT1D( 1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
    612602
    613603#if CHECK
     
    615605pthread_barrier_wait( &barrier );
    616606get_cycle( &barrier_stop );
    617 sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    618 FFT1D( -1 , data , trans , upriv , twid , MyNum , MyFirst , MyLast );
     607sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     608FFT1D( -1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
    619609#endif
    620610
     
    622612
    623613    // register parallel time
    624     parallel_time[MyNum] = (unsigned int)(parallel_stop - parallel_start);
    625 
    626 #if DEBUG_SLAVE
    627 printf("\n[fft] %s : thread %x completes fft / p_start %d / p_stop %d\n",
    628 __FUNCTION__, MyNum, (unsigned int)parallel_start, (unsigned int)parallel_stop );
    629 int tid;
    630 for (tid = 0 ; tid < nthreads ; tid++)
    631 {
    632     printf("- tid %d : Sequencial %d / Parallel %d / Barrier %d\n",
    633     tid , init_time, parallel_time[tid], sync_time[tid] );
    634 }
    635 #endif
    636 
    637     // exit only if MyNum != MainNum
    638     if( MyNum != MainNum ) pthread_exit( NULL );
    639 
    640 }  // end slave()
     614    parallel_time[tid] = (unsigned int)(parallel_stop - parallel_start);
     615
     616#if DEBUG_WORK
     617printf("\n[fft] %s : thread %d completes fft / p_start %d / p_stop %d\n",
     618__FUNCTION__, tid, (unsigned int)parallel_start, (unsigned int)parallel_stop );
     619#endif
     620
     621    //  work thread signals completion to main
     622    pthread_barrier_wait( parent_barrier );
     623
     624#if DEBUG_WORK
     625printf("\n[fft] %s : thread %d exit\n",
     626__FUNCTION__, tid );
     627#endif
     628
     629    //  work thread exit
     630    pthread_exit( NULL );
     631
     632}  // end work()
    641633
    642634////////////////////////////////////////////////////////////////////////////////////////
     
    724716}
    725717
    726 
    727 ////////////////////////////
    728 void InitX(double      ** x,
    729            unsigned int   mode )
     718//////////////////////////////////////////////////////////////////////////////////////
     719// Each working thread <tid> contributes to initialize (rootN / nthreads) rows,
     720// in the shared - and distributed - <data> array.
     721//////////////////////////////////////////////////////////////////////////////////////
     722void InitD(double      ** data,
     723           unsigned int   mode,
     724           unsigned int   tid )
    730725{
    731726    unsigned int    i , j;
     
    734729    unsigned int    index;
    735730
    736     for ( j = 0 ; j < rootN ; j++ )      // loop on row index
     731    // compute row_min and row_max
     732    unsigned int    row_min = tid * rows_per_thread;
     733    unsigned int    row_max = row_min + rows_per_thread;
     734
     735    for ( j = row_min ; j < row_max ; j++ )      // loop on rows
    737736    { 
    738         for ( i = 0 ; i < rootN ; i++ )  // loop on point in a row
     737        for ( i = 0 ; i < rootN ; i++ )          // loop on points in a row
    739738        { 
    740739            index     = j * rootN + i;
     
    745744            if ( mode == RANDOM )               
    746745            {
    747                 x[c_id][2*c_offset]   = ( (double)rand() ) / 65536;
    748                 x[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;
     746                data[c_id][2*c_offset]   = ( (double)rand() ) / 65536;
     747                data[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;
    749748            }
    750749           
     
    754753            {
    755754                double phi = (double)( 2 * PI * index) / N;
    756                 x[c_id][2*c_offset]   = cos( phi );
    757                 x[c_id][2*c_offset+1] = sin( phi );
     755                data[c_id][2*c_offset]   = cos( phi );
     756                data[c_id][2*c_offset+1] = sin( phi );
    758757            }
    759758
     
    761760            if ( mode == CONSTANT )               
    762761            {
    763                 x[c_id][2*c_offset]   = 1.0;
    764                 x[c_id][2*c_offset+1] = 0.0;
     762                data[c_id][2*c_offset]   = 1.0;
     763                data[c_id][2*c_offset+1] = 0.0;
    765764            }
    766765        }
     
    768767}
    769768
    770 /////////////////////////
    771 void InitU( double ** u )
    772 {
    773     unsigned int    q;
    774     unsigned int    j;
    775     unsigned int    base;
    776     unsigned int    n1;
    777     unsigned int    c_id;
    778     unsigned int    c_offset;
    779     double  phi;
    780     unsigned int    stop = 0;
    781 
    782     for (q = 0 ; ((unsigned int)(1 << q) < N) && (stop == 0) ; q++)
    783     { 
    784         n1 = 1 << q;
    785         base = n1 - 1;
    786         for (j = 0; (j < n1) && (stop == 0) ; j++)
    787         {
    788             if (base + j > rootN - 1) return;
    789 
    790             c_id      = (base + j) / (rootN / nclusters);
    791             c_offset  = (base + j) % (rootN / nclusters);
    792             phi = (double)(2.0 * PI * j) / (2 * n1);
    793             u[c_id][2*c_offset]   = cos( phi );
    794             u[c_id][2*c_offset+1] = -sin( phi );
    795         }
    796     }
    797 }
    798 
    799 //////////////////////////
    800 void InitT( double ** u )
     769///////////////////////////////////////////////////////////////////////////////////////
     770// Each working thread <tid> contributes to initialize (rootN / nthreads) rows,
     771// in the shared - and distributed - <twiddle> array.
     772///////////////////////////////////////////////////////////////////////////////////////
     773void InitT( double      ** twid,
     774            unsigned int   tid )
    801775{
    802776    unsigned int    i, j;
     
    806780    double  phi;
    807781
    808     for ( j = 0 ; j < rootN ; j++ )      // loop on row index
     782    // compute row_min and row_max
     783    unsigned int    row_min = tid * rows_per_thread;
     784    unsigned int    row_max = row_min + rows_per_thread;
     785
     786    for ( j = row_min ; j < row_max ; j++ )      // loop on rows
    809787    { 
    810         for ( i = 0 ; i < rootN ; i++ )  // loop on points in a row
     788        for ( i = 0 ; i < rootN ; i++ )          // loop on points in a row
    811789        { 
    812790            index     = j * rootN + i;
     
    815793
    816794            phi = (double)(2.0 * PI * i * j) / N;
    817             u[c_id][2*c_offset]   = cos( phi );
    818             u[c_id][2*c_offset+1] = -sin( phi );
     795            twid[c_id][2*c_offset]   = cos( phi );
     796            twid[c_id][2*c_offset+1] = -sin( phi );
     797        }
     798    }
     799}
     800
     801///////////////////////////////////////////////////////////////////////////////////////
     802// Each working thread initializes the private <upriv> array / (rootN - 1) entries.
     803///////////////////////////////////////////////////////////////////////////////////////
     804void InitU( double * upriv )
     805{
     806    unsigned int    q;
     807    unsigned int    j;
     808    unsigned int    base;
     809    unsigned int    n1;
     810    double  phi;
     811
     812    for (q = 0 ; ((unsigned int)(1 << q) < N) ; q++)
     813    { 
     814        n1 = 1 << q;    // n1 == 2**q
     815        base = n1 - 1;
     816        for (j = 0; (j < n1) ; j++)
     817        {
     818            if (base + j > rootN - 1) return;
     819
     820            phi = (double)(2.0 * PI * j) / (2 * n1);
     821            upriv[2*(base+j)]   = cos( phi );
     822            upriv[2*(base+j)+1] = -sin( phi );
    819823        }
    820824    }
     
    856860            double        *  upriv,           // local array containing coefs for rootN FFT
    857861            double       **  twid,            // distributed arrays containing N twiddle factors
    858             unsigned int     MyNum,           // thread continuous index
     862            unsigned int     tid,             // thread continuous index
    859863            unsigned int     MyFirst,
    860864            unsigned int     MyLast )
     
    868872get_cycle( &cycle );
    869873printf("\n[fft] %s : thread %d enter / first %d / last %d / cycle %d\n",
    870 __FUNCTION__, MyNum, MyFirst, MyLast, (unsigned int)cycle );
     874__FUNCTION__, tid, MyFirst, MyLast, (unsigned int)cycle );
    871875#endif
    872876
     
    877881get_cycle( &cycle );
    878882printf("\n[fft] %s : thread %d after first transpose / cycle %d\n",
    879 __FUNCTION__, MyNum, (unsigned int)cycle );
     883__FUNCTION__, tid, (unsigned int)cycle );
    880884if( PRINT_ARRAY ) PrintArray( tmp , N );
    881885#endif
     
    885889    pthread_barrier_wait( &barrier );
    886890    get_cycle( &barrier_stop );
    887     sync_time[MyNum] = (unsigned int)(barrier_stop - barrier_start);
     891    sync_time[tid] = (unsigned int)(barrier_stop - barrier_start);
    888892
    889893#if( DEBUG_FFT1D & 1 )
    890894get_cycle( &cycle );
    891895printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n",
    892 __FUNCTION__, MyNum, (unsigned int)cycle );
     896__FUNCTION__, tid, (unsigned int)cycle );
    893897#endif
    894898
     
    902906
    903907#if( DEBUG_FFT1D & 1 )
    904 printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, MyNum);
     908printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, tid);
    905909if( PRINT_ARRAY ) PrintArray( tmp , N );
    906910#endif
     
    912916
    913917#if( DEBUG_FFT1D & 1 )
    914 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, MyNum);
    915 #endif
    916 
    917     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     918printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, tid);
     919#endif
     920
     921    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    918922
    919923    // transpose tmp to x
     
    921925
    922926#if( DEBUG_FFT1D & 1 )
    923 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, MyNum);
     927printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);
    924928if( PRINT_ARRAY ) PrintArray( x , N );
    925929#endif
     
    931935
    932936#if( DEBUG_FFT1D & 1 )
    933 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, MyNum);
    934 #endif
    935 
    936     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     937printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);
     938#endif
     939
     940    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    937941
    938942    // do FFTs on rows of x and apply the scaling factor
     
    944948
    945949#if( DEBUG_FFT1D & 1 )
    946 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, MyNum);
     950printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, tid);
    947951if( PRINT_ARRAY ) PrintArray( x , N );
    948952#endif
     
    954958
    955959#if( DEBUG_FFT1D & 1 )
    956 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, MyNum);
    957 #endif
    958     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     960printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, tid);
     961#endif
     962    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    959963
    960964    // transpose x to tmp
     
    962966
    963967#if( DEBUG_FFT1D & 1 )
    964 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, MyNum);
     968printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);
    965969if( PRINT_ARRAY ) PrintArray( x , N );
    966970#endif
     
    972976
    973977#if( DEBUG_FFT1D & 1 )
    974 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, MyNum);
    975 #endif
    976 
    977     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    978     sync_time[MyNum] += (long)(barrier_stop - barrier_start);
     978printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);
     979#endif
     980
     981    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     982    sync_time[tid] += (long)(barrier_stop - barrier_start);
    979983
    980984    // copy tmp to x
     
    982986
    983987#if DEBUG_FFT1D
    984 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, MyNum);
     988printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, tid);
    985989if( PRINT_ARRAY ) PrintArray( x , N );
    986990#endif
  • trunk/user/idbg/idbg.c

    r580 r637  
    2020
    2121    get_cycle( &cycle );
    22     get_core( &cxy , &lid );
     22    get_core_id( &cxy , &lid );
    2323
    2424    printf( "\n[IDBG] starts on core[%x,%d] / cycle %d\n",
  • trunk/user/ksh/ksh.c

    r636 r637  
    11861186        char           cmd[CMD_MAX_SIZE];               // buffer for one command
    11871187
    1188 // 1. first direct command
     1188/* 1. first direct command
    11891189if( sem_wait( &semaphore ) )
    11901190{
     
    11991199strcpy( cmd , "load bin/user/sort.elf" );
    12001200execute( cmd );
    1201 //
    1202 
    1203 
    1204 
    1205 // 2. second direct command
     1201*/
     1202
     1203
     1204
     1205/* 2. second direct command
    12061206if( sem_wait( &semaphore ) )
    12071207{
     
    12161216strcpy( cmd , "load bin/user/fft.elf" );
    12171217execute( cmd );
    1218 //
     1218*/
    12191219
    12201220
     
    14551455    // get KSH process pid and core
    14561456    parent_pid = getpid();
    1457     get_core( &cxy , &lid );
     1457    get_core_id( &cxy , &lid );
    14581458
    14591459#if DEBUG_MAIN
  • trunk/user/pgcd/pgcd.c

    r626 r637  
    2727
    2828    get_cycle( &cycle );
    29     get_core( &cxy , &lid );
     29    get_core_id( &cxy , &lid );
    3030
    3131    printf( "\n[pgcd] starts on core[%x,%d] / cycle %d\n\n",
  • trunk/user/sort/sort.c

    r636 r637  
    5454#include <hal_macros.h>
    5555
    56 #define ARRAY_LENGTH        2048       // number of items
    57 #define MAX_THREADS         1024       // 16 * 16 * 4
    58 
    59 #define USE_DQT_BARRIER     1          // use DQT barrier if non zero
    60 #define DISPLAY_ARRAY       0          // display items values before and after
    61 #define DEBUG_MAIN          0          // trace main function
    62 #define DEBUG_SORT          0          // trace sort function
    63 #define CHECK_RESULT        0          // for debug
    64 #define INSTRUMENTATION     1          // register computation times on file
    65 
    66 /////////////////////////////////////////////////////////////
    67 // argument for the sort() function (one thread per core)
    68 /////////////////////////////////////////////////////////////
     56#define ARRAY_LENGTH        2048            // number of items
     57#define MAX_THREADS         1024            // 16 * 16 * 4
     58
     59#define X_MAX               16              // max number of clusters in a row
     60#define Y_MAX               16              // max number of clusters in a column
     61#define CORES_MAX           4               // max number of cores in a cluster
     62#define CLUSTERS_MAX        X_MAX * Y_MAX
     63
     64#define USE_DQT_BARRIER     1               // use DQT barrier if non zero
     65#define DISPLAY_ARRAY       0               // display items values before and after
     66#define DEBUG_MAIN          0               // trace main function
     67#define DEBUG_SORT          0               // trace sort function
     68#define CHECK_RESULT        0               // for debug
     69#define INSTRUMENTATION     1               // register computation times on file
     70
     71///////////////////////////////////////////////////////////////////////////////////
     72//            Arguments for the sort() function
     73///////////////////////////////////////////////////////////////////////////////////
    6974
    7075typedef struct
    7176{
    72     unsigned int threads;       // total number of threads
    73     unsigned int thread_uid;    // thread user index (0 to threads -1)
    74     unsigned int main_uid;      // main thread user index
     77    unsigned int        tid;                // continuous thread index
     78    unsigned int        threads;            // total number of threads
     79    pthread_barrier_t * parent_barrier;     // pointer on termination barrier
    7580}
    76 args_t;
    77 
    78 //////////////////////////////////////////
    79 //      Global variables
    80 //////////////////////////////////////////
     81sort_args_t;
     82
     83////////////////////////////////////////////////////////////////////////////////////
     84//            Sort specific global variables
     85////////////////////////////////////////////////////////////////////////////////////
    8186
    8287int                 array0[ARRAY_LENGTH];    // values to sort
     
    8590pthread_barrier_t   barrier;                 // synchronisation variables
    8691
    87 pthread_t           trdid[MAX_THREADS];      // kernel identifiers
    88 pthread_attr_t      attr[MAX_THREADS];       // thread attributes
    89 args_t              arg[MAX_THREADS];        // sort function arguments
     92/////////////////////////////////////////////////////////////////////////////////////
     93//             Global variables required by pthread_parallel_create()
     94/////////////////////////////////////////////////////////////////////////////////////
     95
     96// 2D arrays of input arguments for the <sort> threads
     97// These arrays are initialised by the application main thread
     98
     99sort_args_t       sort_args[CLUSTERS_MAX][CORES_MAX];  // sort function arguments
     100sort_args_t     * sort_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
     101
     102// 1D array of barriers to allow the <sort> threads to signal termination
     103// this array is initialised by the pthread_parallel_create() function
     104 
     105pthread_barrier_t parent_barriers[CLUSTERS_MAX];       // termination barrier
     106
    90107
    91108////////////////////////////////////
     
    157174}  // end merge()
    158175
    159 //////////////////////////////////////
    160 static void sort( const args_t * ptr )
     176//////////////////////////////
     177void sort( sort_args_t * ptr )
    161178{
    162     unsigned int       i;
    163     unsigned long long cycle;
    164     unsigned int       cxy;
    165     unsigned int       lid;
    166 
    167     int              * src_array  = NULL;
    168     int              * dst_array  = NULL;
    169 
    170     // get core coordinates an date
    171     get_core( &cxy , &lid );
    172     get_cycle( &cycle );
    173 
    174     unsigned int  thread_uid = ptr->thread_uid;
    175     unsigned int  threads    = ptr->threads;
    176     unsigned int  main_uid   = ptr->main_uid;
    177 
    178 #if DISPLAY_ARRAY
    179 unsigned int n;
    180 if( thread_uid == main_uid )
    181 {
    182     printf("\n*** array before sort\n");
    183     for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
    184 }
     179    unsigned int        i;
     180    int               * src_array  = NULL;
     181    int               * dst_array  = NULL;
     182
     183    // get arguments
     184    unsigned int        tid            = ptr->tid;
     185    unsigned int        threads        = ptr->threads;
     186    pthread_barrier_t * parent_barrier = ptr->parent_barrier;
     187
     188    unsigned int        items      = ARRAY_LENGTH / threads;
     189    unsigned int        stages     = __builtin_ctz( threads ) + 1;
     190
     191#if DEBUG_SORT
     192printf("\n[sort] start : ptr %x / tid %d / threads %d / barrier %x\n",
     193ptr, tid, threads, parent_barrier );
     194#endif
     195
     196    bubbleSort( array0, items, items * tid );
     197
     198#if DEBUG_SORT
     199printf("\n[sort] thread[%d] : stage 0 completed\n", tid );
    185200#endif
    186201
     
    189204
    190205#if DEBUG_SORT
    191 if( thread_uid == 0 )
    192 printf("\n[sort] thread[%d] exit barrier 0\n", thread_uid );
    193 #endif
    194 
    195     unsigned int  items      = ARRAY_LENGTH / threads;
    196     unsigned int  stages     = __builtin_ctz( threads ) + 1;
    197 
    198 #if DEBUG_SORT
    199 if( thread_uid == 0 )
    200 printf("\n[sort] thread[%d] : start\n", thread_uid );
    201 #endif
    202 
    203     bubbleSort( array0, items, items * thread_uid );
    204 
    205 #if DEBUG_SORT
    206 if( thread_uid == 0 )
    207 printf("\n[sort] thread[%d] : stage 0 completed\n", thread_uid );
    208 #endif
    209 
    210     /////////////////////////////////
    211     pthread_barrier_wait( &barrier );
    212 
    213 #if DEBUG_SORT
    214 if( thread_uid == 0 )
    215 printf("\n[sort] thread[%d] exit barrier 0\n", thread_uid );
    216 #endif
    217 
    218 #if DISPLAY_ARRAY
    219 if( thread_uid == main_uid )
    220 {
    221     printf("\n*** array after bubble sort\n");
    222     for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
    223 }
     206printf("\n[sort] thread[%d] exit barrier 0\n", tid );
    224207#endif
    225208
     
    239222        }
    240223
    241         if( (thread_uid & ((1<<i)-1)) == 0 )
    242         {
    243 
    244 #if DEBUG_SORT
    245 if( thread_uid == 0 )
    246 printf("\n[sort] thread[%d] : stage %d start\n", thread_uid , i );
     224        if( (tid & ((1<<i)-1)) == 0 )
     225        {
     226
     227#if DEBUG_SORT
     228printf("\n[sort] thread[%d] : stage %d start\n", tid , i );
    247229#endif
    248230            merge( src_array,
    249231                   dst_array,
    250232                   items << (i-1),
    251                    items * thread_uid,
    252                    items * (thread_uid + (1 << (i-1))),
    253                    items * thread_uid );
    254 
    255 #if DEBUG_SORT
    256 if( thread_uid == 0 )
    257 printf("\n[sort] thread[%d] : stage %d completed\n", thread_uid , i );
     233                   items * tid,
     234                   items * (tid + (1 << (i-1))),
     235                   items * tid );
     236
     237#if DEBUG_SORT
     238printf("\n[sort] thread[%d] : stage %d completed\n", tid , i );
    258239#endif
    259240        }
     
    263244
    264245#if DEBUG_SORT
    265 if( thread_uid == 0 )
    266 printf("\n[sort] thread[%d] exit barrier %d\n", thread_uid , i );
    267 #endif
    268 
    269 #if DISPLAY_ARRAY
    270 if( thread_uid == main_uid )
    271 {
    272     printf("\n*** array after merge %d\n", i );
    273     for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , dst_array[n] );
    274 }
     246printf("\n[sort] thread[%d] exit barrier %d\n", tid , i );
    275247#endif
    276248
    277249    }  // end for stages
    278250
    279     // all threads but the main thread exit
    280     if( thread_uid != main_uid ) pthread_exit( NULL );
     251    // sort thread signals completion to main thread
     252    pthread_barrier_wait( parent_barrier );
     253
     254#if DEBUG_SORT
     255printf("\n[sort] thread[%d] exit\n", tid );
     256#endif
     257
     258    // sort thread exit
     259    pthread_exit( NULL );
    281260
    282261} // end sort()
     
    291270    unsigned int           ncores;             // number of cores per cluster
    292271    unsigned int           total_threads;      // total number of threads
    293     unsigned int           thread_uid;         // user defined thread index
    294     unsigned int           main_cxy;           // cluster identifier for main
    295     unsigned int           main_x;             // X coordinate for main thread
    296     unsigned int           main_y;             // Y coordinate for main thread
    297     unsigned int           main_lid;           // core local index for main thread
    298     unsigned int           main_uid;           // thread user index for main thread
    299     unsigned int           x;                  // X coordinate for a thread
    300     unsigned int           y;                  // Y coordinate for a thread
     272    unsigned int           x;                  // X coordinate for a sort thread
     273    unsigned int           y;                  // Y coordinate for a sort thread
     274    unsigned int           cxy;                // cluster identifier for a sort thread
    301275    unsigned int           lid;                // core local index for a thread
     276    unsigned int           tid;                // sort thread continuous index
     277    pthread_barrierattr_t  barrier_attr;       // barrier attributes (used for DQT)
    302278    unsigned int           n;                  // index in array to sort
    303     pthread_barrierattr_t  barrier_attr;       // barrier attributes
    304279
    305280    unsigned long long     start_cycle;
     
    314289    total_threads = x_size * y_size * ncores;
    315290
    316     // get core coordinates and user index for the main thread
    317     get_core( &main_cxy , & main_lid );
    318     main_x   = HAL_X_FROM_CXY( main_cxy );
    319     main_y   = HAL_Y_FROM_CXY( main_cxy );
    320     main_uid = (((main_x * y_size) + main_y) * ncores) + main_lid;
     291    // compute covering DQT size and level
     292    unsigned int z = (x_size > y_size) ? x_size : y_size;
     293    unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4;
    321294
    322295    // checks number of threads
     
    326299         (total_threads != 512) && (total_threads != 1024) )
    327300    {
    328         printf("\n[sort error] number of cores must be power of 2\n");
     301        printf("\n[sort] ERROR : number of cores must be power of 2\n");
    329302        exit( 0 );
    330303    }
     
    333306    if ( ARRAY_LENGTH % total_threads)
    334307    {
    335         printf("\n[sort error] array size must be multiple of number of threads\n");
     308        printf("\n[sort] ERROR : array size must be multiple of number of threads\n");
    336309        exit( 0 );
    337310    }
     
    355328    if( error )
    356329    {
    357         printf("\n[sort error] cannot initialise barrier\n" );
     330        printf("\n[sort] ERROR : cannot initialise barrier\n" );
    358331        exit( 0 );
    359332    }
     
    370343    }
    371344
     345#if DISPLAY_ARRAY
     346    printf("\n*** array before sort\n");
     347    for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
     348#endif
     349
    372350#if DEBUG_MAIN
    373351printf("\n[sort] main completes array init\n");
    374352#endif
    375353
    376     // launch other threads to execute sort() function
    377     // on cores other than the core running the main thread
    378     for ( x = 0 ; x < x_size ; x++ )
    379     {
    380         for ( y = 0 ; y < y_size ; y++ )
    381         {
     354    // build array of arguments for the <sort> threads
     355    for (x = 0 ; x < x_size ; x++)
     356    {
     357        for (y = 0 ; y < y_size ; y++)
     358        {
     359            // compute cluster identifier
     360            cxy = HAL_CXY_FROM_XY( x , y );
     361
    382362            for ( lid = 0 ; lid < ncores ; lid++ )
    383363            {
    384                 // compute thread user index (continuous index)
    385                 thread_uid = (((x * y_size) + y) * ncores) + lid;
    386 
    387                 // set arguments for all threads
    388                 arg[thread_uid].threads      = total_threads;
    389                 arg[thread_uid].thread_uid   = thread_uid;
    390                 arg[thread_uid].main_uid     = main_uid;
    391 
    392                 // set thread attributes for all threads
    393                 attr[thread_uid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    394                 attr[thread_uid].cxy        = HAL_CXY_FROM_XY( x , y );
    395                 attr[thread_uid].lid        = lid;
    396 
    397                 if( thread_uid != main_uid )
    398                 {
    399                     if ( pthread_create( &trdid[thread_uid],  // buffer for kernel identifier
    400                                          &attr[thread_uid],   // thread attributes
    401                                          &sort,               // entry function
    402                                          &arg[thread_uid] ) ) // sort arguments
    403                     {
    404                         printf("\n[sort error] main cannot create thread %x \n", thread_uid );
    405                         exit( 0 );
    406                     }
    407 
    408 #if (DEBUG_MAIN & 1)
    409 printf("\n[sort] main created thread %x \n", thread_uid );
    410 #endif
    411                 }
     364                // compute thread continuous index
     365                tid = (((x * y_size) + y) * ncores) + lid;
     366
     367                // initialize 2D array of arguments
     368                sort_args[cxy][lid].tid            = tid;
     369                sort_args[cxy][lid].threads        = total_threads;
     370                sort_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
     371
     372                // initialize 2D array of pointers
     373                sort_ptrs[cxy][lid] = &sort_args[cxy][lid];
    412374            }
    413375        }
    414376    }
    415    
     377
    416378    ///////////////////////////
    417379    get_cycle( &seq_end_cycle );
     
    422384#endif
    423385
    424     // the main thread run also the sort() function
    425     sort( &arg[main_uid] );
    426 
    427     // wait other threads completion
    428     for ( x = 0 ; x < x_size ; x++ )
    429     {
    430         for ( y = 0 ; y < y_size ; y++ )
    431         {
    432             for ( lid = 0 ; lid < ncores ; lid++ )
    433             {
    434                 // compute thread continuous index
    435                 thread_uid = (((x * y_size) + y) * ncores) + lid;
    436 
    437                 if( thread_uid != main_uid )
    438                 {
    439                     if( pthread_join( trdid[thread_uid] , NULL ) )
    440                     {
    441                         printf("\n[fft error] in main thread %d joining thread %d\n",
    442                         main_uid , thread_uid );
    443                         exit( 0 );
    444                     }
    445                    
    446 #if (DEBUG_MAIN & 1)
    447 printf("\n[fft] main thread %d joined thread %d\n", main_uid, thread_uid );
    448 #endif
    449 
    450                 }
    451             }
    452         }
     386    // create and execute the working threads
     387    if( pthread_parallel_create( root_level,
     388                                 &sort,
     389                                 &sort_ptrs[0][0],
     390                                 &parent_barriers[0] ) )
     391    {
     392        printf("\n[sort] ERROR : cannot create threads\n");
     393        exit( 0 );
    453394    }
    454395
     
    456397    get_cycle( &para_end_cycle );
    457398
    458     printf("\n[sort] main completes parallel sort at cycle %d\n",
    459     (unsigned int)para_end_cycle );
     399#if DEBUG_main
     400printf("\n[sort] main completes parallel sort at cycle %d\n",
     401(unsigned int)para_end_cycle );
     402#endif
    460403
    461404    // destroy barrier
    462405    pthread_barrier_destroy( &barrier );
     406
     407#if DISPLAY_ARRAY
     408    printf("\n*** array after merge %d\n", i );
     409    for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , dst_array[n] );
     410#endif
    463411
    464412#if CHECK_RESULT
     
    492440    // build file name
    493441    if( USE_DQT_BARRIER )
    494     snprintf( name , 64 , "sort_dqt_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
     442    snprintf( name , 64 , "p_sort_dqt_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
    495443    else
    496     snprintf( name , 64 , "sort_smp_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
     444    snprintf( name , 64 , "p_sort_smp_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
    497445
    498446    // build file pathname
     
    515463    if( stream == NULL )
    516464    {
    517         printf("\n[sort error] cannot open instrumentation file <%s>\n", path );
     465        printf("\n[sort] ERROR : cannot open instrumentation file <%s>\n", path );
    518466        exit(0);
    519467    }
     
    532480    if( ret < 0 )
    533481    {
    534         printf("\n[sort error] cannot write to instrumentation file <%s>\n", path );
     482        printf("\n[sort] ERROR : cannot write to instrumentation file <%s>\n", path );
    535483        exit(0);
    536484    }
     
    548496    if( ret )
    549497    {
    550         printf("\n[sort error] cannot close instrumentation file <%s>\n", path );
     498        printf("\n[sort] ERROR : cannot close instrumentation file <%s>\n", path );
    551499        exit(0);
    552500    }
Note: See TracChangeset for help on using the changeset viewer.