Changeset 650 for trunk/libs/libalmosmkh


Ignore:
Timestamp:
Nov 14, 2019, 11:44:12 AM (5 years ago)
Author:
alain
Message:

Simplify the pthread_parallel_create() syscall.

Location:
trunk/libs/libalmosmkh
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/libs/libalmosmkh/almosmkh.c

    r647 r650  
    616616///////////////////////////////////////////////////////////////////////////////////////////
    617617// This static function initialises the store in the cluster identified by the <cxy>
    618 // arguments. It is called by the malloc() or remote_malloc when a specific store(x,y)
    619 // is accessed for the first time by a remote() or remote_malloc() request.
     618// arguments. It is called by the remote_malloc() function when a specific store(x,y)
     619// is accessed for the first time.
    620620// It uses the mmap( MAP_REMOTE ) syscall to allocate a new vseg mapped in cluster (cxy).
    621621///////////////////////////////////////////////////////////////////////////////////////////
     
    10991099#define X_MAX                   16              // max number of clusters in a row
    11001100#define Y_MAX                   16              // max number of clusters in a column
    1101 #define CLUSTERS_MAX            X_MAX * Y_MAX
    1102 #define LEVEL_MAX               5
     1101#define CLUSTERS_MAX            X_MAX * Y_MAX   // max number of clusters
     1102#define LEVEL_MAX               5               // max level of DQT
    11031103#define CORES_MAX               4               // max number of cores per cluster
    11041104
    1105 typedef struct build_args_s           
    1106 {
    1107     unsigned char       cxy;                    // this thread cluster identifier
    1108     unsigned char       level;                  // this thread level in quad-tree
    1109     unsigned char       parent_cxy;             // parent thread cluster identifier
    1110     unsigned char       root_level;             // quad-tree root level
    1111     void              * work_func;              // pointer on work function pointer
    1112     void              * work_args_array;        // pointer on 2D array of pointers
    1113     pthread_barrier_t * parent_barriers_array;  // pointer on 1D array of barriers
    1114     unsigned int        error;                  // return value : 0 if success
    1115 }
    1116 build_args_t;
    1117 
    11181105/////////////////////////////////////////////////////////////////////////////////////////
    1119 //      Global variables used for inter-thread communications
     1106//      Global variables
     1107//
     1108// WARNING :  arguments of the pthread_create() function MUST be global variables.
    11201109/////////////////////////////////////////////////////////////////////////////////////////
    11211110
    1122 pthread_attr_t    build_attr   [CLUSTERS_MAX][LEVEL_MAX];   // POSIX thread attributes
    1123 
    1124 build_args_t      build_args   [CLUSTERS_MAX][LEVEL_MAX];   // build function arguments
    1125 
    1126 pthread_barrier_t build_barrier[CLUSTERS_MAX][LEVEL_MAX];   // parent/child synchro
    1127 
    1128 pthread_attr_t    work_attr    [CLUSTERS_MAX][CORES_MAX];    // POSIX thread attributes
    1129 
    1130 //////////////////////////////////////////////////////////
    1131 static void pthread_recursive_build( build_args_t * args )
    1132 {
    1133     unsigned int   trdid;         // unused (required by pthread_create()
     1111// 2D array of <build> threads attributes / indexed by [cid][level]
     1112__attribute__((aligned(4096)))
     1113pthread_attr_t                pthread_build_attr[CLUSTERS_MAX][LEVEL_MAX];
     1114
     1115// 2D array of <build> threads arguments / indexed by [cid][level]
     1116__attribute__((aligned(4096)))
     1117pthread_parallel_build_args_t pthread_build_args[CLUSTERS_MAX][LEVEL_MAX];
     1118
     1119// 1D array of <work> threads attributes / indexed by [tid]
     1120__attribute__((aligned(4096)))
     1121pthread_attr_t                pthread_work_attr[CLUSTERS_MAX * CORES_MAX];
     1122
     1123// 1D array of <work> threads arguments / indexed by [tid]
     1124__attribute__((aligned(4096)))
     1125pthread_parallel_work_args_t  pthread_work_args[CLUSTERS_MAX * CORES_MAX];
     1126
     1127// kernel thread identifier / unused, but required by pthread_create()
     1128__attribute__((aligned(4096)))
     1129pthread_t                     trdid;
     1130
     1131///////////////////////////////////////////////////////////////////////////
     1132static void pthread_recursive_build( pthread_parallel_build_args_t * args )
     1133{
    11341134
    11351135    // get arguments
    1136     unsigned int        cxy                   = args->cxy;
    1137     unsigned int        level                 = args->level;
    1138     unsigned int        parent_cxy            = args->parent_cxy;
    1139     unsigned int        root_level            = args->root_level;
    1140     void              * work_func             = args->work_func;
    1141     void              * work_args_array       = args->work_args_array;
    1142     pthread_barrier_t * parent_barriers_array = args->parent_barriers_array;
    1143 
    1144     // set error default value
    1145     build_args[cxy][level].error = 0;
     1136    unsigned int         cid               = args->cid;
     1137    unsigned int         level             = args->level;
     1138    unsigned int         parent_cid        = args->parent_cid;
     1139    pthread_barrier_t  * parent_barrier    = args->parent_barrier;
     1140    unsigned int         root_level        = args->root_level;
     1141    void               * work_func         = args->work_func;
     1142    unsigned int         x_size            = args->x_size;
     1143    unsigned int         y_size            = args->y_size;
     1144    unsigned int         ncores            = args->ncores;
     1145
     1146#if DEBUG_PTHREAD_PARALLEL
     1147printf("\n[%s] <build> thread[%d][%d] enters / parent_cid %d / work_func %x\n",
     1148__FUNCTION__, cid , level , parent_cid , work_func );
     1149#endif
     1150
     1151    // set error default value in pthread_build_args[cid][level]
     1152    pthread_build_args[cid][level].error = 0;
     1153
     1154    // get cxy from cid
     1155    unsigned int cxy = HAL_CXY_FROM_XY( cid / y_size , cid % y_size );
     1156
     1157    // allocate the parent/child barrier in local cluster
     1158    pthread_barrier_t * barrier = (pthread_barrier_t *)malloc( sizeof(pthread_barrier_t) );
     1159
     1160    if( barrier == NULL )
     1161    {
     1162        printf("\n[ERROR] in %s : cannot allocate barrier for <build> thread[%d][%d]\n",
     1163        __FUNCTION__ , cid , level );
     1164
     1165        // report error to parent
     1166        pthread_build_args[parent_cid][level+1].error = 1;
     1167    }
    11461168
    11471169    ///////////////////////////////////////////////////////////
    1148     if( level == 0 )             // children are "work" threads
    1149     {
    1150         unsigned int   lid;           // core local index
    1151         unsigned int   ncores;        // number of cores in a cluster
    1152 
    1153         // get number of cores per cluster
    1154         get_nb_cores( cxy , &ncores );
    1155 
    1156         // kill process if no active core in cluster
    1157         // TODO this "if" should be replaced by an "assert" [AG]
    1158         if( ncores == 0 )
    1159         {
    1160             printf("\n[PANIC] in %s : no active core in cluster %x\n",
    1161             __FUNCTION__ , cxy );
     1170    if( level == 0 )             // children are <work> threads
     1171    {
     1172
     1173        // check number of cores in local cluster
     1174        unsigned int actual_ncores;
     1175        get_nb_cores( cxy , &actual_ncores );
     1176
     1177        if( actual_ncores != ncores )
     1178        {
     1179            printf("\n[ERROR] in %s : actual_ncores (%d) in cluster %x\n",
     1180            __FUNCTION__ , actual_ncores, cxy );
    11621181
    11631182            // report error to parent
    1164             build_args[parent_cxy][level+1].error = 1;
    1165 
    1166             // kill process
    1167             exit( EXIT_FAILURE );
    1168         }
    1169 
    1170         // initialize the parent_barrier
    1171         if( pthread_barrier_init( &parent_barriers_array[cxy] , NULL , ncores + 1 ) )
    1172         {
    1173             printf("\n[ERROR] in %s : cannot initialise barrier for build thread[%x][%d]\n",
    1174             __FUNCTION__ , cxy , level );
     1183            pthread_build_args[parent_cid][level+1].error = 1;
     1184        }
     1185
     1186        // initializes barrier for (ncores + 1) in flat mode
     1187        if( pthread_barrier_init( barrier , NULL , ncores + 1 ) )
     1188        {
     1189            printf("\n[ERROR] in %s : cannot init barrier for <build> thread[%d][%d]\n",
     1190            __FUNCTION__ , cid , level );
    11751191
    11761192            // report error to parent
    1177             build_args[parent_cxy][level+1].error = 1;
     1193            pthread_build_args[parent_cid][level+1].error = 1;
    11781194        }
    11791195
    11801196#if DEBUG_PTHREAD_PARALLEL
    1181 printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n",
    1182 __FUNCTION__, cxy, level, ncores + 1 );
    1183 #endif
    1184         // create (ncores) "work" threads
     1197printf("\n[%s] <build> thread[%d][%d] initialized barrier / %d children\n",
     1198__FUNCTION__, cid, level, ncores );
     1199#endif
     1200        unsigned int   lid;     // core local index for <work> thread
     1201        unsigned int   tid;     // <work> thread continuous index
     1202
     1203        // <build> thread creates ncores <work> threads
    11851204        for ( lid = 0 ; lid < ncores ; lid++ )
    11861205        {
    1187             // set attributes for thread[cxy][lid]
    1188             work_attr[cxy][lid].attributes = PT_ATTR_DETACH |
    1189                                              PT_ATTR_CLUSTER_DEFINED |
    1190                                              PT_ATTR_CORE_DEFINED;
    1191             work_attr[cxy][lid].cxy        = cxy;
    1192             work_attr[cxy][lid].lid        = lid;
    1193 
    1194             // compute pointer on thread[cxy][lid] arguments
    1195             void * work_args = *((void **)work_args_array + (cxy * CORES_MAX) + lid);
    1196 
    1197             // create thread
     1206            // compute work thread tid
     1207            tid = (cid * ncores) + lid;
     1208
     1209            // set attributes for <work> thread[tid]
     1210            pthread_work_attr[tid].attributes = PT_ATTR_DETACH |
     1211                                                PT_ATTR_CLUSTER_DEFINED |
     1212                                                PT_ATTR_CORE_DEFINED;
     1213            pthread_work_attr[tid].cxy        = cxy;
     1214            pthread_work_attr[tid].lid        = lid;
     1215
     1216            // set tid and barrier arguments for <work> thread[tid]
     1217            pthread_work_args[tid].tid     = tid;
     1218            pthread_work_args[tid].barrier = barrier;
     1219
     1220            // create <work> thread
    11981221            if ( pthread_create( &trdid,                  // unused
    1199                                  &work_attr[cxy][lid],
     1222                                 &pthread_work_attr[tid],
    12001223                                 work_func,
    1201                                  work_args ) )
     1224                                 &pthread_work_args[tid] ) )
    12021225            {
    1203                 printf("\n[ERROR] in %s : cannot create work thread[%x,%x]\n",
    1204                 __FUNCTION__ , cxy , lid );
     1226                printf("\n[ERROR] in %s : <build> thread[%d][%d] cannot create <work> thread[%d]\n",
     1227                __FUNCTION__ , cid , level , tid );
    12051228
    12061229                // report error to parent
    1207                 build_args[parent_cxy][level+1].error = 1;
     1230                pthread_build_args[parent_cid][level+1].error = 1;
    12081231            }
    12091232
    12101233#if DEBUG_PTHREAD_PARALLEL
    1211 printf("\n[%s] <build> thread[%x][%d] created <work> thread[%x][%d]\n",
    1212 __FUNCTION__, cxy, level, cxy, lid );
    1213 #endif
    1214         }
    1215 
    1216         // wait on barrier until "work" children threads completed
    1217         if( pthread_barrier_wait( &parent_barriers_array[cxy] ) )
    1218         {
    1219             printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",
    1220             __FUNCTION__ , cxy , level );
     1234printf("\n[%s] <build> thread[%d][%d] created <work> thread[%d]\n",
     1235__FUNCTION__, cid, level, tid );
     1236#endif
     1237        }
     1238
     1239        // wait on barrier until all <work> children threads completed
     1240        if( pthread_barrier_wait( barrier ) )
     1241        {
     1242            printf("\n[ERROR] in %s / barrier for <build> thread[%x][%d]\n",
     1243            __FUNCTION__ , cid , level );
    12211244
    12221245            // report error to parent
    1223             build_args[parent_cxy][level+1].error = 1;
     1246            pthread_build_args[parent_cid][level+1].error = 1;
    12241247        }
    12251248
    12261249#if DEBUG_PTHREAD_PARALLEL
    1227 printf("\n[%s] <build> thread[%x][%d] resume after children completion\n",
    1228 __FUNCTION__, cxy, level );
     1250printf("\n[%s] <build> thread[%d][%d] resume after children completion\n",
     1251__FUNCTION__ , cid , level );
    12291252#endif
    12301253
     
    12341257    else                        // children are "build" threads
    12351258    {
    1236         // the 4 children threads can be created in any core of each quarters
    1237         // of the parent macro-cluster
     1259        // the 4 children threads can be linked to any core in each
     1260        // sub-macro-cluster[i][j] with [ij] in {00,01,10,11}
    12381261
    12391262        unsigned int parent_x;          // X coordinate of parent macro-cluster
     
    12411264        unsigned int child_x;           // X coordinate of child macro-cluster
    12421265        unsigned int child_y;           // Y coordinate of child macro-cluster
    1243         unsigned int child_cxy[2][2];   // selected cluster for child thread
    1244         unsigned int child_lid[2][2];   // selected core index for child thread
    1245         int          child_sts[2][2];   // -1 if error / 0 if success / +1 if not found
    1246         unsigned int x;                 // X loop index for children
    1247         unsigned int y;                 // Y loop index for children
    1248        
    1249         unsigned int nb_children = 0;
     1266        unsigned int child_cid[2][2];   // selected cluster cid for child[i][j]
     1267        unsigned int child_cxy[2][2];   // selected cluster cxy for child[i][j]
     1268        unsigned int child_lid[2][2];   // selected core index  for child[i][j]
     1269        int          child_sts[2][2];   // -1 if error / 0 if success / +1 if no core
     1270        unsigned int i;                 // loop index for children
     1271        unsigned int j;                 // loop index for children
     1272
     1273        unsigned int nb_children = 0;   // actual number of children (can be < 4)
    12501274
    12511275        // get parent macro-cluster mask and half-size from level
     
    12571281        parent_y = HAL_Y_FROM_CXY( cxy ) & ~mask;
    12581282
    1259         // get child_cxy and child_lid for up to 4 children threads : 00 / 01 / 10 / 11
    1260         for (x = 0 ; x < 2 ; x++)
     1283        // First step : select core for each child thread
     1284        for (i = 0 ; i < 2 ; i++)
    12611285        {
    12621286            // compute child macro-cluster X coordinate
    1263             child_x = (x == 0) ? parent_x : (parent_x + half);
    1264 
    1265             for (y = 0 ; y < 2 ; y++)
     1287            child_x = (i == 0) ? parent_x : (parent_x + half);
     1288
     1289            for (j = 0 ; j < 2 ; j++)
    12661290            {
    12671291                // compute child macro-cluster Y coordinate
    1268                 child_y = (y == 0) ? parent_y : (parent_y + half);
     1292                child_y = (j == 0) ? parent_y : (parent_y + half);
    12691293
    12701294                // select the best core in macro-cluster
    1271                 child_sts[x][y] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ),
     1295                unsigned int best_cxy;
     1296                unsigned int best_lid;
     1297
     1298                child_sts[i][j] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ),
    12721299                                                 level-1,
    1273                                                  &child_cxy[x][y],
    1274                                                  &child_lid[x][y] );
    1275 
    1276                 if( child_sts[x][y] < 0 )  // failure => report error
     1300                                                 &best_cxy,
     1301                                                 &best_lid );
     1302
     1303                if( child_sts[i][j] < 0 )  // failure => report error
    12771304                {
    1278                     printf("\n[ERROR] in %s : illegal arguments for <build> thread[%x,%x]\n",
    1279                     __FUNCTION__ , cxy , level );
     1305                    printf("\n[ERROR] in %s select core for child[%d,%d] of <build> thread[%d,%d]\n",
     1306                    __FUNCTION__ , i , j , cid , level );
    12801307
    12811308                    // report error to parent
    1282                     build_args[parent_cxy][level+1].error = 1;
     1309                    pthread_build_args[parent_cid][level+1].error = 1;
    12831310                }
    1284                 else if (child_sts[x][y] > 0 )  // macro-cluster undefined => does nothing
     1311                else if (child_sts[i][j] > 0 )  // macro-cluster empty => does nothing
    12851312                {
    12861313                }
    12871314                else                            // core found
    12881315                {
     1316                    child_cxy[i][j] = best_cxy;
     1317                    child_lid[i][j] = best_lid;
     1318                    child_cid[i][j] = (HAL_X_FROM_CXY(best_cxy) * y_size) + HAL_Y_FROM_CXY( best_cxy);
    12891319                    nb_children++;
     1320
     1321#if DEBUG_PTHREAD_PARALLEL
     1322printf("\n[%s] <build> thread[%d][%d] select core[%x][%d] for child[%d][%d]\n",
     1323__FUNCTION__ , cid , level , best_cxy , best_lid , i , j );
     1324#endif
     1325
    12901326                }
    1291             }  // end for y
    1292         }  // end for x
    1293 
    1294         // kill process if no active core in cluster
    1295         // TODO this "if" should be replaced by an "assert" [AG]
    1296         if( nb_children == 0 )
    1297         {
    1298             printf("\n[PANIC] in %s : no active core in macro cluster [%x,%d]\n",
    1299             __FUNCTION__ , cxy , level );
     1327            }  // end for j
     1328        }  // end for i
     1329
     1330        // second step : initialize barrier for (nb_children + 1) in flat mode
     1331        if( pthread_barrier_init( barrier , NULL , nb_children + 1 ) )
     1332        {
     1333            printf("\n[ERROR] in %s : cannot init barrier for <build> thread[%d][%d]\n",
     1334            __FUNCTION__ , cid , level );
    13001335
    13011336            // report error to parent
    1302             build_args[parent_cxy][level+1].error = 1;
    1303 
    1304             // kill process
    1305             exit( EXIT_FAILURE );
    1306         }
    1307 
    1308         // initialize the barrier for (nb_children + 1)
    1309         if( pthread_barrier_init( &build_barrier[cxy][level], NULL , nb_children + 1 ) )
    1310         {
    1311             printf("\n[error] in %s : cannot initialise barrier for build thread[%x][%d]\n",
    1312             __FUNCTION__ , cxy , level );
    1313 
    1314             // report error to parent
    1315             build_args[parent_cxy][level+1].error = 1;
     1337            pthread_build_args[parent_cid][level+1].error = 1;
    13161338        }
    13171339
    13181340#if DEBUG_PTHREAD_PARALLEL
    1319 printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n",
    1320 __FUNCTION__, cxy, level, nb_children + 1 );
    1321 #endif
    1322         // create 1 to 4 children threads
    1323         for (x = 0 ; x < 2 ; x++)
    1324         {
    1325             for (y = 0 ; y < 2 ; y++)
     1341printf("\n[%s] <build> thread[%d][%d] initialized barrier / %d children\n",
     1342__FUNCTION__, cid, level, nb_children );
     1343#endif
     1344
     1345        // Third step : actually create the children threads
     1346        for (i = 0 ; i < 2 ; i++)
     1347        {
     1348            for (j = 0 ; j < 2 ; j++)
    13261349            {
    13271350                // thread is created only if macro-cluster is active
    1328                 if( child_sts[x][y] == 0 )
     1351                if( child_sts[i][j] == 0 )
    13291352                {
    1330                     unsigned int tgt_cxy = child_cxy[x][y];
    1331                     unsigned int tgt_lid = child_lid[x][y];
     1353                    unsigned int tgt_cid = child_cid[i][j];
     1354                    unsigned int tgt_lid = child_lid[i][j];
     1355                    unsigned int tgt_cxy = child_cxy[i][j];
    13321356
    13331357                    // set child thread attributes
    1334                     build_attr[tgt_cxy][level-1].attributes = PT_ATTR_DETACH |
    1335                                                               PT_ATTR_CLUSTER_DEFINED |
    1336                                                               PT_ATTR_CORE_DEFINED;
    1337                     build_attr[tgt_cxy][level-1].cxy        = tgt_cxy;
    1338                     build_attr[tgt_cxy][level-1].lid        = tgt_lid;
    1339 
    1340                     // propagate build function arguments
    1341                     build_args[tgt_cxy][level-1].cxy                   = child_cxy[x][y];
    1342                     build_args[tgt_cxy][level-1].level                 = level-1;
    1343                     build_args[tgt_cxy][level-1].parent_cxy            = cxy;
    1344                     build_args[tgt_cxy][level-1].root_level            = root_level;
    1345                     build_args[tgt_cxy][level-1].work_func             = work_func;
    1346                     build_args[tgt_cxy][level-1].work_args_array       = work_args_array;
    1347                     build_args[tgt_cxy][level-1].parent_barriers_array = parent_barriers_array;
     1358                    pthread_build_attr[tgt_cid][level-1].attributes = PT_ATTR_DETACH |
     1359                                                                      PT_ATTR_CLUSTER_DEFINED |
     1360                                                                      PT_ATTR_CORE_DEFINED;
     1361                    pthread_build_attr[tgt_cid][level-1].cxy        = tgt_cxy;
     1362                    pthread_build_attr[tgt_cid][level-1].lid        = tgt_lid;
     1363
     1364                    // propagate build function arguments from parent to child
     1365                    pthread_build_args[tgt_cid][level-1].cid            = tgt_cid;
     1366                    pthread_build_args[tgt_cid][level-1].level          = level-1;
     1367                    pthread_build_args[tgt_cid][level-1].parent_cid     = cid;
     1368                    pthread_build_args[tgt_cid][level-1].parent_barrier = barrier;
     1369                    pthread_build_args[tgt_cid][level-1].root_level     = root_level;
     1370                    pthread_build_args[tgt_cid][level-1].work_func      = work_func;
     1371                    pthread_build_args[tgt_cid][level-1].x_size         = x_size;
     1372                    pthread_build_args[tgt_cid][level-1].y_size         = y_size;
     1373                    pthread_build_args[tgt_cid][level-1].ncores         = ncores;
    13481374                   
    13491375                    // create thread
    13501376                    if( pthread_create( &trdid,                         
    1351                                         &build_attr[tgt_cxy][level-1],   
     1377                                        &pthread_build_attr[tgt_cid][level-1],   
    13521378                                        &pthread_recursive_build,                         
    1353                                         &build_args[tgt_cxy][level-1] ) )
     1379                                        &pthread_build_args[tgt_cid][level-1] ) )
    13541380                    {
    1355                         printf("\n[ERROR] in %s : cannot create build thread[%x][%d]\n",
    1356                         __FUNCTION__ , child_cxy , level -1 );
     1381                        printf("\n[ERROR] in %s : cannot create <build> thread[%x][%d]\n",
     1382                        __FUNCTION__ , child_cid , level -1 );
    13571383
    13581384                        // report error to parent
    1359                         build_args[parent_cxy][level+1].error = 1;
     1385                        pthread_build_args[parent_cid][level+1].error = 1;
    13601386                    }
    13611387
    13621388#if DEBUG_PTHREAD_PARALLEL
    1363 printf("\n[%s] <build> thread[%x][%d] created <build> thread[%x][%d] on core[%x,%d]\n",
    1364 __FUNCTION__, cxy, level, tgt_cxy, level - 1, tgt_cxy, tgt_lid );
     1389printf("\n[%s] <build> thread[%d][%d] created <build> thread[%d][%d] on core[%x,%d]\n",
     1390__FUNCTION__, cid, level, tgt_cid, (level - 1), tgt_cxy, tgt_lid );
    13651391#endif
    13661392                }  // end if child_sts[i][j]
     
    13681394        }  // end for i
    13691395       
    1370         // wait on barrier until "build" children threads completed
    1371         if( pthread_barrier_wait( &build_barrier[cxy][level] ) )
    1372         {
    1373             printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",
    1374             __FUNCTION__ , cxy , level );
     1396        // wait on barrier until all <build> children threads completed
     1397        if( pthread_barrier_wait( barrier ) )
     1398        {
     1399            printf("\n[ERROR] in %s / barrier for <build> thread[%d][%d]\n",
     1400            __FUNCTION__ , cid , level );
    13751401
    13761402            // report error to parent
    1377             build_args[parent_cxy][level+1].error = 1;
     1403            pthread_build_args[parent_cid][level+1].error = 1;
    13781404        }
    13791405
    13801406#if DEBUG_PTHREAD_PARALLEL
    13811407printf("\n[%s] <build> thread[%x][%d] resume after children completion\n",
    1382 __FUNCTION__, cxy, level );
     1408__FUNCTION__, cid, level );
    13831409#endif
    13841410
     
    13861412
    13871413    // report error to parent when required
    1388     if( build_args[cxy][level].error )
    1389     {
    1390         build_args[parent_cxy][level+1].error = 1;
    1391     }
    1392 
    1393     // all <build> threads - but the root -
    1394     // signal completion to parent thread and exit
     1414    if( pthread_build_args[cid][level].error )
     1415    {
     1416        pthread_build_args[parent_cid][level+1].error = 1;
     1417    }
     1418
     1419    // all <build> threads - but the root - signal completion to parent thread and exit
    13951420    if( level < root_level )
    13961421    {
    1397         if( pthread_barrier_wait( &build_barrier[parent_cxy][level+1] ) )
    1398         {
    1399             printf("\n[ERROR] in %s / second barrier for <build> thread[%x][%d]\n",
    1400             __FUNCTION__ , cxy , level );
     1422        if( pthread_barrier_wait( parent_barrier ) )
     1423        {
     1424            printf("\n[ERROR] in %s / parent barrier for <build> thread[%d][%d]\n",
     1425            __FUNCTION__ , cid , level );
    14011426
    14021427            // report error to parent
    1403             build_args[parent_cxy][level+1].error = 1;
     1428            pthread_build_args[parent_cid][level+1].error = 1;
    14041429        }
    14051430   
    14061431#if DEBUG_PTHREAD_PARALLEL
    14071432printf("\n[%s] <build> thread[%x][%d] exit\n",
    1408 __FUNCTION__, cxy , level );
    1409 #endif
    1410         // "build" thread exit
     1433__FUNCTION__, cid , level );
     1434#endif
     1435        // <build> thread exit
    14111436        pthread_exit( NULL );
    14121437    }
    14131438}  // end pthread_recursive_build()
    14141439
    1415 ///////////////////////////////////////////////////////
    1416 int pthread_parallel_create( unsigned int   root_level,
    1417                              void         * work_func,
    1418                              void         * work_args_array,
    1419                              void         * parent_barriers_array )
    1420 {
     1440
     1441//////////////////////////////////////////////////////
     1442int pthread_parallel_create( unsigned int  root_level,
     1443                             void        * work_func )
     1444{
     1445
     1446#if DEBUG_PTHREAD_PARALLEL
     1447printf("\n[%s] enter / root_level %d / func %x\n",
     1448__FUNCTION__, root_level, work_func );
     1449#endif
     1450
     1451    // get platform parameters
     1452    unsigned int   x_size;
     1453    unsigned int   y_size;
     1454    unsigned int   ncores;
     1455    get_config( &x_size , &y_size , &ncores );
     1456
     1457    // get calling thread cluster identifier
    14211458    unsigned int   root_cxy;
    14221459    unsigned int   root_lid;    // unused, but required by get_core_id()
     1460    get_core_id( &root_cxy , &root_lid );
     1461
     1462    // get calling thread continuous index
     1463    unsigned int x        = HAL_X_FROM_CXY( root_cxy );
     1464    unsigned int y        = HAL_Y_FROM_CXY( root_cxy );
     1465    unsigned int root_cid = (y_size * x) + y;
     1466
     1467    // set the build function arguments for the root <build> thread
     1468    pthread_build_args[root_cid][root_level].cid               = root_cid;
     1469    pthread_build_args[root_cid][root_level].level             = root_level;
     1470    pthread_build_args[root_cid][root_level].parent_cid        = -1;
     1471    pthread_build_args[root_cid][root_level].parent_barrier    = NULL;
     1472    pthread_build_args[root_cid][root_level].root_level        = root_level;
     1473    pthread_build_args[root_cid][root_level].work_func         = work_func;
     1474    pthread_build_args[root_cid][root_level].x_size            = x_size;
     1475    pthread_build_args[root_cid][root_level].y_size            = y_size;
     1476    pthread_build_args[root_cid][root_level].ncores            = ncores;
    14231477   
    1424 #if DEBUG_PTHREAD_PARALLEL
    1425 printf("\n[%s] enter / root_level %d / func %x / args %x / barriers %x\n",
    1426 __FUNCTION__, root_level, work_func, work_args_array, parent_barriers_array );
    1427 #endif
    1428 
    1429     // get calling thread cluster
    1430     get_core_id( &root_cxy , &root_lid );
    1431 
    1432     // set the build function arguments for the root <build> thread
    1433     build_args[root_cxy][root_level].cxy                   = root_cxy;
    1434     build_args[root_cxy][root_level].level                 = root_level;
    1435     build_args[root_cxy][root_level].root_level            = root_level;
    1436     build_args[root_cxy][root_level].work_func             = work_func;
    1437     build_args[root_cxy][root_level].work_args_array       = work_args_array;
    1438     build_args[root_cxy][root_level].parent_barriers_array = parent_barriers_array;
    1439    
    1440     // call the recursive build function
    1441     pthread_recursive_build( &build_args[root_cxy][root_level] );
    1442 
    1443     // check error
    1444     if( build_args[root_cxy][root_level].error )
     1478    // call the recursive function
     1479    pthread_recursive_build( &pthread_build_args[root_cid][root_level] );
     1480
     1481    // check error when execution completes
     1482    if( pthread_build_args[root_cid][root_level].error )
    14451483    {
    14461484        printf("\n[error] in  %s\n", __FUNCTION__ );
  • trunk/libs/libalmosmkh/almosmkh.h

    r647 r650  
    101101 * @ level    : [in]  macro-cluster level in [1,2,3,4,5].
    102102 * @ cxy      : [out] selected core cluster identifier.
    103  * @ lid      : [out] selectod core local index.
     103 * @ lid      : [out] selected core local index.
    104104 * @ return 0 if success / 1 if no core in macro-cluster / -1 if illegal arguments.
    105105 **************************************************************************************/
     
    415415 * This function releases the memory buffer identified by the <ptr> argument,
    416416 * to the store identified by the <cxy> argument.
    417  * It displays an error message, but does nothing if the ptr is illegal.
     417 * It does nothing, but displays an error message, if the ptr is illegal.
    418418 *****************************************************************************************
    419419 * @ ptr   : pointer on the released buffer.
     
    456456
    457457//////////////////////////////////////////////////////////////////////////////////////////
    458 // This system call can be used to parallelize the creation and the termination
    459 // of a parallel multi-threaded application. It removes the loop in the main thread that
    460 // creates the N working threads (N  sequencial pthread_create() ). It also removes the
    461 // loop that waits completion of these N working threads (N sequencial pthread_join() ).
    462 // It creates one "work" thread (in detached mode) per core in the target architecture.
    463 // Each "work" thread is identified by the [cxy][lid] indexes (cluster / local core).
    464 // The pthread_parallel_create() function returns only when all "work" threads completed
     458// This syscall can be used to parallelize the creation, and the termination
     459// of a parallel multi-threaded application.
     460// It removes in the main thread the sequential loop that creates the N working threads
     461// (N pthread_create() ), and also removes the sequential loop that waits completion
     462// of these N working threads (N pthread_join() ).
     463// It creates one <work> thread (in detached mode) per core in the target architecture.
     464// Each <work> thread is identified by a continuous [tid] index.
     465// For a regular architecture, defined by the [x_size , y_size , ncores] parameters,
     466// the number of working threads can be simply computed as (x_size * y_size * ncores),
     467// and the coordinates[x,y,lid] of the core running the thread[tid] can be directly
     468// derived from the [tid] value with the following relations:
     469//     . cid = (x * y_size) + y
     470//     . tid = (cid * ncores ) + lid
     471//     . lid = tid % ncores
     472//     . cid = tid / ncores
     473//     . y   = cid % y_size
     474//     . x   = cid / y_size
     475// The pthread_parallel_create() function returns only when all <work> threads completed
    465476// (successfully or not).
    466477//
    467 // To use this system call, the application code must define the following structures:
    468 // - To define the arguments to pass to the <work> function the application must allocate
    469 //   and initialize a first 2D array, indexed by [cxy] and [lid] indexes, where each slot
    470 //   contains an application specific structure, and another 2D array, indexed by the same
    471 //   indexes, containing pointers on these structures. This array of pointers is one
    472 //   argument of the pthread_parallel_create() function.
    473 // - To detect the completion of the <work> threads, the application must allocate a 1D
    474 //   array, indexed by the cluster index [cxy], where each slot contains a pthread_barrier
    475 //   descriptor. This barrier is initialised by the pthread_parallel_create() function,
    476 //   in all cluster containing at least one work thread. This array of barriers is another
    477 //   argument of the pthread_parallel_create() function.
     478// WARNING : The function executed by the working thread is application specific,
     479// but the structure defining the arguments passed to this function is imposed.
     480// The "pthread_parallel_work_args_t" structure is defined below, and contains
     481// two fields: the tid value, and a pointer on a pthread_barrier_t.
     482// This barrier must be used by each working thread to signal completion before exit.
     483// The global variables implementing these structures for each working thread
     484// are allocated and initialised by the pthread_parallel_create() function.
    478485//
    479 // Implementation note:
    480 // To parallelize the "work" threads creation and termination, the pthread_parallel_create()
    481 // function creates a distributed quad-tree (DQT) of "build" threads covering all cores
    482 // required to execute the parallel application.
     486// Implementation note: the pthread_parallel_create() function creates a distributed
     487// quad-tree (DQT) of <build> threads covering all cores required to execute the parallel
     488// application. This quad tree is entirely defined by the root_level parameter.
    483489// Depending on the hardware topology, this DQT can be truncated, (i.e. some
    484490// parent nodes can have fewer than 4 children), if (x_size != y_size), or if one size
    485 // is not a power of 2. Each "build" thread is identified by two indexes [cxy][level].
    486 // Each "build" thread makes the following tasks:
     491// is not a power of 2. Each <build> thread is identified by two indexes [cid][level].
     492// Each <build> thread makes the following tasks:
    487493// 1) It calls the pthread_create() function to create up to 4 children threads, that
    488 //    are are "work" threads when (level == 0), or "build" threads, when (level > 0).
    489 // 2) It initializes the barrier (global variable), used to block/unblock
    490 //    the parent thread until children completion.
     494//    are <work> threads when (level == 0), or <build> threads, when (level > 0).
     495// 2) It allocates and initializes the barrier, used to block the parent thread until
     496//    children completion.
    491497// 3) It calls the pthread_barrier_wait( self ) to wait until all children threads
    492498//    completed (successfully or not).
     
    495501
    496502/*****************************************************************************************
    497  * This blocking function creates N working threads that execute the code defined
    498  * by the <work_func> and <work_args> arguments, and returns only when all working
    499  * threads completed.
    500  * The number N of created threads is entirely defined by the <root_level> argument.
    501  * This value defines an abstract quad-tree, with a square base : level in [0,1,2,3,4],
     503 *    structure defining the arguments for the <build> thread function
     504 ****************************************************************************************/
     505typedef struct pthread_parallel_build_args_s           
     506{
     507    unsigned char        cid;                    // this <build> thread continuous cluster index : (x * y_size) + y
     508    unsigned char        level;                  // this <build> thread level in quad-tree
     509    unsigned char        parent_cid;             // parent <build> thread cluster index / NOTE(review): the root is set to -1, stored as 255 in an unsigned char - confirm 255 is never a valid cid
     510    pthread_barrier_t  * parent_barrier;         // pointer on parent <build> thread barrier (NULL for the root <build> thread)
     511    unsigned char        root_level;             // quad-tree root level
     512    void               * work_func;              // pointer on working thread function
     513    unsigned int         x_size;                 // platform global parameter (as returned by get_config)
     514    unsigned int         y_size;                 // platform global parameter (as returned by get_config)
     515    unsigned int         ncores;                 // platform global parameter (as returned by get_config)
     516    unsigned int         error;                  // return value : 0 if success (checked by pthread_parallel_create() when the build completes)
     517}
     518pthread_parallel_build_args_t;
     519
     520/*****************************************************************************************
     521 *    structure defining the arguments for the <work> thread function
     522 ****************************************************************************************/
     523typedef struct pthread_parallel_work_args_s           
     524{
     525    unsigned int         tid;                    // continuous index identifying this <work> thread
     526    pthread_barrier_t  * barrier;                // barrier the <work> thread must use to signal completion before exit
     527}
     528pthread_parallel_work_args_t;           
     529
     530/*****************************************************************************************
     531 * This blocking function creates N working threads identified by the [tid] continuous
     532 * index, that execute the code defined by the <work_func> argument, and returns only
     533 * when all working threads completed.
     534 * The number N of created threads is entirely defined by the <root_level> argument,
     535 * that defines an abstract quad-tree, with a square base : root_level in [0,1,2,3,4],
    502536 * side in [1,2,4,8,16], nclusters in [1,4,16,64,256]. This base is called  macro_cluster.
    503  * A working thread is created on all cores contained in the specified macro-cluster.
     537 * A working thread is created on all cores contained in this abstract macro-cluster.
    504538 * The actual number of physical clusters containing cores can be smaller than the number
    505  * of clusters covered by the quad tree. The actual number of cores in a cluster can be
    506  * less than the max value.
    507  *
    508  * In the current implementation, all threads execute the same <work_func> function,
    509  * on different arguments, that are specified as a 2D array of pointers <work_args>.
    510  * This can be modified in a future version, where the <work_func> argument can become
    511  * a 2D array of pointers, to have one specific function for each thread.
     539 * of clusters covered by the abstract quad tree.
     540 * All threads execute the same <work_func> function, on different arguments, that are
     541 * specified as an array of structures pthread_parallel_work_args_t, allocated and
     542 * initialised by this function.
    512543 *****************************************************************************************
    513544 * @ root_level            : [in]  DQT root level in [0,1,2,3,4].
    514545 * @ work_func             : [in]  pointer on start function.
    515  * @ work_args_array       : [in]  pointer on a 2D array of pointers.
    516  * @ parent_barriers_array : [in]  pointer on a 1D array of barriers.
    517546 * @ return 0 if success / return -1 if failure.
    518547 ****************************************************************************************/
    519 int pthread_parallel_create( unsigned int   root_level,
    520                              void         * work_func,
    521                              void         * work_args_array,
    522                              void         * parent_barriers_array );
     548int pthread_parallel_create( unsigned int         root_level,
     549                             void               * work_func );
     550
     551
     552
     553
    523554
    524555/********* Non standard (ALMOS-MKH specific) Frame Buffer access syscalls   *************/
Note: See TracChangeset for help on using the changeset viewer.