Changeset 650 for trunk/libs/libalmosmkh
- Timestamp:
- Nov 14, 2019, 11:44:12 AM (5 years ago)
- Location:
- trunk/libs/libalmosmkh
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/libs/libalmosmkh/almosmkh.c
r647 r650 616 616 ////////////////////////////////////////////////////////////////////i////////////////////// 617 617 // This static function initialises the store in the cluster identified by the <cxy> 618 // arguments. It is called by the malloc() or remote_mallocwhen a specific store(x,y)619 // is accessed for the first time by a remote() or remote_malloc() request.618 // arguments. It is called by the remote_malloc() function when a specific store(x,y) 619 // is accessed for the first time. 620 620 // It uses the mmap( MAP_REMOTE ) syscall to allocate a new vseg mapped in cluster (cxy). 621 621 ////////////////////////////////////////////////////////////////////i////////////////////// … … 1099 1099 #define X_MAX 16 // max number of clusters in a row 1100 1100 #define Y_MAX 16 // max number of clusters in a column 1101 #define CLUSTERS_MAX X_MAX * Y_MAX 1102 #define LEVEL_MAX 5 1101 #define CLUSTERS_MAX X_MAX * Y_MAX // max number of clusters 1102 #define LEVEL_MAX 5 // max level of DQT 1103 1103 #define CORES_MAX 4 // max number of cores per cluster 1104 1104 1105 typedef struct build_args_s1106 {1107 unsigned char cxy; // this thread cluster identifier1108 unsigned char level; // this thread level in quad-tree1109 unsigned char parent_cxy; // parent thread cluster identifier1110 unsigned char root_level; // quad-tree root level1111 void * work_func; // pointer on work function pointer1112 void * work_args_array; // pointer on 2D array of pointers1113 pthread_barrier_t * parent_barriers_array; // pointer on 1D array of barriers1114 unsigned int error; // return value : 0 if success1115 }1116 build_args_t;1117 1118 1105 ///////////////////////////////////////////////////////////////////////////////////////// 1119 // Global variables used for inter-thread communications 1106 // Global variables 1107 // 1108 // WARNING : arguments of the pthread_create() function MUST be global variables. 
1120 1109 ///////////////////////////////////////////////////////////////////////////////////////// 1121 1110 1122 pthread_attr_t build_attr [CLUSTERS_MAX][LEVEL_MAX]; // POSIX thread attributes 1123 1124 build_args_t build_args [CLUSTERS_MAX][LEVEL_MAX]; // build function arguments 1125 1126 pthread_barrier_t build_barrier[CLUSTERS_MAX][LEVEL_MAX]; // parent/child synchro 1127 1128 pthread_attr_t work_attr [CLUSTERS_MAX][CORES_MAX]; // POSIX thread attributes 1129 1130 ////////////////////////////////////////////////////////// 1131 static void pthread_recursive_build( build_args_t * args ) 1132 { 1133 unsigned int trdid; // unused (required by pthread_create() 1111 // 2D array of <build> threads attributes / indexed by [cid][level] 1112 __attribute__((aligned(4096))) 1113 pthread_attr_t pthread_build_attr[CLUSTERS_MAX][LEVEL_MAX]; 1114 1115 // 2D array of <build> threads arguments / indexed by [cid][level] 1116 __attribute__((aligned(4096))) 1117 pthread_parallel_build_args_t pthread_build_args[CLUSTERS_MAX][LEVEL_MAX]; 1118 1119 // 1D array of <work> threads attributes / indexed by [tid] 1120 __attribute__((aligned(4096))) 1121 pthread_attr_t pthread_work_attr[CLUSTERS_MAX * CORES_MAX]; 1122 1123 // 1D array of <work> threads arguments / indexed by [tid] 1124 __attribute__((aligned(4096))) 1125 pthread_parallel_work_args_t pthread_work_args[CLUSTERS_MAX * CORES_MAX]; 1126 1127 // kernel thread identifier / unused, but required by pthread_create() 1128 __attribute__((aligned(4096))) 1129 pthread_t trdid; 1130 1131 /////////////////////////////////////////////////////////////////////////// 1132 static void pthread_recursive_build( pthread_parallel_build_args_t * args ) 1133 { 1134 1134 1135 1135 // get arguments 1136 unsigned int cxy = args->cxy; 1137 unsigned int level = args->level; 1138 unsigned int parent_cxy = args->parent_cxy; 1139 unsigned int root_level = args->root_level; 1140 void * work_func = args->work_func; 1141 void * work_args_array = 
args->work_args_array; 1142 pthread_barrier_t * parent_barriers_array = args->parent_barriers_array; 1143 1144 // set error default value 1145 build_args[cxy][level].error = 0; 1136 unsigned int cid = args->cid; 1137 unsigned int level = args->level; 1138 unsigned int parent_cid = args->parent_cid; 1139 pthread_barrier_t * parent_barrier = args->parent_barrier; 1140 unsigned int root_level = args->root_level; 1141 void * work_func = args->work_func; 1142 unsigned int x_size = args->x_size; 1143 unsigned int y_size = args->y_size; 1144 unsigned int ncores = args->ncores; 1145 1146 #if DEBUG_PTHREAD_PARALLEL 1147 printf("\n[%s] <build> thread[%d][%d] enters / parent_cid %d / work_func %x\n", 1148 __FUNCTION__, cid , level , parent_cid , work_func ); 1149 #endif 1150 1151 // set error default value in pthread_build_args[cid][level] 1152 pthread_build_args[cid][level].error = 0; 1153 1154 // get cxy from cid 1155 unsigned int cxy = HAL_CXY_FROM_XY( cid / y_size , cid % y_size ); 1156 1157 // allocate the parent/child barrier in local cluster 1158 pthread_barrier_t * barrier = (pthread_barrier_t *)malloc( sizeof(pthread_barrier_t) ); 1159 1160 if( barrier == NULL ) 1161 { 1162 printf("\n[ERROR] in %s : cannot allocate barrier for <build> thread[%d][%d]\n", 1163 __FUNCTION__ , cid , level ); 1164 1165 // report error to parent 1166 pthread_build_args[parent_cid][level+1].error = 1; 1167 } 1146 1168 1147 1169 /////////////////////////////////////////////////////////// 1148 if( level == 0 ) // children are "work" threads 1149 { 1150 unsigned int lid; // core local index 1151 unsigned int ncores; // number of cores in a cluster 1152 1153 // get number of cores per cluster 1154 get_nb_cores( cxy , &ncores ); 1155 1156 // kill process if no active core in cluster 1157 // TODO this "if" should be replaced by an "assert" [AG] 1158 if( ncores == 0 ) 1159 { 1160 printf("\n[PANIC] in %s : no active core in cluster %x\n", 1161 __FUNCTION__ , cxy ); 1170 if( level == 0 ) // children 
are <work> threads 1171 { 1172 1173 // check number of cores in local cluster 1174 unsigned int actual_ncores; 1175 get_nb_cores( cxy , &actual_ncores ); 1176 1177 if( actual_ncores != ncores ) 1178 { 1179 printf("\n[ERROR] in %s : actual_ncores (%d) in cluster %x\n", 1180 __FUNCTION__ , actual_ncores, cxy ); 1162 1181 1163 1182 // report error to parent 1164 build_args[parent_cxy][level+1].error = 1; 1165 1166 // kill process 1167 exit( EXIT_FAILURE ); 1168 } 1169 1170 // initialize the parent_barrier 1171 if( pthread_barrier_init( &parent_barriers_array[cxy] , NULL , ncores + 1 ) ) 1172 { 1173 printf("\n[ERROR] in %s : cannot initialise barrier for build thread[%x][%d]\n", 1174 __FUNCTION__ , cxy , level ); 1183 pthread_build_args[parent_cid][level+1].error = 1; 1184 } 1185 1186 // initializes barrier for (ncores + 1) in flat mode 1187 if( pthread_barrier_init( barrier , NULL , ncores + 1 ) ) 1188 { 1189 printf("\n[ERROR] in %s : cannot init barrier for <build> thread[%d][%d]\n", 1190 __FUNCTION__ , cid , level ); 1175 1191 1176 1192 // report error to parent 1177 build_args[parent_cxy][level+1].error = 1;1193 pthread_build_args[parent_cid][level+1].error = 1; 1178 1194 } 1179 1195 1180 1196 #if DEBUG_PTHREAD_PARALLEL 1181 printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n", 1182 __FUNCTION__, cxy, level, ncores + 1 ); 1183 #endif 1184 // create (ncores) "work" threads 1197 printf("\n[%s] <build> thread[%d][%d] initialized barrier / %d children\n", 1198 __FUNCTION__, cid, level, ncores ); 1199 #endif 1200 unsigned int lid; // core local index for <work> thread 1201 unsigned int tid; // <work> thread continuous index 1202 1203 // <build> thread creates ncores <work> threads 1185 1204 for ( lid = 0 ; lid < ncores ; lid++ ) 1186 1205 { 1187 // set attributes for thread[cxy][lid] 1188 work_attr[cxy][lid].attributes = PT_ATTR_DETACH | 1189 PT_ATTR_CLUSTER_DEFINED | 1190 PT_ATTR_CORE_DEFINED; 1191 work_attr[cxy][lid].cxy = cxy; 1192 
work_attr[cxy][lid].lid = lid; 1193 1194 // compute pointer on thread[cxy][lid] arguments 1195 void * work_args = *((void **)work_args_array + (cxy * CORES_MAX) + lid); 1196 1197 // create thread 1206 // compute work thread tid 1207 tid = (cid * ncores) + lid; 1208 1209 // set attributes for <work> thread[tid] 1210 pthread_work_attr[tid].attributes = PT_ATTR_DETACH | 1211 PT_ATTR_CLUSTER_DEFINED | 1212 PT_ATTR_CORE_DEFINED; 1213 pthread_work_attr[tid].cxy = cxy; 1214 pthread_work_attr[tid].lid = lid; 1215 1216 // set tid and barrier arguments for <work> thread[tid] 1217 pthread_work_args[tid].tid = tid; 1218 pthread_work_args[tid].barrier = barrier; 1219 1220 // create <work> thread 1198 1221 if ( pthread_create( &trdid, // unused 1199 & work_attr[cxy][lid],1222 &pthread_work_attr[tid], 1200 1223 work_func, 1201 work_args) )1224 &pthread_work_args[tid] ) ) 1202 1225 { 1203 printf("\n[ERROR] in %s : cannot create work thread[%x,%x]\n",1204 __FUNCTION__ , c xy , lid );1226 printf("\n[ERROR] in %s : <build> thread[%d][%d] cannot create <work> thread[%d]\n", 1227 __FUNCTION__ , cid , level , tid ); 1205 1228 1206 1229 // report error to parent 1207 build_args[parent_cxy][level+1].error = 1;1230 pthread_build_args[parent_cid][level+1].error = 1; 1208 1231 } 1209 1232 1210 1233 #if DEBUG_PTHREAD_PARALLEL 1211 printf("\n[%s] <build> thread[% x][%d] created <work> thread[%x][%d]\n",1212 __FUNCTION__, c xy, level, cxy, lid );1213 #endif 1214 } 1215 1216 // wait on barrier until "work"children threads completed1217 if( pthread_barrier_wait( &parent_barriers_array[cxy]) )1218 { 1219 printf("\n[ERROR] in %s / firstbarrier for <build> thread[%x][%d]\n",1220 __FUNCTION__ , c xy, level );1234 printf("\n[%s] <build> thread[%d][%d] created <work> thread[%d]\n", 1235 __FUNCTION__, cid, level, tid ); 1236 #endif 1237 } 1238 1239 // wait on barrier until all <work> children threads completed 1240 if( pthread_barrier_wait( barrier ) ) 1241 { 1242 printf("\n[ERROR] in %s / barrier for 
<build> thread[%x][%d]\n", 1243 __FUNCTION__ , cid , level ); 1221 1244 1222 1245 // report error to parent 1223 build_args[parent_cxy][level+1].error = 1;1246 pthread_build_args[parent_cid][level+1].error = 1; 1224 1247 } 1225 1248 1226 1249 #if DEBUG_PTHREAD_PARALLEL 1227 printf("\n[%s] <build> thread[% x][%d] resume after children completion\n",1228 __FUNCTION__ , cxy, level );1250 printf("\n[%s] <build> thread[%d][%d] resume after children completion\n", 1251 __FUNCTION__ , cid , level ); 1229 1252 #endif 1230 1253 … … 1234 1257 else // children are "build" threads 1235 1258 { 1236 // the 4 children threads can be created in any core of each quarters1237 // of the parent macro-cluster1259 // the 4 children threads can be linked to any core in each 1260 // sub-macro-cluster[i][j] with [ij] in {00,01,10,11} 1238 1261 1239 1262 unsigned int parent_x; // X coordinate of parent macro-cluster … … 1241 1264 unsigned int child_x; // X coordinate of child macro-cluster 1242 1265 unsigned int child_y; // Y coordinate of child macro-cluster 1243 unsigned int child_cxy[2][2]; // selected cluster for child thread 1244 unsigned int child_lid[2][2]; // selected core index for child thread 1245 int child_sts[2][2]; // -1 if error / 0 if success / +1 if not found 1246 unsigned int x; // X loop index for children 1247 unsigned int y; // Y loop index for children 1248 1249 unsigned int nb_children = 0; 1266 unsigned int child_cid[2][2]; // selected cluster cid for child[i][j] 1267 unsigned int child_cxy[2][2]; // selected cluster cxy for child[i][j] 1268 unsigned int child_lid[2][2]; // selected core index for child[i][j] 1269 int child_sts[2][2]; // -1 if error / 0 if success / +1 if no core 1270 unsigned int i; // loop index for children 1271 unsigned int j; // loop index for children 1272 1273 unsigned int nb_children = 0; // actual number of children (can be < 4) 1250 1274 1251 1275 // get parent macro-cluster mask and half-size from level … … 1257 1281 parent_y = 
HAL_Y_FROM_CXY( cxy ) & ~mask; 1258 1282 1259 // get child_cxy and child_lid for up to 4 children threads : 00 / 01 / 10 / 111260 for ( x = 0 ; x < 2 ; x++)1283 // First step : select core for each child thread 1284 for (i = 0 ; i < 2 ; i++) 1261 1285 { 1262 1286 // compute child macro-cluster X coordinate 1263 child_x = ( x== 0) ? parent_x : (parent_x + half);1264 1265 for ( y = 0 ; y < 2 ; y++)1287 child_x = (i == 0) ? parent_x : (parent_x + half); 1288 1289 for (j = 0 ; j < 2 ; j++) 1266 1290 { 1267 1291 // compute child macro-cluster Y coordinate 1268 child_y = ( y== 0) ? parent_y : (parent_y + half);1292 child_y = (j == 0) ? parent_y : (parent_y + half); 1269 1293 1270 1294 // select the best core in macro-cluster 1271 child_sts[x][y] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ), 1295 unsigned int best_cxy; 1296 unsigned int best_lid; 1297 1298 child_sts[i][j] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ), 1272 1299 level-1, 1273 & child_cxy[x][y],1274 & child_lid[x][y]);1275 1276 if( child_sts[ x][y] < 0 ) // failure => report error1300 &best_cxy, 1301 &best_lid ); 1302 1303 if( child_sts[i][j] < 0 ) // failure => report error 1277 1304 { 1278 printf("\n[ERROR] in %s : illegal arguments for <build> thread[%x,%x]\n",1279 __FUNCTION__ , cxy, level );1305 printf("\n[ERROR] in %s select core for child[%d,%d] of <build> thread[%d,%d]\n", 1306 __FUNCTION__ , i , j , cid , level ); 1280 1307 1281 1308 // report error to parent 1282 build_args[parent_cxy][level+1].error = 1;1309 pthread_build_args[parent_cid][level+1].error = 1; 1283 1310 } 1284 else if (child_sts[ x][y] > 0 ) // macro-cluster undefined=> does nothing1311 else if (child_sts[i][j] > 0 ) // macro-cluster empty => does nothing 1285 1312 { 1286 1313 } 1287 1314 else // core found 1288 1315 { 1316 child_cxy[i][j] = best_cxy; 1317 child_lid[i][j] = best_lid; 1318 child_cid[i][j] = (HAL_X_FROM_CXY(best_cxy) * y_size) + HAL_Y_FROM_CXY( best_cxy); 1289 1319 nb_children++; 1320 1321 #if 
DEBUG_PTHREAD_PARALLEL 1322 printf("\n[%s] <build> thread[%d][%d] select core[%x][%d] for child[%d][%d]\n", 1323 __FUNCTION__ , cid , level , best_cxy , best_lid , i , j ); 1324 #endif 1325 1290 1326 } 1291 } // end for y 1292 } // end for x 1293 1294 // kill process if no active core in cluster 1295 // TODO this "if" should be replaced by an "assert" [AG] 1296 if( nb_children == 0 ) 1297 { 1298 printf("\n[PANIC] in %s : no active core in macro cluster [%x,%d]\n", 1299 __FUNCTION__ , cxy , level ); 1327 } // end for j 1328 } // end for i 1329 1330 // second step : initialize barrier for (nb_children + 1) in flat mode 1331 if( pthread_barrier_init( barrier , NULL , nb_children + 1 ) ) 1332 { 1333 printf("\n[ERROR] in %s : cannot init barrier for <build> thread[%d][%d]\n", 1334 __FUNCTION__ , cid , level ); 1300 1335 1301 1336 // report error to parent 1302 build_args[parent_cxy][level+1].error = 1; 1303 1304 // kill process 1305 exit( EXIT_FAILURE ); 1306 } 1307 1308 // initialize the barrier for (nb_children + 1) 1309 if( pthread_barrier_init( &build_barrier[cxy][level], NULL , nb_children + 1 ) ) 1310 { 1311 printf("\n[error] in %s : cannot initialise barrier for build thread[%x][%d]\n", 1312 __FUNCTION__ , cxy , level ); 1313 1314 // report error to parent 1315 build_args[parent_cxy][level+1].error = 1; 1337 pthread_build_args[parent_cid][level+1].error = 1; 1316 1338 } 1317 1339 1318 1340 #if DEBUG_PTHREAD_PARALLEL 1319 printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n", 1320 __FUNCTION__, cxy, level, nb_children + 1 ); 1321 #endif 1322 // create 1 to 4 children threads 1323 for (x = 0 ; x < 2 ; x++) 1324 { 1325 for (y = 0 ; y < 2 ; y++) 1341 printf("\n[%s] <build> thread[%d][%d] initialized barrier / %d children\n", 1342 __FUNCTION__, cid, level, nb_children ); 1343 #endif 1344 1345 // Third step : actually create the children threads 1346 for (i = 0 ; i < 2 ; i++) 1347 { 1348 for (j = 0 ; j < 2 ; j++) 1326 1349 { 1327 1350 // thread is 
created only if macro-cluster is active 1328 if( child_sts[ x][y] == 0 )1351 if( child_sts[i][j] == 0 ) 1329 1352 { 1330 unsigned int tgt_cxy = child_cxy[x][y]; 1331 unsigned int tgt_lid = child_lid[x][y]; 1353 unsigned int tgt_cid = child_cid[i][j]; 1354 unsigned int tgt_lid = child_lid[i][j]; 1355 unsigned int tgt_cxy = child_cxy[i][j]; 1332 1356 1333 1357 // set child thread attributes 1334 build_attr[tgt_cxy][level-1].attributes = PT_ATTR_DETACH | 1335 PT_ATTR_CLUSTER_DEFINED | 1336 PT_ATTR_CORE_DEFINED; 1337 build_attr[tgt_cxy][level-1].cxy = tgt_cxy; 1338 build_attr[tgt_cxy][level-1].lid = tgt_lid; 1339 1340 // propagate build function arguments 1341 build_args[tgt_cxy][level-1].cxy = child_cxy[x][y]; 1342 build_args[tgt_cxy][level-1].level = level-1; 1343 build_args[tgt_cxy][level-1].parent_cxy = cxy; 1344 build_args[tgt_cxy][level-1].root_level = root_level; 1345 build_args[tgt_cxy][level-1].work_func = work_func; 1346 build_args[tgt_cxy][level-1].work_args_array = work_args_array; 1347 build_args[tgt_cxy][level-1].parent_barriers_array = parent_barriers_array; 1358 pthread_build_attr[tgt_cid][level-1].attributes = PT_ATTR_DETACH | 1359 PT_ATTR_CLUSTER_DEFINED | 1360 PT_ATTR_CORE_DEFINED; 1361 pthread_build_attr[tgt_cid][level-1].cxy = tgt_cxy; 1362 pthread_build_attr[tgt_cid][level-1].lid = tgt_lid; 1363 1364 // propagate build function arguments from parent to child 1365 pthread_build_args[tgt_cid][level-1].cid = tgt_cid; 1366 pthread_build_args[tgt_cid][level-1].level = level-1; 1367 pthread_build_args[tgt_cid][level-1].parent_cid = cid; 1368 pthread_build_args[tgt_cid][level-1].parent_barrier = barrier; 1369 pthread_build_args[tgt_cid][level-1].root_level = root_level; 1370 pthread_build_args[tgt_cid][level-1].work_func = work_func; 1371 pthread_build_args[tgt_cid][level-1].x_size = x_size; 1372 pthread_build_args[tgt_cid][level-1].y_size = y_size; 1373 pthread_build_args[tgt_cid][level-1].ncores = ncores; 1348 1374 1349 1375 // create thread 1350 1376 
if( pthread_create( &trdid, 1351 & build_attr[tgt_cxy][level-1],1377 &pthread_build_attr[tgt_cid][level-1], 1352 1378 &pthread_recursive_build, 1353 & build_args[tgt_cxy][level-1] ) )1379 &pthread_build_args[tgt_cid][level-1] ) ) 1354 1380 { 1355 printf("\n[ERROR] in %s : cannot create buildthread[%x][%d]\n",1356 __FUNCTION__ , child_c xy, level -1 );1381 printf("\n[ERROR] in %s : cannot create <build> thread[%x][%d]\n", 1382 __FUNCTION__ , child_cid , level -1 ); 1357 1383 1358 1384 // report error to parent 1359 build_args[parent_cxy][level+1].error = 1;1385 pthread_build_args[parent_cid][level+1].error = 1; 1360 1386 } 1361 1387 1362 1388 #if DEBUG_PTHREAD_PARALLEL 1363 printf("\n[%s] <build> thread[% x][%d] created <build> thread[%x][%d] on core[%x,%d]\n",1364 __FUNCTION__, c xy, level, tgt_cxy, level - 1, tgt_cxy, tgt_lid );1389 printf("\n[%s] <build> thread[%d][%d] created <build> thread[%d][%d] on core[%x,%d]\n", 1390 __FUNCTION__, cid, level, tgt_cid, (level - 1), tgt_cxy, tgt_lid ); 1365 1391 #endif 1366 1392 } //end if sts[x][y] … … 1368 1394 } // end for x 1369 1395 1370 // wait on barrier until "build"children threads completed1371 if( pthread_barrier_wait( &build_barrier[cxy][level]) )1372 { 1373 printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",1374 __FUNCTION__ , c xy, level );1396 // wait on barrier until all <build> children threads completed 1397 if( pthread_barrier_wait( barrier ) ) 1398 { 1399 printf("\n[ERROR] in %s / barrier for <build> thread[%d][%d]\n", 1400 __FUNCTION__ , cid , level ); 1375 1401 1376 1402 // report error to parent 1377 build_args[parent_cxy][level+1].error = 1;1403 pthread_build_args[parent_cid][level+1].error = 1; 1378 1404 } 1379 1405 1380 1406 #if DEBUG_PTHREAD_PARALLEL 1381 1407 printf("\n[%s] <build> thread[%x][%d] resume after children completion\n", 1382 __FUNCTION__, c xy, level );1408 __FUNCTION__, cid, level ); 1383 1409 #endif 1384 1410 … … 1386 1412 1387 1413 // report error to parent when 
required 1388 if( build_args[cxy][level].error ) 1389 { 1390 build_args[parent_cxy][level+1].error = 1; 1391 } 1392 1393 // all <build> threads - but the root - 1394 // signal completion to parent thread and exit 1414 if( pthread_build_args[cid][level].error ) 1415 { 1416 pthread_build_args[parent_cid][level+1].error = 1; 1417 } 1418 1419 // all <build> threads - but the root - signal completion to parent thread and exit 1395 1420 if( level < root_level ) 1396 1421 { 1397 if( pthread_barrier_wait( &build_barrier[parent_cxy][level+1]) )1398 { 1399 printf("\n[ERROR] in %s / second barrier for <build> thread[%x][%d]\n",1400 __FUNCTION__ , c xy, level );1422 if( pthread_barrier_wait( parent_barrier ) ) 1423 { 1424 printf("\n[ERROR] in %s / parent barrier for <build> thread[%d][%d]\n", 1425 __FUNCTION__ , cid , level ); 1401 1426 1402 1427 // report error to parent 1403 build_args[parent_cxy][level+1].error = 1;1428 pthread_build_args[parent_cid][level+1].error = 1; 1404 1429 } 1405 1430 1406 1431 #if DEBUG_PTHREAD_PARALLEL 1407 1432 printf("\n[%s] <build> thread[%x][%d] exit\n", 1408 __FUNCTION__, c xy, level );1409 #endif 1410 // "build"thread exit1433 __FUNCTION__, cid , level ); 1434 #endif 1435 // <build> thread exit 1411 1436 pthread_exit( NULL ); 1412 1437 } 1413 1438 } // end pthread_recursive_build() 1414 1439 1415 /////////////////////////////////////////////////////// 1416 int pthread_parallel_create( unsigned int root_level, 1417 void * work_func, 1418 void * work_args_array, 1419 void * parent_barriers_array ) 1420 { 1440 1441 ////////////////////////////////////////////////////// 1442 int pthread_parallel_create( unsigned int root_level, 1443 void * work_func ) 1444 { 1445 1446 #if DEBUG_PTHREAD_PARALLEL 1447 printf("\n[%s] enter / root_level %d / func %x\n", 1448 __FUNCTION__, root_level, work_func ); 1449 #endif 1450 1451 // get platform parameters 1452 unsigned int x_size; 1453 unsigned int y_size; 1454 unsigned int ncores; 1455 get_config( &x_size , 
&y_size , &ncores ); 1456 1457 // get calling thread cluster identifier 1421 1458 unsigned int root_cxy; 1422 1459 unsigned int root_lid; // unused, but required by get_core_id() 1460 get_core_id( &root_cxy , &root_lid ); 1461 1462 // get calling thread continuous index 1463 unsigned int x = HAL_X_FROM_CXY( root_cxy ); 1464 unsigned int y = HAL_Y_FROM_CXY( root_cxy ); 1465 unsigned int root_cid = (y_size * x) + y; 1466 1467 // set the build function arguments for the root <build> thread 1468 pthread_build_args[root_cid][root_level].cid = root_cid; 1469 pthread_build_args[root_cid][root_level].level = root_level; 1470 pthread_build_args[root_cid][root_level].parent_cid = -1; 1471 pthread_build_args[root_cid][root_level].parent_barrier = NULL; 1472 pthread_build_args[root_cid][root_level].root_level = root_level; 1473 pthread_build_args[root_cid][root_level].work_func = work_func; 1474 pthread_build_args[root_cid][root_level].x_size = x_size; 1475 pthread_build_args[root_cid][root_level].y_size = y_size; 1476 pthread_build_args[root_cid][root_level].ncores = ncores; 1423 1477 1424 #if DEBUG_PTHREAD_PARALLEL 1425 printf("\n[%s] enter / root_level %d / func %x / args %x / barriers %x\n", 1426 __FUNCTION__, root_level, work_func, work_args_array, parent_barriers_array ); 1427 #endif 1428 1429 // get calling thread cluster 1430 get_core_id( &root_cxy , &root_lid ); 1431 1432 // set the build function arguments for the root <build> thread 1433 build_args[root_cxy][root_level].cxy = root_cxy; 1434 build_args[root_cxy][root_level].level = root_level; 1435 build_args[root_cxy][root_level].root_level = root_level; 1436 build_args[root_cxy][root_level].work_func = work_func; 1437 build_args[root_cxy][root_level].work_args_array = work_args_array; 1438 build_args[root_cxy][root_level].parent_barriers_array = parent_barriers_array; 1439 1440 // call the recursive build function 1441 pthread_recursive_build( &build_args[root_cxy][root_level] ); 1442 1443 // check error 1444 if( 
build_args[root_cxy][root_level].error ) 1478 // call the recursive function 1479 pthread_recursive_build( &pthread_build_args[root_cid][root_level] ); 1480 1481 // check error when execution completes 1482 if( pthread_build_args[root_cid][root_level].error ) 1445 1483 { 1446 1484 printf("\n[error] in %s\n", __FUNCTION__ ); -
trunk/libs/libalmosmkh/almosmkh.h
r647 r650 101 101 * @ level : [in] macro-cluster level in [1,2,3,4,5]. 102 102 * @ cxy : [out] selected core cluster identifier. 103 * @ lid : [out] select od core local index.103 * @ lid : [out] selected core local index. 104 104 * @ return 0 if success / 1 if no core in macro-cluster / -1 if illegal arguments. 105 105 **************************************************************************************/ … … 415 415 * This function releases the memory buffer identified by the <ptr> argument, 416 416 * to the store identified by the <cxy> argument. 417 * It displays an error message, but does nothingif the ptr is illegal.417 * It does nothing, but displays an error message, if the ptr is illegal. 418 418 ***************************************************************************************** 419 419 * @ ptr : pointer on the released buffer. … … 456 456 457 457 ////////////////////////////////////////////////////////////////////////////////////////// 458 // This system call can be used to parallelize the creation and the termination 459 // of a parallel multi-threaded application. It removes the loop in the main thread that 460 // creates the N working threads (N sequencial pthread_create() ). It also removes the 461 // loop that waits completion of these N working threads (N sequencial pthread_join() ). 462 // It creates one "work" thread (in detached mode) per core in the target architecture. 463 // Each "work" thread is identified by the [cxy][lid] indexes (cluster / local core). 464 // The pthread_parallel_create() function returns only when all "work" threads completed 458 // This syscall can be used to parallelize the creation, and the termination 459 // of a parallel multi-threaded application. 460 // It removes in the main thread the sequencial loop that creates the N working threads 461 // (N pthread_create() ), and removes also the sequencial loop that waits completion 462 // of these N working threads (N pthread_join() ). 
463 // It creates one <work> thread (in detached mode) per core in the target architecture. 464 // Each <work> thread is identified by a continuous [tid] index. 465 // For a regular architecture, defined by the [x_size , y_size , ncores] parameters, 466 // the number of working threads can be simply computed as (x_size * y_size * ncores), 467 // and the coordinates[x,y,lid] of the core running the thread[tid] can be directly 468 // derived from the [tid] value with the following relations: 469 // . cid = (x * y_size) + y 470 // . tid = (cid * ncores ) + lid 471 // . lid = tid % ncores 472 // . cid = tid / ncores 473 // . y = cid % y_size 474 // . x = cid / y_size 475 // The pthread_parallel_create() function returns only when all <work> threads completed 465 476 // (successfully or not). 466 477 // 467 // To use this system call, the application code must define the following structures: 468 // - To define the arguments to pass to the <work> function the application must allocate 469 // and initialize a first 2D array, indexed by [cxy] and [lid] indexes, where each slot 470 // contains an application specific structure, and another 2D array, indexed by the same 471 // indexes, containing pointers on these structures. This array of pointers is one 472 // argument of the pthread_parallel_create() function. 473 // - To detect the completion of the <work> threads, the application must allocate a 1D 474 // array, indexed by the cluster index [cxy], where each slot contains a pthread_barrier 475 // descriptor. This barrier is initialised by the pthread_parallel_create() function, 476 // in all cluster containing at least one work thread. This array of barriers is another 477 // argument of the pthread_parallel_create() function. 478 // WARNING : The function executed by the working thread is application specific, 479 // but the structure defining the arguments passed to this function is imposed. 
480 // The "pthread_parallel_work_args_t" structure is defined below, and contains 481 // two fields: the tid value, and a pointer on a pthread_barrier_t. 482 // This barrier must be used by each working thread to signal completion before exit. 483 // The global variables implementing these structures for each working thread 484 // are allocated and initialised by the pthread_parallel_create() function. 478 485 // 479 // Implementation note: 480 // To parallelize the "work" threads creation and termination, the pthread_parallel_create() 481 // function creates a distributed quad-tree (DQT) of "build" threads covering all cores 482 // required to execute the parallel application. 486 // Implementation note: the pthread_parallel_create() function creates a distributed 487 // quad-tree (DQT) of <build> threads covering all cores required to execute the parallel 488 // application. This quad tree is entirely defined by the root_level parameter. 483 489 // Depending on the hardware topology, this DQT can be truncated, (i.e. some 484 490 // parent nodes can have less than 4 children), if (x_size != y_size), or if one size 485 // is not a power of 2. Each "build" thread is identified by two indexes [cxy][level].486 // Each "build"thread makes the following tasks:491 // is not a power of 2. Each <build> thread is identified by two indexes [cid][level]. 492 // Each <build> thread makes the following tasks: 487 493 // 1) It calls the pthread_create() function to create up to 4 children threads, that 488 // are are "work" threads when (level == 0), or "build"threads, when (level > 0).489 // 2) It initializes the barrier (global variable), used to block/unblock490 // the parent thread untilchildren completion.494 // are <work> threads when (level == 0), or <build> threads, when (level > 0). 495 // 2) It allocates and initializes the barrier, used to block the parent thread until 496 // children completion. 
491 497 // 3) It calls the pthread_barrier_wait( self ) to wait until all children threads 492 498 // completed (successfully or not). … … 495 501 496 502 /***************************************************************************************** 497 * This blocking function creates N working threads that execute the code defined 498 * by the <work_func> and <work_args> arguments, and returns only when all working 499 * threads completed. 500 * The number N of created threads is entirely defined by the <root_level> argument. 501 * This value defines an abstract quad-tree, with a square base : level in [0,1,2,3,4], 503 * structure defining the arguments for the <build> thread function 504 ****************************************************************************************/ 505 typedef struct pthread_parallel_build_args_s 506 { 507 unsigned char cid; // this <build> thread cluster index 508 unsigned char level; // this <build> thread level in quad-tree 509 unsigned char parent_cid; // parent <build> thread cluster index 510 pthread_barrier_t * parent_barrier; // pointer on parent <build> thread barrier 511 unsigned char root_level; // quad-tree root level 512 void * work_func; // pointer on working thread function 513 unsigned int x_size; // platform global parameter 514 unsigned int y_size; // platform global parameter 515 unsigned int ncores; // platform global parameter 516 unsigned int error; // return value : 0 if success 517 } 518 pthread_parallel_build_args_t; 519 520 /***************************************************************************************** 521 * structure defining the arguments for the <work> thread function 522 ****************************************************************************************/ 523 typedef struct pthread_parallel_work_args_s 524 { 525 unsigned int tid; // thread identifier 526 pthread_barrier_t * barrier; // to signal completion 527 } 528 pthread_parallel_work_args_t; 529 530 
/***************************************************************************************** 531 * This blocking function creates N working threads identified by the [tid] continuous 532 * index, that execute the code defined by the <work_func> argument, and returns only 533 * when all working threads completed. 534 * The number N of created threads is entirely defined by the <root_level> argument, 535 * that defines an abstract quad-tree, with a square base : root_level in [0,1,2,3,4], 502 536 * side in [1,2,4,8,16], nclusters in [1,4,16,64,256]. This base is called macro_cluster. 503 * A working thread is created on all cores contained in th e specifiedmacro-cluster.537 * A working thread is created on all cores contained in this abstract macro-cluster. 504 538 * The actual number of physical clusters containing cores can be smaller than the number 505 * of clusters covered by the quad tree. The actual number of cores in a cluster can be 506 * less than the max value. 507 * 508 * In the current implementation, all threads execute the same <work_func> function, 509 * on different arguments, that are specified as a 2D array of pointers <work_args>. 510 * This can be modified in a future version, where the <work_func> argument can become 511 * a 2D array of pointers, to have one specific function for each thread. 539 * of clusters covered by the abstract quad tree. 540 * All threads execute the same <work_func> function, on different arguments, that are 541 * specified as an array of structures pthread_parallel_work_args_t, allocated and 542 * initialised by this function. 512 543 ***************************************************************************************** 513 544 * @ root_level : [in] DQT root level in [0,1,2,3,4]. 514 545 * @ work_func : [in] pointer on start function. 
515 * @ work_args_array : [in] pointer on a 2D array of pointers.516 * @ parent_barriers_array : [in] pointer on a 1D array of barriers.517 546 * @ return 0 if success / return -1 if failure. 518 547 ****************************************************************************************/ 519 int pthread_parallel_create( unsigned int root_level, 520 void * work_func, 521 void * work_args_array, 522 void * parent_barriers_array ); 548 int pthread_parallel_create( unsigned int root_level, 549 void * work_func ); 550 551 552 553 523 554 524 555 /********* Non standard (ALMOS-MKH specific) Frame Buffer access syscalls *************/
Note: See TracChangeset
for help on using the changeset viewer.