Changeset 637 for trunk/libs
- Timestamp:
- Jul 18, 2019, 2:06:55 PM (5 years ago)
- Location:
- trunk/libs
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/libs/libalmosmkh/almosmkh.c
r626 r637 2 2 * almosmkh.c - User level ALMOS-MKH specific library implementation. 3 3 * 4 * Author Alain Greiner (2016,2017,2018 )4 * Author Alain Greiner (2016,2017,2018,2019) 5 5 * 6 6 * Copyright (c) UPMC Sorbonne Universites … … 24 24 #include <almosmkh.h> 25 25 #include <hal_user.h> 26 #include <hal_macros.h> 26 27 #include <hal_shared_types.h> 27 28 #include <syscalls_numbers.h> … … 32 33 #include <mman.h> 33 34 34 #define MALLOC_DEBUG 0 35 #define DEBUG_REMOTE_MALLOC 0 36 #define DEBUG_PTHREAD_PARALLEL 1 35 37 36 ///////////// Non standard system calls ///////////////////////////////// 38 ////////////////////////////////////////////////////////////////////////////////////// 39 ///////////// Non standard system calls /////////////////////////////////////// 40 ////////////////////////////////////////////////////////////////////////////////////// 37 41 38 42 ////////////////////////// … … 63 67 } 64 68 65 ///////////////////////////////// 66 int get_core ( unsigned int * cxy,67 unsigned int * lid )68 { 69 return hal_user_syscall( SYS_GET_CORE ,69 //////////////////////////////////// 70 int get_core_id( unsigned int * cxy, 71 unsigned int * lid ) 72 { 73 return hal_user_syscall( SYS_GET_CORE_ID, 70 74 (reg_t)cxy, 71 75 (reg_t)lid, 0, 0 ); 76 } 77 78 ///////////////////////////////////// 79 int get_nb_cores( unsigned int cxy, 80 unsigned int * ncores ) 81 { 82 return hal_user_syscall( SYS_GET_NB_CORES, 83 (reg_t)cxy, 84 (reg_t)ncores, 0, 0 ); 85 } 86 87 /////////////////////////////////////////// 88 int get_best_core( unsigned int base_cxy, 89 unsigned int level, 90 unsigned int * cxy, 91 unsigned int * lid ) 92 { 93 return hal_user_syscall( SYS_GET_BEST_CORE, 94 (reg_t)base_cxy, 95 (reg_t)level, 96 (reg_t)cxy, 97 (reg_t)lid ); 72 98 } 73 99 … … 250 276 } // end get_string() 251 277 252 253 /////////////// non standard debug functions ////////////////////////// 278 ////////////////////////////////////////////////////////////////////////////////////// 279 
/////////////// non standard debug functions /////////////////////////////////// 280 ////////////////////////////////////////////////////////////////////////////////////// 254 281 255 282 //////////////////////////////////// … … 496 523 497 524 498 /////////////// non standard malloc functions ////////////////////////// 525 ///////////////////////////////////////////////////////////////////////////////////////// 526 /////////////// non standard remote_malloc //////////////////////////////////////// 527 ///////////////////////////////////////////////////////////////////////////////////////// 499 528 500 529 ///////////////////////////////////////////////////////////////////////////////////////// 501 530 // Global variable defining the allocator array (one per cluster) 502 531 // This array (about 16 Kbytes ) will be stored in the data segment 503 // of any application linked with this malloclibray.532 // of any application linked with this libray. 504 533 ///////////////////////////////////////////////////////////////////////////////////////// 505 534 … … 546 575 //////////////////////////////////////////////////////////////////////////////////////////// 547 576 548 #if MALLOC_DEBUG577 #if DEBUG_REMOTE_MALLOC 549 578 static void display_free_array( unsigned int cxy ) 550 579 { … … 594 623 unsigned int iter; // iterator 595 624 596 #if MALLOC_DEBUG 597 printf("\n[MALLOC] %s : enter for store[%x] / size = %x\n", 598 __FUNCTION__, cxy, store_size ); 625 #if DEBUG_REMOTE_MALLOC 626 unsigned int core_cxy; 627 unsigned int core_lid; 628 get_core_id( &core_cxy , &core_lid ); 629 printf("\n[%s] core[%x,%d] enter for store[%x] / size = %x\n", 630 __FUNCTION__, core_cxy, core_lid, cxy, store_size ); 599 631 #endif 600 632 … … 635 667 } 636 668 637 #if MALLOC_DEBUG638 printf("\n[ MALLOC] %s : mmap done for store[%x] / base = %x\n",639 __FUNCTION__, c xy, store_base);669 #if DEBUG_REMOTE_MALLOC 670 printf("\n[%s] core[%x,%d] created vseg %x for store[%x]\n", 671 __FUNCTION__, 
core_cxy, core_lid, store_base, cxy ); 640 672 #endif 641 673 … … 656 688 } 657 689 658 // DEPRECATED: we don't reset the alloc_base array659 // because we don't want to allocate the physical memory660 // when the heap is created [AG]661 // memset( (void *)alloc_base , 0 , alloc_size );662 663 690 // split the store into various sizes blocks, 664 691 // initializes the free[] array and NEXT pointers … … 690 717 691 718 692 #if MALLOC_DEBUG 693 printf("\n[MALLOC] %s : completes store[%x] initialisation\n", 694 __FUNCTION__, cxy ); 695 719 #if DEBUG_REMOTE_MALLOC 720 printf("\n[%s] core[%x,%d] completed store[%x] initialisation\n", 721 __FUNCTION__, core_cxy, core_lid, cxy ); 722 #endif 723 724 #if (DEBUG_REMOTE_MALLOC & 1) 696 725 display_free_array( cxy ); 697 726 #endif … … 762 791 int error; 763 792 764 #if MALLOC_DEBUG 765 printf("\n[MALLOC] %s : enter for size = %x / cxy = %x\n", 766 __FUNCTION__ , size , cxy ); 793 #if DEBUG_REMOTE_MALLOC 794 unsigned int core_cxy; 795 unsigned int core_lid; 796 get_core_id( &core_cxy , &core_lid ); 797 printf("\n[%s] core[%x,%d] enter for size = %x / target_cxy = %x\n", 798 __FUNCTION__ , core_cxy, core_lid, size , cxy ); 767 799 #endif 768 800 … … 828 860 unsigned char * ptr = (unsigned char*)(store[cxy].alloc_base + offset); 829 861 830 // DEPRECATED : we cannot check the alloc[] array,831 // because it has not been initialised by store_init,832 // to avoid physical memory allocation at heap creation [AG]833 // if ( *ptr != 0 )834 // {835 // pthread_mutex_unlock( &store[cxy].mutex );836 // printf("\n[PANIC] in %s : allocate an already allocated block...\n",837 // __FUNCTION__ );838 // return NULL;839 // }840 841 862 // update alloc_array 842 863 *ptr = requested_index; … … 845 866 pthread_mutex_unlock( &store[cxy].mutex ); 846 867 847 #if MALLOC_DEBUG848 printf("\n[ MALLOC] %s :exit / base = %x / size = %x / from store[%x]\n",849 __FUNCTION__, base , size , cxy );868 #if DEBUG_REMOTE_MALLOC 869 printf("\n[%s] core[%x,%d] 
exit / base = %x / size = %x / from store[%x]\n", 870 __FUNCTION__, core_cxy, core_lid, base , size , cxy ); 850 871 #endif 851 872 … … 853 874 854 875 } // end remote_malloc() 855 856 857 876 858 877 ////////////////////////////////////////// … … 920 939 921 940 return new_ptr; 922 } 941 942 } // end remote_realloc() 943 923 944 924 945 ////////////////////////////////////////////////////// … … 991 1012 { 992 1013 993 #if MALLOC_DEBUG1014 #if DEBUG_REMOTE_MALLOC 994 1015 printf("\n[MALLOC] %s : enter for block = %x / cxy = %x\n", 995 1016 __FUNCTION__, ptr, cxy ); … … 1052 1073 pthread_mutex_unlock( &store[cxy].mutex ); 1053 1074 1054 #if MALLOC_DEBUG1075 #if DEBUG_REMOTE_MALLOC 1055 1076 printf("\n[MALLOC] %s : conmpletes for block = %x / cxy = %x\n", 1056 1077 __FUNCTION__, ptr, cxy ); … … 1058 1079 1059 1080 } // end remote_free() 1081 1082 ///////////////////////////////////////////////////////////////////////////////////////// 1083 /////////////// non standard pthread_parallel_create ////////////////////////////// 1084 ///////////////////////////////////////////////////////////////////////////////////////// 1085 1086 #define X_MAX 16 // max number of clusters in a row 1087 #define Y_MAX 16 // max number of clusters in a column 1088 #define CLUSTERS_MAX X_MAX * Y_MAX 1089 #define LEVEL_MAX 5 1090 #define CORES_MAX 4 // max number of cores per cluster 1091 1092 typedef struct build_args_s 1093 { 1094 unsigned char cxy; // this thread cluster identifier 1095 unsigned char level; // this thread level in quad-tree 1096 unsigned char parent_cxy; // parent thread cluster identifier 1097 unsigned char root_level; // quad-tree root level 1098 void * work_func; // pointer on work function pointer 1099 void * work_args_array; // pointer on 2D array of pointers 1100 pthread_barrier_t * parent_barriers_array; // pointer on 1D array of barriers 1101 unsigned int error; // return value : 0 if success 1102 } 1103 build_args_t; 1104 1105 
///////////////////////////////////////////////////////////////////////////////////////// 1106 // Global variables used for inter-thread communications 1107 ///////////////////////////////////////////////////////////////////////////////////////// 1108 1109 pthread_attr_t build_attr [CLUSTERS_MAX][LEVEL_MAX]; // POSIX thread attributes 1110 1111 build_args_t build_args [CLUSTERS_MAX][LEVEL_MAX]; // build function arguments 1112 1113 pthread_barrier_t build_barrier[CLUSTERS_MAX][LEVEL_MAX]; // parent/child synchro 1114 1115 pthread_attr_t work_attr [CLUSTERS_MAX][CORES_MAX]; // POSIX thread attributes 1116 1117 ////////////////////////////////////////////////////////// 1118 static void pthread_recursive_build( build_args_t * args ) 1119 { 1120 unsigned int trdid; // unused (required by pthread_create() 1121 1122 // get arguments 1123 unsigned int cxy = args->cxy; 1124 unsigned int level = args->level; 1125 unsigned int parent_cxy = args->parent_cxy; 1126 unsigned int root_level = args->root_level; 1127 void * work_func = args->work_func; 1128 void * work_args_array = args->work_args_array; 1129 pthread_barrier_t * parent_barriers_array = args->parent_barriers_array; 1130 1131 // set error default value 1132 build_args[cxy][level].error = 0; 1133 1134 /////////////////////////////////////////////////////////// 1135 if( level == 0 ) // children are "work" threads 1136 { 1137 unsigned int lid; // core local index 1138 unsigned int ncores; // number of cores in a cluster 1139 1140 // get number of cores per cluster 1141 get_nb_cores( cxy , &ncores ); 1142 1143 // kill process if no active core in cluster 1144 // TODO this "if" should be replaced by an "assert" [AG] 1145 if( ncores == 0 ) 1146 { 1147 printf("\n[PANIC] in %s : no active core in cluster %x\n", 1148 __FUNCTION__ , cxy ); 1149 1150 // report error to parent 1151 build_args[parent_cxy][level+1].error = 1; 1152 1153 // kill process 1154 exit( EXIT_FAILURE ); 1155 } 1156 1157 // initialize the parent_barrier 
1158 if( pthread_barrier_init( &parent_barriers_array[cxy] , NULL , ncores + 1 ) ) 1159 { 1160 printf("\n[ERROR] in %s : cannot initialise barrier for build thread[%x][%d]\n", 1161 __FUNCTION__ , cxy , level ); 1162 1163 // report error to parent 1164 build_args[parent_cxy][level+1].error = 1; 1165 } 1166 1167 #if DEBUG_PTHREAD_PARALLEL 1168 printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n", 1169 __FUNCTION__, cxy, level, ncores + 1 ); 1170 #endif 1171 // create (ncores) "work" threads 1172 for ( lid = 0 ; lid < ncores ; lid++ ) 1173 { 1174 // set attributes for thread[cxy][lid] 1175 work_attr[cxy][lid].attributes = PT_ATTR_DETACH | 1176 PT_ATTR_CLUSTER_DEFINED | 1177 PT_ATTR_CORE_DEFINED; 1178 work_attr[cxy][lid].cxy = cxy; 1179 work_attr[cxy][lid].lid = lid; 1180 1181 // compute pointer on thread[cxy][lid] arguments 1182 void * work_args = *((void **)work_args_array + (cxy * CORES_MAX) + lid); 1183 1184 // create thread 1185 if ( pthread_create( &trdid, // unused 1186 &work_attr[cxy][lid], 1187 work_func, 1188 work_args ) ) 1189 { 1190 printf("\n[ERROR] in %s : cannot create work thread[%x,%x]\n", 1191 __FUNCTION__ , cxy , lid ); 1192 1193 // report error to parent 1194 build_args[parent_cxy][level+1].error = 1; 1195 } 1196 1197 #if DEBUG_PTHREAD_PARALLEL 1198 printf("\n[%s] <build> thread[%x][%d] created <work> thread[%x][%d]\n", 1199 __FUNCTION__, cxy, level, cxy, lid ); 1200 #endif 1201 } 1202 1203 // wait on barrier until "work" children threads completed 1204 if( pthread_barrier_wait( &parent_barriers_array[cxy] ) ) 1205 { 1206 printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n", 1207 __FUNCTION__ , cxy , level ); 1208 1209 // report error to parent 1210 build_args[parent_cxy][level+1].error = 1; 1211 } 1212 1213 #if DEBUG_PTHREAD_PARALLEL 1214 printf("\n[%s] <build> thread[%x][%d] resume after children completion\n", 1215 __FUNCTION__, cxy, level ); 1216 #endif 1217 1218 } // end level == 0 1219 1220 
//////////////////////////////////////////////////////////// 1221 else // children are "build" threads 1222 { 1223 // the 4 children threads can be created in any core of each quarters 1224 // of the parent macro-cluster 1225 1226 unsigned int parent_x; // X coordinate of parent macro-cluster 1227 unsigned int parent_y; // Y coordinate of parent macro-cluster 1228 unsigned int child_x; // X coordinate of child macro-cluster 1229 unsigned int child_y; // Y coordinate of child macro-cluster 1230 unsigned int child_cxy[2][2]; // selected cluster for child thread 1231 unsigned int child_lid[2][2]; // selected core index for child thread 1232 int child_sts[2][2]; // -1 if error / 0 if success / +1 if not found 1233 unsigned int x; // X loop index for children 1234 unsigned int y; // Y loop index for children 1235 1236 unsigned int nb_children = 0; 1237 1238 // get parent macro-cluster mask and half-size from level 1239 unsigned int mask = (1 << level) - 1; 1240 unsigned int half = (level > 0) ? (1 << (level - 1)) : 0; 1241 1242 // get parent macro-cluster coordinates 1243 parent_x = HAL_X_FROM_CXY( cxy ) & ~mask; 1244 parent_y = HAL_Y_FROM_CXY( cxy ) & ~mask; 1245 1246 // get child_cxy and child_lid for up to 4 children threads : 00 / 01 / 10 / 11 1247 for (x = 0 ; x < 2 ; x++) 1248 { 1249 // compute child macro-cluster X coordinate 1250 child_x = (x == 0) ? parent_x : (parent_x + half); 1251 1252 for (y = 0 ; y < 2 ; y++) 1253 { 1254 // compute child macro-cluster Y coordinate 1255 child_y = (y == 0) ? 
parent_y : (parent_y + half); 1256 1257 // select the best core in macro-cluster 1258 child_sts[x][y] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ), 1259 level-1, 1260 &child_cxy[x][y], 1261 &child_lid[x][y] ); 1262 1263 if( child_sts[x][y] < 0 ) // failure => report error 1264 { 1265 printf("\n[ERROR] in %s : illegal arguments for <build> thread[%x,%x]\n", 1266 __FUNCTION__ , cxy , level ); 1267 1268 // report error to parent 1269 build_args[parent_cxy][level+1].error = 1; 1270 } 1271 else if (child_sts[x][y] > 0 ) // macro-cluster undefined => does nothing 1272 { 1273 } 1274 else // core found 1275 { 1276 nb_children++; 1277 } 1278 } // end for y 1279 } // end for x 1280 1281 // kill process if no active core in cluster 1282 // TODO this "if" should be replaced by an "assert" [AG] 1283 if( nb_children == 0 ) 1284 { 1285 printf("\n[PANIC] in %s : no active core in macro cluster [%x,%d]\n", 1286 __FUNCTION__ , cxy , level ); 1287 1288 // report error to parent 1289 build_args[parent_cxy][level+1].error = 1; 1290 1291 // kill process 1292 exit( EXIT_FAILURE ); 1293 } 1294 1295 // initialize the barrier for (nb_children + 1) 1296 if( pthread_barrier_init( &build_barrier[cxy][level], NULL , nb_children + 1 ) ) 1297 { 1298 printf("\n[error] in %s : cannot initialise barrier for build thread[%x][%d]\n", 1299 __FUNCTION__ , cxy , level ); 1300 1301 // report error to parent 1302 build_args[parent_cxy][level+1].error = 1; 1303 } 1304 1305 #if DEBUG_PTHREAD_PARALLEL 1306 printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n", 1307 __FUNCTION__, cxy, level, nb_children + 1 ); 1308 #endif 1309 // create 1 to 4 children threads 1310 for (x = 0 ; x < 2 ; x++) 1311 { 1312 for (y = 0 ; y < 2 ; y++) 1313 { 1314 // thread is created only if macro-cluster is active 1315 if( child_sts[x][y] == 0 ) 1316 { 1317 unsigned int tgt_cxy = child_cxy[x][y]; 1318 unsigned int tgt_lid = child_lid[x][y]; 1319 1320 // set child thread attributes 1321 
build_attr[tgt_cxy][level-1].attributes = PT_ATTR_DETACH | 1322 PT_ATTR_CLUSTER_DEFINED | 1323 PT_ATTR_CORE_DEFINED; 1324 build_attr[tgt_cxy][level-1].cxy = tgt_cxy; 1325 build_attr[tgt_cxy][level-1].lid = tgt_lid; 1326 1327 // propagate build function arguments 1328 build_args[tgt_cxy][level-1].cxy = child_cxy[x][y]; 1329 build_args[tgt_cxy][level-1].level = level-1; 1330 build_args[tgt_cxy][level-1].parent_cxy = cxy; 1331 build_args[tgt_cxy][level-1].root_level = root_level; 1332 build_args[tgt_cxy][level-1].work_func = work_func; 1333 build_args[tgt_cxy][level-1].work_args_array = work_args_array; 1334 build_args[tgt_cxy][level-1].parent_barriers_array = parent_barriers_array; 1335 1336 // create thread 1337 if( pthread_create( &trdid, 1338 &build_attr[tgt_cxy][level-1], 1339 &pthread_recursive_build, 1340 &build_args[tgt_cxy][level-1] ) ) 1341 { 1342 printf("\n[ERROR] in %s : cannot create build thread[%x][%d]\n", 1343 __FUNCTION__ , child_cxy , level -1 ); 1344 1345 // report error to parent 1346 build_args[parent_cxy][level+1].error = 1; 1347 } 1348 1349 #if DEBUG_PTHREAD_PARALLEL 1350 printf("\n[%s] <build> thread[%x][%d] created <build> thread[%x][%d] on core[%x,%d]\n", 1351 __FUNCTION__, cxy, level, tgt_cxy, level - 1, tgt_cxy, tgt_lid ); 1352 #endif 1353 } //end if sts[x][y] 1354 } // end for y 1355 } // end for x 1356 1357 // wait on barrier until "build" children threads completed 1358 if( pthread_barrier_wait( &build_barrier[cxy][level] ) ) 1359 { 1360 printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n", 1361 __FUNCTION__ , cxy , level ); 1362 1363 // report error to parent 1364 build_args[parent_cxy][level+1].error = 1; 1365 } 1366 1367 #if DEBUG_PTHREAD_PARALLEL 1368 printf("\n[%s] <build> thread[%x][%d] resume after children completion\n", 1369 __FUNCTION__, cxy, level ); 1370 #endif 1371 1372 } // end level > 0 1373 1374 // report error to parent when required 1375 if( build_args[cxy][level].error ) 1376 { 1377 
build_args[parent_cxy][level+1].error = 1; 1378 } 1379 1380 // all <build> threads - but the root - 1381 // signal completion to parent thread and exit 1382 if( level < root_level ) 1383 { 1384 if( pthread_barrier_wait( &build_barrier[parent_cxy][level+1] ) ) 1385 { 1386 printf("\n[ERROR] in %s / second barrier for <build> thread[%x][%d]\n", 1387 __FUNCTION__ , cxy , level ); 1388 1389 // report error to parent 1390 build_args[parent_cxy][level+1].error = 1; 1391 } 1392 1393 #if DEBUG_PTHREAD_PARALLEL 1394 printf("\n[%s] <build> thread[%x][%d] exit\n", 1395 __FUNCTION__, cxy , level ); 1396 #endif 1397 // "build" thread exit 1398 pthread_exit( NULL ); 1399 } 1400 } // end pthread_recursive_build() 1401 1402 /////////////////////////////////////////////////////// 1403 int pthread_parallel_create( unsigned int root_level, 1404 void * work_func, 1405 void * work_args_array, 1406 void * parent_barriers_array ) 1407 { 1408 unsigned int root_cxy; 1409 unsigned int root_lid; // unused, but required by get_core_id() 1410 1411 #if DEBUG_PTHREAD_PARALLEL 1412 printf("\n[%s] enter / root_level %d / func %x / args %x / barriers %x\n", 1413 __FUNCTION__, root_level, work_func, work_args_array, parent_barriers_array ); 1414 #endif 1415 1416 // get calling thread cluster 1417 get_core_id( &root_cxy , &root_lid ); 1418 1419 // set the build function arguments for the root <build> thread 1420 build_args[root_cxy][root_level].cxy = root_cxy; 1421 build_args[root_cxy][root_level].level = root_level; 1422 build_args[root_cxy][root_level].root_level = root_level; 1423 build_args[root_cxy][root_level].work_func = work_func; 1424 build_args[root_cxy][root_level].work_args_array = work_args_array; 1425 build_args[root_cxy][root_level].parent_barriers_array = parent_barriers_array; 1426 1427 // call the recursive build function 1428 pthread_recursive_build( &build_args[root_cxy][root_level] ); 1429 1430 // check error 1431 if( build_args[root_cxy][root_level].error ) 1432 { 1433 
printf("\n[error] in %s\n", __FUNCTION__ ); 1434 return -1; 1435 } 1436 1437 return 0; 1438 1439 } // end pthread_parallel_create() 1440 1441 1060 1442 1061 1443 // Local Variables: -
trunk/libs/libalmosmkh/almosmkh.h
r629 r637 2 2 * almosmkh.h - User level ALMOS-MKH specific library definition. 3 3 * 4 * Author Alain Greiner (2016,2017,2018 )4 * Author Alain Greiner (2016,2017,2018,2019) 5 5 * 6 6 * Copyright (c) UPMC Sorbonne Universites … … 72 72 73 73 /*************************************************************************************** 74 * This syscall returns the cluster an local index for the calling core. 74 * This syscall returns the cluster identifier and the local index 75 * for the calling core. 75 76 *************************************************************************************** 76 77 * @ cxy : [out] cluster identifier. … … 78 79 * @ return always 0. 79 80 **************************************************************************************/ 80 int get_core( unsigned int * cxy, 81 unsigned int * lid ); 81 int get_core_id( unsigned int * cxy, 82 unsigned int * lid ); 83 84 /*************************************************************************************** 85 * This syscall returns the number of cores in a given cluster. 86 *************************************************************************************** 87 * @ cxy : [in] target cluster identifier. 88 * @ ncores : [out] number of cores in target cluster. 89 * @ return always 0. 90 **************************************************************************************/ 91 int get_nb_cores( unsigned int cxy, 92 unsigned int * ncores ); 93 94 /*************************************************************************************** 95 * This syscall uses the DQDT to search, in a macro-cluster specified by the 96 * <cxy_base> and <level> arguments, the core with the lowest load. 97 * It writes in the <cxy> and <lid> buffers the selected core cluster identifier 98 * and the local core index. 99 *************************************************************************************** 100 * @ cxy_base : [in] any cluster identifier in macro-cluster.in clusters array. 
101 * @ level : [in] macro-cluster level in [1,2,3,4,5]. 102 * @ cxy : [out] selected core cluster identifier. 103 * @ lid : [out] selected core local index. 104 * @ return 0 if success / 1 if no core in macro-cluster / -1 if illegal arguments. 105 **************************************************************************************/ 106 int get_best_core( unsigned int cxy_base, 107 unsigned int level, 108 unsigned int * cxy, 109 unsigned int * lid ); 82 110 83 111 /*************************************************************************************** 84 * This function returns the calling core cycles counter,112 * This function returns the value contained in the calling core cycles counter, 85 113 * taking into account a possible overflow on 32 bits architectures. 86 114 *************************************************************************************** … … 414 442 unsigned int cxy ); 415 443 444 /********* Non standard (ALMOS-MKH specific) pthread_parallel_create() syscall *********/ 445 446 ////////////////////////////////////////////////////////////////////////////////////////// 447 // This system call can be used to parallelize the creation and the termination 448 // of a parallel multi-threaded application. It removes the loop in the main thread that 449 // creates the N working threads (N sequential pthread_create() ). It also removes the 450 // loop that waits completion of these N working threads (N sequential pthread_join() ). 451 // It creates one "work" thread (in detached mode) per core in the target architecture. 452 // Each "work" thread is identified by the [cxy][lid] indexes (cluster / local core). 453 // The pthread_parallel_create() function returns only when all "work" threads completed 454 // (successfully or not). 
455 // 456 // To use this system call, the application code must define the following structures: 457 // - To define the arguments to pass to the <work> function the application must allocate 458 // and initialize a first 2D array, indexed by [cxy] and [lid] indexes, where each slot 459 // contains an application specific structure, and another 2D array, indexed by the same 460 // indexes, containing pointers on these structures. This array of pointers is one 461 // argument of the pthread_parallel_create() function. 462 // - To detect the completion of the <work> threads, the application must allocate a 1D 463 // array, indexed by the cluster index [cxy], where each slot contains a pthread_barrier 464 // descriptor. This barrier is initialised by the pthread_parallel_create() function, 465 // in all clusters containing at least one work thread. This array of barriers is another 466 // argument of the pthread_parallel_create() function. 467 // 468 // Implementation note: 469 // To parallelize the "work" threads creation and termination, the pthread_parallel_create() 470 // function creates a distributed quad-tree (DQT) of "build" threads covering all cores 471 // required to execute the parallel application. 472 // Depending on the hardware topology, this DQT can be truncated, (i.e. some 473 // parent nodes can have fewer than 4 children), if (x_size != y_size), or if one size 474 // is not a power of 2. Each "build" thread is identified by two indexes [cxy][level]. 475 // Each "build" thread makes the following tasks: 476 // 1) It calls the pthread_create() function to create up to 4 children threads, that 477 // are "work" threads when (level == 0), or "build" threads, when (level > 0). 478 // 2) It initializes the barrier (global variable), used to block/unblock 479 // the parent thread until children completion. 480 // 3) It calls the pthread_barrier_wait( self ) to wait until all children threads 481 // completed (successfully or not). 
482 // 4) It calls the pthread_barrier_wait( parent ) to unblock the parent thread. 483 ////////////////////////////////////////////////////////////////////////////////////////// 484 485 /***************************************************************************************** 486 * This blocking function creates N working threads that execute the code defined 487 * by the <work_func> and <work_args> arguments. 488 * The number N of created threads is entirely defined by the <root_level> argument. 489 * This value defines an abstract quad-tree, with a square base : level in [0,1,2,3,4], 490 * side in [1,2,4,8,16], nclusters in [1,4,16,64,256]. This base is called macro_cluster. 491 * A working thread is created on all cores contained in the specified macro-cluster. 492 * The actual number of physical clusters containing cores can be smaller than the number 493 * of clusters covered by the quad tree. The actual number of cores in a cluster can be 494 * less than the max value. 495 * 496 * In the current implementation, all threads execute the same <work_func> function, 497 * on different arguments, that are specified as a 2D array of pointers <work_args>. 498 * This can be modified in a future version, where the <work_func> argument can become 499 * a 2D array of pointers, to have one specific function for each thread. 500 ***************************************************************************************** 501 * @ root_level : [in] DQT root level in [0,1,2,3,4]. 502 * @ work_func : [in] pointer on start function. 503 * @ work_args_array : [in] pointer on a 2D array of pointers. 504 * @ parent_barriers_array : [in] pointer on a 1D array of barriers. 505 * @ return 0 if success / return -1 if failure. 
506 ****************************************************************************************/ 507 int pthread_parallel_create( unsigned int root_level, 508 void * work_func, 509 void * work_args_array, 510 void * parent_barriers_array ); 511 416 512 #endif /* _LIBALMOSMKH_H_ */ 417 513 -
trunk/libs/libpthread/pthread.c
r619 r637 230 230 231 231 //////////////////////////////////////////////////////////////////////////////////////////// 232 // The following functions define another implementation for the POSX barrier 233 // based on a distributed quadtree implemented in user space, and relying 234 // on a busy waiting policy. 235 //////////////////////////////////////////////////////////////////////////////////////////// 236 237 238 //////////////////////////////////////////////////////////////////////////////////////////// 239 // This recursive function initializes the SQT nodes 240 // traversing the SQT from root to bottom 241 //////////////////////////////////////////////////////////////////////////////////////////// 242 static void sqt_barrier_build( pthread_barrier_t * barrier, 232 // The following functions define another implementation for the POSX barrier, based on 233 // a distributed quad tree implemented in user space, but using a busy waiting policy. 234 //////////////////////////////////////////////////////////////////////////////////////////// 235 236 237 //////////////////////////////////////////////////////////////////////////////////////////// 238 // This recursive function initializes the DQT nodes traversing the SQT from root to bottom 239 //////////////////////////////////////////////////////////////////////////////////////////// 240 static void dqt_barrier_build( pthread_barrier_t * barrier, 243 241 unsigned int x, 244 242 unsigned int y, 245 243 unsigned int level, 246 sqt_node_t * parent,244 dqt_node_t * parent, 247 245 unsigned int x_size, 248 246 unsigned int y_size, … … 250 248 { 251 249 // get target node address 252 sqt_node_t * node = barrier->node[x][y][level];250 dqt_node_t * node = barrier->node[x][y][level]; 253 251 254 252 if (level == 0 ) // terminal case … … 266 264 267 265 #if PTHREAD_BARRIER_DEBUG 268 printf("\n[BARRIER] %s : sqt_node[%d][%d][%d] / arity %d / desc %x\n"266 printf("\n[BARRIER] %s : dqt_node[%d][%d][%d] / arity %d / desc %x\n" 
269 267 "parent %x / child0 %x / child1 %x / child2 %x / child3 %x\n", 270 268 __FUNCTION__, x, y, level, node->arity, node, node->parent, … … 312 310 313 311 #if PTHREAD_BARRIER_DEBUG 314 printf("\n[BARRIER] %s : sqt_node[%d][%d][%d] / arity %d / desc %x\n"312 printf("\n[BARRIER] %s : dqt_node[%d][%d][%d] / arity %d / desc %x\n" 315 313 "parent %x / child0 %x / child1 %x / child2 %x / child3 %x\n", 316 314 __FUNCTION__, x, y, level, node->arity, node, node->parent, … … 322 320 { 323 321 if ( (cx[i] < x_size) && (cy[i] < y_size) ) 324 sqt_barrier_build( barrier,322 dqt_barrier_build( barrier, 325 323 cx[i], 326 324 cy[i], … … 332 330 } 333 331 } 334 } // end sqt_barrier_build()332 } // end dqt_barrier_build() 335 333 336 334 //////////////////////////////////////////////////////////////// … … 394 392 ( (l == 4) && ((x&0x0F) == 0) && ((y&0x0F) == 0) ) ) 395 393 { 396 sqt_node_t * node = remote_malloc( sizeof(sqt_node_t) , cxy );394 dqt_node_t * node = remote_malloc( sizeof(dqt_node_t) , cxy ); 397 395 398 396 if( node == NULL ) 399 397 { 400 printf("\n[ERROR] in %s : cannot allocate sqt_node in cluster %x\n",398 printf("\n[ERROR] in %s : cannot allocate dqt_node in cluster %x\n", 401 399 __FUNCTION__ , cxy ); 402 400 return -1; … … 411 409 412 410 // recursively initialize all SQT nodes from root to bottom 413 sqt_barrier_build( barrier,411 dqt_barrier_build( barrier, 414 412 0, 415 413 0, … … 428 426 ////////////////////////////////////////////////////////////////////////////////////////// 429 427 // This recursive function decrements the distributed "count" variables, 430 // traversing the SQT from bottom to root.428 // traversing the DQT from bottom to root. 431 429 // The last arrived thread reset the local node before returning. 
432 430 ////////////////////////////////////////////////////////////////////////////////////////// 433 static void sqt_barrier_decrement( sqt_node_t * node )431 static void dqt_barrier_decrement( dqt_node_t * node ) 434 432 { 435 433 … … 457 455 { 458 456 // decrement the parent node if the current node is not the root 459 if ( node->parent != NULL ) sqt_barrier_decrement( node->parent );457 if ( node->parent != NULL ) dqt_barrier_decrement( node->parent ); 460 458 461 459 #if PTHREAD_BARRIER_DEBUG … … 484 482 return; 485 483 } 486 } // end sqt_barrier_decrement()484 } // end dqt_barrier_decrement() 487 485 488 486 /////////////////////////////////////////////////////// … … 504 502 505 503 // recursively decrement count from bottom to root 506 sqt_barrier_decrement( barrier->node[x][y][0] );504 dqt_barrier_decrement( barrier->node[x][y][0] ); 507 505 508 506 hal_user_fence(); -
trunk/libs/libpthread/pthread.h
r632 r637 2 2 * pthread.h - User level <pthread> library definition. 3 3 * 4 * Author Alain Greiner (2016,2017,2018 )4 * Author Alain Greiner (2016,2017,2018,2019) 5 5 * 6 6 * Copyright (c) UPMC Sorbonne Universites -
trunk/libs/mini-libc/stdio.h
r623 r637 2 2 * stdio.h - User level <stdio> library definition. 3 3 * 4 * Author Alain Greiner (2016,2017,2018 )4 * Author Alain Greiner (2016,2017,2018,2019) 5 5 * 6 6 * Copyright (c) UPMC Sorbonne Universites -
trunk/libs/mini-libc/stdlib.c
r589 r637 148 148 void * malloc( unsigned int size ) 149 149 { 150 // get cluster identifier 151 unsigned int cxy; 152 unsigned int lid; 153 get_core( &cxy , &lid ); 150 unsigned int cxy; 151 unsigned int lid; 152 153 // get cluster identifier 154 get_core_id( &cxy , &lid ); 154 155 155 156 return remote_malloc( size, cxy ); … … 160 161 unsigned int size ) 161 162 { 162 // get calling core cluster identifier 163 unsigned int cxy; 164 unsigned int lid; 165 get_core( &cxy , &lid ); 163 unsigned int cxy; 164 unsigned int lid; 165 166 // get cluster identifier 167 get_core_id( &cxy , &lid ); 166 168 167 169 return remote_calloc( count , size , cxy ); … … 172 174 unsigned int size ) 173 175 { 174 // get calling core cluster identifier 175 unsigned int cxy; 176 unsigned int lid; 177 get_core( &cxy , &lid ); 176 unsigned int cxy; 177 unsigned int lid; 178 179 // get cluster identifier 180 get_core_id( &cxy , &lid ); 178 181 179 182 return remote_realloc( ptr , size , cxy ); … … 183 186 void free( void * ptr ) 184 187 { 185 // get calling core cluster identifier 186 unsigned int cxy; 187 unsigned int lid; 188 get_core( &cxy , &lid ); 188 unsigned int cxy; 189 unsigned int lid; 190 191 // get cluster identifier 192 get_core_id( &cxy , &lid ); 189 193 190 194 remote_free( ptr , cxy );
Note: See TracChangeset
for help on using the changeset viewer.