source: trunk/libs/libalmosmkh/almosmkh.h @ 639

Last change on this file since 639 was 637, checked in by alain, 5 years ago

Introduce the non-standard pthread_parallel_create() system call
and re-write the <fft> and <sort> applications to improve the
intrinsic paralelism in applications.

File size: 28.9 KB
Line 
1/*
2 * almosmkh.h - User level ALMOS-MKH specific library definition.
3 *
4 * Author     Alain Greiner (2016,2017,2018,2019)
5 *
6 * Copyright (c) UPMC Sorbonne Universites
7 *
8 * This file is part of ALMOS-MKH.
9 *
10 * ALMOS-MKH is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2.0 of the License.
13 *
14 * ALMOS-MKH is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with ALMOS-MKH; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#ifndef _LIBALMOSMKH_H_
25#define _LIBALMOSMKH_H_
26
27/***************************************************************************************
28 * This file defines a user level, ALMOS-MKH specific library, containing:
29 * - non standard system calls.
30 * - debug functions.
31 * - remote malloc extensions.
32 **************************************************************************************/
33
34#include <pthread.h>
35#include <shared_almos.h>
36
37/****************** Non standard (ALMOS_MKH specific) system calls ********************/
38
39
40/***************************************************************************************
41 * This syscall gives the process identified by the <pid> argument the exclusive
42 * ownership of its TXT terminal.
43 ***************************************************************************************
44 * @ pid        : [in] process identifier.
45 * @ returns 0 if success / returns -1 if process not found.
46 **************************************************************************************/
47int fg( unsigned int pid );
48
49/***************************************************************************************
50 * This syscall stores in the buffer identified by the <owner> argument a non zero
51 * value when the process identified by the <pid> argument is currently the exclusive
52 * owner of its TXT terminal.
53 ***************************************************************************************
54 * @ pid        : [in]  process identifier.
55 * @ owner      : [out] pointer on buffer to store the ownership flag (non zero if owner).
56 * @ returns 0 if success / returns -1 if process not found.
57 **************************************************************************************/
58int is_fg( unsigned int pid,
59           unsigned int * owner );
60
61/***************************************************************************************
62 * This syscall returns the hardware platform parameters in the three output buffers.
63 ***************************************************************************************
64 * @ x_size   : [out] number of clusters in a row.
65 * @ y_size   : [out] number of clusters in a column.
66 * @ ncores   : [out] number of cores per cluster.
67 * @ return always 0.
68 **************************************************************************************/
69int get_config( unsigned int * x_size,
70                unsigned int * y_size,
71                unsigned int * ncores );
72
73/***************************************************************************************
74 * This syscall returns, in the <cxy> and <lid> buffers, the cluster identifier
75 * and the local index of the calling core.
76 ***************************************************************************************
77 * @ cxy      : [out] cluster identifier.
78 * @ lid      : [out] core local index in cluster.
79 * @ return always 0.
80 **************************************************************************************/
81int get_core_id( unsigned int * cxy,
82                 unsigned int * lid );
83
84/***************************************************************************************
85 * This syscall returns, in the <ncores> buffer, the number of cores in the cluster
   * identified by the <cxy> argument.
86 ***************************************************************************************
87 * @ cxy      : [in]  target cluster identifier.
88 * @ ncores   : [out] number of cores in target cluster.
89 * @ return always 0.
90 **************************************************************************************/
91int get_nb_cores( unsigned int   cxy,
92                  unsigned int * ncores );
93
94/***************************************************************************************
95 * This syscall uses the DQDT to search, in a macro-cluster specified by the
96 * <cxy_base> and <level> arguments, the core with the lowest load.
97 * It writes in the <cxy> and <lid> buffers the selected core cluster identifier
98 * and the local core index.
99 ***************************************************************************************
100 * @ cxy_base : [in]  any cluster identifier in the macro-cluster.
101 * @ level    : [in]  macro-cluster level in [1,2,3,4,5].
102 * @ cxy      : [out] selected core cluster identifier.
103 * @ lid      : [out] selected core local index.
104 * @ return 0 if success / 1 if no core in macro-cluster / -1 if illegal arguments.
105 **************************************************************************************/
106int get_best_core( unsigned int   cxy_base,
107                   unsigned int   level,
108                   unsigned int * cxy,
109                   unsigned int * lid );
110
111/***************************************************************************************
112 * This function returns, in the <cycle> buffer, the value contained in the calling core
113 * cycles counter, taking into account a possible overflow on 32 bits architectures.
114 ***************************************************************************************
115 * @ cycle    : [out] current cycle value (64 bits).
116 * @ return always 0.
117 **************************************************************************************/
118int get_cycle( unsigned long long * cycle );
119
120/***************************************************************************************
121 * This syscall allows the calling thread to specify the target cluster for
122 * a subsequent fork(). It must be called before each fork() that must be placed.
123 ***************************************************************************************
124 * @ cxy      : [in] target cluster identifier.
125 * @ return 0 if success / return -1 if illegal cxy argument.
126 **************************************************************************************/
127int place_fork( unsigned int cxy );
128
129/***************************************************************************************
130 * This syscall implements the operations related to User Thread Local Storage.
131 ***************************************************************************************
132 * @ operation  : [in] UTLS operation type as defined in the "shared_syscalls.h" file
    *                (NOTE(review): header name was spelled "shared_sycalls.h"; confirm).
133 * @ value      : [in] argument value for the UTLS_SET operation.
134 * @ return value for the UTLS_GET and UTLS_GET_ERRNO / return -1 if failure.
135 **************************************************************************************/
136int utls( unsigned int operation,
137          unsigned int value );
138
139/***************************************************************************************
140 * This syscall returns an unsigned 32 bits integer from the standard "stdin" stream.
141 * Both decimal numbers and hexadecimal numbers (prefixed by 0x) are supported.
142 ***************************************************************************************
143 * @ returns the integer value if success / returns -1 if failure.
    *   NOTE: the return type being unsigned, the failure value -1 is seen by the caller
    *   as the largest unsigned int value, and cannot be distinguished from that input.
144 **************************************************************************************/
145unsigned int get_uint32( void );
146
147
148/***************** Non standard (ALMOS-MKH specific) debug functions ******************/
149
150
151/***************************************************************************************
152 * This debug syscall displays on the kernel terminal TXT0 the thread / process / core
153 * identifiers, the current cycle, plus a user defined message, as specified
154 * by the <string> argument.
155 ***************************************************************************************
156 * @ string    : [in] user defined message.
157 **************************************************************************************/
158void display_string( char * string );
159
160/***************************************************************************************
161 * This debug function displays on the kernel terminal TXT0
162 * the state of the VMM for the process <pid> in cluster <cxy>.
163 * It can be called by any thread running in any cluster.
164 ***************************************************************************************
165 * @ cxy      : [in] target cluster identifier.
166 * @ pid      : [in] process identifier.
167 * @ return 0 if success / return -1 if illegal argument.
168 **************************************************************************************/
169int display_vmm(unsigned int cxy, unsigned int pid );
170
171/***************************************************************************************
172 * This debug syscall displays on the kernel terminal TXT0 the state of the scheduler
173 * attached to the core identified by the <cxy> and <lid> arguments.
174 * It can be called by any thread running in any cluster.
175 ***************************************************************************************
176 * @ cxy      : [in] target cluster identifier.
177 * @ lid      : [in] target core local index.
178 * @ return 0 if success / return -1 if illegal arguments.
179 **************************************************************************************/
180int display_sched( unsigned int  cxy,
181                   unsigned int  lid );
182
183/***************************************************************************************
184 * This debug syscall displays on the kernel terminal TXT0 the list of processes
185 * registered in the cluster identified by the <cxy> argument.
186 * Only the owned processes are displayed when the <owned> argument is non zero.
187 * It can be called by any thread running in any cluster.
188 ***************************************************************************************
189 * @ cxy      : [in] target cluster identifier.
190 * @ owned    : [in] only owned processes if non zero.
191 * @ return 0 if success / return -1 if illegal argument.
192 **************************************************************************************/
193int display_cluster_processes( unsigned int  cxy,
194                               unsigned int  owned );
195
196/***************************************************************************************
197 * This debug syscall displays on the kernel terminal TXT0
198 * the list of processes attached to a given TXT channel.
199 * It can be called by any thread running in any cluster.
200 ***************************************************************************************
201 * @ txt_id   : [in] TXT terminal index.
202 * @ return 0 if success / return -1 if illegal argument.
203 **************************************************************************************/
204int display_txt_processes( unsigned int txt_id );
205
206/***************************************************************************************
207 * This debug syscall displays on the kernel terminal TXT0 the set of busylocks
208 * held by the thread identified by the <pid> and <trdid> arguments.
209 * It can be called by any thread running in any cluster.
210 ***************************************************************************************
211 * @ pid      : [in] process identifier.
212 * @ trdid    : [in] thread identifier.
213 * @ return 0 if success / return -1 if illegal arguments.
214 **************************************************************************************/
215int display_busylocks( unsigned int pid,
216                       unsigned int trdid );
217
218/***************************************************************************************
219 * This debug syscall displays on the kernel terminal TXT0
220 * the list of channel devices available in the architecture.
221 * It takes no argument, and can be called by any thread running in any cluster.
222 ***************************************************************************************
223 * @ return always 0.
224 **************************************************************************************/
225int display_chdev( void );
226
227/***************************************************************************************
228 * This debug syscall displays on the kernel terminal TXT0
229 * the list of channel devices or pseudo-files registered in the VFS cache.
    * NOTE(review): "channel devices" may be a copy-paste from display_chdev(); confirm
    * what the VFS cache display actually contains.
230 * It can be called by any thread running in any cluster.
231 ***************************************************************************************
232 * @ return always 0.
233 **************************************************************************************/
234int display_vfs( void );
235
236/***************************************************************************************
237 * This debug syscall displays on the kernel terminal TXT0 the current DQDT state.
238 * It takes no argument, and can be called by any thread running in any cluster.
239 ***************************************************************************************
240 * @ return always 0.
241 **************************************************************************************/
242int display_dqdt( void );
243
244/***************************************************************************************
245 * This debug syscall displays on the kernel terminal TXT0 the content of a given
246 * page of a given VFS mapper.
247 * It can be called by any thread running in any cluster.
248 ***************************************************************************************
249 * @ path      : [in] pathname identifying the file/directory in VFS.
250 * @ page_id   : [in] page index in file.
251 * @ nbytes    : [in] number of bytes to display.
252 * @ return 0 if success / return -1 if file or page not found.
253 **************************************************************************************/
254int display_mapper( char        * path,
255                    unsigned int  page_id,
256                    unsigned int  nbytes);
257
258/***************************************************************************************
259 * This debug syscall displays on the kernel terminal TXT0
260 * the state of the barrier used by the process identified by the <pid> argument.
261 * It can be called by any thread running in any cluster.
262 ***************************************************************************************
263 * @ pid      : [in] process identifier.
264 * @ return 0 if success / return -1 if illegal argument.
265 **************************************************************************************/
266int display_barrier( unsigned int pid );
267
268/***************************************************************************************
269 * This debug syscall displays on the kernel terminal TXT0 the content of one given
270 * page of the FAT mapper.
271 * It can be called by any thread running in any cluster.
272 ***************************************************************************************
273 * @ page_id    : [in] page index in the FAT mapper.
274 * @ nb_entries : [in] number of bytes to display.
    *                NOTE(review): the name suggests a number of FAT entries; confirm unit.
275 * @ return 0 if success / return -1 if page not found.
276 **************************************************************************************/
277int display_fat( unsigned int page_id,
278                 unsigned int nb_entries );
279
280/*****************************************************************************************
281* This debug syscall is used to activate / deactivate the context switches trace
282* for a core identified by the <cxy> and <lid> arguments.
283* It can be called by any thread running in any cluster.
284*****************************************************************************************
285* @ active     : [in] activate trace if non zero / deactivate if zero.
286* @ cxy        : [in] cluster identifier.
287* @ lid        : [in] core local index.
288* @ returns 0 if success / returns -1 if illegal arguments.
289****************************************************************************************/
290int trace( unsigned int active,
291           unsigned int cxy, 
292           unsigned int lid );
293
294/****************************************************************************************
295 * This syscall implements a user-level interactive debugger that can be
296 * introduced in any user application to display various kernel distributed structures.
    * It takes no argument and returns no value.
297 ***************************************************************************************/
298void idbg( void );
299
300
301/****************** Non standard (ALMOS-MKH specific) malloc operations  ***************/
302
303/////////////////////////////////////////////////////////////////////////////////////////
304// General principles:
305// - In user space the HEAP zone spreads between the ELF zone and the STACK zone,
306//   as defined in the kernel_config.h file.
307// - The malloc library uses the mmap() syscall to create - on demand -
308//   one vseg in a given cluster. The size of this vseg is defined below
309//   by the MALLOC_LOCAL_STORE_SIZE parameter.
310// - For a standard malloc(), the target cluster is the cluster containing
311//   the core running the client thread.
312// - For a remote_malloc(), the target cluster is explicitly defined
313//   by the argument.
314// - In each cluster, the available storage in virtual space is handled by a
315//   local allocator using the buddy algorithm.
316//
317// TODO : In this first implementation one single - fixed size - vseg
318//        is allocated on demand in each cluster.
319//        We should introduce the possibility to dynamically allocate
320//        several vsegs in each cluster, using several mmap when required.
321/////////////////////////////////////////////////////////////////////////////////////////
322// Free blocks organisation in each cluster :
323// - All free blocks have a size that is a power of 2, larger or equal
324//   to MALLOC_MIN_BLOCK_SIZE (typically 64 bytes).
325// - All free blocks are aligned.
326// - They are pre-classed in an array of linked lists, where all blocks in a
327//   given list have the same size.
328// - The NEXT pointer implementing those linked lists is written
329//   in the first bytes of the block itself, using the unsigned int type.
330// - The pointers on the first free block for each size are stored in an
331//   array of pointers free[32] in the storage(x,y) descriptor.
332/////////////////////////////////////////////////////////////////////////////////////////
333// Allocation policy:
334// - The block size required by the user can be any value, but the allocated
335//   block size can be larger than the requested size:
336// - The allocator computes actual_size, that is the smallest power of 2
337//   value larger or equal to the requested size AND larger or equal to
338//   MALLOC_MIN_BLOCK_SIZE.
339// - It pops the linked list of free blocks corresponding to actual_size,
340//   and returns the block B if the list[actual_size] is not empty.
341// - If the list[actual_size] is empty, it pops the list[actual_size * 2].
342//   If a block B is found, it breaks this block in 2 B/2 blocks, returns
343//   the first B/2 block and pushes the other B/2 block into list[actual_size].
344// - If the list[actual_size * 2] is empty, it pops the list[actual_size * 4].
345//   If a block B is found, it breaks this block in 3 blocks B/4, B/4 and B/2,
346//   returns the first B/4 block, and pushes the other blocks B/4 and B/2 into
347//   the proper lists. etc...
348// - If no block satisfying the request is available it returns a failure
349//   (NULL pointer).
350// - This allocation policy has the nice following property:
351//   If the vseg is aligned (the vseg base is a multiple of the
352//   vseg size), all allocated blocks are aligned on the actual_size.
353/////////////////////////////////////////////////////////////////////////////////////////
354// Free policy:
355// - Each allocated block is registered in an alloc[] array of unsigned char.
356// - This registration is required by the free() operation, because the size
357//   of the allocated block must be obtained from the base address of the block. 
358// - The number of entries in this array is equal to the max number
359//   of allocated blocks : MALLOC_LOCAL_STORE_SIZE / MALLOC_MIN_BLOCK_SIZE.
360// - For each allocated block, the value registered in the alloc[] array
361//   is log2( size_of_allocated_block ).
362// - The index in this array is computed from the allocated block base address:
363//      index = (block_base - vseg_base) / MALLOC_MIN_BLOCK_SIZE
364// - The alloc[] array is stored at the end of the heap segment. This consumes
365//   (1 / MALLOC_MIN_BLOCK_SIZE) of the total storage capacity.
366/////////////////////////////////////////////////////////////////////////////////////////
367
368
369#define MALLOC_INITIALIZED         0xBABEF00D   // magic number set in store descriptor when initialised
370#define MALLOC_MIN_BLOCK_SIZE      0x40         // smallest block size : 64 bytes
371#define MALLOC_LOCAL_STORE_SIZE    0x800000     // size of the vseg created in each cluster : 8 Mbytes
372#define MALLOC_MAX_CLUSTERS        0x100        // max number of clusters : 256
373
374/////////////////////////////////////////////////////////////////////////////////////////
375//   store(x,y) descriptor (one per cluster) : it describes the local heap vseg,
//   the free blocks lists, and the alloc[] array used by the free() operation.
//   All addresses are stored as unsigned int values.
376/////////////////////////////////////////////////////////////////////////////////////////
377
378typedef struct malloc_store_s
379{
380    pthread_mutex_t mutex;           // lock protecting exclusive access to local heap
381    unsigned int    initialized;     // store is initialised when == MALLOC_INITIALIZED
382    unsigned int    cxy;             // identifier of cluster containing the store
383    unsigned int    store_base;      // store base address
384    unsigned int    store_size;      // store size (bytes)
385    unsigned int    alloc_base;      // alloc[] array base address (one byte per min block)
386    unsigned int    alloc_size;      // alloc[] array size (bytes)
387    unsigned int    free[32];        // addresses of first free block (one list per size)
388} 
389malloc_store_t;
390
391/*****************************************************************************************
392 * This function allocates <size> bytes of memory in user space, and returns a pointer
393 * to the allocated buffer. The physical memory is allocated from the store located in
394 * the cluster identified by the <cxy> argument.
395 *****************************************************************************************
396 * @ size    : [in] number of requested bytes.
397 * @ cxy     : [in] target cluster identifier.
398 * @ returns a pointer on the allocated buffer if success / returns NULL if failure
399 ****************************************************************************************/
400void * remote_malloc( unsigned int size, 
401                      unsigned int cxy );
402
403/*****************************************************************************************
404 * This function releases the memory buffer identified by the <ptr> argument,
405 * to the store located in the cluster identified by the <cxy> argument.
406 * It displays an error message, but does nothing if the ptr is illegal.
407 *****************************************************************************************
408 * @ ptr   : [in] pointer on the released buffer.
409 * @ cxy   : [in] target cluster identifier.
410 ****************************************************************************************/
411void remote_free( void        * ptr,
412                  unsigned int  cxy );
413
414/*****************************************************************************************
415 * This function releases the memory buffer identified by the <ptr> argument,
416 * to the store located in the cluster identified by the <cxy> argument, and allocates
417 * a new buffer containing <size> bytes from this store.
418 * The content of the old buffer is copied to the new buffer, up to <size> bytes.
419 * It displays an error message, but does nothing if the ptr is illegal.
420 *****************************************************************************************
421 * @ ptr     : [in] pointer on the released buffer.
422 * @ size    : [in] new buffer requested size (bytes).
423 * @ cxy     : [in] target cluster identifier.
424 * @ return a pointer on allocated buffer if success / return NULL if failure
425 ****************************************************************************************/
426void * remote_realloc( void        * ptr,
427                       unsigned int  size,
428                       unsigned int  cxy );
429
430/*****************************************************************************************
431 * This function allocates enough space for <count> objects of <size> bytes each,
432 * from the store located in the cluster identified by the <cxy> argument.
433 * The allocated memory is filled with bytes of value zero.
434 *****************************************************************************************
435 * @ count   : [in] number of requested objects.
436 * @ size    : [in] number of bytes per object.
437 * @ cxy     : [in] target cluster identifier.
438 * @ returns a pointer on allocated buffer if success / returns NULL if failure
439 ****************************************************************************************/
440void * remote_calloc( unsigned int count,
441                      unsigned int size,
442                      unsigned int cxy );
443
444/********* Non standard (ALMOS-MKH specific) pthread_parallel_create() syscall  *********/
445
446//////////////////////////////////////////////////////////////////////////////////////////
447// This system call can be used to parallelize the creation and the termination
448// of a parallel multi-threaded application. It removes the loop in the main thread that
449// creates the N working threads (N sequential pthread_create() ). It also removes the
450// loop that waits completion of these N working threads (N sequential pthread_join() ).
451// It creates one "work" thread (in detached mode) per core in the target architecture.
452// Each "work" thread is identified by the [cxy][lid] indexes (cluster / local core).
453// The pthread_parallel_create() function returns only when all "work" threads completed
454// (successfully or not).
455//
456// To use this system call, the application code must define the following structures:
457// - To define the arguments to pass to the <work> function the application must allocate
458//   and initialize a first 2D array, indexed by [cxy] and [lid] indexes, where each slot
459//   contains an application specific structure, and another 2D array, indexed by the same
460//   indexes, containing pointers on these structures. This array of pointers is one
461//   argument of the pthread_parallel_create() function.
462// - To detect the completion of the <work> threads, the application must allocate a 1D
463//   array, indexed by the cluster index [cxy], where each slot contains a pthread_barrier
464//   descriptor. This barrier is initialised by the pthread_parallel_create() function,
465//   in all clusters containing at least one work thread. This array of barriers is another
466//   argument of the pthread_parallel_create() function.
467//
468// Implementation note:
469// To parallelize the "work" threads creation and termination, the pthread_parallel_create()
470// function creates a distributed quad-tree (DQT) of "build" threads covering all cores
471// required to execute the parallel application.
472// Depending on the hardware topology, this DQT can be truncated, (i.e. some
473// parent nodes can have less than 4 children), if (x_size != y_size), or if one size
474// is not a power of 2. Each "build" thread is identified by two indexes [cxy][level].
475// Each "build" thread makes the following tasks:
476// 1) It calls the pthread_create() function to create up to 4 children threads, that
477//    are "work" threads when (level == 0), or "build" threads, when (level > 0).
478// 2) It initializes the barrier (global variable), used to block/unblock
479//    the parent thread until children completion.
480// 3) It calls the pthread_barrier_wait( self ) to wait until all children threads
481//    completed (successfully or not).
482// 4) It calls the pthread_barrier_wait( parent ) to unblock the parent thread.
483//////////////////////////////////////////////////////////////////////////////////////////
484
485/*****************************************************************************************
486 * This blocking function creates N working threads that execute the code defined
487 * by the <work_func> and <work_args_array> arguments.
488 * The number N of created threads is entirely defined by the <root_level> argument.
489 * This value defines an abstract quad-tree, with a square base : level in [0,1,2,3,4],
490 * side in [1,2,4,8,16], nclusters in [1,4,16,64,256]. This base is called macro-cluster.
491 * A working thread is created on all cores contained in the specified macro-cluster.
492 * The actual number of physical clusters containing cores can be smaller than the number
493 * of clusters covered by the quad tree. The actual number of cores in a cluster can be
494 * less than the max value.
495 *
496 * In the current implementation, all threads execute the same <work_func> function,
497 * on different arguments, that are specified as a 2D array of pointers <work_args_array>.
498 * This can be modified in a future version, where the <work_func> argument can become
499 * a 2D array of pointers, to have one specific function for each thread.
500 *****************************************************************************************
501 * @ root_level            : [in]  DQT root level in [0,1,2,3,4].
502 * @ work_func             : [in]  pointer on start function.
503 * @ work_args_array       : [in]  pointer on a 2D array of pointers, indexed by [cxy][lid].
504 * @ parent_barriers_array : [in]  pointer on a 1D array of barriers, indexed by [cxy].
505 * @ return 0 if success / return -1 if failure.
506 ****************************************************************************************/
507int pthread_parallel_create( unsigned int   root_level,
508                             void         * work_func,
509                             void         * work_args_array,
510                             void         * parent_barriers_array );
511
512#endif /* _LIBALMOSMKH_H_ */
513
Note: See TracBrowser for help on using the repository browser.