1 | /* |
---|
2 | * remote_barrier.c - POSIX barrier implementation. |
---|
3 | * |
---|
4 | * Author Alain Greiner (2016,2017,2018,2019) |
---|
5 | * |
---|
6 | * Copyright (c) UPMC Sorbonne Universites |
---|
7 | * |
---|
8 | * This file is part of ALMOS-MKH. |
---|
9 | * |
---|
10 | * ALMOS-MKH is free software; you can redistribute it and/or modify it |
---|
11 | * under the terms of the GNU General Public License as published by |
---|
12 | * the Free Software Foundation; version 2.0 of the License. |
---|
13 | * |
---|
14 | * ALMOS-MKH is distributed in the hope that it will be useful, but |
---|
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
17 | * General Public License for more details. |
---|
18 | * |
---|
19 | * You should have received a copy of the GNU General Public License |
---|
20 | * along with ALMOS-MKH; if not, write to the Free Software Foundation, |
---|
21 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
---|
22 | */ |
---|
23 | |
---|
24 | #include <hal_kernel_types.h> |
---|
25 | #include <hal_macros.h> |
---|
26 | #include <hal_remote.h> |
---|
27 | #include <hal_irqmask.h> |
---|
28 | #include <remote_busylock.h> |
---|
29 | #include <thread.h> |
---|
30 | #include <kmem.h> |
---|
31 | #include <printk.h> |
---|
32 | #include <process.h> |
---|
33 | #include <vmm.h> |
---|
34 | #include <remote_barrier.h> |
---|
35 | |
---|
36 | //////////////////////////////////////////////////// |
---|
// generic (implementation independent) functions
---|
38 | //////////////////////////////////////////////////// |
---|
39 | |
---|
40 | /////////////////////////////////////////////////// |
---|
41 | xptr_t generic_barrier_from_ident( intptr_t ident ) |
---|
42 | { |
---|
43 | // get pointer on local process_descriptor |
---|
44 | process_t * process = CURRENT_THREAD->process; |
---|
45 | |
---|
46 | // get pointers on reference process |
---|
47 | xptr_t ref_xp = process->ref_xp; |
---|
48 | cxy_t ref_cxy = GET_CXY( ref_xp ); |
---|
49 | process_t * ref_ptr = (process_t *)GET_PTR( ref_xp ); |
---|
50 | |
---|
51 | // get extended pointer on root of barriers list |
---|
52 | xptr_t root_xp = XPTR( ref_cxy , &ref_ptr->barrier_root ); |
---|
53 | |
---|
54 | // scan reference process barriers list |
---|
55 | xptr_t iter_xp; |
---|
56 | xptr_t barrier_xp; |
---|
57 | cxy_t barrier_cxy; |
---|
58 | generic_barrier_t * barrier_ptr; |
---|
59 | intptr_t current; |
---|
60 | bool_t found = false; |
---|
61 | |
---|
62 | XLIST_FOREACH( root_xp , iter_xp ) |
---|
63 | { |
---|
64 | barrier_xp = XLIST_ELEMENT( iter_xp , generic_barrier_t , list ); |
---|
65 | barrier_cxy = GET_CXY( barrier_xp ); |
---|
66 | barrier_ptr = (generic_barrier_t *)GET_PTR( barrier_xp ); |
---|
67 | current = (intptr_t)hal_remote_lpt( XPTR( barrier_cxy , &barrier_ptr->ident ) ); |
---|
68 | if( ident == current ) |
---|
69 | { |
---|
70 | found = true; |
---|
71 | break; |
---|
72 | } |
---|
73 | } |
---|
74 | |
---|
75 | if( found == false ) return XPTR_NULL; |
---|
76 | else return barrier_xp; |
---|
77 | |
---|
78 | } // end generic_barrier_from_ident() |
---|
79 | |
---|
80 | ////////////////////////////////////////////////////////////// |
---|
81 | error_t generic_barrier_create( intptr_t ident, |
---|
82 | uint32_t count, |
---|
83 | pthread_barrierattr_t * attr ) |
---|
84 | { |
---|
85 | xptr_t gen_barrier_xp; // extended pointer on generic barrier descriptor |
---|
86 | generic_barrier_t * gen_barrier_ptr; // local pointer on generic barrier descriptor |
---|
87 | void * barrier; // local pointer on implementation barrier descriptor |
---|
88 | kmem_req_t req; // kmem request |
---|
89 | |
---|
90 | // get pointer on local process_descriptor |
---|
91 | process_t * process = CURRENT_THREAD->process; |
---|
92 | |
---|
93 | // get pointers on reference process |
---|
94 | xptr_t ref_xp = process->ref_xp; |
---|
95 | cxy_t ref_cxy = GET_CXY( ref_xp ); |
---|
96 | process_t * ref_ptr = (process_t *)GET_PTR( ref_xp ); |
---|
97 | |
---|
98 | // allocate memory for generic barrier descriptor |
---|
99 | if( ref_cxy == local_cxy ) // reference cluster is local |
---|
100 | { |
---|
101 | req.type = KMEM_GEN_BARRIER; |
---|
102 | req.flags = AF_ZERO; |
---|
103 | gen_barrier_ptr = kmem_alloc( &req ); |
---|
104 | gen_barrier_xp = XPTR( local_cxy , gen_barrier_ptr ); |
---|
105 | } |
---|
106 | else // reference cluster is remote |
---|
107 | { |
---|
108 | rpc_kcm_alloc_client( ref_cxy, |
---|
109 | KMEM_GEN_BARRIER, |
---|
110 | &gen_barrier_xp ); |
---|
111 | gen_barrier_ptr = GET_PTR( gen_barrier_xp ); |
---|
112 | } |
---|
113 | |
---|
114 | if( gen_barrier_ptr == NULL ) |
---|
115 | { |
---|
116 | printk("\n[ERROR] in %s : cannot create generic barrier\n", __FUNCTION__ ); |
---|
117 | return -1; |
---|
118 | } |
---|
119 | |
---|
120 | // create implementation specific barrier descriptor |
---|
121 | if( attr == NULL ) // simple barrier implementation |
---|
122 | { |
---|
123 | // create simple barrier descriptor |
---|
124 | barrier = simple_barrier_create( count ); |
---|
125 | |
---|
126 | if( barrier == NULL ) |
---|
127 | { |
---|
128 | printk("\n[ERROR] in %s : cannot create simple barrier\n", __FUNCTION__); |
---|
129 | return -1; |
---|
130 | } |
---|
131 | } |
---|
132 | else // QDT barrier implementation |
---|
133 | { |
---|
134 | uint32_t x_size = attr->x_size; |
---|
135 | uint32_t y_size = attr->y_size; |
---|
136 | uint32_t nthreads = attr->nthreads; |
---|
137 | |
---|
138 | // check attributes / count |
---|
139 | if( (x_size * y_size * nthreads) != count ) |
---|
140 | { |
---|
141 | printk("\n[ERROR] in %s : count(%d) != x_size(%d) * y_size(%d) * nthreads(%d)\n", |
---|
142 | __FUNCTION__, count, x_size, y_size, nthreads ); |
---|
143 | return -1; |
---|
144 | } |
---|
145 | |
---|
146 | // create DQT barrier descriptor |
---|
147 | barrier = dqt_barrier_create( x_size , y_size , nthreads ); |
---|
148 | |
---|
149 | if( barrier == NULL ) |
---|
150 | { |
---|
151 | printk("\n[ERROR] in %s : cannot create DQT barrier descriptor\n", __FUNCTION__); |
---|
152 | return -1; |
---|
153 | } |
---|
154 | } |
---|
155 | |
---|
156 | // initialize the generic barrier descriptor |
---|
157 | hal_remote_spt( XPTR( ref_cxy , &gen_barrier_ptr->ident ) , (void*)ident ); |
---|
158 | hal_remote_s32( XPTR( ref_cxy , &gen_barrier_ptr->is_dqt ) , (attr != NULL) ); |
---|
159 | hal_remote_spt( XPTR( ref_cxy , &gen_barrier_ptr->extend ) , barrier ); |
---|
160 | |
---|
161 | // build extended pointers on lock, root and entry for reference process xlist |
---|
162 | xptr_t root_xp = XPTR( ref_cxy , &ref_ptr->barrier_root ); |
---|
163 | xptr_t lock_xp = XPTR( ref_cxy , &ref_ptr->sync_lock ); |
---|
164 | xptr_t entry_xp = XPTR( ref_cxy , &gen_barrier_ptr->list ); |
---|
165 | |
---|
166 | // register barrier in reference process xlist of barriers |
---|
167 | remote_busylock_acquire( lock_xp ); |
---|
168 | xlist_add_first( root_xp , entry_xp ); |
---|
169 | remote_busylock_release( lock_xp ); |
---|
170 | |
---|
171 | return 0; |
---|
172 | |
---|
173 | } // en generic_barrier_create() |
---|
174 | |
---|
175 | ///////////////////////////////////////////////////// |
---|
176 | void generic_barrier_destroy( xptr_t gen_barrier_xp ) |
---|
177 | { |
---|
178 | kmem_req_t req; // kmem request |
---|
179 | |
---|
180 | // get pointer on local process_descriptor |
---|
181 | process_t * process = CURRENT_THREAD->process; |
---|
182 | |
---|
183 | // get pointers on reference process |
---|
184 | xptr_t ref_xp = process->ref_xp; |
---|
185 | cxy_t ref_cxy = GET_CXY( ref_xp ); |
---|
186 | process_t * ref_ptr = GET_PTR( ref_xp ); |
---|
187 | |
---|
188 | // get cluster and local pointer on generic barrier descriptor |
---|
189 | generic_barrier_t * gen_barrier_ptr = GET_PTR( gen_barrier_xp ); |
---|
190 | cxy_t gen_barrier_cxy = GET_CXY( gen_barrier_xp ); |
---|
191 | |
---|
192 | // get barrier type and extension pointer |
---|
193 | bool_t is_dqt = hal_remote_l32( XPTR( gen_barrier_cxy , &gen_barrier_ptr->is_dqt ) ); |
---|
194 | void * extend = hal_remote_lpt( XPTR( gen_barrier_cxy , &gen_barrier_ptr->extend ) ); |
---|
195 | |
---|
196 | // build extended pointer on implementation dependant barrier descriptor |
---|
197 | xptr_t barrier_xp = XPTR( gen_barrier_cxy , extend ); |
---|
198 | |
---|
199 | // delete the implementation specific barrier |
---|
200 | if( is_dqt ) dqt_barrier_destroy( barrier_xp ); |
---|
201 | else simple_barrier_destroy( barrier_xp ); |
---|
202 | |
---|
203 | // build extended pointers on lock and entry for reference process xlist |
---|
204 | xptr_t lock_xp = XPTR( ref_cxy , &ref_ptr->sync_lock ); |
---|
205 | xptr_t entry_xp = XPTR( gen_barrier_cxy , &gen_barrier_ptr->list ); |
---|
206 | |
---|
207 | // remove barrier from reference process xlist |
---|
208 | remote_busylock_acquire( lock_xp ); |
---|
209 | xlist_unlink( entry_xp ); |
---|
210 | remote_busylock_release( lock_xp ); |
---|
211 | |
---|
212 | // release memory allocated to barrier descriptor |
---|
213 | if( gen_barrier_cxy == local_cxy ) |
---|
214 | { |
---|
215 | req.type = KMEM_GEN_BARRIER; |
---|
216 | req.ptr = gen_barrier_ptr; |
---|
217 | kmem_free( &req ); |
---|
218 | } |
---|
219 | else |
---|
220 | { |
---|
221 | rpc_kcm_free_client( gen_barrier_cxy, |
---|
222 | gen_barrier_ptr, |
---|
223 | KMEM_GEN_BARRIER ); |
---|
224 | } |
---|
225 | } // end generic_barrier_destroy() |
---|
226 | |
---|
227 | ////////////////////////////////////////////////// |
---|
228 | void generic_barrier_wait( xptr_t gen_barrier_xp ) |
---|
229 | { |
---|
230 | // get generic barrier descriptor cluster and pointer |
---|
231 | cxy_t gen_barrier_cxy = GET_CXY( gen_barrier_xp ); |
---|
232 | generic_barrier_t * gen_barrier_ptr = GET_PTR( gen_barrier_xp ); |
---|
233 | |
---|
234 | // get implementation type and extend local pointer |
---|
235 | bool_t is_dqt = hal_remote_l32( XPTR( gen_barrier_cxy , &gen_barrier_ptr->is_dqt ) ); |
---|
236 | void * extend = hal_remote_lpt( XPTR( gen_barrier_cxy , &gen_barrier_ptr->extend ) ); |
---|
237 | |
---|
238 | // build extended pointer on implementation specific barrier descriptor |
---|
239 | xptr_t barrier_xp = XPTR( gen_barrier_cxy , extend ); |
---|
240 | |
---|
241 | // call the relevant wait function |
---|
242 | if( is_dqt ) dqt_barrier_wait( barrier_xp ); |
---|
243 | else simple_barrier_wait( barrier_xp ); |
---|
244 | |
---|
245 | } // end generic_barrier_wait() |
---|
246 | |
---|
247 | |
---|
248 | |
---|
249 | |
---|
250 | |
---|
251 | ///////////////////////////////////////////////////////////// |
---|
252 | // simple barrier functions |
---|
253 | ///////////////////////////////////////////////////////////// |
---|
254 | |
---|
255 | /////////////////////////////////////////////////////////// |
---|
256 | simple_barrier_t * simple_barrier_create( uint32_t count ) |
---|
257 | { |
---|
258 | xptr_t barrier_xp; |
---|
259 | simple_barrier_t * barrier; |
---|
260 | |
---|
261 | // get pointer on local client process descriptor |
---|
262 | thread_t * this = CURRENT_THREAD; |
---|
263 | process_t * process = this->process; |
---|
264 | |
---|
265 | // get reference process cluster |
---|
266 | xptr_t ref_xp = process->ref_xp; |
---|
267 | cxy_t ref_cxy = GET_CXY( ref_xp ); |
---|
268 | |
---|
269 | // allocate memory for simple barrier descriptor |
---|
270 | if( ref_cxy == local_cxy ) // reference is local |
---|
271 | { |
---|
272 | kmem_req_t req; |
---|
273 | req.type = KMEM_SMP_BARRIER; |
---|
274 | req.flags = AF_ZERO; |
---|
275 | barrier = kmem_alloc( &req ); |
---|
276 | barrier_xp = XPTR( local_cxy , barrier ); |
---|
277 | } |
---|
278 | else // reference is remote |
---|
279 | { |
---|
280 | rpc_kcm_alloc_client( ref_cxy, |
---|
281 | KMEM_SMP_BARRIER, |
---|
282 | &barrier_xp ); |
---|
283 | barrier = GET_PTR( barrier_xp ); |
---|
284 | } |
---|
285 | |
---|
286 | if( barrier == NULL ) return NULL; |
---|
287 | |
---|
288 | // initialise simple barrier descriptor |
---|
289 | hal_remote_s32 ( XPTR( ref_cxy , &barrier->arity ) , count ); |
---|
290 | hal_remote_s32 ( XPTR( ref_cxy , &barrier->current ) , 0 ); |
---|
291 | hal_remote_s32 ( XPTR( ref_cxy , &barrier->sense ) , 0 ); |
---|
292 | |
---|
293 | xlist_root_init ( XPTR( ref_cxy , &barrier->root ) ); |
---|
294 | remote_busylock_init( XPTR( ref_cxy , &barrier->lock ) , LOCK_BARRIER_STATE ); |
---|
295 | |
---|
296 | #if DEBUG_BARRIER_CREATE |
---|
297 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
298 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
299 | printk("\n[%s] thread[%x,%x] created barrier (%x,%x) / count %d / cycle %d\n", |
---|
300 | __FUNCTION__, process->pid, this->trdid, ref_cxy, barrier, count, cycle ); |
---|
301 | #endif |
---|
302 | |
---|
303 | return barrier; |
---|
304 | |
---|
305 | } // end simple_barrier_create() |
---|
306 | |
---|
307 | //////////////////////////////////////////////// |
---|
308 | void simple_barrier_destroy( xptr_t barrier_xp ) |
---|
309 | { |
---|
310 | // get barrier cluster and local pointer |
---|
311 | cxy_t barrier_cxy = GET_CXY( barrier_xp ); |
---|
312 | simple_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); |
---|
313 | |
---|
314 | // release memory allocated for barrier descriptor |
---|
315 | if( barrier_cxy == local_cxy ) |
---|
316 | { |
---|
317 | kmem_req_t req; |
---|
318 | req.type = KMEM_SMP_BARRIER; |
---|
319 | req.ptr = barrier_ptr; |
---|
320 | kmem_free( &req ); |
---|
321 | } |
---|
322 | else |
---|
323 | { |
---|
324 | rpc_kcm_free_client( barrier_cxy, |
---|
325 | barrier_ptr, |
---|
326 | KMEM_SMP_BARRIER ); |
---|
327 | } |
---|
328 | |
---|
329 | #if DEBUG_BARRIER_DESTROY |
---|
330 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
331 | thread_t * this = CURRENT_THREAD; |
---|
332 | process_t * process = this->process; |
---|
333 | if( cycle > DEBUG_BARRIER_DESTROY ) |
---|
334 | printk("\n[%s] thread[%x,%x] deleted barrier (%x,%x) / cycle %d\n", |
---|
335 | __FUNCTION__, process->pid, this->trdid, barrier_ptr, barrier_cxy, cycle ); |
---|
336 | #endif |
---|
337 | |
---|
338 | } // end simple_barrier_destroy() |
---|
339 | |
---|
340 | ///////////////////////////////////////////// |
---|
341 | void simple_barrier_wait( xptr_t barrier_xp ) |
---|
342 | { |
---|
343 | uint32_t expected; |
---|
344 | uint32_t sense; |
---|
345 | uint32_t current; |
---|
346 | uint32_t arity; |
---|
347 | xptr_t root_xp; |
---|
348 | xptr_t lock_xp; |
---|
349 | xptr_t current_xp; |
---|
350 | xptr_t sense_xp; |
---|
351 | xptr_t arity_xp; |
---|
352 | |
---|
353 | // get pointer on calling thread |
---|
354 | thread_t * this = CURRENT_THREAD; |
---|
355 | |
---|
356 | // check calling thread can yield |
---|
357 | thread_assert_can_yield( this , __FUNCTION__ ); |
---|
358 | |
---|
359 | // get cluster and local pointer on remote barrier |
---|
360 | simple_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); |
---|
361 | cxy_t barrier_cxy = GET_CXY( barrier_xp ); |
---|
362 | |
---|
363 | #if DEBUG_BARRIER_WAIT |
---|
364 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
365 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
366 | printk("\n[%s] thread[%x,%x] enter / barrier (%x,%x) / cycle %d\n", |
---|
367 | __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); |
---|
368 | #endif |
---|
369 | |
---|
370 | // build extended pointers on various barrier descriptor fields |
---|
371 | lock_xp = XPTR( barrier_cxy , &barrier_ptr->lock ); |
---|
372 | root_xp = XPTR( barrier_cxy , &barrier_ptr->root ); |
---|
373 | current_xp = XPTR( barrier_cxy , &barrier_ptr->current ); |
---|
374 | sense_xp = XPTR( barrier_cxy , &barrier_ptr->sense ); |
---|
375 | arity_xp = XPTR( barrier_cxy , &barrier_ptr->arity ); |
---|
376 | |
---|
377 | // take busylock protecting the barrier state |
---|
378 | remote_busylock_acquire( lock_xp ); |
---|
379 | |
---|
380 | // get sense and threads values from barrier descriptor |
---|
381 | sense = hal_remote_l32( sense_xp ); |
---|
382 | arity = hal_remote_l32( arity_xp ); |
---|
383 | |
---|
384 | // compute expected value |
---|
385 | if ( sense == 0 ) expected = 1; |
---|
386 | else expected = 0; |
---|
387 | |
---|
388 | // increment current number of arrived threads / get value before increment |
---|
389 | current = hal_remote_atomic_add( current_xp , 1 ); |
---|
390 | |
---|
391 | // last thread reset current, toggle sense, and activate all waiting threads |
---|
392 | // other threads block, register in queue, and deschedule |
---|
393 | |
---|
394 | if( current == (arity - 1) ) // last thread |
---|
395 | { |
---|
396 | hal_remote_s32( current_xp , 0 ); |
---|
397 | hal_remote_s32( sense_xp , expected ); |
---|
398 | |
---|
399 | // unblock all waiting threads |
---|
400 | while( xlist_is_empty( root_xp ) == false ) |
---|
401 | { |
---|
402 | // get pointers on first waiting thread |
---|
403 | xptr_t thread_xp = XLIST_FIRST( root_xp , thread_t , wait_list ); |
---|
404 | cxy_t thread_cxy = GET_CXY( thread_xp ); |
---|
405 | thread_t * thread_ptr = GET_PTR( thread_xp ); |
---|
406 | |
---|
407 | #if (DEBUG_BARRIER_WAIT & 1) |
---|
408 | trdid_t trdid = hal_remote_l32( XPTR( thread_cxy , &thread_ptr->trdid ) ); |
---|
409 | process_t * process = hal_remote_lpt( XPTR( thread_cxy , &thread_ptr->process ) ); |
---|
410 | pid_t pid = hal_remote_l32( XPTR( thread_cxy , &process->pid ) ); |
---|
411 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
412 | printk("\n[%s] thread[%x,%x] unblocks thread[%x,%x]\n", |
---|
413 | __FUNCTION__, this->process->pid, this->trdid, pid, trdid ); |
---|
414 | #endif |
---|
415 | |
---|
416 | // remove waiting thread from queue |
---|
417 | xlist_unlink( XPTR( thread_cxy , &thread_ptr->wait_list ) ); |
---|
418 | |
---|
419 | // unblock waiting thread |
---|
420 | thread_unblock( thread_xp , THREAD_BLOCKED_USERSYNC ); |
---|
421 | } |
---|
422 | |
---|
423 | // release busylock protecting the barrier |
---|
424 | remote_busylock_release( lock_xp ); |
---|
425 | } |
---|
426 | else // not the last thread |
---|
427 | { |
---|
428 | |
---|
429 | #if (DEBUG_BARRIER_WAIT & 1) |
---|
430 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
431 | printk("\n[%s] thread[%x,%x] blocks\n", |
---|
432 | __FUNCTION__, this->process->pid, this->trdid ); |
---|
433 | #endif |
---|
434 | |
---|
435 | // register calling thread in barrier waiting queue |
---|
436 | xlist_add_last( root_xp , XPTR( local_cxy , &this->wait_list ) ); |
---|
437 | |
---|
438 | // block calling thread |
---|
439 | thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_USERSYNC ); |
---|
440 | |
---|
441 | // release busylock protecting the remote_barrier |
---|
442 | remote_busylock_release( lock_xp ); |
---|
443 | |
---|
444 | // deschedule |
---|
445 | sched_yield("blocked on barrier"); |
---|
446 | } |
---|
447 | |
---|
448 | #if DEBUG_BARRIER_WAIT |
---|
449 | cycle = (uint32_t)hal_get_cycles(); |
---|
450 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
451 | printk("\n[%s] thread[%x,%x] exit / barrier (%x,%x) / cycle %d\n", |
---|
452 | __FUNCTION__, this->trdid, this->process->pid, barrier_cxy, barrier_ptr, cycle ); |
---|
453 | #endif |
---|
454 | |
---|
455 | } // end simple_barrier_wait() |
---|
456 | |
---|
457 | |
---|
458 | ///////////////////////////////////////////////////////////// |
---|
459 | // DQT barrier functions |
---|
460 | ///////////////////////////////////////////////////////////// |
---|
461 | |
---|
462 | static void dqt_barrier_increment( xptr_t node_xp ); |
---|
463 | |
---|
464 | #if DEBUG_BARRIER_CREATE |
---|
465 | static void dqt_barrier_display( xptr_t barrier_xp ); |
---|
466 | #endif |
---|
467 | |
---|
468 | /////////////////////////////////////////////////////// |
---|
469 | dqt_barrier_t * dqt_barrier_create( uint32_t x_size, |
---|
470 | uint32_t y_size, |
---|
471 | uint32_t nthreads ) |
---|
472 | { |
---|
473 | page_t * dqt_page; |
---|
474 | xptr_t dqt_page_xp; |
---|
475 | page_t * rpc_page; |
---|
476 | xptr_t rpc_page_xp; |
---|
477 | dqt_barrier_t * barrier; // local pointer on DQT barrier descriptor |
---|
478 | xptr_t barrier_xp; // extended pointer on DQT barrier descriptor |
---|
479 | uint32_t z; // actual DQT size == max(x_size,y_size) |
---|
480 | uint32_t levels; // actual number of DQT levels |
---|
481 | kmem_req_t req; // kmem request |
---|
482 | xptr_t rpc_xp; // extended pointer on RPC descriptors array |
---|
483 | rpc_desc_t * rpc; // pointer on RPC descriptors array |
---|
484 | uint32_t responses; // responses counter for parallel RPCs |
---|
485 | reg_t save_sr; // for critical section |
---|
486 | uint32_t x; // X coordinate in QDT mesh |
---|
487 | uint32_t y; // Y coordinate in QDT mesh |
---|
488 | uint32_t l; // level coordinate |
---|
489 | |
---|
490 | // compute size and number of DQT levels |
---|
491 | z = (x_size > y_size) ? x_size : y_size; |
---|
492 | levels = (z < 2) ? 1 : (z < 3) ? 2 : (z < 5) ? 3 : (z < 9) ? 4 : 5; |
---|
493 | |
---|
494 | // check x_size and y_size arguments |
---|
495 | assert( (z <= 16) , "DQT dqth larger than (16*16)\n"); |
---|
496 | |
---|
497 | // check RPC descriptor size |
---|
498 | assert( (sizeof(rpc_desc_t) <= 128), "RPC descriptor larger than 128 bytes\n"); |
---|
499 | |
---|
500 | // check size of an array of 5 DQT nodes |
---|
501 | assert( (sizeof(dqt_node_t) * 5 <= 512 ), "array of DQT nodes larger than 512 bytes\n"); |
---|
502 | |
---|
503 | // check size of DQT barrier descriptor |
---|
504 | assert( (sizeof(dqt_barrier_t) <= 0x4000 ), "DQT barrier descriptor larger than 4 pages\n"); |
---|
505 | |
---|
506 | // get pointer on local client process descriptor |
---|
507 | thread_t * this = CURRENT_THREAD; |
---|
508 | process_t * process = this->process; |
---|
509 | |
---|
510 | #if DEBUG_BARRIER_CREATE |
---|
511 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
512 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
513 | printk("\n[%s] thread[%x,%x] enter : x_size %d / y_size %d / levels %d / cycle %d\n", |
---|
514 | __FUNCTION__, process->pid, this->trdid, x_size, y_size, levels, cycle ); |
---|
515 | #endif |
---|
516 | |
---|
517 | // get reference process cluster |
---|
518 | xptr_t ref_xp = process->ref_xp; |
---|
519 | cxy_t ref_cxy = GET_CXY( ref_xp ); |
---|
520 | |
---|
521 | // 1. allocate memory for DQT barrier descriptor in reference cluster |
---|
522 | if( ref_cxy == local_cxy ) |
---|
523 | { |
---|
524 | req.type = KMEM_PAGE; |
---|
525 | req.size = 2; // 4 pages == 16 Kbytes |
---|
526 | req.flags = AF_ZERO; |
---|
527 | dqt_page = kmem_alloc( &req ); |
---|
528 | dqt_page_xp = XPTR( local_cxy , dqt_page ); |
---|
529 | } |
---|
530 | else |
---|
531 | { |
---|
532 | rpc_pmem_get_pages_client( ref_cxy, |
---|
533 | 2, |
---|
534 | &dqt_page ); |
---|
535 | dqt_page_xp = XPTR( ref_cxy , dqt_page ); |
---|
536 | } |
---|
537 | |
---|
538 | if( dqt_page == NULL ) return NULL; |
---|
539 | |
---|
540 | // get pointers on DQT barrier descriptor |
---|
541 | barrier_xp = ppm_page2base( dqt_page_xp ); |
---|
542 | barrier = GET_PTR( barrier_xp ); |
---|
543 | |
---|
544 | // initialize global parameters in DQT barrier descriptor |
---|
545 | hal_remote_s32( XPTR( ref_cxy , &barrier->x_size ) , x_size ); |
---|
546 | hal_remote_s32( XPTR( ref_cxy , &barrier->y_size ) , x_size ); |
---|
547 | hal_remote_s32( XPTR( ref_cxy , &barrier->nthreads ) , nthreads ); |
---|
548 | |
---|
549 | #if DEBUG_BARRIER_CREATE |
---|
550 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
551 | printk("\n[%s] thread[%x,%x] created DQT barrier descriptor at (%x,%x)\n", |
---|
552 | __FUNCTION__, process->pid, this->trdid, ref_cxy, barrier ); |
---|
553 | #endif |
---|
554 | |
---|
555 | // 2. allocate memory from local cluster for an array of 256 RPCs descriptors |
---|
556 | // cannot share the RPC descriptor, because the returned argument is not shared |
---|
557 | req.type = KMEM_PAGE; |
---|
558 | req.size = 3; // 8 pages == 32 Kbytes |
---|
559 | req.flags = AF_ZERO; |
---|
560 | rpc_page = kmem_alloc( &req ); |
---|
561 | rpc_page_xp = XPTR( local_cxy , rpc_page ); |
---|
562 | |
---|
563 | // get pointers on RPC descriptors array |
---|
564 | rpc_xp = ppm_page2base( rpc_page_xp ); |
---|
565 | rpc = GET_PTR( rpc_xp ); |
---|
566 | |
---|
567 | #if DEBUG_BARRIER_CREATE |
---|
568 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
569 | printk("\n[%s] thread[%x,%x] created RPC descriptors array at (%x,%s)\n", |
---|
570 | __FUNCTION__, process->pid, this->trdid, local_cxy, rpc ); |
---|
571 | #endif |
---|
572 | |
---|
573 | // 3. send parallel RPCs to all existing clusters covered by the DQT |
---|
574 | // to allocate memory for an array of 5 DQT nodes in each cluster |
---|
575 | // (5 nodes per cluster <= 512 bytes per cluster) |
---|
576 | |
---|
577 | responses = 0; // initialize RPC responses counter |
---|
578 | |
---|
579 | // mask IRQs |
---|
580 | hal_disable_irq( &save_sr); |
---|
581 | |
---|
582 | // client thread blocks itself |
---|
583 | thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_RPC ); |
---|
584 | |
---|
585 | for ( x = 0 ; x < x_size ; x++ ) |
---|
586 | { |
---|
587 | for ( y = 0 ; y < y_size ; y++ ) |
---|
588 | { |
---|
589 | // send RPC to existing clusters only |
---|
590 | if( LOCAL_CLUSTER->cluster_info[x][y] ) |
---|
591 | { |
---|
592 | cxy_t cxy = HAL_CXY_FROM_XY( x , y ); // target cluster identifier |
---|
593 | |
---|
594 | // build a specific RPC descriptor for each target cluster |
---|
595 | rpc[cxy].rsp = &responses; |
---|
596 | rpc[cxy].blocking = false; |
---|
597 | rpc[cxy].index = RPC_KCM_ALLOC; |
---|
598 | rpc[cxy].thread = this; |
---|
599 | rpc[cxy].lid = this->core->lid; |
---|
600 | rpc[cxy].args[0] = (uint64_t)KMEM_512_BYTES; |
---|
601 | |
---|
602 | // atomically increment expected responses counter |
---|
603 | hal_atomic_add( &responses , 1 ); |
---|
604 | |
---|
605 | // send a non-blocking RPC to allocate 512 bytes in target cluster |
---|
606 | rpc_send( cxy , &rpc[cxy] ); |
---|
607 | } |
---|
608 | } |
---|
609 | } |
---|
610 | |
---|
611 | #if DEBUG_BARRIER_CREATE |
---|
612 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
613 | printk("\n[%s] thread[%x,%x] sent all RPC requests to allocate dqt_nodes array\n", |
---|
614 | __FUNCTION__, process->pid, this->trdid ); |
---|
615 | #endif |
---|
616 | |
---|
617 | // client thread deschedule |
---|
618 | sched_yield("blocked on parallel rpc_kcm_alloc"); |
---|
619 | |
---|
620 | // restore IRQs |
---|
621 | hal_restore_irq( save_sr); |
---|
622 | |
---|
623 | // 4. initialize the node_xp[x][y][l] array in DQT barrier descriptor |
---|
624 | // the node_xp[x][y][0] value is available in rpc.args[1] |
---|
625 | |
---|
626 | #if DEBUG_BARRIER_CREATE |
---|
627 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
628 | printk("\n[%s] thread[%x,%x] initialises array of pointers on dqt_nodes\n", |
---|
629 | __FUNCTION__, process->pid, this->trdid ); |
---|
630 | #endif |
---|
631 | |
---|
632 | for ( x = 0 ; x < x_size ; x++ ) |
---|
633 | { |
---|
634 | for ( y = 0 ; y < y_size ; y++ ) |
---|
635 | { |
---|
636 | cxy_t cxy = HAL_CXY_FROM_XY( x , y ); // target cluster identifier |
---|
637 | xptr_t array_xp = (xptr_t)rpc[cxy].args[1]; // x_pointer on node array |
---|
638 | uint32_t offset = sizeof( dqt_node_t ); // size of a DQT node |
---|
639 | |
---|
640 | // set values into the node_xp[x][y][l] array |
---|
641 | for ( l = 0 ; l < levels ; l++ ) |
---|
642 | { |
---|
643 | xptr_t node_xp = array_xp + (offset * l); |
---|
644 | hal_remote_s64( XPTR( ref_cxy , &barrier->node_xp[x][y][l] ), node_xp ); |
---|
645 | |
---|
646 | #if DEBUG_BARRIER_CREATE |
---|
647 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
648 | printk(" - dqt_node_xp[%d,%d,%d] = (%x,%x) / &dqt_node_xp = %x\n", |
---|
649 | x , y , l , GET_CXY( node_xp ), GET_PTR( node_xp ), &barrier->node_xp[x][y][l] ); |
---|
650 | #endif |
---|
651 | } |
---|
652 | } |
---|
653 | } |
---|
654 | |
---|
655 | // 5. release memory locally allocated for the RPCs array |
---|
656 | req.type = KMEM_PAGE; |
---|
657 | req.ptr = rpc_page; |
---|
658 | kmem_free( &req ); |
---|
659 | |
---|
660 | #if DEBUG_BARRIER_CREATE |
---|
661 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
662 | printk("\n[%s] thread[%x,%x] released memory for RPC descriptors array\n", |
---|
663 | __FUNCTION__, process->pid, this->trdid ); |
---|
664 | #endif |
---|
665 | |
---|
666 | // 6. initialise all distributed DQT nodes using remote accesses |
---|
667 | // and the pointers stored in the node_xp[x][y][l] array |
---|
668 | for ( x = 0 ; x < x_size ; x++ ) |
---|
669 | { |
---|
670 | for ( y = 0 ; y < y_size ; y++ ) |
---|
671 | { |
---|
672 | // initialize existing clusters only |
---|
673 | if( LOCAL_CLUSTER->cluster_info[x][y] ) |
---|
674 | { |
---|
675 | for ( l = 0 ; l < levels ; l++ ) |
---|
676 | { |
---|
677 | xptr_t parent_xp; |
---|
678 | xptr_t child_xp[4]; |
---|
679 | uint32_t arity = 0; |
---|
680 | |
---|
681 | // get DQT node pointers |
---|
682 | xptr_t node_xp = hal_remote_l64( XPTR( ref_cxy, |
---|
683 | &barrier->node_xp[x][y][l] ) ); |
---|
684 | cxy_t node_cxy = GET_CXY( node_xp ); |
---|
685 | dqt_node_t * node_ptr = GET_PTR( node_xp ); |
---|
686 | |
---|
687 | // compute arity and child_xp[i] |
---|
688 | if (l == 0 ) // bottom DQT node |
---|
689 | { |
---|
690 | arity = nthreads; |
---|
691 | |
---|
692 | child_xp[0] = XPTR_NULL; |
---|
693 | child_xp[1] = XPTR_NULL; |
---|
694 | child_xp[2] = XPTR_NULL; |
---|
695 | child_xp[3] = XPTR_NULL; |
---|
696 | } |
---|
697 | else // not a bottom DQT node |
---|
698 | { |
---|
699 | arity = 0; |
---|
700 | |
---|
701 | // only few non-bottom nodes must be initialised |
---|
702 | if( ((x & ((1<<l)-1)) == 0) && ((y & ((1<<l)-1)) == 0) ) |
---|
703 | { |
---|
704 | uint32_t cx[4]; // x coordinate for children |
---|
705 | uint32_t cy[4]; // y coordinate for children |
---|
706 | uint32_t i; |
---|
707 | |
---|
708 | // the child0 coordinates are equal to the parent coordinates |
---|
709 | // other children coordinates depend on the level value |
---|
710 | cx[0] = x; |
---|
711 | cy[0] = y; |
---|
712 | |
---|
713 | cx[1] = x; |
---|
714 | cy[1] = y + (1 << (l-1)); |
---|
715 | |
---|
716 | cx[2] = x + (1 << (l-1)); |
---|
717 | cy[2] = y; |
---|
718 | |
---|
719 | cx[3] = x + (1 << (l-1)); |
---|
720 | cy[3] = y + (1 << (l-1)); |
---|
721 | |
---|
722 | for ( i = 0 ; i < 4 ; i++ ) |
---|
723 | { |
---|
724 | // child pointer is NULL if outside the mesh |
---|
725 | if ( (cx[i] < x_size) && (cy[i] < y_size) ) |
---|
726 | { |
---|
727 | // get child_xp[i] |
---|
728 | child_xp[i] = hal_remote_l64( XPTR( ref_cxy, |
---|
729 | &barrier->node_xp[cx[i]][cy[i]][l-1] ) ); |
---|
730 | |
---|
731 | // increment arity |
---|
732 | arity++; |
---|
733 | } |
---|
734 | else |
---|
735 | { |
---|
736 | child_xp[i] = XPTR_NULL; |
---|
737 | } |
---|
738 | } |
---|
739 | } |
---|
740 | } |
---|
741 | |
---|
742 | // compute parent_xp |
---|
743 | if( l == (levels - 1) ) // root DQT node |
---|
744 | { |
---|
745 | parent_xp = XPTR_NULL; |
---|
746 | } |
---|
747 | else // not the root |
---|
748 | { |
---|
749 | uint32_t px = 0; // parent X coordinate |
---|
750 | uint32_t py = 0; // parent Y coordinate |
---|
751 | bool_t found = false; |
---|
752 | |
---|
753 | // compute macro_cluster x_min, x_max, y_min, y_max |
---|
754 | uint32_t x_min = x & ~((1<<(l+1))-1); |
---|
755 | uint32_t x_max = x_min + (1<<(l+1)); |
---|
756 | uint32_t y_min = y & ~((1<<(l+1))-1); |
---|
757 | uint32_t y_max = y_min + (1<<(l+1)); |
---|
758 | |
---|
759 | // scan all clusters in macro-cluster[x][y][l] / take first active |
---|
760 | for( px = x_min ; px < x_max ; px++ ) |
---|
761 | { |
---|
762 | for( py = y_min ; py < y_max ; py++ ) |
---|
763 | { |
---|
764 | if( LOCAL_CLUSTER->cluster_info[px][py] ) found = true; |
---|
765 | if( found ) break; |
---|
766 | } |
---|
767 | if( found ) break; |
---|
768 | } |
---|
769 | |
---|
770 | parent_xp = hal_remote_l64( XPTR( ref_cxy , |
---|
771 | &barrier->node_xp[px][py][l+1] ) ); |
---|
772 | } |
---|
773 | |
---|
774 | // initializes the DQT node |
---|
775 | hal_remote_s32( XPTR( node_cxy , &node_ptr->arity ) , arity ); |
---|
776 | hal_remote_s32( XPTR( node_cxy , &node_ptr->current ) , 0 ); |
---|
777 | hal_remote_s32( XPTR( node_cxy , &node_ptr->sense ) , 0 ); |
---|
778 | hal_remote_s32( XPTR( node_cxy , &node_ptr->level ) , l ); |
---|
779 | hal_remote_s64( XPTR( node_cxy , &node_ptr->parent_xp ) , parent_xp ); |
---|
780 | hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[0] ) , child_xp[0] ); |
---|
781 | hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[1] ) , child_xp[1] ); |
---|
782 | hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[2] ) , child_xp[2] ); |
---|
783 | hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[3] ) , child_xp[3] ); |
---|
784 | |
---|
785 | xlist_root_init( XPTR( node_cxy , &node_ptr->root ) ); |
---|
786 | |
---|
787 | remote_busylock_init( XPTR( node_cxy , &node_ptr->lock ), |
---|
788 | LOCK_BARRIER_STATE ); |
---|
789 | } |
---|
790 | } |
---|
791 | } |
---|
792 | } |
---|
793 | |
---|
794 | #if DEBUG_BARRIER_CREATE |
---|
795 | cycle = (uint32_t)hal_get_cycles(); |
---|
796 | if( cycle > DEBUG_BARRIER_CREATE ) |
---|
797 | printk("\n[%s] thread[%x,%x] completed DQT barrier initialisation / cycle %d\n", |
---|
798 | __FUNCTION__, process->pid, this->trdid, cycle ); |
---|
799 | dqt_barrier_display( barrier_xp ); |
---|
800 | #endif |
---|
801 | |
---|
802 | return barrier; |
---|
803 | |
---|
804 | } // end dqt_barrier_create() |
---|
805 | |
---|
806 | /////////////////////////////////////////////// |
---|
807 | void dqt_barrier_destroy( xptr_t barrier_xp ) |
---|
808 | { |
---|
809 | page_t * rpc_page; |
---|
810 | xptr_t rpc_page_xp; |
---|
811 | rpc_desc_t * rpc; // local pointer on RPC descriptors array |
---|
812 | xptr_t rpc_xp; // extended pointer on RPC descriptor array |
---|
813 | reg_t save_sr; // for critical section |
---|
814 | kmem_req_t req; // kmem request |
---|
815 | |
---|
816 | thread_t * this = CURRENT_THREAD; |
---|
817 | |
---|
818 | // get DQT barrier descriptor cluster and local pointer |
---|
819 | dqt_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); |
---|
820 | cxy_t barrier_cxy = GET_CXY( barrier_xp ); |
---|
821 | |
---|
822 | #if DEBUG_BARRIER_DESTROY |
---|
823 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
824 | if( cycle > DEBUG_BARRIER_DESTROY ) |
---|
825 | printk("\n[%s] thread[%x,%x] enter for barrier (%x,%x) / cycle %d\n", |
---|
826 | __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); |
---|
827 | #endif |
---|
828 | |
---|
829 | // get x_size and y_size global parameters |
---|
830 | uint32_t x_size = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->x_size ) ); |
---|
831 | uint32_t y_size = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->y_size ) ); |
---|
832 | |
---|
833 | // 1. allocate memory from local cluster for an array of 256 RPCs descriptors |
---|
834 | // cannot share the RPC descriptor, because the "buf" argument is not shared |
---|
835 | req.type = KMEM_PAGE; |
---|
836 | req.size = 3; // 8 pages == 32 Kbytes |
---|
837 | req.flags = AF_ZERO; |
---|
838 | rpc_page = kmem_alloc( &req ); |
---|
839 | rpc_page_xp = XPTR( local_cxy , rpc_page ); |
---|
840 | |
---|
841 | // get pointers on RPC descriptors array |
---|
842 | rpc_xp = ppm_page2base( rpc_page_xp ); |
---|
843 | rpc = GET_PTR( rpc_xp ); |
---|
844 | |
---|
845 | // 2. send parallel RPCs to all existing clusters covered by the DQT |
---|
846 | // to release memory allocated for the arrays of DQT nodes in each cluster |
---|
847 | |
---|
848 | uint32_t responses = 0; // initialize RPC responses counter |
---|
849 | |
---|
850 | // mask IRQs |
---|
851 | hal_disable_irq( &save_sr); |
---|
852 | |
---|
853 | // client thread blocks itself |
---|
854 | thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_RPC ); |
---|
855 | |
---|
856 | uint32_t x , y; |
---|
857 | |
---|
858 | #if DEBUG_BARRIER_DESTROY |
---|
859 | if( cycle > DEBUG_BARRIER_DESTROY ) |
---|
860 | printk("\n[%s] thread[%x,%x] send RPCs to release the distributed dqt_node array\n", |
---|
861 | __FUNCTION__, this->process->pid, this->trdid ); |
---|
862 | #endif |
---|
863 | |
---|
864 | for ( x = 0 ; x < x_size ; x++ ) |
---|
865 | { |
---|
866 | for ( y = 0 ; y < y_size ; y++ ) |
---|
867 | { |
---|
868 | // send RPC to existing cluster only |
---|
869 | if( LOCAL_CLUSTER->cluster_info[x][y] ) |
---|
870 | { |
---|
871 | // compute target cluster identifier |
---|
872 | cxy_t cxy = HAL_CXY_FROM_XY( x , y ); |
---|
873 | |
---|
874 | // get local pointer on dqt_nodes array in target cluster |
---|
875 | xptr_t buf_xp_xp = XPTR( barrier_cxy , &barrier_ptr->node_xp[x][y][0] ); |
---|
876 | xptr_t buf_xp = hal_remote_l64( buf_xp_xp ); |
---|
877 | void * buf = GET_PTR( buf_xp ); |
---|
878 | |
---|
879 | assert( (cxy == GET_CXY(buf_xp)) , "bad extended pointer on dqt_nodes array\n" ); |
---|
880 | |
---|
881 | // build a specific RPC descriptor |
---|
882 | rpc[cxy].rsp = &responses; |
---|
883 | rpc[cxy].blocking = false; |
---|
884 | rpc[cxy].index = RPC_KCM_FREE; |
---|
885 | rpc[cxy].thread = this; |
---|
886 | rpc[cxy].lid = this->core->lid; |
---|
887 | rpc[cxy].args[0] = (uint64_t)(intptr_t)buf; |
---|
888 | rpc[cxy].args[1] = (uint64_t)KMEM_512_BYTES; |
---|
889 | |
---|
890 | // atomically increment expected responses counter |
---|
891 | hal_atomic_add( &responses , 1 ); |
---|
892 | |
---|
893 | #if DEBUG_BARRIER_DESTROY |
---|
894 | if( cycle > DEBUG_BARRIER_DESTROY ) |
---|
895 | printk(" - target cluster(%d,%d) / buffer %x\n", x, y, buf ); |
---|
896 | #endif |
---|
897 | // send a non-blocking RPC to release 512 bytes in target cluster |
---|
898 | rpc_send( cxy , &rpc[cxy] ); |
---|
899 | } |
---|
900 | } |
---|
901 | } |
---|
902 | |
---|
903 | // client thread deschedule |
---|
904 | sched_yield("blocked on parallel rpc_kcm_free"); |
---|
905 | |
---|
906 | // restore IRQs |
---|
907 | hal_restore_irq( save_sr); |
---|
908 | |
---|
909 | // 3. release memory locally allocated for the RPC descriptors array |
---|
910 | req.type = KMEM_PAGE; |
---|
911 | req.ptr = rpc_page; |
---|
912 | kmem_free( &req ); |
---|
913 | |
---|
914 | // 4. release memory allocated for barrier descriptor |
---|
915 | xptr_t page_xp = ppm_base2page( barrier_xp ); |
---|
916 | page_t * page = GET_PTR( page_xp ); |
---|
917 | |
---|
918 | if( barrier_cxy == local_cxy ) |
---|
919 | { |
---|
920 | req.type = KMEM_PAGE; |
---|
921 | req.ptr = page; |
---|
922 | kmem_free( &req ); |
---|
923 | } |
---|
924 | else |
---|
925 | { |
---|
926 | rpc_pmem_release_pages_client( barrier_cxy, |
---|
927 | page ); |
---|
928 | } |
---|
929 | |
---|
930 | #if DEBUG_BARRIER_DESTROY |
---|
931 | cycle = (uint32_t)hal_get_cycles(); |
---|
932 | if( cycle > DEBUG_BARRIER_DESTROY ) |
---|
933 | printk("\n[%s] thread[%x,%x] exit for barrier (%x,%x) / cycle %d\n", |
---|
934 | __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); |
---|
935 | #endif |
---|
936 | |
---|
937 | } // end dqt_barrier_destroy() |
---|
938 | |
---|
939 | //////////////////////////////////////////// |
---|
940 | void dqt_barrier_wait( xptr_t barrier_xp ) |
---|
941 | { |
---|
942 | thread_t * this = CURRENT_THREAD; |
---|
943 | |
---|
944 | // check calling thread can yield |
---|
945 | thread_assert_can_yield( this , __FUNCTION__ ); |
---|
946 | |
---|
947 | // get cluster and local pointer on DQT barrier descriptor |
---|
948 | dqt_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); |
---|
949 | cxy_t barrier_cxy = GET_CXY( barrier_xp ); |
---|
950 | |
---|
951 | #if DEBUG_BARRIER_WAIT |
---|
952 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
953 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
954 | printk("\n[%s] thread[%x,%x] enter / barrier (%x,%x) / cycle %d\n", |
---|
955 | __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); |
---|
956 | #endif |
---|
957 | |
---|
958 | // get extended pointer on local bottom DQT node |
---|
959 | uint32_t x = HAL_X_FROM_CXY( local_cxy ); |
---|
960 | uint32_t y = HAL_Y_FROM_CXY( local_cxy ); |
---|
961 | xptr_t node_xp = hal_remote_l64( XPTR( barrier_cxy , &barrier_ptr->node_xp[x][y][0] ) ); |
---|
962 | |
---|
963 | // call recursive function to traverse DQT from bottom to root |
---|
964 | dqt_barrier_increment( node_xp ); |
---|
965 | |
---|
966 | #if DEBUG_BARRIER_WAIT |
---|
967 | cycle = (uint32_t)hal_get_cycles(); |
---|
968 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
969 | printk("\n[%s] thread[%x,%x] exit / barrier (%x,%x) / cycle %d\n", |
---|
970 | __FUNCTION__, this->trdid, this->process->pid, barrier_cxy, barrier_ptr, cycle ); |
---|
971 | #endif |
---|
972 | |
---|
973 | } // end dqt_barrier_wait() |
---|
974 | |
---|
975 | |
---|
976 | //////////////////////////////////////////////////////////////////////////////////////////// |
---|
977 | // DQT static functions |
---|
978 | //////////////////////////////////////////////////////////////////////////////////////////// |
---|
979 | |
---|
980 | |
---|
981 | ////////////////////////////////////////////////////////////////////////////////////////// |
---|
982 | // This recursive function decrements the distributed "count" variables, |
---|
983 | // traversing the DQT from bottom to root. |
---|
984 | // The last arrived thread reset the local node before returning. |
---|
985 | ////////////////////////////////////////////////////////////////////////////////////////// |
---|
986 | static void dqt_barrier_increment( xptr_t node_xp ) |
---|
987 | { |
---|
988 | uint32_t expected; |
---|
989 | uint32_t sense; |
---|
990 | uint32_t arity; |
---|
991 | |
---|
992 | thread_t * this = CURRENT_THREAD; |
---|
993 | |
---|
994 | // get node cluster and local pointer |
---|
995 | dqt_node_t * node_ptr = GET_PTR( node_xp ); |
---|
996 | cxy_t node_cxy = GET_CXY( node_xp ); |
---|
997 | |
---|
998 | // build relevant extended pointers |
---|
999 | xptr_t arity_xp = XPTR( node_cxy , &node_ptr->arity ); |
---|
1000 | xptr_t sense_xp = XPTR( node_cxy , &node_ptr->sense ); |
---|
1001 | xptr_t current_xp = XPTR( node_cxy , &node_ptr->current ); |
---|
1002 | xptr_t lock_xp = XPTR( node_cxy , &node_ptr->lock ); |
---|
1003 | xptr_t root_xp = XPTR( node_cxy , &node_ptr->root ); |
---|
1004 | |
---|
1005 | #if DEBUG_BARRIER_WAIT |
---|
1006 | uint32_t cycle = (uint32_t)hal_get_cycles(); |
---|
1007 | uint32_t level = hal_remote_l32( XPTR( node_cxy, &node_ptr->level ) ); |
---|
1008 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
1009 | printk("\n[%s] thread[%x,%x] increments DQT node(%d,%d,%d) / cycle %d\n", |
---|
1010 | __FUNCTION__ , this->process->pid, this->trdid, |
---|
1011 | HAL_X_FROM_CXY(node_cxy), HAL_Y_FROM_CXY(node_cxy), level ); |
---|
1012 | #endif |
---|
1013 | |
---|
1014 | // get extended pointer on parent node |
---|
1015 | xptr_t parent_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->parent_xp ) ); |
---|
1016 | |
---|
1017 | // take busylock |
---|
1018 | remote_busylock_acquire( lock_xp ); |
---|
1019 | |
---|
1020 | // get sense and arity values from barrier descriptor |
---|
1021 | sense = hal_remote_l32( sense_xp ); |
---|
1022 | arity = hal_remote_l32( arity_xp ); |
---|
1023 | |
---|
1024 | // compute expected value |
---|
1025 | expected = (sense == 0) ? 1 : 0; |
---|
1026 | |
---|
1027 | // increment current number of arrived threads / get value before increment |
---|
1028 | uint32_t current = hal_remote_atomic_add( current_xp , 1 ); |
---|
1029 | |
---|
1030 | // last arrived thread reset the local node, makes the recursive call |
---|
1031 | // on parent node, and reactivates all waiting thread when returning. |
---|
1032 | // other threads block, register in queue, and deschedule. |
---|
1033 | |
---|
1034 | if ( current == (arity - 1) ) // last thread |
---|
1035 | { |
---|
1036 | |
---|
1037 | #if DEBUG_BARRIER_WAIT |
---|
1038 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
1039 | printk("\n[%s] thread[%x,%x] reset DQT node(%d,%d,%d)\n", |
---|
1040 | __FUNCTION__ , this->process->pid, this->trdid, |
---|
1041 | HAL_X_FROM_CXY(node_cxy), HAL_Y_FROM_CXY(node_cxy), level ); |
---|
1042 | #endif |
---|
1043 | // reset the current node |
---|
1044 | hal_remote_s32( sense_xp , expected ); |
---|
1045 | hal_remote_s32( current_xp , 0 ); |
---|
1046 | |
---|
1047 | // release busylock protecting the current node |
---|
1048 | remote_busylock_release( lock_xp ); |
---|
1049 | |
---|
1050 | // recursive call on parent node when current node is not the root |
---|
1051 | if( parent_xp != XPTR_NULL) dqt_barrier_increment( parent_xp ); |
---|
1052 | |
---|
1053 | // unblock all waiting threads on this node |
---|
1054 | while( xlist_is_empty( root_xp ) == false ) |
---|
1055 | { |
---|
1056 | // get pointers on first waiting thread |
---|
1057 | xptr_t thread_xp = XLIST_FIRST( root_xp , thread_t , wait_list ); |
---|
1058 | cxy_t thread_cxy = GET_CXY( thread_xp ); |
---|
1059 | thread_t * thread_ptr = GET_PTR( thread_xp ); |
---|
1060 | |
---|
1061 | #if (DEBUG_BARRIER_WAIT & 1) |
---|
1062 | trdid_t trdid = hal_remote_l32( XPTR( thread_cxy , &thread_ptr->trdid ) ); |
---|
1063 | process_t * process = hal_remote_lpt( XPTR( thread_cxy , &thread_ptr->process ) ); |
---|
1064 | pid_t pid = hal_remote_l32( XPTR( thread_cxy , &process->pid ) ); |
---|
1065 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
1066 | printk("\n[%s] thread[%x,%x] unblock thread[%x,%x]\n", |
---|
1067 | __FUNCTION__, this->process->pid, this->trdid, pid, trdid ); |
---|
1068 | #endif |
---|
1069 | // remove waiting thread from queue |
---|
1070 | xlist_unlink( XPTR( thread_cxy , &thread_ptr->wait_list ) ); |
---|
1071 | |
---|
1072 | // unblock waiting thread |
---|
1073 | thread_unblock( thread_xp , THREAD_BLOCKED_USERSYNC ); |
---|
1074 | } |
---|
1075 | } |
---|
1076 | else // not the last thread |
---|
1077 | { |
---|
1078 | // get extended pointer on xlist entry from thread |
---|
1079 | xptr_t entry_xp = XPTR( local_cxy , &this->wait_list ); |
---|
1080 | |
---|
1081 | // register calling thread in barrier waiting queue |
---|
1082 | xlist_add_last( root_xp , entry_xp ); |
---|
1083 | |
---|
1084 | // block calling thread |
---|
1085 | thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_USERSYNC ); |
---|
1086 | |
---|
1087 | // release busylock protecting the remote_barrier |
---|
1088 | remote_busylock_release( lock_xp ); |
---|
1089 | |
---|
1090 | #if DEBUG_BARRIER_WAIT |
---|
1091 | if( cycle > DEBUG_BARRIER_WAIT ) |
---|
1092 | printk("\n[%s] thread[%x,%x] blocks on node(%d,%d,%d)\n", |
---|
1093 | __FUNCTION__ , this->process->pid, this->trdid, |
---|
1094 | HAL_X_FROM_CXY(node_cxy), HAL_Y_FROM_CXY(node_cxy), level ); |
---|
1095 | #endif |
---|
1096 | // deschedule |
---|
1097 | sched_yield("blocked on barrier"); |
---|
1098 | } |
---|
1099 | |
---|
1100 | return; |
---|
1101 | |
---|
1102 | } // end dqt_barrier_decrement() |
---|
1103 | |
---|
#if DEBUG_BARRIER_CREATE

////////////////////////////////////////////////////////////////////////////////////////////
// This debug function displays all DQT nodes in all clusters.
////////////////////////////////////////////////////////////////////////////////////////////
// @ barrier_xp   : extended pointer on DQT barrier descriptor.
////////////////////////////////////////////////////////////////////////////////////////////
static void dqt_barrier_display( xptr_t  barrier_xp )
{
    // get cluster and local pointer on DQT barrier
    dqt_barrier_t * barrier_ptr = GET_PTR( barrier_xp );
    cxy_t           barrier_cxy = GET_CXY( barrier_xp );

    // get barrier global parameters
    uint32_t x_size   = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->x_size ) );
    uint32_t y_size   = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->y_size ) );
    uint32_t nthreads = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->nthreads ) );

    // compute size and number of DQT levels
    uint32_t z      = (x_size > y_size) ? x_size : y_size;
    uint32_t levels = (z < 2) ? 1 : (z < 3) ? 2 : (z < 5) ? 3 : (z < 9) ? 4 : 5;

    printk("\n***** DQT barrier : x_size %d / y_size %d / nthreads %d / levels %d *****\n",
    x_size, y_size, nthreads, levels );

    uint32_t x , y , l;

    for ( x = 0 ; x < x_size ; x++ )
    {
        for ( y = 0 ; y < y_size ; y++ )
        {
            printk(" - cluster[%d,%d]\n", x , y );

            for ( l = 0 ; l < levels ; l++ )
            {
                // get pointers on target node
                xptr_t       node_xp  = hal_remote_l64( XPTR( barrier_cxy ,
                                        &barrier_ptr->node_xp[x][y][l] ) );
                dqt_node_t * node_ptr = GET_PTR( node_xp );
                cxy_t        node_cxy = GET_CXY( node_xp );

                if( node_xp != XPTR_NULL )
                {
                     uint32_t level = hal_remote_l32( XPTR( node_cxy , &node_ptr->level       ));
                     uint32_t arity = hal_remote_l32( XPTR( node_cxy , &node_ptr->arity       ));
                     // parent_xp and child_xp[] are 64-bit extended pointers:
                     // they must be read with hal_remote_l64 (l32 would truncate them)
                     xptr_t   pa_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->parent_xp   ));
                     xptr_t   c0_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->child_xp[0] ));
                     xptr_t   c1_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->child_xp[1] ));
                     xptr_t   c2_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->child_xp[2] ));
                     xptr_t   c3_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->child_xp[3] ));

                     printk("   . level %d : (%x,%x) / arity %d / P(%x,%x) / C0(%x,%x)"
                            " C1(%x,%x) / C2(%x,%x) / C3(%x,%x)\n",
                     level, node_cxy, node_ptr, arity,
                     GET_CXY(pa_xp), GET_PTR(pa_xp),
                     GET_CXY(c0_xp), GET_PTR(c0_xp),
                     GET_CXY(c1_xp), GET_PTR(c1_xp),
                     GET_CXY(c2_xp), GET_PTR(c2_xp),
                     GET_CXY(c3_xp), GET_PTR(c3_xp) );
                }
            }
        }
    }
}   // end dqt_barrier_display()

#endif
---|