/*
 * thread.c -  implementation of thread operations (user & kernel)
 *
 * Author  Ghassan Almaless (2008,2009,2010,2011,2012)
 *         Alain Greiner (2016,2017)
 *
 * Copyright (c) UPMC Sorbonne Universites
 *
 * This file is part of ALMOS-MKH.
 *
 * ALMOS-MKH is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2.0 of the License.
 *
 * ALMOS-MKH is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ALMOS-MKH; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <kernel_config.h>
#include <hal_types.h>
#include <hal_context.h>
#include <hal_irqmask.h>
#include <hal_special.h>
#include <hal_remote.h>
#include <memcpy.h>
#include <printk.h>
#include <cluster.h>
#include <process.h>
#include <scheduler.h>
#include <dev_pic.h>
#include <core.h>
#include <list.h>
#include <xlist.h>
#include <page.h>
#include <kmem.h>
#include <ppm.h>
#include <thread.h>

//////////////////////////////////////////////////////////////////////////////////////
// Extern global variables
//////////////////////////////////////////////////////////////////////////////////////

extern process_t      process_zero;

//////////////////////////////////////////////////////////////////////////////////////
// This function maps a thread type onto a short printable string.
// Any value that is not one of the four types tested below yields "undefined".
//////////////////////////////////////////////////////////////////////////////////////
char * thread_type_str( uint32_t type )
{
    switch( type )
    {
        case THREAD_USER : return "USR";
        case THREAD_RPC  : return "RPC";
        case THREAD_DEV  : return "DEV";
        case THREAD_IDLE : return "IDL";
        default          : return "undefined";
    }
}

/////////////////////////////////////////////////////////////////////////////////////
// This static function allocates the physical memory holding a thread descriptor
// and its kernel stack, as one zeroed KMEM_PAGE request of order
// CONFIG_THREAD_DESC_ORDER. It is used by the three functions:
// - thread_user_create()
// - thread_user_fork()
// - thread_kernel_create()
/////////////////////////////////////////////////////////////////////////////////////
// @ return pointer on thread descriptor if success / return NULL if failure.
/////////////////////////////////////////////////////////////////////////////////////
static thread_t * thread_alloc()
{
    kmem_req_t     request;        // kmem request descriptor
    page_t       * desc_page;      // page descriptor returned by the allocator
    xptr_t         desc_base_xp;   // extended pointer on the allocated memory base

    // build the request : zeroed kernel memory
    request.type  = KMEM_PAGE;
    request.size  = CONFIG_THREAD_DESC_ORDER;
    request.flags = AF_KERNEL | AF_ZERO;

    desc_page = kmem_alloc( &request );

    // allocation failure is reported to the caller
    if( desc_page == NULL )
    {
        return NULL;
    }

    // translate the page descriptor to the base address of the allocated memory
    desc_base_xp = ppm_page2base( XPTR( local_cxy , desc_page ) );

    return (thread_t *)GET_PTR( desc_base_xp );

}  // end thread_alloc()
  

/////////////////////////////////////////////////////////////////////////////////////
// This static function releases the physical memory allocated for a thread
// descriptor. It only frees the memory : it does not undo any registration.
// In this file it is called on the error paths of:
// - thread_user_create()
// - thread_user_fork()
// - thread_kernel_create()
// and as the last step of thread_destroy().
/////////////////////////////////////////////////////////////////////////////////////
// @ thread  : pointer on thread descriptor.
/////////////////////////////////////////////////////////////////////////////////////
static void thread_release( thread_t * thread )
{
    kmem_req_t   request;      // kmem request descriptor
    xptr_t       page_xp;      // extended pointer on page descriptor

    // retrieve the page descriptor from the thread descriptor base address
    page_xp = ppm_base2page( XPTR( local_cxy , thread ) );

    // give the page(s) back to the kernel allocator
    request.type = KMEM_PAGE;
    request.ptr  = GET_PTR( page_xp );
    kmem_free( &request );
}

/////////////////////////////////////////////////////////////////////////////////////
// This static function initializes a thread descriptor (kernel or user).
// It can be called by the four functions:
// - thread_user_create()
// - thread_user_fork()
// - thread_kernel_create()
// - thread_idle_init()
// It registers the thread in the process descriptor (to get a TRDID), fills the
// descriptor fields, registers the thread in the scheduler of the target core,
// and updates the local DQDT.
// The thread is left in the THREAD_BLOCKED_GLOBAL state.
/////////////////////////////////////////////////////////////////////////////////////
// @ thread       : pointer on thread descriptor
// @ process      : pointer on process descriptor.
// @ type         : thread type.
// @ func         : pointer on thread entry function.
// @ args         : pointer on thread entry function arguments.
// @ core_lid     : target core local index (not checked here).
// @ u_stack_base : stack base (user thread only)
// @ u_stack_size : stack size (user thread only)
// @ return 0 if success / return EINVAL if no TRDID can be obtained.
/////////////////////////////////////////////////////////////////////////////////////
static error_t thread_init( thread_t      * thread,
                            process_t     * process,
                            thread_type_t   type,
                            void          * func,
                            void          * args,
                            lid_t           core_lid,
                            intptr_t        u_stack_base,
                            uint32_t        u_stack_size )
{
    error_t        error;
    trdid_t        trdid;      // allocated thread identifier

	cluster_t    * local_cluster = LOCAL_CLUSTER;

    // register new thread in process descriptor, and get a TRDID
    error = process_register_thread( process, thread , &trdid );

    if( error )
    {
        printk("\n[ERROR] in %s : cannot get TRDID\n", __FUNCTION__ );
        return EINVAL;
    }

    // compute thread descriptor size without kernel stack
    // (offset of the <signature> field, plus 4 bytes : presumably the size
    // of the <signature> field itself, making it the last field - TODO confirm)
    uint32_t desc_size = (intptr_t)(&thread->signature) - (intptr_t)thread + 4; 

	// Initialize new thread descriptor
    thread->trdid           = trdid;
	thread->type            = type;
    thread->quantum         = 0;            // TODO
    thread->ticks_nr        = 0;            // TODO
    thread->time_last_check = 0;
	thread->core            = &local_cluster->core_tbl[core_lid];
	thread->process         = process;

    thread->local_locks     = 0;
    thread->remote_locks    = 0;

#if CONFIG_LOCKS_DEBUG 
    list_root_init( &thread->locks_root );  
    xlist_root_init( XPTR( local_cxy , &thread->xlocks_root ) );
#endif

    // the kernel stack uses the memory following the descriptor fields
    thread->u_stack_base    = u_stack_base;
    thread->u_stack_size    = u_stack_size;
    thread->k_stack_base    = (intptr_t)thread + desc_size;
    thread->k_stack_size    = CONFIG_THREAD_DESC_SIZE - desc_size;

    thread->entry_func      = func;         // thread entry point
    thread->entry_args      = args;         // thread function arguments
    thread->flags           = 0;            // all flags reset
    thread->errno           = 0;            // no error detected
    thread->fork_user       = 0;            // no user defined placement for fork
    thread->fork_cxy        = 0;            // user defined target cluster for fork
    thread->blocked         = THREAD_BLOCKED_GLOBAL;

    // reset children list
    // NOTE(review): <children_lock>, taken by thread_child_parent_unlink(), is not
    // explicitly initialised in this function - presumably it relies on a
    // zero-filled descriptor ; confirm for descriptors not obtained from thread_alloc()
    xlist_root_init( XPTR( local_cxy , &thread->children_root ) );
    thread->children_nr = 0;

    // reset sched list and brothers list
    list_entry_init( &thread->sched_list );
    xlist_entry_init( XPTR( local_cxy , &thread->brothers_list ) );

    // reset thread info
    memset( &thread->info , 0 , sizeof(thread_info_t) );

    // initializes join_lock
    remote_spinlock_init( XPTR( local_cxy , &thread->join_lock ) );

    // initialise signature
	thread->signature = THREAD_SIGNATURE;

    // FIXME call hal_thread_init() function to initialise the save_sr field
    thread->save_sr = 0xFF13;

    // register new thread in core scheduler
    // (done once the descriptor fields above have been set)
    sched_register_thread( thread->core , thread );

	// update DQDT 
    dqdt_update_threads( 1 );

	return 0;

} // end thread_init()

/////////////////////////////////////////////////////////
// This function creates a user thread in the local cluster for the process
// identified by <pid> : it selects a core, allocates a STACK vseg in the local
// process VMM, allocates and initialises the thread descriptor, then creates
// the CPU context and allocates the FPU context.
// On failure, the resources acquired so far (vseg, descriptor memory, CPU
// context) are released in reverse order of acquisition.
/////////////////////////////////////////////////////////
// @ pid        : process identifier.
// @ start_func : pointer on thread entry function.
// @ start_arg  : pointer on thread entry function arguments.
// @ attr       : pointer on pthread attributes (must not be NULL).
// @ new_thread : [out] pointer on created thread descriptor (written on success only).
// @ return 0 if success / return ENOMEM or EINVAL if failure.
/////////////////////////////////////////////////////////
error_t thread_user_create( pid_t             pid,
                            void            * start_func,
                            void            * start_arg,
                            pthread_attr_t  * attr,
                            thread_t       ** new_thread )
{
    error_t        error;
    thread_t     * thread;       // pointer on created thread descriptor
    process_t    * process;      // pointer to local process descriptor
    lid_t          core_lid;     // selected core local index
    vseg_t       * vseg;         // stack vseg

    assert( (attr != NULL) , __FUNCTION__, "pthread attributes must be defined" );

#if DEBUG_THREAD_USER_CREATE
uint32_t cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_USER_CREATE < cycle )
printk("\n[DBG] %s : thread %x enter / process %x / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, pid , cycle );
#endif

    // get process descriptor local copy
    process = process_get_local_copy( pid );
    if( process == NULL )
    {
        printk("\n[ERROR] in %s : cannot get process descriptor %x\n",
               __FUNCTION__ , pid );
        return ENOMEM;
    }

    // select a target core in local cluster
    if( attr->attributes & PT_ATTR_CORE_DEFINED )
    {
        core_lid = attr->lid;
        if( core_lid >= LOCAL_CLUSTER->cores_nr )
        {
            printk("\n[ERROR] in %s : illegal core index attribute = %d\n",
            __FUNCTION__ , core_lid );
            return EINVAL;
        }
    }
    else
    {
        core_lid = cluster_select_local_core();
    }

    // allocate a stack from local VMM
    vseg = vmm_create_vseg( process,
                            VSEG_TYPE_STACK,
                            0,                 // size unused
                            0,                 // length unused
                            0,                 // file_offset unused
                            0,                 // file_size unused
                            XPTR_NULL,         // mapper_xp unused
                            local_cxy );

    if( vseg == NULL )
    {
        printk("\n[ERROR] in %s : cannot create stack vseg\n", __FUNCTION__ );
        return ENOMEM;
    }

    // allocate memory for thread descriptor
    thread = thread_alloc();

    if( thread == NULL )
    {
        printk("\n[ERROR] in %s : cannot create new thread\n", __FUNCTION__ );
        error = ENOMEM;
        goto undo_vseg;
    }

    // initialize thread descriptor
    error = thread_init( thread,
                         process,
                         THREAD_USER,
                         start_func,
                         start_arg,
                         core_lid,
                         vseg->min,
                         vseg->max - vseg->min );
    if( error )
    {
        printk("\n[ERROR] in %s : cannot initialize new thread\n", __FUNCTION__ );
        error = EINVAL;
        goto undo_thread;
    }

    // NOTE(review): from this point thread_init() has registered the thread in the
    // process descriptor and in the core scheduler, and has incremented the DQDT
    // threads count. The error paths below only release memory (as the original
    // code did) : confirm whether these registrations must also be undone.

    // set DETACHED flag if required
    if( attr->attributes & PT_ATTR_DETACH ) 
    {
        thread->flags |= THREAD_FLAG_DETACHED;
    }

    // allocate & initialize CPU context
    if( hal_cpu_context_create( thread ) )
    {
        printk("\n[ERROR] in %s : cannot create CPU context\n", __FUNCTION__ );
        error = ENOMEM;
        goto undo_thread;
    }

    // allocate  FPU context
    if( hal_fpu_context_alloc( thread ) )
    {
        printk("\n[ERROR] in %s : cannot create FPU context\n", __FUNCTION__ );
        error = ENOMEM;
        goto undo_cpu_context;
    }

#if DEBUG_THREAD_USER_CREATE
cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_USER_CREATE < cycle )
printk("\n[DBG] %s : thread %x exit / process %x / new_thread %x / core %d / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, pid, thread, core_lid, cycle );
#endif

    *new_thread = thread;
    return 0;

    // error paths : release resources in reverse order of acquisition

undo_cpu_context:
    // the CPU context was created : it must not be leaked
    hal_cpu_context_destroy( thread );

undo_thread:
    thread_release( thread );

undo_vseg:
    vmm_remove_vseg( vseg );
    return error;

}  // end thread_user_create()

///////////////////////////////////////////////////////
// This function creates, in the local cluster, the child thread of a fork.
// It reads the entry point, arguments, user stack geometry, flags and uzone
// pointer from the (possibly remote) parent thread descriptor, allocates and
// initialises the child thread descriptor, allocates CPU and FPU contexts,
// creates a STACK vseg in the child process VMM with the same base and size as
// the parent one, copies the valid STACK GPT entries from the parent GPT with
// the COW attribute, and finally sets COW on the parent STACK entries.
// On failure, the contexts, the vseg and the descriptor memory acquired so far
// are released, and <child_thread> is left untouched.
///////////////////////////////////////////////////////
// @ parent_thread_xp : extended pointer on parent thread descriptor.
// @ child_process    : local pointer on child process descriptor.
// @ child_thread     : [out] pointer on child thread descriptor (written on success only).
// @ return 0 if success / return non-zero value (EINVAL or -1) if failure.
///////////////////////////////////////////////////////
error_t thread_user_fork( xptr_t      parent_thread_xp,
                          process_t * child_process,
                          thread_t ** child_thread )
{
    error_t        error;
    thread_t     * child_ptr;        // local pointer on local child thread
    lid_t          core_lid;         // selected core local index

    thread_t     * parent_ptr;       // local pointer on remote parent thread
    cxy_t          parent_cxy;       // parent thread cluster
    process_t    * parent_process;   // local pointer on parent process
    xptr_t         parent_gpt_xp;    // extended pointer on parent thread GPT

    void         * func;             // parent thread entry_func
    void         * args;             // parent thread entry_args
    intptr_t       base;             // parent thread u_stack_base
    uint32_t       size;             // parent thread u_stack_size
    uint32_t       flags;            // parent_thread flags
    vpn_t          vpn_base;         // parent thread stack vpn_base
    vpn_t          vpn_size;         // parent thread stack vpn_size
    reg_t        * uzone;            // parent thread pointer on uzone  

    vseg_t       * vseg;             // child thread STACK vseg

    vpn_t          vpn;              // current page index in STACK vseg
    bool_t         mapped;           // set by hal_gpt_pte_copy() when PTE is mapped
    ppn_t          ppn;              // physical page index returned by hal_gpt_pte_copy()

#if DEBUG_THREAD_USER_FORK
uint32_t cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_USER_FORK < cycle )
printk("\n[DBG] %s : thread %x enter / child_process %x / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, child_process->pid, cycle );
#endif

    // select a target core in local cluster
    core_lid = cluster_select_local_core();

    // get cluster and local pointer on parent thread descriptor
    parent_cxy = GET_CXY( parent_thread_xp );
    parent_ptr = (thread_t *)GET_PTR( parent_thread_xp );

    // get relevant fields from parent thread 
    func  = (void *)  hal_remote_lpt( XPTR( parent_cxy , &parent_ptr->entry_func    ));
    args  = (void *)  hal_remote_lpt( XPTR( parent_cxy , &parent_ptr->entry_args    ));
    base  = (intptr_t)hal_remote_lpt( XPTR( parent_cxy , &parent_ptr->u_stack_base  ));
    size  = (uint32_t)hal_remote_lw ( XPTR( parent_cxy , &parent_ptr->u_stack_size  ));
    flags =           hal_remote_lw ( XPTR( parent_cxy , &parent_ptr->flags         ));
    uzone = (reg_t *) hal_remote_lpt( XPTR( parent_cxy , &parent_ptr->uzone_current ));

    vpn_base = base >> CONFIG_PPM_PAGE_SHIFT;
    vpn_size = size >> CONFIG_PPM_PAGE_SHIFT;

    // get pointer on parent process in parent thread cluster
    parent_process = (process_t *)hal_remote_lpt( XPTR( parent_cxy,
                                                        &parent_ptr->process ) );
 
    // get extended pointer on parent GPT in parent thread cluster
    parent_gpt_xp = XPTR( parent_cxy , &parent_process->vmm.gpt );

    // allocate memory for child thread descriptor
    child_ptr = thread_alloc();
    if( child_ptr == NULL )
    {
        printk("\n[ERROR] in %s : cannot allocate new thread\n", __FUNCTION__ );
        return -1;
    }

    // initialize thread descriptor
    error = thread_init( child_ptr,
                         child_process,
                         THREAD_USER,
                         func,
                         args,
                         core_lid,
                         base,
                         size );
    if( error )
    {
        printk("\n[ERROR] in %s : cannot initialize child thread\n", __FUNCTION__ );
        error = EINVAL;
        goto undo_thread;
    }

    // NOTE(review): from this point thread_init() has registered the child in the
    // child process descriptor and in the core scheduler, and has incremented the
    // DQDT threads count. The error paths below only release memory (as the
    // original code did) : confirm whether these registrations must also be undone.

    // set detached flag if required (OR-ed, so other flags are never clobbered)
    if( flags & THREAD_FLAG_DETACHED ) child_ptr->flags |= THREAD_FLAG_DETACHED;

    // update uzone pointer in child thread descriptor :
    // same offset from the child descriptor as the parent uzone from the parent one
    child_ptr->uzone_current = (char *)((intptr_t)uzone +
                                        (intptr_t)child_ptr - 
                                        (intptr_t)parent_ptr );

    // allocate CPU context for child thread
    if( hal_cpu_context_alloc( child_ptr ) )
    {
        printk("\n[ERROR] in %s : cannot allocate CPU context\n", __FUNCTION__ );
        error = -1;
        goto undo_thread;
    }

    // allocate FPU context for child thread
    if( hal_fpu_context_alloc( child_ptr ) )
    {
        printk("\n[ERROR] in %s : cannot allocate FPU context\n", __FUNCTION__ );
        error = -1;
        goto undo_cpu_context;
    }

    // create and initialize STACK vseg 
    vseg = vseg_alloc();
    if( vseg == NULL )
    {
        printk("\n[ERROR] in %s : cannot allocate STACK vseg\n", __FUNCTION__ );
        error = -1;
        goto undo_fpu_context;
    }

    vseg_init( vseg,
               VSEG_TYPE_STACK,
               base,
               size,
               vpn_base,
               vpn_size,
               0, 0, XPTR_NULL,                         // not a file vseg
               local_cxy );

    // register STACK vseg in local child VSL
    vseg_attach( &child_process->vmm , vseg );

    // copy all valid STACK GPT entries   
    for( vpn = vpn_base ; vpn < (vpn_base + vpn_size) ; vpn++ )
    {
        error = hal_gpt_pte_copy( &child_process->vmm.gpt,
                                  parent_gpt_xp,
                                  vpn,
                                  true,                 // set cow
                                  &ppn,
                                  &mapped );
        if( error )
        {
            // NOTE(review): the <forks> counters already incremented for the
            // previously copied pages are not decremented here - confirm
            printk("\n[ERROR] in %s : cannot update child GPT\n", __FUNCTION__ );
            error = -1;
            goto undo_vseg;
        }

        // increment pending forks counter for the page if mapped
        if( mapped )
        {
            xptr_t   page_xp  = ppm_ppn2page( ppn );
            cxy_t    page_cxy = GET_CXY( page_xp );
            page_t * page_ptr = (page_t *)GET_PTR( page_xp );
            hal_remote_atomic_add( XPTR( page_cxy , &page_ptr->forks ) , 1 );

#if (DEBUG_THREAD_USER_FORK & 1)
cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_USER_FORK < cycle )
printk("\n[DBG] %s : thread %x copied stack PTE to child GPT : vpn %x\n",
__FUNCTION__, CURRENT_THREAD, vpn );
#endif

        }
    }

    // set COW flag for all mapped entries of STACK vseg in parent thread GPT 
    hal_gpt_set_cow( parent_gpt_xp,
                     vpn_base,
                     vpn_size );
 
#if DEBUG_THREAD_USER_FORK
cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_USER_FORK < cycle )
printk("\n[DBG] %s : thread %x exit / child_process %x / child_thread %x / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, child_process->pid, child_ptr, cycle );
#endif

    // return child pointer only once nothing can fail any more,
    // so that the caller never receives a pointer on a released descriptor
    *child_thread = child_ptr;
    return 0;

    // error paths : release resources in reverse order of acquisition

undo_vseg:
    vseg_detach( &child_process->vmm , vseg );
    vseg_free( vseg );

undo_fpu_context:
    hal_fpu_context_destroy( child_ptr );

undo_cpu_context:
    hal_cpu_context_destroy( child_ptr );

undo_thread:
    thread_release( child_ptr );
    return error;

}  // end thread_user_fork()

/////////////////////////////////////////////////////////
// This function creates a kernel thread (IDLE, RPC or DEV type) attached to the
// kernel process <process_zero>, on the core identified by <core_lid> in the
// local cluster : it allocates and initialises the thread descriptor, then
// creates the CPU context. A kernel thread has no user stack.
/////////////////////////////////////////////////////////
// @ new_thread : [out] pointer on created thread descriptor (written on success only).
// @ type       : thread type (THREAD_IDLE / THREAD_RPC / THREAD_DEV).
// @ func       : pointer on thread entry function.
// @ args       : pointer on thread entry function arguments.
// @ core_lid   : target core local index.
// @ return 0 if success / return ENOMEM or EINVAL if failure.
/////////////////////////////////////////////////////////
error_t thread_kernel_create( thread_t     ** new_thread,
                              thread_type_t   type,
                              void          * func,
                              void          * args,
                              lid_t           core_lid )
{
    error_t        error;
    thread_t     * thread;       // pointer on new thread descriptor

    assert( ( (type == THREAD_IDLE) || (type == THREAD_RPC) || (type == THREAD_DEV) ) ,
    __FUNCTION__ , "illegal thread type" );

    assert( (core_lid < LOCAL_CLUSTER->cores_nr) ,
            __FUNCTION__ , "illegal core_lid" );

    // the new thread does not exist yet at this point : only the calling
    // thread and the requested type can be displayed
#if DEBUG_THREAD_KERNEL_CREATE
uint32_t cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_KERNEL_CREATE < cycle )
printk("\n[DBG] %s : thread %x enter / requested_type %s / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, thread_type_str(type), cycle );
#endif

    // allocate memory for new thread descriptor
    thread = thread_alloc();

    if( thread == NULL ) return ENOMEM;

    // initialize thread descriptor
    error = thread_init( thread,
                         &process_zero,
                         type,
                         func,
                         args,
                         core_lid,
                         0 , 0 );  // no user stack for a kernel thread

    if( error ) // release allocated memory for thread descriptor
    {
        thread_release( thread );
        return EINVAL;
    }

    // allocate & initialize CPU context
    // NOTE(review): as in thread_user_create(), this error path only releases the
    // descriptor memory ; the registrations done by thread_init() are not undone.
    if( hal_cpu_context_create( thread ) )
    {
        printk("\n[ERROR] in %s : cannot create CPU context\n", __FUNCTION__ );
        thread_release( thread );
        return ENOMEM;
    }

#if DEBUG_THREAD_KERNEL_CREATE
cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_KERNEL_CREATE < cycle )
printk("\n[DBG] %s : thread %x exit / new_thread %x / type %s / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, thread, thread_type_str(type), cycle );
#endif

    *new_thread = thread;
    return 0;

} // end thread_kernel_create()

/////////////////////////////////////////////////
// This function initialises an already allocated thread descriptor as an IDLE
// thread attached to the kernel process <process_zero>, on the core identified
// by <core_lid>, then creates its CPU context. It allocates no descriptor memory.
/////////////////////////////////////////////////
// @ thread   : pointer on the thread descriptor to initialise.
// @ type     : thread type (must be THREAD_IDLE).
// @ func     : pointer on thread entry function.
// @ args     : pointer on thread entry function arguments.
// @ core_lid : target core local index.
// @ return 0 if success / return non-zero value if the descriptor initialisation
//   or the CPU context creation failed.
/////////////////////////////////////////////////
error_t thread_idle_init( thread_t      * thread,
                          thread_type_t   type,
                          void          * func,
                          void          * args,
                          lid_t           core_lid )
{
    assert( (type == THREAD_IDLE) , __FUNCTION__ , "illegal thread type" );

    assert( (core_lid < LOCAL_CLUSTER->cores_nr) , __FUNCTION__ , "illegal core index" );

    error_t  error = thread_init( thread,
                                  &process_zero,
                                  type,
                                  func,
                                  args,
                                  core_lid,
                                  0 , 0 );   // no user stack for a kernel thread

    // allocate & initialize CPU context if success,
    // and report a context creation failure instead of silently returning 0
    if( error == 0 ) error = hal_cpu_context_create( thread );

    return error;

}  // end thread_idle_init()

///////////////////////////////////////////////////////////////////////////////////////
// This function releases all resources attached to a thread : CPU context, FPU
// context (user thread only), FPU ownership on its core, slot in the process
// descriptor, DQDT threads count, and finally the descriptor memory itself.
// The thread must have no attached children and must hold no lock.
// TODO: check that all memory dynamically allocated during thread execution
// has been released, using a cache of mmap and malloc requests. [AG]
///////////////////////////////////////////////////////////////////////////////////////
void thread_destroy( thread_t * thread )
{
    reg_t        irq_state;                          // saved status register
    process_t  * owner_process = thread->process;    // process owning the thread
    core_t     * thread_core   = thread->core;       // core running the thread

#if DEBUG_THREAD_DESTROY
uint32_t cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_DESTROY < cycle )
printk("\n[DBG] %s : thread %x enter to destroy thread %x in process %x / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, thread, owner_process->pid, cycle );
#endif

    // sanity checks : no children, no lock still held
    assert( (thread->children_nr == 0) , __FUNCTION__ , "still attached children" );
    assert( (thread->local_locks == 0) , __FUNCTION__ , "all local locks not released" );
    assert( (thread->remote_locks == 0) , __FUNCTION__ , "all remote locks not released" );

    // accumulate the thread page faults count in the process instrumentation
    owner_process->vmm.pgfault_nr += thread->info.pgfault_nr;

    // release CPU context, and FPU context for a user thread
    hal_cpu_context_destroy( thread );
    if( thread->type == THREAD_USER )
    {
        hal_fpu_context_destroy( thread );
    }

    // give up FPU ownership if this thread owns it (IRQs masked meanwhile)
    hal_disable_irq( &irq_state );
    if( thread_core->fpu_owner == thread )
    {
        thread_core->fpu_owner = NULL;
        hal_fpu_disable();
    }
    hal_restore_irq( irq_state );

    // unregister thread from process th_tbl[]
    process_remove_thread( thread );

    // one thread less in DQDT
    dqdt_update_threads( -1 );

    // invalidate the descriptor, then release its memory
    thread->signature = 0;
    thread_release( thread );

#if DEBUG_THREAD_DESTROY
cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_DESTROY < cycle )
printk("\n[DBG] %s : thread %x exit / destroyed thread %x in process %x / cycle %d\n",
__FUNCTION__, CURRENT_THREAD, thread, owner_process->pid, cycle );
#endif

}   // end thread_destroy()

/////////////////////////////////////////////////
// This function inserts the child thread <brothers_list> entry at the head of
// the parent thread children list, and atomically increments the parent
// <children_nr> counter. Both threads can be remote.
// NOTE(review): unlike thread_child_parent_unlink(), this function does not take
// the parent <children_lock> - confirm that callers provide the exclusion.
/////////////////////////////////////////////////
void thread_child_parent_link( xptr_t  xp_parent,
                               xptr_t  xp_child )
{
    // split both extended pointers into cluster and local pointer
    thread_t * parent     = (thread_t *)GET_PTR( xp_parent );
    cxy_t      parent_cxy = GET_CXY( xp_parent );
    thread_t * child      = (thread_t *)GET_PTR( xp_child );
    cxy_t      child_cxy  = GET_CXY( xp_child );

    // extended pointers on the list root (parent) and on the list entry (child)
    xptr_t     children_root_xp = XPTR( parent_cxy , &parent->children_root );
    xptr_t     brothers_xp      = XPTR( child_cxy  , &child->brothers_list );

    // extended pointer on the parent children counter
    xptr_t     children_nr_xp   = XPTR( parent_cxy , &parent->children_nr );

    // set the link, then account for the new child
    xlist_add_first( children_root_xp , brothers_xp );
    hal_remote_atomic_add( children_nr_xp , 1 );

}  // end thread_child_parent_link()

///////////////////////////////////////////////////
// This function removes the child thread <brothers_list> entry from the list it
// belongs to, and atomically decrements the parent <children_nr> counter.
// Both operations are done while holding the parent <children_lock>.
// Both threads can be remote.
///////////////////////////////////////////////////
void thread_child_parent_unlink( xptr_t  xp_parent,
                                 xptr_t  xp_child )
{
    // split both extended pointers into cluster and local pointer
    thread_t * parent     = (thread_t *)GET_PTR( xp_parent );
    cxy_t      parent_cxy = GET_CXY( xp_parent );
    thread_t * child      = (thread_t *)GET_PTR( xp_child );
    cxy_t      child_cxy  = GET_CXY( xp_child );

    // extended pointers on the parent lock and counter, and on the child list entry
    xptr_t     children_lock_xp = XPTR( parent_cxy , &parent->children_lock );
    xptr_t     children_nr_xp   = XPTR( parent_cxy , &parent->children_nr );
    xptr_t     brothers_xp      = XPTR( child_cxy  , &child->brothers_list );

    // critical section protected by the parent children_lock
    remote_spinlock_lock( children_lock_xp );

    xlist_unlink( brothers_xp );
    hal_remote_atomic_add( children_nr_xp , -1 );

    remote_spinlock_unlock( children_lock_xp );

}  // thread_child_parent_unlink()

//////////////////////////////////////////////////
// This function registers an acknowledge request for the <target> thread :
// it waits until the scheduler of the target thread core has no pending
// request, then, with IRQs masked, it sets <req_ack_pending> in that scheduler,
// sets THREAD_FLAG_REQ_ACK in the target thread flags, and stores <rsp_count>
// in the target thread <ack_rsp_count> field.
// The target descriptor is accessed through a local pointer.
// @ target    : local pointer on target thread descriptor.
// @ rsp_count : pointer on the responses counter associated to the request.
//////////////////////////////////////////////////
inline void thread_set_req_ack( thread_t * target,
                                uint32_t * rsp_count )
{
    reg_t    save_sr;   // for critical section

    // get pointer on target thread scheduler
    scheduler_t * sched = &target->core->scheduler;

    // wait scheduler ready to handle a new request
    // (busy-waiting loop on the pending flag)
    // NOTE(review): this test and the set below are not a single atomic step :
    // presumably requesters are serialised by the callers - TODO confirm
    while( sched->req_ack_pending ) asm volatile( "nop" );
    
    // enter critical section
    hal_disable_irq( &save_sr );
      
    // set request in target thread scheduler
    sched->req_ack_pending = true;

    // set ack request in target thread "flags"
    hal_atomic_or( &target->flags , THREAD_FLAG_REQ_ACK );

    // set pointer on responses counter in target thread
    // NOTE(review): written after the flag has been set - confirm that the
    // flag cannot be consumed before this pointer is visible
    target->ack_rsp_count = rsp_count;
    
    // exit critical section
    hal_restore_irq( save_sr );

    // memory barrier after the updates
    hal_fence();

}  // thread_set_req_ack()

/////////////////////////////////////////////////////
// This function cancels the acknowledge request registered for the <target>
// thread by thread_set_req_ack() : with IRQs masked, it resets <req_ack_pending>
// in the scheduler of the target thread core, resets THREAD_FLAG_REQ_ACK in the
// target thread flags, and resets the <ack_rsp_count> pointer to NULL.
// It asserts that a request is actually pending in the scheduler.
// The target descriptor is accessed through a local pointer.
// @ target : local pointer on target thread descriptor.
/////////////////////////////////////////////////////
inline void thread_reset_req_ack( thread_t * target )
{
    reg_t    save_sr;   // for critical section

    // get pointer on target thread scheduler
    scheduler_t * sched = &target->core->scheduler;

    // check signal pending in scheduler
    assert( sched->req_ack_pending , __FUNCTION__ , "no pending signal" );
    
    // enter critical section
    hal_disable_irq( &save_sr );
      
    // reset signal in scheduler
    sched->req_ack_pending = false;

    // reset signal in thread "flags"
    hal_atomic_and( &target->flags , ~THREAD_FLAG_REQ_ACK );

    // reset pointer on responses counter 
    target->ack_rsp_count = NULL;
    
    // exit critical section
    hal_restore_irq( save_sr );

    // memory barrier after the updates
    hal_fence();

}  // thread_reset_req_ack()

////////////////////////////////
// This function tells whether the calling thread currently holds no lock,
// neither local nor remote.
// @ return true if both locks counters are zero.
////////////////////////////////
inline bool_t thread_can_yield()
{
    thread_t * current = CURRENT_THREAD;

    bool_t no_local_lock  = (current->local_locks  == 0);
    bool_t no_remote_lock = (current->remote_locks == 0);

    return no_local_lock && no_remote_lock;
}

/////////////////////////
// This function performs a scheduling that has been delayed : when the calling
// thread holds no lock (local or remote) and has the THREAD_FLAG_SCHED flag
// set, it resets this flag and calls sched_yield().
/////////////////////////
void thread_check_sched()
{
    thread_t * current = CURRENT_THREAD;

    // nothing to do while a lock is held
    if( current->local_locks  != 0 ) return;
    if( current->remote_locks != 0 ) return;

    // nothing to do if no scheduling has been requested
    if( (current->flags & THREAD_FLAG_SCHED) == 0 ) return;

    // consume the request and yield
    current->flags &= ~THREAD_FLAG_SCHED;
    sched_yield( "delayed scheduling" );

}  // end thread_check_sched()

//////////////////////////////////////
// This function atomically sets the bit(s) defined by <cause> in the <blocked>
// field of the thread identified by <thread_xp>, which can be remote.
// @ thread_xp : extended pointer on target thread descriptor.
// @ cause     : mask defining the blocking cause(s).
//////////////////////////////////////
void thread_block( xptr_t   thread_xp,
                   uint32_t cause )
{
    // split the extended pointer into cluster and local pointer
    thread_t * target_ptr = GET_PTR( thread_xp );
    cxy_t      target_cxy = GET_CXY( thread_xp );

    // extended pointer on the target <blocked> field
    xptr_t     blocked_xp = XPTR( target_cxy , &target_ptr->blocked );

    // register the blocking cause
    hal_remote_atomic_or( blocked_xp , cause );
    hal_fence();

#if DEBUG_THREAD_BLOCK
uint32_t cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_BLOCK < cycle )
printk("\n[DBG] %s : thread %x blocked thread %x / cause %x / cycle %d\n",
__FUNCTION__ , CURRENT_THREAD , target_ptr , cause , cycle );
#endif

    // NOTE(review): the debug code below dereferences the target local pointer
    // directly - presumably only meaningful when the target thread is local
#if (DEBUG_THREAD_BLOCK & 1)
if( DEBUG_THREAD_BLOCK < cycle )
sched_display( target_ptr->core->lid );
#endif

} // end thread_block()

////////////////////////////////////////////
// This function atomically resets the bit(s) defined by <cause> in the <blocked>
// field of the thread identified by <thread_xp>, which can be remote.
// @ thread_xp : extended pointer on target thread descriptor.
// @ cause     : mask defining the blocking cause(s) to reset.
// @ return the <cause> bits of the value returned by the atomic operation
//   (non zero if at least one of these bits was set).
////////////////////////////////////////////
uint32_t thread_unblock( xptr_t   thread_xp,
                         uint32_t cause )
{
    // split the extended pointer into cluster and local pointer
    thread_t * target_ptr = GET_PTR( thread_xp );
    cxy_t      target_cxy = GET_CXY( thread_xp );

    // extended pointer on the target <blocked> field
    xptr_t     blocked_xp = XPTR( target_cxy , &target_ptr->blocked );

    // clear the blocking cause, keeping the value returned by the atomic operation
    uint32_t   old_blocked = hal_remote_atomic_and( blocked_xp , ~cause );
    hal_fence();

#if DEBUG_THREAD_BLOCK
uint32_t cycle = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_BLOCK < cycle )
printk("\n[DBG] %s : thread %x unblocked thread %x / cause %x / cycle %d\n",
__FUNCTION__ , CURRENT_THREAD , target_ptr , cause , cycle );
#endif

    // NOTE(review): the debug code below dereferences the target local pointer
    // directly - presumably only meaningful when the target thread is local
#if (DEBUG_THREAD_BLOCK & 1)
if( DEBUG_THREAD_BLOCK < cycle )
sched_display( target_ptr->core->lid );
#endif

    // non zero when the cause bit has been modified
    return( old_blocked & cause );

}  // end thread_unblock()

////////////////////////////////////
void thread_kill( xptr_t  target_xp,
                  bool_t  is_exit,
                  bool_t  is_forced )
{
    reg_t       save_sr;                // for critical section
    bool_t      attached;               // target thread in attached mode
    bool_t      join_done;              // joining thread arrived first
    xptr_t      killer_xp;              // extended pointer on killer thread (this)
    thread_t  * killer_ptr;             // pointer on killer thread (this)
    cxy_t       target_cxy;             // target thread cluster     
    thread_t  * target_ptr;             // pointer on target thread
    xptr_t      joining_xp;             // extended pointer on joining thread
    thread_t  * joining_ptr;            // pointer on joining thread
    cxy_t       joining_cxy;            // joining thread cluster
    pid_t       target_pid;             // target process PID
    cxy_t       owner_cxy;              // target process owner cluster
    trdid_t     target_trdid;           // target thread identifier
    ltid_t      target_ltid;            // target thread local index
    xptr_t      process_state_xp;       // extended pointer on <term_state> in process

    xptr_t      target_flags_xp;        // extended pointer on target thread <flags>
    xptr_t      target_join_lock_xp;    // extended pointer on target thread <join_lock>
    xptr_t      target_join_xp_xp;      // extended pointer on target thread <join_xp>
    xptr_t      target_process_xp;      // extended pointer on target thread <process>

    process_t * target_process;         // pointer on target thread process

    // get target thread cluster and pointer
    target_cxy = GET_CXY( target_xp );
    target_ptr = GET_PTR( target_xp );

    // get killer thread pointers
    killer_ptr = CURRENT_THREAD;
    killer_xp  = XPTR( local_cxy , killer_ptr );

#if DEBUG_THREAD_KILL
uint32_t cycle  = (uint32_t)hal_get_cycles;
if( DEBUG_THREAD_KILL < cycle )
printk("\n[DBG] %s : thread %x enter for target thread %x / cycle %d\n",
__FUNCTION__, killer_ptr, target_ptr, cycle );
#endif

    // block the target thread 
    thread_block( target_xp , THREAD_BLOCKED_GLOBAL );

    // get target thread attached mode
    target_flags_xp = XPTR( target_cxy , &target_ptr->flags );
    attached = ((hal_remote_lw( target_flags_xp ) & THREAD_FLAG_DETACHED) == 0);

    // synchronize with the joining thread 
    // if the target thread is attached && not forced

    if( attached  && (is_forced == false) )
    {
        // build extended pointers on target thread join fields
        target_join_lock_xp  = XPTR( target_cxy , &target_ptr->join_lock );
        target_join_xp_xp    = XPTR( target_cxy , &target_ptr->join_xp );

        // enter critical section
        hal_disable_irq( &save_sr );

        // take the join_lock in target thread descriptor
        remote_spinlock_lock( target_join_lock_xp );

        // get join_done from target thread descriptor
        join_done = ((hal_remote_lw( target_flags_xp ) & THREAD_FLAG_JOIN_DONE) != 0);
    
        if( join_done )     // joining thread arrived first
        {
            // get extended pointer on joining thread
            joining_xp  = (xptr_t)hal_remote_lwd( target_join_xp_xp );
            joining_ptr = GET_PTR( joining_xp );
            joining_cxy = GET_CXY( joining_xp );
            
            // reset the join_done flag in target thread
            hal_remote_atomic_and( target_flags_xp , ~THREAD_FLAG_JOIN_DONE );

            // unblock the joining thread
            thread_unblock( joining_xp , THREAD_BLOCKED_JOIN );

            // release the join_lock in target thread descriptor
            remote_spinlock_unlock( target_join_lock_xp );

            // restore IRQs
            hal_restore_irq( save_sr );
        }
        else                // this thread arrived first
        {
            // set the kill_done flag in target thread
            hal_remote_atomic_or( target_flags_xp , THREAD_FLAG_KILL_DONE );

            // block this thread on BLOCKED_JOIN
            thread_block( killer_xp , THREAD_BLOCKED_JOIN );

            // set extended pointer on killer thread in target thread
            hal_remote_swd( target_join_xp_xp , killer_xp );

            // release the join_lock in target thread descriptor
            remote_spinlock_unlock( target_join_lock_xp );

            // deschedule
            sched_yield( "killer thread wait joining thread" );

            // restore IRQs
            hal_restore_irq( save_sr );
        }
    }  // end if attached

    // - if the target thread is the main thread
    //   => synchronize with the parent process main thread
    // - if the target thread is not the main thread
    //   => simply mark the target thread for delete

    // get pointer on target thread process
    target_process_xp  = XPTR( target_cxy , &target_ptr->process );
    target_process     = (process_t *)hal_remote_lpt( target_process_xp ); 

	// get target process owner cluster
	target_pid = hal_remote_lw( XPTR( target_cxy , &target_process->pid ) );
    owner_cxy = CXY_FROM_PID( target_pid );

    // get target thread local index
    target_trdid = hal_remote_lw( XPTR( target_cxy , &target_ptr->trdid ) );
    target_ltid  = LTID_FROM_TRDID( target_trdid );

    if( (owner_cxy == target_cxy) && (target_ltid == 0) )     // main thread
    {
        // get extended pointer on term_state in target process owner cluster
        process_state_xp = XPTR( owner_cxy , &target_process->term_state );

        // set termination info in target process owner  
        if( is_exit ) hal_remote_atomic_or( process_state_xp , PROCESS_TERM_EXIT );
        else          hal_remote_atomic_or( process_state_xp , PROCESS_TERM_KILL );

#if DEBUG_THREAD_KILL
cycle  = (uint32_t)hal_get_cycles;
if( DEBUG_THREAD_KILL < cycle )
printk("\n[DBG] %s : thread %x exit for thread %x / main thread / cycle %d\n",
__FUNCTION__, killer_ptr, target_ptr, cycle );
#endif

    }
    else                                                      // main thread
    {
        // set the REQ_DELETE flag in target thread descriptor
        hal_remote_atomic_or( target_flags_xp , THREAD_FLAG_REQ_DELETE );

#if DEBUG_THREAD_KILL
cycle  = (uint32_t)hal_get_cycles;
if( DEBUG_THREAD_KILL < cycle )
printk("\n[DBG] %s : thread %x exit for thread %x / not the main thread / cycle %d\n",
__FUNCTION__, killer_ptr, target_ptr, cycle );
#endif

    }

}  // end thread_kill()

///////////////////////
// Function executed by an idle thread: endless loop that unmasks IRQs, then
// - if CONFIG_THREAD_IDLE_MODE_SLEEP is non zero, calls hal_core_sleep()
//   (intended to put the core in low-power mode),
// - otherwise calls the scheduler to search a runnable thread.
// This function never returns.
void thread_idle_func()
{
    while( 1 )
    {
        // unmask IRQs (the previous status register value is not saved)
        hal_enable_irq( NULL );

        if( CONFIG_THREAD_IDLE_MODE_SLEEP ) // force core to low-power mode
        {

#if DEBUG_THREAD_IDLE
// hal_get_cycles must be CALLED : without parentheses the cast applies to the
// function address, and the cycle-based filter below becomes meaningless
uint32_t cycle  = (uint32_t)hal_get_cycles();
thread_t * this = CURRENT_THREAD;
if( DEBUG_THREAD_IDLE < cycle )
printk("\n[DBG] %s : idle thread %x on core[%x,%d] goes to sleep / cycle %d\n",
__FUNCTION__, this, local_cxy, this->core->lid, cycle );
#endif

            hal_core_sleep();

#if DEBUG_THREAD_IDLE
cycle  = (uint32_t)hal_get_cycles();
if( DEBUG_THREAD_IDLE < cycle )
printk("\n[DBG] %s : idle thread %x on core[%x,%d] wake up / cycle %d\n",
__FUNCTION__, this, local_cxy, this->core->lid, cycle );
#endif

        }
        else                                // search a runable thread
        {
            sched_yield( "IDLE" );
        }
    }
}  // end thread_idle_func()


/////////////////////////////////////////////////
// Placeholder, presumably meant to update the user-mode time accounting
// of <thread> (TODO confirm intended semantics).
// Not implemented yet: the body is empty, so calling it has no effect
// and the <thread> argument is not used.
void thread_user_time_update( thread_t * thread )
{
    // TODO : implement (the warning below is currently commented out)
    // printk("\n[WARNING] function %s not implemented\n", __FUNCTION__ );
}

///////////////////////////////////////////////////
// Placeholder, presumably meant to update the kernel-mode time accounting
// of <thread> (TODO confirm intended semantics).
// Not implemented yet: the body is empty, so calling it has no effect
// and the <thread> argument is not used.
void thread_kernel_time_update( thread_t * thread )
{
    // TODO : implement (the warning below is currently commented out)
    // printk("\n[WARNING] function %s not implemented\n", __FUNCTION__ );
}

/////////////////////////////////////
// Returns an extended pointer on the thread descriptor identified by the
// <pid> and <trdid> arguments. The target cluster and the thread local index
// are both extracted from <trdid>. The process <pid> is searched in the list
// of process descriptors of the target cluster, and the thread pointer is
// read from the th_tbl[] array of that process descriptor.
// Returns XPTR_NULL when :
// - the local index is not smaller than CONFIG_THREAD_MAX_PER_CLUSTER,
// - the cluster extracted from <trdid> is undefined,
// - no process descriptor with the requested <pid> is found in target cluster,
// - the th_tbl[] slot selected by the local index contains NULL.
xptr_t thread_get_xptr( pid_t    pid,
                        trdid_t  trdid )
{
    cxy_t         target_cxy;                 // target thread cluster identifier
    ltid_t        target_thread_ltid;         // target thread local index
    thread_t    * target_thread_ptr;          // target thread local pointer
    xptr_t        target_process_xp;          // extended pointer on scanned process descriptor
    process_t   * target_process_ptr = NULL;  // local pointer on scanned process descriptor
    pid_t         target_process_pid;         // PID of scanned process descriptor
    xptr_t        root_xp;                    // extended pointer on root of list of processes
    xptr_t        lock_xp;                    // extended pointer on lock protecting this list
    xptr_t        iter;                       // iterator on list of processes
    bool_t        found = false;              // true when the PID has been found

    // get target cluster identifier and local thread identifier
    target_cxy         = CXY_FROM_TRDID( trdid );
    target_thread_ltid = LTID_FROM_TRDID( trdid );

    // check trdid argument
    if( (target_thread_ltid >= CONFIG_THREAD_MAX_PER_CLUSTER) ||
        cluster_is_undefined( target_cxy ) )         return XPTR_NULL;

    // build extended pointers on the root of the list of processes in target
    // cluster, and on the lock protecting it : the local addresses of the
    // LOCAL_CLUSTER fields are combined with the target cluster identifier
    // (presumably valid because the cluster manager has the same local
    // address in all clusters - to be confirmed)
    root_xp = XPTR( target_cxy , &LOCAL_CLUSTER->pmgr.local_root );
    lock_xp = XPTR( target_cxy , &LOCAL_CLUSTER->pmgr.local_lock );

    // take the lock protecting the list of processes in target cluster
    remote_spinlock_lock( lock_xp );

    // loop on list of process in target cluster to find the PID process
    XLIST_FOREACH( root_xp , iter )
    {
        target_process_xp  = XLIST_ELEMENT( iter , process_t , local_list );
        target_process_ptr = (process_t *)GET_PTR( target_process_xp );
        target_process_pid = hal_remote_lw( XPTR( target_cxy , &target_process_ptr->pid ) );
        if( target_process_pid == pid )
        {
            found = true;
            break;
        }
    }

    // release the lock protecting the list of processes in target cluster
    remote_spinlock_unlock( lock_xp );

    // check PID found
    if( found == false ) return XPTR_NULL;

    // get target thread local pointer from the process th_tbl[] array
    xptr_t xp = XPTR( target_cxy , &target_process_ptr->th_tbl[target_thread_ltid] );
    target_thread_ptr = (thread_t *)hal_remote_lpt( xp );

    // the slot is empty : no thread registered for this local index
    if( target_thread_ptr == NULL )  return XPTR_NULL;

    return XPTR( target_cxy , target_thread_ptr );
}

