/*
 * remote_barrier.c -  POSIX barrier implementation.
 *
 * Author   Alain Greiner (2016,2017,2018)
 *
 * Copyright (c) UPMC Sorbonne Universites
 *
 * This file is part of ALMOS-MKH.
 *
 * ALMOS-MKH is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2.0 of the License.
 *
 * ALMOS-MKH is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ALMOS-MKH; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <hal_kernel_types.h>
#include <hal_remote.h>
#include <hal_irqmask.h>
#include <remote_busylock.h>
#include <thread.h>
#include <kmem.h>
#include <printk.h>
#include <process.h>
#include <vmm.h>
#include <remote_barrier.h>


///////////////////////////////////////////////////
xptr_t remote_barrier_from_ident( intptr_t  ident )
{
    // get pointer on local process_descriptor
    process_t * process = CURRENT_THREAD->process;

    // get extended pointer on reference process
    xptr_t      ref_xp = process->ref_xp;

    // get cluster and local pointer on reference process
    cxy_t          ref_cxy = GET_CXY( ref_xp );
    process_t    * ref_ptr = (process_t *)GET_PTR( ref_xp );

    // get extended pointer on root of barriers list
    xptr_t root_xp = XPTR( ref_cxy , &ref_ptr->barrier_root );

    // scan reference process barriers list
    xptr_t             iter_xp;
    xptr_t             barrier_xp;
    cxy_t              barrier_cxy;
    remote_barrier_t * barrier_ptr;
    intptr_t           current;
    bool_t             found = false;

    XLIST_FOREACH( root_xp , iter_xp )
    {
        barrier_xp  = XLIST_ELEMENT( iter_xp , remote_barrier_t , list );
        barrier_cxy = GET_CXY( barrier_xp );
        barrier_ptr = (remote_barrier_t *)GET_PTR( barrier_xp );
        current     = (intptr_t)hal_remote_lpt( XPTR( barrier_cxy , &barrier_ptr->ident ) );
        if( ident == current )
        {
            found = true;
            break;
        }
    }

    if( found == false )  return XPTR_NULL;
    else                  return barrier_xp;
}

//////////////////////////////////////////////
// Allocate a barrier descriptor in the cluster containing the reference process
// of the calling thread, initialise it, and register it in the reference process
// barriers list (under the reference process <sync_lock>).
// @ ident   : barrier identifier, stored in the barrier <ident> field.
// @ count   : number of threads expected by remote_barrier_wait() / must be non zero.
// @ return 0 if success / EINVAL if <count> is zero / ENOMEM if allocation failed.
error_t remote_barrier_create( intptr_t ident,
                               uint32_t count )
{
    xptr_t             barrier_xp;
    remote_barrier_t * barrier_ptr;

    // get pointer on local process descriptor
    thread_t  * this    = CURRENT_THREAD;
    process_t * process = this->process;

#if DEBUG_BARRIER
uint32_t cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x enter / count %d / cycle %d\n",
__FUNCTION__, this->trdid, process->pid, count, cycle );
#endif

    // a barrier for zero threads can never be released : remote_barrier_wait()
    // compares the arrival index to (nb_threads - 1), that wraps around for 0,
    // so all callers would block forever. Refuse it before allocating anything.
    if( count == 0 ) return EINVAL;

    // get extended pointer on reference process
    xptr_t      ref_xp = process->ref_xp;

    // get reference process cluster and local pointer
    cxy_t       ref_cxy = GET_CXY( ref_xp );
    process_t * ref_ptr = GET_PTR( ref_xp );

    // allocate memory for barrier descriptor
    if( ref_cxy == local_cxy )                  // local cluster is the reference
    {
        kmem_req_t req;
        req.type      = KMEM_BARRIER;
        req.flags     = AF_ZERO;
        barrier_ptr   = kmem_alloc( &req );
        barrier_xp    = XPTR( local_cxy , barrier_ptr );
    }
    else                                       // reference is remote
    {
        rpc_kcm_alloc_client( ref_cxy , KMEM_BARRIER , &barrier_xp );
        barrier_ptr = (remote_barrier_t *)GET_PTR( barrier_xp );
    }

    if( barrier_ptr == NULL ) return ENOMEM;

    // initialise barrier
    // NOTE(review): the <lock> field used by remote_barrier_wait() is not explicitly
    // initialised here; presumably it relies on the descriptor being zeroed by the
    // allocator (AF_ZERO in the local case) - TODO confirm for the RPC path.
    hal_remote_s32( XPTR( ref_cxy , &barrier_ptr->nb_threads ) , count );
    hal_remote_s32( XPTR( ref_cxy , &barrier_ptr->current    ) , 0 );
    hal_remote_s32( XPTR( ref_cxy , &barrier_ptr->sense      ) , 0 );
    hal_remote_spt( XPTR( ref_cxy , &barrier_ptr->ident      ) , (void*)ident );

    // initialise the (empty) queue of waiting threads
    xlist_root_init( XPTR( ref_cxy , &barrier_ptr->root ) );

    // register  barrier in reference process xlist
    xptr_t root_xp  = XPTR( ref_cxy , &ref_ptr->barrier_root );
    xptr_t entry_xp = XPTR( ref_cxy , &barrier_ptr->list );

    remote_busylock_acquire( XPTR( ref_cxy , &ref_ptr->sync_lock ) );
    xlist_add_first( root_xp , entry_xp );
    remote_busylock_release( XPTR( ref_cxy , &ref_ptr->sync_lock ) );

#if DEBUG_BARRIER
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x exit / barrier %x in cluster %x / cycle %d\n",
__FUNCTION__, this->trdid, process->pid, barrier_ptr, ref_cxy, cycle );
#endif

    return 0;

}  // end remote_barrier_create()

////////////////////////////////////////////////
// Unregister the barrier identified by <barrier_xp> from the barriers list of the
// reference process (reference of the calling thread's process), then give the
// descriptor memory back to the cluster that contains it.
// @ barrier_xp : extended pointer on the barrier descriptor.
void remote_barrier_destroy( xptr_t barrier_xp )
{
    // split the barrier extended pointer into cluster and local pointer
    remote_barrier_t * bar_ptr = (remote_barrier_t *)GET_PTR( barrier_xp );
    cxy_t              bar_cxy = GET_CXY( barrier_xp );

    // locate the reference process of the calling thread
    xptr_t      owner_xp  = CURRENT_THREAD->process->ref_xp;
    process_t * owner_ptr = (process_t *)GET_PTR( owner_xp );
    cxy_t       owner_cxy = GET_CXY( owner_xp );

    // extended pointer on the lock protecting the reference process barriers list
    xptr_t lock_xp = XPTR( owner_cxy , &owner_ptr->sync_lock );

    // unlink the barrier from the list while holding that lock
    remote_busylock_acquire( lock_xp );
    xlist_unlink( XPTR( bar_cxy , &bar_ptr->list ) );
    remote_busylock_release( lock_xp );

    // descriptor located in another cluster : memory is released through an RPC
    if( bar_cxy != local_cxy )
    {
        rpc_kcm_free_client( bar_cxy , bar_ptr , KMEM_BARRIER );
        return;
    }

    // descriptor located in the local cluster : direct release
    kmem_req_t  request;
    request.type = KMEM_BARRIER;
    request.ptr  = bar_ptr;
    kmem_free( &request );

}  // end remote_barrier_destroy()

/////////////////////////////////////////////
// Called by each thread reaching the barrier identified by <barrier_xp>.
// Under the barrier busylock, the calling thread increments the <current> field.
// - The thread observing (nb_threads - 1) previous arrivals resets <current>,
//   toggles <sense>, unlinks and unblocks every thread registered in the barrier
//   waiting queue, and returns.
// - Any other thread registers itself in the waiting queue, blocks itself on
//   THREAD_BLOCKED_USERSYNC, releases the lock, and deschedules.
// @ barrier_xp : extended pointer on the barrier descriptor.
void remote_barrier_wait( xptr_t barrier_xp )
{
    // the calling thread may have to deschedule : check it is allowed to yield
    thread_t * this = CURRENT_THREAD;
    thread_assert_can_yield( this , __FUNCTION__ );

    // get cluster and local pointer on remote barrier
    remote_barrier_t * barrier_ptr = GET_PTR( barrier_xp );
    cxy_t              barrier_cxy = GET_CXY( barrier_xp );

#if DEBUG_BARRIER
uint32_t cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x enter / barrier %x in cluster %x / cycle %d\n",
__FUNCTION__, this->trdid, this->process->pid, barrier_ptr, barrier_cxy, cycle );
#endif

    // build extended pointers on the barrier fields accessed below
    xptr_t lock_xp       = XPTR( barrier_cxy , &barrier_ptr->lock );
    xptr_t root_xp       = XPTR( barrier_cxy , &barrier_ptr->root );
    xptr_t current_xp    = XPTR( barrier_cxy , &barrier_ptr->current );
    xptr_t sense_xp      = XPTR( barrier_cxy , &barrier_ptr->sense );
    xptr_t nb_threads_xp = XPTR( barrier_cxy , &barrier_ptr->nb_threads );

    // all accesses to the barrier state below are done under the barrier busylock
    remote_busylock_acquire( lock_xp );

#if (DEBUG_BARRIER & 1)
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x get lock / cycle %d\n",
__FUNCTION__, this->trdid, this->process->pid, cycle );
#endif

    // read current sense and number of expected threads
    uint32_t sense      = hal_remote_l32( sense_xp );
    uint32_t nb_threads = hal_remote_l32( nb_threads_xp );

    // value written to <sense> by the last arriving thread (toggled sense)
    uint32_t next_sense = ( sense == 0 ) ? 1 : 0;

#if (DEBUG_BARRIER & 1)
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x / count %d / sense %d / cycle %d\n",
__FUNCTION__, this->trdid, this->process->pid, nb_threads, sense, cycle );
#endif

    // increment <current>, <arrived> is the value before increment
    uint32_t arrived = hal_remote_atomic_add( current_xp , 1 );

    if( arrived != (nb_threads-1) )                  // not the last thread
    {

#if (DEBUG_BARRIER & 1)
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x / blocked / cycle %d\n",
__FUNCTION__, this->trdid, this->process->pid, cycle );
#endif

        // enqueue the calling thread at the tail of the barrier waiting queue
        xlist_add_last( root_xp , XPTR( local_cxy , &this->wait_list ) );

        // mark the calling thread blocked before releasing the lock,
        // then release the lock and only then deschedule
        thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_USERSYNC );
        remote_busylock_release( lock_xp );
        sched_yield("blocked on barrier");
    }
    else                                             // last thread
    {
        // reset arrival counter and toggle sense
        hal_remote_s32( current_xp , 0 );
        hal_remote_s32( sense_xp , next_sense );

        // drain the waiting queue : unlink and unblock each registered thread
        while( !xlist_is_empty( root_xp ) )
        {
            // get pointers on first waiting thread
            xptr_t     waiter_xp  = XLIST_FIRST( root_xp , thread_t , wait_list );
            cxy_t      waiter_cxy = GET_CXY( waiter_xp );
            thread_t * waiter_ptr = GET_PTR( waiter_xp );

#if (DEBUG_BARRIER & 1)
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x / unblock thread %x / cycle %d\n",
__FUNCTION__, this->trdid, this->process->pid, waiter_ptr, cycle );
#endif

            xlist_unlink( XPTR( waiter_cxy , &waiter_ptr->wait_list ) );
            thread_unblock( waiter_xp , THREAD_BLOCKED_USERSYNC );
        }

        // release busylock protecting the remote_barrier
        remote_busylock_release( lock_xp );
    }

#if DEBUG_BARRIER
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER )
printk("\n[DBG] %s : thread %x in process %x exit / barrier %x in cluster %x / cycle %d\n",
__FUNCTION__, this->trdid, this->process->pid, barrier_ptr, barrier_cxy, cycle );
#endif

}  // end remote_barrier_wait()
