/*
 * malloc.h - User space memory allocator implementation.
 * 
 * Author     Alain Greiner (2016,2017)
 *
 * Copyright (c) UPMC Sorbonne Universites
 *
 * This file is part of ALMOS-MKH.
 *
 * ALMOS-MKH is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2.0 of the License.
 *
 * ALMOS-MKH is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ALMOS-MKH; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdlib.h>
#include <stdio.h>
#include <malloc.h>
#include <pthread.h>

#define  MALLOC_DEBUG  0
 
/////////////////////////////////////////////////////////////////////////////////////////
// Global variable defining the allocator array (one per cluster)
// This array (about 16 Kbytes ) will be stored in the data segment
// of any application linked with this malloc libray.
/////////////////////////////////////////////////////////////////////////////////////////

malloc_store_t   store[MALLOC_MAX_CLUSTERS];

// Macro returning the smallest power of 2 larger or equal to size value

#define GET_SIZE_INDEX(size)                (size <= 0x00000001) ? 0  :\
                                            (size <= 0x00000002) ? 1  :\
                                            (size <= 0x00000004) ? 2  :\
                                            (size <= 0x00000008) ? 3  :\
                                            (size <= 0x00000010) ? 4  :\
                                            (size <= 0x00000020) ? 5  :\
                                            (size <= 0x00000040) ? 6  :\
                                            (size <= 0x00000080) ? 7  :\
                                            (size <= 0x00000100) ? 8  :\
                                            (size <= 0x00000200) ? 9  :\
                                            (size <= 0x00000400) ? 10 :\
                                            (size <= 0x00000800) ? 11 :\
                                            (size <= 0x00001000) ? 12 :\
                                            (size <= 0x00002000) ? 13 :\
                                            (size <= 0x00004000) ? 14 :\
                                            (size <= 0x00008000) ? 15 :\
                                            (size <= 0x00010000) ? 16 :\
                                            (size <= 0x00020000) ? 17 :\
                                            (size <= 0x00040000) ? 18 :\
                                            (size <= 0x00080000) ? 19 :\
                                            (size <= 0x00100000) ? 20 :\
                                            (size <= 0x00200000) ? 21 :\
                                            (size <= 0x00400000) ? 22 :\
                                            (size <= 0x00800000) ? 23 :\
                                            (size <= 0x01000000) ? 24 :\
                                            (size <= 0x02000000) ? 25 :\
                                            (size <= 0x04000000) ? 26 :\
                                            (size <= 0x08000000) ? 27 :\
                                            (size <= 0x10000000) ? 28 :\
                                            (size <= 0x20000000) ? 29 :\
                                            (size <= 0x40000000) ? 30 :\
                                            (size <= 0x80000000) ? 31 :\
                                                                   32

////////////////////////////////////////////////////////////////////////////////////////////
// This static function display the current state of the allocator in cluster <cxy>.
////////////////////////////////////////////////////////////////////////////////////////////
static void display_free_array( unsigned int cxy )
{
    unsigned int next;
    unsigned int id;
    unsigned int iter;

    printf("\n*****   store[%x] base = %x / size = %x\n", 
    cxy , store[cxy].store_base, store[cxy].store_size );
    for ( id = 0 ; id < 32 ; id++ )
    { 
        next = store[cxy].free[id];
        printf(" - free[%d] = " , id );
        iter = 0;
        while ( next != 0 )
        {
            printf("%x | ", next );
            next = (*(unsigned int*)next);
            iter++;
        }
        printf("0\n");
    }
}  // end display_free_array()


////////////////////////////////////////////////////////////////////i//////////////////////
// This static function initialises the store in the cluster identified by the <cxy>
// arguments. It is called by the malloc() or remote_malloc when a specific store(x,y)
// is accessed for the first time by a remote() or remote_malloc() request.
// It uses the mmap( MAP_REMOTE ) syscall to allocate a new vseg mapped in cluster (cxy).
////////////////////////////////////////////////////////////////////i//////////////////////
// @ cxy        : target cluster identifier (fixed format).
// @ store_size : store size (bytes).
// # return without setting the initialized field in store(cxy) if failure.
////////////////////////////////////////////////////////////////////i//////////////////////
static void store_init( unsigned int cxy,
                        unsigned int store_size )
{
    unsigned int   store_base;       // store base address
    unsigned int   free_index;       // index in free[array]

    unsigned int   alloc_base;       // alloc[] array base 
    unsigned int   alloc_size;       // alloc[] array size
    unsigned int   alloc_index;      // index in alloc[array]

    unsigned int   iter;             // iterator

#if MALLOC_DEBUG
printf("\n[MALLOC] %s : enter for store[%x] / size = %x\n",
__FUNCTION__, cxy, store_size );
#endif

    // get index in free[] array from size
    free_index = GET_SIZE_INDEX( store_size );

    // check store size power of 2
    if( store_size != (1<<free_index) )
    {
        printf("\n[ERROR] in %s : store[%x] size not power of 2 / size = %x\n",
        __FUNCTION__, cxy , store_size );
        return;
    }

    // allocate store in virtual space
    void * vadr = mmap( NULL,                     // MAP_FIXED not supported
                        store_size,
                        PROT_READ | PROT_WRITE,
                        MAP_REMOTE| MAP_SHARED,
                        cxy,                      // fd is cluster identifier
                        0 );                      // offset unused 

    if( vadr == NULL )
    {
        printf("\n[ERROR] in %s : cannot mmap store[%x]\n",
        __FUNCTION__, cxy );
        return;
    }

    store_base = (unsigned int)vadr;

    // check allocated store alignment
    if( store_base % store_size )
    {
        printf("\n[ERROR] in %s : store[%x] not aligned / base = %x / size = %x\n",
        __FUNCTION__, cxy , store_base , store_size );
        return;
    }

#if MALLOC_DEBUG
printf("\n[MALLOC] %s : mmap done for store[%x] / base = %x\n",
__FUNCTION__, cxy, store_base );
#endif

    // compute size of block containing alloc[] array 
    alloc_size = store_size / MALLOC_MIN_BLOCK_SIZE;
    if ( alloc_size < MALLOC_MIN_BLOCK_SIZE) alloc_size = MALLOC_MIN_BLOCK_SIZE;

    // get index for the corresponding block
    alloc_index = GET_SIZE_INDEX( alloc_size );

    // compute alloc[] array base address
    alloc_base = store_base + store_size - alloc_size;

    // reset the free[] array 
    for ( iter = 0 ; iter < 32 ; iter++ )
    {
        store[cxy].free[iter] = 0;
    }

    // DEPRECATED: we don't reset the alloc_size array 
    // because we don't want to allocate the physical memory
    // when the heap is created  [AG]
    // memset( (void *)alloc_base , 0 , alloc_size ); 
 
    // split the store into various sizes blocks,
    // initializes the free[] array and NEXT pointers
    // base is the block base address
    unsigned int   base = store_base;
    unsigned int * ptr;
    for ( iter = free_index-1 ; iter >= alloc_index ; iter-- )
    {
        store[cxy].free[iter] = base;
        ptr = (unsigned int*)base;
        *ptr = 0;
        base = base + (1<<iter);
    }

    // initialize store mutex
    if( pthread_mutex_init( &store[cxy].mutex , NULL ) )
    {
        printf("\n[ERROR] in %s : cannot initialize mutex for store[%x]\n", 
        __FUNCTION__, cxy );
        return;
    }

    store[cxy].cxy         = cxy;
    store[cxy].store_base  = store_base;
    store[cxy].store_size  = store_size;
    store[cxy].alloc_size  = alloc_size;
    store[cxy].alloc_base  = alloc_base;
    store[cxy].initialized = MALLOC_INITIALIZED;


#if MALLOC_DEBUG
printf("\n[MALLOC] %s : completes store[%x] initialisation\n",
__FUNCTION__, cxy );

display_free_array( cxy );
#endif

}  // end store_init()

////////////////////////////////////////////////////////
static unsigned int split_block( malloc_store_t * store,
                                 unsigned int     vaddr, 
                                 unsigned int     searched_index,
                                 unsigned int     requested_index )
{
    // push the upper half block into free[searched_index-1]
    unsigned int* new            = (unsigned int*)(vaddr + (1<<(searched_index-1)));
    *new                         = store->free[searched_index-1]; 
    store->free[searched_index-1] = (unsigned int)new;
        
    if ( searched_index == requested_index + 1 )  // terminal case: return lower half block 
    {
        return vaddr;
    }
    else            // non terminal case : lower half block must be split again
    {                               
        return split_block( store, vaddr, searched_index-1, requested_index );
    }
} // end split_block()

//////////////////////////////////////////////////////
static unsigned int get_block( malloc_store_t * store,
                               unsigned int     searched_index,
                               unsigned int     requested_index )
{
    // test terminal case
    if ( (1<<searched_index) > store->store_size )  // failure : return a NULL value
    {
        return 0;
    }
    else                            // search a block in free[searched_index]
    {
        unsigned int vaddr = store->free[searched_index];
        if ( vaddr == 0 )     // block not found : search in free[searched_index+1]
        {
            return get_block( store, searched_index+1, requested_index );
        }
        else                // block found : pop it from free[searched_index] 
        {
            // pop the block from free[searched_index]
            unsigned int next = *((unsigned int*)vaddr); 
            store->free[searched_index] = next;
            
            // test if the block must be split
            if ( searched_index == requested_index )  // no split required
            {
                return vaddr;
            }
            else                                      // split is required
            {
                return split_block( store, vaddr, searched_index, requested_index );
            }
        } 
    }
} // end get_block()

////////////////////////////////////////
void * remote_malloc( unsigned int size,
                      unsigned int cxy )
{

#if MALLOC_DEBUG
printf("\n[MALLOC] %s : enter for size = %x / cxy = %x\n",
__FUNCTION__ , size , cxy );
#endif

    // check arguments
    if( size == 0 )
    {
        printf("\n[ERROR] in %s : requested size = 0 \n",
        __FUNCTION__ );
        return NULL;
    }
    if( cxy >= MALLOC_MAX_CLUSTERS )
    {
        printf("\n[ERROR] in %s : illegal cluster %x\n",
        __FUNCTION__ , cxy );
        return NULL;
    }

    // initializes target store if required
    if( store[cxy].initialized != MALLOC_INITIALIZED )
    {
        store_init( cxy , MALLOC_LOCAL_STORE_SIZE );

        if( store[cxy].initialized != MALLOC_INITIALIZED )
        {
            printf("\n[ERROR] in %s : cannot allocate store in cluster %x\n",
            __FUNCTION__ , cxy );
            return NULL;
        }
    }

    // normalize size
    if ( size < MALLOC_MIN_BLOCK_SIZE ) size = MALLOC_MIN_BLOCK_SIZE;

    // compute requested_index for the free[] array
    unsigned int requested_index = GET_SIZE_INDEX( size );

    // take the lock protecting access to store[cxy]
    pthread_mutex_lock( &store[cxy].mutex );

    // call the recursive function get_block
    unsigned int base = get_block( &store[cxy], 
                                   requested_index, 
                                   requested_index );

    // check block found
    if (base == 0)
    {
        pthread_mutex_unlock( &store[cxy].mutex );
        printf("\n[ERROR] in %s : no more space in cluster %x\n",
        __FUNCTION__ , cxy );
        return NULL;
    }

    // compute pointer in alloc[] array
    unsigned        offset = (base - store[cxy].store_base) / MALLOC_MIN_BLOCK_SIZE;
    unsigned char * ptr    = (unsigned char*)(store[cxy].alloc_base + offset);

    // DEPRECATED : we don't check the alloc[] array,
    // because it has not been initialised, to avoid 
    // physical memory allocation at heap creation [AG]
    // if ( *ptr != 0 )
    // {
    //    pthread_mutex_unlock( &store[cxy].mutex );
    //    printf("\n[PANIC] in %s : allocate an already allocated block...\n",
    //    __FUNCTION__ );
    //    return NULL;
    // }

    // update alloc_array
    *ptr = requested_index;

    // release the lock
    pthread_mutex_unlock( &store[cxy].mutex );
 
#if MALLOC_DEBUG
printf("\n[MALLOC] %s : exit / base = %x / size = %x / from store[%x]\n",
__FUNCTION__, base , size , cxy );
#endif

    return (void*) base;

} // end remote_malloc()


//////////////////////////////////
void * malloc( unsigned int size )
{
    // get cluster identifier
    unsigned int cxy;
    unsigned int lid;
    get_core( &cxy , &lid );

    return remote_malloc( size, cxy );
} 


//////////////////////////////////////////
void * remote_calloc ( unsigned int count,
                       unsigned int size,
                       unsigned int cxy )
{
    void * ptr = remote_malloc( count * size , cxy );
    memset( ptr , 0 , count * size );
    return ptr;
}

///////////////////////////////////
void * calloc ( unsigned int count,
                unsigned int size )
{
    // get calling core cluster identifier
    unsigned int cxy;
    unsigned int lid;
    get_core( &cxy , &lid );

    return remote_calloc( count , size , cxy );
}

//////////////////////////////////
void * remote_realloc( void * ptr,
                       unsigned int size,
                       unsigned int cxy )
{
    // simple allocation when (ptr == NULL)
    if( ptr == NULL )
    {
        return remote_malloc( size , cxy );
    }

    // simple free when (size == 0)
    if( size == 0 )
    {
        remote_free( ptr , cxy );
        return NULL;
    }

    // check cxy and ptr in general case
    if( cxy >= MALLOC_MAX_CLUSTERS )
    {
        printf("\n[ERROR] in %s : illegal cluster index %x\n",
        __FUNCTION__ , cxy );
        return NULL;
    }

    unsigned int base = (unsigned int)ptr;

    if( (base < store[cxy].store_base) || 
        (base >= (store[cxy].store_base + store[cxy].store_size)) )
    {
        printf("\n[ERROR] in %s : illegal pointer = %x\n",
        __FUNCTION__, ptr );
        return NULL;
    }
 
    // compute index in free[] array
    int index = (base - store[cxy].store_base) / MALLOC_MIN_BLOCK_SIZE;

    // compute old size
    char * pchar = (char *) (store[cxy].alloc_base + index);
    int old_size = 1 << ((int) *pchar);

    // allocate a new block
    void * new_ptr = remote_malloc( size , cxy );

    // save old data to new block
    int min_size = (size < old_size) ? size : old_size;
    memcpy( new_ptr, ptr, min_size );

    // release old block
    remote_free( ptr , cxy );

    return new_ptr;
}

///////////////////////////////////
void * realloc ( void        * ptr,
                 unsigned int  size )
{
    // get calling core cluster identifier
    unsigned int cxy;
    unsigned int lid;
    get_core( &cxy , &lid );

    return remote_realloc( ptr , size , cxy );
}



//////////////////////////////////////////////////////
static void update_free_array( malloc_store_t * store,
                               unsigned int     base,
                               unsigned int     size_index )
{
    // This recursive function try to merge the released block 
    // with the companion block if this companion block is free.
    // This companion has the same size, and almost the same address
    // (only one address bit is different)
    // - If the companion is not in free[size_index],
    //   the released block is pushed in free[size_index].
    // - If the companion is found, it is evicted from free[size_index]
    //   and the merged bloc is pushed in the free[size_index+1].


    // compute released block size
    unsigned int size = 1<<size_index;

    // compute companion block and merged block base addresses
    unsigned int companion_base;  
    unsigned int merged_base;  

    if ( (base & size) == 0 )   // the released block is aligned on (2*size)
    {
        companion_base  = base + size;
        merged_base     = base;
    }
    else
    {
        companion_base  = base - size;
        merged_base     = base - size;
    }

    // scan all blocks in free[size_index]
    // the iter & prev variables are actually addresses
    unsigned int  found = 0;
    unsigned int  iter  = store->free[size_index];
    unsigned int  prev  = (unsigned int)&store->free[size_index];
    while ( iter ) 
    {
        if ( iter == companion_base ) 
        {
            found = 1;
            break;
        }
        prev = iter;
        iter = *(unsigned int*)iter;
    }

    if ( found == 0 )  // Companion not found => push in free[size_index]  
    {
        *(unsigned int*)base   = store->free[size_index];
        store->free[size_index] = base;
    }
    else               // Companion found : merge
    {
        // evict the searched block from free[size_index]
        *(unsigned int*)prev = *(unsigned int*)iter;

        // call the update_free() function for free[size_index+1]
        update_free_array( store, merged_base , size_index+1 );
    }
}  // end update_free_array()

////////////////////////////////////
void remote_free( void        * ptr,
                  unsigned int  cxy )
{

#if MALLOC_DEBUG
printf("\n[MALLOC] %s : enter for block = %x / cxy = %x\n",
__FUNCTION__, ptr, cxy );
#endif

    unsigned int base = (unsigned int)ptr;

    // check cxy value
    if( cxy >= MALLOC_MAX_CLUSTERS )
    {
        printf("\n[ERROR] in %s : illegal cluster index %x\n",
        __FUNCTION__ , cxy );
        return;
    }

    // check ptr value
    if( (base < store[cxy].store_base) || 
        (base >= (store[cxy].store_base + store[cxy].store_size)) )
    {
        printf("\n[ERROR] in %s : illegal pointer for released block = %x\n",
        __FUNCTION__, ptr );
        return;
    }
 
    // get the lock protecting store[cxy]
    pthread_mutex_lock( &store[cxy].mutex );

    // compute released block index in alloc[] array
    unsigned index = (base - store[cxy].store_base ) / MALLOC_MIN_BLOCK_SIZE;
 
    // get the released block size_index
    unsigned char* pchar      = (unsigned char*)(store[cxy].alloc_base + index);
    unsigned int   size_index = (unsigned int)*pchar;

    // check block is allocated
    if ( size_index == 0 )
    {
        pthread_mutex_unlock( &store[cxy].mutex );
        printf("\n[ERROR] in %s : released block not allocated / ptr = %x\n",
        __FUNCTION__, ptr );
        return;
    }

    // check released block alignment
    if ( base % (1 << size_index) )
    {
        pthread_mutex_unlock( &store[cxy].mutex );
        printf("\n[ERROR] in %s : released block not aligned / ptr = %x\n",
        __FUNCTION__, ptr );
        return;
    }

    // reset the alloc[index] entry
    *pchar = 0;

    // call the recursive function update_free_array() 
    update_free_array( &store[cxy], base, size_index ); 

    // release the lock
    pthread_mutex_unlock( &store[cxy].mutex );

#if MALLOC_DEBUG
printf("\n[MALLOC] %s : conmpletes for block = %x / cxy = %x\n",
__FUNCTION__, ptr, cxy );
#endif

} // end remote_free()

///////////////////////
void free( void * ptr )
{
    // get calling core cluster identifier
    unsigned int cxy;
    unsigned int lid;
    get_core( &cxy , &lid );

    remote_free( ptr , cxy );
}

    
// Local Variables:
// tab-width: 4
// c-basic-offset: 4
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:
// vim: filetype=c:expandtab:shiftwidth=4:tabstop=4:softtabstop=4



