///////////////////////////////////////////////////////////////////////////////////
// File     : drivers.c
// Date     : 01/04/2012
// Author   : alain greiner
// Copyright (c) UPMC-LIP6
///////////////////////////////////////////////////////////////////////////////////
// The drivers.c and drivers.h files are part of the GIET nano kernel.
// They contain the drivers for the peripherals available in the SoCLib library:
// - vci_multi_tty
// - vci_multi_timer
// - vci_multi_dma
// - vci_multi_icu
// - vci_gcd
// - vci_frame_buffer
// - vci_block_device
//
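// The following global parameters must be defined in the giet_config.h file
// (the build is stopped with an #error if one of them is missing):
// - NB_CLUSTERS  : number of clusters
// - CLUSTER_SPAN : address increment between two consecutive clusters
// - NB_PROCS     : number of PROCS per cluster (if not zero)
// - NB_DMAS      : number of DMA channels per cluster (if not zero)
// - NB_TIMERS    : number of TIMERS per cluster (if not zero)
// - NB_TTYS      : number of TTY terminals per cluster (if not zero)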
//
// The following base addresses must be defined in the sys.ld file:
// - seg_icu_base
// - seg_timer_base
// - seg_tty_base
// - seg_gcd_base
// - seg_dma_base
// - seg_fb_base
// - seg_ioc_base
///////////////////////////////////////////////////////////////////////////////////

#include <sys_handler.h>
#include <giet_config.h>
#include <drivers.h>
#include <common.h>
#include <hwr_mapping.h>
#include <mips32_registers.h>
#include <ctx_handler.h>

/* Compile-time configuration checks.
 * Each parameter tested below is used by the drivers of this file and must
 * be defined before this point (the messages name giet_config.h, which is
 * included above). When one of them is missing, the build is stopped with
 * an explicit #error message instead of failing later on an unknown name. */
#if !defined(NB_PROCS) 
# error: You must define NB_PROCS in 'giet_config.h' file!
#endif
#if !defined(NB_CLUSTERS) 
# error: You must define NB_CLUSTERS in 'giet_config.h' file!
#endif
#if !defined(CLUSTER_SPAN) 
# error: You must define CLUSTER_SPAN in 'giet_config.h' file!
#endif
#if !defined(NB_TTYS)
# error: You must define NB_TTYS in 'giet_config.h' file!
#endif
#if !defined(NB_DMAS)
# error: You must define NB_DMAS in 'giet_config.h' file!
#endif
#if !defined(NB_TIMERS)
# error: You must define NB_TIMERS in 'giet_config.h' file!
#endif

/////////////////////////////////////////////////////////////////////////////
// 	Global (uncachable) variables
//
// All the synchronisation variables below are placed in the ".unckdata"
// section by the in_unckdata attribute, and are declared volatile so that
// every access in the polling loops of this file is a real memory access.
// The [0 ... N-1] initialisers are GNU C range designators.
/////////////////////////////////////////////////////////////////////////////

/* places the variable in the ".unckdata" section */
#define in_unckdata __attribute__((section (".unckdata")))

/* per-channel DMA status: a non-zero value is reported as an error by
 * _fb_completed() */
in_unckdata volatile unsigned int  _dma_status[NB_DMAS];
/* per-channel DMA busy flag: set to 1 by _fb_write() / _fb_read(), polled
 * until zero by _fb_completed(); per the frame buffer section comment it is
 * reset by the ISR (not visible in this file) */
in_unckdata volatile unsigned char _dma_busy[NB_DMAS] = { [0 ... NB_DMAS-1] = 0 };

/* block device status, compared with BLOCK_DEVICE_READ_SUCCESS and
 * BLOCK_DEVICE_WRITE_SUCCESS by _ioc_completed() */
in_unckdata volatile unsigned char _ioc_status;
/* transfer completion flag polled (and reset) by _ioc_completed() */
in_unckdata volatile unsigned char _ioc_done = 0;
/* lock taken by _ioc_get_lock() and released by _ioc_completed() */
in_unckdata volatile unsigned int  _ioc_lock = 0;

/* one lock per TTY, taken by _tty_get_lock() */
in_unckdata volatile unsigned int  _tty_lock[NB_TTYS] = { [0 ... NB_TTYS-1] = 0 };
/* one-character kernel buffer per TTY, consumed by _tty_read_irq() */
in_unckdata volatile unsigned char _tty_get_buf[NB_TTYS];
/* non-zero when _tty_get_buf[i] contains a character not yet consumed */
in_unckdata volatile unsigned char _tty_get_full[NB_TTYS] = { [0 ... NB_TTYS-1] = 0 };

//////////////////////////////////////////////////////////////////////////////
// 	VciMultiTimer driver
//////////////////////////////////////////////////////////////////////////////
// The number of independent timers per cluster is defined by the
// configuration parameter NB_TIMERS.
// The total number of timers is NB_CLUSTERS * NB_TIMERS.
// The global timer index = cluster_id*NB_TIMERS + timer_id
//////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////
// _timer_write()
//
// Write a 32-bit word in a memory mapped register of a timer device.
// - global_timer_index : cluster_id*NB_TIMERS + timer_id
// - register_index     : index of the register, must be < TIMER_SPAN
// - value              : word to be written
// Returns 0 if success, > 0 if error (index out of range).
//////////////////////////////////////////////////////////////////////////////
unsigned int _timer_write( unsigned int global_timer_index,
                           unsigned int register_index,
                           unsigned int value )
{
    volatile unsigned int *timer_address;
    unsigned int          cluster_id;
    unsigned int          timer_id;

    /* parameters checking: done BEFORE any division by NB_TIMERS, so that a
     * configuration without timer (NB_TIMERS == 0) reports an error instead
     * of executing a division by zero */
    if ( register_index >= TIMER_SPAN )                 return 1;
    if ( global_timer_index >= NB_CLUSTERS*NB_TIMERS )  return 1;

    cluster_id = global_timer_index / NB_TIMERS;
    timer_id   = global_timer_index % NB_TIMERS;

    /* the offsets are added to an (unsigned int*): they are counted in
     * 32-bit words, not in bytes.
     * NOTE(review): confirm that CLUSTER_SPAN is expressed in words. */
    timer_address = (volatile unsigned int*)&seg_timer_base +
                    ( cluster_id * CLUSTER_SPAN )  +
                    ( timer_id * TIMER_SPAN );

    timer_address[register_index] = value; /* write word */

    return 0;
}

//////////////////////////////////////////////////////////////////////////////
// _timer_read()
//
// Read a 32-bit word in a memory mapped register of a timer device.
// - global_timer_index : cluster_id*NB_TIMERS + timer_id
// - register_index     : index of the register, must be < TIMER_SPAN
// - buffer             : address where the word is stored
// Returns 0 if success, > 0 if error (index out of range).
// NOTE(review): unlike the IOC and FB functions, the buffer address is not
// checked against the user address space.
//////////////////////////////////////////////////////////////////////////////
unsigned int _timer_read(unsigned int global_timer_index,
                         unsigned int register_index,
                         unsigned int *buffer)
{
    volatile unsigned int *timer_address;
    unsigned int          cluster_id;
    unsigned int          timer_id;

    /* parameters checking: done BEFORE any division by NB_TIMERS, so that a
     * configuration without timer (NB_TIMERS == 0) reports an error instead
     * of executing a division by zero */
    if ( register_index >= TIMER_SPAN )                 return 1;
    if ( global_timer_index >= NB_CLUSTERS*NB_TIMERS )  return 1;

    cluster_id = global_timer_index / NB_TIMERS;
    timer_id   = global_timer_index % NB_TIMERS;

    /* the offsets are added to an (unsigned int*): they are counted in
     * 32-bit words, not in bytes.
     * NOTE(review): confirm that CLUSTER_SPAN is expressed in words. */
    timer_address = (volatile unsigned int*)&seg_timer_base +
                    ( cluster_id * CLUSTER_SPAN )  +
                    ( timer_id * TIMER_SPAN );

    *buffer = timer_address[register_index]; /* read word */

    return 0;
}

/////////////////////////////////////////////////////////////////////////////////
// 	VciMultiTty driver
/////////////////////////////////////////////////////////////////////////////////
// The total number of TTYs is defined by the configuration parameter NB_TTYS.
// The system terminal is TTY[0].
// The TTYs are allocated to applications by the GIET in the boot phase.
// The number of TTYs allocated to each application, and the TTY used by each
// task can be defined in the mapping_info data structure.
// For each user task, the tty_id is stored in the context of the task (slot 34),
// and must be explicitly defined in the boot code.
// The TTY address is always computed as : seg_tty_base + tty_id*TTY_SPAN
///////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////////
// _tty_get_lock()
//
// This blocking function is intended to be used by the _tty_write() function
// to provide exclusive access to the TTY. It is not used yet, because it appears
// that it creates livelock situations...
// - tty_id : index of the lock in the _tty_lock[] array.
// The lock is taken with an atomic LL/SC sequence. When the lock is already
// taken, or when the SC fails, the processor waits during a pseudo-random
// number of iterations before trying again.
///////////////////////////////////////////////////////////////////////////////////
static inline void _tty_get_lock( unsigned int tty_id )
{
    /* back-off duration: 0, 16, 32 ... 240 iterations */
    register unsigned int delay = (_proctime() & 0xF) << 4;
    register unsigned int *plock = (unsigned int*)&_tty_lock[tty_id];

    /* Numeric local labels are used (instead of global symbols) so that the
     * function can be inlined several times in the same compilation unit.
     * The "memory" clobber prevents the compiler from moving memory accesses
     * across the lock acquisition. */
    asm volatile (
            "1:                     \n"
            "ll   $2,    0(%0)      \n" /* $2 <= _tty_lock current value */
            "bnez $2,    2f         \n" /* delay if _tty_lock already taken */
            "li   $3,    1          \n" /* $3 <= argument for sc */
            "sc   $3,    0(%0)      \n" /* try to set _tty_lock */
            "bnez $3,    4f         \n" /* exit if atomic */
            "2:                     \n"
            "move $4,    %1         \n" /* $4 <= delay */
            "3:                     \n"
            "addi $4,    $4,    -1  \n" /* $4 <= $4 - 1 */
            "bgtz $4,    3b         \n" /* loop while $4 > 0 (also safe when delay == 0) */
            "j           1b         \n" /* retry */
            "4:                     \n"
            :
            :"r"(plock), "r"(delay)
            :"$2", "$3", "$4", "memory");
}

//////////////////////////////////////////////////////////////////////////////
// _tty_write()
//
// Copies up to 'length' characters from 'buffer' to the TTY_WRITE register
// of the terminal attached to the calling task (CTX_TTY_ID slot of the
// context of the task currently running on the processor).
// Neither the TTY_PUT_IRQ interrupt nor a kernel buffer is involved.
// The call never blocks: the TTY_STATUS register is tested before each
// character, and the transfer is abandoned as soon as bit 1 of TTY_STATUS
// is set.
// Returns the number of characters actually written.
//////////////////////////////////////////////////////////////////////////////
unsigned int _tty_write(const char *buffer, unsigned int length)
{
    /* find the terminal owned by the running task */
    unsigned int cpu     = _procid();
    unsigned int running = _scheduler[cpu].current;
    unsigned int term    = _scheduler[cpu].context[running][CTX_TTY_ID];

    /* registers of the selected terminal (word addressing) */
    volatile unsigned int *tty_regs = (unsigned int*)&seg_tty_base + term*TTY_SPAN;

    unsigned int sent = 0;

    while (sent < length)
    {
        /* give up as soon as the terminal cannot take one more character */
        if ((tty_regs[TTY_STATUS] & 0x2) != 0)
            break;

        tty_regs[TTY_WRITE] = (unsigned int)buffer[sent];
        sent++;
    }
    return sent;
}

//////////////////////////////////////////////////////////////////////////////
// _tty_read_irq()
//
// Non-blocking read of one character through the kernel buffer associated
// with the TTY_GET_IRQ[tty_id] interrupt (buffer filled by the ISR).
// When _tty_get_full[tty_id] is set, the character stored in
// _tty_get_buf[tty_id] is copied to the user buffer and the flag is cleared.
// The 'length' argument is not used.
// Returns 0 if the kernel buffer is empty, 1 if one character was fetched.
//////////////////////////////////////////////////////////////////////////////
unsigned int _tty_read_irq(char *buffer, unsigned int length)
{
    /* find the terminal owned by the running task */
    unsigned int cpu     = _procid();
    unsigned int running = _scheduler[cpu].current;
    unsigned int term    = _scheduler[cpu].context[running][CTX_TTY_ID];

    /* nothing available in the kernel buffer */
    if (_tty_get_full[term] == 0)
        return 0;

    /* consume the character, then mark the kernel buffer as empty */
    *buffer = _tty_get_buf[term];
    _tty_get_full[term] = 0;
    return 1;
}

////////////////////////////////////////////////////////////////////////////////
// _tty_read()
//
// Non-blocking read of one character directly from the TTY_READ register of
// the terminal attached to the calling task. The character is stored in the
// user buffer only when bit 0 of TTY_STATUS is set.
// Neither the TTY_GET_IRQ interrupt, nor the kernel buffer, nor the TTY lock
// is used. The 'length' argument is not used.
// Returns 0 if the register is empty, 1 if one character was fetched.
////////////////////////////////////////////////////////////////////////////////
unsigned int _tty_read(char *buffer, unsigned int length)
{
    /* find the terminal owned by the running task */
    unsigned int cpu     = _procid();
    unsigned int running = _scheduler[cpu].current;
    unsigned int term    = _scheduler[cpu].context[running][CTX_TTY_ID];

    /* registers of the selected terminal (word addressing) */
    volatile unsigned int *tty_regs = (unsigned int*)&seg_tty_base + term*TTY_SPAN;

    unsigned int fetched;

    if ((tty_regs[TTY_STATUS] & 0x1) == 0)
    {
        /* no character pending */
        fetched = 0;
    }
    else
    {
        *buffer = (char)tty_regs[TTY_READ];
        fetched = 1;
    }
    return fetched;
}

////////////////////////////////////////////////////////////////////////////////
// 	VciMultiIcu driver
////////////////////////////////////////////////////////////////////////////////
// There is in principle one MULTI-ICU component per cluster, and the
// number of independent ICUs is equal to NB_PROCS, because there is
// one ICU per processor.
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// _icu_write()
//
// Stores a 32-bit word in one memory mapped register of the ICU attached to
// the calling processor: the register bank is selected by the proc_id.
// - register_index : index of the register, must be < ICU_END
// - value          : word to be written
// Returns 0 if success, > 0 if error.
////////////////////////////////////////////////////////////////////////////////
unsigned int _icu_write(unsigned int register_index, unsigned int value)
{
    unsigned int result = 1;    /* error unless the index is accepted */

    if (register_index < ICU_END)
    {
        /* one bank of ICU_SPAN words per processor */
        unsigned int cpu = _procid();
        volatile unsigned int *icu_regs =
            (unsigned int*)&seg_icu_base + (cpu * ICU_SPAN);

        icu_regs[register_index] = value;
        result = 0;
    }
    return result;
}

////////////////////////////////////////////////////////////////////////////////
// _icu_read()
//
// Loads a 32-bit word from one memory mapped register of the ICU attached to
// the calling processor: the register bank is selected by the proc_id.
// - register_index : index of the register, must be < ICU_END
// - buffer         : address where the word is stored
// Returns 0 if success, > 0 if error.
////////////////////////////////////////////////////////////////////////////////
unsigned int _icu_read(unsigned int register_index, unsigned int *buffer)
{
    unsigned int result = 1;    /* error unless the index is accepted */

    if (register_index < ICU_END)
    {
        /* one bank of ICU_SPAN words per processor */
        unsigned int cpu = _procid();
        volatile unsigned int *icu_regs =
            (unsigned int*)&seg_icu_base + (cpu * ICU_SPAN);

        *buffer = icu_regs[register_index];
        result = 0;
    }
    return result;
}

////////////////////////////////////////////////////////////////////////////////
// 	VciGcd driver
////////////////////////////////////////////////////////////////////////////////
// The Greatest Common Divisor (GCD) component is a -very- simple hardware
// coprocessor computing the GCD of two 32-bit integers.
// It has no DMA capability.
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// _gcd_write()
//
// Stores a 32-bit word in one memory mapped register of the GCD coprocessor.
// - register_index : index of the register, must be < GCD_END
// - value          : word to be written
// Returns 0 if success, > 0 if error.
////////////////////////////////////////////////////////////////////////////////
unsigned int _gcd_write(unsigned int register_index, unsigned int value)
{
    unsigned int result = 1;    /* error unless the index is accepted */

    if (register_index < GCD_END)
    {
        volatile unsigned int *gcd_regs = (unsigned int*)&seg_gcd_base;

        gcd_regs[register_index] = value;
        result = 0;
    }
    return result;
}

////////////////////////////////////////////////////////////////////////////////
// _gcd_read()
//
// Loads a 32-bit word from one memory mapped register of the GCD coprocessor.
// - register_index : index of the register, must be < GCD_END
// - buffer         : address where the word is stored
// Returns 0 if success, > 0 if error.
////////////////////////////////////////////////////////////////////////////////
unsigned int _gcd_read(unsigned int register_index, unsigned int *buffer)
{
    unsigned int result = 1;    /* error unless the index is accepted */

    if (register_index < GCD_END)
    {
        volatile unsigned int *gcd_regs = (unsigned int*)&seg_gcd_base;

        *buffer = gcd_regs[register_index];
        result = 0;
    }
    return result;
}

////////////////////////////////////////////////////////////////////////////////
// VciBlockDevice driver
////////////////////////////////////////////////////////////////////////////////
// The VciBlockDevice is a simple external storage controller.
// The three functions below use the three variables _ioc_lock, _ioc_done, and
// _ioc_status for synchronisation.
// As the IOC component can be used by several programs running in parallel,
// the _ioc_lock variable guarantees exclusive access to the device. The
// _ioc_read() and _ioc_write() functions use atomic LL/SC to get the lock
// and set _ioc_lock to a non-zero value. The _ioc_write() and _ioc_read()
// functions are blocking, polling the _ioc_lock variable until the device is
// available.
// When the transfer is completed, the ISR routine activated by the IOC IRQ
// sets the _ioc_done variable to a non-zero value. Possible address errors
// detected by the IOC peripheral are reported by the ISR in the _ioc_status
// variable.
// The _ioc_completed() function is polling the _ioc_done variable, waiting for
// transfer completion. When the completion is signaled, the _ioc_completed()
// function resets the _ioc_done variable to zero, and releases the _ioc_lock
// variable.
//
// In a multi-processing environment, this polling policy should be replaced by
// a descheduling policy for the requesting process.
///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// _ioc_get_lock()
//
// This blocking helper is used by '_ioc_read()' and '_ioc_write()' functions
// to get _ioc_lock using atomic LL/SC.
// When the lock is already taken, or when the SC fails, the processor waits
// during a pseudo-random number of iterations before trying again.
///////////////////////////////////////////////////////////////////////////////
static inline void _ioc_get_lock()
{
    /* back-off duration: 0, 16, 32 ... 240 iterations */
    register unsigned int delay = (_proctime() & 0xF) << 4;
    register unsigned int *plock = (unsigned int*)&_ioc_lock;

    /* Numeric local labels are used (instead of global symbols) because this
     * helper is called from two functions of this file: each inlined copy
     * would otherwise define the same symbols again.
     * The "memory" clobber prevents the compiler from moving memory accesses
     * (such as the device configuration) across the lock acquisition. */
    asm volatile (
            "1:                     \n"
            "ll   $2,    0(%0)      \n" /* $2 <= _ioc_lock current value */
            "bnez $2,    2f         \n" /* delay if _ioc_lock already taken */
            "li   $3,    1          \n" /* $3 <= argument for sc */
            "sc   $3,    0(%0)      \n" /* try to set _ioc_lock */
            "bnez $3,    4f         \n" /* exit if atomic */
            "2:                     \n"
            "move $4,    %1         \n" /* $4 <= delay */
            "3:                     \n"
            "addi $4,    $4,    -1  \n" /* $4 <= $4 - 1 */
            "bgtz $4,    3b         \n" /* loop while $4 > 0 (also safe when delay == 0) */
            "j           1b         \n" /* retry */
            "4:                     \n"
            :
            :"r"(plock), "r"(delay)
            :"$2", "$3", "$4", "memory");
}

///////////////////////////////////////////////////////////////////////////////
//  _ioc_write()
//
// Transfer data from a memory buffer to a file on the block_device.
// The source memory buffer must be in user address space (below 0x80000000).
// - lba    : first block index on the disk.
// - buffer : base address of the memory buffer.
// - count  : number of blocks to be transfered.
// The function takes _ioc_lock and starts the transfer: the completion must
// be tested with _ioc_completed(), which releases the lock.
// Returns 0 if success, > 0 if error (buffer not in user space).
///////////////////////////////////////////////////////////////////////////////
unsigned int _ioc_write( unsigned int   lba,
                         const void*    buffer,
                         unsigned int   count)
{
    volatile unsigned int *ioc_address = (volatile unsigned int*)&seg_ioc_base;

    /* the block size is given by the device itself */
    unsigned int block_size = ioc_address[BLOCK_DEVICE_BLOCK_SIZE];
    unsigned int base       = (unsigned int)buffer;
    unsigned int max_bytes;

    /* buffer must be in user space */
    if (base >= 0x80000000)
        return 1;

    /* The transfer is accepted when base + block_size*count < 0x80000000.
     * The test is written with a division so that neither the product nor
     * the sum can wrap around 2**32 and defeat the check. */
    max_bytes = 0x7FFFFFFF - base;
    if ((count != 0) && (block_size > (max_bytes / count)))
        return 1;

    /* get the lock on ioc device */
    _ioc_get_lock();

    /* block_device configuration for the write transfer:
     * the operation is started by the write into BLOCK_DEVICE_OP */
    ioc_address[BLOCK_DEVICE_BUFFER] = base;
    ioc_address[BLOCK_DEVICE_COUNT] = count;
    ioc_address[BLOCK_DEVICE_LBA] = lba;
    ioc_address[BLOCK_DEVICE_IRQ_ENABLE] = 1;
    ioc_address[BLOCK_DEVICE_OP] = BLOCK_DEVICE_WRITE;

    return 0;
}

///////////////////////////////////////////////////////////////////////////////
// _ioc_read()
//
// Transfer data from a file on the block device to a memory buffer.
// The destination memory buffer must be in user address space (below
// 0x80000000).
// - lba    : first block index on the disk.
// - buffer : base address of the memory buffer.
// - count  : number of blocks to be transfered.
// All cache lines corresponding to the target buffer are invalidated
// for cache coherence.
// The function takes _ioc_lock and starts the transfer: the completion must
// be tested with _ioc_completed(), which releases the lock.
// Returns 0 if success, > 0 if error (buffer not in user space).
///////////////////////////////////////////////////////////////////////////////
unsigned int _ioc_read( unsigned int    lba,
                        void*           buffer,
                        unsigned int    count )
{
    volatile unsigned int *ioc_address = (volatile unsigned int*)&seg_ioc_base;

    /* the block size is given by the device itself */
    unsigned int block_size = ioc_address[BLOCK_DEVICE_BLOCK_SIZE];
    unsigned int base       = (unsigned int)buffer;
    unsigned int max_bytes;

    /* buffer must be in user space */
    if (base >= 0x80000000)
        return 1;

    /* The transfer is accepted when base + block_size*count < 0x80000000.
     * The test is written with a division so that neither the product nor
     * the sum can wrap around 2**32 and defeat the check. */
    max_bytes = 0x7FFFFFFF - base;
    if ((count != 0) && (block_size > (max_bytes / count)))
        return 1;

    /* get the lock on ioc device */
    _ioc_get_lock();

    /* block_device configuration for the read transfer:
     * the operation is started by the write into BLOCK_DEVICE_OP */
    ioc_address[BLOCK_DEVICE_BUFFER] = base;
    ioc_address[BLOCK_DEVICE_COUNT] = count;
    ioc_address[BLOCK_DEVICE_LBA] = lba;
    ioc_address[BLOCK_DEVICE_IRQ_ENABLE] = 1;
    ioc_address[BLOCK_DEVICE_OP] = BLOCK_DEVICE_READ;

    /* invalidation of data cache (the product cannot overflow here, thanks
     * to the range check above) */
    _dcache_buf_invalidate(buffer, block_size*count);

    return 0;
}

/////////////////////////////////////////////////////////////////////////////////
// _ioc_completed()
//
// This function checks completion of an I/O transfer and reports errors.
// As it is a blocking call, the processor is stalled until the next interrupt:
// it polls the _ioc_done variable, then reads _ioc_status, resets _ioc_done
// and finally releases _ioc_lock.
// Returns 0 if success, > 0 if error.
/////////////////////////////////////////////////////////////////////////////////
unsigned int _ioc_completed()
{
    unsigned int ret;

    /* busy waiting */
    while (_ioc_done == 0)
        asm volatile("nop");

    /* test IOC status (must be sampled while the lock is still owned) */
    if ((_ioc_status != BLOCK_DEVICE_READ_SUCCESS)
            && (_ioc_status != BLOCK_DEVICE_WRITE_SUCCESS))
        ret = 1;    /* error */
    else
        ret = 0;    /* success */

    /* reset synchronization variables.
     * _ioc_done MUST be cleared before the lock is released: once _ioc_lock
     * is zero another processor can start a new transfer, and it must neither
     * observe the stale completion flag of this transfer, nor have its own
     * completion flag erased by this function. Both variables are volatile,
     * so the compiler keeps the two stores in this order. */
    _ioc_done = 0;
    _ioc_lock = 0;

    return ret;
}

//////////////////////////////////////////////////////////////////////////////////
// 	VciFrameBuffer driver
//////////////////////////////////////////////////////////////////////////////////
// The '_fb_sync_write' and '_fb_sync_read' functions use a memcpy strategy to
// implement the transfer between a data buffer (user space) and the frame
// buffer (kernel space). They are blocking until completion of the transfer.
// The '_fb_write()', '_fb_read()' and '_fb_completed()' functions use the DMA
// coprocessor to transfer data between the user buffer and the frame buffer.
// These functions use a polling policy to test the global variables _dma_busy[i]
// and detect the transfer completion.
// There are NB_PROCS DMA channels, that are indexed by the proc_id.
// The _dma_busy[i] synchronisation variables (one per channel) are set by the OS,
// and reset by the ISR.
//////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////
// _fb_sync_write()
// Transfer data from a memory buffer to the frame_buffer device using
// a memcpy. The source memory buffer must be in user address space (below
// 0x80000000).
// - offset : offset (in bytes) in the frame buffer.
// - buffer : base address of the memory buffer.
// - length : number of bytes to be transfered.
// Returns 0 if success, > 0 if error (buffer not in user space).
// NOTE(review): 'offset' and 'length' are not checked against the size of
// the frame buffer, which is not known in this file.
//////////////////////////////////////////////////////////////////////////////////
unsigned int _fb_sync_write( unsigned int   offset,
                             const void*    buffer,
                             unsigned int   length )
{
    volatile unsigned char *fb_address;
    unsigned int           base = (unsigned int)buffer;

    /* buffer must be in user space: the transfer is accepted when
     * base + length < 0x80000000. The test is written as a subtraction so
     * that the sum cannot wrap around 2**32 and defeat the check. */
    if ((base >= 0x80000000) || (length > (0x7FFFFFFF - base)))
        return 1;

    fb_address = (unsigned char*)&seg_fb_base + offset;

    /* buffer copy */
    memcpy((void*)fb_address, (void*)buffer, length);

    return 0;
}

//////////////////////////////////////////////////////////////////////////////////
// _fb_sync_read()
// Transfer data from the frame_buffer device to a memory buffer using
// a memcpy. The destination memory buffer must be in user address space
// (below 0x80000000).
// - offset : offset (in bytes) in the frame buffer.
// - buffer : base address of the memory buffer (it is written by this
//            function, although the prototype declares it const).
// - length : number of bytes to be transfered.
// Returns 0 if success, > 0 if error (buffer not in user space).
// NOTE(review): 'offset' and 'length' are not checked against the size of
// the frame buffer, which is not known in this file.
//////////////////////////////////////////////////////////////////////////////////
unsigned int _fb_sync_read( unsigned int    offset,
                            const void*     buffer,
                            unsigned int    length )
{
    volatile unsigned char *fb_address;
    unsigned int           base = (unsigned int)buffer;

    /* buffer must be in user space: the transfer is accepted when
     * base + length < 0x80000000. The test is written as a subtraction so
     * that the sum cannot wrap around 2**32 and defeat the check. */
    if ((base >= 0x80000000) || (length > (0x7FFFFFFF - base)))
        return 1;

    fb_address = (unsigned char*)&seg_fb_base + offset;

    /* buffer copy */
    memcpy((void*)buffer, (void*)fb_address, length);

    return 0;
}

//////////////////////////////////////////////////////////////////////////////////
// _fb_write()
// Transfer data from a memory buffer to the frame_buffer device using a DMA.
// The source memory buffer must be in user address space (below 0x80000000).
// - offset : offset (in bytes) in the frame buffer.
// - buffer : base address of the memory buffer.
// - length : number of bytes to be transfered.
// The DMA channel is selected by the proc_id. The function waits until the
// channel is free, marks it busy and starts the transfer: the completion
// must be tested with _fb_completed().
// Returns 0 if success, > 0 if error (buffer not in user space).
// NOTE(review): _dma_busy[] contains NB_DMAS entries and is indexed by the
// proc_id without any check - confirm that proc_id < NB_DMAS always holds.
//////////////////////////////////////////////////////////////////////////////////
unsigned int _fb_write( unsigned int    offset,
                        const void*     buffer,
                        unsigned int    length )
{
    volatile unsigned char *fb_address;
    volatile unsigned int  *dma;

    unsigned int base = (unsigned int)buffer;
    unsigned int proc_id;
    unsigned int delay;
    unsigned int i;

    /* buffer must be in user space: the transfer is accepted when
     * base + length < 0x80000000. The test is written as a subtraction so
     * that the sum cannot wrap around 2**32 and defeat the check. */
    if ((base >= 0x80000000) || (length > (0x7FFFFFFF - base)))
        return 1;

    proc_id = _procid();
    fb_address = (unsigned char*)&seg_fb_base + offset;
    dma = (unsigned int*)&seg_dma_base + (proc_id * DMA_SPAN);

    /* waiting until DMA device is available */
    while (_dma_busy[proc_id] != 0)
    {
        /* if the lock failed, busy wait with a pseudo random delay between bus
         * accesses */
        delay = (_proctime() & 0xF) << 4;
        for (i = 0; i < delay; i++)
            asm volatile("nop");
    }
    _dma_busy[proc_id] = 1;

    /* DMA configuration for write transfer (user buffer -> frame buffer):
     * the length register is written last */
    dma[DMA_IRQ_DISABLE] = 0;
    dma[DMA_SRC] = base;
    dma[DMA_DST] = (unsigned int)fb_address;
    dma[DMA_LEN] = (unsigned int)length;
    return 0;
}

//////////////////////////////////////////////////////////////////////////////////
// _fb_read()
// Transfer data from the frame_buffer device to a memory buffer using a DMA.
// The destination memory buffer must be in user address space (below
// 0x80000000).
// - offset : offset (in bytes) in the frame buffer.
// - buffer : base address of the memory buffer (it is written by the DMA,
//            although the prototype declares it const).
// - length : number of bytes to be transfered.
// All cache lines corresponding to the target buffer are invalidated
// for cache coherence.
// The DMA channel is selected by the proc_id. The function waits until the
// channel is free, marks it busy and starts the transfer: the completion
// must be tested with _fb_completed().
// Returns 0 if success, > 0 if error (buffer not in user space).
// NOTE(review): _dma_busy[] contains NB_DMAS entries and is indexed by the
// proc_id without any check - confirm that proc_id < NB_DMAS always holds.
//////////////////////////////////////////////////////////////////////////////////
unsigned int _fb_read( unsigned int     offset,
                       const void*      buffer,
                       unsigned int     length )
{
    volatile unsigned char *fb_address;
    volatile unsigned int  *dma;

    unsigned int base = (unsigned int)buffer;
    unsigned int proc_id;
    unsigned int delay;
    unsigned int i;

    /* buffer must be in user space: the transfer is accepted when
     * base + length < 0x80000000. The test is written as a subtraction so
     * that the sum cannot wrap around 2**32 and defeat the check. */
    if ((base >= 0x80000000) || (length > (0x7FFFFFFF - base)))
        return 1;

    proc_id = _procid();
    fb_address = (unsigned char*)&seg_fb_base + offset;
    dma = (unsigned int*)&seg_dma_base + (proc_id * DMA_SPAN);

    /* waiting until DMA device is available */
    while (_dma_busy[proc_id] != 0)
    {
        /* if the lock failed, busy wait with a pseudo random delay between bus
         * accesses */
        delay = (_proctime() & 0xF) << 4;
        for (i = 0; i < delay; i++)
            asm volatile("nop");
    }
    _dma_busy[proc_id] = 1;

    /* DMA configuration for read transfer (frame buffer -> user buffer):
     * the length register is written last */
    dma[DMA_IRQ_DISABLE] = 0;
    dma[DMA_SRC] = (unsigned int)fb_address;
    dma[DMA_DST] = base;
    dma[DMA_LEN] = (unsigned int)length;

    /* invalidation of data cache */
    _dcache_buf_invalidate(buffer, length);

    return 0;
}

//////////////////////////////////////////////////////////////////////////////////
// _fb_completed()
// Waits for the end of the DMA transfer to or from the frame buffer that uses
// the channel of the calling processor (channel index = proc_id).
// This is a blocking call: the processor polls _dma_busy[proc_id] until it is
// reset, then tests _dma_status[proc_id].
// Returns 0 if success, > 0 if error.
//////////////////////////////////////////////////////////////////////////////////
unsigned int _fb_completed()
{
    unsigned int channel = _procid();

    /* spin until the busy flag of the channel is cleared */
    while (_dma_busy[channel] != 0)
        asm volatile("nop");

    /* any non-zero status is reported as an error */
    return (_dma_status[channel] != 0) ? 1 : 0;
}

