////////////////////////////////////////////////////////////////////////////////////////
// File   : vld.c  
// Date   : octobre 2015
// author : Alain Greiner
////////////////////////////////////////////////////////////////////////////////////////
// This file defines the code of the VLD (Variable Length Decoder) thread for the MJPEG
// application. This thread analyses the variable length bit stream produced by
// the Huffman entropy coder.
// For each image:
// - It reads the Huffman Table parameters from the <in_huff> MWMR channel.
// - It reads the bit stream from the <in_data> MWMR channel.
// - It writes output pixels (two bytes per pixel) to the <out> MWMR channel.
// It uses MWMR_BUFIO buffers for the input channels, but not for the output channel.
////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdint.h>
#include <mwmr_channel.h>
#include "mjpeg.h"

// Special AC symbols tested by vld_unpack_block() :
// - HUFF_EOB : end of block, the remaining values of the block stay at zero.
// - HUFF_ZRL : run of zeros without value, the index in the block advances
//              by 16 (15 added explicitly, plus the loop increment).
#define HUFF_EOB               0x00
#define HUFF_ZRL               0xF0

// Macro to print on a TTY shared by several threads : the display is done
// between lock_acquire() and lock_release() on the global <tty_lock>.
// The expansion is ONE compound statement, so that the three calls stay
// together when the macro is the body of an unbraced if / else / loop.
// Call sites do not need a trailing semicolon (a do/while(0) wrapper would
// require one, and cannot be used with the existing call sites). Do not put
// a semicolon between a PRINTF(...) used as an unbraced if-body and its else.
#define PRINTF(...)    { lock_acquire( &tty_lock );       \
                         giet_tty_printf(__VA_ARGS__);    \
                         lock_release( &tty_lock ); }

////////////////////////////////////////////////////////////////////////////////////////
// bitreader_t : data structure and access functions to analyse a bit-stream.
////////////////////////////////////////////////////////////////////////////////////////

// State of a bit reader placed on top of a byte-oriented bufio.
// Bits are delivered most significant bit first : <available> is the number
// of low-order bits of <current> that have not been delivered yet.
typedef struct 
{
    mwmr_bufio_t*     bufio;         // associated bufio (source of the bytes)
    uint8_t           current;       // temporary buffer of one byte (last byte read from bufio)
    uint8_t           available;     // number of bits to read in current byte (0 => refill on next read)
} bitreader_t;

////////////////////////////////////////////////////
// returns <number> bits from the associated bufio.
////////////////////////////////////////////////////
// - <number> must not exceed 16 (checked by assert) ; 0 returns 0 and
//   does not read any byte from the bufio.
// - The bits are returned right-aligned, the first bit read being the most
//   significant one. A new byte is fetched from the bufio each time the
//   current byte is exhausted.
uint32_t bitreader_get( bitreader_t*  stream,
                        uint32_t      number )
{
    giet_pthread_assert( (number <= 16) ,
    "ERROR in bitreader_get() : illegal number argument");

    // clear the bits of the current byte that have already been delivered
    if ( stream->available != 0 )
    {
        stream->current &= (1<<stream->available)-1;
    }

    uint32_t  result = 0;         // bits collected so far
    uint32_t  wanted = number;    // bits still to be collected

    while ( wanted != 0 )
    {
        // current byte exhausted => get a new one
        if ( stream->available == 0 )
        {
            stream->current   = mwmr_bufio_read_byte( stream->bufio );
            stream->available = 8;
        }

        if ( wanted > stream->available )
        {
            // take all the remaining bits of the current byte and iterate
            result  = (result << stream->available) | stream->current;
            wanted -= stream->available;
            stream->available = 0;
        }
        else if ( wanted < stream->available )
        {
            // take only the upper <wanted> bits of the remaining ones
            result = (result << wanted) | (stream->current >> (stream->available - wanted));
            stream->available -= wanted;
            break;
        }
        else
        {
            // the remaining bits are exactly what is requested
            result = (result << wanted) | stream->current;
            stream->available = 0;
            break;
        }
    }

    return result;
}

///////////////////////////////////////////////
// returns one bit from the associated bufio.
///////////////////////////////////////////////
// - Returns 0 or 1 : the next bit of the stream (most significant bit first).
// - A new byte is read from the bufio when the current byte is exhausted.
// - The delivered bits are not cleared in <current> : bitreader_get() masks
//   them before using the byte.
uint8_t bitreader_get_one( bitreader_t*  stream )
{
    if ( stream->available != 0 )
    {
        // one more bit consumed in the current byte
        stream->available--;
    }
    else
    {
        // current byte exhausted => refill, bit 7 is delivered now
        stream->current   = mwmr_bufio_read_byte( stream->bufio );
        stream->available = 7;
    }

    // <available> is now the position of the bit to deliver
    return (uint8_t)( (stream->current >> stream->available) & 1 );
}

/////////////////////////////////////////
// Attach the bit reader <stream> to <bufio> and mark its one-byte buffer as
// empty, so that the first read access fetches a new byte from the bufio.
void bitreader_init( bitreader_t*   stream,
                     mwmr_bufio_t*  bufio )
{
    stream->bufio     = bufio;
    stream->current   = 0;
    stream->available = 0;     // no pending bit
}

//////////////////////////////////////////////////////////////////////////////////////////
//       data structures and access functions for the Huffman tables
// - We have two tables (DC and AC), and 16 possible code lengths (from 1 to 16).
// - DC_Table[12]  : 12 possible symbol values for the DC Table.
// - AC_Table[162] : 162 possible symbol values for the AC Table.
// - The three arrays below are indexed by [t][l-1], where <t> is the table type
//   (0 for DC / 1 for AC) and <l> is the code length (from 1 to 16).
// - ValPtr[t][l-1]  : index in Table <t> for the first code of length <l>
// - MinCode[t][l-1] : min value for codes of length <l> 
// - MaxCode[t][l-1] : max value for codes of length <l> / (-1) if no code of length <l> 
//////////////////////////////////////////////////////////////////////////////////////////

// Huffman tables of one image : symbol values for the DC and AC tables, and
// for each table type and each code length, the range of the codes and the
// index of the first associated symbol. The second index of MinCode, MaxCode
// and ValPtr is (code_length - 1).
typedef struct 
{
    uint8_t*  HT[2];                  // HT[0] == DC_Table / HT[1] == AC_Table 
    int32_t   MinCode[2][16];         // two types of tables / 16 code lengths
    int32_t   MaxCode[2][16];         // two types of tables / 16 code lengths
    int32_t   ValPtr[2][16];          // two types of tables / 16 code lengths
    uint8_t   DC_Table[12];           // at most 12 values
    uint8_t   AC_Table[162];          // at most 162 values
} huff_tables_t;

/////////////////////////////////////////////
// Reset all fields of <huff> to zero, and make HT[0] / HT[1] point on the
// DC / AC symbol tables embedded in the structure.
// The loop bounds on the symbol tables are derived from the arrays
// themselves, so that no entry beyond the end of a table is written.
/////////////////////////////////////////////
void huff_tables_init( huff_tables_t*  huff )
{
    uint32_t type;      // 0 for DC / 1 for AC
    uint32_t len;       // code_length - 1
    uint32_t i;

    for ( len = 0 ; len < 16 ; ++len ) 
    {
        for ( type = 0 ; type < 2 ; ++type ) 
        {
            huff->MinCode[type][len] = 0;
            huff->MaxCode[type][len] = 0;
            huff->ValPtr[type][len]  = 0;
        }
    }

    for ( i = 0 ; i < sizeof(huff->DC_Table) / sizeof(huff->DC_Table[0]) ; ++i )
    {
        huff->DC_Table[i] = 0;
    }

    for ( i = 0 ; i < sizeof(huff->AC_Table) / sizeof(huff->AC_Table[0]) ; ++i )
    {
        huff->AC_Table[i] = 0;
    }

    huff->HT[0] = huff->DC_Table;
    huff->HT[1] = huff->AC_Table;
}

////////////////////////////////////////////
// Display on the shared TTY the content of one Huffman table of <huff> :
// the AC table if <is_ac> is non zero, the DC table otherwise.
// - first the (mincode, maxcode, valptr) values for the 16 code lengths,
// - then one line per code, with the index and value of the symbol.
// An assert checks that each symbol index fits in the selected table.
void huff_tables_dump( huff_tables_t* huff,
                       uint32_t       is_ac )
{
    uint32_t  type;      // table selector : 0 for DC / 1 for AC
    uint32_t  max;       // number of entries in the selected symbol table
    uint32_t  len;       // len == code_length - 1

    if ( is_ac )
    {
        type = 1;
        max  = 162;
        PRINTF("\n AC Huffman Table\n\n")
    }
    else
    {
        type = 0;
        max  = 12;
        PRINTF("\n DC Huffman Table\n\n")
    }

    // one line per code length
    for ( len = 0 ; len < 16 ; len++ )
    {
        PRINTF(" length = %d / mincode = %x / maxcode = %x / valptr = %d\n",
        len+1 , huff->MinCode[type][len] , huff->MaxCode[type][len] , huff->ValPtr[type][len] )
    }

    PRINTF("\n")

    // one line per code : nothing is displayed for a length without code,
    // when MaxCode is smaller than MinCode
    for ( len = 0 ; len < 16 ; len++ )
    {
        int32_t  code = huff->MinCode[type][len];

        while ( code <= huff->MaxCode[type][len] )
        {
            uint32_t index = huff->ValPtr[type][len] + code - huff->MinCode[type][len];

            giet_pthread_assert( (index<max) , "ERROR in huff_tables_dump() : overflow");

            PRINTF(" Symbol[%d] = %x / code[%d] = %x\n", 
            index , huff->HT[type][index] , index , code )

            code++;
        }
    }

    PRINTF("\n")
}

/////////////////////////////////////////////
// Load one Huffman table (DC or AC) into <huff> from <bufio>.
// The bytes are read in the following order :
// - 2 bytes : length (most significant byte first),
// - 1 byte  : Tc/Th, that must be 0x00 (DC table) or 0x10 (AC table),
// - 16 bytes : number of codes for each code length (from 1 to 16),
// - one byte per symbol value.
// The MinCode / MaxCode / ValPtr entries of the selected table are computed
// from the 16 counts, and the unused entries of the symbol table are zeroed.
// <index> is the cluster index, only used to filter the debug display.
// Asserts check the Tc/Th byte, the length, and the total number of symbols.
void huff_tables_load( uint32_t        index,
                       huff_tables_t*  huff,
                       mwmr_bufio_t*   bufio )
{
    uint8_t    byte;
    uint32_t   is_ac;          // AC Table if non zero

    uint8_t    LeavesN;        // number of codes of length N (from 1 to 16)
    uint32_t   LeavesT;        // cumulated total number of codes : 32 bits, because
                               // the sum of 16 bytes does not fit in one byte, and a
                               // wrapped total would defeat the checks below
    uint32_t   AuxCode;        // used to compute code values

    // read length (2 bytes) from bufio
    uint32_t length = (uint32_t)mwmr_bufio_read_byte( bufio );
    length = (length<<8) | mwmr_bufio_read_byte( bufio );

    // read Tc/Th  (1 byte) from bufio 
    // Th must be null, Tc must be 0 or 1
    byte = mwmr_bufio_read_byte( bufio );

    giet_pthread_assert( ((byte & 0xEF) == 0) ,
    "ERROR in huff_load_table() : non supported HT header");

    if (byte == 0) is_ac = 0;
    else           is_ac = 1;
 
    uint32_t max_size = ( is_ac ) ? 162 : 12;

    // get the 16 LeavesN values from bufio
    uint32_t   i;
    LeavesT = 0;
    AuxCode = 0;
    for ( i=0; i<16; i++ ) 
    {
        // read one byte from bufio
        LeavesN = mwmr_bufio_read_byte( bufio );

        // codes of length (i+1) are consecutive : the first one is twice
        // the value following the last code of the previous length
        huff->ValPtr[is_ac][i] = LeavesT;
        huff->MinCode[is_ac][i] = AuxCode<<1;
        AuxCode = huff->MinCode[is_ac][i] + LeavesN;
        huff->MaxCode[is_ac][i] = (LeavesN) ? (AuxCode - 1) : (-1);
        LeavesT += LeavesN;
    }

    // 19 == 2 (length) + 1 (Tc/Th) + 16 (LeavesN values)
    giet_pthread_assert( (length ==  19 + LeavesT) ,
    "ERROR in huff_load_table() : illegal HT length");

    giet_pthread_assert( (LeavesT <= max_size) , 
    "ERROR in huff_load_table() : too much symbols");

    // get the symbol values from bufio (one byte per symbol)
    // complete table with zero values if LeavesT < max_size
    for ( i=0 ; i<max_size ; ++i ) 
    {
        if ( i < LeavesT )  huff->HT[is_ac][i] = mwmr_bufio_read_byte( bufio );
        else                huff->HT[is_ac][i] = 0; 
    }

    // align bufio pointer on next item
    mwmr_bufio_align( bufio );
    
#if (DEBUG_VLD > 3) 
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
huff_tables_dump( huff , is_ac );
#endif

}  // end huff_tables_load()

//////////////////////////////////////////////////////////
// extract a symbol (8 bits) from a Huffman encoded
// bit-stream, using the specified Huffman table 
/////////////////////////////////////////////////////////
// - The code is built bit per bit from <stream>, until its value is not
//   larger than the max code registered for the current length.
// - The symbol is then read in the selected table, at the index of the first
//   symbol of that length, plus the offset of the code in that length.
// - <select> is only used as a boolean : all tables are indexed by 0 or 1.
// - Asserts reject a bit sequence matching no code within 16 bits, and a
//   symbol index outside the selected table (this relies on
//   giet_pthread_assert() not returning when the condition is false, as the
//   other checks of this file do).
// - The symbol byte is returned through an int8_t : the callers in this file
//   store it back in a uint8_t variable.
int8_t huff_get_symbol( bitreader_t*     stream,
                        huff_tables_t*   huff,
                        int32_t          select )   // DC if zero / AC if non zero
{
    uint32_t length;            // code_length - 1
    int32_t  code = 0;

    uint32_t is_ac    = (select) ? 1 : 0;
    uint32_t max_size = (select) ? 162 : 12;
    
    for ( length = 0 ; length < 16 ; ++length ) 
    {
        code = (code<<1) | bitreader_get_one( stream );
        if ( code <= huff->MaxCode[is_ac][length] )   break;
    }

    // without break, length == 16 is not a legal index for the tables below
    giet_pthread_assert( (length < 16) ,
    "ERROR in huff_get_symbol() : no matching Huffman code");

    uint32_t index = huff->ValPtr[is_ac][length] + code - huff->MinCode[is_ac][length];

    // legal indexes are 0 to (max_size - 1)
    giet_pthread_assert( (index < max_size) ,
    "ERROR in huff_get_symbol() : Huffman table overflow");

    return huff->HT[is_ac][index];
} 

///////////////////////////////////////////////////////////////////////////
// transform JPEG encoded positive/negative value coded as ( S , nbits )
// to a standard 16 bits 2's complement number (int16_t).
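// - nbits is the magnitude (number of significant bits in S).
// - most significant bit in S is 1 for positive / 0 for negative
// - other bits in S define the value in 2**(nbits-1) possible values :
//   positive value is S / negative value is S - 2**nbits + 1
// - the returned value is 0 when nbits is 0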
///////////////////////////////////////////////////////////////////////////
// - returns 0 when <nbits> is 0.
// - when bit (nbits-1) of <S> is set, the value is positive and equal to S.
// - when bit (nbits-1) of <S> is clear, the value is negative : all bits
//   above the <nbits> low-order bits are set and 1 is added, which
//   gives S - 2**nbits + 1.
static int16_t reformat( uint32_t S , int32_t nbits )
{
    if ( nbits == 0 )  return 0;

    // leading bit set : S is already the result
    if ( S & (1 << (nbits - 1)) )  return (int16_t)S;

    // leading bit clear : extend with ones, then add one
    uint32_t  upper_ones = 0 - ((uint32_t)1 << nbits);

    return (int16_t)( (S | upper_ones) + 1 );
}

////////////////////////////////////////////////////////
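// unpack an 8*8 pixels block with 2 bytes per pixel, write it to the
// <mwmr_out> channel, and return the DC value of the block, that the
// caller passes as <prev_dc> for the next block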
//////////////////////////////////////////////////////////
// - DC value : the symbol decoded with the DC table is the number of bits to
//   read for the difference, that is added to <prev_dc>.
// - AC values : for each symbol decoded with the AC table, the 4 upper bits
//   (run) give the number of values to skip (they stay at zero), and the
//   4 lower bits (cat) give the number of bits to read for the next value.
//   HUFF_EOB stops the block, HUFF_ZRL skips 16 values.
// - The 64 values (128 bytes) are written to <mwmr_out> as 32 words.
// - <index>, <image>, <block> are only used by the debug displays.
// - Returns the DC value of the block.
// An assert rejects a run pointing outside the block (this relies on
// giet_pthread_assert() not returning when the condition is false, as the
// other checks of this file do).
static int16_t vld_unpack_block( uint32_t          index,     // cluster index
                                 uint32_t          image,     // image index
                                 uint32_t          block,     // block index
                                 bitreader_t*      stream,    // input bit stream
                                 mwmr_channel_t*   mwmr_out,  // output channel
                                 huff_tables_t*    huff,      // Huffman Tables
                                 int16_t           prev_dc )  // previous DC coef
{
    uint32_t temp;
    uint32_t i;
    uint32_t run;
    uint32_t cat;
    int32_t  value;
    uint8_t  symbol;

    // buffer for the 64 resulting pixels (2 bytes per pixel)
    // aligned on 4 bytes, because it is handed to mwmr_write() as an uint32_t*
    int16_t  buf[64] __attribute__((aligned(4)));

    // set default values
    for (i = 0; i < 64; i++) buf[i] = 0;

    // compute the DC coefficient
    symbol   = huff_get_symbol( stream , huff , 0 );   // use DC Huffman Table

    temp     = bitreader_get( stream , symbol );
    value    = reformat( temp , symbol );
    buf[0]   = value + prev_dc;

#if (DEBUG_VLD > 2)
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] : DC[0] = %d / reformat( %x , %d ) = %d\n",
         index , buf[0], temp , symbol , value ) }
#endif

    // compute the 63 AC coefficients
    for (i = 1; i < 64; i++) 
    {
        symbol = huff_get_symbol( stream , huff , 1 );   // use AC Huffman Table

        // in case of HUFF_EOB symbol, all other pixels are zero
        if ( symbol == HUFF_EOB )
        {

#if (DEBUG_VLD > 2)
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] : EOB found at i = %d\n", index , i ) }
#endif 
            break;
        }
 
        // in case of HUFF_ZRL symbol (0xF0) , 15 next pixels are zero
        if (symbol == HUFF_ZRL) 
        {
            i += 15;
            continue;
        }

        cat = symbol & 0xf;
        run = symbol >> 4;
        i += run;

        // a corrupted bit stream can give a run that goes beyond the last
        // entry of the block : the write below must stay inside buf[]
        giet_pthread_assert( (i < 64) ,
        "ERROR in vld_unpack_block() : AC coefficient index overflow");

        temp   = bitreader_get (stream , cat );
        value  = reformat( temp , cat );
        buf[i] = value;

#if (DEBUG_VLD > 2)
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] : AC[%d] = %d / reformat( %x , %d ) = %d\n",
         index , i , buf[i] , temp , cat , value ) }
#endif

    }

    // write one block to mwmr_out channel ( 2 bytes per pixel)
    mwmr_write( mwmr_out, (uint32_t*)buf , 32 );

#if (DEBUG_VLD > 1 )
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] completes block %d/%d in image %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n"
         "  %d  %d  %d  %d  %d  %d  %d  %d\n",
         index , block , nblocks_w*nblocks_h , image ,
         buf[0] , buf[1] , buf[2] , buf[3] , buf[4] , buf[5] , buf[6] , buf[7] ,
         buf[8] , buf[9] , buf[10], buf[11], buf[12], buf[13], buf[14], buf[15],
         buf[16], buf[17], buf[18], buf[19], buf[20], buf[21], buf[22], buf[23],
         buf[24], buf[25], buf[26], buf[27], buf[28], buf[29], buf[30], buf[31],
         buf[32], buf[33], buf[34], buf[35], buf[36], buf[37], buf[38], buf[39],
         buf[40], buf[41], buf[42], buf[43], buf[44], buf[45], buf[46], buf[47],
         buf[48], buf[49], buf[50], buf[51], buf[52], buf[53], buf[54], buf[55],
         buf[56], buf[57], buf[58], buf[59], buf[60], buf[61], buf[62], buf[63]) }
#endif

    return buf[0];

} // end vld_unpack_block()


//////////////////////////////////////////////////////////////
__attribute__ ((constructor)) void vld( uint32_t index )
//////////////////////////////////////////////////////////////
// Entry point of the VLD thread identified by <index>.
// - It selects its three MWMR channels with <index>, and attaches a 64 bytes
//   bufio to each of the two input channels.
// - It handles image <index>, then one image every (x_size * y_size) images,
//   while the image index is smaller than MAX_IMAGES.
// - For each image, it loads two Huffman tables from the <in_huff> bufio,
//   then unpacks (nblocks_h * nblocks_w) blocks from the <in_data> bufio,
//   the DC value of each block being handed to the next one.
// - It calls giet_pthread_exit() when all its images are done.
//////////////////////////////////////////////////////////////
{
    // MWMR channels used by this thread
    mwmr_channel_t*  mwmr_in_data = demux_2_vld_data[index];
    mwmr_channel_t*  mwmr_in_huff = demux_2_vld_huff[index];
    mwmr_channel_t*  mwmr_out     = vld_2_iqzz[index];

    // get platform parameters
    uint32_t  x_size;
    uint32_t  y_size;
    uint32_t  nprocs;
    giet_procs_number( &x_size , &y_size , &nprocs );

    // get processor coordinates
    uint32_t  x;
    uint32_t  y;
    uint32_t  p;
    giet_proc_xyp( &x , &y , &p );

    // no private TTY is allocated : the shared TTY is used through PRINTF
    // giet_tty_alloc( 0 );

    PRINTF("\n[MJPEG] thread VLD[%d] starts on P[%d,%d,%d] / trdid = %x\n",
           index , x , y , p, (uint32_t)trdid_vld[index] )


    // initialise BUFIO for MWMR channel <in_data>
    uint8_t       in_data_buffer[64];
    mwmr_bufio_t  bufio_in_data;
    mwmr_bufio_init( &bufio_in_data , in_data_buffer , 64 , 1 , mwmr_in_data );

#if (DEBUG_VLD > 1)
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] <in_data> : &mwmr = %x / &bufio = %x\n",
         index , mwmr_in_data , &bufio_in_data ) }
#endif

    // initialise BUFIO for MWMR channel <in_huff>
    uint8_t       in_huff_buffer[64];
    mwmr_bufio_t  bufio_in_huff;
    mwmr_bufio_init( &bufio_in_huff , in_huff_buffer , 64 , 1 , mwmr_in_huff );

#if (DEBUG_VLD > 1)
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] <in_huff> : &mwmr = %x / &bufio = %x\n",
         index , mwmr_in_huff , &bufio_in_huff ) }
#endif

    // initialise Huffman Tables
    huff_tables_t  tables;
    huff_tables_init( &tables );

    bitreader_t  reader;                                // bit stream on <in_data>
    uint32_t     image;                                 // image index
    uint32_t     block;                                 // block index in image
    uint32_t     table;                                 // Huffman table index in image
    uint32_t     nblocks      = nblocks_h * nblocks_w;  // blocks per image
    uint32_t     image_stride = x_size*y_size;          // distance between two images

    // one image per iteration
    for ( image = index ; image < MAX_IMAGES ; image += image_stride )
    {
        // load the two Huffman Tables of the image from bufio_in_huff
        for ( table = 0 ; table < 2 ; table++ )
        {
            huff_tables_load( index, &tables , &bufio_in_huff );
        }

#if (DEBUG_VLD > 1)
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] load Huffman tables for image %d\n", index , image ) }
#endif

        // each image starts with a null DC value, an aligned data bufio,
        // and an empty bit-stream buffer
        int16_t  dc = 0;
        mwmr_bufio_align( &bufio_in_data );
        bitreader_init( &reader, &bufio_in_data );
    
        // loop on the blocks in current image
        for ( block = 0 ; block < nblocks ; block++ )
        {

#if (DEBUG_VLD > 1) 
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] uncompress block %d/%d in image %d\n", index, block, nblocks, image ) }
#endif
            // the DC value of a block is the reference for the next one
            dc = vld_unpack_block( index, image, block,
                                   &reader, mwmr_out, &tables, dc );
        }  // end for blocks

#if DEBUG_VLD 
if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) )
{ PRINTF("\nVLD[%d] completes image %d at cycle %d\n", index , image , giet_proctime() ) }
#endif

    }  // end for images

    giet_pthread_exit( "VLD completed" );

}  // end vld()

