/* This file is part of DSX.
 *
 * DSX is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * DSX is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with DSX; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Copyright (c) Lip6, Thalès
 *      Joel Porquet <joel.porquet@lip6.fr>, 2006-2007
 *
 * Based on Martin Fielder's work (http://keyj.s2000.ws/?page_id=41)
 */

#include <srl.h>
#include "decode_proto.h"

#include "bitstream_io.h"

#include "h264.h"
#include "utils.h"

/* for slice processing */
#include "dec_bitstream.h"
#include "modepredinfo.h"
#include "plane_pool.h"
#include "parameter_set.h"

/* for idct */
#include "transform.h"

/* for rendering */
#include "render.h"
#include "intra_render.h"
#include "inter_render.h"
#include "correc_block.h"

/* for struct data */
#include "mb_fifo.h"

/* Indices into the two-entry frame[] array that get_plane_pool() fills in
 * FUNC(decode): frame[cur_render] is the frame every render/correction call
 * writes into, frame[ref_render] is the second frame handed to
 * MotionCompensateMB() (the reference frame, judging by the name). */
#define cur_render  0
#define ref_render  1

/*
 * DECODE thread entry point.
 *
 * Thread arguments (read with GET_ARG): input, output, valid_output,
 * plane_buffers, plane_pool, parameter_set, barrier, pipe_id.
 *
 * The function never returns. Each iteration of the outer loop handles one
 * slice read from the `input` channel:
 *   - read the two NAL info bytes and decode the slice header;
 *   - for every macroblock of the slice: read its mode, prediction data,
 *     coded block pattern and residual from the bitstream, run the inverse
 *     transforms, render it into frame[cur_render], then write the rendered
 *     16x16 luma block and the two 8x8 chroma blocks to `output`.
 *
 * `valid_output` receives an int equal to 1 before each macroblock is sent
 * and an int equal to 0 once the slice is finished.
 */
FUNC(decode)
{
    /* Args for thread: bitstream input, pixel output, per-macroblock flag */
    srl_mwmr_t input_mwmr           = GET_ARG(input);
    srl_mwmr_t output_mwmr          = GET_ARG(output);
    srl_mwmr_t valid_output_mwmr    = GET_ARG(valid_output);

    /* Memory spaces: plane storage, plane pool bookkeeping, parameter sets */
    srl_memspace_t plane_buffers_mem    = GET_ARG(plane_buffers);
    srl_memspace_t plane_pool_mem       = GET_ARG(plane_pool);
    srl_memspace_t ps_mem               = GET_ARG(parameter_set);
    ps_t *ps = SRL_MEMSPACE_ADDR(ps_mem);

    /* For bitstream reading: the slice reader pulls from input_mwmr, the
     * bitreader is (re)attached to it at the start of every slice */
    slice_t slice;
    slice_init_in(&slice, input_mwmr);
    bitreader_t bitreader;

    /* barrier */
    srl_barrier_t barrier = GET_ARG(barrier);

    /* Misc */
    unsigned char pipe_id = GET_ARG(pipe_id);

    /* frames: frame[cur_render] and frame[ref_render], set per slice */
    frame_t *frame[2];

    /* Plane pool */
    plane_pool_t *plane_pool = SRL_MEMSPACE_ADDR(plane_pool_mem);
    
    /* Only pipe 0 initialises the plane pool; every pipe then waits on the
     * barrier (presumably so nobody uses the pool before it is set up) */
    if (pipe_id == 0) 
        init_plane_pool(plane_pool, SRL_MEMSPACE_ADDR(plane_buffers_mem));

    srl_barrier_wait(barrier);
    
    /* Information about decoding (mode prediction info), accessed through
     * the ModePredInfo_* macros and the helpers taking `mpi` */
    mode_pred_inf_t mpi_instance;
    mode_pred_inf_t *mpi = &mpi_instance;

    /* Per-macroblock data handed to the transform and render stages:
     * global info, luma residual, chroma residual */
    mb_fifo_global_t mb_fifo_G;
    mb_fifo_luma_t   mb_fifo_L;
    mb_fifo_chroma_t mb_fifo_C;
    
    /* For residual processing: DC coefficients are read separately from
     * the AC ones (luma DC only for Intra_16x16, chroma DC per plane) */
    short int LumaDCLevel[16];
    short int ChromaDCLevel[2][4];
        
    srl_log(TRACE, "DECODE thread is starting...\n");

    /* One iteration per slice */
    while(1) 
    {
        /* Misc vars */
        unsigned short int  CurrMbAddr;
        nal_inf_t           nal_inf;
        slice_inf_t         slice_inf;
        unsigned char       QPy;
        unsigned char       QPc;
        /* number of skipped macroblocks still to emit in the current run */
        short int           mb_skip_run   = 0;
        /* non-zero when the previous macroblock belonged to a skip run */
        char                prevMbSkipped = 0;
        
        slice_begin(&slice);

        /* Analyse the NAL unit */
        /* nal infos: two values read from the slice, in this order */
        nal_inf.ref_idc = slice_read_char(&slice);
        nal_inf.type    = slice_read_char(&slice);

        /* slice infos: the bitreader fetches its bytes via slice_read_char */
        bitreader_init(&bitreader, &slice, &slice_read_char);
        decode_slice_header(&slice_inf, &nal_inf, &bitreader, ps);
        /* QPy starts from the slice header value and is then updated by
         * each macroblock's mb_qp_delta; QPc is recomputed alongside it */
        QPy = slice_inf.QPy;
        QPc = 0;

        /* clear all mpi (both halves) */
        clear_half_mode_pred_info(mpi, 0);
        clear_half_mode_pred_info(mpi, 1);
        
        /* fetch frame[cur_render] / frame[ref_render] for this picture */
        get_plane_pool(plane_pool, slice_inf.pic_order_cnt_lsb >> 1, frame);

        CurrMbAddr = slice_inf.first_mb;

        /* decode slice: one iteration per macroblock */
        do {
            srl_assert(CurrMbAddr < NMBLOCKS);
            
            mb_mode_t mb_mode;

            /* fill data struct */
            /* pixel coordinates of the macroblock's top-left corner
             * (macroblock column/row times 16) */
            mb_fifo_G.mb_pos_x = (CurrMbAddr % MBLOCKS_X) << 4;
            mb_fifo_G.mb_pos_y = (CurrMbAddr / MBLOCKS_X) << 4;

            srl_log_printf(TRACE, "(SLICE) - Reading bitstream for Macroblock #%d\n", CurrMbAddr);
            srl_log_printf(DEBUG, "(SLICE) - Macroblock #%d (%d, %d):\n", CurrMbAddr, mb_fifo_G.mb_pos_x, mb_fifo_G.mb_pos_y);

            /* at the start of each macroblock row, clear the mpi half
             * selected by the row's parity */
            if (mb_fifo_G.mb_pos_x == 0)
                clear_half_mode_pred_info(mpi, (mb_fifo_G.mb_pos_y>>4)%2);

            /* - Inter block, p_skip type (non-I slices only) */
            if (slice_inf.type != I_SLICE)
            {
                /* A new run length is read only when no run is pending and
                 * the previous macroblock was not skipped: the macroblock
                 * right after a finished run is decoded without reading
                 * another mb_skip_run first. */
                if (mb_skip_run == 0 && prevMbSkipped == 0) 
                    mb_skip_run = bitreader_get_ugolomb(&bitreader);

                prevMbSkipped = (mb_skip_run>0);

                if (mb_skip_run > 0)
                {
                    mb_skip_run--;

                    srl_log(TRACE, "\t(SLICE) - P_SKIP macroblock\n");
                    ModePredInfo_MbMode(mpi, mb_fifo_G.mb_pos_x>>4, mb_fifo_G.mb_pos_y>>4) = P_Skip;
                    Derive_P_Skip_MVs(mpi, mb_fifo_G.mb_pos_x, mb_fifo_G.mb_pos_y);
                    /* fill data struct with the derived motion vector of
                     * the macroblock's first 4x4 block */
                    Fill_PSkip(&mb_fifo_G, P_Skip, 
                            ModePredInfo_MVx(mpi, mb_fifo_G.mb_pos_x>>2, mb_fifo_G.mb_pos_y>>2),
                            ModePredInfo_MVy(mpi, mb_fifo_G.mb_pos_x>>2, mb_fifo_G.mb_pos_y>>2));

                    srl_log_printf(DEBUG, "\tmvx = %d; mvy = %d\n", 
                            ModePredInfo_MVx(mpi, mb_fifo_G.mb_pos_x>>2, mb_fifo_G.mb_pos_y>>2),
                            ModePredInfo_MVy(mpi, mb_fifo_G.mb_pos_x>>2, mb_fifo_G.mb_pos_y>>2));

                    /* Nothing else is read from the bitstream for a skipped
                     * macroblock. mb_mode stays unset on this path; the
                     * render stage only reads mb_fifo_G.
                     * NOTE(review): this relies on Fill_PSkip setting
                     * MbPartPredMode and both CodedBlockPattern fields of
                     * mb_fifo_G -- confirm. */
                    goto render;
                }
            }

            /* - Others block type */
            /* 
             * 1. DECODE MODE 
             */
            int raw_mb_type = bitreader_get_ugolomb(&bitreader);
            decode_mb_mode(&mb_mode, slice_inf.type, raw_mb_type);
            ModePredInfo_MbMode(mpi, mb_fifo_G.mb_pos_x>>4, mb_fifo_G.mb_pos_y>>4) = mb_mode.mb_type;

            srl_log_printf(DEBUG, "\ttype : %d\n", mb_mode.mb_type);

            /* NOTE(review): I_PCM macroblocks are not handled here */

            /* 
             * 2. COMPUTE PREDICTION 
             */
            /* Inter block with 4 partitions, each with its own sub-type */
            if (mb_mode.MbPartPredMode[0]!=Intra_4x4 
                    && mb_mode.MbPartPredMode[0]!=Intra_16x16 
                    && mb_mode.NumMbPart==4)
            {
                int mbPartIdx,subMbPartIdx;
                sub_mb_mode_t sub_mode[4];

                srl_log(TRACE, "\t(SLICE) - INTER macroblock with subpartitions\n");
                /* first read the four sub-macroblock types ... */
                for (mbPartIdx=0; mbPartIdx<4; ++mbPartIdx)
                {
                    decode_sub_mb_mode(&sub_mode[mbPartIdx],slice_inf.type, bitreader_get_ugolomb(&bitreader));
                    srl_log_printf(DEBUG, "\tsub[%d] type: %d\n", mbPartIdx, sub_mode[mbPartIdx].sub_mb_type);
                }

                /* ... then, for each partition, read the motion vector
                 * differences of its sub-partitions and derive their MVs */
                for (mbPartIdx=0; mbPartIdx<4; ++mbPartIdx)
                {
                    if (sub_mode[mbPartIdx].sub_mb_type!=B_Direct_8x8 && sub_mode[mbPartIdx].SubMbPredMode!=Pred_L1)
                    { 
                        /* SOF = "scan order factor": for P_L0_8x4 the next
                         * sub-partition is two MBScanOrder entries further
                         * instead of one */
                        int SOF = (sub_mode[mbPartIdx].sub_mb_type == P_L0_8x4) ? 2:1;
                        for (subMbPartIdx=0; subMbPartIdx<sub_mode[mbPartIdx].NumSubMbPart; ++subMbPartIdx) 
                        {
                            /* mvdx must be read before mvdy */
                            int mvdx = bitreader_get_sgolomb(&bitreader);
                            int mvdy = bitreader_get_sgolomb(&bitreader);
                            DeriveMVs(mpi,
                                    mb_fifo_G.mb_pos_x + MBScanOrder[mbPartIdx*4+subMbPartIdx*SOF][0],
                                    mb_fifo_G.mb_pos_y + MBScanOrder[mbPartIdx*4+subMbPartIdx*SOF][1],
                                    sub_mode[mbPartIdx].SubMbPartWidth,
                                    sub_mode[mbPartIdx].SubMbPartHeight,
                                    mvdx, mvdy);
                            srl_log_printf(DEBUG, "\tmvx = %d; mvy = %d\n", 
                                    ModePredInfo_MVx(mpi, 
                                        (mb_fifo_G.mb_pos_x + MBScanOrder[mbPartIdx*4+subMbPartIdx*SOF][0])>>2, 
                                        (mb_fifo_G.mb_pos_y + MBScanOrder[mbPartIdx*4+subMbPartIdx*SOF][1])>>2),
                                    ModePredInfo_MVy(mpi, 
                                        (mb_fifo_G.mb_pos_x + MBScanOrder[mbPartIdx*4+subMbPartIdx*SOF][0])>>2, 
                                        (mb_fifo_G.mb_pos_y + MBScanOrder[mbPartIdx*4+subMbPartIdx*SOF][1])>>2));
                        }
                    }
                }

                /* Inter block with less than 4 partitions */
            } else if ( mb_mode.MbPartPredMode[0]!=Intra_4x4 
                    && mb_mode.MbPartPredMode[0]!=Intra_16x16) {
                int mbPartIdx;
                /* step between partitions in MBScanOrder: 8 entries for
                 * P_L0_L0_16x8, 4 entries otherwise */
                int SOF = (mb_mode.mb_type == P_L0_L0_16x8) ? 8:4;

                srl_log(TRACE, "\t(SLICE) - INTER macroblock without subpartitions\n");
                for (mbPartIdx=0; mbPartIdx<mb_mode.NumMbPart; ++mbPartIdx)
                {
                    if (mb_mode.MbPartPredMode[mbPartIdx] != Pred_L1) {
                        /* mvdx must be read before mvdy */
                        int mvdx = bitreader_get_sgolomb(&bitreader);
                        int mvdy = bitreader_get_sgolomb(&bitreader);
                        DeriveMVs(mpi,
                                mb_fifo_G.mb_pos_x + MBScanOrder[mbPartIdx*SOF][0],
                                mb_fifo_G.mb_pos_y + MBScanOrder[mbPartIdx*SOF][1],
                                mb_mode.MbPartWidth, mb_mode.MbPartHeight, 
                                mvdx, mvdy);
                        srl_log_printf(DEBUG, "\tsub[%d] : mvx = %d; mvy = %d\n", 
                                mbPartIdx,
                                ModePredInfo_MVx(mpi, 
                                    (mb_fifo_G.mb_pos_x + MBScanOrder[mbPartIdx*SOF][0]) >> 2,
                                    (mb_fifo_G.mb_pos_y + MBScanOrder[mbPartIdx*SOF][1]) >> 2),
                                ModePredInfo_MVy(mpi, 
                                    (mb_fifo_G.mb_pos_x + MBScanOrder[mbPartIdx*SOF][0]) >> 2,
                                    (mb_fifo_G.mb_pos_y + MBScanOrder[mbPartIdx*SOF][1]) >> 2));
                    }
                }

                /* Intra block 4x4 */
            } else if (mb_mode.MbPartPredMode[0]==Intra_4x4) {
                int luma4x4BlkIdx, i;

                srl_log(TRACE, "\t(SLICE) - INTRA_4x4 macroblock\n");
                for (luma4x4BlkIdx=0; luma4x4BlkIdx<16; ++luma4x4BlkIdx)
                {
                    /* start from the mode predicted from mpi */
                    int predIntra4x4PredMode = get_predIntra4x4PredMode(mpi,
                            mb_fifo_G.mb_pos_x + MBScanOrder[luma4x4BlkIdx][0],
                            mb_fifo_G.mb_pos_y + MBScanOrder[luma4x4BlkIdx][1]);

                    /* a 0 flag bit means the prediction is wrong: read the
                     * 3-bit correction, which skips over the predicted
                     * value (values >= prediction are shifted up by one) */
                    if(!bitreader_get_one(&bitreader))
                    {
                        int rem_intra4x4_pred_mode = bitreader_get(&bitreader, 3);

                        if(rem_intra4x4_pred_mode < predIntra4x4PredMode)
                            predIntra4x4PredMode = rem_intra4x4_pred_mode;
                        else
                            predIntra4x4PredMode = rem_intra4x4_pred_mode + 1;
                    }
                    /* save the final value in mpi so that it is available
                     * when predicting the following blocks */
                    ModePredInfo_Intra4x4PredMode(mpi,
                            (mb_fifo_G.mb_pos_x + MBScanOrder[luma4x4BlkIdx][0])>>2,
                            (mb_fifo_G.mb_pos_y + MBScanOrder[luma4x4BlkIdx][1])>>2) = predIntra4x4PredMode;
                    srl_log_printf(DEBUG, "\tsub[%d] : pred = %d\n", 
                            luma4x4BlkIdx, predIntra4x4PredMode);

                    /* fill data struct */
                    mb_fifo_G.intra.LPredMode[luma4x4BlkIdx] = predIntra4x4PredMode;
                }
                /* predmode of surrounding blocks: entries 16..19 hold the
                 * four blocks of the row just above the macroblock, entries
                 * 20..23 the four blocks of the column just left of it */
                for (i=0; i<4; i++)
                    mb_fifo_G.intra.LPredMode[16+i] = get_Intra4x4PredModeN(mpi, mb_fifo_G.mb_pos_x + (i<<2), mb_fifo_G.mb_pos_y-4);
                for (i=0; i<4; i++)
                    mb_fifo_G.intra.LPredMode[20+i] = get_Intra4x4PredModeN(mpi, mb_fifo_G.mb_pos_x-4, mb_fifo_G.mb_pos_y + (i<<2));

                mb_fifo_G.intra.CPredMode = bitreader_get_ugolomb(&bitreader);
                srl_log_printf(DEBUG, "\tchroma_pred = %d\n", mb_fifo_G.intra.CPredMode);

                /* Intra block 16x16 */
            } else if (mb_mode.MbPartPredMode[0]==Intra_16x16) {
                /* act as if all transform blocks were predicted using the
                 * DC prediction mode: mode 2 is recorded in mpi for each of
                 * the sixteen 4x4 blocks */
                int luma4x4BlkIdx;

                srl_log(TRACE, "\t(SLICE) - INTRA_16x16 macroblock\n");
                for (luma4x4BlkIdx=0; luma4x4BlkIdx<16; ++luma4x4BlkIdx)
                {
                    ModePredInfo_Intra4x4PredMode(mpi,
                            (mb_fifo_G.mb_pos_x + MBScanOrder[luma4x4BlkIdx][0])>>2,
                            (mb_fifo_G.mb_pos_y + MBScanOrder[luma4x4BlkIdx][1])>>2) = 2;
                }

                /* the single 16x16 prediction mode goes in slot 0 */
                mb_fifo_G.intra.LPredMode[0] = mb_mode.Intra16x16PredMode;
                mb_fifo_G.intra.CPredMode = bitreader_get_ugolomb(&bitreader);
                srl_log_printf(DEBUG, "\tchroma_pred = %d\n", mb_fifo_G.intra.CPredMode);
            }

            /* fill data struct */
            if (mb_mode.MbPartPredMode[0]!=Intra_4x4 && mb_mode.MbPartPredMode[0]!=Intra_16x16)
                Fill_Inter(&mb_fifo_G, mpi, mb_mode.mb_type, mb_mode.MbPartPredMode[0]);
            else if (mb_mode.MbPartPredMode[0]==Intra_4x4 || mb_mode.MbPartPredMode[0]==Intra_16x16)
                /* LPredMode has already been filled in the loops above */
                Fill_Intra(&mb_fifo_G, mpi, mb_mode.mb_type, mb_mode.MbPartPredMode[0]);


            /* 
             * 3. CODED PATTERN 
             */
            /* Read from the bitstream for every type except Intra_16x16.
             * NOTE(review): for Intra_16x16 the two CodedBlockPattern
             * fields of mb_mode are used below without being written here;
             * presumably decode_mb_mode() derives them from the macroblock
             * type -- confirm. */
            if (mb_mode.MbPartPredMode[0]!=Intra_16x16)
            {
                /* code number -> coded_block_pattern value, one table for
                 * Intra_4x4 macroblocks and one for the inter ones */
                static const char CodedBlockPatternMapping_Intra4x4[]={
                    47,31,15, 0,23,27,29,30, 7,11,13,14,39,43,45,46,
                    16, 3, 5,10,12,19,21,26,28,35,37,42,44, 1, 2, 4,
                    8,17,18,20,24, 6, 9,22,25,32,33,34,36,40,38,41
                };
                static const char CodedBlockPatternMapping_Inter[]={
                    0,16, 1, 2, 4, 8,32, 3, 5,10,12,15,47, 7,11,13,
                    14, 6, 9,31,35,37,42,44,33,34,36,40,39,43,45,46,
                    17,18,20,24,19,21,26,28,23,27,29,30,22,25,38,41
                };
                int coded_block_pattern = bitreader_get_ugolomb(&bitreader);

                /* both tables have 48 entries */
                srl_assert(coded_block_pattern<48);

                srl_log(TRACE, "\t(SLICE) - Reading CodedBlockPattern\n");
                if (mb_mode.MbPartPredMode[0] == Intra_4x4)
                    coded_block_pattern = CodedBlockPatternMapping_Intra4x4[coded_block_pattern];
                else
                    coded_block_pattern = CodedBlockPatternMapping_Inter[coded_block_pattern];

                /* low 4 bits: one flag per 8x8 luma block; remaining high
                 * bits: chroma pattern */
                mb_mode.CodedBlockPatternLuma   = coded_block_pattern&15;
                mb_mode.CodedBlockPatternChroma = coded_block_pattern>>4;
                srl_log_printf(DEBUG, "\tpattern_luma = %d\n", mb_mode.CodedBlockPatternLuma);
                srl_log_printf(DEBUG, "\tpattern_chroma = %d\n", mb_mode.CodedBlockPatternChroma);
            }
            /* Fill data struct */
            Fill_CodedPattern(&mb_fifo_G, mb_mode.CodedBlockPatternLuma, mb_mode.CodedBlockPatternChroma);


            /* 
             * 4. RESIDUAL 
             */
            /* Residual data (starting with mb_qp_delta) is present when any
             * block is coded, and always for Intra_16x16 */
            if ( mb_mode.CodedBlockPatternLuma > 0 
                    || mb_mode.CodedBlockPatternChroma > 0 
                    || mb_mode.MbPartPredMode[0] == Intra_16x16)
            {
                /* index of loops */
                int i8x8,i4x4,iCbCr;
                /* computation for QPy and QPc */
                int mb_qp_delta;
                int QPi;
                /* QPc for QPi in 30..51 (indexed by QPi-30); below 30 QPc
                 * equals QPi */
                static const char QPcTable[22] = {29,30,31,32,32,33,34,34,35,35,36,36,37,37,37,38,38,38,39,39,39,39};

                srl_log(TRACE, "\t(SLICE) - Reading Residual\n");

                mb_qp_delta = bitreader_get_sgolomb(&bitreader);

                /* compute QPy, quantisation parameter for luma: the delta is
                 * applied to the previous QPy, modulo 52 */
                QPy = (QPy+mb_qp_delta+52)%52;
                srl_log_printf(DEBUG, "\tQPy = %d\n", QPy);

                /* compute QPc, quantisation parameter for chroma: QPy plus
                 * the PPS offset, clipped to 0..51, then mapped */
                QPi = QPy + ps->pps.chroma_qp_index_offset;
                QPi = Clip(QPi,0,51);
                if (QPi<30) QPc = QPi;
                else        QPc = QPcTable[QPi-30];
                srl_log_printf(DEBUG, "\tQPc = %d\n", QPc);

                /* read LumaDC (only for Intra16 mb): 16 coefficients */
                if (mb_mode.MbPartPredMode[0]==Intra_16x16)
                {
                    int LumaDC_nC = get_luma_nC(mpi, mb_fifo_G.mb_pos_x, mb_fifo_G.mb_pos_y);
                    srl_log_printf(DEBUG, "nC = %d\n", LumaDC_nC);
                    residual_block(&LumaDCLevel[0], 16, LumaDC_nC, &bitreader);
                    srl_log(TRACE, "\t(SLICE) - LumaDC for INTRA_16\n");
                }

                /* read LumaAC, 16 matrices for one mb, grouped by 8x8 block;
                 * bit i8x8 of CodedBlockPatternLuma tells whether the four
                 * 4x4 blocks of that 8x8 block are coded */
                for (i8x8=0; i8x8<4; ++i8x8)
                {
                    for (i4x4=0; i4x4<4; ++i4x4)
                    {
                        if (mb_mode.CodedBlockPatternLuma&(1<<i8x8)) 
                        {
                            int LumaAC_nC = get_luma_nC(mpi, mb_fifo_G.mb_pos_x + MBScanOrder[i8x8*4+i4x4][0],
                                    mb_fifo_G.mb_pos_y + MBScanOrder[i8x8*4+i4x4][1]);
                            srl_log_printf(DEBUG, "nC = %d\n", LumaAC_nC);
                            int TotalCoeffL;

                            /* Intra_16x16: only 15 coefficients are read,
                             * into slots 1..15; slot 0 is left for the DC
                             * value (see transform_luma_dc call below) */
                            if (mb_mode.MbPartPredMode[0] == Intra_16x16)
                                TotalCoeffL = residual_block(&mb_fifo_L.LumaACLevel[i8x8*4+i4x4][1], 15, LumaAC_nC, &bitreader);
                            else
                                TotalCoeffL = residual_block(&mb_fifo_L.LumaACLevel[i8x8*4+i4x4][0], 16, LumaAC_nC, &bitreader);

                            /* record the coefficient count in mpi for this
                             * 4x4 block */
                            ModePredInfo_TotalCoeffL(mpi, 
                                    (mb_fifo_G.mb_pos_x + MBScanOrder[i8x8*4+i4x4][0])>>2,
                                    (mb_fifo_G.mb_pos_y + MBScanOrder[i8x8*4+i4x4][1])>>2) = TotalCoeffL;
                        } else {
                            /* set the luma residual to 0 (slot 0 is left
                             * untouched for Intra_16x16) */
                            if (mb_mode.MbPartPredMode[0] == Intra_16x16)
                                memset(&mb_fifo_L.LumaACLevel[i8x8*4+i4x4][1], 0, 15*sizeof(short int));
                            else
                                memset(&mb_fifo_L.LumaACLevel[i8x8*4+i4x4][0], 0, 16*sizeof(short int));
                        }
                    }
                }
                if (mb_mode.CodedBlockPatternLuma > 0) {
                    srl_log(TRACE, "\t(SLICE) - LumaAC\n");
                }

                /* read ChromaDC for both the channels of chroma: 4
                 * coefficients each, with nC passed as -1 */
                /* the test is true when either of the two low bits is set
                 * (b'01' or b'10') */
                if (mb_mode.CodedBlockPatternChroma&3)
                {
                    for (iCbCr=0; iCbCr<2; iCbCr++)
                        residual_block(&ChromaDCLevel[iCbCr][0], 4, -1, &bitreader);
                    srl_log(TRACE, "\t(SLICE) - ChromaDC\n");
                }

                /* read ChromaAC for both the channels of chroma, 4 matrices
                 * per channel; present only when bit 1 of the chroma
                 * pattern is set. As for Intra_16x16 luma, 15 coefficients
                 * go to slots 1..15. */
                for (iCbCr=0; iCbCr<2; iCbCr++)
                {
                    for (i4x4=0; i4x4<4; ++i4x4)
                    {
                        if (mb_mode.CodedBlockPatternChroma&2)
                        {
                            int ChromaAC_nC = get_chroma_nC(mpi, mb_fifo_G.mb_pos_x + (i4x4&1)*8,
                                    mb_fifo_G.mb_pos_y + (i4x4>>1)*8, iCbCr);
                            int TotalCoeffC;

                            TotalCoeffC = residual_block(&mb_fifo_C.ChromaACLevel[iCbCr][i4x4][1], 15, ChromaAC_nC, &bitreader);

                            ModePredInfo_TotalCoeffC(mpi,
                                    (mb_fifo_G.mb_pos_x + (i4x4&1)*8)>>3,
                                    (mb_fifo_G.mb_pos_y + (i4x4>>1)*8)>>3, iCbCr) = TotalCoeffC;
                        } else {
                            /* set to 0 the chroma residual */
                            memset(&mb_fifo_C.ChromaACLevel[iCbCr][i4x4][1], 0, 15*sizeof(short int));
                        }
                    }
                }
                if (mb_mode.CodedBlockPatternChroma > 0) {
                    srl_log(TRACE, "\t(SLICE) - ChromaAC\n");
                }
            }

            /*
             * 5. DCT TRANSFORMS
             */
            /* From here on the decisions are taken from mb_fifo_G (filled
             * by the Fill_* helpers), not from mb_mode. */
            /* LumaDC only for Intra_16x16 */
            if (mb_fifo_G.MbPartPredMode == Intra_16x16)
            {
                srl_log(TRACE, "\t(IDCT) - Transform LumaDC for INTRA_16x16\n");
                transform_luma_dc(&LumaDCLevel[0], &mb_fifo_L.LumaACLevel[0][0], QPy);
            }
            /* LumaAC: all 16 blocks are transformed, coded or not */
            if (mb_fifo_G.CodedBlockPatternLuma>0 || mb_fifo_G.MbPartPredMode == Intra_16x16)
            {
                int i8x8, i4x4;

                srl_log(TRACE, "\t(IDCT) - Transform LumaAC\n");
                for(i8x8=0; i8x8<4; ++i8x8)
                {
                    for(i4x4=0; i4x4<4; ++i4x4)
                    {
                        /* last argument is 1 for Intra_16x16, 0 otherwise */
                        if (mb_fifo_G.MbPartPredMode == Intra_16x16)
                            transform_luma_ac(&mb_fifo_L.LumaACLevel[i8x8*4+i4x4][0], QPy, 1);
                        else
                            transform_luma_ac(&mb_fifo_L.LumaACLevel[i8x8*4+i4x4][0], QPy, 0);
                    }
                }
            }

            /* ChromaDC, then ChromaAC, for each of the two chroma planes */
            if (mb_fifo_G.CodedBlockPatternChroma > 0)
            {
                int iCbCr, i;

                for(iCbCr=0; iCbCr<2; ++iCbCr)
                {
                    srl_log_printf(TRACE, "\t(IDCT) - Transform ChromaDC[%d]\n", iCbCr);
                    transform_chroma_dc(&ChromaDCLevel[iCbCr][0], &mb_fifo_C.ChromaACLevel[iCbCr][0][0], QPc);

                    srl_log_printf(TRACE, "\t(IDCT) - Transform ChromaAC[%d]\n", iCbCr);
                    for(i=0; i<4; ++i)
                        transform_chroma_ac(&mb_fifo_C.ChromaACLevel[iCbCr][i][0], QPc, 1);
                }
            }

            /*
             * 6. RENDER
             */
            /* P_Skip macroblocks jump straight here */
render:
            /* render intra_4x4 + correc luma: each 4x4 block is predicted
             * and, if its 8x8 block is coded, corrected before moving on to
             * the next one */
            if (mb_fifo_G.MbPartPredMode == Intra_4x4)
            {
                int i;

                srl_log(TRACE, "\t(RENDER) - Rendering INTRA_4x4\n");
                for(i=0; i<16; ++i)
                {
                    int x = mb_fifo_G.mb_pos_x + MBScanOrder[i][0];
                    int y = mb_fifo_G.mb_pos_y + MBScanOrder[i][1];
                    Intra_4x4_Dispatch(frame[cur_render], &mb_fifo_G, x, y, i, 
                                        ps->pps.constrained_intra_pred_flag);
                    if (mb_fifo_G.CodedBlockPatternLuma & (1 << (i>>2)))
                        correc_luma_block(frame[cur_render], &mb_fifo_L, x, y, i);
                }
                Intra_Chroma_Dispatch(frame[cur_render], &mb_fifo_G,
                                        ps->pps.constrained_intra_pred_flag);

                /* render intra_16x16 + correc luma: the whole macroblock is
                 * predicted first, then all 16 blocks are corrected */
            } else if (mb_fifo_G.MbPartPredMode == Intra_16x16) 
            {
                int i;

                srl_log(TRACE, "\t(RENDER) - Rendering INTRA_16x16\n");
                Intra_16x16_Dispatch(frame[cur_render], &mb_fifo_G,
                                        ps->pps.constrained_intra_pred_flag);
                for(i=0; i<16; ++i)
                {
                    int x = mb_fifo_G.mb_pos_x + MBScanOrder[i][0];
                    int y = mb_fifo_G.mb_pos_y + MBScanOrder[i][1];
                    correc_luma_block(frame[cur_render], &mb_fifo_L, x, y, i);
                }
                Intra_Chroma_Dispatch(frame[cur_render], &mb_fifo_G,
                                        ps->pps.constrained_intra_pred_flag);

                /* render inter (including P_Skip) + correc luma */
            } else 
            {
                int i;

                srl_log(TRACE, "\t(RENDER) - Rendering INTER\n");
                MotionCompensateMB(frame[cur_render], frame[ref_render], 
                        &mb_fifo_G, mb_fifo_G.mb_pos_x, mb_fifo_G.mb_pos_y);
                for(i=0; i<16; ++i)
                {
                    int x = mb_fifo_G.mb_pos_x + MBScanOrder[i][0];
                    int y = mb_fifo_G.mb_pos_y + MBScanOrder[i][1];
                    if (mb_fifo_G.CodedBlockPatternLuma & (1 << (i>>2)))
                        correc_luma_block(frame[cur_render], &mb_fifo_L, x, y, i);
                }
            }

            /* 
             * 7. CORRECTING CHROMA 
             */
            /* chroma coordinates are the luma ones halved; the first four
             * MBScanOrder entries give the offsets of the four 4x4 blocks */
            int iCbCr, i;
            if (mb_fifo_G.CodedBlockPatternChroma > 0)
            {
                for (iCbCr=0; iCbCr<2; iCbCr++)
                {
                    srl_log_printf(TRACE, "\t(RENDER) - Correcting Chroma[%d]\n", iCbCr);
                    for(i=0; i<4; ++i)
                    {
                        int x = (mb_fifo_G.mb_pos_x>>1) + MBScanOrder[i][0];
                        int y = (mb_fifo_G.mb_pos_y>>1) + MBScanOrder[i][1];
                        correc_chroma_block(frame[cur_render], &mb_fifo_C, x, y, i, iCbCr);
                    }
                }
            }

            /* the macroblock is available: flag it in the frame, indexed by
             * macroblock row then column */
            frame[cur_render]->m_available[mb_fifo_G.mb_pos_y>>4][mb_fifo_G.mb_pos_x>>4] = 1;

            /* 
             * 8. WRITING to LIBU 
             */
            /* announce that a macroblock follows.
             * NOTE(review): the length is the literal 4 rather than
             * sizeof(valid); this assumes a 4-byte int -- confirm. */
            int valid = 1;
            srl_mwmr_write(valid_output_mwmr, (unsigned char*)&valid, 4);
            
            srl_log(TRACE, "\t(SLICE) - Sending to Libu\n");
            /* 16x16 values for luma: copied row by row out of the frame
             * into a contiguous buffer, then sent in one write */
            unsigned char L[16][16];
            for (i=0; i<MBLOCK_HEIGHT; i++)
            {
                memcpy(&(L[i]), (unsigned char*)&L_pixel(frame[cur_render],
                            mb_fifo_G.mb_pos_x, mb_fifo_G.mb_pos_y + i), MBLOCK_WIDTH);
            }
            srl_mwmr_write(output_mwmr, (unsigned char*)L, (MBLOCK_SIZE));

            /* 8x8 values for each of the two chroma planes, sent one plane
             * after the other */
            for (iCbCr=0; iCbCr<2; iCbCr++)
            {
                unsigned char C[8][8];
                for (i=0; i<CBLOCK_HEIGHT; i++)
                {
                    memcpy(&(C[i]), (unsigned char*)&C_pixel(frame[cur_render], iCbCr,
                                mb_fifo_G.mb_pos_x>>1, (mb_fifo_G.mb_pos_y>>1) + i), CBLOCK_WIDTH);
                }
                srl_mwmr_write(output_mwmr, (unsigned char*)C, (CBLOCK_SIZE));
            }

            /* before iterate */
            CurrMbAddr++;

            /* refill before testing whether the slice is over */
            bitreader_refill(&bitreader);

        /* keep going while either the slice or the bitreader still has
         * data, or while a skip run is still pending in a non-I slice */
        }while(!(slice_isover(&slice) && bitreader_isover(&bitreader))
               || (slice_inf.type != I_SLICE && mb_skip_run > 0 )); // end while slice

        srl_log(TRACE, "\t(SLICE) - End of Slice\n");

        /* tell the consumer that the slice is finished */
        int valid = 0;
        srl_mwmr_write(valid_output_mwmr, (unsigned char*)&valid, 4);

        /* the slice ended on the last macroblock of the picture: move on
         * in the plane pool */
        if (CurrMbAddr == NMBLOCKS)
            next_plane_pool(plane_pool);
        
    } // end infinite loop

}
