//////////////////////////////////////////////////////////////////////////////////////// // File : idct.c // Date : octobre 2015 // author : Alain Greiner //////////////////////////////////////////////////////////////////////////////////////// // This file define the code of the IDCT (Inverse Discrete Cosinus Transform) thread // for the MJPEG application. // It read blocks of 8*8 pixels (one int32_t per pixel) from the MWMR channel. // It write blocks of 8*8 pixels (one uint8_t per pixel) to the MWMR channel. //////////////////////////////////////////////////////////////////////////////////////// #include #include #include #include "mjpeg.h" #define INT_MAX ((int32_t)0x7fffffff) #define INT_MIN ((int32_t)0x80000000) // macro to use a shared TTY #define PRINTF(...) lock_acquire( &tty_lock ); \ giet_tty_printf(__VA_ARGS__); \ lock_release( &tty_lock ); // Useful constants /* ck = cos(k*pi/16) = s8-k = sin((8-k)*pi/16) times 1 << C_BITS and rounded */ #define c0_1 16384 #define c0_s2 23170 #define c1_1 16069 #define c1_s2 22725 #define c2_1 15137 #define c2_s2 21407 #define c3_1 13623 #define c3_s2 19266 #define c4_1 11585 #define c4_s2 16384 #define c5_1 9102 #define c5_s2 12873 #define c6_1 6270 #define c6_s2 8867 #define c7_1 3196 #define c7_s2 4520 #define c8_1 0 #define c8_s2 0 #define sqrt2 c0_s2 // The number of bits of accuracy in all (signed) integer operations #define ARITH_BITS 16 // The minimum signed integer value that fits in ARITH_BITS #define ARITH_MIN (-1 << (ARITH_BITS-1)) #define ARITH_MAX (~ARITH_MIN) // The number of bits coefficients are scaled up before 2-D idct #define S_BITS 3 // The number of bits in the fractional part of a fixed point constant #define C_BITS 14 // This version is vital in passing overall mean error test. #define descale(x, n) (((x) + (1 << ((n) - 1)) - ((x) < 0)) >> (n)) static const int32_t COS[2][8] = { {c0_1 , c1_1 , c2_1 , c3_1 , c4_1 , c5_1 , c6_1 , c7_1 }, {c0_s2, c1_s2, c2_s2, c3_s2, c4_s2, c5_s2, c6_s2, c7_s2} }; //////////////////////////////////// static inline void rot( int32_t f, int32_t k, int32_t x, int32_t y, int32_t* rx, int32_t* ry ) { #define Cos(k) COS[f][k] #define Sin(k) Cos(8-k) *rx = (Cos(k) * x - Sin(k) * y) >> C_BITS; // r = (r + (1 << (C_BITS - 1))) >> C_BITS; *ry = (Sin(k) * x + Cos(k) * y) >> C_BITS; // r = (r + (1 << (C_BITS - 1))) >> C_BITS; #undef Cos #undef Sin } /* Butterfly: but(a,b,x,y) = rot(sqrt(2),4,a,b,x,y) */ #define but(a,b,x,y) do { x = a - b; y = a + b; } while(0) // Inverse 1-D Discrete Cosine Transform. // Result Y is scaled up by factor sqrt(8). // Original Loeffler algorithm. static inline void idct_1d( int32_t* Y ) { int32_t z1[8], z2[8], z3[8]; /* Stage 1: */ but(Y[0], Y[4], z1[1], z1[0]); rot(1, 6, Y[2], Y[6], &z1[2], &z1[3]); but(Y[1], Y[7], z1[4], z1[7]); z1[5] = (sqrt2 * Y[3]) >> C_BITS; // r = (r + (1 << (C_BITS - 1))) >> C_BITS; z1[6] = (sqrt2 * Y[5]) >> C_BITS; // r = (r + (1 << (C_BITS - 1))) >> C_BITS; /* Stage 2: */ but(z1[0], z1[3], z2[3], z2[0]); but(z1[1], z1[2], z2[2], z2[1]); but(z1[4], z1[6], z2[6], z2[4]); but(z1[7], z1[5], z2[5], z2[7]); /* Stage 3: */ z3[0] = z2[0]; z3[1] = z2[1]; z3[2] = z2[2]; z3[3] = z2[3]; rot(0, 3, z2[4], z2[7], &z3[4], &z3[7]); rot(0, 1, z2[5], z2[6], &z3[5], &z3[6]); /* Final stage 4: */ but(z3[0], z3[7], Y[7], Y[0]); but(z3[1], z3[6], Y[6], Y[1]); but(z3[2], z3[5], Y[5], Y[2]); but(z3[3], z3[4], Y[4], Y[3]); } ////////////////////////////////////////////////////////////// __attribute__ ((constructor)) void idct( unsigned int index ) ////////////////////////////////////////////////////////////// { mwmr_channel_t* input = iqzz_2_idct[index]; mwmr_channel_t* output = idct_2_libu[index]; int32_t row; int32_t column; int32_t block; int32_t bin[64]; uint8_t bout[64]; int32_t Y[64]; // get platform parameters uint32_t x_size; uint32_t y_size; uint32_t nprocs; giet_procs_number( &x_size , &y_size , &nprocs ); // get processor coordinates unsigned int x , y , p; giet_proc_xyp( &x ,&y , &p ); // private TTY allocation // giet_tty_alloc( 0 ); PRINTF("\n[MJPEG] thread IDCT[%d] starts on P[%d,%d,%d] / trdid = %x\n", index , x , y , p, (uint32_t)trdid_idct[index] ) uint32_t image = index; uint32_t nblocks = nblocks_h * nblocks_w; while( image < MAX_IMAGES ) // one image per iteration { for ( block = 0 ; block < nblocks ; block++ ) { uint32_t begin; // read obe block of coefficients (4 bytes per pixel) mwmr_read( input, (uint32_t*)bin , 64 ); for ( row = 0; row < 8 ; row++ ) { for ( column = 0 ; column < 8 ; column++ ) { Y[row * 8 + column] = bin[row * 8 + column] << S_BITS; } idct_1d( &Y[8*row] ); // Result Y is scaled up by factor sqrt(8)*2^S_BITS. } for ( column = 0 ; column < 8 ; column++ ) { int32_t Yc[8]; for ( row = 0 ; row < 8 ; row++ ) { Yc[row] = Y[8 * row + column]; } idct_1d( Yc ); for ( row = 0 ; row < 8 ; row++ ) { // Result is once more scaled up by a factor sqrt(8). int32_t r = 128 + descale(Yc[row], 2 * S_BITS); // Clip to 8 bits unsigned r = r > 0 ? (r < 255 ? r : 255) : 0; bout[8*row+column] = r; giet_pthread_assert( ((r & 0xFF) == r ) , "ERROR in idct() : pixel overflow" ); } } // write one block to output MWMR channel (one byte per pixel) mwmr_write( output, (uint32_t*)bout , 16 ); #if (DEBUG_IDCT > 1) if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) ) PRINTF("\nIDCT[%d] completes block %d/%d in image %d\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n" " %x %x %x %x %x %x %x %x\n", index , block , nblocks , image , bout[0] , bout[1] , bout[2] , bout[3] , bout[4] , bout[5] , bout[6] , bout[7] , bout[8] , bout[9] , bout[10], bout[11], bout[12], bout[13], bout[14], bout[15], bout[16], bout[17], bout[18], bout[19], bout[20], bout[21], bout[22], bout[23], bout[24], bout[25], bout[26], bout[27], bout[28], bout[29], bout[30], bout[31], bout[32], bout[33], bout[34], bout[35], bout[36], bout[37], bout[38], bout[39], bout[40], bout[41], bout[42], bout[43], bout[44], bout[45], bout[46], bout[47], bout[48], bout[49], bout[50], bout[51], bout[52], bout[53], bout[54], bout[55], bout[56], bout[57], bout[58], bout[59], bout[60], bout[61], bout[62], bout[63]) } } #endif } // end for blocks #if DEBUG_IDCT if ( (index == DEBUG_CLUSTER_INDEX) || (DEBUG_CLUSTER_INDEX == 0XFFFFFFFF) ) { PRINTF("\nIDCT[%d] completes image %d at cycle %d\n", index , image , giet_proctime() ) } #endif image = image + x_size*y_size; } // end while (1) on images giet_pthread_exit( "IDCT completed" ); } // end idct()