//////////////////////////////////////////////////////////////////////////////////////////
// File   : transpose.c
// Date   : september 2019
// author : Alain Greiner
//////////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application reads a raw image (one byte per pixel)
// stored on disk, transposes it, displays the result on the frame buffer,
// and stores the transposed image on disk.
// It can run on a multi-cores, multi-clusters architecture, with one thread
// per core, and uses the POSIX threads API.
// It uses the mmap() syscall to directly access the input and output files
// and the fbf_write() syscall to display the images.
//
// The main() function can be launched on any core[cxy,l].
// It makes the initialisations, launches (N-1) threads to run the execute() function
// on the (N-1) other cores, calls itself the execute() function, and finally calls
// the instrument() function to display instrumentation results when the parallel
// execution is completed. The placement of threads on the cores can be done
// automatically by the operating system, or can be done explicitly by the main thread
// (when the EXPLICIT_PLACEMENT global parameter is set).
//
// The buf_in[x,y] and buf_out[x,y] buffers containing the direct and transposed images
// are distributed in clusters: In each cluster[cxy], the thread running on core[cxy,0]
// maps the buf_in[cxy] and buf_out[cxy] buffers containing a subset of lines.
// Then, all threads in cluster[cxy] read pixels from the local buf_in[cxy] buffer, and
// write the pixels to all remote buf_out[cxy] buffers. Finally, each thread displays
// a part of the transposed image to the frame buffer.
//
// - The image must fit the frame buffer size, that must be a power of 2.
// - The number of clusters must be a power of 2 no larger than 256.
// - The number of cores per cluster must be a power of 2 no larger than 4.
// - The number of clusters cannot be larger than (IMAGE_SIZE * IMAGE_SIZE) / 4096, // because the size of buf_in[x,y] and buf_out[x,y] must be multiple of 4096. // ////////////////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #define X_MAX 16 // max number of clusters in row #define Y_MAX 16 // max number of clusters in column #define CORES_MAX 4 // max number of cores per cluster #define CLUSTERS_MAX (X_MAX * Y_MAX) // max number of clusters #define IMAGE_SIZE 256 // image size #define IMAGE_TYPE 420 // pixel encoding type #define INPUT_FILE_PATH "/misc/lena_256.raw" // input file pathname #define OUTPUT_FILE_PATH "/home/trsp_256.raw" // output file pathname #define USE_DQT_BARRIER 1 // quad-tree barrier if non zero #define EXPLICIT_PLACEMENT 1 // explicit thread placement #define VERBOSE 1 // print comments on TTY /////////////////////////////////////////////////////// // global variables /////////////////////////////////////////////////////// // instrumentation counters for each processor in each cluster unsigned int MMAP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; unsigned int MMAP_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; unsigned int TRSP_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; unsigned int DISP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; unsigned int DISP_END [CLUSTERS_MAX][CORES_MAX] = {{ 0 }}; // arrays of pointers on distributed buffers // one input buffer & one output buffer per cluster unsigned char * buf_in [CLUSTERS_MAX]; unsigned char * buf_out[CLUSTERS_MAX]; // synchronisation barrier (all threads) pthread_barrier_t barrier; // platform parameters unsigned int x_size; // number of clusters in a row unsigned int y_size; // number of clusters in a column unsigned int ncores; // number of processors per cluster // cluster identifier & local index of core running the main thread unsigned 
int cxy_main; unsigned int lid_main; // input & output file descriptors int fd_in; int fd_out; #if EXPLICIT_PLACEMENT // thread index allocated by the kernel pthread_t trdid[CLUSTERS_MAX][CORES_MAX]; // user defined continuous thread index unsigned int tid[CLUSTERS_MAX][CORES_MAX]; // thread attributes only used if explicit placement pthread_attr_t attr[CLUSTERS_MAX][CORES_MAX]; #else // thread index allocated by the kernel pthread_t trdid[CLUSTERS_MAX * CORES_MAX]; // user defined continuous thread index unsigned int tid[CLUSTERS_MAX * CORES_MAX]; #endif //return values at thread exit unsigned int THREAD_EXIT_SUCCESS = 0; unsigned int THREAD_EXIT_FAILURE = 1; //////////////////////////////////////////////////////////////// // functions declaration //////////////////////////////////////////////////////////////// void execute( unsigned int * ptid ); void instrument( void ); /////////// void main() { unsigned long long date; int error; printf("\n bloup 0\n"); // get identifiers for core executing main get_core_id( &cxy_main , &lid_main ); printf("\n bloup 1\n"); // get & check plat-form parameters get_config( &x_size , &y_size , &ncores ); printf("\n bloup 2\n"); if((ncores != 1) && (ncores != 2) && (ncores == 4)) { printf("\n[transpose error] number of cores per cluster must be 1/2/4\n"); exit( 0 ); } if( (x_size != 1) && (x_size != 2) && (x_size != 4) && (x_size != 8) && (x_size != 16) ) { printf("\n[transpose error] x_size must be 1/2/4/8/16\n"); exit( 0 ); } if( (y_size != 1) && (y_size != 2) && (y_size != 4) && (y_size != 8) && (y_size != 16) ) { printf("\n[transpose error] y_size must be 1/2/4/8/16\n"); exit( 0 ); } printf("\n bloup 3\n"); // compute number of threads unsigned int nclusters = x_size * y_size; unsigned int nthreads = nclusters * ncores; printf("\n bloup 4\n"); // get FBF ownership and FBF size unsigned int fbf_width; unsigned int fbf_height; unsigned int fbf_type; fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); printf("\n bloup 5\n"); 
if( (fbf_width != IMAGE_SIZE) || (fbf_height != IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) ) { printf("\n[transpose error] image does not fit FBF size or type\n"); exit( 0 ); } get_cycle( &date ); printf("\n[transpose] starts at cycle %d on %d cores / FBF = %d * %d pixels\n", (unsigned int)date , nthreads , fbf_width , fbf_height ); // open input file fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); // read-only if ( fd_in < 0 ) { printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH ); exit( 0 ); } #if VERBOSE printf("\n[transpose] main open file %s / fd = %d\n", INPUT_FILE_PATH , fd_in ); #endif // open output file fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); // create if required if ( fd_out < 0 ) { printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH ); exit( 0 ); } #if VERBOSE printf("\n[transpose] main open file %s / fd = %d\n", OUTPUT_FILE_PATH , fd_out ); #endif // initialise barrier if( USE_DQT_BARRIER ) { pthread_barrierattr_t attr; attr.x_size = x_size; attr.y_size = y_size; attr.nthreads = ncores; error = pthread_barrier_init( &barrier, &attr , nthreads ); } else { error = pthread_barrier_init( &barrier, NULL , nthreads ); } if( error ) { printf("\n[transpose error] main cannot initialize barrier\n" ); exit( 0 ); } get_cycle( &date ); printf("\n[transpose] main on core[%x,%d] completes initialisation at cycle %d\n" "- CLUSTERS = %d\n" "- PROCS = %d\n" "- THREADS = %d\n", cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads ); ////////////////////// #if EXPLICIT_PLACEMENT // main thread launch other threads unsigned int x; unsigned int y; unsigned int l; unsigned int cxy; for( x = 0 ; x < x_size ; x++ ) { for( y = 0 ; y < y_size ; y++ ) { cxy = HAL_CXY_FROM_XY( x , y ); for( l = 0 ; l < ncores ; l++ ) { // no other thread on the core running the main if( (cxy != cxy_main) || (l != lid_main) ) { // define thread attributes attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; 
attr[cxy][l].cxy = cxy; attr[cxy][l].lid = l; tid[cxy][l] = (((x * y_size) + y) * ncores) + l; // create thread on core[cxy,l] if (pthread_create( &trdid[cxy][l], &attr[cxy][l], &execute, &tid[cxy][l] ) ) { printf("\n[convol error] created thread %x on core[%x][%d]\n", trdid[cxy][l] , cxy , l ); exit( 0 ); } #if VERBOSE printf("\n[transpose] main created thread[%x,%d]\n", cxy, l ); #endif } } } } // main thread calls itself the execute() function execute( &tid[cxy_main][lid_main] ); // main thread wait other threads completion for( x = 0 ; x < x_size ; x++ ) { for( y = 0 ; y < y_size ; y++ ) { cxy = HAL_CXY_FROM_XY( x , y ); for( l = 0 ; l < ncores ; l++ ) { // no other thread on the core running the main if( (cxy != cxy_main) || (l != lid_main) ) { unsigned int * status; // wait thread[cxy][l] if( pthread_join( trdid[cxy][l] , (void*)(&status) ) ) { printf("\n[transpose error] main cannot join thread[%x,%d]\n", cxy, l ); exit( 0 ); } // check status if( *status != THREAD_EXIT_SUCCESS ) { printf("\n[transpose error] thread[%x,%d] returned failure\n", cxy, l ); exit( 0 ); } #if VERBOSE printf("\n[transpose] main joined thread[%x,%d]\n", cxy, l ); #endif } } } } /////////////////////////////// #else // no explicit placement // main thread launch other threads unsigned int n; for ( n = 1 ; n < nthreads ; n++ ) { tid[n] = n; if ( pthread_create( &trdid[n], NULL, // no attribute &execute, &tid[n] ) ) { printf("\n[transpose error] cannot create thread %d\n", n ); exit( 0 ); } #if VERBOSE printf("\n[transpose] main created thread %d\n", tid[n] ); #endif } // main thread calls itself the execute() function execute( &tid[0] ); // main thread wait other threads completion for ( n = 1 ; n < nthreads ; n++ ) { unsigned int * status; // main wait thread[n] status if ( pthread_join( trdid[n], (void*)(&status)) ) { printf("\n[transpose error] main cannot join thread %d\n", n ); exit( 0 ); } // check status if( *status != THREAD_EXIT_SUCCESS ) { printf("\n[transpose error] thread 
%x returned failure\n", n ); exit( 0 ); } #if VERBOSE printf("\n[transpose] main successfully joined thread %x\n", tid[n] ); #endif } #endif // instrumentation instrument(); // close input and output files close( fd_in ); close( fd_out ); // suicide exit( 0 ); } // end main() /////////////////////////////////// void execute( unsigned int * ptid ) { unsigned long long date; unsigned int l; // line index for loops unsigned int p; // pixel index for loops // get thread continuous index unsigned int my_tid = *ptid; // build total number of pixels per image unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE; // nuild total number of threads and clusters unsigned int nthreads = x_size * y_size * ncores; unsigned int nclusters = x_size * y_size; // get cluster continuous index and core index from tid // we use (tid == cid * ncores + lid) unsigned int cid = my_tid / ncores; // continuous index unsigned int lid = my_tid % ncores; // core local index // get cluster identifier from cid // we use (cid == x * y_size + y) unsigned int x = cid / y_size; // X cluster coordinate unsigned int y = cid % y_size; // Y cluster coordinate unsigned int cxy = HAL_CXY_FROM_XY(x,y); #if VERBOSE printf("\n[transpose] thread[%d] start on core[%x,%d]\n", my_tid , cxy , lid ); #endif // In each cluster cxy, thread[cxy,0] map input file // to buf_in[cxy] and map output file to buf_in[cxy] get_cycle( &date ); MMAP_START[cxy][lid] = (unsigned int)date; if ( lid == 0 ) { unsigned int length = npixels / nclusters; unsigned int offset = length * cid; // map buf_in buf_in[cid] = mmap( NULL, length, PROT_READ, MAP_SHARED, fd_in, offset ); if ( buf_in[cid] == NULL ) { printf("\n[transpose error] thread[%x,%d] cannot map input file\n", cxy, lid); pthread_exit( &THREAD_EXIT_FAILURE ); } #if VERBOSE printf("\n[transpose] thread[%x,%d] map input file / length %x / offset %x / buf_in %x\n", cxy, lid, length, offset, buf_in[cid] ); #endif // map buf_out buf_out[cid] = mmap( NULL, length, PROT_WRITE, MAP_SHARED, 
fd_out, offset ); if ( buf_out[cid] == NULL ) { printf("\n[transpose error] thread[%x,%d] cannot map output file\n", cxy, lid); pthread_exit( &THREAD_EXIT_FAILURE ); } #if VERBOSE printf("\n[transpose] thread[%x,%d] map output file / length %x / offset %x / buf_out %x\n", cxy, lid, length, offset, buf_out[cid] ); #endif } get_cycle( &date ); MMAP_END[cxy][lid] = (unsigned int)date; ///////////////////////////////// pthread_barrier_wait( &barrier ); // parallel transpose from buf_in to buf_out // each thread makes the transposition for nlt lines (nlt = IMAGE_SIZE/nthreads) // from line [tid*nlt] to line [(tid + 1)*nlt - 1] // (p,l) are the absolute pixel coordinates in the source image get_cycle( &date ); TRSP_START[cxy][lid] = (unsigned int)date; unsigned int nlt = IMAGE_SIZE / nthreads; // number of lines per thread unsigned int nlc = IMAGE_SIZE / nclusters; // number of lines per cluster unsigned int src_cluster; unsigned int src_index; unsigned int dst_cluster; unsigned int dst_index; unsigned char byte; unsigned int first = my_tid * nlt; // first line index for a given thread unsigned int last = first + nlt; // last line index for a given thread for ( l = first ; l < last ; l++ ) { // in each iteration we transfer one byte for ( p = 0 ; p < IMAGE_SIZE ; p++ ) { // read one byte from local buf_in src_cluster = l / nlc; src_index = (l % nlc) * IMAGE_SIZE + p; byte = buf_in[src_cluster][src_index]; // write one byte to remote buf_out dst_cluster = p / nlc; dst_index = (p % nlc) * IMAGE_SIZE + l; buf_out[dst_cluster][dst_index] = byte; } } #if VERBOSE printf("\n[transpose] thread[%x,%d] completes transposed\n", cxy, lid ); #endif get_cycle( &date ); TRSP_END[cxy][lid] = (unsigned int)date; ///////////////////////////////// pthread_barrier_wait( &barrier ); // parallel display from local buf_out to frame buffer // all threads contribute to display get_cycle( &date ); DISP_START[cxy][lid] = (unsigned int)date; unsigned int npt = npixels / nthreads; // number of 
pixels per thread if( fbf_write( &buf_out[cid][lid * npt], npt, npt * my_tid ) ) { printf("\n[transpose error] thread[%x,%d] cannot access FBF\n", cxy, lid ); pthread_exit( &THREAD_EXIT_FAILURE ); } #if VERBOSE printf("\n[transpose] thread[%x,%d] completes display\n", cxy, lid ); #endif get_cycle( &date ); DISP_END[cxy][lid] = (unsigned int)date; ///////////////////////////////// pthread_barrier_wait( &barrier ); // all threads, but thread[0,0,0], suicide if ( (cxy != cxy_main) || (lid != lid_main) ) { pthread_exit( &THREAD_EXIT_SUCCESS ); } } // end execute() /////////////////////// void instrument( void ) { unsigned int x, y, l; unsigned int min_load_start = 0xFFFFFFFF; unsigned int max_load_start = 0; unsigned int min_load_ended = 0xFFFFFFFF; unsigned int max_load_ended = 0; unsigned int min_trsp_start = 0xFFFFFFFF; unsigned int max_trsp_start = 0; unsigned int min_trsp_ended = 0xFFFFFFFF; unsigned int max_trsp_ended = 0; unsigned int min_disp_start = 0xFFFFFFFF; unsigned int max_disp_start = 0; unsigned int min_disp_ended = 0xFFFFFFFF; unsigned int max_disp_ended = 0; char string[64]; snprintf( string , 64 , "/home/transpose_%d_%d_%d" , x_size , y_size , ncores ); // open instrumentation file FILE * f = fopen( string , NULL ); if ( f == NULL ) { printf("\n[transpose error] cannot open instrumentation file %s\n", string ); exit( 0 ); } for (x = 0; x < x_size; x++) { for (y = 0; y < y_size; y++) { unsigned int cxy = HAL_CXY_FROM_XY( x , y ); for ( l = 0 ; l < ncores ; l++ ) { if (MMAP_START[cxy][l] < min_load_start) min_load_start = MMAP_START[cxy][l]; if (MMAP_START[cxy][l] > max_load_start) max_load_start = MMAP_START[cxy][l]; if (MMAP_END[cxy][l] < min_load_ended) min_load_ended = MMAP_END[cxy][l]; if (MMAP_END[cxy][l] > max_load_ended) max_load_ended = MMAP_END[cxy][l]; if (TRSP_START[cxy][l] < min_trsp_start) min_trsp_start = TRSP_START[cxy][l]; if (TRSP_START[cxy][l] > max_trsp_start) max_trsp_start = TRSP_START[cxy][l]; if (TRSP_END[cxy][l] < 
min_trsp_ended) min_trsp_ended = TRSP_END[cxy][l]; if (TRSP_END[cxy][l] > max_trsp_ended) max_trsp_ended = TRSP_END[cxy][l]; if (DISP_START[cxy][l] < min_disp_start) min_disp_start = DISP_START[cxy][l]; if (DISP_START[cxy][l] > max_disp_start) max_disp_start = DISP_START[cxy][l]; if (DISP_END[cxy][l] < min_disp_ended) min_disp_ended = DISP_END[cxy][l]; if (DISP_END[cxy][l] > max_disp_ended) max_disp_ended = DISP_END[cxy][l]; } } } printf( "\n ------ %s ------\n" , string ); fprintf( f , "\n ------ %s ------\n" , string ); printf( " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n", min_load_start, max_load_start, (min_load_start+max_load_start)/2, max_load_start-min_load_start ); fprintf( f , " - MMAP_START : min = %d / max = %d / med = %d / delta = %d\n", min_load_start, max_load_start, (min_load_start+max_load_start)/2, max_load_start-min_load_start ); printf( " - MMAP_END : min = %d / max = %d / med = %d / delta = %d\n", min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, max_load_ended-min_load_ended ); fprintf( f , " - MMAP_END : min = %d / max = %d / med = %d / delta = %d\n", min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, max_load_ended-min_load_ended ); printf( " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n", min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2, max_trsp_start-min_trsp_start ); fprintf( f , " - TRSP_START : min = %d / max = %d / med = %d / delta = %d\n", min_trsp_start, max_trsp_start, (min_trsp_start+max_trsp_start)/2, max_trsp_start-min_trsp_start ); printf( " - TRSP_END : min = %d / max = %d / med = %d / delta = %d\n", min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2, max_trsp_ended-min_trsp_ended ); fprintf( f , " - TRSP_END : min = %d / max = %d / med = %d / delta = %d\n", min_trsp_ended, max_trsp_ended, (min_trsp_ended+max_trsp_ended)/2, max_trsp_ended-min_trsp_ended ); printf( " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", 
min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2, max_disp_start-min_disp_start ); fprintf( f , " - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2, max_disp_start-min_disp_start ); printf( " - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2, max_disp_ended-min_disp_ended ); fprintf( f , " - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2, max_disp_ended-min_disp_ended ); fclose( f ); } // end instrument()