source: trunk/user/transpose/transpose.c @ 666

Last change on this file since 666 was 659, checked in by alain, 4 years ago

euh...

File size: 37.8 KB
RevLine 
[646]1//////////////////////////////////////////////////////////////////////////////////////////
2// File   : transpose.c   
3// Date   : september 2019
4// author : Alain Greiner
5//////////////////////////////////////////////////////////////////////////////////////////
6// This multi-threaded application reads a raw image (one byte per pixel)
[652]7// stored on disk, transposes it, displays the result on the frame buffer,
8// and stores the transposed image on disk.
[646]9//
[652]10// The image size and the pixel encoding type are defined by the IMAGE_SIZE and
11// IMAGE_TYPE global parameters.
[646]12//
[652]13// It can run on a multi-cores, multi-clusters architecture, where (X_SIZE * Y_SIZE)
14// is the number of clusters and NCORES the number of cores per cluster.
15// A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier,
16// (that is NOT required to be a continuous index), and lid is the local core index,
[657]17// (that must be in the [0,NCORES-1] range).
[646]18//
[652]19// The main() function can run on any core in any cluster. This main thread
[657]20// makes the initialisations, loads the input file to the "image_in" buffer,
21// launches the working threads, calls the instrument() function when all working
22// threads complete, and saves the result "image_out" buffer to the output file.
[646]23//
[657]24// The number of working threads is always defined by the number of cores available
[652]25// in the architecture, but this application supports three placement modes.
26// In all modes, the working threads are identified by the [tid] continuous index
27// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
28// This continuous index can always be decomposed in two continuous sub-indexes:
29// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
30//
31// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
32//   threads are created by the main thread, but the placement is done by the OS, using
33//   the DQDT for load balancing, and two working threads can be placed on the same core.
34//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
35//   cluster or a physical core. In this mode, the main thread runs on any cluster,
36//   but has tid = 0 (i.e. cid = 0 & lid = 0).
37//
38// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
39//   of the threads on the cores is explicitly controlled by the main thread to have
40//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
41//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
42//   physical cluster identifier, and [lid] is the local core index.
43//
44// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
45//   non standard pthread_parallel_create() function to avoid the costly sequential
46//   loops for pthread_create() and pthread_join(). It guarantees one working thread
47//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
48//   
[657]49// Each working thread[cid][lid] runs the "execute" function, that uses the "buf_in" and
50// "buf_out" local buffers, containing the direct and transposed images:
51// Each thread[cid][0] allocates two buf_in[cid] and buf_out[cid] buffers, loads from
52// "image_in" to buf_in[cid] all lines that must be handled by the threads sharing the
53// same cid, and finally saves from buf_out[cid] to "image_out" all lines that have been
54// transposed to buf_out[cid].
55// Each thread[cid][lid] in the group defined by the cid index reads pixels from the
56// local buf_in[cid] buffer, and writes pixels to all remote buf_out[cid] buffers.
[652]57//
[646]58// - The image  must fit the frame buffer size, that must be power of 2.
59// - The number of clusters  must be a power of 2 no larger than 256.
60// - The number of cores per cluster must be a power of 2 no larger than 4.
[652]61// - The number of threads cannot be larger than IMAGE_SIZE.
[646]62//
63//////////////////////////////////////////////////////////////////////////////////////////
64
65#include <sys/mman.h>
66#include <stdio.h>
67#include <stdlib.h>
68#include <unistd.h>
69#include <pthread.h>
70#include <string.h>
71#include <almosmkh.h>
72#include <fcntl.h>
73#include <hal_macros.h>
74
// Upper bounds on the platform size; they dimension the statically allocated arrays.
#define X_MAX                 16                           // max number of clusters in row
#define Y_MAX                 16                           // max number of clusters in column
#define CORES_MAX             4                            // max number of cores per cluster
#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters
#define THREADS_MAX           (X_MAX * Y_MAX * CORES_MAX)  // max number of threads

#define IMAGE_TYPE            420                          // pixel encoding type

// Image selection: exactly one (IMAGE_SIZE, INPUT_FILE_PATH, OUTPUT_FILE_PATH)
// triplet must be left uncommented below.

//#define IMAGE_SIZE            128                          // image size
//#define INPUT_FILE_PATH       "/misc/images_128.raw"       // input file pathname
//#define OUTPUT_FILE_PATH      "/misc/transposed_128.raw"   // output file pathname

//#define IMAGE_SIZE            256                          // image size
//#define INPUT_FILE_PATH       "/misc/lena_256.raw"         // input file pathname
//#define OUTPUT_FILE_PATH      "/misc/transposed_256.raw"   // output file pathname

//#define IMAGE_SIZE            512                          // image size
//#define INPUT_FILE_PATH       "/misc/couple_512.raw"       // input file pathname
//#define OUTPUT_FILE_PATH      "/misc/transposed_512.raw"   // output file pathname

#define IMAGE_SIZE            1024                         // image size
#define INPUT_FILE_PATH       "/misc/philips_1024.raw"     // input file pathname
#define OUTPUT_FILE_PATH      "/misc/transposed_1024.raw"  // output file pathname

// NOTE(review): no use of SAVE_RESULT_FILE is visible in main(), which writes the
// output file unconditionally - confirm whether this flag is still honored.
#define SAVE_RESULT_FILE      0                            // save result image on disk
#define USE_DQT_BARRIER       0                            // quad-tree barrier if non zero

// Placement mode: exactly one of the three flags below must be set to 1.
#define NO_PLACEMENT          0                            // uncontrolled thread placement
#define EXPLICIT_PLACEMENT    1                            // explicit threads placement
#define PARALLEL_PLACEMENT    0                            // parallel threads placement

// Reject an inconsistent placement selection at build time
// (main() performs the same check at run time).
#if (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1
#error "exactly one of NO_PLACEMENT / EXPLICIT_PLACEMENT / PARALLEL_PLACEMENT must be 1"
#endif

#define VERBOSE_MAIN          1                            // main function print comments
#define VERBOSE_MAIN_DETAILED 0                            // main function per-thread create/join traces
#define VERBOSE_EXEC          1                            // exec function print comments
[652]109
110
[646]111///////////////////////////////////////////////////////
112//                global variables
113///////////////////////////////////////////////////////
114
[652]115// Global instrumentation counters for the main thread (in cycles):
// SEQUENCIAL_TIME is set by main() just before the working threads are launched,
// PARALLEL_TIME is set by main() once the parallel phase has returned.
116unsigned int SEQUENCIAL_TIME = 0;
117unsigned int PARALLEL_TIME   = 0;
118
119// Instrumentation counters for each thread in each cluster,
120// indexed by [cid][lid] : cluster continuous index / thread local index.
// Each entry is written by execute() with the (truncated) date returned by get_cycle()
// at the start / end of the corresponding phase.
// NOTE(review): the code writing SAVE_END and FREE_* is not visible here - confirm.
[657]121unsigned int ALOC_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
122unsigned int ALOC_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[652]123unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
124unsigned int LOAD_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[646]125unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
126unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[657]127unsigned int SAVE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
128unsigned int SAVE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
129unsigned int FREE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
130unsigned int FREE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[646]131
[657]132// Buffer containing the input image (one byte per pixel),
// filled by main() from the input file.
133unsigned char  image_in[IMAGE_SIZE * IMAGE_SIZE];
[646]134
[657]135// Buffer containing the output image, written by main() to the output file.
136unsigned char  image_out[IMAGE_SIZE * IMAGE_SIZE];
[652]137
[657]138// Arrays of pointers on the distributed buffers, indexed by [cid].
// Entry [cid] is allocated (malloc) in execute() by the thread whose lid is 0.
139unsigned char *  buf_in [CLUSTERS_MAX];
140unsigned char *  buf_out[CLUSTERS_MAX];
[652]141
[657]142// Buffer pointer and window identifier for the FBF window
// created by main() with fbf_create_window().
143void   *  win_buf;
144int       wid;
145
[652]146// Synchronisation barrier (all working threads), initialised by main() for nthreads.
[646]147pthread_barrier_t   barrier;
148
149// Platform parameters, copied by main() from the structure filled by get_config().
[652]150unsigned int  x_size;              // number of clusters in a row
151unsigned int  y_size;              // number of clusters in a column
152unsigned int  ncores;              // number of cores per cluster
[646]153
[652]154// Continuous index [tid] of the main thread when it is itself a working thread
// (set by main() in the NO_PLACEMENT and EXPLICIT_PLACEMENT modes).
155unsigned int     tid_main; 
[646]156
[652]157// Return values at thread exit: the address of one of these variables is passed
// to pthread_exit(), and main() compares the joined status against THREAD_EXIT_SUCCESS.
158unsigned int THREAD_EXIT_SUCCESS = 0;
159unsigned int THREAD_EXIT_FAILURE = 1;
[646]160
[652]161// Array of kernel thread identifiers / indexed by [tid]
162pthread_t                     exec_trdid[THREADS_MAX];   
[646]163
[652]164// Array of execute() function arguments / indexed by [tid]
165pthread_parallel_work_args_t  exec_args[THREADS_MAX];
[646]166
[652]167// Array of thread attributes / indexed by [tid]
// (used by main() in EXPLICIT_PLACEMENT mode to pin each thread on a core)
168pthread_attr_t                exec_attr[THREADS_MAX];
[646]169
170////////////////////////////////////////////////////////////////
171//             functions declaration
172////////////////////////////////////////////////////////////////
173
// Working thread entry point: <arguments> is cast to a
// pthread_parallel_work_args_t pointer, whose <tid> field identifies the thread.
[656]174void * execute( void * arguments );
[646]175
// Registers the instrumentation results; called by main() with the open
// instrumentation stream <f> and its <filename>. Definition not shown in this section.
[652]176void instrument( FILE * f , char * filename );
[646]177
[656]178////////////////
179int main( void )
[646]180{
[652]181    unsigned long long start_cycle;
182    unsigned long long end_sequencial_cycle;
183    unsigned long long end_parallel_cycle;
[646]184
[652]185    char               filename[32];      // instrumentation file name
186    char               pathname[64];      // instrumentation file pathname
187
[646]188    int error;
189
[652]190    /////////////////////////////////////////////////////////////////////////////////
191    get_cycle( &start_cycle );
192    /////////////////////////////////////////////////////////////////////////////////
[646]193
[652]194    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
195    {
196        printf("\n[transpose error] illegal placement\n");
197        exit( 0 );
198    }
[646]199
200    // get & check plat-form parameters
[659]201    hard_config_t  config;
202    get_config( &config );
203    x_size = config.x_size;
204    y_size = config.y_size;
205    ncores = config.ncores;
[646]206
[652]207    if((ncores != 1) && (ncores != 2) && (ncores != 4))
[646]208    {
209        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
210        exit( 0 );
211    }
212
213    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
214        (x_size != 8) && (x_size != 16) )
215    {
216        printf("\n[transpose error] x_size must be 1/2/4/8/16\n");
217        exit( 0 );
218    }
219       
220    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
221        (y_size != 8) && (y_size != 16) )
222    {
223        printf("\n[transpose error] y_size must be 1/2/4/8/16\n");
224        exit( 0 );
225    }
226       
[657]227    // get identifiers for core executing main
[652]228    unsigned int  cxy_main;
229    unsigned int  lid_main;
230    get_core_id( &cxy_main , &lid_main );
[646]231
232    // compute number of threads
233    unsigned int nclusters = x_size * y_size;
234    unsigned int nthreads  = nclusters * ncores;
235
[657]236    if( nthreads > IMAGE_SIZE )
237    {
238        printf("\n[transpose error] number of threads larger than number of lines\n");
239        exit( 0 );
240    }
241
242    // get FBF size and type
[646]243    unsigned int   fbf_width;
244    unsigned int   fbf_height;
245    unsigned int   fbf_type;
246    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
247
[657]248    if( (fbf_width < IMAGE_SIZE) || (fbf_height < IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) )
[646]249    {
250        printf("\n[transpose error] image does not fit FBF size or type\n");
251        exit( 0 );
252    }
253
[657]254    // define total number of pixels
255    int npixels = IMAGE_SIZE * IMAGE_SIZE;
[646]256
[652]257    // define instrumentation file name
258    if( NO_PLACEMENT )
259    {
[657]260        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / NO_PLACE\n",
261        nclusters, ncores, INPUT_FILE_PATH , getpid() );
[652]262
263        // build instrumentation file name
264        if( USE_DQT_BARRIER )
265        snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d",
266        IMAGE_SIZE , x_size * y_size , ncores );
267        else
268        snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d",
269        IMAGE_SIZE , x_size * y_size , ncores );
270    }
271
272    if( EXPLICIT_PLACEMENT )
273    {
[657]274        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / EXPLICIT\n",
275        nclusters, ncores, INPUT_FILE_PATH , getpid() );
[652]276
277        // build instrumentation file name
278        if( USE_DQT_BARRIER )
279        snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d",
280        IMAGE_SIZE , x_size * y_size , ncores );
281        else
282        snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d",
283        IMAGE_SIZE , x_size * y_size , ncores );
284    }
285
286    if( PARALLEL_PLACEMENT )
287    {
[657]288        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / PARALLEL\n",
289        nclusters, ncores, INPUT_FILE_PATH , getpid() );
[652]290
291        // build instrumentation file name
292        if( USE_DQT_BARRIER )
293        snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d",
294        IMAGE_SIZE , x_size * y_size , ncores );
295        else
296        snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d",
297        IMAGE_SIZE , x_size * y_size , ncores );
298    }
299
[657]300    // open a window in FBF
301    wid = fbf_create_window( 0,             // l_zero
302                             0,             // p_zero
303                             IMAGE_SIZE,    // lines
304                             IMAGE_SIZE,    // pixels
305                             &win_buf );
306    if( wid < 0) 
307    {
308        printf("\n[transpose error] cannot open FBF window\n");
309        exit( 0 );
310    }
311
312#if  VERBOSE_MAIN
313printf("\n[transpose] main on core[%x,%d] created FBF window %d / buffer %x\n",
314cxy_main, lid_main, wid , win_buf );
315#endif
316
[652]317    // open instrumentation file
318    snprintf( pathname , 64 , "/home/%s", filename );
319    FILE * f = fopen( pathname , NULL );
[657]320
[652]321    if ( f == NULL ) 
[646]322    { 
[657]323        printf("\n[transpose error] cannot open instru file %s\n", pathname );
[646]324        exit( 0 );
325    }
326
[652]327#if  VERBOSE_MAIN
328printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n",
329cxy_main, lid_main, pathname );
[646]330#endif
331
[652]332    // main thread initializes barrier
[646]333    if( USE_DQT_BARRIER )
334    {
335        pthread_barrierattr_t attr;
336        attr.x_size   = x_size;
337        attr.y_size   = y_size;
338        attr.nthreads = ncores;
339        error = pthread_barrier_init( &barrier, &attr , nthreads );
340    }
341    else
342    {
343        error = pthread_barrier_init( &barrier, NULL , nthreads );
344    }
345
346    if( error )
347    { 
348        printf("\n[transpose error] main cannot initialize barrier\n" );
349        exit( 0 );
350    }
351
[652]352#if  VERBOSE_MAIN
[657]353printf("\n[transpose] main on core[%x,%d] completed barrier initialisation\n",
[652]354cxy_main, lid_main );
355#endif
[646]356
[657]357    // open input file
[652]358    int fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); 
[646]359
[652]360    if ( fd_in < 0 ) 
361    { 
362        printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
363        exit( 0 );
364    }
365
366#if  VERBOSE_MAIN
367printf("\n[transpose] main open file <%s> / fd = %d\n", INPUT_FILE_PATH , fd_in );
368#endif
369
[657]370    // open output file
[652]371    int fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); 
372
373    if ( fd_out < 0 ) 
374    { 
375        printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
376        exit( 0 );
377    }
378
[657]379    // move input image to input buffer
380    if( read( fd_in , image_in , npixels ) != npixels )
381    {
382        printf("\n[transpose error] main cannot read input image\n");
[652]383        exit( 0 );
384    }
385
386#if  VERBOSE_MAIN
[657]387printf("\n[transpose] main moved file <%s> to buf_in\n", INPUT_FILE_PATH );
[652]388#endif
389
390    /////////////////////////////////////////////////////////////////////////////////////
391    get_cycle( &end_sequencial_cycle );
392    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
393    /////////////////////////////////////////////////////////////////////////////////////
394
395    //////////////////
396    if( NO_PLACEMENT )
397    {
398        // the tid value for the main thread is always 0
399        // main thread creates new threads with tid in [1,nthreads-1] 
400        unsigned int tid;
401        for ( tid = 0 ; tid < nthreads ; tid++ )
[646]402        {
[652]403            // register tid value in exec_args[tid] array
404            exec_args[tid].tid = tid;
405           
406            // create other threads
407            if( tid > 0 )
[646]408            {
[652]409                if ( pthread_create( &exec_trdid[tid], 
410                                     NULL,                  // no attribute
411                                     &execute,
412                                     &exec_args[tid] ) ) 
[646]413                {
[652]414                    printf("\n[transpose error] cannot create thread %d\n", tid );
415                    exit( 0 );
416                }
[646]417
[657]418#if VERBOSE_MAIN_DETAILED
[652]419printf("\n[transpose] main created thread %d\n", tid );
[646]420#endif
[652]421
[646]422            }
[652]423            else
424            {
425                tid_main = 0;
426            }
427        }  // end for tid
[646]428
[652]429        // main thread calls itself the execute() function
430        execute( &exec_args[0] );
[646]431
[652]432        // main thread wait other threads completion
433        for ( tid = 1 ; tid < nthreads ; tid++ )
434        {
435            unsigned int * status;
436
437            // main wait thread[tid] status
438            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
439            {
440                printf("\n[transpose error] main cannot join thread %d\n", tid );
441                exit( 0 );
442            }
443       
444            // check status
445            if( *status != THREAD_EXIT_SUCCESS )
446            {
447                printf("\n[transpose error] thread %x returned failure\n", tid );
448                exit( 0 );
449            }
450
[657]451#if VERBOSE_MAIN_DETAILED
452printf("\n[transpose] main joined thread %x\n", tid );
[652]453#endif
454       
455        }  // end for tid
456
457    }  // end if no_placement
458
459    ////////////////////////
460    if( EXPLICIT_PLACEMENT )
[646]461    {
[652]462        // main thread places each other threads on a specific core[cxy][lid]
463        // but the actual thread creation is sequencial
464        unsigned int x;
465        unsigned int y;
466        unsigned int l;
467        unsigned int cxy;                   // cluster identifier
468        unsigned int tid;                   // thread continuous index
469
470        for( x = 0 ; x < x_size ; x++ )
[646]471        {
[652]472            for( y = 0 ; y < y_size ; y++ )
[646]473            {
[652]474                cxy = HAL_CXY_FROM_XY( x , y );
475                for( l = 0 ; l < ncores ; l++ )
[646]476                {
[652]477                    // compute thread continuous index
478                    tid = (((* y_size) + y) * ncores) + l;
[646]479
[652]480                    // register tid value in exec_args[tid] array
481                    exec_args[tid].tid = tid;
482
483                    // no thread created on the core running the main
484                    if( (cxy != cxy_main) || (l != lid_main) )
[646]485                    {
[652]486                        // define thread attributes
487                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
488                                                    PT_ATTR_CORE_DEFINED;
489                        exec_attr[tid].cxy        = cxy;
490                        exec_attr[tid].lid        = l;
491 
492                        // create thread[tid] on core[cxy][l]
493                        if ( pthread_create( &exec_trdid[tid],   
494                                             &exec_attr[tid],   
495                                             &execute,
496                                             &exec_args[tid] ) )       
497                        {
498                            printf("\n[transpose error] cannot create thread %d\n", tid );
499                            exit( 0 );
500                        }
[657]501
502#if VERBOSE_MAIN_DETAILED
[652]503printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
504#endif
[646]505                    }
[652]506                    else
[646]507                    {
[652]508                        tid_main = tid;
[646]509                    }
510                }
511            }
512        }
513
[652]514        // main thread calls itself the execute() function
515        execute( &exec_args[tid_main] );
[646]516
[652]517        // main thread wait other threads completion
518        for( tid = 0 ; tid < nthreads ; tid++ )
[646]519        {
[652]520            // no other thread on the core running the main
521            if( tid != tid_main )
522            {
523                unsigned int * status;
[646]524
[652]525                // wait thread[tid]
526                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
527                {
528                    printf("\n[transpose error] main cannot join thread %d\n", tid );
529                    exit( 0 );
530                }
531       
532                // check status
533                if( *status != THREAD_EXIT_SUCCESS )
534                {
535                    printf("\n[transpose error] thread %d returned failure\n", tid );
536                    exit( 0 );
537                }
[657]538
539#if VERBOSE_MAIN_DETAILED
540printf("\n[transpose] main joined thread %d\n", tid );
[646]541#endif
[652]542            }
543        }
544    }  // end if explicit_placement
[646]545
[652]546    ////////////////////////
547    if( PARALLEL_PLACEMENT )
[646]548    {
[652]549        // compute covering DQT size an level
550        unsigned int z          = (x_size > y_size) ? x_size : y_size;
551        unsigned int root_level = ((z == 1) ? 0 : 
552                                  ((z == 2) ? 1 : 
553                                  ((z == 4) ? 2 : 
554                                  ((z == 8) ? 3 : 4))));
[646]555
[652]556        // create & execute the working threads
557        if( pthread_parallel_create( root_level , &execute ) )
[646]558        {
[652]559            printf("\n[transpose error] in %s\n", __FUNCTION__ );
[646]560            exit( 0 );
561        }
[652]562    }  // end if parallel_placement
[646]563
564
[652]565    /////////////////////////////////////////////////////////////////////////////
566    get_cycle( &end_parallel_cycle );
567    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
568    /////////////////////////////////////////////////////////////////////////////
[646]569
[657]570    // register instrumentation results
[652]571    instrument( f , filename );
[646]572
[657]573#if VERBOSE_MAIN
574printf("\n[transpose] main completed instrumentation\n");
575#endif
576
577/*
578    printf("\n> ");
579    getchar();
580
581    // move window
582    if( fbf_move_window( wid , 100 , 100 ) )
583    {
584        printf("\n[transpose error] main cannot move FBF window\n");
585        exit( 0 );
586    }
587
588    printf("\n> ");
589    getchar();
590*/   
591    // save image_out to output file
592    if( write( fd_out , image_out , npixels ) != npixels )
593    {
594        printf("\n[transpose error] main cannot write output image\n");
595        exit( 0 );
596    }
597
598#if VERBOSE_MAIN
599printf("\n[transpose] main saved buf_out to output file\n");
600#endif
601
602    // close input file
[646]603    close( fd_in );
[652]604
[657]605#if VERBOSE_MAIN
606printf("\n[transpose] main closed input file\n");
607#endif
[652]608
[657]609    // close output file
[646]610    close( fd_out );
611
[657]612#if VERBOSE_MAIN
613printf("\n[transpose] main closed output file\n");
[652]614#endif
615
[657]616    // close instrumentation file
[652]617    fclose( f );
618
[657]619#if VERBOSE_MAIN
620printf("\n[transpose] main closed instrumentation file\n");
621#endif
622
623    // delete FBF window
624    if( fbf_delete_window( wid ) )
625    {
626        printf("\n[transpose error] main cannot delete FBF window\n");
627        exit( 0 );
628    }
629
[652]630    // main thread suicide
[646]631    exit( 0 );
632   
[656]633    return 0;
634
[646]635} // end main()
636
637
638
[652]639
[656]640//////////////////////////////////
641void * execute( void * arguments ) 
[646]642{
643    unsigned long long   date;
[657]644    unsigned int         l;         // line index for loop
645    unsigned int         p;         // pixel index for loop
646    int                  error;
647
648    unsigned char      * wbuf = win_buf;
[646]649 
[656]650    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;
651
[652]652    // WARNING
653    //A thread is identified by the tid index, defined in the "args" structure.
654    // This index being in range [0,nclusters*ncores-1] we can always write
655    //       tid == cid * ncores + lid
656    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
657    // if NO_PLACEMENT, there is no relation between these
658    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
[646]659
[652]660    // get thread abstract identifiers
661    unsigned int tid = args->tid;
[657]662    unsigned int cid = tid / ncores;    // abstract cluster index
663    unsigned int lid = tid % ncores;    // local thread index
[652]664
665#if VERBOSE_EXEC
666unsigned int cxy;
667unsigned int lpid;
668get_core_id( &cxy , &lpid );   // get core physical identifiers
[657]669#endif
670
671#if VERBOSE_EXEC
[652]672printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n",
673tid , cxy , lpid );
674#endif
675
676    get_cycle( &date );
[657]677    ALOC_START[cid][lid] = (unsigned int)date;
[652]678
[657]679    // compute total number of pixels per image
[646]680    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;     
681
[657]682    // compute total number of threads and clusters
[646]683    unsigned int nclusters = x_size * y_size;
[652]684    unsigned int nthreads  = nclusters * ncores;
[646]685
[657]686    // compute number of pixels per cid & per thread
687    unsigned int pixels_per_cid = npixels / nclusters;
688    unsigned int pixels_per_lid = pixels_per_cid / ncores;
[646]689
[657]690    // compute first and last line per thread
691    unsigned int lines_per_cid = pixels_per_cid / IMAGE_SIZE;
692    unsigned int lines_per_lid = pixels_per_lid / IMAGE_SIZE;
[646]693
[657]694    unsigned int line_first = (cid * lines_per_cid) + (lid * lines_per_lid);
695    unsigned int line_last  = line_first + lines_per_lid;
696
697    // Each thread[cid,0] allocates two local buffers, and register the base
698    // adresses in the global variable buf_in_ptr[cid] & buf_out_ptr[cid].
699   
[652]700    if( lid == 0 )
[646]701    {
[652]702        // allocate buf_in
[657]703        buf_in[cid] = (unsigned char *)malloc( pixels_per_cid );
[646]704
[657]705        if( buf_in[cid] == NULL )
[646]706        {
[652]707            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
[646]708            pthread_exit( &THREAD_EXIT_FAILURE );
709        }
[652]710
711#if VERBOSE_EXEC
712printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n",
713tid , cxy , lpid , buf_in );
[646]714#endif
715
[652]716        // allocate buf_out
[657]717        buf_out[cid] = (unsigned char *)malloc( pixels_per_cid );
[652]718
[657]719        if( buf_out[cid] == NULL )
[646]720        {
[652]721            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
[646]722            pthread_exit( &THREAD_EXIT_FAILURE );
723        }
[652]724
725#if VERBOSE_EXEC
726printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n",
727tid , cxy , lpid , buf_out );
[646]728#endif
729
730    }
[657]731
[646]732    get_cycle( &date );
[657]733    ALOC_END[cid][lid] = (unsigned int)date;
734
735    /////////////////////////////////
736    pthread_barrier_wait( &barrier );
737    /////////////////////////////////
738
739    get_cycle( &date );
740    LOAD_START[cid][lid] = (unsigned int)date;
741
742    // all threads copy relevant part of the image_in to buf_in[cid]
743    memcpy( buf_in[cid] + (lid * pixels_per_lid), 
744            image_in + (cid * pixels_per_cid) + (lid * pixels_per_lid),
745            pixels_per_lid );
746
747#if VERBOSE_EXEC
748printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n",
749tid , cxy , lpid , cid );
750#endif
751
752    // all local threads copy part of buf_in[cid] to FBF window for display
753    memcpy( wbuf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
754            buf_in[cid] + (lid * pixels_per_lid),
755            pixels_per_lid );
756
757#if  VERBOSE_EXEC
758printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in to FBF (first %d / last %d)\n",
759tid , cxy , lpid , line_first , line_last );
760#endif
761
762    // retresh window
763    error = fbf_refresh_window( wid , line_first , line_last );
764
765    if( error )
766    {
767        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
768        exit( 0 );
769    }
770
771    get_cycle( &date );
[652]772    LOAD_END[cid][lid] = (unsigned int)date;
[646]773
774    /////////////////////////////////
775    pthread_barrier_wait( &barrier );
[657]776    /////////////////////////////////
[646]777
[652]778    get_cycle( &date );
779    TRSP_START[cid][lid] = (unsigned int)date;
780
[657]781    // All threads contribute to parallel transpose from buf_in to buf_out:
[652]782    // each thread makes the transposition for nlt lines (nlt = npixels/nthreads)
[646]783    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
784    // (p,l) are the absolute pixel coordinates in the source image
[657]785    // (l,p) are the absolute pixel coordinates in the dest image
[646]786
787    unsigned int nlt   = IMAGE_SIZE / nthreads;    // number of lines per thread
788    unsigned int nlc   = IMAGE_SIZE / nclusters;   // number of lines per cluster
789
[652]790    unsigned int src_cid;
[646]791    unsigned int src_index;
[652]792    unsigned int dst_cid;
[646]793    unsigned int dst_index;
794
795    unsigned char byte;
796
[657]797    unsigned int first = tid * nlt;        // first line index for a given thread
[646]798    unsigned int last  = first + nlt;      // last line index for a given thread
799
[652]800    // loop on lines handled by this thread
[646]801    for ( l = first ; l < last ; l++ )
802    {
[652]803        // loop on pixels in one line (one pixel per iteration)
[646]804        for ( p = 0 ; p < IMAGE_SIZE ; p++ )
805        {
806            // read one byte from local buf_in
[652]807            src_cid   = l / nlc;
808            src_index = (l % nlc) * IMAGE_SIZE + p;
[646]809
[657]810            byte = buf_in[src_cid][src_index];
[652]811
[646]812            // write one byte to remote buf_out
[652]813            dst_cid   = p / nlc; 
814            dst_index = (p % nlc) * IMAGE_SIZE + l;
[646]815
[657]816            buf_out[dst_cid][dst_index] = byte;
[646]817        }
818    }
819
[652]820#if VERBOSE_EXEC
821printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n",
822tid , cxy , lpid );
[646]823#endif
824
825    get_cycle( &date );
[652]826    TRSP_END[cid][lid] = (unsigned int)date;
[646]827
828    /////////////////////////////////
829    pthread_barrier_wait( &barrier );
[657]830    /////////////////////////////////
[646]831
832    get_cycle( &date );
[657]833    SAVE_START[cid][lid] = (unsigned int)date;
[646]834
[657]835    // each local threads copy part of buf_out[cid] to FBF window for display
836    memcpy( wbuf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
837            buf_out[cid] + (lid * pixels_per_lid),
838            pixels_per_lid );
[646]839
[657]840#if  VERBOSE_EXEC
841printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_out to FBF (first %d / last %d)\n",
842tid , cxy , lpid , line_first , line_last );
843#endif
844
845    // refresh window
846    error = fbf_refresh_window( wid , line_first , line_last );
847
848    if( error )
[646]849    {
[657]850        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
851        exit( 0 );
[646]852    }
853
[657]854    // each local thread copy relevant part of buf_out to image_out
855    memcpy( image_out + (cid * pixels_per_cid) + (lid * pixels_per_lid),
856            buf_out[cid] + (lid * pixels_per_lid),
857            pixels_per_lid );
858
[652]859#if VERBOSE_EXEC
[657]860printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n",
861tid , cxy , lpid , cid );
[646]862#endif
863
864    get_cycle( &date );
[657]865    SAVE_END[cid][lid] = (unsigned int)date;
[646]866
867    /////////////////////////////////
868    pthread_barrier_wait( &barrier );
[657]869    /////////////////////////////////
[646]870
[657]871    get_cycle( &date );
872    FREE_START[cid][lid] = (unsigned int)date;
[652]873
[657]874    // Each thread[cid,0] release local buffers buf_in & buf_out
875
[652]876    if( lid == 0 )
877    {
[657]878        // release local buffers
879        free( buf_in[cid] );
880        free( buf_out[cid] );
[652]881
882#if VERBOSE_EXEC
[657]883printf("\n[transpose] exec[%d] on core[%x,%d] released buf_in & buf_out\n",
884tid , cxy , lpid );
[652]885#endif
886
[657]887    }
[652]888
[657]889    get_cycle( &date );
890    FREE_END[cid][lid] = (unsigned int)date;
891
892    /////////////////////////////////
893    pthread_barrier_wait( &barrier );
894    /////////////////////////////////
[652]895   
896    // thread termination depends on the placement policy
897    if( PARALLEL_PLACEMENT )   
898    {
[656]899        // <work> threads are running in detached mode, and
[652]900        // each thread must signal completion by calling barrier
901        // passed in arguments before exit
902
903        pthread_barrier_wait( args->barrier );
904
[646]905        pthread_exit( &THREAD_EXIT_SUCCESS );
906    }
[652]907    else
908    {
909        // <work> threads are running in attached mode
910        // each thread, except the main one, simply exits
[657]911        if ( tid != tid_main ) 
912        {
913
914#if VERBOSE_EXEC
915printf("\n[transpose] exec[%d] on core[%x,%d] exit\n",
916tid , cxy , lpid );
917#endif
918            pthread_exit( &THREAD_EXIT_SUCCESS );
919        }
[652]920    }
[646]921
[656]922    return NULL;
923
[646]924} // end execute()
925
926
927
[657]928//////////////////////////
[652]929void instrument( FILE * f,
930                 char * filename )
[646]931{
[657]932    unsigned int cid;
933    unsigned int l;
[646]934
[657]935    unsigned int min_aloc_start = 0xFFFFFFFF;
936    unsigned int max_aloc_start = 0;
937    unsigned int min_aloc_ended = 0xFFFFFFFF;
938    unsigned int max_aloc_ended = 0;
[646]939    unsigned int min_load_start = 0xFFFFFFFF;
940    unsigned int max_load_start = 0;
941    unsigned int min_load_ended = 0xFFFFFFFF;
942    unsigned int max_load_ended = 0;
943    unsigned int min_trsp_start = 0xFFFFFFFF;
944    unsigned int max_trsp_start = 0;
945    unsigned int min_trsp_ended = 0xFFFFFFFF;
946    unsigned int max_trsp_ended = 0;
[657]947    unsigned int min_save_start = 0xFFFFFFFF;
948    unsigned int max_save_start = 0;
949    unsigned int min_save_ended = 0xFFFFFFFF;
950    unsigned int max_save_ended = 0;
951    unsigned int min_free_start = 0xFFFFFFFF;
952    unsigned int max_free_start = 0;
953    unsigned int min_free_ended = 0xFFFFFFFF;
954    unsigned int max_free_ended = 0;
[646]955 
[657]956    for (cid = 0; cid < (x_size * y_size) ; cid++)
[646]957    {
[657]958        for ( l = 0 ; l < ncores ; l++ )
[646]959        {
[657]960            if (ALOC_START[cid][l] < min_aloc_start)  min_aloc_start = ALOC_START[cid][l];
961            if (ALOC_START[cid][l] > max_aloc_start)  max_aloc_start = ALOC_START[cid][l];
962            if (ALOC_END[cid][l]   < min_aloc_ended)  min_aloc_ended = ALOC_END[cid][l]; 
963            if (ALOC_END[cid][l]   > max_aloc_ended)  max_aloc_ended = ALOC_END[cid][l];
964            if (LOAD_START[cid][l] < min_load_start)  min_load_start = LOAD_START[cid][l];
965            if (LOAD_START[cid][l] > max_load_start)  max_load_start = LOAD_START[cid][l];
966            if (LOAD_END[cid][l]   < min_load_ended)  min_load_ended = LOAD_END[cid][l]; 
967            if (LOAD_END[cid][l]   > max_load_ended)  max_load_ended = LOAD_END[cid][l];
968            if (TRSP_START[cid][l] < min_trsp_start)  min_trsp_start = TRSP_START[cid][l];
969            if (TRSP_START[cid][l] > max_trsp_start)  max_trsp_start = TRSP_START[cid][l];
970            if (TRSP_END[cid][l]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cid][l];
971            if (TRSP_END[cid][l]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cid][l];
972            if (SAVE_START[cid][l] < min_save_start)  min_save_start = SAVE_START[cid][l];
973            if (SAVE_START[cid][l] > max_save_start)  max_save_start = SAVE_START[cid][l];
974            if (SAVE_END[cid][l]   < min_save_ended)  min_save_ended = SAVE_END[cid][l];
975            if (SAVE_END[cid][l]   > max_save_ended)  max_save_ended = SAVE_END[cid][l];
976            if (FREE_START[cid][l] < min_free_start)  min_free_start = FREE_START[cid][l];
977            if (FREE_START[cid][l] > max_free_start)  max_free_start = FREE_START[cid][l];
978            if (FREE_END[cid][l]   < min_free_ended)  min_free_ended = FREE_END[cid][l];
979            if (FREE_END[cid][l]   > max_free_ended)  max_free_ended = FREE_END[cid][l];
[646]980        }
981    }
982
[652]983    printf( "\n ------ %s ------\n" , filename );
984    fprintf( f , "\n ------ %s ------\n" , filename );
[646]985
[657]986    printf( " - ALOC_START : min = %d / max = %d / delta = %d\n",
987           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
988    fprintf( f , " - ALOC_START : min = %d / max = %d / delta = %d\n",
989           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
990
991    printf( " - ALOC_END   : min = %d / max = %d / delta = %d\n",
992           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
993    fprintf( f , " - ALOC_END   : min = %d / max = %d / delta = %d\n",
994           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
995
[652]996    printf( " - LOAD_START : min = %d / max = %d / delta = %d\n",
997           min_load_start, max_load_start, max_load_start-min_load_start ); 
998    fprintf( f , " - LOAD_START : min = %d / max = %d / delta = %d\n",
999           min_load_start, max_load_start, max_load_start-min_load_start ); 
[646]1000
[652]1001    printf( " - LOAD_END   : min = %d / max = %d / delta = %d\n",
1002           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
1003    fprintf( f , " - LOAD_END   : min = %d / max = %d / delta = %d\n",
1004           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
[646]1005
[652]1006    printf( " - TRSP_START : min = %d / max = %d / delta = %d\n",
1007           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
1008    fprintf( f , " - TRSP_START : min = %d / max = %d / delta = %d\n",
1009           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
[646]1010
[652]1011    printf( " - TRSP_END   : min = %d / max = %d / delta = %d\n",
1012           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
1013    fprintf( f , " - TRSP_END   : min = %d / max = %d / delta = %d\n",
1014           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
[646]1015
[657]1016    printf( " - SAVE_START : min = %d / max = %d / delta = %d\n",
1017           min_save_start, max_save_start, max_save_start-min_save_start ); 
1018    fprintf( f , " - SAVE_START : min = %d / max = %d / delta = %d\n",
1019           min_save_start, max_save_start, max_save_start-min_save_start ); 
[646]1020
[657]1021    printf( " - SAVE_END   : min = %d / max = %d / delta = %d\n",
1022           min_save_ended, max_save_ended, max_save_ended-min_save_ended ); 
1023    fprintf( f , " - SAVE_END   : min = %d / max = %d / delta = %d\n",
1024           min_save_ended, max_save_ended, max_save_ended-min_save_ended ); 
[646]1025
[657]1026    printf( " - FREE_START : min = %d / max = %d / delta = %d\n",
1027           min_free_start, max_free_start, max_free_start-min_free_start ); 
1028    fprintf( f , " - FREE_START : min = %d / max = %d / delta = %d\n",
1029           min_free_start, max_free_start, max_free_start-min_free_start ); 
[646]1030
[657]1031    printf( " - FREE_END   : min = %d / max = %d / delta = %d\n",
1032           min_free_start, max_free_start, max_free_start-min_free_start ); 
1033    fprintf( f , " - FREE_END   : min = %d / max = %d / delta = %d\n",
1034           min_free_start, max_free_start, max_free_start-min_free_start ); 
1035
1036
1037    printf( "\n   Sequencial %d"
1038            "\n   Parallel   %d"
1039            "\n   Alloc      %d"
1040            "\n   Load       %d"
1041            "\n   Transpose  %d"
1042            "\n   Save       %d"
1043            "\n   Free       %d\n" ,
1044            SEQUENCIAL_TIME / 1000 ,
1045            PARALLEL_TIME / 1000 ,
1046            (max_aloc_ended - min_aloc_start) / 1000 ,
1047            (max_load_ended - min_load_start) / 1000 ,
1048            (max_trsp_ended - min_trsp_start) / 1000 ,
1049            (max_save_ended - min_save_start) / 1000 ,
1050            (max_free_ended - min_free_start) / 1000 );
1051
1052    fprintf( f , "\n   Sequencial %d"
1053            "\n   Parallel   %d"
1054            "\n   Alloc      %d"
1055            "\n   Load       %d"
1056            "\n   Transpose  %d"
1057            "\n   Save       %d"
1058            "\n   Free       %d\n" ,
1059            SEQUENCIAL_TIME / 1000 ,
1060            PARALLEL_TIME / 1000 ,
1061            (max_aloc_ended - min_aloc_start) / 1000 ,
1062            (max_load_ended - min_load_start) / 1000 ,
1063            (max_trsp_ended - min_trsp_start) / 1000 ,
1064            (max_save_ended - min_save_start) / 1000 ,
1065            (max_free_ended - min_free_start) / 1000 );
[646]1066}  // end instrument()
1067
1068
1069
1070
Note: See TracBrowser for help on using the repository browser.