source: trunk/user/transpose/transpose.c @ 683

Last change on this file since 683 was 676, checked in by alain, 4 years ago

Introduce chat application to test the named pipes.

File size: 39.5 KB
RevLine 
[646]1//////////////////////////////////////////////////////////////////////////////////////////
2// File   : transpose.c   
3// Date   : september 2019
4// author : Alain Greiner
5//////////////////////////////////////////////////////////////////////////////////////////
6// This multi-threaded application reads a raw image (one byte per pixel)
[652]7// stored on disk, transposes it, displays the result on the frame buffer,
8// and stores the transposed image on disk.
[646]9//
[652]10// The image size and the pixel encoding type are defined by the IMAGE_SIZE and
11// IMAGE_TYPE global parameters.
[646]12//
[652]13// It can run on a multi-cores, multi-clusters architecture, where (X_SIZE * Y_SIZE)
14// is the number of clusters and NCORES the number of cores per cluster.
15// A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier,
16// (that is NOT required to be a continuous index), and lid is the local core index,
[657]17// (that must be in the [0,NCORES-1] range).
[646]18//
[652]19// The main() function can run on any core in any cluster. This main thread
[657]20// makes the initialisations, load the input file to the "image_in" buffer,
21// launches the working threads, calls the instrument() function when all working
22// threads complete, and saves the result "image_out" buffer to the output file.
[646]23//
[657]24// The number of working threads is always defined by the number of cores available
[652]25// in the architecture, but this application supports three placement modes.
26// In all modes, the working threads are identified by the [tid] continuous index
27// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
28// This continuous index can always be decomposed in two continuous sub-indexes:
29// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
30//
31// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
32//   threads are created by the main thread, but the placement is done by the OS, using
33//   the DQDT for load balancing, and two working threads can be placed on the same core.
34//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
35//   cluster or a physical core. In this mode, the main thread runs on any cluster,
36//   but has tid = 0 (i.e. cid = 0 & lid = 0).
37//
38// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
39//   of the threads on the cores is explicitly controlled by the main thread to have
40//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
41//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
42//   physical cluster identifier, and [lid] is the local core index.
43//
44// - PARALLEL_PLACEMENT: the main thread is no longer a working thread, and uses the
45//   non standard pthread_parallel_create() function to avoid the costly sequential
46//   loops for pthread_create() and pthread_join(). It guarantees one working thread
47//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
48//   
[657]49// Each working thread[cid][lid] run the "execute" function, that uses the "buf_in" and
50// "buf_out" local buffers, containing the direct and transposed images:
51// Each thread[cid][0] allocates two buf_in[cid] and buf_out[cid] buffers, load from
52// "image_in" to buf_in[cid] all lines that must be handled by the threads sharing the
53// same cid, and finally save from buf_out[cid] to "image_out" all lines that have been
54// transposed to buf_out[cid].
55// Each thread[cid][lid] in the group defined by the cid index reads pixels from the
56// local buf_in[cid] buffer, and writes pixels to all remote buf_out[cid] buffers.
[652]57//
[676]58// - The image must have [nlines = npixels = IMAGE_SIZE], and cannot exceed the FBF size.
[646]59// - The number of clusters  must be a power of 2 no larger than 256.
60// - The number of cores per cluster must be a power of 2 no larger than 4.
[652]61// - The number of threads cannot be larger than IMAGE_SIZE.
[646]62//
63//////////////////////////////////////////////////////////////////////////////////////////
64
65#include <sys/mman.h>
66#include <stdio.h>
67#include <stdlib.h>
68#include <unistd.h>
69#include <pthread.h>
70#include <string.h>
71#include <almosmkh.h>
72#include <fcntl.h>
73#include <hal_macros.h>
74
// Upper bounds on the hardware platform, used to size the static arrays below.
#define X_MAX                 16                           // max number of clusters in row
#define Y_MAX                 16                           // max number of clusters in column
#define CORES_MAX             4                            // max number of cores per cluster
#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters
#define THREADS_MAX           (X_MAX * Y_MAX * CORES_MAX)  // max number of threads

// Image parameters. IMAGE_SIZE also sizes the static image_in / image_out buffers.
#define IMAGE_TYPE            420                          // pixel encoding type
#define IMAGE_SIZE            256                          // default image size
#define INPUT_IMAGE_PATH      "/misc/lena_256.raw"         // default input image pathname
#define OUTPUT_IMAGE_PATH     "/misc/lena_trsp_256.raw"    // default output image pathname

#define SAVE_RESULT_FILE      1                            // save result image on disk
#define USE_DQT_BARRIER       0                            // quad-tree barrier if non zero

// Thread placement mode: exactly one of these three flags must be set to 1.
#define NO_PLACEMENT          0                            // uncontrolled thread placement
#define EXPLICIT_PLACEMENT    1                            // explicit threads placement
#define PARALLEL_PLACEMENT    0                            // parallel threads placement

// The placement flags are compile-time constants: reject an illegal combination
// at build time rather than only when the application is started.
#if (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1
#error "exactly one of NO_PLACEMENT / EXPLICIT_PLACEMENT / PARALLEL_PLACEMENT must be set"
#endif

#define VERBOSE_MAIN          1                            // main function prints progress
#define VERBOSE_MAIN_DETAILED 0                            // main function prints per-thread details
#define VERBOSE_EXEC          1                            // exec function prints progress

// When non zero, the image size and the image paths are read from the terminal,
// otherwise the IMAGE_SIZE / INPUT_IMAGE_PATH / OUTPUT_IMAGE_PATH defaults are used.
#define INTERACTIVE_MODE      1
[652]98
[646]99///////////////////////////////////////////////////////
100//                global variables
101///////////////////////////////////////////////////////
102
[652]103// global instrumentation counters for the main thread
104unsigned int SEQUENCIAL_TIME = 0;
105unsigned int PARALLEL_TIME   = 0;
106
107// instrumentation counters for each thread in each cluster
108// indexed by [cid][lid] : cluster continuous index / thread local index
[657]109unsigned int ALOC_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
110unsigned int ALOC_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[652]111unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
112unsigned int LOAD_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[646]113unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
114unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[657]115unsigned int SAVE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
116unsigned int SAVE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
117unsigned int FREE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
118unsigned int FREE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[646]119
[657]120// buffer containing the input image, loaded by the main from input file
121unsigned char  image_in[IMAGE_SIZE * IMAGE_SIZE];
[646]122
[657]123// buffer containing the output image, saved by the main to output file
124unsigned char  image_out[IMAGE_SIZE * IMAGE_SIZE];
[652]125
[657]126// arrays of pointers on distributed buffers indexed by [cid]
127unsigned char *  buf_in [CLUSTERS_MAX];
128unsigned char *  buf_out[CLUSTERS_MAX];
[652]129
[676]130// pointer and identifier for FBF windows
131void   *  in_win_buf;
132int       in_wid;
133void   *  out_win_buf;
134int       out_wid;
[657]135
[652]136// synchronisation barrier (all working threads)
[646]137pthread_barrier_t   barrier;
138
139// platform parameters
[652]140unsigned int  x_size;              // number of clusters in a row
141unsigned int  y_size;              // number of clusters in a column
142unsigned int  ncores;              // number of cores per cluster
[646]143
[652]144// main thread continuous index
145unsigned int     tid_main; 
[646]146
[652]147//return values at thread exit
148unsigned int THREAD_EXIT_SUCCESS = 0;
149unsigned int THREAD_EXIT_FAILURE = 1;
[646]150
[652]151// array of kernel thread identifiers / indexed by [tid]
152pthread_t                     exec_trdid[THREADS_MAX];   
[646]153
[652]154// array of execute function arguments / indexed by [tid]
155pthread_parallel_work_args_t  exec_args[THREADS_MAX];
[646]156
[652]157// array of thread attributes / indexed by [tid]
158pthread_attr_t                exec_attr[THREADS_MAX];
[646]159
[676]160// image features
161unsigned int   image_size;
162char           input_image_path[128];
163char           output_image_path[128];
164
[646]165////////////////////////////////////////////////////////////////
166//             functions declaration
167////////////////////////////////////////////////////////////////
168
[656]169void * execute( void * arguments );
[646]170
[652]171void instrument( FILE * f , char * filename );
[646]172
[656]173////////////////
174int main( void )
[646]175{
[652]176    unsigned long long start_cycle;
177    unsigned long long end_sequencial_cycle;
178    unsigned long long end_parallel_cycle;
[646]179
[652]180    char               filename[32];      // instrumentation file name
181    char               pathname[64];      // instrumentation file pathname
182
[646]183    int error;
184
[652]185    /////////////////////////////////////////////////////////////////////////////////
186    get_cycle( &start_cycle );
187    /////////////////////////////////////////////////////////////////////////////////
[646]188
[652]189    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
190    {
191        printf("\n[transpose error] illegal placement\n");
192        exit( 0 );
193    }
[646]194
195    // get & check plat-form parameters
[659]196    hard_config_t  config;
197    get_config( &config );
198    x_size = config.x_size;
199    y_size = config.y_size;
200    ncores = config.ncores;
[646]201
[652]202    if((ncores != 1) && (ncores != 2) && (ncores != 4))
[646]203    {
204        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
205        exit( 0 );
206    }
207
208    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
209        (x_size != 8) && (x_size != 16) )
210    {
211        printf("\n[transpose error] x_size must be 1/2/4/8/16\n");
212        exit( 0 );
213    }
214       
215    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
216        (y_size != 8) && (y_size != 16) )
217    {
218        printf("\n[transpose error] y_size must be 1/2/4/8/16\n");
219        exit( 0 );
220    }
221       
[657]222    // get identifiers for core executing main
[652]223    unsigned int  cxy_main;
224    unsigned int  lid_main;
225    get_core_id( &cxy_main , &lid_main );
[646]226
227    // compute number of threads
228    unsigned int nclusters = x_size * y_size;
229    unsigned int nthreads  = nclusters * ncores;
230
[676]231    // get input and output images path and size
232    if( INTERACTIVE_MODE )
[657]233    {
[676]234        printf("\n - image size : ");
235        get_uint32( &image_size );
236
237        printf("\n - input image path : ");
238        get_string( input_image_path , 128 );
239 
240        printf(" - output image path : ");
241        get_string( output_image_path , 128 );
[657]242    }
[676]243    else
244    {
245        image_size = IMAGE_SIZE;
246        strcpy( input_image_path , INPUT_IMAGE_PATH );
247        strcpy( input_image_path , OUTPUT_IMAGE_PATH );
248    }
[657]249
250    // get FBF size and type
[676]251    int   fbf_width;
252    int   fbf_height;
253    int   fbf_type;
[646]254    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
255
[676]256    // check image
257    if( nthreads > image_size )
[646]258    {
[676]259        printf("\n[transpose error] nthreads (%d) larger than image size (%d)\n",
260               nthreads , image_size );
[646]261        exit( 0 );
262    }
263
[676]264    if( ((unsigned int)fbf_width  < image_size) ||
265        ((unsigned int)fbf_height < image_size) || 
266        (fbf_type != IMAGE_TYPE) )
267    {
268        printf("\n[transpose error] image not acceptable\n"
269               "FBF width  = %d / npixels  = %d\n"
270               "FBF height = %d / nlines   = %d\n"
271               "FBF type   = %d / expected = %d\n",
272               fbf_width, image_size, fbf_height, image_size, fbf_type, IMAGE_TYPE );
273        exit( 0 );
274    }
275
[657]276    // define total number of pixels
[676]277    int npixels = image_size * image_size;
[646]278
[652]279    // define instrumentation file name
280    if( NO_PLACEMENT )
281    {
[657]282        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / NO_PLACE\n",
[676]283        nclusters, ncores, input_image_path, getpid() );
[652]284
285        // build instrumentation file name
286        if( USE_DQT_BARRIER )
287        snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d",
[676]288        image_size , x_size * y_size , ncores );
[652]289        else
290        snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d",
[676]291        image_size , x_size * y_size , ncores );
[652]292    }
293
294    if( EXPLICIT_PLACEMENT )
295    {
[657]296        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / EXPLICIT\n",
[676]297        nclusters, ncores, input_image_path, getpid() );
[652]298
299        // build instrumentation file name
300        if( USE_DQT_BARRIER )
301        snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d",
[676]302        image_size , x_size * y_size , ncores );
[652]303        else
304        snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d",
[676]305        image_size , x_size * y_size , ncores );
[652]306    }
307
308    if( PARALLEL_PLACEMENT )
309    {
[657]310        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / PARALLEL\n",
[676]311        nclusters, ncores, input_image_path, getpid() );
[652]312
313        // build instrumentation file name
314        if( USE_DQT_BARRIER )
315        snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d",
[676]316        image_size , x_size * y_size , ncores );
[652]317        else
318        snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d",
[676]319        image_size , x_size * y_size , ncores );
[652]320    }
321
[676]322    // create an FBF window for input image
323    in_wid = fbf_create_window( 0,                // l_zero
324                                0,                // p_zero
325                                image_size,       // lines
326                                image_size,       // pixels
327                                &in_win_buf );    // pointer on buffer in user space
328    if( in_wid < 0) 
[657]329    {
[676]330        printf("\n[transpose error] cannot create window for %s\n", input_image_path );
[657]331        exit( 0 );
332    }
333
[676]334    // activate window
335    error = fbf_active_window( in_wid , 1 );
336
337    if( error )
338    {
339        printf("\n[transpose error] cannot activate window for %s\n", input_image_path );
340        exit( 0 );
341    }
342
[657]343#if  VERBOSE_MAIN
[676]344printf("\n[transpose] main on core[%x,%d] created window for %s / wid %d / buf %x\n",
345cxy_main, lid_main, input_image_path, in_wid , in_win_buf );
[657]346#endif
347
[676]348    // create an FBF window for output image
349    out_wid = fbf_create_window( image_size,       // l_zero
350                                 image_size,       // p_zero
351                                 image_size,       // lines
352                                 image_size,       // pixels
353                                 &out_win_buf );   // pointer on buffer in user space
354    if( out_wid < 0) 
355    {
356        printf("\n[transpose error] cannot create window for %s\n", output_image_path );
357        exit( 0 );
358    }
359
360    // activate window
361    error = fbf_active_window( out_wid , 1 );
362
363    if( error )
364    {
365        printf("\n[transpose error] cannot activate window for %s\n", output_image_path );
366        exit( 0 );
367    }
368
369#if  VERBOSE_MAIN
370printf("\n[transpose] main on core[%x,%d] created window for %s / wid %d / buf %x\n",
371cxy_main, lid_main, output_image_path, out_wid , out_win_buf );
372#endif
373
[652]374    // open instrumentation file
375    snprintf( pathname , 64 , "/home/%s", filename );
376    FILE * f = fopen( pathname , NULL );
[657]377
[652]378    if ( f == NULL ) 
[646]379    { 
[657]380        printf("\n[transpose error] cannot open instru file %s\n", pathname );
[646]381        exit( 0 );
382    }
383
[652]384#if  VERBOSE_MAIN
385printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n",
386cxy_main, lid_main, pathname );
[646]387#endif
388
[652]389    // main thread initializes barrier
[646]390    if( USE_DQT_BARRIER )
391    {
392        pthread_barrierattr_t attr;
393        attr.x_size   = x_size;
394        attr.y_size   = y_size;
395        attr.nthreads = ncores;
396        error = pthread_barrier_init( &barrier, &attr , nthreads );
397    }
398    else
399    {
400        error = pthread_barrier_init( &barrier, NULL , nthreads );
401    }
402
403    if( error )
404    { 
405        printf("\n[transpose error] main cannot initialize barrier\n" );
406        exit( 0 );
407    }
408
[652]409#if  VERBOSE_MAIN
[657]410printf("\n[transpose] main on core[%x,%d] completed barrier initialisation\n",
[652]411cxy_main, lid_main );
412#endif
[646]413
[657]414    // open input file
[676]415    int fd_in = open( input_image_path , O_RDONLY , 0 ); 
[646]416
[652]417    if ( fd_in < 0 ) 
418    { 
[676]419        printf("\n[transpose error] main cannot open file %s\n", input_image_path );
[652]420        exit( 0 );
421    }
422
423#if  VERBOSE_MAIN
[676]424printf("\n[transpose] main open file <%s> / fd = %d\n", input_image_path , fd_in );
[652]425#endif
426
[657]427    // open output file
[676]428    int fd_out = open( output_image_path , O_CREAT , 0 ); 
[652]429
430    if ( fd_out < 0 ) 
431    { 
[676]432        printf("\n[transpose error] main cannot open file %s\n", output_image_path );
[652]433        exit( 0 );
434    }
435
[657]436    // move input image to input buffer
437    if( read( fd_in , image_in , npixels ) != npixels )
438    {
439        printf("\n[transpose error] main cannot read input image\n");
[652]440        exit( 0 );
441    }
442
443#if  VERBOSE_MAIN
[676]444printf("\n[transpose] main moved file <%s> to buf_in\n", input_image_path );
[652]445#endif
446
447    /////////////////////////////////////////////////////////////////////////////////////
448    get_cycle( &end_sequencial_cycle );
449    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
450    /////////////////////////////////////////////////////////////////////////////////////
451
452    //////////////////
453    if( NO_PLACEMENT )
454    {
455        // the tid value for the main thread is always 0
456        // main thread creates new threads with tid in [1,nthreads-1] 
457        unsigned int tid;
458        for ( tid = 0 ; tid < nthreads ; tid++ )
[646]459        {
[652]460            // register tid value in exec_args[tid] array
461            exec_args[tid].tid = tid;
462           
463            // create other threads
464            if( tid > 0 )
[646]465            {
[652]466                if ( pthread_create( &exec_trdid[tid], 
467                                     NULL,                  // no attribute
468                                     &execute,
469                                     &exec_args[tid] ) ) 
[646]470                {
[652]471                    printf("\n[transpose error] cannot create thread %d\n", tid );
472                    exit( 0 );
473                }
[646]474
[657]475#if VERBOSE_MAIN_DETAILED
[652]476printf("\n[transpose] main created thread %d\n", tid );
[646]477#endif
[652]478
[646]479            }
[652]480            else
481            {
482                tid_main = 0;
483            }
484        }  // end for tid
[646]485
[652]486        // main thread calls itself the execute() function
487        execute( &exec_args[0] );
[646]488
[652]489        // main thread wait other threads completion
490        for ( tid = 1 ; tid < nthreads ; tid++ )
491        {
492            unsigned int * status;
493
494            // main wait thread[tid] status
495            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
496            {
497                printf("\n[transpose error] main cannot join thread %d\n", tid );
498                exit( 0 );
499            }
500       
501            // check status
502            if( *status != THREAD_EXIT_SUCCESS )
503            {
504                printf("\n[transpose error] thread %x returned failure\n", tid );
505                exit( 0 );
506            }
507
[657]508#if VERBOSE_MAIN_DETAILED
509printf("\n[transpose] main joined thread %x\n", tid );
[652]510#endif
511       
512        }  // end for tid
513
514    }  // end if no_placement
515
516    ////////////////////////
517    if( EXPLICIT_PLACEMENT )
[646]518    {
[652]519        // main thread places each other threads on a specific core[cxy][lid]
520        // but the actual thread creation is sequencial
521        unsigned int x;
522        unsigned int y;
523        unsigned int l;
524        unsigned int cxy;                   // cluster identifier
525        unsigned int tid;                   // thread continuous index
526
527        for( x = 0 ; x < x_size ; x++ )
[646]528        {
[652]529            for( y = 0 ; y < y_size ; y++ )
[646]530            {
[652]531                cxy = HAL_CXY_FROM_XY( x , y );
532                for( l = 0 ; l < ncores ; l++ )
[646]533                {
[652]534                    // compute thread continuous index
535                    tid = (((* y_size) + y) * ncores) + l;
[646]536
[652]537                    // register tid value in exec_args[tid] array
538                    exec_args[tid].tid = tid;
539
540                    // no thread created on the core running the main
541                    if( (cxy != cxy_main) || (l != lid_main) )
[646]542                    {
[652]543                        // define thread attributes
544                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
545                                                    PT_ATTR_CORE_DEFINED;
546                        exec_attr[tid].cxy        = cxy;
547                        exec_attr[tid].lid        = l;
548 
549                        // create thread[tid] on core[cxy][l]
550                        if ( pthread_create( &exec_trdid[tid],   
551                                             &exec_attr[tid],   
552                                             &execute,
553                                             &exec_args[tid] ) )       
554                        {
555                            printf("\n[transpose error] cannot create thread %d\n", tid );
556                            exit( 0 );
557                        }
[657]558
559#if VERBOSE_MAIN_DETAILED
[652]560printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
561#endif
[646]562                    }
[652]563                    else
[646]564                    {
[652]565                        tid_main = tid;
[646]566                    }
567                }
568            }
569        }
570
[652]571        // main thread calls itself the execute() function
572        execute( &exec_args[tid_main] );
[646]573
[652]574        // main thread wait other threads completion
575        for( tid = 0 ; tid < nthreads ; tid++ )
[646]576        {
[652]577            // no other thread on the core running the main
578            if( tid != tid_main )
579            {
580                unsigned int * status;
[646]581
[652]582                // wait thread[tid]
583                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
584                {
585                    printf("\n[transpose error] main cannot join thread %d\n", tid );
586                    exit( 0 );
587                }
588       
589                // check status
590                if( *status != THREAD_EXIT_SUCCESS )
591                {
592                    printf("\n[transpose error] thread %d returned failure\n", tid );
593                    exit( 0 );
594                }
[657]595
596#if VERBOSE_MAIN_DETAILED
597printf("\n[transpose] main joined thread %d\n", tid );
[646]598#endif
[652]599            }
600        }
601    }  // end if explicit_placement
[646]602
[652]603    ////////////////////////
604    if( PARALLEL_PLACEMENT )
[646]605    {
[652]606        // compute covering DQT size an level
607        unsigned int z          = (x_size > y_size) ? x_size : y_size;
608        unsigned int root_level = ((z == 1) ? 0 : 
609                                  ((z == 2) ? 1 : 
610                                  ((z == 4) ? 2 : 
611                                  ((z == 8) ? 3 : 4))));
[646]612
[652]613        // create & execute the working threads
614        if( pthread_parallel_create( root_level , &execute ) )
[646]615        {
[652]616            printf("\n[transpose error] in %s\n", __FUNCTION__ );
[646]617            exit( 0 );
618        }
[652]619    }  // end if parallel_placement
[646]620
621
[652]622    /////////////////////////////////////////////////////////////////////////////
623    get_cycle( &end_parallel_cycle );
624    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
625    /////////////////////////////////////////////////////////////////////////////
[646]626
[657]627    // register instrumentation results
[652]628    instrument( f , filename );
[646]629
[657]630#if VERBOSE_MAIN
631printf("\n[transpose] main completed instrumentation\n");
632#endif
633
634/*
635    printf("\n> ");
636    getchar();
637
638    // move window
639    if( fbf_move_window( wid , 100 , 100 ) )
640    {
641        printf("\n[transpose error] main cannot move FBF window\n");
642        exit( 0 );
643    }
644
645    printf("\n> ");
646    getchar();
647*/   
648    // save image_out to output file
649    if( write( fd_out , image_out , npixels ) != npixels )
650    {
651        printf("\n[transpose error] main cannot write output image\n");
652        exit( 0 );
653    }
654
655#if VERBOSE_MAIN
656printf("\n[transpose] main saved buf_out to output file\n");
657#endif
658
659    // close input file
[646]660    close( fd_in );
[652]661
[657]662#if VERBOSE_MAIN
663printf("\n[transpose] main closed input file\n");
664#endif
[652]665
[657]666    // close output file
[646]667    close( fd_out );
668
[657]669#if VERBOSE_MAIN
670printf("\n[transpose] main closed output file\n");
[652]671#endif
672
[657]673    // close instrumentation file
[652]674    fclose( f );
675
[657]676#if VERBOSE_MAIN
677printf("\n[transpose] main closed instrumentation file\n");
678#endif
679
[676]680    // delete FBF windows
681    if( fbf_delete_window( in_wid ) )
682    if( fbf_delete_window( out_wid ) )
[657]683    {
684        printf("\n[transpose error] main cannot delete FBF window\n");
685        exit( 0 );
686    }
687
[676]688#if VERBOSE_MAIN
689printf("\n[transpose] main deleted FBF windows\n");
690#endif
691
[652]692    // main thread suicide
[646]693    exit( 0 );
694   
[656]695    return 0;
696
[646]697} // end main()
698
699
700
[652]701
[676]702
703
704
705
[656]706//////////////////////////////////
707void * execute( void * arguments ) 
[646]708{
709    unsigned long long   date;
[657]710    unsigned int         l;         // line index for loop
711    unsigned int         p;         // pixel index for loop
712    int                  error;
713
[656]714    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;
715
[652]716    // WARNING
717    //A thread is identified by the tid index, defined in the "args" structure.
718    // This index being in range [0,nclusters*ncores-1] we can always write
719    //       tid == cid * ncores + lid
720    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
721    // if NO_PLACEMENT, there is no relation between these
722    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
[646]723
[652]724    // get thread abstract identifiers
725    unsigned int tid = args->tid;
[657]726    unsigned int cid = tid / ncores;    // abstract cluster index
727    unsigned int lid = tid % ncores;    // local thread index
[652]728
729#if VERBOSE_EXEC
730unsigned int cxy;
731unsigned int lpid;
732get_core_id( &cxy , &lpid );   // get core physical identifiers
[657]733#endif
734
735#if VERBOSE_EXEC
[652]736printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n",
737tid , cxy , lpid );
738#endif
739
740    get_cycle( &date );
[657]741    ALOC_START[cid][lid] = (unsigned int)date;
[652]742
[657]743    // compute total number of pixels per image
[676]744    unsigned int npixels = image_size * image_size;     
[646]745
[657]746    // compute total number of threads and clusters
[646]747    unsigned int nclusters = x_size * y_size;
[652]748    unsigned int nthreads  = nclusters * ncores;
[646]749
[657]750    // compute number of pixels per cid & per thread
751    unsigned int pixels_per_cid = npixels / nclusters;
752    unsigned int pixels_per_lid = pixels_per_cid / ncores;
[646]753
[657]754    // compute first and last line per thread
[676]755    unsigned int lines_per_cid = pixels_per_cid / image_size;
756    unsigned int lines_per_lid = pixels_per_lid / image_size;
[646]757
[657]758    unsigned int line_first = (cid * lines_per_cid) + (lid * lines_per_lid);
759    unsigned int line_last  = line_first + lines_per_lid;
760
761    // Each thread[cid,0] allocates two local buffers, and register the base
762    // adresses in the global variable buf_in_ptr[cid] & buf_out_ptr[cid].
763   
[652]764    if( lid == 0 )
[646]765    {
[652]766        // allocate buf_in
[657]767        buf_in[cid] = (unsigned char *)malloc( pixels_per_cid );
[646]768
[657]769        if( buf_in[cid] == NULL )
[646]770        {
[652]771            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
[646]772            pthread_exit( &THREAD_EXIT_FAILURE );
773        }
[652]774
775#if VERBOSE_EXEC
776printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n",
777tid , cxy , lpid , buf_in );
[646]778#endif
779
[652]780        // allocate buf_out
[657]781        buf_out[cid] = (unsigned char *)malloc( pixels_per_cid );
[652]782
[657]783        if( buf_out[cid] == NULL )
[646]784        {
[652]785            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
[646]786            pthread_exit( &THREAD_EXIT_FAILURE );
787        }
[652]788
789#if VERBOSE_EXEC
790printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n",
791tid , cxy , lpid , buf_out );
[646]792#endif
793
794    }
[657]795
[646]796    get_cycle( &date );
[657]797    ALOC_END[cid][lid] = (unsigned int)date;
798
799    /////////////////////////////////
800    pthread_barrier_wait( &barrier );
801    /////////////////////////////////
802
803    get_cycle( &date );
804    LOAD_START[cid][lid] = (unsigned int)date;
805
806    // all threads copy relevant part of the image_in to buf_in[cid]
807    memcpy( buf_in[cid] + (lid * pixels_per_lid), 
808            image_in + (cid * pixels_per_cid) + (lid * pixels_per_lid),
809            pixels_per_lid );
810
811#if VERBOSE_EXEC
812printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n",
813tid , cxy , lpid , cid );
814#endif
815
816    // all local threads copy part of buf_in[cid] to FBF window for display
[676]817    memcpy( in_win_buf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
[657]818            buf_in[cid] + (lid * pixels_per_lid),
819            pixels_per_lid );
820
821#if  VERBOSE_EXEC
822printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in to FBF (first %d / last %d)\n",
823tid , cxy , lpid , line_first , line_last );
824#endif
825
[676]826    // all threads contribute to input window refresh
827    error = fbf_refresh_window( in_wid , line_first , line_last );
[657]828
829    if( error )
830    {
831        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
832        exit( 0 );
833    }
834
835    get_cycle( &date );
[652]836    LOAD_END[cid][lid] = (unsigned int)date;
[646]837
838    /////////////////////////////////
839    pthread_barrier_wait( &barrier );
[657]840    /////////////////////////////////
[646]841
[652]842    get_cycle( &date );
843    TRSP_START[cid][lid] = (unsigned int)date;
844
[657]845    // All threads contribute to parallel transpose from buf_in to buf_out:
[652]846    // each thread makes the transposition for nlt lines (nlt = image_size / nthreads)
[646]847    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
848    // (p,l) are the absolute pixel coordinates in the source image
[657]849    // (l,p) are the absolute pixel coordinates in the dest image
[646]850
[676]851    unsigned int nlt   = image_size / nthreads;    // number of lines per thread
852    unsigned int nlc   = image_size / nclusters;   // number of lines per cluster
[646]853
[652]854    unsigned int src_cid;
[646]855    unsigned int src_index;
[652]856    unsigned int dst_cid;
[646]857    unsigned int dst_index;
858
859    unsigned char byte;
860
[657]861    unsigned int first = tid * nlt;        // first line index for a given thread
[646]862    unsigned int last  = first + nlt;      // last line index for a given thread
863
[652]864    // loop on lines handled by this thread
[646]865    for ( l = first ; l < last ; l++ )
866    {
[652]867        // loop on pixels in one line (one pixel per iteration)
[676]868        for ( p = 0 ; p < image_size ; p++ )
[646]869        {
870            // read one byte from local buf_in
[652]871            src_cid   = l / nlc;
[676]872            src_index = (l % nlc) * image_size + p;
[646]873
[657]874            byte = buf_in[src_cid][src_index];
[652]875
[646]876            // write one byte to remote buf_out
[652]877            dst_cid   = p / nlc; 
[676]878            dst_index = (p % nlc) * image_size + l;
[646]879
[657]880            buf_out[dst_cid][dst_index] = byte;
[646]881        }
882    }
883
[652]884#if VERBOSE_EXEC
885printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n",
886tid , cxy , lpid );
[646]887#endif
888
889    get_cycle( &date );
[652]890    TRSP_END[cid][lid] = (unsigned int)date;
[646]891
892    /////////////////////////////////
893    pthread_barrier_wait( &barrier );
[657]894    /////////////////////////////////
[646]895
896    get_cycle( &date );
[657]897    SAVE_START[cid][lid] = (unsigned int)date;
[646]898
[657]899    // each local threads copy part of buf_out[cid] to FBF window for display
[676]900    memcpy( out_win_buf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
[657]901            buf_out[cid] + (lid * pixels_per_lid),
902            pixels_per_lid );
[646]903
[657]904#if  VERBOSE_EXEC
905printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_out to FBF (first %d / last %d)\n",
906tid , cxy , lpid , line_first , line_last );
907#endif
908
[676]909    // each thread contributes to output window refresh
910    error = fbf_refresh_window( out_wid , line_first , line_last );
[657]911
912    if( error )
[646]913    {
[657]914        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
915        exit( 0 );
[646]916    }
917
[657]918    // each local thread copy relevant part of buf_out to image_out
919    memcpy( image_out + (cid * pixels_per_cid) + (lid * pixels_per_lid),
920            buf_out[cid] + (lid * pixels_per_lid),
921            pixels_per_lid );
922
[652]923#if VERBOSE_EXEC
[657]924printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n",
925tid , cxy , lpid , cid );
[646]926#endif
927
928    get_cycle( &date );
[657]929    SAVE_END[cid][lid] = (unsigned int)date;
[646]930
931    /////////////////////////////////
932    pthread_barrier_wait( &barrier );
[657]933    /////////////////////////////////
[646]934
[657]935    get_cycle( &date );
936    FREE_START[cid][lid] = (unsigned int)date;
[652]937
[657]938    // Each thread[cid,0] release local buffers buf_in & buf_out
939
[652]940    if( lid == 0 )
941    {
[657]942        // release local buffers
943        free( buf_in[cid] );
944        free( buf_out[cid] );
[652]945
946#if VERBOSE_EXEC
[657]947printf("\n[transpose] exec[%d] on core[%x,%d] released buf_in & buf_out\n",
948tid , cxy , lpid );
[652]949#endif
950
[657]951    }
[652]952
[657]953    get_cycle( &date );
954    FREE_END[cid][lid] = (unsigned int)date;
955
956    /////////////////////////////////
957    pthread_barrier_wait( &barrier );
958    /////////////////////////////////
[652]959   
960    // thread termination depends on the placement policy
961    if( PARALLEL_PLACEMENT )   
962    {
[656]963        // <work> threads are running in detached mode, and
[652]964        // each thread must signal completion by waiting on the barrier
965        // passed in arguments before exit
966
967        pthread_barrier_wait( args->barrier );
968
[646]969        pthread_exit( &THREAD_EXIT_SUCCESS );
970    }
[652]971    else
972    {
973        // <work> threads are running in attached mode
974        // each thread, except the main one, simply exits
[657]975        if ( tid != tid_main ) 
976        {
977
978#if VERBOSE_EXEC
979printf("\n[transpose] exec[%d] on core[%x,%d] exit\n",
980tid , cxy , lpid );
981#endif
982            pthread_exit( &THREAD_EXIT_SUCCESS );
983        }
[652]984    }
[646]985
[656]986    return NULL;
987
[646]988} // end execute()
989
990
991
[657]992//////////////////////////
[652]993void instrument( FILE * f,
994                 char * filename )
[646]995{
[657]996    unsigned int cid;
997    unsigned int l;
[646]998
[657]999    unsigned int min_aloc_start = 0xFFFFFFFF;
1000    unsigned int max_aloc_start = 0;
1001    unsigned int min_aloc_ended = 0xFFFFFFFF;
1002    unsigned int max_aloc_ended = 0;
[646]1003    unsigned int min_load_start = 0xFFFFFFFF;
1004    unsigned int max_load_start = 0;
1005    unsigned int min_load_ended = 0xFFFFFFFF;
1006    unsigned int max_load_ended = 0;
1007    unsigned int min_trsp_start = 0xFFFFFFFF;
1008    unsigned int max_trsp_start = 0;
1009    unsigned int min_trsp_ended = 0xFFFFFFFF;
1010    unsigned int max_trsp_ended = 0;
[657]1011    unsigned int min_save_start = 0xFFFFFFFF;
1012    unsigned int max_save_start = 0;
1013    unsigned int min_save_ended = 0xFFFFFFFF;
1014    unsigned int max_save_ended = 0;
1015    unsigned int min_free_start = 0xFFFFFFFF;
1016    unsigned int max_free_start = 0;
1017    unsigned int min_free_ended = 0xFFFFFFFF;
1018    unsigned int max_free_ended = 0;
[646]1019 
[657]1020    for (cid = 0; cid < (x_size * y_size) ; cid++)
[646]1021    {
[657]1022        for ( l = 0 ; l < ncores ; l++ )
[646]1023        {
[657]1024            if (ALOC_START[cid][l] < min_aloc_start)  min_aloc_start = ALOC_START[cid][l];
1025            if (ALOC_START[cid][l] > max_aloc_start)  max_aloc_start = ALOC_START[cid][l];
1026            if (ALOC_END[cid][l]   < min_aloc_ended)  min_aloc_ended = ALOC_END[cid][l]; 
1027            if (ALOC_END[cid][l]   > max_aloc_ended)  max_aloc_ended = ALOC_END[cid][l];
1028            if (LOAD_START[cid][l] < min_load_start)  min_load_start = LOAD_START[cid][l];
1029            if (LOAD_START[cid][l] > max_load_start)  max_load_start = LOAD_START[cid][l];
1030            if (LOAD_END[cid][l]   < min_load_ended)  min_load_ended = LOAD_END[cid][l]; 
1031            if (LOAD_END[cid][l]   > max_load_ended)  max_load_ended = LOAD_END[cid][l];
1032            if (TRSP_START[cid][l] < min_trsp_start)  min_trsp_start = TRSP_START[cid][l];
1033            if (TRSP_START[cid][l] > max_trsp_start)  max_trsp_start = TRSP_START[cid][l];
1034            if (TRSP_END[cid][l]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cid][l];
1035            if (TRSP_END[cid][l]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cid][l];
1036            if (SAVE_START[cid][l] < min_save_start)  min_save_start = SAVE_START[cid][l];
1037            if (SAVE_START[cid][l] > max_save_start)  max_save_start = SAVE_START[cid][l];
1038            if (SAVE_END[cid][l]   < min_save_ended)  min_save_ended = SAVE_END[cid][l];
1039            if (SAVE_END[cid][l]   > max_save_ended)  max_save_ended = SAVE_END[cid][l];
1040            if (FREE_START[cid][l] < min_free_start)  min_free_start = FREE_START[cid][l];
1041            if (FREE_START[cid][l] > max_free_start)  max_free_start = FREE_START[cid][l];
1042            if (FREE_END[cid][l]   < min_free_ended)  min_free_ended = FREE_END[cid][l];
1043            if (FREE_END[cid][l]   > max_free_ended)  max_free_ended = FREE_END[cid][l];
[646]1044        }
1045    }
1046
[652]1047    printf( "\n ------ %s ------\n" , filename );
1048    fprintf( f , "\n ------ %s ------\n" , filename );
[646]1049
[657]1050    printf( " - ALOC_START : min = %d / max = %d / delta = %d\n",
1051           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
1052    fprintf( f , " - ALOC_START : min = %d / max = %d / delta = %d\n",
1053           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
1054
1055    printf( " - ALOC_END   : min = %d / max = %d / delta = %d\n",
1056           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
1057    fprintf( f , " - ALOC_END   : min = %d / max = %d / delta = %d\n",
1058           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
1059
[652]1060    printf( " - LOAD_START : min = %d / max = %d / delta = %d\n",
1061           min_load_start, max_load_start, max_load_start-min_load_start ); 
1062    fprintf( f , " - LOAD_START : min = %d / max = %d / delta = %d\n",
1063           min_load_start, max_load_start, max_load_start-min_load_start ); 
[646]1064
[652]1065    printf( " - LOAD_END   : min = %d / max = %d / delta = %d\n",
1066           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
1067    fprintf( f , " - LOAD_END   : min = %d / max = %d / delta = %d\n",
1068           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
[646]1069
[652]1070    printf( " - TRSP_START : min = %d / max = %d / delta = %d\n",
1071           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
1072    fprintf( f , " - TRSP_START : min = %d / max = %d / delta = %d\n",
1073           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
[646]1074
[652]1075    printf( " - TRSP_END   : min = %d / max = %d / delta = %d\n",
1076           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
1077    fprintf( f , " - TRSP_END   : min = %d / max = %d / delta = %d\n",
1078           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
[646]1079
[657]1080    printf( " - SAVE_START : min = %d / max = %d / delta = %d\n",
1081           min_save_start, max_save_start, max_save_start-min_save_start ); 
1082    fprintf( f , " - SAVE_START : min = %d / max = %d / delta = %d\n",
1083           min_save_start, max_save_start, max_save_start-min_save_start ); 
[646]1084
[657]1085    printf( " - SAVE_END   : min = %d / max = %d / delta = %d\n",
1086           min_save_ended, max_save_ended, max_save_ended-min_save_ended ); 
1087    fprintf( f , " - SAVE_END   : min = %d / max = %d / delta = %d\n",
1088           min_save_ended, max_save_ended, max_save_ended-min_save_ended ); 
[646]1089
[657]1090    printf( " - FREE_START : min = %d / max = %d / delta = %d\n",
1091           min_free_start, max_free_start, max_free_start-min_free_start ); 
1092    fprintf( f , " - FREE_START : min = %d / max = %d / delta = %d\n",
1093           min_free_start, max_free_start, max_free_start-min_free_start ); 
[646]1094
[657]1095    printf( " - FREE_END   : min = %d / max = %d / delta = %d\n",
1096           min_free_start, max_free_start, max_free_start-min_free_start ); 
1097    fprintf( f , " - FREE_END   : min = %d / max = %d / delta = %d\n",
1098           min_free_start, max_free_start, max_free_start-min_free_start ); 
1099
1100
1101    printf( "\n   Sequencial %d"
1102            "\n   Parallel   %d"
1103            "\n   Alloc      %d"
1104            "\n   Load       %d"
1105            "\n   Transpose  %d"
1106            "\n   Save       %d"
1107            "\n   Free       %d\n" ,
1108            SEQUENCIAL_TIME / 1000 ,
1109            PARALLEL_TIME / 1000 ,
1110            (max_aloc_ended - min_aloc_start) / 1000 ,
1111            (max_load_ended - min_load_start) / 1000 ,
1112            (max_trsp_ended - min_trsp_start) / 1000 ,
1113            (max_save_ended - min_save_start) / 1000 ,
1114            (max_free_ended - min_free_start) / 1000 );
1115
1116    fprintf( f , "\n   Sequencial %d"
1117            "\n   Parallel   %d"
1118            "\n   Alloc      %d"
1119            "\n   Load       %d"
1120            "\n   Transpose  %d"
1121            "\n   Save       %d"
1122            "\n   Free       %d\n" ,
1123            SEQUENCIAL_TIME / 1000 ,
1124            PARALLEL_TIME / 1000 ,
1125            (max_aloc_ended - min_aloc_start) / 1000 ,
1126            (max_load_ended - min_load_start) / 1000 ,
1127            (max_trsp_ended - min_trsp_start) / 1000 ,
1128            (max_save_ended - min_save_start) / 1000 ,
1129            (max_free_ended - min_free_start) / 1000 );
[646]1130}  // end instrument()
1131
1132
1133
1134
Note: See TracBrowser for help on using the repository browser.