source: trunk/user/transpose/transpose.c @ 652

Last change on this file since 652 was 652, checked in by alain, 5 years ago

Introduce the three placement modes in "transpose", "convol', "fft" applications.

File size: 32.6 KB
RevLine 
[646]1//////////////////////////////////////////////////////////////////////////////////////////
2// File   : transpose.c   
3// Date   : september 2019
4// author : Alain Greiner
5//////////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application reads a raw image (one byte per pixel)
// stored on disk, transposes it, displays the result on the frame buffer,
// and stores the transposed image on disk.
[646]9//
[652]10// The image size and the pixel encoding type are defined by the IMAGE_SIZE and
11// IMAGE_TYPE global parameters.
[646]12//
// It can run on a multi-core, multi-cluster architecture, where (X_SIZE * Y_SIZE)
// is the number of clusters and NCORES the number of cores per cluster.
// A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier,
// (that is NOT required to be a continuous index), and lid is the local core index,
// (that must be in the [0,NCORES-1] range).
[646]18//
// The main() function can run on any core in any cluster. This main thread
// makes the initialisations, uses the pthread_create() syscall to launch (NTHREADS-1)
// other threads in "attached" mode running in parallel the execute() function, calls
// itself the execute() function, waits for completion of the (NTHREADS-1) other threads
// with a pthread_join(), and finally calls the instrument() function to display
// and register the instrumentation results when execution is completed.
// All threads run the execute() function, but each thread transposes only
// (NLINES / NTHREADS) lines. This requires that NLINES == k * NTHREADS.
[646]27//
// The number N of working threads is always defined by the number of cores available
// in the architecture, but this application supports three placement modes.
// In all modes, the working threads are identified by the [tid] continuous index
// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
// This continuous index can always be decomposed in two continuous sub-indexes:
// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
34//
// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
//   threads are created by the main thread, but the placement is done by the OS, using
//   the DQDT for load balancing, and two working threads can be placed on the same core.
//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
//   cluster or a physical core. In this mode, the main thread runs on any cluster,
//   but has tid = 0 (i.e. cid = 0 & lid = 0).
41//
// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
//   of the threads on the cores is explicitly controlled by the main thread to have
//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
//   physical cluster identifier, and [lid] is the local core index.
47//
// - PARALLEL_PLACEMENT: the main thread is not a working thread anymore, and uses the
//   non standard pthread_parallel_create() function to avoid the costly sequential
//   loops for pthread_create() and pthread_join(). It guarantees one working thread
//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
52//   
// The buf_in[cid] and buf_out[cid] buffers containing the direct and transposed images
// are distributed in clusters: each thread[cid][0] allocates a local input buffer
// and loads in this buffer all lines that must be handled by the threads sharing the
// same cid, from the mapper of the input image file.
// In the execute function, all threads in the group defined by the cid index read pixels
// from the local buf_in[cid] buffer, and write pixels to all remote buf_out[cid] buffers.
// Finally, each thread displays a part of the transposed image to the frame buffer.
60//
[646]61// - The image  must fit the frame buffer size, that must be power of 2.
62// - The number of clusters  must be a power of 2 no larger than 256.
63// - The number of cores per cluster must be a power of 2 no larger than 4.
[652]64// - The number of threads cannot be larger than IMAGE_SIZE.
[646]65//
66//////////////////////////////////////////////////////////////////////////////////////////
67
68#include <sys/mman.h>
69#include <stdio.h>
70#include <stdlib.h>
71#include <unistd.h>
72#include <pthread.h>
73#include <string.h>
74#include <almosmkh.h>
75#include <fcntl.h>
76#include <hal_macros.h>
77
78#define X_MAX                 16                           // max number of clusters in row
79#define Y_MAX                 16                           // max number of clusters in column
80#define CORES_MAX             4                            // max number of cores per cluster
81#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters
[652]82#define THREADS_MAX           (X_MAX * Y_MAX * CORES_MAX)  // max number of threads
[646]83
[652]84#define IMAGE_SIZE            512                          // image size
[646]85#define IMAGE_TYPE            420                          // pixel encoding type
[652]86#define INPUT_FILE_PATH       "/misc/couple_512.raw"       // input file pathname
87#define OUTPUT_FILE_PATH      "/misc/transposed_512.raw"   // output file pathname
[646]88
[652]89#define SAVE_RESULT_FILE      0                            // save result image on disk
[646]90#define USE_DQT_BARRIER       1                            // quad-tree barrier if non zero
91
[652]92#define NO_PLACEMENT          0                            // uncontrolefdthread placement
93#define EXPLICIT_PLACEMENT    0                            // explicit threads placement
94#define PARALLEL_PLACEMENT    1                            // parallel threads placement
[646]95
[652]96#define VERBOSE_MAIN          0                            // main function print comments
97#define VERBOSE_EXEC          0                            // exec function print comments
98#define VERBOSE_INSTRU        0                            // instru function print comments
99
100
[646]101///////////////////////////////////////////////////////
102//                global variables
103///////////////////////////////////////////////////////
104
[652]105// global instrumentation counters for the main thread
106unsigned int SEQUENCIAL_TIME = 0;
107unsigned int PARALLEL_TIME   = 0;
108
109// instrumentation counters for each thread in each cluster
110// indexed by [cid][lid] : cluster continuous index / thread local index
111unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
112unsigned int LOAD_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[646]113unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
114unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
115unsigned int DISP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
116unsigned int DISP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
117
[652]118// pointer on buffer containing the input image, maped by the main to the input file
119unsigned char *  image_in;
[646]120
[652]121// pointer on buffer containing the output image, maped by the main to the output file
122unsigned char *  image_out;
123
124// arrays of pointers on distributed buffers indexed by [cid] : cluster continuous index
125unsigned char *  buf_in_ptr [CLUSTERS_MAX];
126unsigned char *  buf_out_ptr[CLUSTERS_MAX];
127
128// synchronisation barrier (all working threads)
[646]129pthread_barrier_t   barrier;
130
131// platform parameters
[652]132unsigned int  x_size;              // number of clusters in a row
133unsigned int  y_size;              // number of clusters in a column
134unsigned int  ncores;              // number of cores per cluster
[646]135
[652]136// main thread continuous index
137unsigned int     tid_main; 
[646]138
[652]139//return values at thread exit
140unsigned int THREAD_EXIT_SUCCESS = 0;
141unsigned int THREAD_EXIT_FAILURE = 1;
[646]142
[652]143// array of kernel thread identifiers / indexed by [tid]
144pthread_t                     exec_trdid[THREADS_MAX];   
[646]145
[652]146// array of execute function arguments / indexed by [tid]
147pthread_parallel_work_args_t  exec_args[THREADS_MAX];
[646]148
[652]149// array of thread attributes / indexed by [tid]
150pthread_attr_t                exec_attr[THREADS_MAX];
[646]151
152////////////////////////////////////////////////////////////////
153//             functions declaration
154////////////////////////////////////////////////////////////////
155
[652]156void execute( pthread_parallel_work_args_t * args );
[646]157
[652]158void instrument( FILE * f , char * filename );
[646]159
[652]160/////////////////
161void main( void )
[646]162{
[652]163    unsigned long long start_cycle;
164    unsigned long long end_sequencial_cycle;
165    unsigned long long end_parallel_cycle;
[646]166
[652]167    char               filename[32];      // instrumentation file name
168    char               pathname[64];      // instrumentation file pathname
169
[646]170    int error;
171
[652]172    /////////////////////////////////////////////////////////////////////////////////
173    get_cycle( &start_cycle );
174    /////////////////////////////////////////////////////////////////////////////////
[646]175
[652]176    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
177    {
178        printf("\n[transpose error] illegal placement\n");
179        exit( 0 );
180    }
[646]181
182    // get & check plat-form parameters
[652]183    get_config( &x_size,
184                &y_size,
185                &ncores );
[646]186
[652]187    if((ncores != 1) && (ncores != 2) && (ncores != 4))
[646]188    {
189        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
190        exit( 0 );
191    }
192
193    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
194        (x_size != 8) && (x_size != 16) )
195    {
196        printf("\n[transpose error] x_size must be 1/2/4/8/16\n");
197        exit( 0 );
198    }
199       
200    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
201        (y_size != 8) && (y_size != 16) )
202    {
203        printf("\n[transpose error] y_size must be 1/2/4/8/16\n");
204        exit( 0 );
205    }
206       
[652]207    // main thread get identifiers for core executing main
208    unsigned int  cxy_main;
209    unsigned int  lid_main;
210    get_core_id( &cxy_main , &lid_main );
[646]211
212    // compute number of threads
213    unsigned int nclusters = x_size * y_size;
214    unsigned int nthreads  = nclusters * ncores;
215
[652]216    // main thread get FBF size and type
[646]217    unsigned int   fbf_width;
218    unsigned int   fbf_height;
219    unsigned int   fbf_type;
220    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
221
222    if( (fbf_width != IMAGE_SIZE) || (fbf_height != IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) )
223    {
224        printf("\n[transpose error] image does not fit FBF size or type\n");
225        exit( 0 );
226    }
227
[652]228    if( nthreads > IMAGE_SIZE )
229    {
230        printf("\n[transpose error] number of threads larger than number of lines\n");
[646]231        exit( 0 );
[652]232    }
[646]233
[652]234    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;
[646]235
[652]236    // define instrumentation file name
237    if( NO_PLACEMENT )
238    {
239        printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
240        nclusters, ncores, fbf_width, fbf_height, getpid() );
241
242        // build instrumentation file name
243        if( USE_DQT_BARRIER )
244        snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d",
245        IMAGE_SIZE , x_size * y_size , ncores );
246        else
247        snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d",
248        IMAGE_SIZE , x_size * y_size , ncores );
249    }
250
251    if( EXPLICIT_PLACEMENT )
252    {
253        printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
254        nclusters, ncores, fbf_width, fbf_height, getpid() );
255
256        // build instrumentation file name
257        if( USE_DQT_BARRIER )
258        snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d",
259        IMAGE_SIZE , x_size * y_size , ncores );
260        else
261        snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d",
262        IMAGE_SIZE , x_size * y_size , ncores );
263    }
264
265    if( PARALLEL_PLACEMENT )
266    {
267        printf("\n[transpose] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
268        nclusters, ncores, fbf_width, fbf_height, getpid() );
269
270        // build instrumentation file name
271        if( USE_DQT_BARRIER )
272        snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d",
273        IMAGE_SIZE , x_size * y_size , ncores );
274        else
275        snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d",
276        IMAGE_SIZE , x_size * y_size , ncores );
277    }
278
279    // open instrumentation file
280    snprintf( pathname , 64 , "/home/%s", filename );
281    FILE * f = fopen( pathname , NULL );
282    if ( f == NULL ) 
[646]283    { 
[652]284        printf("\n[transpose error] cannot open instrumentation file %s\n", pathname );
[646]285        exit( 0 );
286    }
287
[652]288#if  VERBOSE_MAIN
289printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n",
290cxy_main, lid_main, pathname );
[646]291#endif
292
[652]293    // main thread initializes barrier
[646]294    if( USE_DQT_BARRIER )
295    {
296        pthread_barrierattr_t attr;
297        attr.x_size   = x_size;
298        attr.y_size   = y_size;
299        attr.nthreads = ncores;
300        error = pthread_barrier_init( &barrier, &attr , nthreads );
301    }
302    else
303    {
304        error = pthread_barrier_init( &barrier, NULL , nthreads );
305    }
306
307    if( error )
308    { 
309        printf("\n[transpose error] main cannot initialize barrier\n" );
310        exit( 0 );
311    }
312
[652]313#if  VERBOSE_MAIN
314printf("\n[transpose] main on core[%x,%d] completes barrier initialisation\n",
315cxy_main, lid_main );
316#endif
[646]317
[652]318    // main thread open input file
319    int fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); 
[646]320
[652]321    if ( fd_in < 0 ) 
322    { 
323        printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
324        exit( 0 );
325    }
326
327#if  VERBOSE_MAIN
328printf("\n[transpose] main open file <%s> / fd = %d\n", INPUT_FILE_PATH , fd_in );
329#endif
330
331    // main thread map image_in buffer to input image file
332    image_in = (unsigned char *)mmap( NULL,
333                                      npixels,
334                                      PROT_READ,
335                                      MAP_FILE | MAP_SHARED,
336                                      fd_in,
337                                      0 );     // offset
338    if ( image_in == NULL ) 
339    { 
340        printf("\n[transpose error] main cannot map buffer to file %s\n", INPUT_FILE_PATH );
341        exit( 0 );
342    }
343
344#if  VERBOSE_MAIN
345printf("\n[transpose] main map buffer to file <%s>\n", INPUT_FILE_PATH );
346#endif
347
348    // main thread display input image on FBF
349    if( fbf_write( image_in, 
350                   npixels,
351                   0 ) )
[646]352    {
[652]353        printf("\n[transpose error] main cannot access FBF\n");
354        exit( 0 );
355    }
356
357#if SAVE_RESULT_IMAGE
358
359    // main thread open output file
360    int fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); 
361
362    if ( fd_out < 0 ) 
363    { 
364        printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
365        exit( 0 );
366    }
367
368#if  VERBOSE_MAIN
369printf("\n[transpose] main open file <%s> / fd = %d\n", OUTPUT_FILE_PATH , fd_out );
370#endif
371
372    // main thread map image_out buffer to output image file
373    image_out = (unsigned char *)mmap( NULL,
374                                       npixels,
375                                       PROT_WRITE,
376                                       MAP_FILE | MAP_SHARED,
377                                       fd_out,
378                                       0 );     // offset
379    if ( image_out == NULL ) 
380    { 
381        printf("\n[transpose error] main cannot map buf_out to file %s\n", OUTPUT_FILE_PATH );
382        exit( 0 );
383    }
384
385#if  VERBOSE_MAIN
386printf("\n[transpose] main map buffer to file <%s>\n", OUTPUT_FILE_PATH );
387#endif
388
389#endif  // SAVE_RESULT_IMAGE
390
391    /////////////////////////////////////////////////////////////////////////////////////
392    get_cycle( &end_sequencial_cycle );
393    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
394    /////////////////////////////////////////////////////////////////////////////////////
395
396    //////////////////
397    if( NO_PLACEMENT )
398    {
399        // the tid value for the main thread is always 0
400        // main thread creates new threads with tid in [1,nthreads-1] 
401        unsigned int tid;
402        for ( tid = 0 ; tid < nthreads ; tid++ )
[646]403        {
[652]404            // register tid value in exec_args[tid] array
405            exec_args[tid].tid = tid;
406           
407            // create other threads
408            if( tid > 0 )
[646]409            {
[652]410                if ( pthread_create( &exec_trdid[tid], 
411                                     NULL,                  // no attribute
412                                     &execute,
413                                     &exec_args[tid] ) ) 
[646]414                {
[652]415                    printf("\n[transpose error] cannot create thread %d\n", tid );
416                    exit( 0 );
417                }
[646]418
[652]419#if VERBOSE_MAIN
420printf("\n[transpose] main created thread %d\n", tid );
[646]421#endif
[652]422
[646]423            }
[652]424            else
425            {
426                tid_main = 0;
427            }
428        }  // end for tid
[646]429
[652]430        // main thread calls itself the execute() function
431        execute( &exec_args[0] );
[646]432
[652]433        // main thread wait other threads completion
434        for ( tid = 1 ; tid < nthreads ; tid++ )
435        {
436            unsigned int * status;
437
438            // main wait thread[tid] status
439            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
440            {
441                printf("\n[transpose error] main cannot join thread %d\n", tid );
442                exit( 0 );
443            }
444       
445            // check status
446            if( *status != THREAD_EXIT_SUCCESS )
447            {
448                printf("\n[transpose error] thread %x returned failure\n", tid );
449                exit( 0 );
450            }
451
452#if VERBOSE_MAIN
453printf("\n[transpose] main successfully joined thread %x\n", tid );
454#endif
455       
456        }  // end for tid
457
458    }  // end if no_placement
459
460    ////////////////////////
461    if( EXPLICIT_PLACEMENT )
[646]462    {
[652]463        // main thread places each other threads on a specific core[cxy][lid]
464        // but the actual thread creation is sequencial
465        unsigned int x;
466        unsigned int y;
467        unsigned int l;
468        unsigned int cxy;                   // cluster identifier
469        unsigned int tid;                   // thread continuous index
470
471        for( x = 0 ; x < x_size ; x++ )
[646]472        {
[652]473            for( y = 0 ; y < y_size ; y++ )
[646]474            {
[652]475                cxy = HAL_CXY_FROM_XY( x , y );
476                for( l = 0 ; l < ncores ; l++ )
[646]477                {
[652]478                    // compute thread continuous index
479                    tid = (((* y_size) + y) * ncores) + l;
[646]480
[652]481                    // register tid value in exec_args[tid] array
482                    exec_args[tid].tid = tid;
483
484                    // no thread created on the core running the main
485                    if( (cxy != cxy_main) || (l != lid_main) )
[646]486                    {
[652]487                        // define thread attributes
488                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
489                                                    PT_ATTR_CORE_DEFINED;
490                        exec_attr[tid].cxy        = cxy;
491                        exec_attr[tid].lid        = l;
492 
493                        // create thread[tid] on core[cxy][l]
494                        if ( pthread_create( &exec_trdid[tid],   
495                                             &exec_attr[tid],   
496                                             &execute,
497                                             &exec_args[tid] ) )       
498                        {
499                            printf("\n[transpose error] cannot create thread %d\n", tid );
500                            exit( 0 );
501                        }
502#if VERBOSE_MAIN
503printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
504#endif
[646]505                    }
[652]506                    else
[646]507                    {
[652]508                        tid_main = tid;
[646]509                    }
510                }
511            }
512        }
513
[652]514        // main thread calls itself the execute() function
515        execute( &exec_args[tid_main] );
[646]516
[652]517        // main thread wait other threads completion
518        for( tid = 0 ; tid < nthreads ; tid++ )
[646]519        {
[652]520            // no other thread on the core running the main
521            if( tid != tid_main )
522            {
523                unsigned int * status;
[646]524
[652]525                // wait thread[tid]
526                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
527                {
528                    printf("\n[transpose error] main cannot join thread %d\n", tid );
529                    exit( 0 );
530                }
531       
532                // check status
533                if( *status != THREAD_EXIT_SUCCESS )
534                {
535                    printf("\n[transpose error] thread %d returned failure\n", tid );
536                    exit( 0 );
537                }
538#if VERBOSE_MAIN
539printf("\n[transpose] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
[646]540#endif
[652]541            }
542        }
543    }  // end if explicit_placement
[646]544
[652]545    ////////////////////////
546    if( PARALLEL_PLACEMENT )
[646]547    {
[652]548        // compute covering DQT size an level
549        unsigned int z          = (x_size > y_size) ? x_size : y_size;
550        unsigned int root_level = ((z == 1) ? 0 : 
551                                  ((z == 2) ? 1 : 
552                                  ((z == 4) ? 2 : 
553                                  ((z == 8) ? 3 : 4))));
[646]554
[652]555        // create & execute the working threads
556        if( pthread_parallel_create( root_level , &execute ) )
[646]557        {
[652]558            printf("\n[transpose error] in %s\n", __FUNCTION__ );
[646]559            exit( 0 );
560        }
[652]561    }  // end if parallel_placement
[646]562
563
[652]564    /////////////////////////////////////////////////////////////////////////////
565    get_cycle( &end_parallel_cycle );
566    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
567    /////////////////////////////////////////////////////////////////////////////
[646]568
[652]569    // main thread register instrumentation results
570    instrument( f , filename );
[646]571
[652]572    // main thread close input file
[646]573    close( fd_in );
[652]574
575#if SAVE_RESULT_IMAGE
576
577    // main thread close output file
[646]578    close( fd_out );
579
[652]580#endif
581
582    // main close instrumentation file
583    fclose( f );
584
585    // main thread suicide
[646]586    exit( 0 );
587   
588} // end main()
589
590
591
[652]592
593///////////////////////////////////////////////////
594void execute( pthread_parallel_work_args_t * args )
[646]595{
596    unsigned long long   date;
597 
[652]598    unsigned int l;                         // line index for loop
599    unsigned int p;                         // pixel index for loop
[646]600
[652]601    // WARNING
602    //A thread is identified by the tid index, defined in the "args" structure.
603    // This index being in range [0,nclusters*ncores-1] we can always write
604    //       tid == cid * ncores + lid
605    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
606    // if NO_PLACEMENT, there is no relation between these
607    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
[646]608
[652]609    // get thread abstract identifiers
610    unsigned int tid = args->tid;
611    unsigned int cid = tid / ncores;   
612    unsigned int lid = tid % ncores;
613
614#if VERBOSE_EXEC
615unsigned int cxy;
616unsigned int lpid;
617get_core_id( &cxy , &lpid );   // get core physical identifiers
618printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n",
619tid , cxy , lpid );
620#endif
621
622    get_cycle( &date );
623    LOAD_START[cid][lid] = (unsigned int)date;
624
[646]625    // build total number of pixels per image
626    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;     
627
[652]628    // build total number of threads and clusters
[646]629    unsigned int nclusters = x_size * y_size;
[652]630    unsigned int nthreads  = nclusters * ncores;
[646]631
[652]632    unsigned int buf_size = npixels / nclusters;     // number of bytes in buf_in & buf_out
633    unsigned int offset   = cid * buf_size;       // offset in file (bytes)
[646]634
[652]635    unsigned char  * buf_in = NULL;        // private pointer on local input buffer
636    unsigned char  * buf_out = NULL;       // private pointer on local output buffer
[646]637
[652]638    // Each thread[cid,0] allocate a local buffer buf_in, and register
639    // the base adress in the global variable buf_in_ptr[cid]
640    // this local buffer is shared by all threads with the same cid
641    if( lid == 0 )
[646]642    {
[652]643        // allocate buf_in
644        buf_in = (unsigned char *)malloc( buf_size );
[646]645
[652]646        if( buf_in == NULL )
[646]647        {
[652]648            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
[646]649            pthread_exit( &THREAD_EXIT_FAILURE );
650        }
[652]651
652        // register buf_in buffer in global array of pointers
653        buf_in_ptr[cid] = buf_in;
654
655#if VERBOSE_EXEC
656printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n",
657tid , cxy , lpid , buf_in );
[646]658#endif
659
[652]660    }
[646]661
[652]662    // Each thread[cid,0] copy relevant part of the image_in to buf_in
663    if( lid == 0 )
664    {
665        memcpy( buf_in,
666                image_in + offset,
667                buf_size );
668    } 
669
670#if VERBOSE_EXEC
671printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n",
672tid , cxy , lpid , cid );
673#endif
674
675    // Each thread[cid,0] allocate a local buffer buf_out, and register
676    // the base adress in the global variable buf_out_ptr[cid]
677    if( lid == 0 )
678    {
679        // allocate buf_out
680        buf_out = (unsigned char *)malloc( buf_size );
681
682        if( buf_out == NULL )
[646]683        {
[652]684            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
[646]685            pthread_exit( &THREAD_EXIT_FAILURE );
686        }
[652]687
688        // register buf_in buffer in global array of pointers
689        buf_out_ptr[cid] = buf_out;
690
691#if VERBOSE_EXEC
692printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n",
693tid , cxy , lpid , buf_out );
[646]694#endif
695
696    }
[652]697   
[646]698    get_cycle( &date );
[652]699    LOAD_END[cid][lid] = (unsigned int)date;
[646]700
701    /////////////////////////////////
702    pthread_barrier_wait( &barrier );
703
[652]704    get_cycle( &date );
705    TRSP_START[cid][lid] = (unsigned int)date;
706
707    // All threads contribute to parallel transpose from buf_in to buf_out
708    // each thread makes the transposition for nlt lines (nlt = npixels/nthreads)
[646]709    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
710    // (p,l) are the absolute pixel coordinates in the source image
[652]711    // (l,p) are the absolute pixel coordinates in the source image
712    // (p,l) are the absolute pixel coordinates in the dest image
[646]713
714    get_cycle( &date );
[652]715    TRSP_START[cid][lid] = (unsigned int)date;
[646]716
717    unsigned int nlt   = IMAGE_SIZE / nthreads;    // number of lines per thread
718    unsigned int nlc   = IMAGE_SIZE / nclusters;   // number of lines per cluster
719
[652]720    unsigned int src_cid;
[646]721    unsigned int src_index;
[652]722    unsigned int dst_cid;
[646]723    unsigned int dst_index;
724
725    unsigned char byte;
726
[652]727    unsigned int first = tid * nlt;     // first line index for a given thread
[646]728    unsigned int last  = first + nlt;      // last line index for a given thread
729
[652]730    // loop on lines handled by this thread
[646]731    for ( l = first ; l < last ; l++ )
732    {
[652]733        // loop on pixels in one line (one pixel per iteration)
[646]734        for ( p = 0 ; p < IMAGE_SIZE ; p++ )
735        {
736            // read one byte from local buf_in
[652]737            src_cid   = l / nlc;
738            src_index = (l % nlc) * IMAGE_SIZE + p;
[646]739
[652]740            byte        = buf_in_ptr[src_cid][src_index];
741
[646]742            // write one byte to remote buf_out
[652]743            dst_cid   = p / nlc; 
744            dst_index = (p % nlc) * IMAGE_SIZE + l;
[646]745
[652]746            buf_out_ptr[dst_cid][dst_index] = byte;
[646]747        }
748    }
749
[652]750#if VERBOSE_EXEC
751printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n",
752tid , cxy , lpid );
[646]753#endif
754
755    get_cycle( &date );
[652]756    TRSP_END[cid][lid] = (unsigned int)date;
[646]757
758    /////////////////////////////////
759    pthread_barrier_wait( &barrier );
760
761    get_cycle( &date );
[652]762    DISP_START[cid][lid] = (unsigned int)date;
[646]763
[652]764    // All threads contribute to parallel display
765    // from local buf_out to frame buffer
[646]766    unsigned int  npt   = npixels / nthreads;   // number of pixels per thread
767
[652]768    if( fbf_write( &buf_out_ptr[cid][lid * npt], 
[646]769                   npt,
[652]770                   npt * tid ) )
[646]771    {
[652]772        printf("\n[transpose error] thread[%d] cannot access FBF\n", tid );
[646]773        pthread_exit( &THREAD_EXIT_FAILURE );
774    }
775
[652]776#if VERBOSE_EXEC
777printf("\n[transpose] exec[%d] on core [%x,%d] completes display\n",
778tid, cxy , lpid );
[646]779#endif
780
781    get_cycle( &date );
[652]782    DISP_END[cid][lid] = (unsigned int)date;
[646]783
784    /////////////////////////////////
785    pthread_barrier_wait( &barrier );
786
[652]787#if SAVE_RESULT_IMAGE
788
789    // Each thread[cid,0] copy buf_out to relevant part of image_out
790    if( lid == 0 )
791    {
792        memcpy( image_out + offset,
793                buf_out,
794                buf_size );
795    } 
796
797#if VERBOSE_EXEC
798printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n",
799tid , cxy , lpid , cid );
800#endif
801
802#endif
803
804    // Each thread[cid,0] releases local buffer buf_out
805    if( lid == 0 )
806    {
807        // release buf_out
808        free( buf_in );
809        free( buf_out );
810    }
811   
812    // thread termination depends on the placement policy
813    if( PARALLEL_PLACEMENT )   
814    {
815        // <work> threads are runing in detached mode
816        // each thread must signal completion by calling barrier
817        // passed in arguments before exit
818
819        pthread_barrier_wait( args->barrier );
820
[646]821        pthread_exit( &THREAD_EXIT_SUCCESS );
822    }
[652]823    else
824    {
825        // <work> threads are running in attached mode
826        // each thread, but de main, simply exit
827        if ( tid != tid_main )  pthread_exit( &THREAD_EXIT_SUCCESS );
828    }
[646]829
830} // end execute()
831
832
833
[652]834///////////////////////////
835void instrument( FILE * f,
836                 char * filename )
[646]837{
838    unsigned int x, y, l;
839
[652]840#if VERBOSE_EXEC
841printf("\n[transpose] main enters instrument\n" );
842#endif
843
[646]844    unsigned int min_load_start = 0xFFFFFFFF;
845    unsigned int max_load_start = 0;
846    unsigned int min_load_ended = 0xFFFFFFFF;
847    unsigned int max_load_ended = 0;
848    unsigned int min_trsp_start = 0xFFFFFFFF;
849    unsigned int max_trsp_start = 0;
850    unsigned int min_trsp_ended = 0xFFFFFFFF;
851    unsigned int max_trsp_ended = 0;
852    unsigned int min_disp_start = 0xFFFFFFFF;
853    unsigned int max_disp_start = 0;
854    unsigned int min_disp_ended = 0xFFFFFFFF;
855    unsigned int max_disp_ended = 0;
856 
857    for (x = 0; x < x_size; x++)
858    {
859        for (y = 0; y < y_size; y++)
860        {
[652]861            unsigned int cid = y_size * x + y;
[646]862
863            for ( l = 0 ; l < ncores ; l++ )
864            {
[652]865                if (LOAD_START[cid][l] < min_load_start)  min_load_start = LOAD_START[cid][l];
866                if (LOAD_START[cid][l] > max_load_start)  max_load_start = LOAD_START[cid][l];
867                if (LOAD_END[cid][l]   < min_load_ended)  min_load_ended = LOAD_END[cid][l]; 
868                if (LOAD_END[cid][l]   > max_load_ended)  max_load_ended = LOAD_END[cid][l];
869                if (TRSP_START[cid][l] < min_trsp_start)  min_trsp_start = TRSP_START[cid][l];
870                if (TRSP_START[cid][l] > max_trsp_start)  max_trsp_start = TRSP_START[cid][l];
871                if (TRSP_END[cid][l]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cid][l];
872                if (TRSP_END[cid][l]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cid][l];
873                if (DISP_START[cid][l] < min_disp_start)  min_disp_start = DISP_START[cid][l];
874                if (DISP_START[cid][l] > max_disp_start)  max_disp_start = DISP_START[cid][l];
875                if (DISP_END[cid][l]   < min_disp_ended)  min_disp_ended = DISP_END[cid][l];
876                if (DISP_END[cid][l]   > max_disp_ended)  max_disp_ended = DISP_END[cid][l];
[646]877            }
878        }
879    }
880
[652]881    printf( "\n ------ %s ------\n" , filename );
882    fprintf( f , "\n ------ %s ------\n" , filename );
[646]883
[652]884    printf( " - LOAD_START : min = %d / max = %d / delta = %d\n",
885           min_load_start, max_load_start, max_load_start-min_load_start ); 
886    fprintf( f , " - LOAD_START : min = %d / max = %d / delta = %d\n",
887           min_load_start, max_load_start, max_load_start-min_load_start ); 
[646]888
[652]889    printf( " - LOAD_END   : min = %d / max = %d / delta = %d\n",
890           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
891    fprintf( f , " - LOAD_END   : min = %d / max = %d / delta = %d\n",
892           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
[646]893
[652]894    printf( " - TRSP_START : min = %d / max = %d / delta = %d\n",
895           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
896    fprintf( f , " - TRSP_START : min = %d / max = %d / delta = %d\n",
897           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
[646]898
[652]899    printf( " - TRSP_END   : min = %d / max = %d / delta = %d\n",
900           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
901    fprintf( f , " - TRSP_END   : min = %d / max = %d / delta = %d\n",
902           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
[646]903
[652]904    printf( " - DISP_START : min = %d / max = %d / delta = %d\n",
905           min_disp_start, max_disp_start, max_disp_start-min_disp_start ); 
906    fprintf( f , " - DISP_START : min = %d / max = %d / delta = %d\n",
907           min_disp_start, max_disp_start, max_disp_start-min_disp_start ); 
[646]908
[652]909    printf( " - DISP_END   : min = %d / max = %d / delta = %d\n",
910           min_disp_ended, max_disp_ended, max_disp_ended-min_disp_ended ); 
911    fprintf( f , " - DISP_END   : min = %d / max = %d / delta = %d\n",
912           min_disp_ended, max_disp_ended, max_disp_ended-min_disp_ended ); 
[646]913
[652]914    printf( "\n   Sequencial = %d / Parallel = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
915    fprintf( f , "\n   Sequencial = %d / Parallel = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
[646]916
917}  // end instrument()
918
919
920
921
Note: See TracBrowser for help on using the repository browser.