source: trunk/user/convol/convol.c @ 677

Last change on this file since 677 was 676, checked in by alain, 4 years ago

Introduce chat application to test the named pipes.

File size: 44.7 KB
Line 
///////////////////////////////////////////////////////////////////////////////////////
// File   : convol.c
// Date   : june 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a 2D convolution product.
// It can run on a multi-core, multi-cluster architecture, with one thread
// per core, and uses the POSIX threads API.
//
// The input image is read from a file and the output image is saved to another file.
//
// - the number of clusters containing processors must be a power of 2 no larger than 256.
// - the number of processors per cluster must be a power of 2 no larger than 4.
// - the number of working threads is the number of cores available in the hardware
//   architecture : nthreads = nclusters * ncores.
//
// The convolution kernel is defined in the execute() function.
// It can be factored in two independent line and column convolution products.
//
// The main() function can be launched on any processor.
// - It checks software requirements versus the hardware resources.
// - It opens & maps the input file to a global <image_in> buffer.
// - It opens & maps the output file to another global <image_out> buffer.
// - It opens the instrumentation file.
// - It creates & activates two FBF windows to display input & output images.
// - It launches other threads to run in parallel the execute() function.
// - It saves the instrumentation results on disk.
// - It closes the input, output, & instrumentation files.
// - It deletes the FBF input & output windows.
//
// The execute() function is executed in parallel by all threads. These threads are
// working on 5 arrays of distributed buffers, indexed by the cluster index [cid].
// - A[cid]: contains the distributed initial image (NL/NCLUSTERS lines per cluster).
// - B[cid]: is the result of the horizontal filter, then transpose : B <= Trsp(HF(A))
// - C[cid]: is the result of the vertical filter, then transpose : C <= Trsp(VF(B))
// - D[cid]: is the difference between A and HF(A) : D <= A - HF(A)
// - Z[cid]: contains the distributed final image : Z <= C + D
//
// It can be split in four phases separated by synchronisation barriers:
// 1. Initialisation:
//    Allocates the 5 A[cid],B[cid],C[cid],D[cid],Z[cid] buffers, initialises A[cid]
//    from the <image_in> buffer, and displays the initial image on FBF if required.
// 2. Horizontal Filter:
//    Set B[cid] and D[cid] from A[cid]. Read data accesses are local, write data
//    accesses are remote, to implement the transpose.
// 3. Vertical Filter:
//    Set C[cid] from B[cid]. Read data accesses are local, write data accesses
//    are remote, to implement the transpose.
// 4. Save results:
//    Set Z[cid] from C[cid] and D[cid]. All read and write accesses are local.
//    Move the final image (Z[cid] buffer) to the <image_out> buffer.
//
// This application supports three placement modes, implemented in the main() function.
// In all modes, the working threads are identified by the [tid] continuous index
// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
// This continuous index can always be decomposed in two continuous sub-indexes:
// tid == cid * NCORES + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
//
// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
//   threads are created by the main thread, but the placement is done by the OS, using
//   the DQDT for load balancing, and two working threads can be placed on the same core.
//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
//   cluster or a physical core. In this mode, the main thread runs on any cluster,
//   but has tid = 0 (i.e. cid = 0 & lid = 0).
//
// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
//   of the threads on the cores is explicitly controlled by the main thread to have
//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
//   physical cluster identifier, and [lid] is the local core index.
//
// - PARALLEL_PLACEMENT: the main thread is no longer a working thread, and uses the
//   non-standard pthread_parallel_create() function to avoid the costly sequential
//   loops for pthread_create() and pthread_join(). It guarantees one working thread
//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
//
// The [tid] continuous index defines how the work is shared amongst the threads:
// - each thread handles NL/nthreads lines for the horizontal filter.
// - each thread handles NP/nthreads columns for the vertical filter.
///////////////////////////////////////////////////////////////////////////////////////

82#include <sys/mman.h>
83#include <stdio.h>
84#include <stdlib.h>
85#include <fcntl.h>
86#include <unistd.h>
87#include <pthread.h>
88#include <string.h>
89#include <almosmkh.h>
90#include <hal_macros.h>
91
// Debug trace switches (0 = silent, 1 = trace).
#define VERBOSE_MAIN               1
#define VERBOSE_EXEC               1
#define SUPER_VERBOSE              0

// Upper bounds used to size the static, per-cluster and per-thread arrays.
#define X_MAX                      16
#define Y_MAX                      16
#define CORES_MAX                  4
#define CLUSTERS_MAX               (X_MAX * Y_MAX)
#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)

// Default image description, used when INTERACTIVE_MODE is 0.
#define IMAGE_TYPE                 420                         // pixel encoding type
#define INPUT_IMAGE_PATH           "misc/couple_512.raw"       // default image_in
#define OUTPUT_IMAGE_PATH          "misc/couple_conv_512.raw"  // default image_out
#define NL                         512                         // default nlines
#define NP                         512                         // default npixels

// Thread placement mode: main() requires that exactly one of these three is 1.
#define NO_PLACEMENT               0
#define EXPLICIT_PLACEMENT         0
#define PARALLEL_PLACEMENT         1

// Miscellaneous run-time options.
#define INTERACTIVE_MODE           0
#define USE_DQT_BARRIER            1
#define INITIAL_DISPLAY_ENABLE     1
#define FINAL_DISPLAY_ENABLE       1

// Accessors for the distributed buffers: c is the cluster index, l the line index
// and p the pixel index. They expect arrays of pointers named A, B, C, D, Z to be
// visible in the calling scope. TB is indexed [pixel][line], the others [line][pixel].
#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
#define TB(c,p,l)  (B[c][((NL) * (p)) + (l)])
#define TC(c,l,p)  (C[c][((NP) * (l)) + (p)])
#define TD(c,l,p)  (D[c][((NP) * (l)) + (p)])
#define TZ(c,l,p)  (Z[c][((NP) * (l)) + (p)])

// WARNING: both macros evaluate their arguments twice; do not pass expressions
// with side effects.
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

126//////////////////////////////////////////////////////////
127//            global variables
128//////////////////////////////////////////////////////////
129
130// global instrumentation counters for the main thread
131unsigned int SEQUENCIAL_TIME = 0;
132unsigned int PARALLEL_TIME   = 0;
133
134// instrumentation counters for thread[tid] in cluster[cid]
135unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
136unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
137unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
138unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
139unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
140unsigned int F_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
141unsigned int F_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
142
143// pointer on buffer containing the input image, maped by the main to the input file
144unsigned char *  image_in;
145
146// pointer on buffer containing the output image, maped by the main to the output file
147unsigned char *  image_out;
148
149// return values at thread exit
150unsigned int THREAD_EXIT_SUCCESS = 0;
151unsigned int THREAD_EXIT_FAILURE = 1;
152
153// pointer and identifier for FBF windows
154void   *  in_win_buf;
155int       in_wid;
156void   *  out_win_buf;
157int       out_wid;
158
159// synchronization barrier
160pthread_barrier_t     barrier;
161
162// platform parameters
163unsigned int  x_size;              // number of clusters in a row
164unsigned int  y_size;              // number of clusters in a column
165unsigned int  ncores;              // number of processors per cluster
166
167// main thread continuous index
168unsigned int     tid_main;
169
170// arrays of pointers on distributed buffers in all clusters
171unsigned char  * GA[CLUSTERS_MAX];
172int            * GB[CLUSTERS_MAX];
173int            * GC[CLUSTERS_MAX];
174int            * GD[CLUSTERS_MAX];
175unsigned char  * GZ[CLUSTERS_MAX];
176
177// array of threads kernel identifiers / indexed by [tid]
178pthread_t        exec_trdid[THREADS_MAX];
179
180// array of threads attributes / indexed bi [tid]
181pthread_attr_t   exec_attr[THREADS_MAX]; 
182
183// array of execute() function arguments / indexed by [tid]
184pthread_parallel_work_args_t exec_args[THREADS_MAX];
185
186// image features
187unsigned int   image_nl;
188unsigned int   image_np;
189char           input_image_path[128];
190char           output_image_path[128];
191
192/////////////////////////////////////////////////////////////////////////////////////
193//           functions declaration
194/////////////////////////////////////////////////////////////////////////////////////
195
196void * execute( void * args );
197
198void instrument( FILE * f , char * filename );
199
200/////////////////
201void main( void )
202/////////////////
203{
204    unsigned long long start_cycle;
205    unsigned long long end_sequencial_cycle;
206    unsigned long long end_parallel_cycle;
207
208    int          error;
209
210    char         instru_name[32];               // instrumentation file name
211    char         instru_path[64];               // instrumentation path name
212
213    /////////////////////////////////////////////////////////////////////////////////
214    get_cycle( &start_cycle );
215    /////////////////////////////////////////////////////////////////////////////////
216
217    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
218    {
219        printf("\n[convol error] illegal placement\n");
220        exit( 0 );
221    }
222
223    // get & check platform parameters
224    hard_config_t  config;
225    get_config( &config );
226    x_size = config.x_size;
227    y_size = config.y_size;
228    ncores = config.ncores;
229
230    if((ncores != 1) && (ncores != 2) && (ncores != 4))
231    {
232        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
233        exit( 0 );
234    }
235
236    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
237        (x_size != 8) && (x_size != 16) )
238    {
239        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
240        exit( 0 );
241    }
242       
243    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
244        (y_size != 8) && (y_size != 16) )
245    {
246        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
247        exit( 0 );
248    }
249       
250    // main thread get identifiers for core executing main
251    unsigned int  cxy_main;
252    unsigned int  lid_main;
253    get_core_id( &cxy_main , &lid_main );
254
255    // compute nthreads and nclusters
256    unsigned int nclusters = x_size * y_size;
257    unsigned int nthreads  = nclusters * ncores;
258
259    // get input and output images pathnames and size
260    if( INTERACTIVE_MODE )
261    {
262        // get image size
263        printf("\n[convol] image nlines      : ");
264        get_uint32( &image_nl );
265
266        printf("\n[convol] image npixels     : ");
267        get_uint32( &image_np );
268
269        printf("\n[convol] input image path  : ");
270        get_string( input_image_path , 128 );
271
272        printf("[convol] output image path : ");
273        get_string( output_image_path , 128 );
274    }
275    else
276    {
277        image_nl = NL;
278        image_np = NP;
279        strcpy( input_image_path  , INPUT_IMAGE_PATH );
280        strcpy( output_image_path , OUTPUT_IMAGE_PATH );
281    }
282
283    // main thread get FBF size and type
284    int   fbf_width;
285    int   fbf_height;
286    int   fbf_type;
287    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
288
289    if( ((unsigned int)fbf_width  < image_np) || 
290        ((unsigned int)fbf_height < image_nl) || 
291        (fbf_type != IMAGE_TYPE) )
292    {
293        printf("\n[convol error] image not acceptable\n"
294               "FBF width  = %d / npixels  = %d\n"
295               "FBF height = %d / nlines   = %d\n"
296               "FBF type   = %d / expected = %d\n",
297               fbf_width, image_np, fbf_height, image_nl, fbf_type, IMAGE_TYPE );
298        exit( 0 );
299    }
300
301    if( nthreads > image_nl )
302    {
303        printf("\n[convol error] nthreads (%d] larger than nlines (%d)\n",
304        nthreads , image_nl );
305        exit( 0 );
306    }
307
308    // define instrumentation file name
309    if( NO_PLACEMENT )
310    {
311        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
312        nclusters, ncores, fbf_width, fbf_height, getpid() );
313
314        // build instrumentation file name
315        if( USE_DQT_BARRIER )
316        snprintf( instru_name , 32 , "dqt_no_place_%d_%d", x_size * y_size , ncores );
317        else
318        snprintf( instru_name , 32 , "smp_no_place_%d_%d", x_size * y_size , ncores );
319    }
320
321    if( EXPLICIT_PLACEMENT )
322    {
323        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
324        nclusters, ncores, fbf_width, fbf_height, getpid() );
325
326        // build instrumentation file name
327        if( USE_DQT_BARRIER )
328        snprintf( instru_name , 32 , "dqt_explicit_%d_%d", x_size * y_size , ncores );
329        else
330        snprintf( instru_name , 32 , "smp_explicit_%d_%d", x_size * y_size , ncores );
331    }
332
333    if( PARALLEL_PLACEMENT )
334    {
335        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
336        nclusters, ncores, fbf_width, fbf_height, getpid() );
337
338        // build instrumentation file name
339        if( USE_DQT_BARRIER )
340        snprintf( instru_name , 32 , "dqt_parallel_%d_%d", x_size * y_size , ncores );
341        else
342        snprintf( instru_name , 32 , "smp_parallel_%d_%d", x_size * y_size , ncores );
343    }
344
345    // open instrumentation file
346    snprintf( instru_path , 64 , "/home/convol/%s", instru_name );
347    FILE * f_instru = fopen( instru_path , NULL );
348    if ( f_instru == NULL ) 
349    { 
350        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
351        exit( 0 );
352    }
353
354#if  VERBOSE_MAIN
355printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
356cxy_main, lid_main, instru_path );
357#endif
358
359    // main create an FBF window for input image
360    in_wid = fbf_create_window( 0,                   // l_zero
361                                0,                   // p_zero
362                                image_nl,            // lines
363                                image_np,            // pixels
364                                &in_win_buf );
365    if( in_wid < 0 ) 
366    {
367        printf("\n[transpose error] cannot open FBF window for %s\n",
368        input_image_path);
369        exit( 0 );
370    }
371
372    // activate window
373    error = fbf_active_window( in_wid , 1 );
374
375    if( error )
376    {
377        printf("\n[transpose error] cannot activate window for %s\n",
378        input_image_path );
379        exit( 0 );
380    }
381
382#if  VERBOSE_MAIN
383printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
384cxy_main, lid_main, in_wid, input_image_path );
385#endif
386
387    // main create an FBF window for output image
388    out_wid = fbf_create_window( 0,                   // l_zero
389                                 image_np,            // p_zero
390                                 image_nl,            // lines
391                                 image_np,            // pixels
392                                 &out_win_buf );
393    if( out_wid < 0 ) 
394    {
395        printf("\n[transpose error] cannot create FBF window for %s\n",
396        output_image_path);
397        exit( 0 );
398    }
399
400    // activate window
401    error = fbf_active_window( out_wid , 1 );
402
403    if( error )
404    {
405        printf("\n[transpose error] cannot activate window for %s\n",
406        output_image_path );
407        exit( 0 );
408    }
409
410#if  VERBOSE_MAIN
411printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
412cxy_main, lid_main, out_wid, output_image_path );
413#endif
414
415    // main initialise barrier
416    if( USE_DQT_BARRIER )
417    {
418        pthread_barrierattr_t attr;
419        attr.x_size   = x_size;
420        attr.y_size   = y_size;
421        attr.nthreads = ncores;
422        error = pthread_barrier_init( &barrier, &attr , nthreads );
423    }
424    else
425    {
426        error = pthread_barrier_init( &barrier, NULL , nthreads );
427    }
428
429    if( error )
430    {
431        printf("\n[convol error] cannot initialize barrier\n");
432        exit( 0 );
433    }
434
435#if VERBOSE_MAIN
436printf("\n[convol] main on core[%x,%d] completed barrier init\n", 
437cxy_main, lid_main );
438#endif
439
440    // main open input file
441    int fd_in = open( input_image_path , O_RDONLY , 0 );
442
443    if ( fd_in < 0 ) 
444    { 
445        printf("\n[convol error] cannot open input file <%s>\n", input_image_path );
446        exit( 0 );
447    }
448
449    // main thread map input file to image_in buffer
450    image_in = (unsigned char *)mmap( NULL,
451                                      image_np * image_nl,
452                                      PROT_READ,
453                                      MAP_FILE | MAP_SHARED,
454                                      fd_in,
455                                      0 );           // offset
456    if ( image_in == NULL ) 
457    { 
458        printf("\n[convol error] main cannot map buffer to file %s\n", input_image_path );
459        exit( 0 );
460    }
461
462#if  VERBOSE_MAIN
463printf("\n[convol] main on core[%x,%x] map <image_in> buffer to file <%s>\n",
464cxy_main, lid_main, input_image_path );
465#endif
466
467    // main thread open output file
468    int fd_out = open( output_image_path , O_CREAT , 0 ); 
469
470    if ( fd_out < 0 ) 
471    { 
472        printf("\n[convol error] main cannot open file %s\n", output_image_path );
473        exit( 0 );
474    }
475
476    // main thread map image_out buffer to output file
477    image_out = (unsigned char *)mmap( NULL,
478                                       image_np * image_nl,
479                                       PROT_WRITE,
480                                       MAP_FILE | MAP_SHARED,
481                                       fd_out,
482                                       0 );     // offset
483    if ( image_out == NULL ) 
484    { 
485        printf("\n[convol error] main cannot map buffer to file %s\n", output_image_path );
486        exit( 0 );
487    }
488
489#if  VERBOSE_MAIN
490printf("\n[convol] main on core[%x,%x] map <image_out> buffer to file <%s>\n",
491cxy_main, lid_main, output_image_path );
492#endif
493
494    /////////////////////////////////////////////////////////////////////////////////////
495    get_cycle( &end_sequencial_cycle );
496    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
497    /////////////////////////////////////////////////////////////////////////////////////
498
499//////////////////
500#if NO_PLACEMENT
501{
502    // the tid value for the main thread is always 0
503    // main thread creates other threads with tid in [1,nthreads-1] 
504    unsigned int tid;
505    for ( tid = 0 ; tid < nthreads ; tid++ )
506    {
507        // register tid value in exec_args[tid] array
508        exec_args[tid].tid = tid;
509         
510        // create other threads
511        if( tid > 0 )
512        {
513            if ( pthread_create( &exec_trdid[tid], 
514                                 NULL,                  // no attribute
515                                 &execute,
516                                 &exec_args[tid] ) ) 
517            {
518                printf("\n[convol error] cannot create thread %d\n", tid );
519                exit( 0 );
520            }
521
522#if VERBOSE_MAIN
523printf("\n[convol] main created thread %d\n", tid );
524#endif
525
526        }
527        else
528        {
529            tid_main = 0;
530        }
531    }  // end for tid
532
533    // main thread calls itself the execute() function
534    execute( &exec_args[0] );
535
536    // main thread wait other threads completion
537    for ( tid = 1 ; tid < nthreads ; tid++ )
538    {
539        unsigned int * status;
540
541        // main wait thread[tid] status
542        if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
543        {
544            printf("\n[convol error] main cannot join thread %d\n", tid );
545            exit( 0 );
546        }
547       
548        // check status
549        if( *status != THREAD_EXIT_SUCCESS )
550        {
551            printf("\n[convol error] thread %x returned failure\n", tid );
552            exit( 0 );
553        }
554
555#if VERBOSE_MAIN
556printf("\n[convol] main successfully joined thread %x\n", tid );
557#endif
558       
559    }  // end for tid
560} 
561#endif // end no_placement
562
563//////////////////////
564#if EXPLICIT_PLACEMENT
565{
566    // main thread places each other threads on a specific core[cxy][lid]
567    // but the actual thread creation is sequencial
568    unsigned int x;
569    unsigned int y;
570    unsigned int l;
571    unsigned int cxy;                   // cluster identifier
572    unsigned int tid;                   // thread continuous index
573
574    for( x = 0 ; x < x_size ; x++ )
575    {
576        for( y = 0 ; y < y_size ; y++ )
577        {
578            cxy = HAL_CXY_FROM_XY( x , y );
579            for( l = 0 ; l < ncores ; l++ )
580            {
581                // compute thread continuous index
582                tid = (((* y_size) + y) * ncores) + l;
583
584                // register tid value in exec_args[tid] array
585                exec_args[tid].tid = tid;
586
587                // no thread created on the core running the main
588                if( (cxy != cxy_main) || (l != lid_main) )
589                {
590                    // define thread attributes
591                    exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
592                                                PT_ATTR_CORE_DEFINED;
593                    exec_attr[tid].cxy        = cxy;
594                    exec_attr[tid].lid        = l;
595 
596                    // create thread[tid] on core[cxy][l]
597                    if ( pthread_create( &exec_trdid[tid],   
598                                         &exec_attr[tid],   
599                                         &execute,
600                                         &exec_args[tid] ) )       
601                    {
602                        printf("\n[convol error] cannot create thread %d\n", tid );
603                        exit( 0 );
604                    }
605#if VERBOSE_MAIN
606printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
607#endif
608                }
609                else
610                {
611                    tid_main = tid;
612                }
613            }
614        }
615    }
616
617    // main thread calls itself the execute() function
618    execute( &exec_args[tid_main] );
619
620    // main thread wait other threads completion
621    for( tid = 0 ; tid < nthreads ; tid++ )
622    {
623        // no other thread on the core running the main
624        if( tid != tid_main )
625        {
626            unsigned int * status;
627
628            // wait thread[tid]
629            if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
630            {
631                printf("\n[convol error] main cannot join thread %d\n", tid );
632                exit( 0 );
633            }
634     
635            // check status
636            if( *status != THREAD_EXIT_SUCCESS )
637            {
638                printf("\n[convol error] thread %d returned failure\n", tid );
639                exit( 0 );
640            }
641#if VERBOSE_MAIN
642printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
643#endif
644        }
645    }
646} 
647#endif   // end explicit_placement
648
649//////////////////////
650#if PARALLEL_PLACEMENT
651{
652    // compute covering DQT size an level
653    unsigned int z          = (x_size > y_size) ? x_size : y_size;
654    unsigned int root_level = ((z == 1) ? 0 : 
655                              ((z == 2) ? 1 : 
656                              ((z == 4) ? 2 : 
657                              ((z == 8) ? 3 : 4))));
658
659    // create & execute the working threads
660    if( pthread_parallel_create( root_level , &execute ) )
661    {
662        printf("\n[convol error] in %s\n", __FUNCTION__ );
663        exit( 0 );
664    }
665}
666#endif  // end parallel_placement
667
668    /////////////////////////////////////////////////////////////////////////////
669    get_cycle( &end_parallel_cycle );
670    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
671    /////////////////////////////////////////////////////////////////////////////
672
673    // main thread register instrumentation results
674    instrument( f_instru , instru_name );
675
676#if VERBOSE_MAIN
677printf("\n[convol] main registered instrumentation info\n" );
678#endif
679
680    // main thread close input file
681    close( fd_in );
682
683#if VERBOSE_MAIN
684printf("\n[convol] main closed input file\n" );
685#endif
686
687    // main thread close output file
688    close( fd_out );
689
690#if VERBOSE_MAIN
691printf("\n[convol] main closed output file\n" );
692#endif
693
694    // main thread close instrumentation file
695    fclose( f_instru );
696
697#if VERBOSE_MAIN
698printf("\n[convol] main closed instrumentation file\n" );
699#endif
700
701    // ask confirm for exit
702    if( INTERACTIVE_MODE )
703    {
704        char byte;
705        printf("\n[convol] press any key to to delete FBF windows and exit\n");
706        getc( &byte );
707    }
708 
709    // main thread delete FBF windows
710    fbf_delete_window( in_wid );
711    fbf_delete_window( out_wid );
712
713#if VERBOSE_MAIN
714printf("\n[convol] main deleted FBF windows\n" );
715#endif
716
717    // main thread suicide
718    exit( 0 );
719   
720} // end main()
721
722
723
724
725
726
727
728
729
730
731//////////////////////////////////
732void * execute( void * arguments )
733//////////////////////////////////
734{
735    unsigned long long date;
736
737    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;
738
739    // Each thread initialises the convolution kernel parameters in local stack.
740    // The values defined in the next 12 lines are Philips proprietary information.
741
742    int   vnorm  = 115;
743    int   vf[35] = { 1, 1, 2, 2, 2,
744                     2, 3, 3, 3, 4,
745                     4, 4, 4, 5, 5,
746                     5, 5, 5, 5, 5,
747                     5, 5, 4, 4, 4,
748                     4, 3, 3, 3, 2,
749                     2, 2, 2, 1, 1 };
750
751    unsigned int hrange = 100;
752    unsigned int hnorm  = 201;
753
754    // WARNING
755    //A thread is identified by the tid index, defined in the "args" structure.
756    // This index being in range [0,nclusters*ncores-1] we can always write
757    //       tid == cid * ncores + lid
758    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
759    // if NO_PLACEMENT, there is no relation between these
760    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
761
762    // get thread abstract identifiers[cid,lid]  from tid
763    unsigned int tid = args->tid;
764    unsigned int cid = tid / ncores;   
765    unsigned int lid = tid % ncores;
766
767#if VERBOSE_EXEC
768unsigned int cxy;              // core cluster identifier
769unsigned int lpid;             // core local identifier
770get_cycle( &date );
771get_core_id( &cxy , &lpid );
772printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec / cycle %d\n",
773tid , cxy , lpid , (unsigned int)date );
774#endif
775
776    // compute nthreads and nclusters from global variables
777    unsigned int nclusters = x_size * y_size;
778    unsigned int nthreads  = nclusters * ncores;
779
780    // indexes for loops
781    unsigned int c;                 // cluster index
782    unsigned int l;                 // line index
783    unsigned int p;                 // pixel index
784    unsigned int z;                 // vertical filter index
785
786    unsigned int lines_per_thread   = image_nl / nthreads;
787    unsigned int lines_per_cluster  = image_nl / nclusters;
788    unsigned int pixels_per_thread  = image_np / nthreads;
789    unsigned int pixels_per_cluster = image_np / nclusters;
790
791    // compute number of pixels stored in one cluster
792    unsigned int local_pixels = image_nl * image_np / nclusters;       
793
794    get_cycle( &date );
795    START[cid][lid] = (unsigned int)date;
796
797    // Each thread[cid][0] allocates 5 buffers local cluster cid
798    // and registers these 5 pointers in the global arrays
799    if ( lid == 0 )
800    {
801        GA[cid] = malloc( local_pixels * sizeof( unsigned char ) );
802        GB[cid] = malloc( local_pixels * sizeof( int ) );
803        GC[cid] = malloc( local_pixels * sizeof( int ) );
804        GD[cid] = malloc( local_pixels * sizeof( int ) );
805        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );
806
807        if( (GA[cid] == NULL) || 
808            (GB[cid] == NULL) || 
809            (GC[cid] == NULL) || 
810            (GD[cid] == NULL) || 
811            (GZ[cid] == NULL) )
812        {
813            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
814            pthread_exit( &THREAD_EXIT_FAILURE );
815        }
816
817#if VERBOSE_EXEC
818get_cycle( &date );
819printf("\n[convol] exec[%d] on core[%x,%d] allocated shared buffers / cycle %d\n"
820" GA %x / GB %x / GC %x / GD %x / GZ %x\n",
821tid, cxy , lpid, (unsigned int)date, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
822#endif
823   
824    }
825
826    ////////////////////////////////
827    pthread_barrier_wait( &barrier );
828
829    // Each thread[tid] allocates and initialises in its private stack
830    // a copy of the arrays of pointers on the distributed buffers.
831    unsigned char  * A[CLUSTERS_MAX];
832    int            * B[CLUSTERS_MAX];
833    int            * C[CLUSTERS_MAX];
834    int            * D[CLUSTERS_MAX];
835    unsigned char  * Z[CLUSTERS_MAX];
836
837    for( c = 0 ; c < nclusters ; c++ )
838    {
839        A[c] = GA[c];
840        B[c] = GB[c];
841        C[c] = GC[c];
842        D[c] = GD[c];
843        Z[c] = GZ[c];
844    }
845
846    unsigned int npixels  = image_np * lines_per_thread;     // pixels moved by any thread
847    unsigned int g_offset = npixels * tid;             // offset in global buffer for tid
848    unsigned int l_offset = npixels * lid;             // offset in local buffer for tid
849
850    // min and max line indexes handled by thread[tid] for a global buffer
851    unsigned int global_lmin = tid * lines_per_thread;   
852    unsigned int global_lmax = global_lmin + lines_per_thread; 
853
854    // min and max line indexes handled by thread[tid] for a local buffer
855    unsigned int local_lmin  = lid * lines_per_thread;   
856    unsigned int local_lmax  = local_lmin + lines_per_thread; 
857
858    // pmin and pmax pixel indexes handled by thread[tid] in a column
859    unsigned int column_pmin = tid * pixels_per_thread; 
860    unsigned int column_pmax = column_pmin + pixels_per_thread; 
861
862    // Each thread[tid] copy npixels from image_in buffer to local A[cid] buffer
863    memcpy( A[cid]   + l_offset,
864            image_in + g_offset,
865            npixels );
866 
867#if VERBOSE_EXEC
868get_cycle( &date );
869printf( "\n[convol] exec[%d] on core[%x,%d] loaded input file in A[%d] / cycle %d\n", 
870tid , cxy , lpid , cid , (unsigned int)date);
871#endif
872
873    // Optionnal parallel display for the initial image
874    if ( INITIAL_DISPLAY_ENABLE )
875    {
876        // each thread[tid] copy npixels from A[cid] to in_win_buf buffer
877        memcpy( in_win_buf + g_offset,
878                A[cid]     + l_offset,
879                npixels );
880
881        // refresh the FBF window
882        if( fbf_refresh_window( in_wid , global_lmin , global_lmax ) ) 
883        {
884            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
885            __FUNCTION__ , tid );
886            pthread_exit( &THREAD_EXIT_FAILURE );
887        }
888
889#if VERBOSE_EXEC
890get_cycle( &date );
891printf( "\n[convol] exec[%d] on core[%x,%d] completed initial display / cycle %d\n",
892tid , cxy , lpid , (unsigned int)date );
893#endif
894
895        ////////////////////////////////
896        pthread_barrier_wait( &barrier );
897    }
898
899    ////////////////////////////////////////////////////////////
900    // parallel horizontal filter :
901    // B <= Transpose(FH(A))
902    // D <= A - FH(A)
903    // Each thread computes (image_nl/nthreads) lines.
904    // The image must be extended :
905    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
906    // if (z>image_np-1) TA(cid,l,z) == TA(cid,l,image_np-1)
907    ////////////////////////////////////////////////////////////
908
909    get_cycle( &date );
910    H_BEG[cid][lid] = (unsigned int)date;
911
912    // l = global line index / p = absolute pixel index 
913
914    for (l = global_lmin; l < global_lmax; l++)
915    {
916        // src_c and src_l are the cluster index and the line index for A & D
917        int src_c = l / lines_per_cluster;
918        int src_l = l % lines_per_cluster;
919
920        // We use the specific values of the horizontal ep-filter for optimisation:
921        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
922        // To minimize the number of tests, the loop on pixels is split in three domains
923
924        int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
925        for (z = 1; z < hrange; z++)
926        {
927            sum_p = sum_p + TA(src_c, src_l, z);
928        }
929
930        // first domain : from 0 to hrange
931        for (p = 0; p < hrange + 1; p++)
932        {
933            // dst_c and dst_p are the cluster index and the pixel index for B
934            int dst_c = p / pixels_per_cluster;
935            int dst_p = p % pixels_per_cluster;
936            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
937            TB(dst_c, dst_p, l) = sum_p / hnorm;
938            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
939        }
940        // second domain : from (hrange+1) to (image_np-hrange-1)
941        for (p = hrange + 1; p < image_np - hrange; p++)
942        {
943            // dst_c and dst_p are the cluster index and the pixel index for B
944            int dst_c = p / pixels_per_cluster;
945            int dst_p = p % pixels_per_cluster;
946            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) 
947                          - (int) TA(src_c, src_l, p - hrange - 1);
948            TB(dst_c, dst_p, l) = sum_p / hnorm;
949            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
950        }
951        // third domain : from (image_np-hrange) to (image_np-1)
952        for (p = image_np - hrange; p < image_np; p++)
953        {
954            // dst_c and dst_p are the cluster index and the pixel index for B
955            int dst_c = p / pixels_per_cluster;
956            int dst_p = p % pixels_per_cluster;
957            sum_p = sum_p + (int) TA(src_c, src_l, image_np - 1) 
958                          - (int) TA(src_c, src_l, p - hrange - 1);
959            TB(dst_c, dst_p, l) = sum_p / hnorm;
960            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
961        }
962
963#if SUPER_VERBOSE
964get_cycle( &date );
965printf(" - line %d computed at cycle %d\n", l, (unsigned int)date );
966#endif   
967
968    }
969
970    get_cycle( &date );
971    H_END[cid][lid] = (unsigned int)date;
972
973#if VERBOSE_EXEC
974get_cycle( &date );
975printf( "\n[convol] exec[%d] on core[%x,%d] completed horizontal filter / cycle %d\n",
976tid , cxy , lpid , (unsigned int)date );
977#endif
978
979    ////////////////////////////////
980    pthread_barrier_wait( &barrier );
981
982    ///////////////////////////////////////////////////////////////
983    // parallel vertical filter :
984    // C <= Transpose(FV(B))
985    // Each thread computes (image_np/nthreads) columns
986    // The image must be extended :
987    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
988    // if (l>image_nl-1)   TB(cid,p,l) == TB(cid,p,image_nl-1)
989    ///////////////////////////////////////////////////////////////
990
991    get_cycle( &date );
992    V_BEG[cid][lid] = (unsigned int)date;
993
994    // l = global line index / p = pixel index in column
995
996    for (p = column_pmin; p < column_pmax ; p++)
997    {
998        // src_c and src_p are the cluster index and the pixel index for B
999        int src_c = p / pixels_per_cluster;
1000        int src_p = p % pixels_per_cluster;
1001
1002        int sum_l;
1003
1004        // We use the specific values of the vertical ep-filter
1005        // To minimize the number of tests, the image_nl lines are split in three domains
1006
1007        // first domain : explicit computation for the first 18 values
1008        for (l = 0; l < 18; l++)
1009        {
1010            // dst_c and dst_l are the cluster index and the line index for C
1011            int dst_c = l / lines_per_cluster;
1012            int dst_l = l % lines_per_cluster;
1013
1014            for (z = 0, sum_l = 0; z < 35; z++)
1015            {
1016                sum_l = sum_l + vf[z] * TB(src_c, src_p, max(l - 17 + z,0) );
1017            }
1018            TC(dst_c, dst_l, p) = sum_l / vnorm;
1019        }
1020        // second domain
1021        for (l = 18; l < image_nl - 17; l++)
1022        {
1023            // dst_c and dst_l are the cluster index and the line index for C
1024            int dst_c = l / lines_per_cluster;
1025            int dst_l = l % lines_per_cluster;
1026
1027            sum_l = sum_l + TB(src_c, src_p, l + 4)
1028                  + TB(src_c, src_p, l + 8)
1029                  + TB(src_c, src_p, l + 11)
1030                  + TB(src_c, src_p, l + 15)
1031                  + TB(src_c, src_p, l + 17)
1032                  - TB(src_c, src_p, l - 5)
1033                  - TB(src_c, src_p, l - 9)
1034                  - TB(src_c, src_p, l - 12)
1035                  - TB(src_c, src_p, l - 16)
1036                  - TB(src_c, src_p, l - 18);
1037
1038            TC(dst_c, dst_l, p) = sum_l / vnorm;
1039        }
1040        // third domain
1041        for (l = image_nl - 17; l < image_nl; l++)
1042        {
1043            // dst_c and dst_l are the cluster index and the line index for C
1044            int dst_c = l / lines_per_cluster;
1045            int dst_l = l % lines_per_cluster;
1046
1047            sum_l = sum_l + TB(src_c, src_p, min(l + 4, image_nl - 1))
1048                  + TB(src_c, src_p, min(l + 8, image_nl - 1))
1049                  + TB(src_c, src_p, min(l + 11, image_nl - 1))
1050                  + TB(src_c, src_p, min(l + 15, image_nl - 1))
1051                  + TB(src_c, src_p, min(l + 17, image_nl - 1))
1052                  - TB(src_c, src_p, l - 5)
1053                  - TB(src_c, src_p, l - 9)
1054                  - TB(src_c, src_p, l - 12)
1055                  - TB(src_c, src_p, l - 16)
1056                  - TB(src_c, src_p, l - 18);
1057
1058            TC(dst_c, dst_l, p) = sum_l / vnorm;
1059        }
1060
1061#if SUPER_VERBOSE
1062get_cycle( &date );
1063printf(" - column %d computed at cycle %d\n", p, (unsigned int)date );
1064#endif
1065
1066    }
1067
1068    get_cycle( &date );
1069    V_END[cid][lid] = (unsigned int)date;
1070
1071#if VERBOSE_EXEC
1072get_cycle( &date );
1073printf( "\n[convol] exec[%d] on core[%x,%d] completed vertical filter / cycle %d\n",
1074tid , cxy , lid , (unsigned int)date );
1075#endif
1076
1077    ////////////////////////////////
1078    pthread_barrier_wait( &barrier );
1079
1080    ///////////////////////////////////////////////////////////////
1081    // build final image in local Z buffer from C & D local buffers
1082    // store it in output image file, and display it on FBF.
1083    // Z <= C + D
1084    ///////////////////////////////////////////////////////////////
1085
1086    get_cycle( &date );
1087    F_BEG[cid][lid] = (unsigned int)date;
1088
1089    // Each thread[tid] set local buffer Z[cid] from local buffers C[cid] & D[cid]
1090
1091    for( l = local_lmin ; l < local_lmax ; l++ )
1092    {
1093        for( p = 0 ; p < image_np ; p++ )
1094        {
1095            TZ(cid,l,p) = TC(cid,l,p) + TD(cid,l,p);
1096        }
1097    }
1098
1099    // Each thread[tid] copy npixels from Z[cid] buffer to image_out buffer
1100    memcpy( image_out + g_offset,
1101            Z[cid]    + l_offset,
1102            npixels );
1103
1104    // Optional parallel display of the final image
1105    if ( FINAL_DISPLAY_ENABLE )
1106    {
1107        // each thread[tid] copy npixels from Z[cid] to out_win_buf buffer
1108        memcpy( out_win_buf + g_offset,
1109                Z[cid]      + l_offset,
1110                npixels );
1111
1112        // refresh the FBF window
1113        if( fbf_refresh_window( out_wid , global_lmin , global_lmax ) )
1114        {
1115            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
1116            __FUNCTION__ , tid );
1117            pthread_exit( &THREAD_EXIT_FAILURE );
1118        }
1119
1120#if VERBOSE_EXEC
1121get_cycle( &date );
1122printf( "\n[convol] exec[%d] on core[%x,%d] completed final display / cycle %d\n",
1123tid , cxy , lpid , (unsigned int)date );
1124#endif
1125
1126    }
1127
1128    // Each thread[cid,0] releases the 5 local buffers
1129    if( lid == 0 )
1130    {
1131        free( A[cid] );
1132        free( B[cid] );
1133        free( C[cid] );
1134        free( D[cid] );
1135        free( Z[cid] );
1136    }
1137
1138    get_cycle( &date );
1139    F_END[cid][lid] = (unsigned int)date;
1140
1141    // thread termination depends on the placement policy
1142    if( PARALLEL_PLACEMENT )   
1143    {
1144        // <exec> threads are runing in detached mode, and
1145        // each thread must signal completion by calling barrier
1146        // passed in arguments before exit
1147
1148        pthread_barrier_wait( args->barrier );
1149
1150        pthread_exit( &THREAD_EXIT_SUCCESS );
1151    }
1152    else
1153    {
1154        // <exec> threads are running in attached mode
1155        // all threads (but the one executing main) exit
1156        if ( tid != tid_main ) pthread_exit( &THREAD_EXIT_SUCCESS );
1157    }
1158
1159    return NULL;
1160
1161} // end execute()
1162
1163
1164
1165
1166
1167
1168//////////////////////////
1169void instrument( FILE * f,
1170                 char * filename )
1171{
1172    unsigned int nclusters = x_size * y_size;
1173
1174    unsigned int cc, pp;
1175
1176    unsigned int min_start = 0xFFFFFFFF;
1177    unsigned int max_start = 0;
1178
1179    unsigned int min_h_beg = 0xFFFFFFFF;
1180    unsigned int max_h_beg = 0;
1181
1182    unsigned int min_h_end = 0xFFFFFFFF;
1183    unsigned int max_h_end = 0;
1184
1185    unsigned int min_v_beg = 0xFFFFFFFF;
1186    unsigned int max_v_beg = 0;
1187
1188    unsigned int min_v_end = 0xFFFFFFFF;
1189    unsigned int max_v_end = 0;
1190
1191    unsigned int min_f_beg = 0xFFFFFFFF;
1192    unsigned int max_f_beg = 0;
1193
1194    unsigned int min_f_end = 0xFFFFFFFF;
1195    unsigned int max_f_end = 0;
1196
1197    for (cc = 0; cc < nclusters; cc++)
1198    {
1199        for (pp = 0; pp < ncores; pp++ )
1200        {
1201            if (START[cc][pp] < min_start) min_start = START[cc][pp];
1202            if (START[cc][pp] > max_start) max_start = START[cc][pp];
1203
1204            if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
1205            if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
1206
1207            if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
1208            if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
1209
1210            if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
1211            if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
1212
1213            if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
1214            if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
1215
1216            if (F_BEG[cc][pp] < min_f_beg) min_f_beg = F_BEG[cc][pp];
1217            if (F_BEG[cc][pp] > max_f_beg) max_f_beg = F_BEG[cc][pp];
1218
1219            if (F_END[cc][pp] < min_f_end) min_f_end = F_END[cc][pp];
1220            if (F_END[cc][pp] > max_f_end) max_f_end = F_END[cc][pp];
1221        }
1222    }
1223
1224    // display on terminal
1225    printf( "\n ------ %s ------\n" , filename );
1226
1227    printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
1228           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
1229
1230    printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1231           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
1232
1233    printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
1234           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
1235
1236    printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1237           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
1238
1239    printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
1240           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
1241
1242    printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1243           min_f_beg, max_f_beg, (min_f_beg+max_f_beg)/2, max_f_beg-min_f_beg);
1244
1245    printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
1246           min_f_end, max_f_end, (min_f_end+max_f_end)/2, max_f_end-min_f_end);
1247
1248    printf( "\n General Scenario   (Kcycles)\n" );
1249    printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
1250    printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
1251    printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
1252    printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
1253    printf( " - BARRIER VERT/DISP = %d\n", (min_f_beg - max_v_end)/1000 );
1254    printf( " - DISPLAY           = %d\n", (max_f_end - min_f_beg)/1000 );
1255    printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n",
1256            SEQUENCIAL_TIME/1000, PARALLEL_TIME/1000 );
1257
1258    // save on disk
1259    fprintf( f ,  "\n ------ %s ------\n" , filename );
1260
1261    fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n",
1262           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
1263
1264    fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1265           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
1266
1267    fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n",
1268           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
1269
1270    fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1271           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
1272
1273    fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n",
1274           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
1275
1276    fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1277           min_f_beg, max_f_beg, (min_f_beg+max_f_beg)/2, max_f_beg-min_f_beg);
1278
1279    fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n",
1280           min_f_end, max_f_end, (min_f_end+max_f_end)/2, max_f_end-min_f_end);
1281
1282    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
1283    fprintf( f ,  " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
1284    fprintf( f ,  " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
1285    fprintf( f ,  " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
1286    fprintf( f ,  " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
1287    fprintf( f ,  " - BARRIER VERT/DISP = %d\n", (min_f_beg - max_v_end)/1000 );
1288    fprintf( f ,  " - SAVE              = %d\n", (max_f_end - min_f_beg)/1000 );
1289    fprintf( f ,  " \nSEQUENCIAL = %d / PARALLEL = %d\n",
1290    SEQUENCIAL_TIME/1000, PARALLEL_TIME/1000 );
1291
1292} // end instrument()
1293
1294
1295
1296
1297
1298// Local Variables:
1299// tab-width: 3
1300// c-basic-offset: 3
1301// c-file-offsets:((innamespace . 0)(inline-open . 0))
1302// indent-tabs-mode: nil
1303// End:
1304
1305// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3
1306
1307
Note: See TracBrowser for help on using the repository browser.