source: trunk/user/convol/convol.c @ 683

Last change on this file since 683 was 682, checked in by alain, 4 years ago

Introduce three new applications:

  • windows : to test the FBF windows kernel manager
  • udp_chat : chat application based on UDP sockets.
  • tcp_chat : chat application based on TCP sockets (including packet loss recovery).
File size: 44.7 KB
Line 
1///////////////////////////////////////////////////////////////////////////////////////
2// File   : convol.c 
3// Date   : june 2014
4// author : Alain Greiner
5///////////////////////////////////////////////////////////////////////////////////////
6// This multi-threaded application implements a 2D convolution product. 
7// It can run on a multi-cores, multi-clusters architecture, with one thread
8// per core, and uses the POSIX threads API.
9//
10// The input image is read from a file and the output image is saved to another file.
11//
12// - number of clusters containing processors must be power of 2 no larger than 256.
13// - number of processors per cluster must be power of 2 no larger than 4.
14// - number of working threads is the number of cores available in the hardware
15//   architecture : nthreads = nclusters * ncores.
16//
17// The convolution kernel is defined in the execute() function.
18// It can be factored into two independent line and column convolution products.
19//
20// The main() function can be launched on any processor.
21// - It checks software requirements versus the hardware resources.
22// - It opens & maps the input file to a global <image_in> buffer.
23// - It opens & maps the output file to another global <image_out> buffer.
24// - It opens the instrumentation file.
25// - it creates & activates two FBF windows to display input & output images.
26// - it launches other threads to run in parallel the execute() function.
27// - it saves the instrumentation results on disk.
28// - it closes the input, output, & instrumentation files.
29// - it deletes the FBF input & output windows.
30//
31// The execute() function is executed in parallel by all threads. These threads are
32// working on 5 arrays of distributed buffers, indexed by the cluster index [cid].
33// - A[cid]: contain the distributed initial image (NL/NCLUSTERS lines per cluster).
34// - B[cid]: is the result of the horizontal filter, then transpose : B <= Trsp(HF(A))
35// - C[cid]: is the result of the vertical filter, then transpose : C <= Trsp(VF(B))
36// - D[cid]: is the difference between A and HF(A) : D <= A - HF(A)
37// - Z[cid]: contain the distributed final image Z <= C + D
38//
39// It can be split in four phases separated by synchronisation barriers:
40// 1. Initialisation:
41//    Allocates the 5 A[cid],B[cid],C[cid],D[cid],Z[cid] buffers, initialise A[cid]
42//    from the <image_in> buffer, and display the initial image on FBF if required.
43// 2. Horizontal Filter:
44//    Set B[cid] and D[cid] from A[cid]. Read data accesses are local, write data
45//    accesses are remote, to implement the transpose.
46// 3. Vertical Filter: 
47//    Set C[cid] from B[cid]. Read data accesses are local, write data accesses
48//    are remote, to implement the transpose.
49// 4. Save results:
50//    Set the Z[cid] from C[cid] and D[cid]. All read and write access are local.
51//    Move the final image (Z[cid] buffer) to the <image_out> buffer.   
52//
53// This application supports three placement modes, implemented in the main() function.
54// In all modes, the working threads are identified by the [tid] continuous index
55// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
56// This continuous index can always be decomposed in two continuous sub-indexes:
57// tid == cid * NCORES + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
58//
59// - NO_PLACEMENT: the main thread is itself a working thread. The (nthreads - 1) other
60//   working threads are created by the main thread, but the placement is done by the OS,
61//   using the DQDT for load balancing, and two working threads can be placed on the same
62//   core. The [cid,lid] are only abstract identifiers, and cannot be associated to a
63//   physical cluster or a physical core. In this mode, the main thread runs on any
64//   cluster, but has tid = 0 (i.e. cid = 0 & lid = 0).
65//
66// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
67//   of the threads on the cores is explicitly controlled by the main thread to have
68//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
69//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
70//   physical cluster identifier, and [lid] is the local core index.
71//
72// - PARALLEL_PLACEMENT: the main thread is no longer a working thread, and uses the
73//   non standard pthread_parallel_create() function to avoid the costly sequential
74//   loops for pthread_create() and pthread_join(). It guarantees one working thread
75//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
76//
77// The [tid] continuous index defines how the work is shared amongst the threads:
78// - each thread handles NL/nthreads lines for the horizontal filter.
79// - each thread handles NP/nthreads columns for the vertical filter.
80///////////////////////////////////////////////////////////////////////////////////////
81
82#include <sys/mman.h>
83#include <stdio.h>
84#include <stdlib.h>
85#include <fcntl.h>
86#include <unistd.h>
87#include <pthread.h>
88#include <string.h>
89#include <almosmkh.h>
90#include <hal_macros.h>
91
// Debug trace switches (1 enables the corresponding printf traces)
#define VERBOSE_MAIN               1                           // traces in main()
#define VERBOSE_EXEC               1                           // traces in execute()
#define SUPER_VERBOSE              0

// Upper bounds on the hardware mesh, used to size the static arrays
#define X_MAX                      16
#define Y_MAX                      16
#define CORES_MAX                  4
#define CLUSTERS_MAX               (X_MAX * Y_MAX)
#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)

// Default image features, used when INTERACTIVE_MODE is 0
#define IMAGE_TYPE                 420                         // pixel encoding type
#define INPUT_IMAGE_PATH           "misc/couple_512.raw"       // default image_in
#define OUTPUT_IMAGE_PATH          "misc/couple_conv_512.raw"  // default image_out
#define NL                         512                         // default nlines
#define NP                         512                         // default npixels

// Placement mode selection : exactly one of these three must be set to 1
// (this is checked at the beginning of main())
#define NO_PLACEMENT               0
#define EXPLICIT_PLACEMENT         0
#define PARALLEL_PLACEMENT         1

// Other compile-time options
#define INTERACTIVE_MODE           0                           // ask image size & paths
#define USE_DQT_BARRIER            1                           // DQT barrier attributes
#define INITIAL_DISPLAY_ENABLE     1                           // display input image
#define FINAL_DISPLAY_ENABLE       1

// Accessors for one element of the distributed buffers A, B, C, D, Z in cluster <c>.
// A, C, D, Z are indexed [line][pixel] with NP elements per line.
// B is indexed [pixel][line] with NL elements per row (transposed layout).
// NOTE(review): these use the compile-time NL / NP values, not the run-time
// image_nl / image_np : confirm this is intended when INTERACTIVE_MODE is set.
#define TA(c,l,p)  (A[(c)][((NP) * (l)) + (p)])
#define TB(c,p,l)  (B[(c)][((NL) * (p)) + (l)])
#define TC(c,l,p)  (C[(c)][((NP) * (l)) + (p)])
#define TD(c,l,p)  (D[(c)][((NP) * (l)) + (p)])
#define TZ(c,l,p)  (Z[(c)][((NP) * (l)) + (p)])

// WARNING : both macros evaluate their arguments twice (no side effects in arguments)
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))
125
126//////////////////////////////////////////////////////////
127//            global variables
128//////////////////////////////////////////////////////////
129
130// global instrumentation counters for the main thread
131unsigned int SEQUENCIAL_TIME = 0;
132unsigned int PARALLEL_TIME   = 0;
133
134// instrumentation counters for thread[tid] in cluster[cid]
135unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
136unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
137unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
138unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
139unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
140unsigned int F_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
141unsigned int F_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
142
143// pointer on buffer containing the input image, maped by the main to the input file
144unsigned char *  image_in;
145
146// pointer on buffer containing the output image, maped by the main to the output file
147unsigned char *  image_out;
148
149// return values at thread exit
150unsigned int THREAD_EXIT_SUCCESS = 0;
151unsigned int THREAD_EXIT_FAILURE = 1;
152
153// pointer and identifier for FBF windows
154void   *  in_win_buf;
155int       in_wid;
156void   *  out_win_buf;
157int       out_wid;
158
159// synchronization barrier
160pthread_barrier_t     barrier;
161
162// platform parameters
163unsigned int  x_size;              // number of clusters in a row
164unsigned int  y_size;              // number of clusters in a column
165unsigned int  ncores;              // number of processors per cluster
166
167// main thread continuous index
168unsigned int     tid_main;
169
170// arrays of pointers on distributed buffers in all clusters
171unsigned char  * GA[CLUSTERS_MAX];
172int            * GB[CLUSTERS_MAX];
173int            * GC[CLUSTERS_MAX];
174int            * GD[CLUSTERS_MAX];
175unsigned char  * GZ[CLUSTERS_MAX];
176
177// array of threads kernel identifiers / indexed by [tid]
178pthread_t        exec_trdid[THREADS_MAX];
179
180// array of threads attributes / indexed bi [tid]
181pthread_attr_t   exec_attr[THREADS_MAX]; 
182
183// array of execute() function arguments / indexed by [tid]
184pthread_parallel_work_args_t exec_args[THREADS_MAX];
185
186// image features
187unsigned int   image_nl;
188unsigned int   image_np;
189char           input_image_path[128];
190char           output_image_path[128];
191
192/////////////////////////////////////////////////////////////////////////////////////
193//           functions declaration
194/////////////////////////////////////////////////////////////////////////////////////
195
196void * execute( void * args );
197
198void instrument( FILE * f , char * filename );
199
200/////////////////
201void main( void )
202/////////////////
203{
204    unsigned long long start_cycle;
205    unsigned long long end_sequencial_cycle;
206    unsigned long long end_parallel_cycle;
207
208    int          error;
209
210    char         instru_name[32];               // instrumentation file name
211    char         instru_path[64];               // instrumentation path name
212
213    /////////////////////////////////////////////////////////////////////////////////
214    get_cycle( &start_cycle );
215    /////////////////////////////////////////////////////////////////////////////////
216
217    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
218    {
219        printf("\n[convol error] illegal placement\n");
220        exit( 0 );
221    }
222
223    // get & check platform parameters
224    hard_config_t  config;
225    get_config( &config );
226    x_size = config.x_size;
227    y_size = config.y_size;
228    ncores = config.ncores;
229
230    if((ncores != 1) && (ncores != 2) && (ncores != 4))
231    {
232        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
233        exit( 0 );
234    }
235
236    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
237        (x_size != 8) && (x_size != 16) )
238    {
239        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
240        exit( 0 );
241    }
242       
243    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
244        (y_size != 8) && (y_size != 16) )
245    {
246        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
247        exit( 0 );
248    }
249       
250    // main thread get identifiers for core executing main
251    unsigned int  cxy_main;
252    unsigned int  lid_main;
253    get_core_id( &cxy_main , &lid_main );
254
255    // compute nthreads and nclusters
256    unsigned int nclusters = x_size * y_size;
257    unsigned int nthreads  = nclusters * ncores;
258
259    // get input and output images pathnames and size
260    if( INTERACTIVE_MODE )
261    {
262        // get image size
263        printf("\n[convol] image nlines      : ");
264        get_uint32( &image_nl );
265
266        printf("\n[convol] image npixels     : ");
267        get_uint32( &image_np );
268
269        printf("\n[convol] input image path  : ");
270        get_string( input_image_path , 128 );
271
272        printf("[convol] output image path : ");
273        get_string( output_image_path , 128 );
274    }
275    else
276    {
277        image_nl = NL;
278        image_np = NP;
279        strcpy( input_image_path  , INPUT_IMAGE_PATH );
280        strcpy( output_image_path , OUTPUT_IMAGE_PATH );
281    }
282
283    // main thread get FBF size and type
284    int   fbf_width;
285    int   fbf_height;
286    int   fbf_type;
287    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
288
289    if( ((unsigned int)fbf_width  < image_np) || 
290        ((unsigned int)fbf_height < image_nl) || 
291        (fbf_type != IMAGE_TYPE) )
292    {
293        printf("\n[convol error] image not acceptable\n"
294               "FBF width  = %d / npixels  = %d\n"
295               "FBF height = %d / nlines   = %d\n"
296               "FBF type   = %d / expected = %d\n",
297               fbf_width, image_np, fbf_height, image_nl, fbf_type, IMAGE_TYPE );
298        exit( 0 );
299    }
300
301    if( nthreads > image_nl )
302    {
303        printf("\n[convol error] nthreads (%d] larger than nlines (%d)\n",
304        nthreads , image_nl );
305        exit( 0 );
306    }
307
308    // define instrumentation file name
309    if( NO_PLACEMENT )
310    {
311        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
312        nclusters, ncores, fbf_width, fbf_height, getpid() );
313
314        // build instrumentation file name
315        if( USE_DQT_BARRIER )
316        snprintf( instru_name , 32 , "dqt_no_place_%d_%d", x_size * y_size , ncores );
317        else
318        snprintf( instru_name , 32 , "smp_no_place_%d_%d", x_size * y_size , ncores );
319    }
320
321    if( EXPLICIT_PLACEMENT )
322    {
323        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
324        nclusters, ncores, fbf_width, fbf_height, getpid() );
325
326        // build instrumentation file name
327        if( USE_DQT_BARRIER )
328        snprintf( instru_name , 32 , "dqt_explicit_%d_%d", x_size * y_size , ncores );
329        else
330        snprintf( instru_name , 32 , "smp_explicit_%d_%d", x_size * y_size , ncores );
331    }
332
333    if( PARALLEL_PLACEMENT )
334    {
335        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
336        nclusters, ncores, fbf_width, fbf_height, getpid() );
337
338        // build instrumentation file name
339        if( USE_DQT_BARRIER )
340        snprintf( instru_name , 32 , "dqt_parallel_%d_%d", x_size * y_size , ncores );
341        else
342        snprintf( instru_name , 32 , "smp_parallel_%d_%d", x_size * y_size , ncores );
343    }
344
345    // open instrumentation file
346    snprintf( instru_path , 64 , "/home/convol/%s", instru_name );
347    FILE * f_instru = fopen( instru_path , NULL );
348    if ( f_instru == NULL ) 
349    { 
350        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
351        exit( 0 );
352    }
353
354#if  VERBOSE_MAIN
355printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
356cxy_main, lid_main, instru_path );
357#endif
358
359    // main create an FBF window for input image
360    in_wid = fbf_create_window( 0,                   // l_zero
361                                0,                   // p_zero
362                                image_nl,            // lines
363                                image_np,            // pixels
364                                &in_win_buf );
365    if( in_wid < 0 ) 
366    {
367        printf("\n[transpose error] cannot open FBF window for %s\n",
368        input_image_path);
369        exit( 0 );
370    }
371
372    // activate window
373    error = fbf_active_window( in_wid , 1 );
374
375    if( error )
376    {
377        printf("\n[transpose error] cannot activate window for %s\n",
378        input_image_path );
379        exit( 0 );
380    }
381
382#if  VERBOSE_MAIN
383printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
384cxy_main, lid_main, in_wid, input_image_path );
385#endif
386
387    // main create an FBF window for output image
388    out_wid = fbf_create_window( 0,                   // l_zero
389                                 image_np,            // p_zero
390                                 image_nl,            // lines
391                                 image_np,            // pixels
392                                 &out_win_buf );
393    if( out_wid < 0 ) 
394    {
395        printf("\n[transpose error] cannot create FBF window for %s\n",
396        output_image_path);
397        exit( 0 );
398    }
399
400    // activate window
401    error = fbf_active_window( out_wid , 1 );
402
403    if( error )
404    {
405        printf("\n[transpose error] cannot activate window for %s\n",
406        output_image_path );
407        exit( 0 );
408    }
409
410#if  VERBOSE_MAIN
411printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
412cxy_main, lid_main, out_wid, output_image_path );
413#endif
414
415    // main initialise barrier
416    if( USE_DQT_BARRIER )
417    {
418        pthread_barrierattr_t attr;
419        attr.x_size   = x_size;
420        attr.y_size   = y_size;
421        attr.nthreads = ncores;
422        error = pthread_barrier_init( &barrier, &attr , nthreads );
423    }
424    else
425    {
426        error = pthread_barrier_init( &barrier, NULL , nthreads );
427    }
428
429    if( error )
430    {
431        printf("\n[convol error] cannot initialize barrier\n");
432        exit( 0 );
433    }
434
435#if VERBOSE_MAIN
436printf("\n[convol] main on core[%x,%d] completed barrier init\n", 
437cxy_main, lid_main );
438#endif
439
440    // main open input file
441    int fd_in = open( input_image_path , O_RDONLY , 0 );
442
443    if ( fd_in < 0 ) 
444    { 
445        printf("\n[convol error] cannot open input file <%s>\n", input_image_path );
446        exit( 0 );
447    }
448
449    // main thread map input file to image_in buffer
450    image_in = (unsigned char *)mmap( NULL,
451                                      image_np * image_nl,
452                                      PROT_READ,
453                                      MAP_FILE | MAP_SHARED,
454                                      fd_in,
455                                      0 );           // offset
456    if ( image_in == NULL ) 
457    { 
458        printf("\n[convol error] main cannot map buffer to file %s\n", input_image_path );
459        exit( 0 );
460    }
461
462#if  VERBOSE_MAIN
463printf("\n[convol] main on core[%x,%x] map <image_in> buffer to file <%s>\n",
464cxy_main, lid_main, input_image_path );
465#endif
466
467    // main thread open output file
468    int fd_out = open( output_image_path , O_CREAT , 0 ); 
469
470    if ( fd_out < 0 ) 
471    { 
472        printf("\n[convol error] main cannot open file %s\n", output_image_path );
473        exit( 0 );
474    }
475
476    // main thread map image_out buffer to output file
477    image_out = (unsigned char *)mmap( NULL,
478                                       image_np * image_nl,
479                                       PROT_WRITE,
480                                       MAP_FILE | MAP_SHARED,
481                                       fd_out,
482                                       0 );     // offset
483    if ( image_out == NULL ) 
484    { 
485        printf("\n[convol error] main cannot map buffer to file %s\n", output_image_path );
486        exit( 0 );
487    }
488
489#if  VERBOSE_MAIN
490printf("\n[convol] main on core[%x,%x] map <image_out> buffer to file <%s>\n",
491cxy_main, lid_main, output_image_path );
492#endif
493
494    /////////////////////////////////////////////////////////////////////////////////////
495    get_cycle( &end_sequencial_cycle );
496    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
497    /////////////////////////////////////////////////////////////////////////////////////
498
499//////////////////
500#if NO_PLACEMENT
501{
502    // the tid value for the main thread is always 0
503    // main thread creates other threads with tid in [1,nthreads-1] 
504    unsigned int tid;
505    for ( tid = 0 ; tid < nthreads ; tid++ )
506    {
507        // register tid value in exec_args[tid] array
508        exec_args[tid].tid = tid;
509         
510        // create other threads
511        if( tid > 0 )
512        {
513            if ( pthread_create( &exec_trdid[tid], 
514                                 NULL,                  // no attribute
515                                 &execute,
516                                 &exec_args[tid] ) ) 
517            {
518                printf("\n[convol error] cannot create thread %d\n", tid );
519                exit( 0 );
520            }
521
522#if VERBOSE_MAIN
523printf("\n[convol] main created thread %d\n", tid );
524#endif
525
526        }
527        else
528        {
529            tid_main = 0;
530        }
531    }  // end for tid
532
533    // main thread calls itself the execute() function
534    execute( &exec_args[0] );
535
536    // main thread wait other threads completion
537    for ( tid = 1 ; tid < nthreads ; tid++ )
538    {
539        unsigned int * status;
540
541        // main wait thread[tid] status
542        if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
543        {
544            printf("\n[convol error] main cannot join thread %d\n", tid );
545            exit( 0 );
546        }
547       
548        // check status
549        if( *status != THREAD_EXIT_SUCCESS )
550        {
551            printf("\n[convol error] thread %x returned failure\n", tid );
552            exit( 0 );
553        }
554
555#if VERBOSE_MAIN
556printf("\n[convol] main successfully joined thread %x\n", tid );
557#endif
558       
559    }  // end for tid
560} 
561#endif // end no_placement
562
563//////////////////////
564#if EXPLICIT_PLACEMENT
565{
566    // main thread places each other threads on a specific core[cxy][lid]
567    // but the actual thread creation is sequencial
568    unsigned int x;
569    unsigned int y;
570    unsigned int l;
571    unsigned int cxy;                   // cluster identifier
572    unsigned int tid;                   // thread continuous index
573
574    for( x = 0 ; x < x_size ; x++ )
575    {
576        for( y = 0 ; y < y_size ; y++ )
577        {
578            cxy = HAL_CXY_FROM_XY( x , y );
579            for( l = 0 ; l < ncores ; l++ )
580            {
581                // compute thread continuous index
582                tid = (((* y_size) + y) * ncores) + l;
583
584                // register tid value in exec_args[tid] array
585                exec_args[tid].tid = tid;
586
587                // no thread created on the core running the main
588                if( (cxy != cxy_main) || (l != lid_main) )
589                {
590                    // define thread attributes
591                    exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
592                                                PT_ATTR_CORE_DEFINED;
593                    exec_attr[tid].cxy        = cxy;
594                    exec_attr[tid].lid        = l;
595 
596                    // create thread[tid] on core[cxy][l]
597                    if ( pthread_create( &exec_trdid[tid],   
598                                         &exec_attr[tid],   
599                                         &execute,
600                                         &exec_args[tid] ) )       
601                    {
602                        printf("\n[convol error] cannot create thread %d\n", tid );
603                        exit( 0 );
604                    }
605#if VERBOSE_MAIN
606printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
607#endif
608                }
609                else
610                {
611                    tid_main = tid;
612                }
613            }
614        }
615    }
616
617    // main thread calls itself the execute() function
618    execute( &exec_args[tid_main] );
619
620    // main thread wait other threads completion
621    for( tid = 0 ; tid < nthreads ; tid++ )
622    {
623        // no other thread on the core running the main
624        if( tid != tid_main )
625        {
626            unsigned int * status;
627
628            // wait thread[tid]
629            if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
630            {
631                printf("\n[convol error] main cannot join thread %d\n", tid );
632                exit( 0 );
633            }
634     
635            // check status
636            if( *status != THREAD_EXIT_SUCCESS )
637            {
638                printf("\n[convol error] thread %d returned failure\n", tid );
639                exit( 0 );
640            }
641#if VERBOSE_MAIN
642printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
643#endif
644        }
645    }
646} 
647#endif   // end explicit_placement
648
649//////////////////////
650#if PARALLEL_PLACEMENT
651{
652    // compute covering DQT size an level
653    unsigned int z          = (x_size > y_size) ? x_size : y_size;
654    unsigned int root_level = ((z == 1) ? 0 : 
655                              ((z == 2) ? 1 : 
656                              ((z == 4) ? 2 : 
657                              ((z == 8) ? 3 : 4))));
658
659    // create & execute the working threads
660    if( pthread_parallel_create( root_level , &execute ) )
661    {
662        printf("\n[convol error] in %s\n", __FUNCTION__ );
663        exit( 0 );
664    }
665}
666#endif  // end parallel_placement
667
668    /////////////////////////////////////////////////////////////////////////////
669    get_cycle( &end_parallel_cycle );
670    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
671    /////////////////////////////////////////////////////////////////////////////
672
673    // main thread register instrumentation results
674    instrument( f_instru , instru_name );
675
676#if VERBOSE_MAIN
677printf("\n[convol] main registered instrumentation info\n" );
678#endif
679
680    // main thread close input file
681    close( fd_in );
682
683#if VERBOSE_MAIN
684printf("\n[convol] main closed input file\n" );
685#endif
686
687    // main thread close output file
688    close( fd_out );
689
690#if VERBOSE_MAIN
691printf("\n[convol] main closed output file\n" );
692#endif
693
694    // main thread close instrumentation file
695    fclose( f_instru );
696
697#if VERBOSE_MAIN
698printf("\n[convol] main closed instrumentation file\n" );
699#endif
700
701    // ask confirm for exit
702    if( INTERACTIVE_MODE )
703    {
704        printf("\n[convol] press any key to to delete FBF windows and exit\n");
705        getchar();
706    }
707 
708    // main thread delete FBF windows
709    fbf_delete_window( in_wid );
710    fbf_delete_window( out_wid );
711
712#if VERBOSE_MAIN
713printf("\n[convol] main deleted FBF windows\n" );
714#endif
715
716    // main thread suicide
717    exit( 0 );
718   
719} // end main()
720
721
722
723
724
725
726
727
728
729
730//////////////////////////////////
731void * execute( void * arguments )
732//////////////////////////////////
733{
734    unsigned long long date;
735
736    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;
737
738    // Each thread initialises the convolution kernel parameters in local stack.
739    // The values defined in the next 12 lines are Philips proprietary information.
740
741    int   vnorm  = 115;
742    int   vf[35] = { 1, 1, 2, 2, 2,
743                     2, 3, 3, 3, 4,
744                     4, 4, 4, 5, 5,
745                     5, 5, 5, 5, 5,
746                     5, 5, 4, 4, 4,
747                     4, 3, 3, 3, 2,
748                     2, 2, 2, 1, 1 };
749
750    unsigned int hrange = 100;
751    unsigned int hnorm  = 201;
752
753    // WARNING
754    //A thread is identified by the tid index, defined in the "args" structure.
755    // This index being in range [0,nclusters*ncores-1] we can always write
756    //       tid == cid * ncores + lid
757    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
758    // if NO_PLACEMENT, there is no relation between these
759    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
760
761    // get thread abstract identifiers[cid,lid]  from tid
762    unsigned int tid = args->tid;
763    unsigned int cid = tid / ncores;   
764    unsigned int lid = tid % ncores;
765
766#if VERBOSE_EXEC
767unsigned int cxy;              // core cluster identifier
768unsigned int lpid;             // core local identifier
769get_cycle( &date );
770get_core_id( &cxy , &lpid );
771printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec / cycle %d\n",
772tid , cxy , lpid , (unsigned int)date );
773#endif
774
775    // compute nthreads and nclusters from global variables
776    unsigned int nclusters = x_size * y_size;
777    unsigned int nthreads  = nclusters * ncores;
778
779    // indexes for loops
780    unsigned int c;                 // cluster index
781    unsigned int l;                 // line index
782    unsigned int p;                 // pixel index
783    unsigned int z;                 // vertical filter index
784
785    unsigned int lines_per_thread   = image_nl / nthreads;
786    unsigned int lines_per_cluster  = image_nl / nclusters;
787    unsigned int pixels_per_thread  = image_np / nthreads;
788    unsigned int pixels_per_cluster = image_np / nclusters;
789
790    // compute number of pixels stored in one cluster
791    unsigned int local_pixels = image_nl * image_np / nclusters;       
792
793    get_cycle( &date );
794    START[cid][lid] = (unsigned int)date;
795
796    // Each thread[cid][0] allocates 5 buffers local cluster cid
797    // and registers these 5 pointers in the global arrays
798    if ( lid == 0 )
799    {
800        GA[cid] = malloc( local_pixels * sizeof( unsigned char ) );
801        GB[cid] = malloc( local_pixels * sizeof( int ) );
802        GC[cid] = malloc( local_pixels * sizeof( int ) );
803        GD[cid] = malloc( local_pixels * sizeof( int ) );
804        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );
805
806        if( (GA[cid] == NULL) || 
807            (GB[cid] == NULL) || 
808            (GC[cid] == NULL) || 
809            (GD[cid] == NULL) || 
810            (GZ[cid] == NULL) )
811        {
812            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
813            pthread_exit( &THREAD_EXIT_FAILURE );
814        }
815
816#if VERBOSE_EXEC
817get_cycle( &date );
818printf("\n[convol] exec[%d] on core[%x,%d] allocated shared buffers / cycle %d\n"
819" GA %x / GB %x / GC %x / GD %x / GZ %x\n",
820tid, cxy , lpid, (unsigned int)date, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
821#endif
822   
823    }
824
825    ////////////////////////////////
826    pthread_barrier_wait( &barrier );
827
828    // Each thread[tid] allocates and initialises in its private stack
829    // a copy of the arrays of pointers on the distributed buffers.
830    unsigned char  * A[CLUSTERS_MAX];
831    int            * B[CLUSTERS_MAX];
832    int            * C[CLUSTERS_MAX];
833    int            * D[CLUSTERS_MAX];
834    unsigned char  * Z[CLUSTERS_MAX];
835
836    for( c = 0 ; c < nclusters ; c++ )
837    {
838        A[c] = GA[c];
839        B[c] = GB[c];
840        C[c] = GC[c];
841        D[c] = GD[c];
842        Z[c] = GZ[c];
843    }
844
845    unsigned int npixels  = image_np * lines_per_thread;     // pixels moved by any thread
846    unsigned int g_offset = npixels * tid;             // offset in global buffer for tid
847    unsigned int l_offset = npixels * lid;             // offset in local buffer for tid
848
849    // min and max line indexes handled by thread[tid] for a global buffer
850    unsigned int global_lmin = tid * lines_per_thread;   
851    unsigned int global_lmax = global_lmin + lines_per_thread; 
852
853    // min and max line indexes handled by thread[tid] for a local buffer
854    unsigned int local_lmin  = lid * lines_per_thread;   
855    unsigned int local_lmax  = local_lmin + lines_per_thread; 
856
857    // pmin and pmax pixel indexes handled by thread[tid] in a column
858    unsigned int column_pmin = tid * pixels_per_thread; 
859    unsigned int column_pmax = column_pmin + pixels_per_thread; 
860
861    // Each thread[tid] copy npixels from image_in buffer to local A[cid] buffer
862    memcpy( A[cid]   + l_offset,
863            image_in + g_offset,
864            npixels );
865 
866#if VERBOSE_EXEC
867get_cycle( &date );
868printf( "\n[convol] exec[%d] on core[%x,%d] loaded input file in A[%d] / cycle %d\n", 
869tid , cxy , lpid , cid , (unsigned int)date);
870#endif
871
872    // Optionnal parallel display for the initial image
873    if ( INITIAL_DISPLAY_ENABLE )
874    {
875        // each thread[tid] copy npixels from A[cid] to in_win_buf buffer
876        memcpy( in_win_buf + g_offset,
877                A[cid]     + l_offset,
878                npixels );
879
880        // refresh the FBF window
881        if( fbf_refresh_window( in_wid , global_lmin , global_lmax ) ) 
882        {
883            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
884            __FUNCTION__ , tid );
885            pthread_exit( &THREAD_EXIT_FAILURE );
886        }
887
888#if VERBOSE_EXEC
889get_cycle( &date );
890printf( "\n[convol] exec[%d] on core[%x,%d] completed initial display / cycle %d\n",
891tid , cxy , lpid , (unsigned int)date );
892#endif
893
894        ////////////////////////////////
895        pthread_barrier_wait( &barrier );
896    }
897
898    ////////////////////////////////////////////////////////////
899    // parallel horizontal filter :
900    // B <= Transpose(FH(A))
901    // D <= A - FH(A)
902    // Each thread computes (image_nl/nthreads) lines.
903    // The image must be extended :
904    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
905    // if (z>image_np-1) TA(cid,l,z) == TA(cid,l,image_np-1)
906    ////////////////////////////////////////////////////////////
907
908    get_cycle( &date );
909    H_BEG[cid][lid] = (unsigned int)date;
910
911    // l = global line index / p = absolute pixel index 
912
913    for (l = global_lmin; l < global_lmax; l++)
914    {
915        // src_c and src_l are the cluster index and the line index for A & D
916        int src_c = l / lines_per_cluster;
917        int src_l = l % lines_per_cluster;
918
919        // We use the specific values of the horizontal ep-filter for optimisation:
920        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
921        // To minimize the number of tests, the loop on pixels is split in three domains
922
923        int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
924        for (z = 1; z < hrange; z++)
925        {
926            sum_p = sum_p + TA(src_c, src_l, z);
927        }
928
929        // first domain : from 0 to hrange
930        for (p = 0; p < hrange + 1; p++)
931        {
932            // dst_c and dst_p are the cluster index and the pixel index for B
933            int dst_c = p / pixels_per_cluster;
934            int dst_p = p % pixels_per_cluster;
935            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
936            TB(dst_c, dst_p, l) = sum_p / hnorm;
937            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
938        }
939        // second domain : from (hrange+1) to (image_np-hrange-1)
940        for (p = hrange + 1; p < image_np - hrange; p++)
941        {
942            // dst_c and dst_p are the cluster index and the pixel index for B
943            int dst_c = p / pixels_per_cluster;
944            int dst_p = p % pixels_per_cluster;
945            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) 
946                          - (int) TA(src_c, src_l, p - hrange - 1);
947            TB(dst_c, dst_p, l) = sum_p / hnorm;
948            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
949        }
950        // third domain : from (image_np-hrange) to (image_np-1)
951        for (p = image_np - hrange; p < image_np; p++)
952        {
953            // dst_c and dst_p are the cluster index and the pixel index for B
954            int dst_c = p / pixels_per_cluster;
955            int dst_p = p % pixels_per_cluster;
956            sum_p = sum_p + (int) TA(src_c, src_l, image_np - 1) 
957                          - (int) TA(src_c, src_l, p - hrange - 1);
958            TB(dst_c, dst_p, l) = sum_p / hnorm;
959            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
960        }
961
962#if SUPER_VERBOSE
963get_cycle( &date );
964printf(" - line %d computed at cycle %d\n", l, (unsigned int)date );
965#endif   
966
967    }
968
969    get_cycle( &date );
970    H_END[cid][lid] = (unsigned int)date;
971
972#if VERBOSE_EXEC
973get_cycle( &date );
974printf( "\n[convol] exec[%d] on core[%x,%d] completed horizontal filter / cycle %d\n",
975tid , cxy , lpid , (unsigned int)date );
976#endif
977
978    ////////////////////////////////
979    pthread_barrier_wait( &barrier );
980
981    ///////////////////////////////////////////////////////////////
982    // parallel vertical filter :
983    // C <= Transpose(FV(B))
984    // Each thread computes (image_np/nthreads) columns
985    // The image must be extended :
986    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
987    // if (l>image_nl-1)   TB(cid,p,l) == TB(cid,p,image_nl-1)
988    ///////////////////////////////////////////////////////////////
989
990    get_cycle( &date );
991    V_BEG[cid][lid] = (unsigned int)date;
992
993    // l = global line index / p = pixel index in column
994
995    for (p = column_pmin; p < column_pmax ; p++)
996    {
997        // src_c and src_p are the cluster index and the pixel index for B
998        int src_c = p / pixels_per_cluster;
999        int src_p = p % pixels_per_cluster;
1000
1001        int sum_l;
1002
1003        // We use the specific values of the vertical ep-filter
1004        // To minimize the number of tests, the image_nl lines are split in three domains
1005
1006        // first domain : explicit computation for the first 18 values
1007        for (l = 0; l < 18; l++)
1008        {
1009            // dst_c and dst_l are the cluster index and the line index for C
1010            int dst_c = l / lines_per_cluster;
1011            int dst_l = l % lines_per_cluster;
1012
1013            for (z = 0, sum_l = 0; z < 35; z++)
1014            {
1015                sum_l = sum_l + vf[z] * TB(src_c, src_p, max(l - 17 + z,0) );
1016            }
1017            TC(dst_c, dst_l, p) = sum_l / vnorm;
1018        }
1019        // second domain
1020        for (l = 18; l < image_nl - 17; l++)
1021        {
1022            // dst_c and dst_l are the cluster index and the line index for C
1023            int dst_c = l / lines_per_cluster;
1024            int dst_l = l % lines_per_cluster;
1025
1026            sum_l = sum_l + TB(src_c, src_p, l + 4)
1027                  + TB(src_c, src_p, l + 8)
1028                  + TB(src_c, src_p, l + 11)
1029                  + TB(src_c, src_p, l + 15)
1030                  + TB(src_c, src_p, l + 17)
1031                  - TB(src_c, src_p, l - 5)
1032                  - TB(src_c, src_p, l - 9)
1033                  - TB(src_c, src_p, l - 12)
1034                  - TB(src_c, src_p, l - 16)
1035                  - TB(src_c, src_p, l - 18);
1036
1037            TC(dst_c, dst_l, p) = sum_l / vnorm;
1038        }
1039        // third domain
1040        for (l = image_nl - 17; l < image_nl; l++)
1041        {
1042            // dst_c and dst_l are the cluster index and the line index for C
1043            int dst_c = l / lines_per_cluster;
1044            int dst_l = l % lines_per_cluster;
1045
1046            sum_l = sum_l + TB(src_c, src_p, min(l + 4, image_nl - 1))
1047                  + TB(src_c, src_p, min(l + 8, image_nl - 1))
1048                  + TB(src_c, src_p, min(l + 11, image_nl - 1))
1049                  + TB(src_c, src_p, min(l + 15, image_nl - 1))
1050                  + TB(src_c, src_p, min(l + 17, image_nl - 1))
1051                  - TB(src_c, src_p, l - 5)
1052                  - TB(src_c, src_p, l - 9)
1053                  - TB(src_c, src_p, l - 12)
1054                  - TB(src_c, src_p, l - 16)
1055                  - TB(src_c, src_p, l - 18);
1056
1057            TC(dst_c, dst_l, p) = sum_l / vnorm;
1058        }
1059
1060#if SUPER_VERBOSE
1061get_cycle( &date );
1062printf(" - column %d computed at cycle %d\n", p, (unsigned int)date );
1063#endif
1064
1065    }
1066
1067    get_cycle( &date );
1068    V_END[cid][lid] = (unsigned int)date;
1069
1070#if VERBOSE_EXEC
1071get_cycle( &date );
1072printf( "\n[convol] exec[%d] on core[%x,%d] completed vertical filter / cycle %d\n",
1073tid , cxy , lid , (unsigned int)date );
1074#endif
1075
1076    ////////////////////////////////
1077    pthread_barrier_wait( &barrier );
1078
1079    ///////////////////////////////////////////////////////////////
1080    // build final image in local Z buffer from C & D local buffers
1081    // store it in output image file, and display it on FBF.
1082    // Z <= C + D
1083    ///////////////////////////////////////////////////////////////
1084
1085    get_cycle( &date );
1086    F_BEG[cid][lid] = (unsigned int)date;
1087
1088    // Each thread[tid] set local buffer Z[cid] from local buffers C[cid] & D[cid]
1089
1090    for( l = local_lmin ; l < local_lmax ; l++ )
1091    {
1092        for( p = 0 ; p < image_np ; p++ )
1093        {
1094            TZ(cid,l,p) = TC(cid,l,p) + TD(cid,l,p);
1095        }
1096    }
1097
1098    // Each thread[tid] copy npixels from Z[cid] buffer to image_out buffer
1099    memcpy( image_out + g_offset,
1100            Z[cid]    + l_offset,
1101            npixels );
1102
1103    // Optional parallel display of the final image
1104    if ( FINAL_DISPLAY_ENABLE )
1105    {
1106        // each thread[tid] copy npixels from Z[cid] to out_win_buf buffer
1107        memcpy( out_win_buf + g_offset,
1108                Z[cid]      + l_offset,
1109                npixels );
1110
1111        // refresh the FBF window
1112        if( fbf_refresh_window( out_wid , global_lmin , global_lmax ) )
1113        {
1114            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
1115            __FUNCTION__ , tid );
1116            pthread_exit( &THREAD_EXIT_FAILURE );
1117        }
1118
1119#if VERBOSE_EXEC
1120get_cycle( &date );
1121printf( "\n[convol] exec[%d] on core[%x,%d] completed final display / cycle %d\n",
1122tid , cxy , lpid , (unsigned int)date );
1123#endif
1124
1125    }
1126
1127    // Each thread[cid,0] releases the 5 local buffers
1128    if( lid == 0 )
1129    {
1130        free( A[cid] );
1131        free( B[cid] );
1132        free( C[cid] );
1133        free( D[cid] );
1134        free( Z[cid] );
1135    }
1136
1137    get_cycle( &date );
1138    F_END[cid][lid] = (unsigned int)date;
1139
1140    // thread termination depends on the placement policy
1141    if( PARALLEL_PLACEMENT )   
1142    {
1143        // <exec> threads are runing in detached mode, and
1144        // each thread must signal completion by calling barrier
1145        // passed in arguments before exit
1146
1147        pthread_barrier_wait( args->barrier );
1148
1149        pthread_exit( &THREAD_EXIT_SUCCESS );
1150    }
1151    else
1152    {
1153        // <exec> threads are running in attached mode
1154        // all threads (but the one executing main) exit
1155        if ( tid != tid_main ) pthread_exit( &THREAD_EXIT_SUCCESS );
1156    }
1157
1158    return NULL;
1159
1160} // end execute()
1161
1162
1163
1164
1165
1166
1167//////////////////////////
1168void instrument( FILE * f,
1169                 char * filename )
1170{
1171    unsigned int nclusters = x_size * y_size;
1172
1173    unsigned int cc, pp;
1174
1175    unsigned int min_start = 0xFFFFFFFF;
1176    unsigned int max_start = 0;
1177
1178    unsigned int min_h_beg = 0xFFFFFFFF;
1179    unsigned int max_h_beg = 0;
1180
1181    unsigned int min_h_end = 0xFFFFFFFF;
1182    unsigned int max_h_end = 0;
1183
1184    unsigned int min_v_beg = 0xFFFFFFFF;
1185    unsigned int max_v_beg = 0;
1186
1187    unsigned int min_v_end = 0xFFFFFFFF;
1188    unsigned int max_v_end = 0;
1189
1190    unsigned int min_f_beg = 0xFFFFFFFF;
1191    unsigned int max_f_beg = 0;
1192
1193    unsigned int min_f_end = 0xFFFFFFFF;
1194    unsigned int max_f_end = 0;
1195
1196    for (cc = 0; cc < nclusters; cc++)
1197    {
1198        for (pp = 0; pp < ncores; pp++ )
1199        {
1200            if (START[cc][pp] < min_start) min_start = START[cc][pp];
1201            if (START[cc][pp] > max_start) max_start = START[cc][pp];
1202
1203            if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
1204            if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
1205
1206            if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
1207            if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
1208
1209            if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
1210            if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
1211
1212            if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
1213            if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
1214
1215            if (F_BEG[cc][pp] < min_f_beg) min_f_beg = F_BEG[cc][pp];
1216            if (F_BEG[cc][pp] > max_f_beg) max_f_beg = F_BEG[cc][pp];
1217
1218            if (F_END[cc][pp] < min_f_end) min_f_end = F_END[cc][pp];
1219            if (F_END[cc][pp] > max_f_end) max_f_end = F_END[cc][pp];
1220        }
1221    }
1222
1223    // display on terminal
1224    printf( "\n ------ %s ------\n" , filename );
1225
1226    printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
1227           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
1228
1229    printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1230           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
1231
1232    printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
1233           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
1234
1235    printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1236           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
1237
1238    printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
1239           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
1240
1241    printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1242           min_f_beg, max_f_beg, (min_f_beg+max_f_beg)/2, max_f_beg-min_f_beg);
1243
1244    printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
1245           min_f_end, max_f_end, (min_f_end+max_f_end)/2, max_f_end-min_f_end);
1246
1247    printf( "\n General Scenario   (Kcycles)\n" );
1248    printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
1249    printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
1250    printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
1251    printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
1252    printf( " - BARRIER VERT/DISP = %d\n", (min_f_beg - max_v_end)/1000 );
1253    printf( " - DISPLAY           = %d\n", (max_f_end - min_f_beg)/1000 );
1254    printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n",
1255            SEQUENCIAL_TIME/1000, PARALLEL_TIME/1000 );
1256
1257    // save on disk
1258    fprintf( f ,  "\n ------ %s ------\n" , filename );
1259
1260    fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n",
1261           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
1262
1263    fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1264           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
1265
1266    fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n",
1267           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
1268
1269    fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1270           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
1271
1272    fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n",
1273           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
1274
1275    fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1276           min_f_beg, max_f_beg, (min_f_beg+max_f_beg)/2, max_f_beg-min_f_beg);
1277
1278    fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n",
1279           min_f_end, max_f_end, (min_f_end+max_f_end)/2, max_f_end-min_f_end);
1280
1281    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
1282    fprintf( f ,  " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
1283    fprintf( f ,  " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
1284    fprintf( f ,  " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
1285    fprintf( f ,  " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
1286    fprintf( f ,  " - BARRIER VERT/DISP = %d\n", (min_f_beg - max_v_end)/1000 );
1287    fprintf( f ,  " - SAVE              = %d\n", (max_f_end - min_f_beg)/1000 );
1288    fprintf( f ,  " \nSEQUENCIAL = %d / PARALLEL = %d\n",
1289    SEQUENCIAL_TIME/1000, PARALLEL_TIME/1000 );
1290
1291} // end instrument()
1292
1293
1294
1295
1296
1297// Local Variables:
1298// tab-width: 3
1299// c-basic-offset: 3
1300// c-file-offsets:((innamespace . 0)(inline-open . 0))
1301// indent-tabs-mode: nil
1302// End:
1303
1304// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3
1305
1306
Note: See TracBrowser for help on using the repository browser.