source: trunk/user/convol/convol.c @ 684

Last change on this file since 684 was 682, checked in by alain, 4 years ago

Introduce three new applications:

  • windows : to test the FBF windows kernel manager
  • udp_chat : chat application based on UDP sockets.
  • tcp_chat : chat application based on TCP sockets (including packet loss recovery).
File size: 44.7 KB
RevLine 
///////////////////////////////////////////////////////////////////////////////////////
// File   : convol.c
// Date   : june 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a 2D convolution product.
// It can run on a multi-core, multi-cluster architecture, with one thread
// per core, and uses the POSIX threads API.
//
// The input image is read from a file and the output image is saved to another file.
//
// - the number of clusters containing processors must be a power of 2 no larger than 256.
// - the number of processors per cluster must be a power of 2 no larger than 4.
// - the number of working threads is the number of cores available in the hardware
//   architecture : nthreads = nclusters * ncores.
//
// The convolution kernel is defined in the execute() function.
// It can be factored in two independent line and column convolution products.
//
// The main() function can be launched on any processor.
// - it checks software requirements versus the hardware resources.
// - it opens & maps the input file to a global <image_in> buffer.
// - it opens & maps the output file to another global <image_out> buffer.
// - it opens the instrumentation file.
// - it creates & activates two FBF windows to display input & output images.
// - it launches other threads to run in parallel the execute() function.
// - it saves the instrumentation results on disk.
// - it closes the input, output, & instrumentation files.
// - it deletes the FBF input & output windows.
//
// The execute() function is executed in parallel by all threads. These threads are
// working on 5 arrays of distributed buffers, indexed by the cluster index [cid].
// - A[cid]: contains the distributed initial image (NL/NCLUSTERS lines per cluster).
// - B[cid]: is the result of the horizontal filter, then transpose : B <= Trsp(HF(A))
// - C[cid]: is the result of the vertical filter, then transpose : C <= Trsp(VF(B))
// - D[cid]: is the difference between A and HF(A) : D <= A - HF(A)
// - Z[cid]: contains the distributed final image : Z <= C + D
//
// It can be split in four phases separated by synchronisation barriers:
// 1. Initialisation:
//    Allocates the 5 A[cid],B[cid],C[cid],D[cid],Z[cid] buffers, initialises A[cid]
//    from the <image_in> buffer, and displays the initial image on FBF if required.
// 2. Horizontal Filter:
//    Set B[cid] and D[cid] from A[cid]. Read data accesses are local, write data
//    accesses are remote, to implement the transpose.
// 3. Vertical Filter:
//    Set C[cid] from B[cid]. Read data accesses are local, write data accesses
//    are remote, to implement the transpose.
// 4. Save results:
//    Set Z[cid] from C[cid] and D[cid]. All read and write accesses are local.
//    Move the final image (Z[cid] buffer) to the <image_out> buffer.
//
// This application supports three placement modes, implemented in the main() function.
// In all modes, the working threads are identified by the [tid] continuous index
// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
// This continuous index can always be decomposed in two continuous sub-indexes:
// tid == cid * NCORES + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
//
// - NO_PLACEMENT: the main thread is itself a working thread. The (NTHREADS-1) other
//   working threads are created by the main thread, but the placement is done by the OS,
//   using the DQDT for load balancing, and two working threads can be placed on the same
//   core. The [cid,lid] are only abstract identifiers, and cannot be associated to a
//   physical cluster or a physical core. In this mode, the main thread runs on any
//   cluster, but has tid = 0 (i.e. cid = 0 & lid = 0).
//
// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
//   of the threads on the cores is explicitly controlled by the main thread to have
//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
//   physical cluster identifier, and [lid] is the local core index.
//
// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
//   non-standard pthread_parallel_create() function to avoid the costly sequential
//   loops for pthread_create() and pthread_join(). It guarantees one working thread
//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
//
// The [tid] continuous index defines how the work is shared amongst the threads:
// - each thread handles NL/nthreads lines for the horizontal filter.
// - each thread handles NP/nthreads columns for the vertical filter.
///////////////////////////////////////////////////////////////////////////////////////
81
[652]82#include <sys/mman.h>
[645]83#include <stdio.h>
84#include <stdlib.h>
85#include <fcntl.h>
86#include <unistd.h>
87#include <pthread.h>
[652]88#include <string.h>
[645]89#include <almosmkh.h>
90#include <hal_macros.h>
91
[652]92#define VERBOSE_MAIN               1
[676]93#define VERBOSE_EXEC               1
[656]94#define SUPER_VERBOSE              0
[645]95
96#define X_MAX                      16
97#define Y_MAX                      16
[652]98#define CORES_MAX                  4
[645]99#define CLUSTERS_MAX               (X_MAX * Y_MAX)
[652]100#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)
[645]101
[676]102#define IMAGE_TYPE                 420                         // pixel encoding type
103#define INPUT_IMAGE_PATH           "misc/couple_512.raw"       // default image_in
104#define OUTPUT_IMAGE_PATH          "misc/couple_conv_512.raw"  // default image_out
105#define NL                         512                         // default nlines
106#define NP                         512                         // default npixels
[645]107
[652]108#define NO_PLACEMENT               0
109#define EXPLICIT_PLACEMENT         0
110#define PARALLEL_PLACEMENT         1
[645]111
[676]112#define INTERACTIVE_MODE           0
[652]113#define USE_DQT_BARRIER            1
114#define INITIAL_DISPLAY_ENABLE     1
115#define FINAL_DISPLAY_ENABLE       1
116
[645]117#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
118#define TB(c,p,l)  (B[c][((NL) * (p)) + (l)])
119#define TC(c,l,p)  (C[c][((NP) * (l)) + (p)])
120#define TD(c,l,p)  (D[c][((NP) * (l)) + (p)])
121#define TZ(c,l,p)  (Z[c][((NP) * (l)) + (p)])
122
123#define max(x,y) ((x) > (y) ? (x) : (y))
124#define min(x,y) ((x) < (y) ? (x) : (y))
125
126//////////////////////////////////////////////////////////
[652]127//            global variables
[645]128//////////////////////////////////////////////////////////
129
[652]130// global instrumentation counters for the main thread
131unsigned int SEQUENCIAL_TIME = 0;
132unsigned int PARALLEL_TIME   = 0;
[645]133
[652]134// instrumentation counters for thread[tid] in cluster[cid]
135unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
136unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
137unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
138unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
139unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[676]140unsigned int F_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
141unsigned int F_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
[645]142
[652]143// pointer on buffer containing the input image, maped by the main to the input file
144unsigned char *  image_in;
145
146// pointer on buffer containing the output image, maped by the main to the output file
147unsigned char *  image_out;
148
[645]149// return values at thread exit
150unsigned int THREAD_EXIT_SUCCESS = 0;
151unsigned int THREAD_EXIT_FAILURE = 1;
152
[676]153// pointer and identifier for FBF windows
154void   *  in_win_buf;
155int       in_wid;
156void   *  out_win_buf;
157int       out_wid;
158
[645]159// synchronization barrier
160pthread_barrier_t     barrier;
161
[652]162// platform parameters
163unsigned int  x_size;              // number of clusters in a row
164unsigned int  y_size;              // number of clusters in a column
165unsigned int  ncores;              // number of processors per cluster
[645]166
[676]167// main thread continuous index
168unsigned int     tid_main;
169
[645]170// arrays of pointers on distributed buffers in all clusters
[676]171unsigned char  * GA[CLUSTERS_MAX];
[652]172int            * GB[CLUSTERS_MAX];
173int            * GC[CLUSTERS_MAX];
174int            * GD[CLUSTERS_MAX];
175unsigned char  * GZ[CLUSTERS_MAX];
[645]176
[652]177// array of threads kernel identifiers / indexed by [tid]
178pthread_t        exec_trdid[THREADS_MAX];
[645]179
[652]180// array of threads attributes / indexed bi [tid]
181pthread_attr_t   exec_attr[THREADS_MAX]; 
[645]182
[652]183// array of execute() function arguments / indexed by [tid]
184pthread_parallel_work_args_t exec_args[THREADS_MAX];
185
[676]186// image features
187unsigned int   image_nl;
188unsigned int   image_np;
189char           input_image_path[128];
190char           output_image_path[128];
[652]191
[645]192/////////////////////////////////////////////////////////////////////////////////////
193//           functions declaration
194/////////////////////////////////////////////////////////////////////////////////////
195
[656]196void * execute( void * args );
[645]197
[652]198void instrument( FILE * f , char * filename );
[645]199
200/////////////////
201void main( void )
[676]202/////////////////
[645]203{
[652]204    unsigned long long start_cycle;
205    unsigned long long end_sequencial_cycle;
206    unsigned long long end_parallel_cycle;
[645]207
[652]208    int          error;
[645]209
[652]210    char         instru_name[32];               // instrumentation file name
[656]211    char         instru_path[64];               // instrumentation path name
[645]212
[652]213    /////////////////////////////////////////////////////////////////////////////////
214    get_cycle( &start_cycle );
215    /////////////////////////////////////////////////////////////////////////////////
[645]216
[652]217    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
[645]218    {
[652]219        printf("\n[convol error] illegal placement\n");
[645]220        exit( 0 );
221    }
222
[652]223    // get & check platform parameters
[659]224    hard_config_t  config;
225    get_config( &config );
226    x_size = config.x_size;
227    y_size = config.y_size;
228    ncores = config.ncores;
[652]229
230    if((ncores != 1) && (ncores != 2) && (ncores != 4))
[645]231    {
232        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
233        exit( 0 );
234    }
235
[652]236    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
237        (x_size != 8) && (x_size != 16) )
[645]238    {
239        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
240        exit( 0 );
241    }
[652]242       
243    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
244        (y_size != 8) && (y_size != 16) )
[645]245    {
246        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
247        exit( 0 );
248    }
[652]249       
250    // main thread get identifiers for core executing main
251    unsigned int  cxy_main;
252    unsigned int  lid_main;
253    get_core_id( &cxy_main , &lid_main );
[645]254
255    // compute nthreads and nclusters
256    unsigned int nclusters = x_size * y_size;
[652]257    unsigned int nthreads  = nclusters * ncores;
[645]258
[676]259    // get input and output images pathnames and size
260    if( INTERACTIVE_MODE )
261    {
262        // get image size
263        printf("\n[convol] image nlines      : ");
264        get_uint32( &image_nl );
265
266        printf("\n[convol] image npixels     : ");
267        get_uint32( &image_np );
268
269        printf("\n[convol] input image path  : ");
270        get_string( input_image_path , 128 );
271
272        printf("[convol] output image path : ");
273        get_string( output_image_path , 128 );
274    }
275    else
276    {
277        image_nl = NL;
278        image_np = NP;
279        strcpy( input_image_path  , INPUT_IMAGE_PATH );
280        strcpy( output_image_path , OUTPUT_IMAGE_PATH );
281    }
282
[652]283    // main thread get FBF size and type
[676]284    int   fbf_width;
285    int   fbf_height;
286    int   fbf_type;
[652]287    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
[645]288
[676]289    if( ((unsigned int)fbf_width  < image_np) || 
290        ((unsigned int)fbf_height < image_nl) || 
291        (fbf_type != IMAGE_TYPE) )
[652]292    {
[676]293        printf("\n[convol error] image not acceptable\n"
294               "FBF width  = %d / npixels  = %d\n"
295               "FBF height = %d / nlines   = %d\n"
296               "FBF type   = %d / expected = %d\n",
297               fbf_width, image_np, fbf_height, image_nl, fbf_type, IMAGE_TYPE );
[652]298        exit( 0 );
299    }
[645]300
[676]301    if( nthreads > image_nl )
[652]302    {
[676]303        printf("\n[convol error] nthreads (%d] larger than nlines (%d)\n",
304        nthreads , image_nl );
[645]305        exit( 0 );
306    }
307
[652]308    // define instrumentation file name
309    if( NO_PLACEMENT )
310    {
311        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
312        nclusters, ncores, fbf_width, fbf_height, getpid() );
[645]313
[652]314        // build instrumentation file name
315        if( USE_DQT_BARRIER )
[676]316        snprintf( instru_name , 32 , "dqt_no_place_%d_%d", x_size * y_size , ncores );
[652]317        else
[676]318        snprintf( instru_name , 32 , "smp_no_place_%d_%d", x_size * y_size , ncores );
[645]319    }
320
[652]321    if( EXPLICIT_PLACEMENT )
322    {
323        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
324        nclusters, ncores, fbf_width, fbf_height, getpid() );
[645]325
[652]326        // build instrumentation file name
327        if( USE_DQT_BARRIER )
[676]328        snprintf( instru_name , 32 , "dqt_explicit_%d_%d", x_size * y_size , ncores );
[652]329        else
[676]330        snprintf( instru_name , 32 , "smp_explicit_%d_%d", x_size * y_size , ncores );
[652]331    }
332
333    if( PARALLEL_PLACEMENT )
[645]334    {
[652]335        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
336        nclusters, ncores, fbf_width, fbf_height, getpid() );
337
338        // build instrumentation file name
339        if( USE_DQT_BARRIER )
[676]340        snprintf( instru_name , 32 , "dqt_parallel_%d_%d", x_size * y_size , ncores );
[652]341        else
[676]342        snprintf( instru_name , 32 , "smp_parallel_%d_%d", x_size * y_size , ncores );
[645]343    }
344
[652]345    // open instrumentation file
[676]346    snprintf( instru_path , 64 , "/home/convol/%s", instru_name );
[652]347    FILE * f_instru = fopen( instru_path , NULL );
348    if ( f_instru == NULL ) 
349    { 
350        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
[645]351        exit( 0 );
352    }
353
[652]354#if  VERBOSE_MAIN
355printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
356cxy_main, lid_main, instru_path );
357#endif
358
[676]359    // main create an FBF window for input image
360    in_wid = fbf_create_window( 0,                   // l_zero
361                                0,                   // p_zero
362                                image_nl,            // lines
363                                image_np,            // pixels
364                                &in_win_buf );
365    if( in_wid < 0 ) 
366    {
367        printf("\n[transpose error] cannot open FBF window for %s\n",
368        input_image_path);
369        exit( 0 );
370    }
371
372    // activate window
373    error = fbf_active_window( in_wid , 1 );
374
375    if( error )
376    {
377        printf("\n[transpose error] cannot activate window for %s\n",
378        input_image_path );
379        exit( 0 );
380    }
381
382#if  VERBOSE_MAIN
383printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
384cxy_main, lid_main, in_wid, input_image_path );
385#endif
386
387    // main create an FBF window for output image
388    out_wid = fbf_create_window( 0,                   // l_zero
389                                 image_np,            // p_zero
390                                 image_nl,            // lines
391                                 image_np,            // pixels
392                                 &out_win_buf );
393    if( out_wid < 0 ) 
394    {
395        printf("\n[transpose error] cannot create FBF window for %s\n",
396        output_image_path);
397        exit( 0 );
398    }
399
400    // activate window
401    error = fbf_active_window( out_wid , 1 );
402
403    if( error )
404    {
405        printf("\n[transpose error] cannot activate window for %s\n",
406        output_image_path );
407        exit( 0 );
408    }
409
410#if  VERBOSE_MAIN
411printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
412cxy_main, lid_main, out_wid, output_image_path );
413#endif
414
[652]415    // main initialise barrier
[645]416    if( USE_DQT_BARRIER )
417    {
418        pthread_barrierattr_t attr;
419        attr.x_size   = x_size;
420        attr.y_size   = y_size;
421        attr.nthreads = ncores;
422        error = pthread_barrier_init( &barrier, &attr , nthreads );
423    }
424    else
425    {
426        error = pthread_barrier_init( &barrier, NULL , nthreads );
427    }
428
429    if( error )
430    {
431        printf("\n[convol error] cannot initialize barrier\n");
432        exit( 0 );
433    }
434
[652]435#if VERBOSE_MAIN
[676]436printf("\n[convol] main on core[%x,%d] completed barrier init\n", 
[652]437cxy_main, lid_main );
438#endif
[645]439
[652]440    // main open input file
[676]441    int fd_in = open( input_image_path , O_RDONLY , 0 );
[652]442
443    if ( fd_in < 0 ) 
444    { 
[676]445        printf("\n[convol error] cannot open input file <%s>\n", input_image_path );
[652]446        exit( 0 );
447    }
448
[676]449    // main thread map input file to image_in buffer
[652]450    image_in = (unsigned char *)mmap( NULL,
[676]451                                      image_np * image_nl,
[652]452                                      PROT_READ,
453                                      MAP_FILE | MAP_SHARED,
454                                      fd_in,
455                                      0 );           // offset
456    if ( image_in == NULL ) 
457    { 
[676]458        printf("\n[convol error] main cannot map buffer to file %s\n", input_image_path );
[652]459        exit( 0 );
460    }
461
462#if  VERBOSE_MAIN
[676]463printf("\n[convol] main on core[%x,%x] map <image_in> buffer to file <%s>\n",
464cxy_main, lid_main, input_image_path );
[652]465#endif
466
467    // main thread open output file
[676]468    int fd_out = open( output_image_path , O_CREAT , 0 ); 
[652]469
470    if ( fd_out < 0 ) 
471    { 
[676]472        printf("\n[convol error] main cannot open file %s\n", output_image_path );
[652]473        exit( 0 );
474    }
475
476    // main thread map image_out buffer to output file
477    image_out = (unsigned char *)mmap( NULL,
[676]478                                       image_np * image_nl,
[652]479                                       PROT_WRITE,
480                                       MAP_FILE | MAP_SHARED,
481                                       fd_out,
482                                       0 );     // offset
483    if ( image_out == NULL ) 
484    { 
[676]485        printf("\n[convol error] main cannot map buffer to file %s\n", output_image_path );
[652]486        exit( 0 );
487    }
488
489#if  VERBOSE_MAIN
[676]490printf("\n[convol] main on core[%x,%x] map <image_out> buffer to file <%s>\n",
491cxy_main, lid_main, output_image_path );
[652]492#endif
493
494    /////////////////////////////////////////////////////////////////////////////////////
495    get_cycle( &end_sequencial_cycle );
496    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
497    /////////////////////////////////////////////////////////////////////////////////////
498
[656]499//////////////////
500#if NO_PLACEMENT
501{
502    // the tid value for the main thread is always 0
[676]503    // main thread creates other threads with tid in [1,nthreads-1] 
[656]504    unsigned int tid;
505    for ( tid = 0 ; tid < nthreads ; tid++ )
[645]506    {
[656]507        // register tid value in exec_args[tid] array
508        exec_args[tid].tid = tid;
509         
510        // create other threads
511        if( tid > 0 )
[645]512        {
[656]513            if ( pthread_create( &exec_trdid[tid], 
514                                 NULL,                  // no attribute
515                                 &execute,
516                                 &exec_args[tid] ) ) 
[652]517            {
[656]518                printf("\n[convol error] cannot create thread %d\n", tid );
519                exit( 0 );
520            }
[652]521
522#if VERBOSE_MAIN
523printf("\n[convol] main created thread %d\n", tid );
524#endif
525
[656]526        }
527        else
528        {
529            tid_main = 0;
530        }
531    }  // end for tid
[645]532
[656]533    // main thread calls itself the execute() function
534    execute( &exec_args[0] );
[652]535
[656]536    // main thread wait other threads completion
537    for ( tid = 1 ; tid < nthreads ; tid++ )
538    {
539        unsigned int * status;
540
541        // main wait thread[tid] status
542        if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
[645]543        {
[656]544            printf("\n[convol error] main cannot join thread %d\n", tid );
545            exit( 0 );
546        }
[652]547       
[656]548        // check status
549        if( *status != THREAD_EXIT_SUCCESS )
550        {
551            printf("\n[convol error] thread %x returned failure\n", tid );
552            exit( 0 );
553        }
[645]554
[652]555#if VERBOSE_MAIN
556printf("\n[convol] main successfully joined thread %x\n", tid );
557#endif
558       
[656]559    }  // end for tid
560} 
561#endif // end no_placement
[652]562
[656]563//////////////////////
564#if EXPLICIT_PLACEMENT
565{
566    // main thread places each other threads on a specific core[cxy][lid]
567    // but the actual thread creation is sequencial
568    unsigned int x;
569    unsigned int y;
570    unsigned int l;
571    unsigned int cxy;                   // cluster identifier
572    unsigned int tid;                   // thread continuous index
[652]573
[656]574    for( x = 0 ; x < x_size ; x++ )
[645]575    {
[656]576        for( y = 0 ; y < y_size ; y++ )
[645]577        {
[656]578            cxy = HAL_CXY_FROM_XY( x , y );
579            for( l = 0 ; l < ncores ; l++ )
[645]580            {
[656]581                // compute thread continuous index
582                tid = (((* y_size) + y) * ncores) + l;
[645]583
[656]584                // register tid value in exec_args[tid] array
585                exec_args[tid].tid = tid;
[652]586
[656]587                // no thread created on the core running the main
588                if( (cxy != cxy_main) || (l != lid_main) )
589                {
590                    // define thread attributes
591                    exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
592                                                PT_ATTR_CORE_DEFINED;
593                    exec_attr[tid].cxy        = cxy;
594                    exec_attr[tid].lid        = l;
595 
596                    // create thread[tid] on core[cxy][l]
597                    if ( pthread_create( &exec_trdid[tid],   
598                                         &exec_attr[tid],   
599                                         &execute,
600                                         &exec_args[tid] ) )       
[645]601                    {
[656]602                        printf("\n[convol error] cannot create thread %d\n", tid );
603                        exit( 0 );
604                    }
[652]605#if VERBOSE_MAIN
606printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
607#endif
[645]608                }
[656]609                else
610                {
611                    tid_main = tid;
612                }
[645]613            }
614        }
[656]615    }
[652]616
[656]617    // main thread calls itself the execute() function
618    execute( &exec_args[tid_main] );
[652]619
[656]620    // main thread wait other threads completion
621    for( tid = 0 ; tid < nthreads ; tid++ )
622    {
623        // no other thread on the core running the main
624        if( tid != tid_main )
[652]625        {
[656]626            unsigned int * status;
627
628            // wait thread[tid]
629            if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
[652]630            {
[656]631                printf("\n[convol error] main cannot join thread %d\n", tid );
632                exit( 0 );
633            }
634     
635            // check status
636            if( *status != THREAD_EXIT_SUCCESS )
637            {
638                printf("\n[convol error] thread %d returned failure\n", tid );
639                exit( 0 );
640            }
[652]641#if VERBOSE_MAIN
642printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
643#endif
644        }
[656]645    }
646} 
647#endif   // end explicit_placement
[652]648
[656]649//////////////////////
650#if PARALLEL_PLACEMENT
651{
652    // compute covering DQT size an level
653    unsigned int z          = (x_size > y_size) ? x_size : y_size;
654    unsigned int root_level = ((z == 1) ? 0 : 
655                              ((z == 2) ? 1 : 
656                              ((z == 4) ? 2 : 
657                              ((z == 8) ? 3 : 4))));
658
659    // create & execute the working threads
660    if( pthread_parallel_create( root_level , &execute ) )
[645]661    {
[656]662        printf("\n[convol error] in %s\n", __FUNCTION__ );
663        exit( 0 );
664    }
665}
666#endif  // end parallel_placement
[652]667
668    /////////////////////////////////////////////////////////////////////////////
669    get_cycle( &end_parallel_cycle );
670    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
671    /////////////////////////////////////////////////////////////////////////////
672
673    // main thread register instrumentation results
674    instrument( f_instru , instru_name );
675
[656]676#if VERBOSE_MAIN
677printf("\n[convol] main registered instrumentation info\n" );
678#endif
679
[652]680    // main thread close input file
681    close( fd_in );
682
[656]683#if VERBOSE_MAIN
684printf("\n[convol] main closed input file\n" );
685#endif
686
[652]687    // main thread close output file
688    close( fd_out );
689
[656]690#if VERBOSE_MAIN
691printf("\n[convol] main closed output file\n" );
692#endif
693
[652]694    // main thread close instrumentation file
695    fclose( f_instru );
696
[656]697#if VERBOSE_MAIN
698printf("\n[convol] main closed instrumentation file\n" );
699#endif
700
[676]701    // ask confirm for exit
702    if( INTERACTIVE_MODE )
703    {
704        printf("\n[convol] press any key to to delete FBF windows and exit\n");
[682]705        getchar();
[676]706    }
707 
708    // main thread delete FBF windows
709    fbf_delete_window( in_wid );
710    fbf_delete_window( out_wid );
711
712#if VERBOSE_MAIN
713printf("\n[convol] main deleted FBF windows\n" );
714#endif
715
[652]716    // main thread suicide
[645]717    exit( 0 );
718   
719} // end main()
720
721
722
[652]723
724
725
[676]726
727
728
729
[656]730//////////////////////////////////
731void * execute( void * arguments )
[676]732//////////////////////////////////
[645]733{
734    unsigned long long date;
735
[656]736    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;
737
[652]738    // Each thread initialises the convolution kernel parameters in local stack.
[645]739    // The values defined in the next 12 lines are Philips proprietary information.
740
741    int   vnorm  = 115;
742    int   vf[35] = { 1, 1, 2, 2, 2,
743                     2, 3, 3, 3, 4,
744                     4, 4, 4, 5, 5,
745                     5, 5, 5, 5, 5,
746                     5, 5, 4, 4, 4,
747                     4, 3, 3, 3, 2,
748                     2, 2, 2, 1, 1 };
749
750    unsigned int hrange = 100;
751    unsigned int hnorm  = 201;
752
[652]753    // WARNING
754    //A thread is identified by the tid index, defined in the "args" structure.
755    // This index being in range [0,nclusters*ncores-1] we can always write
756    //       tid == cid * ncores + lid
757    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
758    // if NO_PLACEMENT, there is no relation between these
759    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
[645]760
[676]761    // get thread abstract identifiers[cid,lid]  from tid
[652]762    unsigned int tid = args->tid;
763    unsigned int cid = tid / ncores;   
764    unsigned int lid = tid % ncores;
[645]765
[652]766#if VERBOSE_EXEC
767unsigned int cxy;              // core cluster identifier
768unsigned int lpid;             // core local identifier
[656]769get_cycle( &date );
[652]770get_core_id( &cxy , &lpid );
[656]771printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec / cycle %d\n",
772tid , cxy , lpid , (unsigned int)date );
[652]773#endif
774
[676]775    // compute nthreads and nclusters from global variables
[652]776    unsigned int nclusters = x_size * y_size;
777    unsigned int nthreads  = nclusters * ncores;
778
[645]779    // indexes for loops
780    unsigned int c;                 // cluster index
781    unsigned int l;                 // line index
782    unsigned int p;                 // pixel index
783    unsigned int z;                 // vertical filter index
784
[676]785    unsigned int lines_per_thread   = image_nl / nthreads;
786    unsigned int lines_per_cluster  = image_nl / nclusters;
787    unsigned int pixels_per_thread  = image_np / nthreads;
788    unsigned int pixels_per_cluster = image_np / nclusters;
[645]789
[676]790    // compute number of pixels stored in one cluster
791    unsigned int local_pixels = image_nl * image_np / nclusters;       
[652]792
[645]793    get_cycle( &date );
[652]794    START[cid][lid] = (unsigned int)date;
[645]795
[676]796    // Each thread[cid][0] allocates 5 buffers local cluster cid
[656]797    // and registers these 5 pointers in the global arrays
[645]798    if ( lid == 0 )
799    {
[676]800        GA[cid] = malloc( local_pixels * sizeof( unsigned char ) );
[652]801        GB[cid] = malloc( local_pixels * sizeof( int ) );
802        GC[cid] = malloc( local_pixels * sizeof( int ) );
803        GD[cid] = malloc( local_pixels * sizeof( int ) );
804        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );
[645]805
[676]806        if( (GA[cid] == NULL) || 
807            (GB[cid] == NULL) || 
808            (GC[cid] == NULL) || 
809            (GD[cid] == NULL) || 
810            (GZ[cid] == NULL) )
[652]811        {
812            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
813            pthread_exit( &THREAD_EXIT_FAILURE );
814        }
[645]815
[652]816#if VERBOSE_EXEC
[656]817get_cycle( &date );
[676]818printf("\n[convol] exec[%d] on core[%x,%d] allocated shared buffers / cycle %d\n"
[656]819" GA %x / GB %x / GC %x / GD %x / GZ %x\n",
820tid, cxy , lpid, (unsigned int)date, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
[645]821#endif
822   
823    }
824
825    ////////////////////////////////
826    pthread_barrier_wait( &barrier );
827
[676]828    // Each thread[tid] allocates and initialises in its private stack
[652]829    // a copy of the arrays of pointers on the distributed buffers.
[676]830    unsigned char  * A[CLUSTERS_MAX];
[645]831    int            * B[CLUSTERS_MAX];
832    int            * C[CLUSTERS_MAX];
833    int            * D[CLUSTERS_MAX];
834    unsigned char  * Z[CLUSTERS_MAX];
835
836    for( c = 0 ; c < nclusters ; c++ )
837    {
838        A[c] = GA[c];
839        B[c] = GB[c];
840        C[c] = GC[c];
841        D[c] = GD[c];
842        Z[c] = GZ[c];
843    }
844
[676]845    unsigned int npixels  = image_np * lines_per_thread;     // pixels moved by any thread
846    unsigned int g_offset = npixels * tid;             // offset in global buffer for tid
847    unsigned int l_offset = npixels * lid;             // offset in local buffer for tid
[645]848
[676]849    // min and max line indexes handled by thread[tid] for a global buffer
850    unsigned int global_lmin = tid * lines_per_thread;   
851    unsigned int global_lmax = global_lmin + lines_per_thread; 
852
853    // min and max line indexes handled by thread[tid] for a local buffer
854    unsigned int local_lmin  = lid * lines_per_thread;   
855    unsigned int local_lmax  = local_lmin + lines_per_thread; 
856
857    // pmin and pmax pixel indexes handled by thread[tid] in a column
858    unsigned int column_pmin = tid * pixels_per_thread; 
859    unsigned int column_pmax = column_pmin + pixels_per_thread; 
860
861    // Each thread[tid] copy npixels from image_in buffer to local A[cid] buffer
862    memcpy( A[cid]   + l_offset,
863            image_in + g_offset,
864            npixels );
[645]865 
[652]866#if VERBOSE_EXEC
[645]867get_cycle( &date );
[656]868printf( "\n[convol] exec[%d] on core[%x,%d] loaded input file in A[%d] / cycle %d\n", 
869tid , cxy , lpid , cid , (unsigned int)date);
[645]870#endif
871
[676]872    // Optionnal parallel display for the initial image
[645]873    if ( INITIAL_DISPLAY_ENABLE )
874    {
[676]875        // each thread[tid] copy npixels from A[cid] to in_win_buf buffer
876        memcpy( in_win_buf + g_offset,
877                A[cid]     + l_offset,
878                npixels );
[645]879
[676]880        // refresh the FBF window
881        if( fbf_refresh_window( in_wid , global_lmin , global_lmax ) ) 
[645]882        {
[676]883            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
884            __FUNCTION__ , tid );
885            pthread_exit( &THREAD_EXIT_FAILURE );
[645]886        }
887
[652]888#if VERBOSE_EXEC
[645]889get_cycle( &date );
[656]890printf( "\n[convol] exec[%d] on core[%x,%d] completed initial display / cycle %d\n",
891tid , cxy , lpid , (unsigned int)date );
[645]892#endif
893
894        ////////////////////////////////
895        pthread_barrier_wait( &barrier );
896    }
897
898    ////////////////////////////////////////////////////////////
899    // parallel horizontal filter :
[676]900    // B <= Transpose(FH(A))
[645]901    // D <= A - FH(A)
[676]902    // Each thread computes (image_nl/nthreads) lines.
[645]903    // The image must be extended :
[652]904    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
[676]905    // if (z>image_np-1) TA(cid,l,z) == TA(cid,l,image_np-1)
[645]906    ////////////////////////////////////////////////////////////
907
908    get_cycle( &date );
[652]909    H_BEG[cid][lid] = (unsigned int)date;
[645]910
[676]911    // l = global line index / p = absolute pixel index 
[645]912
[676]913    for (l = global_lmin; l < global_lmax; l++)
[645]914    {
915        // src_c and src_l are the cluster index and the line index for A & D
916        int src_c = l / lines_per_cluster;
917        int src_l = l % lines_per_cluster;
918
919        // We use the specific values of the horizontal ep-filter for optimisation:
920        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
921        // To minimize the number of tests, the loop on pixels is split in three domains
922
923        int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
924        for (z = 1; z < hrange; z++)
925        {
926            sum_p = sum_p + TA(src_c, src_l, z);
927        }
928
929        // first domain : from 0 to hrange
930        for (p = 0; p < hrange + 1; p++)
931        {
932            // dst_c and dst_p are the cluster index and the pixel index for B
933            int dst_c = p / pixels_per_cluster;
934            int dst_p = p % pixels_per_cluster;
935            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
936            TB(dst_c, dst_p, l) = sum_p / hnorm;
937            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
938        }
[676]939        // second domain : from (hrange+1) to (image_np-hrange-1)
940        for (p = hrange + 1; p < image_np - hrange; p++)
[645]941        {
942            // dst_c and dst_p are the cluster index and the pixel index for B
943            int dst_c = p / pixels_per_cluster;
944            int dst_p = p % pixels_per_cluster;
945            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) 
946                          - (int) TA(src_c, src_l, p - hrange - 1);
947            TB(dst_c, dst_p, l) = sum_p / hnorm;
948            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
949        }
[676]950        // third domain : from (image_np-hrange) to (image_np-1)
951        for (p = image_np - hrange; p < image_np; p++)
[645]952        {
953            // dst_c and dst_p are the cluster index and the pixel index for B
954            int dst_c = p / pixels_per_cluster;
955            int dst_p = p % pixels_per_cluster;
[676]956            sum_p = sum_p + (int) TA(src_c, src_l, image_np - 1) 
[645]957                          - (int) TA(src_c, src_l, p - hrange - 1);
958            TB(dst_c, dst_p, l) = sum_p / hnorm;
959            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
960        }
961
962#if SUPER_VERBOSE
963get_cycle( &date );
964printf(" - line %d computed at cycle %d\n", l, (unsigned int)date );
965#endif   
966
967    }
968
969    get_cycle( &date );
[652]970    H_END[cid][lid] = (unsigned int)date;
[645]971
[652]972#if VERBOSE_EXEC
[656]973get_cycle( &date );
974printf( "\n[convol] exec[%d] on core[%x,%d] completed horizontal filter / cycle %d\n",
975tid , cxy , lpid , (unsigned int)date );
[645]976#endif
977
978    ////////////////////////////////
979    pthread_barrier_wait( &barrier );
980
981    ///////////////////////////////////////////////////////////////
982    // parallel vertical filter :
[676]983    // C <= Transpose(FV(B))
984    // Each thread computes (image_np/nthreads) columns
[645]985    // The image must be extended :
[652]986    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
[676]987    // if (l>image_nl-1)   TB(cid,p,l) == TB(cid,p,image_nl-1)
[645]988    ///////////////////////////////////////////////////////////////
989
990    get_cycle( &date );
[652]991    V_BEG[cid][lid] = (unsigned int)date;
[645]992
[676]993    // l = global line index / p = pixel index in column
[645]994
[676]995    for (p = column_pmin; p < column_pmax ; p++)
[645]996    {
997        // src_c and src_p are the cluster index and the pixel index for B
998        int src_c = p / pixels_per_cluster;
999        int src_p = p % pixels_per_cluster;
1000
1001        int sum_l;
1002
1003        // We use the specific values of the vertical ep-filter
[676]1004        // To minimize the number of tests, the image_nl lines are split in three domains
[645]1005
1006        // first domain : explicit computation for the first 18 values
1007        for (l = 0; l < 18; l++)
1008        {
1009            // dst_c and dst_l are the cluster index and the line index for C
1010            int dst_c = l / lines_per_cluster;
1011            int dst_l = l % lines_per_cluster;
1012
1013            for (z = 0, sum_l = 0; z < 35; z++)
1014            {
1015                sum_l = sum_l + vf[z] * TB(src_c, src_p, max(l - 17 + z,0) );
1016            }
1017            TC(dst_c, dst_l, p) = sum_l / vnorm;
1018        }
1019        // second domain
[676]1020        for (l = 18; l < image_nl - 17; l++)
[645]1021        {
1022            // dst_c and dst_l are the cluster index and the line index for C
1023            int dst_c = l / lines_per_cluster;
1024            int dst_l = l % lines_per_cluster;
1025
1026            sum_l = sum_l + TB(src_c, src_p, l + 4)
1027                  + TB(src_c, src_p, l + 8)
1028                  + TB(src_c, src_p, l + 11)
1029                  + TB(src_c, src_p, l + 15)
1030                  + TB(src_c, src_p, l + 17)
1031                  - TB(src_c, src_p, l - 5)
1032                  - TB(src_c, src_p, l - 9)
1033                  - TB(src_c, src_p, l - 12)
1034                  - TB(src_c, src_p, l - 16)
1035                  - TB(src_c, src_p, l - 18);
1036
1037            TC(dst_c, dst_l, p) = sum_l / vnorm;
1038        }
1039        // third domain
[676]1040        for (l = image_nl - 17; l < image_nl; l++)
[645]1041        {
1042            // dst_c and dst_l are the cluster index and the line index for C
1043            int dst_c = l / lines_per_cluster;
1044            int dst_l = l % lines_per_cluster;
1045
[676]1046            sum_l = sum_l + TB(src_c, src_p, min(l + 4, image_nl - 1))
1047                  + TB(src_c, src_p, min(l + 8, image_nl - 1))
1048                  + TB(src_c, src_p, min(l + 11, image_nl - 1))
1049                  + TB(src_c, src_p, min(l + 15, image_nl - 1))
1050                  + TB(src_c, src_p, min(l + 17, image_nl - 1))
[645]1051                  - TB(src_c, src_p, l - 5)
1052                  - TB(src_c, src_p, l - 9)
1053                  - TB(src_c, src_p, l - 12)
1054                  - TB(src_c, src_p, l - 16)
1055                  - TB(src_c, src_p, l - 18);
1056
1057            TC(dst_c, dst_l, p) = sum_l / vnorm;
1058        }
1059
1060#if SUPER_VERBOSE
1061get_cycle( &date );
1062printf(" - column %d computed at cycle %d\n", p, (unsigned int)date );
1063#endif
1064
1065    }
1066
1067    get_cycle( &date );
[652]1068    V_END[cid][lid] = (unsigned int)date;
[645]1069
[652]1070#if VERBOSE_EXEC
[656]1071get_cycle( &date );
1072printf( "\n[convol] exec[%d] on core[%x,%d] completed vertical filter / cycle %d\n",
1073tid , cxy , lid , (unsigned int)date );
[645]1074#endif
1075
1076    ////////////////////////////////
1077    pthread_barrier_wait( &barrier );
1078
[676]1079    ///////////////////////////////////////////////////////////////
1080    // build final image in local Z buffer from C & D local buffers
1081    // store it in output image file, and display it on FBF.
1082    // Z <= C + D
1083    ///////////////////////////////////////////////////////////////
[645]1084
[676]1085    get_cycle( &date );
1086    F_BEG[cid][lid] = (unsigned int)date;
[645]1087
[676]1088    // Each thread[tid] set local buffer Z[cid] from local buffers C[cid] & D[cid]
[645]1089
[676]1090    for( l = local_lmin ; l < local_lmax ; l++ )
1091    {
1092        for( p = 0 ; p < image_np ; p++ )
[645]1093        {
[676]1094            TZ(cid,l,p) = TC(cid,l,p) + TD(cid,l,p);
1095        }
1096    }
[645]1097
[676]1098    // Each thread[tid] copy npixels from Z[cid] buffer to image_out buffer
1099    memcpy( image_out + g_offset,
1100            Z[cid]    + l_offset,
1101            npixels );
[645]1102
[676]1103    // Optional parallel display of the final image
1104    if ( FINAL_DISPLAY_ENABLE )
1105    {
1106        // each thread[tid] copy npixels from Z[cid] to out_win_buf buffer
1107        memcpy( out_win_buf + g_offset,
1108                Z[cid]      + l_offset,
1109                npixels );
1110
1111        // refresh the FBF window
1112        if( fbf_refresh_window( out_wid , global_lmin , global_lmax ) )
1113        {
1114            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
1115            __FUNCTION__ , tid );
1116            pthread_exit( &THREAD_EXIT_FAILURE );
[645]1117        }
1118
[676]1119#if VERBOSE_EXEC
[656]1120get_cycle( &date );
1121printf( "\n[convol] exec[%d] on core[%x,%d] completed final display / cycle %d\n",
[676]1122tid , cxy , lpid , (unsigned int)date );
[645]1123#endif
[652]1124
[645]1125    }
1126
[656]1127    // Each thread[cid,0] releases the 5 local buffers
1128    if( lid == 0 )
[645]1129    {
[656]1130        free( A[cid] );
1131        free( B[cid] );
1132        free( C[cid] );
1133        free( D[cid] );
1134        free( Z[cid] );
1135    }
1136
[676]1137    get_cycle( &date );
1138    F_END[cid][lid] = (unsigned int)date;
1139
[656]1140    // thread termination depends on the placement policy
1141    if( PARALLEL_PLACEMENT )   
1142    {
1143        // <exec> threads are runing in detached mode, and
1144        // each thread must signal completion by calling barrier
1145        // passed in arguments before exit
1146
1147        pthread_barrier_wait( args->barrier );
1148
[645]1149        pthread_exit( &THREAD_EXIT_SUCCESS );
1150    }
[656]1151    else
1152    {
1153        // <exec> threads are running in attached mode
1154        // all threads (but the one executing main) exit
1155        if ( tid != tid_main ) pthread_exit( &THREAD_EXIT_SUCCESS );
1156    }
[645]1157
[656]1158    return NULL;
1159
[645]1160} // end execute()
1161
1162
1163
[676]1164
1165
1166
[652]1167//////////////////////////
1168void instrument( FILE * f,
1169                 char * filename )
[645]1170{
[652]1171    unsigned int nclusters = x_size * y_size;
[645]1172
[652]1173    unsigned int cc, pp;
[645]1174
[652]1175    unsigned int min_start = 0xFFFFFFFF;
1176    unsigned int max_start = 0;
[645]1177
[652]1178    unsigned int min_h_beg = 0xFFFFFFFF;
1179    unsigned int max_h_beg = 0;
[645]1180
[652]1181    unsigned int min_h_end = 0xFFFFFFFF;
1182    unsigned int max_h_end = 0;
[645]1183
[652]1184    unsigned int min_v_beg = 0xFFFFFFFF;
1185    unsigned int max_v_beg = 0;
[645]1186
[652]1187    unsigned int min_v_end = 0xFFFFFFFF;
1188    unsigned int max_v_end = 0;
[645]1189
[676]1190    unsigned int min_f_beg = 0xFFFFFFFF;
1191    unsigned int max_f_beg = 0;
[645]1192
[676]1193    unsigned int min_f_end = 0xFFFFFFFF;
1194    unsigned int max_f_end = 0;
[652]1195
1196    for (cc = 0; cc < nclusters; cc++)
1197    {
1198        for (pp = 0; pp < ncores; pp++ )
[645]1199        {
[652]1200            if (START[cc][pp] < min_start) min_start = START[cc][pp];
1201            if (START[cc][pp] > max_start) max_start = START[cc][pp];
[645]1202
[652]1203            if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
1204            if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
[645]1205
[652]1206            if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
1207            if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
[645]1208
[652]1209            if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
1210            if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
[645]1211
[652]1212            if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
1213            if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
[645]1214
[676]1215            if (F_BEG[cc][pp] < min_f_beg) min_f_beg = F_BEG[cc][pp];
1216            if (F_BEG[cc][pp] > max_f_beg) max_f_beg = F_BEG[cc][pp];
[645]1217
[676]1218            if (F_END[cc][pp] < min_f_end) min_f_end = F_END[cc][pp];
1219            if (F_END[cc][pp] > max_f_end) max_f_end = F_END[cc][pp];
[645]1220        }
[652]1221    }
[645]1222
[652]1223    // display on terminal
1224    printf( "\n ------ %s ------\n" , filename );
[645]1225
[652]1226    printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
1227           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
[645]1228
[652]1229    printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1230           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
[645]1231
[652]1232    printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
1233           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
[645]1234
[652]1235    printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1236           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
[645]1237
[652]1238    printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
1239           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
[645]1240
[652]1241    printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
[676]1242           min_f_beg, max_f_beg, (min_f_beg+max_f_beg)/2, max_f_beg-min_f_beg);
[645]1243
[652]1244    printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
[676]1245           min_f_end, max_f_end, (min_f_end+max_f_end)/2, max_f_end-min_f_end);
[645]1246
[656]1247    printf( "\n General Scenario   (Kcycles)\n" );
[652]1248    printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
1249    printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
1250    printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
1251    printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
[676]1252    printf( " - BARRIER VERT/DISP = %d\n", (min_f_beg - max_v_end)/1000 );
1253    printf( " - DISPLAY           = %d\n", (max_f_end - min_f_beg)/1000 );
[656]1254    printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n",
1255            SEQUENCIAL_TIME/1000, PARALLEL_TIME/1000 );
[645]1256
[652]1257    // save on disk
1258    fprintf( f ,  "\n ------ %s ------\n" , filename );
1259
1260    fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n",
1261           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
1262
1263    fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1264           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
1265
1266    fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n",
1267           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
1268
1269    fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
1270           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
1271
1272    fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n",
1273           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
1274
1275    fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
[676]1276           min_f_beg, max_f_beg, (min_f_beg+max_f_beg)/2, max_f_beg-min_f_beg);
[652]1277
1278    fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n",
[676]1279           min_f_end, max_f_end, (min_f_end+max_f_end)/2, max_f_end-min_f_end);
[652]1280
1281    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
1282    fprintf( f ,  " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
1283    fprintf( f ,  " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
1284    fprintf( f ,  " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
1285    fprintf( f ,  " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
[676]1286    fprintf( f ,  " - BARRIER VERT/DISP = %d\n", (min_f_beg - max_v_end)/1000 );
1287    fprintf( f ,  " - SAVE              = %d\n", (max_f_end - min_f_beg)/1000 );
[656]1288    fprintf( f ,  " \nSEQUENCIAL = %d / PARALLEL = %d\n",
1289    SEQUENCIAL_TIME/1000, PARALLEL_TIME/1000 );
[652]1290
[645]1291} // end instrument()
1292
1293
1294
1295
1296
1297// Local Variables:
1298// tab-width: 3
1299// c-basic-offset: 3
1300// c-file-offsets:((innamespace . 0)(inline-open . 0))
1301// indent-tabs-mode: nil
1302// End:
1303
1304// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3
1305
1306
Note: See TracBrowser for help on using the repository browser.