Changeset 502 for soft/giet_vm/applications/transpose
- Timestamp: Feb 8, 2015, 9:20:45 PM (10 years ago)
- Location: soft/giet_vm/applications/transpose
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
soft/giet_vm/applications/transpose/main.c
r444 r502 1 /////////////////////////////////////////////////////////////////////////////////////// //////1 /////////////////////////////////////////////////////////////////////////////////////// 2 2 // File : main.c (for transpose application) 3 3 // Date : february 2014 4 4 // author : Alain Greiner 5 ///////////////////////////////////////////////////////////////////////////////////////////// 6 // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images. 5 /////////////////////////////////////////////////////////////////////////////////////// 6 // This multi-threaded application makes a transpose for a NN*NN pixels 7 // sequence of images. 7 8 // It can run on a multi-processors, multi-clusters architecture, with one thread 8 // per processor. It uses the he following hardware parameters, that must be defined 9 // in the hard_config.h file: 10 // - X_SIZE : number of clusters in a row 11 // - Y_SIZE : number of clusters in a column 12 // - NB_PROCS_MAX : number of processors per cluster 13 // - FBUF_X_SIZE : number of pixels per line in frame buffer 14 // - FBUF_Y_SIZE : number of lines in frame buffer 15 // 9 // per processor. 10 // 16 11 // The image sequence is read from a file (one byte per pixel). 17 12 // The input and output buffers containing the image are distributed in all clusters. 18 13 // 19 // - The image size NN must be a power of 2 and must fit the frame buffer size.20 // - The number of clusters containing processors must be a power of 2.21 // - The number of processors per cluster must be a power of 2.22 // - The image size NN must be larger or equal to the total number of processor.14 // - The image size NN must fit the frame buffer size: 128 bytes 15 // - The block size in block device must be 512 bytes. 
16 // - The number of clusters must be a power of 2 no larger than 32 17 // - The number of processors per cluster must be a power of 2 no larger than 4 23 18 // 24 19 // For each image the application makes a self test (checksum for each line). 25 20 // The actual display on the frame buffer depends on frame buffer availability. 26 ///////////////////////////////////////////////////////////////////////////////////////////// 27 28 #include "hard_config.h" 21 /////////////////////////////////////////////////////////////////////////////////////// 22 29 23 #include "stdio.h" 30 #include " barrier.h"24 #include "user_barrier.h" 31 25 #include "malloc.h" 32 26 27 #define BLOCK_SIZE 512 // block size on disk 28 #define CLUSTERS_MAX 32 // max number of clusters 29 #define PROCS_MAX 4 // max number of processors per cluster 33 30 #define NN 128 // image size : nlines = npixels = 128 34 31 #define NB_IMAGES 5 // number of images to be handled 35 32 #define FILE_PATHNAME "misc/images.raw" // file pathname on disk 36 #define NB_CLUSTERS (X_SIZE * Y_SIZE) // number of clusters37 33 #define INSTRUMENTATION_OK 0 // display statistics on TTY when non zero 38 34 … … 41 37 /////////////////////////////////////////////////////// 42 38 43 // instrumentation counters 44 // for each processor (up to 4 processors) 45 // in each cluster (up to 32 clusters) 46 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX]; 47 unsigned int LOAD_END [NB_CLUSTERS][NB_PROCS_MAX]; 48 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX]; 49 unsigned int TRSP_END [NB_CLUSTERS][NB_PROCS_MAX]; 50 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX]; 51 unsigned int DISP_END [NB_CLUSTERS][NB_PROCS_MAX]; 39 // instrumentation counters for each processor in each cluster 40 unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX]; 41 unsigned int LOAD_END [CLUSTERS_MAX][PROCS_MAX]; 42 unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX]; 43 unsigned int TRSP_END [CLUSTERS_MAX][PROCS_MAX]; 44 unsigned int 
DISP_START[CLUSTERS_MAX][PROCS_MAX]; 45 unsigned int DISP_END [CLUSTERS_MAX][PROCS_MAX]; 52 46 53 47 // arrays of pointers on distributed buffers 54 48 // one input buffer & one output buffer per cluster 55 unsigned char* buf_in [ NB_CLUSTERS];56 unsigned char* buf_out[ NB_CLUSTERS];49 unsigned char* buf_in [CLUSTERS_MAX]; 50 unsigned char* buf_out[CLUSTERS_MAX]; 57 51 58 52 // checksum variables … … 61 55 62 56 // global synchronisation barrier 63 giet_ barrier_t barrier;57 giet_sqt_barrier_t barrier; 64 58 65 59 volatile unsigned int init_ok = 0; … … 70 64 { 71 65 72 int file = 0; // file descriptor 73 unsigned int l; // line index for loops 74 unsigned int p; // pixel index for loops 75 unsigned int c; // cluster index for loops 76 77 // get processor identifiers 78 unsigned int x; // x cluster coordinate 79 unsigned int y; // y cluster coordinate 80 unsigned int lpid; // local processor index 66 unsigned int l; // line index for loops 67 unsigned int p; // pixel index for loops 68 unsigned int c; // cluster index for loops 69 70 // processor identifiers 71 unsigned int x; // x cluster coordinate 72 unsigned int y; // y cluster coordinate 73 unsigned int lpid; // local processor index 74 75 // plat-form parameters 76 unsigned int x_size; // number of clusters in a row 77 unsigned int y_size; // number of clusters in a column 78 unsigned int nprocs; // number of processors per cluster 79 81 80 giet_proc_xyp( &x, &y, &lpid); 82 81 83 unsigned int npixels = NN * NN; // pixels per image 84 unsigned int nblocks = npixels / 512; // blocks per image 85 unsigned int image = 0; // image counter 86 87 unsigned int cluster_id = (x * Y_SIZE) + y; // "continuous" index 88 unsigned int ntasks = NB_CLUSTERS * NB_PROCS_MAX; // number of tasks 89 unsigned int task_id = (cluster_id * NB_PROCS_MAX) + lpid; // "continuous" task index 82 giet_procs_number( &x_size , &y_size , &nprocs ); 83 84 giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n" 85 " - x_size = 
%d\n" 86 " - y_size = %d\n" 87 " - nprocs = %d\n", 88 x, y, lpid, giet_proctime(), x_size , y_size , nprocs ); 89 90 unsigned int nclusters = x_size * y_size; // number of clusters 91 unsigned int ntasks = x_size * y_size * nprocs; // number of tasks 92 unsigned int npixels = NN * NN; // pixels per image 93 unsigned int nblocks = npixels / BLOCK_SIZE; // blocks per image 94 unsigned int image = 0; // image counter 95 int file = 0; // file descriptor 96 unsigned int cluster_id = (x * y_size) + y; // "continuous" index 97 unsigned int task_id = (cluster_id * nprocs) + lpid; // "continuous" task index 90 98 91 99 // Processor [0,0,0] makes initialisation 92 // It includes parameters checking, barrier sinitialization,100 // It includes parameters checking, barrier initialization, 93 101 // distributed buffers allocation, and file open 94 102 if ( (x==0) && (y==0) && (lpid==0) ) 95 103 { 96 // Parameters checking 97 if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) ) 98 { 99 giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size"); 100 } 101 if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4)) 104 if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4)) 102 105 { 103 giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAXmust be 1, 2 or 4");104 } 105 if (( NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS!= 4) &&106 ( NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS!= 32) )106 giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4"); 107 } 108 if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) && 109 (nclusters != 8) && (nclusters != 16) && (nclusters != 32) ) 107 110 { 108 111 giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32"); … … 113 116 } 114 117 115 giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"116 " - x_size = %d\n"117 " - y_size = %d\n"118 " - nprocs = %d\n"119 " - nclusters = %d\n"120 " - ntasks = %d\n",121 giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, 
NB_CLUSTERS, ntasks );122 123 118 // Barrier initialisation 124 barrier_init( &barrier, ntasks );119 sqt_barrier_init( &barrier, x_size , y_size , nprocs ); 125 120 126 121 giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n", … … 128 123 129 124 // Distributed buffers allocation 130 // The buffers containing one image are distributed in clusters131 // (one buf_in and one buf_out per cluster).132 // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.133 for ( c = 0 ; c < NB_CLUSTERS; c++ )134 { 135 unsigned int rx = c / Y_SIZE;136 unsigned int ry = c % Y_SIZE;137 138 buf_in[c] = remote_malloc( npixels/ NB_CLUSTERS, rx, ry );139 buf_out[c] = remote_malloc( npixels/ NB_CLUSTERS, rx, ry );125 // The buffers containing one image are distributed in the user 126 // heap (one buf_in and one buf_out per cluster). 127 // Each buffer contains (NN*NN / nclusters) bytes. 128 for ( c = 0 ; c < nclusters ; c++ ) 129 { 130 unsigned int rx = c / y_size; 131 unsigned int ry = c % y_size; 132 133 buf_in[c] = remote_malloc( npixels/nclusters, rx, ry ); 134 buf_out[c] = remote_malloc( npixels/nclusters, rx, ry ); 140 135 141 136 giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation" … … 167 162 { 168 163 while ( init_ok == 0 ); 169 giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",170 x, y, lpid, giet_proctime() );171 164 } 172 165 … … 175 168 while (image < NB_IMAGES) 176 169 { 177 // pseudo parallel load from disk to buf_in buffer : nblocks/ NB_CLUSTERSblocks170 // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks 178 171 // only task running on processor with (lpid == 0) does it 179 172 … … 184 177 giet_fat_read( file, 185 178 buf_in[cluster_id], 186 (nblocks / NB_CLUSTERS), 187 ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) ); 188 179 (nblocks / nclusters), 180 ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) ); 181 182 if ( (x==0) && (y==0) ) 189 183 
giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load" 190 184 " for image %d at cycle %d\n", … … 194 188 LOAD_END[cluster_id][lpid] = giet_proctime(); 195 189 196 ///////////////////////// 197 barrier_wait( &barrier );190 ///////////////////////////// 191 sqt_barrier_wait( &barrier ); 198 192 199 193 // parallel transpose from buf_in to buf_out … … 206 200 207 201 unsigned int nlt = NN / ntasks; // number of lines per task 208 unsigned int nlc = NN / NB_CLUSTERS; // number of lines per cluster202 unsigned int nlc = NN / nclusters; // number of lines per cluster 209 203 210 204 unsigned int src_cluster; … … 242 236 if ( lpid == 0 ) 243 237 { 238 if ( (x==0) && (y==0) ) 244 239 giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose" 245 240 " for image %d at cycle %d\n", … … 249 244 TRSP_END[cluster_id][lpid] = giet_proctime(); 250 245 251 ///////////////////////// 252 barrier_wait( &barrier );246 ///////////////////////////// 247 sqt_barrier_wait( &barrier ); 253 248 254 249 // optional parallel display from local buf_out to frame buffer … … 265 260 npt ); 266 261 267 if ( lpid == 0 ) 268 { 269 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display" 270 " for image %d at cycle %d\n", 271 x, y, image, giet_proctime() ); 272 } 262 if ( (x==0) && (y==0) && (lpid==0) ) 263 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display" 264 " for image %d at cycle %d\n", 265 x, y, lpid, image, giet_proctime() ); 273 266 274 267 DISP_END[cluster_id][lpid] = giet_proctime(); 275 268 276 ///////////////////////// 277 barrier_wait( &barrier );269 ///////////////////////////// 270 sqt_barrier_wait( &barrier ); 278 271 } 279 272 … … 318 311 } 319 312 320 ///////////////////////// 321 barrier_wait( &barrier );313 ///////////////////////////// 314 sqt_barrier_wait( &barrier ); 322 315 323 316 // instrumentation done by processor [0,0,0] … … 338 331 unsigned int max_disp_ended = 0; 339 332 340 for (cc = 0; cc < NB_CLUSTERS; cc++)333 for (cc = 0; cc 
< nclusters; cc++) 341 334 { 342 335 for (pp = 0; pp < NB_PROCS_MAX; pp++) … … 384 377 image++; 385 378 386 ///////////////////////// 387 barrier_wait( &barrier );379 ///////////////////////////// 380 sqt_barrier_wait( &barrier ); 388 381 389 382 } // end while image … … 392 385 if ( (x==0) && (y==0) && (lpid==0) ) 393 386 { 394 for ( c = 0 ; c < NB_CLUSTERS; c++ )387 for ( c = 0 ; c < nclusters ; c++ ) 395 388 { 396 389 free( buf_in[c] ); -
soft/giet_vm/applications/transpose/transpose.py
r457 r502 3 3 from mapping import * 4 4 5 ################################################################################## ####5 ################################################################################## 6 6 # file : transpose.py (for the transpose application) 7 7 # date : may 2014 8 8 # author : Alain Greiner 9 ################################################################################## #####9 ################################################################################## 10 10 # This file describes the mapping of the multi-threaded "transpose" 11 11 # application on a multi-clusters, multi-processors architecture. 12 12 # This includes both the mapping of virtual segments on the clusters, 13 13 # and the mapping of tasks on processors. 14 # There is one task per processor. 15 # The mapping of virtual segments is the following: 16 # - There is one shared data vseg in cluster[0][0] 17 # - The code vsegs are replicated on all clusters containing processors. 18 # - There is one heap vseg per cluster containing processors. 19 # - The stacks vsegs are distributed on all clusters containing processors. 
14 20 # This mapping uses 5 platform parameters, (obtained from the "mapping" argument) 15 # - x_size : number of clusters in a row16 # - y_size : number of clusters in a column17 # - x_width : number of bits coding x coordinate18 # - y_width : number of bits coding y coordinate19 # - nprocs : number of processors per cluster20 ################################################################################## ##21 # - x_size : number of clusters in a row 22 # - y_size : number of clusters in a column 23 # - x_width : number of bits coding x coordinate 24 # - y_width : number of bits coding y coordinate 25 # - nprocs : number of processors per cluster 26 ################################################################################## 21 27 22 28 ######################### … … 54 60 for x in xrange (x_size): 55 61 for y in xrange (y_size): 56 mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size, 57 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 58 binpath = 'build/transpose/transpose.elf', 59 local = True ) 62 cluster_id = (x * y_size) + y 63 if ( mapping.clusters[cluster_id].procs ): 60 64 61 # stacks vsegs: local (one stack per processor => nprocs stacks per cluster) 62 for x in xrange (x_size): 63 for y in xrange (y_size): 64 for p in xrange( nprocs ): 65 proc_id = (((x * y_size) + y) * nprocs) + p 66 size = (stack_size / nprocs) & 0xFFFFF000 67 base = stack_base + (proc_id * size) 68 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size, 69 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 70 local = True, big = True ) 65 mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), 66 code_base , code_size, 67 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 68 binpath = 'build/transpose/transpose.elf', 69 local = True ) 71 70 72 # heap vsegs: distributed but non local (all heap vsegs can be accessed by all tasks)71 # stacks vsegs: local (one stack per processor => nprocs stacks per cluster) 73 72 for x in xrange (x_size): 74 
73 for y in xrange (y_size): 75 74 cluster_id = (x * y_size) + y 76 size = heap_size 77 base = heap_base + (cluster_id * size) 78 mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size, 79 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM', 80 local = False, big = True ) 75 if ( mapping.clusters[cluster_id].procs ): 76 for p in xrange( nprocs ): 77 proc_id = (((x * y_size) + y) * nprocs) + p 78 size = (stack_size / nprocs) & 0xFFFFF000 79 base = stack_base + (proc_id * size) 80 81 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), 82 base, size, 'C_WU', vtype = 'BUFFER', 83 x = x , y = y , pseg = 'RAM', 84 local = True, big = True ) 85 86 # heap vsegs: distributed non local (all heap vsegs can be accessed by all tasks) 87 for x in xrange (x_size): 88 for y in xrange (y_size): 89 cluster_id = (x * y_size) + y 90 if ( mapping.clusters[cluster_id].procs ): 91 size = heap_size 92 base = heap_base + (cluster_id * size) 93 94 mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size, 95 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM', 96 local = False, big = True ) 81 97 82 98 # distributed tasks / one task per processor 83 99 for x in xrange (x_size): 84 100 for y in xrange (y_size): 85 for p in xrange( nprocs ): 86 trdid = (((x * y_size) + y) * nprocs) + p 87 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p, 88 'trsp_stack_%d_%d_%d' % (x,y,p), 89 'trsp_heap_%d_%d' % (x,y), 0 ) 101 cluster_id = (x * y_size) + y 102 if ( mapping.clusters[cluster_id].procs ): 103 for p in xrange( nprocs ): 104 trdid = (((x * y_size) + y) * nprocs) + p 105 106 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), 107 trdid, x, y, p, 108 'trsp_stack_%d_%d_%d' % (x,y,p), 109 'trsp_heap_%d_%d' % (x,y), 0 ) 90 110 91 111 # extend mapping name … … 94 114 return vspace # useful for test 95 115 96 ################################ test ################################################## ####116 ################################ test 
################################################## 97 117 98 118 if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.