Changeset 383 for soft/giet_vm/transpose
- Timestamp:
- Aug 7, 2014, 12:27:17 PM (10 years ago)
- Location:
- soft/giet_vm/transpose
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
soft/giet_vm/transpose/main.c
r355 r383 20 20 #include "stdio.h" 21 21 #include "barrier.h" 22 #include "malloc.h" 22 23 23 24 #define NN 128 // image size : nlines = npixels = 128 24 25 #define NB_IMAGES 5 // number of images to be handled 25 26 #define FILE_PATHNAME "misc/images.raw" // file pathname on disk 26 27 #define NB_CLUSTERS (X_SIZE * Y_SIZE) // number of clusters 27 28 #define INSTRUMENTATION_OK 1 // display statistics on TTY when non zero 28 29 … … 34 35 // for each processor (up to 4 processors) 35 36 // in each cluster (up to 32 clusters) 36 unsigned int LOAD_START[ 32][4];37 unsigned int LOAD_END [ 32][4];38 unsigned int TRSP_START[ 32][4];39 unsigned int TRSP_END [ 32][4];40 unsigned int DISP_START[ 32][4];41 unsigned int DISP_END [ 32][4];37 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX]; 38 unsigned int LOAD_END [NB_CLUSTERS][NB_PROCS_MAX]; 39 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX]; 40 unsigned int TRSP_END [NB_CLUSTERS][NB_PROCS_MAX]; 41 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX]; 42 unsigned int DISP_END [NB_CLUSTERS][NB_PROCS_MAX]; 42 43 43 44 // arrays of pointers on distributed buffers 44 45 // one input buffer & one output buffer per cluster 45 unsigned char* buf_in [ 32];46 unsigned char* buf_out[ 32];46 unsigned char* buf_in [NB_CLUSTERS]; 47 unsigned char* buf_out[NB_CLUSTERS]; 47 48 48 49 // checksum variables … … 50 51 unsigned check_line_after[NN]; 51 52 52 // synchronisation barriers 53 giet_barrier_t barrier_0; 54 giet_barrier_t barrier_1; 55 giet_barrier_t barrier_2; 56 giet_barrier_t barrier_3; 57 giet_barrier_t barrier_4; 58 giet_barrier_t barrier_5; 53 // global synchronisation barrier 54 giet_barrier_t barrier; 59 55 60 56 volatile unsigned int init_ok = 1; … … 62 58 ////////////////////////////////////////// 63 59 __attribute__ ((constructor)) void main() 60 ////////////////////////////////////////// 64 61 { 65 62 … … 100 97 } 101 98 102 barrier_init( &barrier_0, ntasks ); 103 barrier_init( &barrier_1, ntasks ); 104 barrier_init( 
&barrier_2, ntasks ); 105 barrier_init( &barrier_3, ntasks ); 106 barrier_init( &barrier_4, ntasks ); 107 barrier_init( &barrier_5, ntasks ); 99 barrier_init( &barrier, ntasks ); 108 100 109 101 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes barrier init at cycle %d\n", … … 135 127 136 128 // allocate buffers in cluster[x,y] 137 buf_in[cluster_id] = ((unsigned char*)heap_base) + (cluster_xy << 20);138 buf_out[cluster_id] = buf_in[cluster_id] + NN*NN/nclusters;129 buf_in[cluster_id] = remote_malloc( npixels/NB_CLUSTERS, x, y); 130 buf_out[cluster_id] = remote_malloc( npixels/NB_CLUSTERS, x, y); 139 131 140 132 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes buffer allocation at cycle %d\n" … … 163 155 } 164 156 165 ///////////////////////// //166 barrier_wait( &barrier _0);157 ///////////////////////// 158 barrier_wait( &barrier ); 167 159 168 160 // Main loop (on images) … … 188 180 LOAD_END[cluster_id][lpid] = giet_proctime(); 189 181 190 ///////////////////////// //191 barrier_wait( &barrier _1);182 ///////////////////////// 183 barrier_wait( &barrier ); 192 184 193 185 // parallel transpose from buf_in to buf_out … … 243 235 TRSP_END[cluster_id][lpid] = giet_proctime(); 244 236 245 ///////////////////////// //246 barrier_wait( &barrier _2);237 ///////////////////////// 238 barrier_wait( &barrier ); 247 239 248 240 // optional parallel display from local buf_out to frame buffer … … 268 260 DISP_END[cluster_id][lpid] = giet_proctime(); 269 261 270 ///////////////////////// //271 barrier_wait( &barrier _3);262 ///////////////////////// 263 barrier_wait( &barrier ); 272 264 } 273 265 … … 312 304 } 313 305 314 ///////////////////////// //315 barrier_wait( &barrier _4);306 ///////////////////////// 307 barrier_wait( &barrier ); 316 308 317 309 // instrumentation done by processor [0,0,0] … … 382 374 // all tasks must wait instrumentation completion 383 375 ////////////////////////////////////////////////// 384 barrier_wait( &barrier _5);376 
barrier_wait( &barrier ); 385 377 386 378 } // end while image -
soft/giet_vm/transpose/transpose.py
r336 r383 31 31 # define vsegs base & size 32 32 code_base = 0x10000000 33 code_size = 0x00010000 # 64 Kbytes 33 code_size = 0x00010000 # 64 Kbytes (replicated in each cluster) 34 34 35 35 data_base = 0x20000000 36 data_size = 0x00010000 # 64 Kbytes 36 data_size = 0x00010000 # 64 Kbytes (non replicated) 37 37 38 38 ptab_base = 0x30000000 39 ptab_size = 0x00040000 # 256 Kbytes 39 ptab_size = 0x00040000 # 256 Kbytes (replicated in each cluster) 40 40 41 41 stack_base = 0x40000000 42 stack_size = 0x00 010000 # 64 Kbytes42 stack_size = 0x00100000 # 1 Mbytes (to be divided between all tasks) 43 43 44 44 heap_base = 0x50000000 45 heap_size = 0x00010000 # 64 Kbytes 45 heap_size = 0x00010000 # 64 Kbytes (to be shared by all tasks) 46 46 47 # create Vspace47 # create vspace 48 48 vspace = mapping.addVspace( name = 'transpose', startname = 'trsp_data' ) 49 49 50 # non replicated vsegs in cluster[0,0] 51 mapping.addVseg( vspace, 'trsp_code', code_base , code_size, 'CXWU', vtype = 'ELF', 52 x = 0, y = 0, pseg = 'RAM', binpath = 'build/transpose/transpose.elf' ) 50 # data vseg : shared (only in cluster[0,0]) 51 mapping.addVseg( vspace, 'trsp_data', data_base , data_size, 52 'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM', 53 binpath = 'build/transpose/transpose.elf', 54 local = False ) 53 55 54 mapping.addVseg( vspace, 'trsp_data', data_base , data_size, 'C_WU', vtype = 'ELF', 55 x = 0, y = 0, pseg = 'RAM', binpath = 'build/transpose/transpose.elf' ) 56 # code vsegs : local (one copy in each cluster) 57 for x in xrange (x_size): 58 for y in xrange (y_size): 59 mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size, 60 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 61 binpath = 'build/transpose/transpose.elf', 62 local = True ) 56 63 57 mapping.addVseg( vspace, 'trsp_ptab', ptab_base , ptab_size, 'C_WU', vtype = 'PTAB', 58 x = 0, y = 0, pseg = 'RAM', align = 13 ) 64 # ptab vsegs : local (one specific ptab per cluster) 65 for x in xrange (x_size): 66 for y 
in xrange (y_size): 67 mapping.addVseg( vspace, 'trsp_ptab_%d_%d' %(x,y), ptab_base , ptab_size, 68 'C_WU', vtype = 'PTAB', x = x, y = y, pseg = 'RAM', 69 align = 13, 70 local = True ) 59 71 60 # distributed vsegs: one stack per processor/task, one heap per cluster 61 for x_rep in xrange (x_size): 62 for y_rep in xrange (y_size): 63 cluster_offset = ((x_rep << y_width) + y_rep) << 20 # 1 Mbytes per cluster 64 mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x_rep, y_rep), 65 heap_base + cluster_offset, heap_size, 'C_WU', 66 vtype = 'BUFFER', x = x_rep, y = y_rep, pseg = 'RAM' ) 67 72 # stacks vsegs: local (one stack per processor, procs_max stacks per cluster) 73 for x in xrange (x_size): 74 for y in xrange (y_size): 68 75 for p in xrange( procs_max ): 69 proc_offset = cluster_offset + (p << 18) # 256 Kbytes per proc 70 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x_rep, y_rep, p), 71 stack_base + proc_offset, stack_size, 'C_WU', 72 vtype = 'BUFFER', x = x_rep, y = y_rep, pseg = 'RAM' ) 73 76 proc_id = (((x * y_size) + y) * procs_max) + p 77 size = stack_size / (x_size * y_size * procs_max) 78 base = stack_base + (proc_id * size) 79 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size, 80 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 81 local = True ) 82 83 # heap vsegs: shared (all heap segments can be accessed by all tasks) 84 for x in xrange (x_size): 85 for y in xrange (y_size): 86 cluster_id = (x * y_size) + y 87 size = heap_size / (x_size * y_size) 88 base = heap_base + (cluster_id * size) 89 mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size, 90 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM', 91 local = False ) 92 74 93 # distributed tasks / one task per processor 75 94 for x in xrange (x_size): 76 95 for y in xrange (y_size): 77 96 for p in xrange( procs_max ): 78 79 97 trdid = (((x * y_size) + y) * procs_max) + p 80 mapping.addTask( vspace, ' sort_%d_%d_%d' % (x,y,p), trdid, x, y, p,98 mapping.addTask( vspace, 
'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p, 81 99 'trsp_stack_%d_%d_%d' % (x,y,p), 82 100 'trsp_heap_%d_%d' % (x,y), 0 )
Note: See TracChangeset for help on using the changeset viewer.