- Timestamp:
- Feb 8, 2015, 9:20:45 PM (10 years ago)
- Location:
- soft/giet_vm/applications
- Files:
-
- 1 added
- 10 edited
Legend:
- Unmodified
- Added
- Removed
-
soft/giet_vm/applications/classif/classif.py
r488 r502 3 3 from mapping import * 4 4 5 ################################################################################### ###5 ################################################################################### 6 6 # file : classif.py 7 7 # date : november 2014 8 8 # author : Alain Greiner 9 ################################################################################### ####9 ################################################################################### 10 10 # This file describes the mapping of the multi-threaded "classif" 11 11 # application on a multi-clusters, multi-processors architecture. 12 12 # The mapping of tasks on processors is the following: 13 # - one "load" task per cluster, 14 # - one "store" task per cluster, 15 # - (nprocs-2) "analyse" task per cluster. 16 # The mapping of virtual segments on the clusters is the following: 17 # - The code vsegs are replicated on all clusters. 13 # - one "load" task per cluster containing processors, 14 # - one "store" task per cluster containing processors, 15 # - (nprocs-2) "analyse" task per cluster containing processors. 16 # The mapping of virtual segments is the following: 18 17 # - There is one shared data vseg in cluster[0][0] 19 # - There is one heap vseg per cluster. 20 # - The stacks vsegs are distibuted on all clusters. 18 # - The code vsegs are replicated on all clusters containing processors. 19 # - There is one heap vseg per cluster containing processors. 20 # - The stacks vsegs are distibuted on all clusters containing processors. 21 21 # This mapping uses 5 platform parameters, (obtained from the "mapping" argument) 22 22 # - x_size : number of clusters in a row … … 28 28 # WARNING: The target architecture cannot contain less 29 29 # than 3 processors per cluster. 
30 ################################################################################## ##30 ################################################################################## 31 31 32 32 ######################### … … 49 49 50 50 heap_base = 0x30000000 51 heap_size = 0x000 08000 # 32Kbytes (per cluster)51 heap_size = 0x00040000 # 256 Kbytes (per cluster) 52 52 53 53 stack_base = 0x40000000 … … 63 63 local = False ) 64 64 65 # heap _x_y vsegs : shared / one per cluster65 # heap vsegs : shared (one per cluster) 66 66 for x in xrange (x_size): 67 67 for y in xrange (y_size): 68 base = heap_base + ( (4*x + y) * heap_size ) 68 cluster_id = (x * y_size) + y 69 if ( mapping.clusters[cluster_id].procs ): 70 size = heap_size 71 base = heap_base + (cluster_id * size) 69 72 70 mapping.addVseg( vspace, 'classif_heap_%d_%d' %(x,y), base , heap_size,71 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM',72 local = False )73 mapping.addVseg( vspace, 'classif_heap_%d_%d' %(x,y), base , size, 74 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM', 75 local = False ) 73 76 74 77 # code vsegs : local (one copy in each cluster) 75 78 for x in xrange (x_size): 76 79 for y in xrange (y_size): 80 cluster_id = (x * y_size) + y 81 if ( mapping.clusters[cluster_id].procs ): 77 82 78 mapping.addVseg( vspace, 'classif_code_%d_%d' %(x,y), code_base , code_size, 79 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 80 binpath = 'build/classif/classif.elf', 81 local = True ) 83 mapping.addVseg( vspace, 'classif_code_%d_%d' %(x,y), 84 code_base , code_size, 85 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 86 binpath = 'build/classif/classif.elf', 87 local = True ) 82 88 83 # stacks vsegs: local (one stack per processor => nprocs stacks per cluster) 89 # stacks vsegs: local (one stack per processor => nprocs stacks per cluster) 84 90 for x in xrange (x_size): 85 91 for y in xrange (y_size): 86 for p in xrange( nprocs ): 87 proc_id = (((x * y_size) + y) * nprocs) + p 88 size = (stack_size / 
nprocs) & 0xFFFFF000 89 base = stack_base + (proc_id * size) 92 cluster_id = (x * y_size) + y 93 if ( mapping.clusters[cluster_id].procs ): 94 for p in xrange( nprocs ): 95 proc_id = (((x * y_size) + y) * nprocs) + p 96 size = (stack_size / nprocs) & 0xFFFFF000 97 base = stack_base + (proc_id * size) 90 98 91 mapping.addVseg( vspace, 'classif_stack_%d_%d_%d' % (x,y,p), base, size, 92 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 93 local = True, big = True ) 99 mapping.addVseg( vspace, 'classif_stack_%d_%d_%d' % (x,y,p), 100 base, size, 'C_WU', vtype = 'BUFFER', 101 x = x , y = y , pseg = 'RAM', 102 local = True, big = True ) 94 103 95 104 # distributed tasks / one task per processor 96 105 for x in xrange (x_size): 97 106 for y in xrange (y_size): 98 for p in xrange( nprocs ): 99 trdid = (((x * y_size) + y) * nprocs) + p 100 if ( p== 0 ): # task load 101 task_index = 0 102 task_name = 'load_%d_%d_%d' %(x,y,p) 103 elif ( p== 1 ): # task store 104 task_index = 1 105 task_name = 'store_%d_%d_%d' %(x,y,p) 106 else : # task analyse 107 task_index = 2 108 task_name = 'analyse_%d_%d_%d' % (x,y,p) 107 cluster_id = (x * y_size) + y 108 if ( mapping.clusters[cluster_id].procs ): 109 for p in xrange( nprocs ): 110 trdid = (((x * y_size) + y) * nprocs) + p 111 if ( p== 0 ): # task load 112 task_index = 0 113 task_name = 'load_%d_%d_%d' %(x,y,p) 114 elif ( p== 1 ): # task store 115 task_index = 1 116 task_name = 'store_%d_%d_%d' %(x,y,p) 117 else : # task analyse 118 task_index = 2 119 task_name = 'analyse_%d_%d_%d' % (x,y,p) 109 120 110 mapping.addTask( vspace, task_name, trdid, x, y, p,111 'classif_stack_%d_%d_%d' % (x,y,p),112 'classif_heap_%d_%d' % (x,y),113 task_index )121 mapping.addTask( vspace, task_name, trdid, x, y, p, 122 'classif_stack_%d_%d_%d' % (x,y,p), 123 'classif_heap_%d_%d' % (x,y), 124 task_index ) 114 125 115 126 # extend mapping name … … 118 129 return vspace # useful for test 119 130 120 ################################ test 
################################################ ######131 ################################ test ################################################ 121 132 122 133 if __name__ == '__main__': -
soft/giet_vm/applications/classif/main.c
r488 r502 1 /////////////////////////////////////////////////////////////////////////////////////// //1 /////////////////////////////////////////////////////////////////////////////////////// 2 2 // File : main.c (for classif application) 3 3 // Date : november 2014 4 4 // author : Alain Greiner 5 /////////////////////////////////////////////////////////////////////////////////////// //5 /////////////////////////////////////////////////////////////////////////////////////// 6 6 // This multi-threaded application takes a stream of Gigabit Ethernet packets, 7 7 // and makes packet analysis and classification, based on the source MAC address. … … 9 9 // component to receive and send packets on the Gigabit Ethernet port. 10 10 // 11 // This application is described as a TCG (Task and Communication Graph) containing 12 // (N+2) tasks per cluster: 11 // It can run on architectures containing up to 16 * 16 clusters, 12 // and up to 8 processors per cluster. 13 // 14 // This application is described as a TCG (Task and Communication Graph) 15 // containing (N+2) tasks per cluster: 13 16 // - one "load" task 17 // - one "store" task 14 18 // - N "analyse" tasks 15 // - one "store" task 16 // The 4 Kbytes containers are diributed (N+2 containers per cluster): 19 // The containers are distributed (N+2 containers per cluster): 17 20 // - one RX container (part of the kernel rx_chbuf), in the kernel heap. 18 21 // - one TX container (part of the kernel tx-chbuf), in the kernel heap. … … 30 33 // The MWMR fifo descriptors array is defined as a global variable in cluster[0][0]. 31 34 // 32 // Initialisation is done in two steps by the "load" tasks: 33 // - Task "load" in cluster[0][0] initialises NIC & CMA channel, and initialises 34 // the barrier between all "load" tasks. Other "load" tasks are waiting on the 35 // global_sync synchronisation variable. 
36 // - In each cluster[x][y], the "load" task allocates the working containers 37 // and the MWMR fifos descriptors in the local heap. 38 // The "analyse" tasks are waiting on the sync[x][y] variables. 35 // Initialisation is done in two steps by the "load" & "store" tasks: 36 // - Task "load" in cluster[0][0] initialises the barrier between all "load" tasks, 37 // allocates NIC & CMA RX channel, and starts the NIC_CMA RX transfer. 38 // Other "load" tasks are waiting on the load_sync synchronisation variable. 39 // Task "store" in cluster[0][0] initialises the barrier between all "store" tasks, 40 // allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer. 41 // Other "store" tasks are waiting on the store_sync synchronisation variable. 42 // - When this global initialisation is completed, the "load" task in all clusters 43 // allocates the working containers and the MWMR fifos descriptors from the 44 // user local heap. In each cluster, the "analyse" and "store" tasks are waiting 45 // the local initialisation completion on the local_sync[x][y] variables. 39 46 // 40 // Instrumentation results display is done by the "store" task in cluster[0][0]41 // when all "store" tasks completed the number of clusters specified by the42 // CONTAINERS_MAX parameter.43 //44 47 // When initialisation is completed, all tasks loop on containers: 45 48 // 1) The "load" task get an empty working container from the fifo_s2l, … … 47 50 // and transfer ownership of this container to one "analysis" task by writing 48 51 // into the fifo_l2a. 49 //50 52 // 2) The "analyse" task get one working container from the fifo_l2a, analyse 51 53 // each packet header, compute the packet type (depending on the SRC MAC address), 52 54 // increment the correspondint classification counter, and transpose the SRC 53 55 // and the DST MAC addresses fot TX tranmission. 
54 //55 56 // 3) The "store" task transfer get a full working container from the fifo_a2s, 56 57 // transfer this user container content to the the kernel tx_chbuf, 57 58 // and transfer ownership of this empty container to the "load" task by writing 58 59 // into the fifo_s2l. 59 // 60 // This application uses the following hardware parameters (hard_config.h file): 61 // - X_SIZE : number of clusters in a row 62 // - Y_SIZE : number of clusters in a column 63 // - NB_PROCS_MAX : number of processors per cluster 64 ///////////////////////////////////////////////////////////////////////////////////////// 60 // 61 // Instrumentation results display is done by the "store" task in cluster[0][0] 62 // when all "store" tasks completed the number of clusters specified by the 63 // CONTAINERS_MAX parameter. 64 /////////////////////////////////////////////////////////////////////////////////////// 65 65 66 66 #include "stdio.h" 67 #include " barrier.h"67 #include "user_barrier.h" 68 68 #include "malloc.h" 69 69 #include "user_lock.h" 70 70 #include "mwmr_channel.h" 71 #include "hard_config.h" 72 73 #define CONTAINERS_MAX 5 74 #define VERBOSE_ANALYSE 1 75 #define ANALYSIS_TASKS (NB_PROCS_MAX - 2) 76 77 ///////////////////////////////////////////////////////////////////////////////////////// 71 72 #define X_SIZE_MAX 16 73 #define Y_SIZE_MAX 16 74 #define NPROCS_MAX 8 75 #define CONTAINERS_MAX 500 76 #define VERBOSE_ANALYSE 0 77 78 /////////////////////////////////////////////////////////////////////////////////////// 78 79 // Global variables 79 80 // The MWMR channels (descriptors and buffers), as well as the working containers … … 81 82 // But the pointers on these distributed structures are shared arrays 82 83 // stored in cluster[0][0]. 
83 /////////////////////////////////////////////////////////////////////////////////////// //84 85 // pointers on distributed temp[x][y][n]containers86 unsigned int* container[X_SIZE ][Y_SIZE][ANALYSIS_TASKS];84 /////////////////////////////////////////////////////////////////////////////////////// 85 86 // pointers on distributed containers 87 unsigned int* container[X_SIZE_MAX][Y_SIZE_MAX][NPROCS_MAX-2]; 87 88 88 89 // pointers on distributed mwmr fifos containing : temp[x][y][l] container descriptors 89 mwmr_channel_t* mwmr_l2a[X_SIZE ][Y_SIZE];90 mwmr_channel_t* mwmr_a2s[X_SIZE ][Y_SIZE];91 mwmr_channel_t* mwmr_s2l[X_SIZE ][Y_SIZE];90 mwmr_channel_t* mwmr_l2a[X_SIZE_MAX][Y_SIZE_MAX]; 91 mwmr_channel_t* mwmr_a2s[X_SIZE_MAX][Y_SIZE_MAX]; 92 mwmr_channel_t* mwmr_s2l[X_SIZE_MAX][Y_SIZE_MAX]; 92 93 93 94 // local synchros signaling local MWMR fifos initialisation completion 94 unsigned int local_sync[X_SIZE][Y_SIZE];95 volatile unsigned int local_sync[X_SIZE_MAX][Y_SIZE_MAX]; 95 96 96 97 // global synchro signaling global initialisation completion 97 unsigned int load_sync = 0;98 unsigned int store_sync = 0;98 volatile unsigned int load_sync = 0; 99 volatile unsigned int store_sync = 0; 99 100 100 101 // instrumentation counters 101 102 unsigned int counter[16]; 102 103 103 // distributed barriers (between "load" and "store" tasks) 104 giet_sbt_barrier_t rx_barrier; 105 giet_sbt_barrier_t tx_barrier; 104 // distributed barrier between "load" tasks 105 giet_sqt_barrier_t rx_barrier; 106 107 // distributed barrier between "store" tasks 108 giet_sqt_barrier_t tx_barrier; 106 109 107 110 // NIC_RX and NIC_TX channel index … … 113 116 ///////////////////////////////////////// 114 117 { 118 // each "load" task get platform parameters 119 unsigned int x_size; // number of clusters in a row 120 unsigned int y_size; // number of clusters in a column 121 unsigned int nprocs; // number of processors per cluster 122 giet_procs_number( &x_size, &y_size, &nprocs ); 123 124 
giet_assert( (x_size <= X_SIZE_MAX) && 125 (y_size <= Y_SIZE_MAX) && 126 (nprocs <= NPROCS_MAX) , 127 "[CLASSIF ERROR] illegal platform parameters" ); 128 115 129 // each "load" task get processor identifiers 116 130 unsigned int x; … … 119 133 giet_proc_xyp( &x, &y, &l ); 120 134 121 // "load" task[0][0] initialises barrier between load tasks,135 // "load" task[0][0] initialises barrier between all load tasks, 122 136 // allocates the NIC & CMA RX channels, and start the NIC_CMA RX transfer. 123 137 // Other "load" tasks wait completion 124 138 if ( (x==0) && (y==0) ) 125 139 { 126 giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n", 127 x , y , l , giet_proctime() ); 140 giet_shr_printf("\n*** Task load on P[%d][%d][%d] starts at cycle %d\n" 141 " x_size = %d / y_size = %d / nprocs = %d\n", 142 x , y , l , giet_proctime() , x_size, y_size, nprocs ); 128 143 129 s bt_barrier_init( &rx_barrier, X_SIZE*Y_SIZE, 1 );130 nic_rx_channel = giet_nic_rx_alloc( );144 sqt_barrier_init( &rx_barrier, x_size , y_size , 1 ); 145 nic_rx_channel = giet_nic_rx_alloc( x_size , y_size ); 131 146 giet_nic_rx_start( nic_rx_channel ); 132 147 load_sync = 1; … … 137 152 } 138 153 139 // all load tasks allocatecontainers[x][y][n] (from local heap)154 // each load tasks allocates containers[x][y][n] (from local heap) 140 155 // and register pointers in the local stack 141 156 unsigned int n; 142 unsigned int* cont[ANALYSIS_TASKS]; 143 144 for ( n = 0 ; n < ANALYSIS_TASKS ; n++ ) 157 unsigned int* cont[NPROCS_MAX-2]; 158 unsigned int analysis_tasks = nprocs-2; 159 160 for ( n = 0 ; n < analysis_tasks ; n++ ) 145 161 { 146 162 container[x][y][n] = malloc( 4096 ); … … 148 164 } 149 165 150 // all load tasks allocatedata buffers for mwmr fifos (from local heap)151 unsigned int* data_l2a = malloc( ANALYSIS_TASKS<<2 );152 unsigned int* data_a2s = malloc( ANALYSIS_TASKS<<2 );153 unsigned int* data_s2l = malloc( ANALYSIS_TASKS<<2 );154 155 // all load tasks allocatemwmr fifos 
descriptors (from local heap)166 // each load task allocates data buffers for mwmr fifos (from local heap) 167 unsigned int* data_l2a = malloc( analysis_tasks<<2 ); 168 unsigned int* data_a2s = malloc( analysis_tasks<<2 ); 169 unsigned int* data_s2l = malloc( analysis_tasks<<2 ); 170 171 // each load task allocates mwmr fifos descriptors (from local heap) 156 172 mwmr_l2a[x][y] = malloc( sizeof(mwmr_channel_t) ); 157 173 mwmr_a2s[x][y] = malloc( sizeof(mwmr_channel_t) ); 158 174 mwmr_s2l[x][y] = malloc( sizeof(mwmr_channel_t) ); 159 175 160 // all "load" tasks registerlocal pointers on mwmr fifos in local stack176 // each load task registers local pointers on mwmr fifos in local stack 161 177 mwmr_channel_t* fifo_l2a = mwmr_l2a[x][y]; 162 178 mwmr_channel_t* fifo_a2s = mwmr_a2s[x][y]; 163 179 mwmr_channel_t* fifo_s2l = mwmr_s2l[x][y]; 164 180 165 // all "load" tasks initialiselocal mwmr fifos descriptors181 // each load task initialises local mwmr fifos descriptors 166 182 // ( width = 4 bytes / depth = number of analysis tasks ) 167 mwmr_init( fifo_l2a , data_l2a , 1 , ANALYSIS_TASKS);168 mwmr_init( fifo_a2s , data_a2s , 1 , ANALYSIS_TASKS);169 mwmr_init( fifo_s2l , data_s2l , 1 , ANALYSIS_TASKS);183 mwmr_init( fifo_l2a , data_l2a , 1 , analysis_tasks ); 184 mwmr_init( fifo_a2s , data_a2s , 1 , analysis_tasks ); 185 mwmr_init( fifo_s2l , data_s2l , 1 , analysis_tasks ); 170 186 171 187 172 // all "load" tasks initialiselocal containers as empty in fifo_s2l173 for ( n = 0 ; n < ANALYSIS_TASKS; n++ ) mwmr_write( fifo_s2l , &n , 1 );174 175 // each "load"task[x][y] signals mwmr fifos initialisation completion188 // each load task initialises local containers as empty in fifo_s2l 189 for ( n = 0 ; n < analysis_tasks ; n++ ) mwmr_write( fifo_s2l , &n , 1 ); 190 191 // each load task[x][y] signals mwmr fifos initialisation completion 176 192 // to other tasks in same cluster[x][y] 177 193 local_sync[x][y] = 1; 178 194 179 // "load"task[0][0] displays status195 // load 
task[0][0] displays status 180 196 if ( (x==0) && (y==0) ) 181 197 giet_shr_printf("\n*** Task load on P[%d,%d,%d] enters main loop at cycle %d\n" … … 192 208 (unsigned int)fifo_s2l, (unsigned int)data_s2l, 193 209 (unsigned int)cont[0], 194 X_SIZE, Y_SIZE, NB_PROCS_MAX);210 x_size, y_size, nprocs ); 195 211 196 212 ///////////////////////////////////////////////////////////// 197 // All "load"tasks enter the main loop (on containers)198 unsigned int count = 0; // loaded containers count199 unsigned int index; // available container index200 unsigned int* temp; // pointer on available container213 // All load tasks enter the main loop (on containers) 214 unsigned int count = 0; // loaded containers count 215 unsigned int index; // available container index 216 unsigned int* temp; // pointer on available container 201 217 202 218 while ( count < CONTAINERS_MAX ) 203 219 { 204 // get one empty co untindex from fifo_s2l220 // get one empty container index from fifo_s2l 205 221 mwmr_read( fifo_s2l , &index , 1 ); 206 222 temp = cont[index]; 207 223 208 // get one co untfrom kernel rx_chbuf224 // get one container from kernel rx_chbuf 209 225 giet_nic_rx_move( nic_rx_channel, temp ); 210 226 … … 213 229 unsigned int nwords = temp[0] >> 16; 214 230 215 if ( (x== X_SIZE-1) && (y==Y_SIZE-1) )231 if ( (x==0) && (y==0) ) 216 232 giet_shr_printf("\n*** Task load on P[%d,%d,%d] get container %d at cycle %d" 217 233 " : %d packets / %d words\n", 218 234 x, y, l, count, giet_proctime(), npackets, nwords ); 219 235 220 // put the full co untindex to fifo_l2a236 // put the full container index to fifo_l2a 221 237 mwmr_write( fifo_l2a, &index , 1 ); 222 238 … … 225 241 226 242 // all "load" tasks synchronise before stats 227 s bt_barrier_wait( &rx_barrier );243 sqt_barrier_wait( &rx_barrier ); 228 244 229 245 // "load" task[0][0] stops the NIC_CMA RX transfer and displays stats … … 244 260 ////////////////////////////////////////// 245 261 { 262 // each "load" task get platform 
parameters 263 unsigned int x_size; // number of clusters in row 264 unsigned int y_size; // number of clusters in a column 265 unsigned int nprocs; // number of processors per cluster 266 giet_procs_number( &x_size, &y_size, &nprocs ); 267 246 268 // get processor identifiers 247 269 unsigned int x; … … 250 272 giet_proc_xyp( &x, &y, &l ); 251 273 252 253 274 // "store" task[0][0] initialises the barrier between all "store" tasks, 254 275 // allocates NIC & CMA TX channels, and starts the NIC_CMA TX transfer. … … 256 277 if ( (x==0) && (y==0) ) 257 278 { 258 giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n", 259 x , y , l , giet_proctime() ); 279 giet_shr_printf("\n*** Task store on P[%d][%d][%d] starts at cycle %d\n" 280 " x_size = %d / y_size = %d / nprocs = %d\n", 281 x , y , l , giet_proctime() , x_size, y_size, nprocs ); 260 282 261 s bt_barrier_init( &tx_barrier , X_SIZE*Y_SIZE, 1 );262 nic_tx_channel = giet_nic_tx_alloc( );283 sqt_barrier_init( &tx_barrier , x_size , y_size , 1 ); 284 nic_tx_channel = giet_nic_tx_alloc( x_size , y_size ); 263 285 giet_nic_tx_start( nic_tx_channel ); 264 286 store_sync = 1; … … 272 294 while ( local_sync[x][y] == 0 ) asm volatile ("nop"); 273 295 274 // all"store" tasks register pointers on working containers in local stack296 // each "store" tasks register pointers on working containers in local stack 275 297 unsigned int n; 276 unsigned int* cont[ANALYSIS_TASKS]; 277 for ( n = 0 ; n < ANALYSIS_TASKS ; n++ ) 298 unsigned int analysis_tasks = nprocs-2; 299 unsigned int* cont[NPROCS_MAX-2]; 300 301 for ( n = 0 ; n < analysis_tasks ; n++ ) 278 302 { 279 303 cont[n] = container[x][y][n]; … … 318 342 unsigned int nwords = temp[0] >> 16; 319 343 320 if ( (x== X_SIZE-1) && (y==Y_SIZE-1) )344 if ( (x==0) && (y==0) ) 321 345 giet_shr_printf("\n*** Task store on P[%d,%d,%d] get container %d at cycle %d" 322 346 " : %d packets / %d words\n", … … 330 354 331 355 // all "store" tasks synchronise before result 
display 332 s bt_barrier_wait( &tx_barrier );356 sqt_barrier_wait( &tx_barrier ); 333 357 334 358 // "store" task[0,0] stops NIC_CMA TX transfer and displays results … … 377 401 //////////////////////////////////////////// 378 402 { 403 // each "load" task get platform parameters 404 unsigned int x_size; // number of clusters in row 405 unsigned int y_size; // number of clusters in a column 406 unsigned int nprocs; // number of processors per cluster 407 giet_procs_number( &x_size, &y_size, &nprocs ); 408 379 409 // get processor identifiers 380 410 unsigned int x; … … 385 415 if ( (x==0) && (y==0) ) 386 416 { 387 giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n", 388 x , y , l , giet_proctime() ); 417 giet_shr_printf("\n*** Task analyse on P[%d][%d][%d] starts at cycle %d\n" 418 " x_size = %d / y_size = %d / nprocs = %d\n", 419 x , y , l , giet_proctime() , x_size, y_size, nprocs ); 389 420 } 390 421 … … 394 425 // all "analyse" tasks register pointers on working containers in local stack 395 426 unsigned int n; 396 unsigned int* cont[ANALYSIS_TASKS]; 397 for ( n = 0 ; n < ANALYSIS_TASKS ; n++ ) 427 unsigned int analysis_tasks = nprocs-2; 428 unsigned int* cont[NPROCS_MAX-2]; 429 for ( n = 0 ; n < analysis_tasks ; n++ ) 398 430 { 399 431 cont[n] = container[x][y][n]; … … 471 503 unsigned int word2 = temp[first + 2]; 472 504 505 #if VERBOSE_ANALYSE 473 506 unsigned long long dst = ((unsigned long long)(word1 & 0xFFFF0000)>>16) | 474 507 (((unsigned long long)word0)<<16); 475 508 unsigned long long src = ((unsigned long long)(word1 & 0x0000FFFF)<<32) | 476 509 ((unsigned long long)word2); 477 #if VERBOSE_ANALYSE478 510 if ( p < 10 ) 479 511 { -
soft/giet_vm/applications/convol/convol.py
r457 r502 12 12 # This include both the mapping of virtual segments on the clusters, 13 13 # and the mapping of tasks on processors. 14 # There is one task per processor. 15 # The mapping of virtual segments is the following: 16 # - There is one shared data vseg in cluster[0][0] 17 # - The code vsegs are replicated on all clusters containing processors. 18 # - There is one heap vseg per cluster containing processors. 19 # - The stacks vsegs are distibuted on all clusters containing processors. 14 20 # This mapping uses 5 platform parameters, (obtained from the "mapping" argument) 15 # - x_size : number of clusters in a row16 # - y_size : number of clusters in a column17 # - x_width : number of bits coding x coordinate18 # - y_width : number of bits coding y coordinate19 # - nprocs : number of processors per cluster21 # - x_size : number of clusters in a row 22 # - y_size : number of clusters in a column 23 # - x_width : number of bits coding x coordinate 24 # - y_width : number of bits coding y coordinate 25 # - nprocs : number of processors per cluster 20 26 #################################################################################### 21 27 … … 46 52 47 53 # data vseg in cluster[0,0] : non local 48 mapping.addVseg( vspace, 'conv_data', data_base , data_size, 'C_WU', vtype = 'ELF', 49 x = 0, y = 0, pseg = 'RAM', binpath = 'build/convol/convol.elf', 54 mapping.addVseg( vspace, 'conv_data', data_base , data_size, 55 'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM', 56 binpath = 'build/convol/convol.elf', 50 57 local = False ) 51 58 … … 53 60 for x in xrange (x_size): 54 61 for y in xrange (y_size): 55 size = code_size 56 base = code_base 57 mapping.addVseg( vspace, 'conv_code_%d_%d' % (x,y), base, size, 58 'CXWU', vtype = 'ELF', x = x , y = y , pseg = 'RAM', 59 binpath = 'build/convol/convol.elf', 60 local = True ) 62 cluster_id = (x * y_size) + y 63 if ( mapping.clusters[cluster_id].procs ): 64 size = code_size 65 base = code_base 66 67 mapping.addVseg( 
vspace, 'conv_code_%d_%d' % (x,y), base, size, 68 'CXWU', vtype = 'ELF', x = x , y = y , pseg = 'RAM', 69 binpath = 'build/convol/convol.elf', 70 local = True ) 61 71 62 72 # stack vsegs : local (one stack per processor) 63 73 for x in xrange (x_size): 64 74 for y in xrange (y_size): 65 for p in xrange( nprocs ): 66 proc_id = (((x * y_size) + y) * nprocs) + p 67 size = (stack_size / nprocs) & 0xFFFFF000 68 base = stack_base + (proc_id * size) 69 mapping.addVseg( vspace, 'conv_stack_%d_%d_%d' % (x,y,p), base, size, 70 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 71 local = True, big = True ) 75 cluster_id = (x * y_size) + y 76 if ( mapping.clusters[cluster_id].procs ): 77 for p in xrange( nprocs ): 78 proc_id = (((x * y_size) + y) * nprocs) + p 79 size = (stack_size / nprocs) & 0xFFFFF000 80 base = stack_base + (proc_id * size) 81 82 mapping.addVseg( vspace, 'conv_stack_%d_%d_%d' % (x,y,p), 83 base, size, 'C_WU', vtype = 'BUFFER', 84 x = x , y = y , pseg = 'RAM', 85 local = True, big = True ) 72 86 73 # heap vsegs : distributed but non local (a ll heap vsegs can be accessed by all tasks)87 # heap vsegs : distributed but non local (any heap can be accessed by any task) 74 88 for x in xrange (x_size): 75 89 for y in xrange (y_size): 76 90 cluster_id = (x * y_size) + y 77 size = heap_size 78 base = heap_base + (cluster_id * size) 79 mapping.addVseg( vspace, 'conv_heap_%d_%d' % (x,y), base, size, 80 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 81 local = False, big = True ) 91 if ( mapping.clusters[cluster_id].procs ): 92 size = heap_size 93 base = heap_base + (cluster_id * size) 94 95 mapping.addVseg( vspace, 'conv_heap_%d_%d' % (x,y), base, size, 96 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 97 local = False, big = True ) 82 98 83 99 # distributed tasks : one task per processor 84 100 for x in xrange (x_size): 85 101 for y in xrange (y_size): 86 for p in xrange( nprocs ): 87 trdid = (((x * y_size) + y) * nprocs) + p 88 
mapping.addTask( vspace, 'conv_%d_%d_%d' % (x,y,p), trdid, x, y, p, 89 'conv_stack_%d_%d_%d' % (x,y,p), 90 'conv_heap_%d_%d' % (x,y), 0 ) 102 cluster_id = (x * y_size) + y 103 if ( mapping.clusters[cluster_id].procs ): 104 for p in xrange( nprocs ): 105 trdid = (((x * y_size) + y) * nprocs) + p 106 107 mapping.addTask( vspace, 'conv_%d_%d_%d' % (x,y,p), 108 trdid, x, y, p, 109 'conv_stack_%d_%d_%d' % (x,y,p), 110 'conv_heap_%d_%d' % (x,y), 0 ) 91 111 92 112 # extend mapping name … … 95 115 return vspace # useful for test 96 116 97 ################################ test ################################################ ######117 ################################ test ################################################ 98 118 99 119 if __name__ == '__main__': -
soft/giet_vm/applications/convol/main.c
r488 r502 1 /////////////////////////////////////////////////////////////////////////////////////// /////1 /////////////////////////////////////////////////////////////////////////////////////// 2 2 // File : main.c (for convol application) 3 3 // Date : june 2014 4 4 // author : Alain Greiner 5 /////////////////////////////////////////////////////////////////////////////////////// /////5 /////////////////////////////////////////////////////////////////////////////////////// 6 6 // This multi-threaded application application implements a 2D convolution product. 7 7 // The convolution kernel is [201]*[35] pixels, but it can be factored in two 8 8 // independant line and column convolution products. 9 9 // It can run on a multi-processors, multi-clusters architecture, with one thread 10 // per processor. It uses the he following hardware parameters, that must be defined 11 // in the hard_config.h file: 12 // - X_SIZE : number of clusters in a row 13 // - Y_SIZE : number of clusters in a column 14 // - NB_PROCS_MAX : number of processors per cluster 15 // - FBUF_X_SIZE : number of pixels per line in frame buffer 16 // - FBUF_Y_SIZE : number of lines in frame buffer 10 // per processor. 17 11 // 18 12 // The (1024 * 1024) pixels image is read from a file (2 bytes per pixel). 19 13 // 20 // - The number of clusters containing processors must be a power of 2. 21 // - The number of processors per cluster must be a power of 2. 22 //////////////////////////////////////////////////////////////////////////////////////////// 23 24 #include "hard_config.h" 14 // - number of clusters containing processors must be power of 2 no larger than 256. 15 // - number of processors per cluster must be power of 2 no larger than 8. 
16 /////////////////////////////////////////////////////////////////////////////////////// 17 25 18 #include "stdio.h" 26 19 #include "stdlib.h" 27 #include " barrier.h"20 #include "user_barrier.h" 28 21 #include "malloc.h" 29 22 30 #define USE_S BT_BARRIER 123 #define USE_SQT_BARRIER 1 31 24 #define VERBOSE 0 32 25 #define SUPER_VERBOSE 0 33 26 27 #define X_SIZE_MAX 16 28 #define Y_SIZE_MAX 16 29 #define PROCS_MAX 8 30 #define CLUSTERS_MAX (X_SIZE_MAX * Y_SIZE_MAX) 31 34 32 #define INITIAL_DISPLAY_ENABLE 0 35 33 #define FINAL_DISPLAY_ENABLE 1 36 34 37 #define NB_CLUSTERS (X_SIZE * Y_SIZE)38 35 #define PIXEL_SIZE 2 39 36 #define NL 1024 … … 53 50 // global instrumentation counters (cluster_id, lpid] 54 51 55 unsigned int START[ NB_CLUSTERS][NB_PROCS_MAX];56 unsigned int H_BEG[ NB_CLUSTERS][NB_PROCS_MAX];57 unsigned int H_END[ NB_CLUSTERS][NB_PROCS_MAX];58 unsigned int V_BEG[ NB_CLUSTERS][NB_PROCS_MAX];59 unsigned int V_END[ NB_CLUSTERS][NB_PROCS_MAX];60 unsigned int D_BEG[ NB_CLUSTERS][NB_PROCS_MAX];61 unsigned int D_END[ NB_CLUSTERS][NB_PROCS_MAX];52 unsigned int START[CLUSTERS_MAX][PROCS_MAX]; 53 unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX]; 54 unsigned int H_END[CLUSTERS_MAX][PROCS_MAX]; 55 unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX]; 56 unsigned int V_END[CLUSTERS_MAX][PROCS_MAX]; 57 unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX]; 58 unsigned int D_END[CLUSTERS_MAX][PROCS_MAX]; 62 59 63 60 // global synchronization barrier 64 61 65 #if USE_S BT_BARRIER66 giet_s bt_barrier_t barrier;62 #if USE_SQT_BARRIER 63 giet_sqt_barrier_t barrier; 67 64 #else 68 65 giet_barrier_t barrier; … … 74 71 75 72 // global pointers on distributed buffers in all clusters 76 unsigned short * GA[ NB_CLUSTERS];77 int * GB[ NB_CLUSTERS];78 int * GC[ NB_CLUSTERS];79 int * GD[ NB_CLUSTERS];80 unsigned char * GZ[ NB_CLUSTERS];73 unsigned short * GA[CLUSTERS_MAX]; 74 int * GB[CLUSTERS_MAX]; 75 int * GC[CLUSTERS_MAX]; 76 int * GD[CLUSTERS_MAX]; 77 unsigned char * GZ[CLUSTERS_MAX]; 81 78 82 79 
/////////////////////////////////////////// … … 109 106 int z; // vertical filter index for loops 110 107 108 // plat-form parameters 109 unsigned int x_size; // number of clusters in a row 110 unsigned int y_size; // number of clusters in a column 111 unsigned int nprocs; // number of processors per cluster 112 113 giet_procs_number( &x_size , &y_size , &nprocs ); 114 111 115 // processor identifiers 112 unsigned int x; 113 unsigned int y; 114 unsigned int lpid; 116 unsigned int x; // x coordinate 117 unsigned int y; // y coordinate 118 unsigned int lpid; // local proc/task id 115 119 giet_proc_xyp( &x, &y, &lpid ); 116 120 117 int file = 0; // file descriptor 118 unsigned int nprocs = NB_PROCS_MAX; // procs per cluster 119 unsigned int nclusters = NB_CLUSTERS; // number of clusters 120 unsigned int cluster_id = (x * Y_SIZE) + y; // continuous cluster index 121 unsigned int task_id = (cluster_id * nprocs) + lpid; // continuous task index 122 unsigned int ntasks = nclusters * nprocs; // number of tasks 123 unsigned int frame_size = FRAME_SIZE; // total size (bytes) 124 unsigned int nblocks = frame_size / 512; // number of blocks per frame 125 126 unsigned int lines_per_task = NL / ntasks; // lines per task 127 unsigned int lines_per_cluster = NL / nclusters; // lines per cluster 128 unsigned int pixels_per_task = NP / ntasks; // columns per task 129 unsigned int pixels_per_cluster = NP / nclusters; // columns per cluster 121 int file = 0; // file descriptor 122 unsigned int nclusters = x_size * y_size; // number of clusters 123 unsigned int cluster_id = (x * y_size) + y; // continuous cluster index 124 unsigned int task_id = (cluster_id * nprocs) + lpid; // continuous task index 125 unsigned int ntasks = nclusters * nprocs; // number of tasks 126 unsigned int frame_size = FRAME_SIZE; // total size (bytes) 127 unsigned int nblocks = frame_size / 512; // number of blocks/frame 128 129 unsigned int lines_per_task = NL / ntasks; // lines per task 130 unsigned int 
lines_per_cluster = NL / nclusters; // lines per cluster 131 unsigned int pixels_per_task = NP / ntasks; // columns per task 132 unsigned int pixels_per_cluster = NP / nclusters; // columns per cluster 130 133 131 134 int first, last; … … 140 143 // parameters checking 141 144 142 if ( (NP != FBUF_X_SIZE) || (NL != FBUF_Y_SIZE) ) 143 { 144 giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size"); 145 } 146 if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4)) 147 giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2 or 4\n"); 148 149 if ((X_SIZE!=1) && (X_SIZE!=2) && (X_SIZE!=4) && (X_SIZE!=8) && (X_SIZE!=16)) 150 giet_exit( "[CONVOL ERROR] X_SIZE must be 1, 2, 4, 8, 16\n"); 145 if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4) && (nprocs != 8)) 146 giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must be 1, 2, 4 or 8\n"); 147 148 if ((x_size!=1) && (x_size!=2) && (x_size!=4) && (x_size!=8) && (x_size!=16)) 149 giet_exit( "[CONVOL ERROR] x_size must be 1, 2, 4, 8, 16\n"); 151 150 152 if (( Y_SIZE!=1) && (Y_SIZE!=2) && (Y_SIZE!=4) && (Y_SIZE!=8) && (Y_SIZE!=16))153 giet_exit( "[CONVOL ERROR] Y_SIZEmust be 1, 2, 4, 8, 16\n");151 if ((y_size!=1) && (y_size!=2) && (y_size!=4) && (y_size!=8) && (y_size!=16)) 152 giet_exit( "[CONVOL ERROR] y_size must be 1, 2, 4, 8, 16\n"); 154 153 155 154 if ( NL % nclusters != 0 ) 156 giet_exit( "[CONVOL ERROR] NB_CLUSTERSmust be a divider of NL");155 giet_exit( "[CONVOL ERROR] CLUSTERS_MAX must be a divider of NL"); 157 156 158 157 if ( NP % nclusters != 0 ) 159 giet_exit( "[CONVOL ERROR] NB_CLUSTERSmust be a divider of NP");158 giet_exit( "[CONVOL ERROR] CLUSTERS_MAX must be a divider of NP"); 160 159 161 160 … … 166 165 if ( (x==0) && (y==0) && (lpid==0) ) 167 166 { 168 // parameters checking169 if ( (NP != FBUF_X_SIZE) || (NL != FBUF_Y_SIZE) )170 giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size");171 172 if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4))173 giet_exit( "[CONVOL ERROR] NB_PROCS_MAX must 
be 1, 2 or 4\n");174 175 if ((X_SIZE!=1) && (X_SIZE!=2) && (X_SIZE!=4) && (X_SIZE!=8) && (X_SIZE!=16))176 giet_exit( "[CONVOL ERROR] X_SIZE must be 1, 2, 4, 8, 16\n");177 178 if ((Y_SIZE!=1) && (Y_SIZE!=2) && (Y_SIZE!=4) && (Y_SIZE!=8) && (Y_SIZE!=16))179 giet_exit( "[CONVOL ERROR] Y_SIZE must be 1, 2, 4, 8, 16\n");180 181 if ( NL % nclusters != 0 )182 giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NL");183 184 if ( NP % nclusters != 0 )185 giet_exit( "[CONVOL ERROR] NB_CLUSTERS must be a divider of NP");186 187 188 167 giet_shr_printf("\n[CONVOL] task[0,0,0] starts barrier init at cycle %d\n" 189 "- NB_CLUSTERS= %d\n"190 "- NB_PROCS_MAX= %d\n"191 "- NB_TASKS= %d\n"192 "- NB_BLOCKS = %x\n",168 "- CLUSTERS = %d\n" 169 "- PROCS = %d\n" 170 "- TASKS = %d\n" 171 "- BLOCKS = %d\n", 193 172 giet_proctime(), nclusters, nprocs, ntasks, nblocks ); 194 #if USE_S BT_BARRIER195 s bt_barrier_init( &barrier, nclusters, nprocs );173 #if USE_SQT_BARRIER 174 sqt_barrier_init( &barrier, x_size , y_size , nprocs ); 196 175 #else 197 176 barrier_init( &barrier, ntasks ); … … 216 195 217 196 #if VERBOSE 218 giet_shr_printf( "\n[CONVOL] task[%d,%d,%d] enters malloc at cycle %d\n", x,y,lpid, date ); 197 giet_shr_printf( "\n[CONVOL] task[%d,%d,%d] enters malloc at cycle %d\n", 198 x,y,lpid, date ); 219 199 #endif 220 200 … … 242 222 243 223 /////////////////////////////// 244 #if USE_S BT_BARRIER245 s bt_barrier_wait( &barrier );224 #if USE_SQT_BARRIER 225 sqt_barrier_wait( &barrier ); 246 226 #else 247 227 barrier_wait( &barrier ); … … 253 233 /////////////////////////////////////////////////////////////////// 254 234 255 unsigned short * A[ NB_CLUSTERS];256 int * B[NB_CLUSTERS];257 int * C[NB_CLUSTERS];258 int * D[NB_CLUSTERS];259 unsigned char * Z[NB_CLUSTERS];235 unsigned short * A[CLUSTERS_MAX]; 236 int * B[CLUSTERS_MAX]; 237 int * C[CLUSTERS_MAX]; 238 int * D[CLUSTERS_MAX]; 239 unsigned char * Z[CLUSTERS_MAX]; 260 240 261 241 for (c = 0; c < nclusters; c++) … … 283 263 
" at cycle %d\n", giet_proctime() ); 284 264 285 for ( c = 0 ; c < NB_CLUSTERS; c++ )265 for ( c = 0 ; c < nclusters ; c++ ) 286 266 { 287 267 giet_shr_printf( "\n[CONVOL] task[0,0,0] starts load " … … 341 321 342 322 //////////////////////////// 343 #if USE_S BT_BARRIER344 s bt_barrier_wait( &barrier );323 #if USE_SQT_BARRIER 324 sqt_barrier_wait( &barrier ); 345 325 #else 346 326 barrier_wait( &barrier ); … … 447 427 448 428 ///////////////////////////// 449 #if USE_S BT_BARRIER450 s bt_barrier_wait( &barrier );429 #if USE_SQT_BARRIER 430 sqt_barrier_wait( &barrier ); 451 431 #else 452 432 barrier_wait( &barrier ); … … 567 547 568 548 //////////////////////////// 569 #if USE_S BT_BARRIER570 s bt_barrier_wait( &barrier );549 #if USE_SQT_BARRIER 550 sqt_barrier_wait( &barrier ); 571 551 #else 572 552 barrier_wait( &barrier ); … … 626 606 627 607 ////////////////////////////// 628 #if USE_S BT_BARRIER629 s bt_barrier_wait( &barrier );608 #if USE_SQT_BARRIER 609 sqt_barrier_wait( &barrier ); 630 610 #else 631 611 barrier_wait( &barrier ); -
soft/giet_vm/applications/gameoflife/gameoflife.ld
r251 r502 3 3 *****************************************************************************/ 4 4 5 seg_data_base = 0x 00800000;6 seg_code_base = 0x 00400000;5 seg_data_base = 0x20000000; 6 seg_code_base = 0x10000000; 7 7 8 8 /*************************************************************************** … … 22 22 *(.ctors) 23 23 *(.rodata) 24 /* . = ALIGN(4); */25 24 *(.rodata.*) 26 /* . = ALIGN(4); */27 25 *(.data) 28 /* . = ALIGN(4); */29 26 *(.lit8) 30 27 *(.lit4) 31 28 *(.sdata) 32 /* . = ALIGN(4); */33 29 *(.bss) 34 30 *(COMMON) -
soft/giet_vm/applications/gameoflife/main.c
r444 r502 1 /* 2 * This application is an emulation of the game of life automaton 3 * It must be deployed from processor 0 and use contiguous processor 4 * (example 0,1,2,3) 5 */ 6 1 ////////////////////////////////////////////////////////////////////////////////// 2 // File : main.c (for gameoflife) 3 // Date : November 2013 4 // Author : Alexandre Joannou <alexandre.joannou@lip6.fr> 5 // 6 // This application is an emulation of the game of life automaton. 7 // The world size is defined by the HEIGHT and WIDTH parameters. 8 // There is one task per processor, and each task compute HEIGHT/nbprocs lines. 9 // The number of processors must be a power of 2 not larger than HEIGHT. 10 ////////////////////////////////////////////////////////////////////////////////// 7 11 8 12 #include "stdio.h" 9 13 #include "limits.h" 10 #include "barrier.h" 11 #include "hard_config.h" 14 #include "user_barrier.h" 12 15 #include "mapping_info.h" 13 16 14 17 #define WIDTH 128 15 18 #define HEIGHT 128 16 #define NB_CLUSTER_MAX 25617 19 #define NB_ITERATION 1000000000 18 20 19 #define PRINTF(...) ({ if ( proc_id==0) { giet_tty_printf(__VA_ARGS__); } }) 20 21 giet_barrier_t barriers[2]; 22 23 unsigned int init_ok = 1; 24 25 #define NEW 0 26 #define OLD 1 21 #define PRINTF(...) 
({ if ( proc_id==0) { giet_shr_printf(__VA_ARGS__); } }) 22 23 giet_sqt_barrier_t barrier; 24 25 unsigned int init_ok = 0; 26 27 #define OLD 0 28 #define NEW 1 29 #define DSP 2 27 30 28 31 typedef unsigned char uint8_t; 29 32 typedef unsigned int size_t; 30 33 31 uint8_t world[2][HEIGHT][WIDTH]; 32 uint8_t world_yuv[HEIGHT][WIDTH]; 33 34 /* Generate binary values for world between base_line and base_line + nb_line */ 34 uint8_t world[3][HEIGHT][WIDTH]; 35 36 ///////////////////////////////////////////////// 35 37 void init_world(size_t base_line, size_t nb_line) 36 38 { 37 39 size_t x,y; 38 for (y = base_line ; y < base_line + nb_line; y++){ 39 for(x = 0; x < WIDTH ; x++) { 40 // TODO OPTIMIZE RANDOM INIT 40 for (y = base_line ; y < base_line + nb_line; y++) 41 { 42 for(x = 0; x < WIDTH ; x++) 43 { 41 44 world[OLD][y][x] = giet_rand() % 2; 42 45 } … … 44 47 } 45 48 49 ///////////////////////////////////////////////// 46 50 uint8_t number_of_alive_neigh(size_t x, size_t y) 47 51 { … … 60 64 } 61 65 62 / * Compute cell x,y */66 ///////////////////////////////////////////////// 63 67 uint8_t compute_cell(size_t x, size_t y) 64 68 { 65 69 uint8_t nb_neighbours_alive = number_of_alive_neigh(x,y); 66 if (world[OLD][y][x] == 1) { 67 if (nb_neighbours_alive == 2 || 68 nb_neighbours_alive == 3) 69 { 70 return 1; 71 } 72 } 73 else { 74 if (nb_neighbours_alive == 3) { 75 return 1; 76 } 77 else { 78 return world[OLD][y][x]; 79 } 70 if (world[OLD][y][x] == 1) 71 { 72 if (nb_neighbours_alive == 2 || nb_neighbours_alive == 3) return 1; 73 } 74 else 75 { 76 if (nb_neighbours_alive == 3) return 1; 77 else return world[OLD][y][x]; 80 78 } 81 79 return 0; … … 99 97 { 100 98 size_t x,y; 101 for (y = base_line; y < base_line + nb_line; y++){ 102 for(x = 0; x < WIDTH ; x++) { 103 //world_yuv[y][x] = world[NEW][y][x]*100; 104 world[NEW][y][x] = world[NEW][y][x]*255; 99 for (y = base_line; y < base_line + nb_line; y++) 100 { 101 for(x = 0; x < WIDTH ; x++) 102 { 103 world[DSP][y][x] = 
world[OLD][y][x]*255; 105 104 } 106 105 } 107 106 108 107 giet_fbf_sync_write( base_line * WIDTH , 109 &world[NEW][base_line][0],110 nb_line * WIDTH);108 &world[DSP][base_line][0], 109 nb_line * WIDTH ); 111 110 } 112 111 … … 133 132 giet_proc_xyp( &x, &y, &p ); 134 133 134 // get processors number 135 unsigned int x_size; 136 unsigned int y_size; 137 unsigned int n_local_procs; 138 giet_procs_number( &x_size, &y_size, &n_local_procs ); 139 135 140 // compute continuous processor index 136 unsigned int proc_id = (((x * Y_SIZE) + y) * NB_PROCS_MAX) + p; 137 138 unsigned int nlocal_procs = NB_PROCS_MAX; // processors per cluster 139 unsigned int nclusters = X_SIZE*Y_SIZE; // number of clusters 140 unsigned int nglobal_procs = nclusters * nlocal_procs; // number of processors 141 unsigned int proc_id = (((x * y_size) + y) * n_local_procs) + p; 142 143 unsigned int n_clusters = x_size * y_size; // number of clusters 144 unsigned int n_global_procs = n_clusters * n_local_procs; // number of processors 141 145 size_t i; 142 146 143 size_t nb_line = HEIGHT / nglobal_procs; 147 if ( n_global_procs > HEIGHT ) 148 { 149 PRINTF("[GAMEOFLIFE ERROR] Number or processors too large :" 150 " nb_procs = %d / image heigth = %d\n", n_global_procs, HEIGHT ); 151 giet_exit("error"); 152 } 153 154 size_t nb_line = HEIGHT / n_global_procs; 144 155 size_t base_line = nb_line * proc_id; 145 156 146 PRINTF("*** Starting init at cycle %d ***\n", giet_proctime()); 147 148 // barriers initialization 157 PRINTF("\n*** Starting barrier initialisation at cycle %d ***\n" 158 " nprocs = %d / nlines = %d\n", 159 giet_proctime() , n_global_procs, HEIGHT ); 160 161 // barrier initialization 149 162 if ( proc_id == 0 ) 150 163 { 151 barrier_init(&barriers[0], nglobal_procs); 152 barrier_init(&barriers[1], nglobal_procs); 153 154 init_ok = 0; 164 sqt_barrier_init( &barrier , x_size , y_size , n_local_procs ); 165 init_ok = 1; 155 166 } 156 167 else 157 168 { 158 while ( init_ok == 1 ); 159 } 160 161 
init_world(base_line, nb_line); 162 163 PRINTF("*** Completing init at cycle %d ***\n", giet_proctime()); 164 barrier_wait(&barriers[0]); 169 while ( init_ok == 0 ) asm volatile("nop"); 170 } 171 172 PRINTF("\n*** Starting world initialisation at cycle %d ***\n", 173 giet_proctime() ); 174 175 // parallel world initialization 176 init_world( base_line , nb_line ); 177 178 PRINTF("coucou 0\n"); 179 180 display_world( base_line , nb_line ); 181 182 PRINTF("coucou 1\n"); 183 184 sqt_barrier_wait( &barrier ); 185 186 PRINTF("\n*** Starting life at cycle %d ***\n", 187 giet_proctime() ); 165 188 166 189 for (i = 0; i < NB_ITERATION; i++) 167 190 { 168 compute_new_gen(base_line, nb_line); 169 grow_old_world(base_line, nb_line); 170 display_world(base_line, nb_line); 171 barrier_wait(&barriers[1]); 172 barrier_init(&barriers[1], nglobal_procs); 173 } 174 175 PRINTF("*** End of main at cycle %d ***\n", giet_proctime()); 191 compute_new_gen( base_line, nb_line ); 192 grow_old_world( base_line, nb_line ); 193 display_world( base_line, nb_line ); 194 195 sqt_barrier_wait( &barrier ); 196 197 PRINTF(" - iteration %d completed\n", i ); 198 } 199 200 PRINTF("\n*** End of main at cycle %d ***\n", giet_proctime()); 176 201 177 202 giet_exit("Completed"); -
soft/giet_vm/applications/sort/main.c
r432 r502 9 9 // barrier routines to apply a sort algorithm in several stages. 10 10 // 11 // Considerations : 12 // 13 // - It supports up to 256 processors and the number of processors 14 // must be a power of 2. 15 // 16 // - If there is only one TTY available, this application uses a spin 17 // lock to avoid several threads writting at the same time. 18 // 19 // - This application must be executed on a cache coherent 20 // architecture. Otherwise some modifications must be applied 21 // 22 // - The processors executing this application must have a contiguous 23 // processor id and the first processor must have id 0. 11 // Constraints : 12 // 13 // - It supports up to 1024 processors and the number of processors 14 // must be a power of 2. 15 // 16 // _ The array of values to be sorted (ARRAY_LENGTH) must be power of 2 17 // larger than the number of processors. 18 // 19 // - This application must be executed on a cache coherent architecture. 24 20 // 25 21 /////////////////////////////////////////////////////////////////////////////// … … 28 24 #include "mapping_info.h" 29 25 #include "hard_config.h" 30 #include " barrier.h"31 32 #define ARRAY_LENGTH 51233 #define IPT (ARRAY_LENGTH / *nb_thread) // ITEMS PER THREAD26 #include "user_barrier.h" 27 28 #define ARRAY_LENGTH 4096 29 #define IPT (ARRAY_LENGTH / threads) // ITEMS PER THREAD 34 30 35 31 //////////////////////////////////////////////////////////////////////////////// 36 // Processors other than 0 display algorithm state 37 // The processor 0 always displays some information so this does not affect him 32 // Processors other than 0 display algorithm state if VERBOSE non zero 38 33 39 34 #define VERBOSE 1 40 35 41 36 //////////////////////////////////////////////////////////////////////////////// 42 // Define printf according to verbosity option and number of available 43 // TTY 37 // Define printf according to verbosity option and number of available TTY 44 38 45 39 #if (VERBOSE == 1) … … 50 44 51 45 
#define task0_printf(...) if(thread_id == 0) giet_shr_printf(__VA_ARGS__) 52 53 #define exit giet_exit54 #define procid giet_procid55 #define rand giet_rand56 46 57 47 int array0[ARRAY_LENGTH]; … … 73 63 int init_pos_result); 74 64 75 /////////////////////////////////////////////////// 76 // This application support at most 256processors77 // Number of barriers = log2( nb_thread)78 79 giet_barrier_t barrier[ 8];65 /////////////////////////////////////////////////////// 66 // This application supports at most 1024 processors 67 // Number of barriers = log2(threads) 68 69 giet_barrier_t barrier[10]; 80 70 81 71 ////////////////////////////////////////// … … 83 73 { 84 74 int thread_id = giet_thread_id(); 85 unsigned int* nb_thread;86 75 int * src_array = NULL; 87 76 int * dst_array = NULL; … … 91 80 unsigned int time_end; 92 81 93 giet_vobj_get_vbase( "sort" , 94 "sort_args", 95 (unsigned int*)&nb_thread ); 96 97 task0_printf("\n[ Thread 0 ] Starting sort application with %u threads " 98 "at cycle %u\n", *nb_thread, time_start); 82 // compute number of threads (one thread per proc) 83 unsigned int x_size; 84 unsigned int y_size; 85 unsigned int nprocs; 86 unsigned int threads; 87 giet_procs_number( &x_size , &y_size , &nprocs ); 88 threads = x_size * y_size * nprocs; 89 90 if ( (threads != 1) && (threads != 2) && (threads != 4) && 91 (threads != 8) && (threads != 16 ) && (threads != 32) && 92 (threads != 64) && (threads != 128) && (threads != 256) && 93 (threads != 512) && (threads != 1024) ) 94 { 95 task0_printf("[SORT ERROR] Number of processors must be power of 2\n" 96 " x_size = %d / y_size = %d / nprocs = %d\n", 97 x_size , y_size , nprocs ); 98 giet_exit("error"); 99 } 100 101 task0_printf("\n[ Thread 0 ] Starting sort application with %d threads " 102 "at cycle %d\n", threads, time_start); 99 103 100 104 /////////////////////////// … … 103 107 if (thread_id == 0) 104 108 { 105 for (i = 0; i < __builtin_ctz( *nb_thread); i++)106 { 107 barrier_init(&barrier[i], 
*nb_thread>> i);109 for (i = 0; i < __builtin_ctz( threads ); i++) 110 { 111 barrier_init(&barrier[i], threads >> i); 108 112 } 109 113 … … 120 124 for (i = IPT * thread_id; i < IPT * (thread_id + 1); i++) 121 125 { 122 array0[i] = rand();126 array0[i] = giet_rand(); 123 127 } 124 128 … … 132 136 printf("[ Thread %d ] Finishing Stage 0\n\r", thread_id); 133 137 134 for (i = 0; i < __builtin_ctz( *nb_thread); i++)138 for (i = 0; i < __builtin_ctz( threads ); i++) 135 139 { 136 140 barrier_wait(&barrier[i]); … … 139 143 { 140 144 printf("[ Thread %d ] Quit\n\r", thread_id ); 141 exit("Completed");145 giet_exit("Completed"); 142 146 } 143 147 … … 173 177 if(thread_id != 0) 174 178 { 175 exit("error: only thread 0 should get here");179 giet_exit("error: only thread 0 should get here"); 176 180 } 177 181 … … 196 200 if (success) 197 201 { 198 exit("!!! Success !!!");202 giet_exit("!!! Success !!!"); 199 203 } 200 204 else … … 206 210 printf("array[%d] = %d\n", i, dst_array[i]); 207 211 } 208 exit("!!! Failure !!!");209 } 210 211 exit("Completed");212 giet_exit("!!! Failure !!!"); 213 } 214 215 giet_exit("Completed"); 212 216 } 213 217 -
soft/giet_vm/applications/sort/sort.py
r434 r502 33 33 # define vsegs base & size 34 34 code_base = 0x10000000 35 code_size = 0x00 200000 # 2 Mbytes (replicated in each cluster)35 code_size = 0x00010000 # 64 Kbytes (replicated in each cluster) 36 36 37 37 data_base = 0x20000000 38 data_size = 0x00100000 # 1 Mbyte (non replicated) 39 40 args_base = 0x20100000 41 args_size = 0x00000004 # 4 bytes (non replicated) 38 data_size = 0x00010000 # 64 Kbyte (non replicated) 42 39 43 40 stack_base = 0x40000000 … … 54 51 'C_WU', vtype = 'ELF', x = 0, y = 0, pseg = 'RAM', 55 52 binpath = 'build/sort/sort.elf', 56 local = False, big = True ) 57 58 # args vseg : non local (only in cluster[0,0]) 59 mapping.addVseg( vspace, 'sort_args', args_base , args_size, 60 'C_WU', vtype = 'CONST', x = 0, y = 0, pseg = 'RAM', 61 init = ntasks, 62 local = False, big = True ) 53 local = False ) 63 54 64 55 # code vsegs : local (one copy per cluster) 65 56 for x in xrange (x_size): 66 57 for y in xrange (y_size): 67 mapping.addVseg( vspace, 'sort_code', code_base , code_size, 68 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 69 binpath = 'build/sort/sort.elf', 70 local = True, big = True ) 58 cluster_id = (x * y_size) + y 59 if ( mapping.clusters[cluster_id].procs ): 60 61 mapping.addVseg( vspace, 'sort_code', code_base , code_size, 62 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 63 binpath = 'build/sort/sort.elf', 64 local = True ) 71 65 72 66 # stacks vsegs : local (one stack per task) 73 67 for x in xrange (x_size): 74 68 for y in xrange (y_size): 75 for p in xrange (nprocs): 76 proc_id = (((x * y_size) + y) * nprocs) + p 77 size = stack_size / nprocs 78 base = stack_base + (proc_id * size) 79 mapping.addVseg( vspace, 'sort_stack_%d_%d_%d' % (x,y,p), base, size, 69 cluster_id = (x * y_size) + y 70 if ( mapping.clusters[cluster_id].procs ): 71 for p in xrange (nprocs): 72 proc_id = (((x * y_size) + y) * nprocs) + p 73 size = stack_size / nprocs 74 base = stack_base + (proc_id * size) 75 76 mapping.addVseg( vspace, 
'sort_stack_%d_%d_%d' % (x,y,p), 77 base, size, 'C_WU', vtype = 'BUFFER', 78 x = x, y = y, pseg = 'RAM', 79 local = True, big = True ) 80 81 # heap vsegs : distributed but non local (all tasks can access all heap vsegs) 82 for x in xrange (x_size): 83 for y in xrange (y_size): 84 cluster_id = (x * y_size) + y 85 if ( mapping.clusters[cluster_id].procs ): 86 size = heap_size 87 base = heap_base + (cluster_id * size) 88 89 mapping.addVseg( vspace, 'sort_heap_%d_%d' % (x,y), base, size, 80 90 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM', 81 local = True, big = True ) 82 83 # heap vsegs : distributed but non local (all tasks can access all heap vsegs) 84 cluster_id = (x * y_size) + y 85 size = heap_size 86 base = heap_base + (cluster_id * size) 87 mapping.addVseg( vspace, 'sort_heap_%d_%d' % (x,y), base, size, 88 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM', 89 local = False, big = True ) 91 local = False, big = True ) 90 92 91 93 # distributed tasks / one task per processor 92 94 for x in xrange (x_size): 93 95 for y in xrange (y_size): 94 for p in xrange( nprocs ): 95 trdid = (((x * y_size) + y) * nprocs) + p 96 mapping.addTask( vspace, 'sort_%d_%d_%d' % (x,y,p), trdid, x, y, p, 97 'sort_stack_%d_%d_%d' % (x,y,p), 98 'sort_heap_%d_%d' % (x,y), 0 ) 96 cluster_id = (x * y_size) + y 97 if ( mapping.clusters[cluster_id].procs ): 98 for p in xrange( nprocs ): 99 trdid = (((x * y_size) + y) * nprocs) + p 100 101 mapping.addTask( vspace, 'sort_%d_%d_%d' % (x,y,p), 102 trdid, x, y, p, 103 'sort_stack_%d_%d_%d' % (x,y,p), 104 'sort_heap_%d_%d' % (x,y), 0 ) 99 105 100 106 # extend mapping name -
soft/giet_vm/applications/transpose/main.c
r444 r502 1 /////////////////////////////////////////////////////////////////////////////////////// //////1 /////////////////////////////////////////////////////////////////////////////////////// 2 2 // File : main.c (for transpose application) 3 3 // Date : february 2014 4 4 // author : Alain Greiner 5 ///////////////////////////////////////////////////////////////////////////////////////////// 6 // This multi-threaded application makes a transpose for a NN*NN pixels sequence of images. 5 /////////////////////////////////////////////////////////////////////////////////////// 6 // This multi-threaded application makes a transpose for a NN*NN pixels 7 // sequence of images. 7 8 // It can run on a multi-processors, multi-clusters architecture, with one thread 8 // per processor. It uses the he following hardware parameters, that must be defined 9 // in the hard_config.h file: 10 // - X_SIZE : number of clusters in a row 11 // - Y_SIZE : number of clusters in a column 12 // - NB_PROCS_MAX : number of processors per cluster 13 // - FBUF_X_SIZE : number of pixels per line in frame buffer 14 // - FBUF_Y_SIZE : number of lines in frame buffer 15 // 9 // per processor. 10 // 16 11 // The image sequence is read from a file (one byte per pixel). 17 12 // The input and output buffers containing the image are distributed in all clusters. 18 13 // 19 // - The image size NN must be a power of 2 and must fit the frame buffer size.20 // - The number of clusters containing processors must be a power of 2.21 // - The number of processors per cluster must be a power of 2.22 // - The image size NN must be larger or equal to the total number of processor.14 // - The image size NN must fit the frame buffer size: 128 bytes 15 // - The block size in block device must be 512 bytes. 
16 // - The number of clusters must be a power of 2 no larger than 32 17 // - The number of processors per cluster must be a power of 2 no larger than 4 23 18 // 24 19 // For each image the application makes a self test (checksum for each line). 25 20 // The actual display on the frame buffer depends on frame buffer availability. 26 ///////////////////////////////////////////////////////////////////////////////////////////// 27 28 #include "hard_config.h" 21 /////////////////////////////////////////////////////////////////////////////////////// 22 29 23 #include "stdio.h" 30 #include " barrier.h"24 #include "user_barrier.h" 31 25 #include "malloc.h" 32 26 27 #define BLOCK_SIZE 512 // block size on disk 28 #define CLUSTERS_MAX 32 // max number of clusters 29 #define PROCS_MAX 4 // max number of processors per cluster 33 30 #define NN 128 // image size : nlines = npixels = 128 34 31 #define NB_IMAGES 5 // number of images to be handled 35 32 #define FILE_PATHNAME "misc/images.raw" // file pathname on disk 36 #define NB_CLUSTERS (X_SIZE * Y_SIZE) // number of clusters37 33 #define INSTRUMENTATION_OK 0 // display statistics on TTY when non zero 38 34 … … 41 37 /////////////////////////////////////////////////////// 42 38 43 // instrumentation counters 44 // for each processor (up to 4 processors) 45 // in each cluster (up to 32 clusters) 46 unsigned int LOAD_START[NB_CLUSTERS][NB_PROCS_MAX]; 47 unsigned int LOAD_END [NB_CLUSTERS][NB_PROCS_MAX]; 48 unsigned int TRSP_START[NB_CLUSTERS][NB_PROCS_MAX]; 49 unsigned int TRSP_END [NB_CLUSTERS][NB_PROCS_MAX]; 50 unsigned int DISP_START[NB_CLUSTERS][NB_PROCS_MAX]; 51 unsigned int DISP_END [NB_CLUSTERS][NB_PROCS_MAX]; 39 // instrumentation counters for each processor in each cluster 40 unsigned int LOAD_START[CLUSTERS_MAX][PROCS_MAX]; 41 unsigned int LOAD_END [CLUSTERS_MAX][PROCS_MAX]; 42 unsigned int TRSP_START[CLUSTERS_MAX][PROCS_MAX]; 43 unsigned int TRSP_END [CLUSTERS_MAX][PROCS_MAX]; 44 unsigned int 
DISP_START[CLUSTERS_MAX][PROCS_MAX]; 45 unsigned int DISP_END [CLUSTERS_MAX][PROCS_MAX]; 52 46 53 47 // arrays of pointers on distributed buffers 54 48 // one input buffer & one output buffer per cluster 55 unsigned char* buf_in [ NB_CLUSTERS];56 unsigned char* buf_out[ NB_CLUSTERS];49 unsigned char* buf_in [CLUSTERS_MAX]; 50 unsigned char* buf_out[CLUSTERS_MAX]; 57 51 58 52 // checksum variables … … 61 55 62 56 // global synchronisation barrier 63 giet_ barrier_t barrier;57 giet_sqt_barrier_t barrier; 64 58 65 59 volatile unsigned int init_ok = 0; … … 70 64 { 71 65 72 int file = 0; // file descriptor 73 unsigned int l; // line index for loops 74 unsigned int p; // pixel index for loops 75 unsigned int c; // cluster index for loops 76 77 // get processor identifiers 78 unsigned int x; // x cluster coordinate 79 unsigned int y; // y cluster coordinate 80 unsigned int lpid; // local processor index 66 unsigned int l; // line index for loops 67 unsigned int p; // pixel index for loops 68 unsigned int c; // cluster index for loops 69 70 // processor identifiers 71 unsigned int x; // x cluster coordinate 72 unsigned int y; // y cluster coordinate 73 unsigned int lpid; // local processor index 74 75 // plat-form parameters 76 unsigned int x_size; // number of clusters in a row 77 unsigned int y_size; // number of clusters in a column 78 unsigned int nprocs; // number of processors per cluster 79 81 80 giet_proc_xyp( &x, &y, &lpid); 82 81 83 unsigned int npixels = NN * NN; // pixels per image 84 unsigned int nblocks = npixels / 512; // blocks per image 85 unsigned int image = 0; // image counter 86 87 unsigned int cluster_id = (x * Y_SIZE) + y; // "continuous" index 88 unsigned int ntasks = NB_CLUSTERS * NB_PROCS_MAX; // number of tasks 89 unsigned int task_id = (cluster_id * NB_PROCS_MAX) + lpid; // "continuous" task index 82 giet_procs_number( &x_size , &y_size , &nprocs ); 83 84 giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n" 85 " - x_size = 
%d\n" 86 " - y_size = %d\n" 87 " - nprocs = %d\n", 88 x, y, lpid, giet_proctime(), x_size , y_size , nprocs ); 89 90 unsigned int nclusters = x_size * y_size; // number of clusters 91 unsigned int ntasks = x_size * y_size * nprocs; // number of tasks 92 unsigned int npixels = NN * NN; // pixels per image 93 unsigned int nblocks = npixels / BLOCK_SIZE; // blocks per image 94 unsigned int image = 0; // image counter 95 int file = 0; // file descriptor 96 unsigned int cluster_id = (x * y_size) + y; // "continuous" index 97 unsigned int task_id = (cluster_id * nprocs) + lpid; // "continuous" task index 90 98 91 99 // Processor [0,0,0] makes initialisation 92 // It includes parameters checking, barrier sinitialization,100 // It includes parameters checking, barrier initialization, 93 101 // distributed buffers allocation, and file open 94 102 if ( (x==0) && (y==0) && (lpid==0) ) 95 103 { 96 // Parameters checking 97 if ( (NN != FBUF_X_SIZE) || (NN != FBUF_Y_SIZE) ) 98 { 99 giet_exit("[TRANSPOSE ERROR] Frame buffer size does not fit image size"); 100 } 101 if ((NB_PROCS_MAX != 1) && (NB_PROCS_MAX != 2) && (NB_PROCS_MAX != 4)) 104 if ((nprocs != 1) && (nprocs != 2) && (nprocs != 4)) 102 105 { 103 giet_exit("[TRANSPOSE ERROR] NB_PROCS_MAXmust be 1, 2 or 4");104 } 105 if (( NB_CLUSTERS != 1) && (NB_CLUSTERS != 2) && (NB_CLUSTERS!= 4) &&106 ( NB_CLUSTERS != 8) && (NB_CLUSTERS != 16) && (NB_CLUSTERS!= 32) )106 giet_exit("[TRANSPOSE ERROR] number of procs per cluster must be 1, 2 or 4"); 107 } 108 if ((nclusters != 1) && (nclusters != 2) && (nclusters != 4) && 109 (nclusters != 8) && (nclusters != 16) && (nclusters != 32) ) 107 110 { 108 111 giet_exit("[TRANSPOSE ERROR] number of clusters must be 1,2,4,8,16,32"); … … 113 116 } 114 117 115 giet_shr_printf("\n[TRANSPOSE] Processor[0,0,0] starts at cycle %d\n"116 " - x_size = %d\n"117 " - y_size = %d\n"118 " - nprocs = %d\n"119 " - nclusters = %d\n"120 " - ntasks = %d\n",121 giet_proctime(), X_SIZE, Y_SIZE, NB_PROCS_MAX, 
NB_CLUSTERS, ntasks );122 123 118 // Barrier initialisation 124 barrier_init( &barrier, ntasks );119 sqt_barrier_init( &barrier, x_size , y_size , nprocs ); 125 120 126 121 giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes barrier init at cycle %d\n", … … 128 123 129 124 // Distributed buffers allocation 130 // The buffers containing one image are distributed in clusters131 // (one buf_in and one buf_out per cluster).132 // Each buffer contains (NN*NN / NB_CLUSTERS) bytes.133 for ( c = 0 ; c < NB_CLUSTERS; c++ )134 { 135 unsigned int rx = c / Y_SIZE;136 unsigned int ry = c % Y_SIZE;137 138 buf_in[c] = remote_malloc( npixels/ NB_CLUSTERS, rx, ry );139 buf_out[c] = remote_malloc( npixels/ NB_CLUSTERS, rx, ry );125 // The buffers containing one image are distributed in the user 126 // heap (one buf_in and one buf_out per cluster). 127 // Each buffer contains (NN*NN / nclusters) bytes. 128 for ( c = 0 ; c < nclusters ; c++ ) 129 { 130 unsigned int rx = c / y_size; 131 unsigned int ry = c % y_size; 132 133 buf_in[c] = remote_malloc( npixels/nclusters, rx, ry ); 134 buf_out[c] = remote_malloc( npixels/nclusters, rx, ry ); 140 135 141 136 giet_shr_printf("\n[TRANSPOSE] Proc [0,0,0] completes buffer allocation" … … 167 162 { 168 163 while ( init_ok == 0 ); 169 giet_shr_printf("\n[TRANSPOSE] Processor[%d,%d,%d] starts at cycle %d\n",170 x, y, lpid, giet_proctime() );171 164 } 172 165 … … 175 168 while (image < NB_IMAGES) 176 169 { 177 // pseudo parallel load from disk to buf_in buffer : nblocks/ NB_CLUSTERSblocks170 // pseudo parallel load from disk to buf_in buffer : nblocks/nclusters blocks 178 171 // only task running on processor with (lpid == 0) does it 179 172 … … 184 177 giet_fat_read( file, 185 178 buf_in[cluster_id], 186 (nblocks / NB_CLUSTERS), 187 ((image*nblocks) + ((nblocks*cluster_id)/NB_CLUSTERS)) ); 188 179 (nblocks / nclusters), 180 ((image*nblocks) + ((nblocks*cluster_id)/nclusters)) ); 181 182 if ( (x==0) && (y==0) ) 189 183 
giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes load" 190 184 " for image %d at cycle %d\n", … … 194 188 LOAD_END[cluster_id][lpid] = giet_proctime(); 195 189 196 ///////////////////////// 197 barrier_wait( &barrier );190 ///////////////////////////// 191 sqt_barrier_wait( &barrier ); 198 192 199 193 // parallel transpose from buf_in to buf_out … … 206 200 207 201 unsigned int nlt = NN / ntasks; // number of lines per task 208 unsigned int nlc = NN / NB_CLUSTERS; // number of lines per cluster202 unsigned int nlc = NN / nclusters; // number of lines per cluster 209 203 210 204 unsigned int src_cluster; … … 242 236 if ( lpid == 0 ) 243 237 { 238 if ( (x==0) && (y==0) ) 244 239 giet_shr_printf("\n[TRANSPOSE] proc [%d,%d,0] completes transpose" 245 240 " for image %d at cycle %d\n", … … 249 244 TRSP_END[cluster_id][lpid] = giet_proctime(); 250 245 251 ///////////////////////// 252 barrier_wait( &barrier );246 ///////////////////////////// 247 sqt_barrier_wait( &barrier ); 253 248 254 249 // optional parallel display from local buf_out to frame buffer … … 265 260 npt ); 266 261 267 if ( lpid == 0 ) 268 { 269 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,0] completes display" 270 " for image %d at cycle %d\n", 271 x, y, image, giet_proctime() ); 272 } 262 if ( (x==0) && (y==0) && (lpid==0) ) 263 giet_shr_printf("\n[TRANSPOSE] Proc [%d,%d,%d] completes display" 264 " for image %d at cycle %d\n", 265 x, y, lpid, image, giet_proctime() ); 273 266 274 267 DISP_END[cluster_id][lpid] = giet_proctime(); 275 268 276 ///////////////////////// 277 barrier_wait( &barrier );269 ///////////////////////////// 270 sqt_barrier_wait( &barrier ); 278 271 } 279 272 … … 318 311 } 319 312 320 ///////////////////////// 321 barrier_wait( &barrier );313 ///////////////////////////// 314 sqt_barrier_wait( &barrier ); 322 315 323 316 // instrumentation done by processor [0,0,0] … … 338 331 unsigned int max_disp_ended = 0; 339 332 340 for (cc = 0; cc < NB_CLUSTERS; cc++)333 for (cc = 0; cc 
< nclusters; cc++) 341 334 { 342 335 for (pp = 0; pp < NB_PROCS_MAX; pp++) … … 384 377 image++; 385 378 386 ///////////////////////// 387 barrier_wait( &barrier );379 ///////////////////////////// 380 sqt_barrier_wait( &barrier ); 388 381 389 382 } // end while image … … 392 385 if ( (x==0) && (y==0) && (lpid==0) ) 393 386 { 394 for ( c = 0 ; c < NB_CLUSTERS; c++ )387 for ( c = 0 ; c < nclusters ; c++ ) 395 388 { 396 389 free( buf_in[c] ); -
soft/giet_vm/applications/transpose/transpose.py
r457 r502 3 3 from mapping import * 4 4 5 ################################################################################## ####5 ################################################################################## 6 6 # file : transpose.py (for the transpose application) 7 7 # date : may 2014 8 8 # author : Alain Greiner 9 ################################################################################## #####9 ################################################################################## 10 10 # This file describes the mapping of the multi-threaded "transpose" 11 11 # application on a multi-clusters, multi-processors architecture. 12 12 # This includes both the mapping of virtual segments on the clusters, 13 13 # and the mapping of tasks on processors. 14 # There is one task per processor. 15 # The mapping of virtual segments is the following: 16 # - There is one shared data vseg in cluster[0][0] 17 # - The code vsegs are replicated on all clusters containing processors. 18 # - There is one heap vseg per cluster containing processors. 19 # - The stacks vsegs are distributed on all clusters containing processors. 
14 20 # This mapping uses 5 platform parameters, (obtained from the "mapping" argument) 15 # - x_size : number of clusters in a row16 # - y_size : number of clusters in a column17 # - x_width : number of bits coding x coordinate18 # - y_width : number of bits coding y coordinate19 # - nprocs : number of processors per cluster20 ################################################################################## ##21 # - x_size : number of clusters in a row 22 # - y_size : number of clusters in a column 23 # - x_width : number of bits coding x coordinate 24 # - y_width : number of bits coding y coordinate 25 # - nprocs : number of processors per cluster 26 ################################################################################## 21 27 22 28 ######################### … … 54 60 for x in xrange (x_size): 55 61 for y in xrange (y_size): 56 mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), code_base , code_size, 57 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 58 binpath = 'build/transpose/transpose.elf', 59 local = True ) 62 cluster_id = (x * y_size) + y 63 if ( mapping.clusters[cluster_id].procs ): 60 64 61 # stacks vsegs: local (one stack per processor => nprocs stacks per cluster) 62 for x in xrange (x_size): 63 for y in xrange (y_size): 64 for p in xrange( nprocs ): 65 proc_id = (((x * y_size) + y) * nprocs) + p 66 size = (stack_size / nprocs) & 0xFFFFF000 67 base = stack_base + (proc_id * size) 68 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), base, size, 69 'C_WU', vtype = 'BUFFER', x = x , y = y , pseg = 'RAM', 70 local = True, big = True ) 65 mapping.addVseg( vspace, 'trsp_code_%d_%d' %(x,y), 66 code_base , code_size, 67 'CXWU', vtype = 'ELF', x = x, y = y, pseg = 'RAM', 68 binpath = 'build/transpose/transpose.elf', 69 local = True ) 71 70 72 # heap vsegs: distributed but non local (all heap vsegs can be accessed by all tasks)71 # stacks vsegs: local (one stack per processor => nprocs stacks per cluster) 73 72 for x in xrange (x_size): 74 
73 for y in xrange (y_size): 75 74 cluster_id = (x * y_size) + y 76 size = heap_size 77 base = heap_base + (cluster_id * size) 78 mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size, 79 'C_WU', vtype = 'BUFFER', x = x, y = y, pseg = 'RAM', 80 local = False, big = True ) 75 if ( mapping.clusters[cluster_id].procs ): 76 for p in xrange( nprocs ): 77 proc_id = (((x * y_size) + y) * nprocs) + p 78 size = (stack_size / nprocs) & 0xFFFFF000 79 base = stack_base + (proc_id * size) 80 81 mapping.addVseg( vspace, 'trsp_stack_%d_%d_%d' % (x,y,p), 82 base, size, 'C_WU', vtype = 'BUFFER', 83 x = x , y = y , pseg = 'RAM', 84 local = True, big = True ) 85 86 # heap vsegs: distributed non local (all heap vsegs can be accessed by all tasks) 87 for x in xrange (x_size): 88 for y in xrange (y_size): 89 cluster_id = (x * y_size) + y 90 if ( mapping.clusters[cluster_id].procs ): 91 size = heap_size 92 base = heap_base + (cluster_id * size) 93 94 mapping.addVseg( vspace, 'trsp_heap_%d_%d' % (x,y), base, size, 95 'C_WU', vtype = 'HEAP', x = x, y = y, pseg = 'RAM', 96 local = False, big = True ) 81 97 82 98 # distributed tasks / one task per processor 83 99 for x in xrange (x_size): 84 100 for y in xrange (y_size): 85 for p in xrange( nprocs ): 86 trdid = (((x * y_size) + y) * nprocs) + p 87 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), trdid, x, y, p, 88 'trsp_stack_%d_%d_%d' % (x,y,p), 89 'trsp_heap_%d_%d' % (x,y), 0 ) 101 cluster_id = (x * y_size) + y 102 if ( mapping.clusters[cluster_id].procs ): 103 for p in xrange( nprocs ): 104 trdid = (((x * y_size) + y) * nprocs) + p 105 106 mapping.addTask( vspace, 'trsp_%d_%d_%d' % (x,y,p), 107 trdid, x, y, p, 108 'trsp_stack_%d_%d_%d' % (x,y,p), 109 'trsp_heap_%d_%d' % (x,y), 0 ) 90 110 91 111 # extend mapping name … … 94 114 return vspace # useful for test 95 115 96 ################################ test ################################################## ####116 ################################ test 
################################################## 97 117 98 118 if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.