Changeset 248 for trunk/softs
- Timestamp:
- Aug 9, 2012, 10:57:23 AM (12 years ago)
- Location:
- trunk/softs
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/softs/giet_tsar/dma.h
r158 r248 9 9 DMA_IRQ_DISABLE = 4, 10 10 /***/ 11 DMA_SPAN = 8, 11 DMA_SPAN = 0x400, 12 12 }; 13 13 14 14 enum DmaStatusValues { 15 DMA_IDLE = 0, 16 DMA_SUCCESS = 1, 17 DMA_READ_ERROR = 2, 15 DMA_SUCCESS = 0, 16 DMA_READ_ERROR = 1, 17 DMA_IDLE = 2, 18 18 DMA_WRITE_ERROR = 3, 19 19 }; -
trunk/softs/giet_tsar/drivers.c
r178 r248 376 376 377 377 tty_address = (char*)(base + increment + tid*TTY_SPAN*4); 378 //tty_address = (char*)(base + tid*TTY_SPAN*4); 378 379 379 380 for ( i=0 ; i < length ; i++ ) … … 569 570 if( index >= max ) return -1; 570 571 571 register int delay = ( (_proctime() + _procid()) & 0xF) << 4;572 register int * plock = (int*)&_spin_lock[index];572 register int delay = ((_proctime() +_procid()) & 0xF) << 4; 573 register int * plock = (int *) &_spin_lock[index]; 573 574 574 575 asm volatile ("_locks_llsc: \n" … … 769 770 { 770 771 int* dma_address; 771 unsigned int base = (unsigned int) &seg_dma_base;772 unsigned int base = (unsigned int) &seg_dma_base; 772 773 unsigned int increment = _segment_increment(DMA_SPAN*4); 773 char * fb = (char*)&seg_fb_base + offset;774 unsigned int delay 774 char * fb = (char *) &seg_fb_base + offset; 775 unsigned int delay = (_proctime() & 0xF) << 4; 775 776 unsigned int pid = _procid(); 776 777 unsigned int i; -
trunk/softs/giet_tsar/isr.c
r158 r248 183 183 End: */ 184 184 185 /* vim: set filetype=asm expandtab shiftwidth=4 tabstop=4 softtabstop=4: */ 185 /* vim: set filetype=c expandtab shiftwidth=4 tabstop=4 softtabstop=4: */ 186 186 -
trunk/softs/giet_tsar/reset.s
r163 r248 107 107 nop 108 108 la $26, _interrupt_vector # interrupt vector address 109 la $27, _isr_dma 110 sw $27, 0($26) # interrupt_vector[0] <= _isr_dma_get 111 sw $27, 4($26) # interrupt_vector[1] <= _isr_dma_get 112 sw $27, 8($26) # interrupt_vector[2] <= _isr_dma_get 113 sw $27, 12($26) # interrupt_vector[3] <= _isr_dma_get 109 114 la $27, _isr_tty_get 110 sw $27, 0($26) # interrupt_vector[0] <= _isr_tty_get 111 sw $27, 4($26) # interrupt_vector[1] <= _isr_tty_get 112 sw $27, 8($26) # interrupt_vector[2] <= _isr_tty_get 113 sw $27, 12($26) # interrupt_vector[3] <= _isr_tty_get 114 la $27, _isr_dma 115 sw $27, 16($26) # interrupt_vector[4] <= _isr_dma 116 sw $27, 20($26) # interrupt_vector[5] <= _isr_dma 117 sw $27, 24($26) # interrupt_vector[6] <= _isr_dma 118 sw $27, 28($26) # interrupt_vector[7] <= _isr_dma 115 sw $27, 16($26) # interrupt_vector[4] <= _isr_tty 116 sw $27, 20($26) # interrupt_vector[5] <= _isr_tty 117 sw $27, 24($26) # interrupt_vector[6] <= _isr_tty 118 sw $27, 28($26) # interrupt_vector[7] <= _isr_tty 119 119 la $27, _isr_ioc 120 120 sw $27, 32($26) # interrupt_vector[8] <= _isr_ioc -
trunk/softs/soft_filter_giet/Makefile
r163 r248 1 LD=mipsel-unknown-elf-ld 2 CC=mipsel-unknown-elf-gcc 3 AS=mipsel-unknown-elf-as 4 DU=mipsel-unknown-elf-objdump 5 1 6 OBJS= reset.o \ 2 LD = mipsel-unknown-elf-ld 3 CC = mipsel-unknown-elf-gcc 4 AS = mipsel-unknown-elf-as 5 DU = mipsel-unknown-elf-objdump 6 7 OBJS = reset.o \ 7 8 giet.o \ 8 9 isr.o \ … … 11 12 main.o 12 13 13 CFLAGS = -Wall -mno-gpopt -ffreestanding -fomit-frame-pointer -mips32 -ggdb 14 CFLAGS = -Wall -g -mno-gpopt -ffreestanding -fomit-frame-pointer -mips32 -ggdb 14 15 15 GIET = ../giet_tsar 16 GIET = ../giet_tsar 16 17 17 18 bin.soft: $(OBJS) ldscript -
trunk/softs/soft_filter_giet/ldscript
r174 r248 10 10 peripherals are not present in the architecture */ 11 11 12 NB_CLUSTERS = 64; /* number of clusters */ 12 NB_CLUSTERS = 4; /* number of clusters */ 13 13 NB_PROCS = 4; /* number of processors per cluster */ 14 14 NB_TASKS = 1; /* number of tasks per processor */ … … 32 32 33 33 seg_icu_base = 0x00F00000; /* controleur ICU */ 34 seg_tty_base = 0x00F10000; /* controleur TTY */ 35 seg_dma_base = 0x00F20000; /* controleur DMA */ 34 seg_tty_base = 0xBFF20000; /* controleur TTY */ 35 seg_dma_base = 0x00F30000; /* controleur DMA */ 36 36 37 37 seg_reset_base = 0xBFC00000; /* le code de boot */ 38 38 seg_fb_base = 0xBFD00000; /* controleur FRAME BUFFER */ 39 seg_ioc_base = 0xBFF30000; /* controleur I/O */ 39 seg_ioc_base = 0xBFF10000; /* controleur I/O */ 40 40 41 seg_timer_base = 0xBFF40000; /* controleur TIMER */ 41 seg_timer_base = 0x00F0000; /* controleur TIMER */ 42 42 seg_gcd_base = 0xBFF50000; /* controleur GCD */ 43 43 -
trunk/softs/soft_filter_giet/main.c
r174 r248 1 2 #include "limits.h" 1 3 #include "stdio.h" 4 5 #include "../giet_tsar/block_device.h" 2 6 3 7 //////////////////////////////////// 4 8 // Image parameters 5 9 6 #define PIXEL_SIZE 2 7 #define NL 1024 8 #define NP 1024 9 #define BLOCK_SIZE 1024 10 11 #define PRINTF if(lid==0) tty_printf 12 13 #define TA(c,l,p) (A[c][((NP)*(l))+(p)]) 14 #define TB(c,p,l) (B[c][((NL)*(p))+(l)]) 15 #define TC(c,l,p) (C[c][((NP)*(l))+(p)]) 16 #define TD(c,l,p) (D[c][((NP)*(l))+(p)]) 17 #define TZ(c,l,p) (Z[c][((NP)*(l))+(p)]) 10 #define NB_CLUSTER_MAX 256 11 #define PIXEL_SIZE 2 12 #define NL 1024 13 #define NP 1024 14 15 #define NB_PIXELS ((NP) * (NL)) 16 #define FRAME_SIZE ((NB_PIXELS) * (PIXEL_SIZE)) 17 18 #define PRINTF(...) ({ if (proc_id == 1) { tty_printf(__VA_ARGS__); } }) 19 20 #define TA(c,l,p) (A[c][((NP) * (l)) + (p)]) 21 #define TB(c,p,l) (B[c][((NL) * (p)) + (l)]) 22 #define TC(c,l,p) (C[c][((NP) * (l)) + (p)]) 23 #define TD(c,l,p) (D[c][((NP) * (l)) + (p)]) 24 #define TZ(c,l,p) (Z[c][((NP) * (l)) + (p)]) 18 25 19 26 #define max(x,y) ((x) > (y) ? (x) : (y)) … … 26 33 struct plaf; 27 34 35 extern struct plouf seg_ioc_base; 28 36 extern struct plaf seg_heap_base; 29 37 extern struct plaf NB_PROCS; 30 38 extern struct plaf NB_CLUSTERS; 31 39 32 ///////////// 33 void main() 34 { 35 36 ////////////////////////////////// 37 // convolution kernel parameters 38 // The content of this section is 39 // Philips proprietary information. 
40 /////////////////////////////////// 41 42 int vnorm = 115; 43 int vf[35]; 44 vf[0] = 1; 45 vf[1] = 1; 46 vf[2] = 2; 47 vf[3] = 2; 48 vf[4] = 2; 49 vf[5] = 2; 50 vf[6] = 3; 51 vf[7] = 3; 52 vf[8] = 3; 53 vf[9] = 4; 54 vf[10] = 4; 55 vf[11] = 4; 56 vf[12] = 4; 57 vf[13] = 5; 58 vf[14] = 5; 59 vf[15] = 5; 60 vf[16] = 5; 61 vf[17] = 5; 62 vf[18] = 5; 63 vf[19] = 5; 64 vf[20] = 5; 65 vf[21] = 5; 66 vf[22] = 4; 67 vf[23] = 4; 68 vf[24] = 4; 69 vf[25] = 4; 70 vf[26] = 3; 71 vf[27] = 3; 72 vf[28] = 3; 73 vf[29] = 2; 74 vf[30] = 2; 75 vf[31] = 2; 76 vf[32] = 2; 77 vf[33] = 1; 78 vf[34] = 1; 79 80 int hrange = 100; 81 int hnorm = 201; 82 83 unsigned int date = 0; 84 85 int c; // cluster index for loops 86 int l; // line index for loops 87 int p; // pixel index for loops 88 int x; // filter index for loops 89 90 int pid = procid(); // processor id 91 int nprocs = (int)&NB_PROCS; // number of processors per cluster 92 int nclusters = (int)&NB_CLUSTERS; // number of clusters 93 int lid = pid%nprocs; // local task id 94 int cid = pid/nprocs; // cluster task id 95 int base = (unsigned int)&seg_heap_base; // base address for shared buffers 96 int increment = (0x80000000 / nclusters) * 2; // cluster increment 97 int ntasks = nclusters * nprocs; // number of tasks 98 int nblocks = (NP*NL*PIXEL_SIZE)/BLOCK_SIZE; // number of blocks per image 99 100 int lines_per_task = NL/ntasks; // number of lines per task 101 int lines_per_cluster = NL/nclusters; // number of lines per cluster 102 int pixels_per_task = NP/ntasks; // number of columns per task 103 int pixels_per_cluster = NP/nclusters; // number of columns per cluster 104 105 int first, last; 106 107 PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", pid, proctime()); 108 109 ////////////////////////// 110 // parameters checking 111 if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) ) 112 { 113 PRINTF("NB_PROCS must be 1, 2 or 4\n"); 114 while(1); 115 } 116 if( (nclusters != 4) && (nclusters != 8) && (nclusters != 
16) && 117 (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256) ) 118 { 119 PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n"); 120 while(1); 121 } 122 if( pid >= ntasks ) 123 { 124 PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", pid); 125 while(1); 126 } 127 if ( NL % nclusters != 0 ) 128 { 129 PRINTF("NB_CLUSTERS must be a divider of NL"); 130 while(1); 131 } 132 if( NP % nclusters != 0 ) 133 { 134 PRINTF("NB_CLUSTERS must be a divider of NP"); 135 while(1); 136 } 137 138 ////////////////////////////////////////////////////////////////// 139 // Arrays of pointers on the shared, distributed buffers 140 // containing the images (sized for the worst case : 256 clusters) 141 unsigned short* A[256]; 142 int* B[256]; 143 int* C[256]; 144 int* D[256]; 145 unsigned char* Z[256]; 146 147 // Arrays of pointers on the instrumentation arrays 148 // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters) 149 // each pointer points on the base adress of an array of 4 (NPROCS max) unsigned int 150 unsigned int* LOAD_START[256]; 151 unsigned int* LOAD_ENDED[256]; 152 unsigned int* VERT_START[256]; 153 unsigned int* VERT_ENDED[256]; 154 unsigned int* HORI_START[256]; 155 unsigned int* HORI_ENDED[256]; 156 unsigned int* DISP_START[256]; 157 unsigned int* DISP_ENDED[256]; 158 159 // The shared, distributed buffers addresses are computed 160 // from the seg_heap_base value defined in the ldscript file 161 // and from the cluster increment = 4Gbytes/nclusters. 
162 // These arrays of pointers are identical and 163 // replicated in the stack of each task 164 for( c=0 ; c<nclusters ; c++) 165 { 166 A[c] = (unsigned short*) (base + increment*c); 167 Z[c] = (unsigned char*) (base + 2*NP*NL/nclusters + increment*c); 168 B[c] = (int*) (base + 4*NP*NL/nclusters + increment*c); 169 C[c] = (int*) (base + 8*NP*NL/nclusters + increment*c); 170 D[c] = (int*) (base + 12*NP*NL/nclusters + increment*c); 171 172 LOAD_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + increment*c); 173 LOAD_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 16 + increment*c); 174 VERT_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 32 + increment*c); 175 VERT_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 48 + increment*c); 176 HORI_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 64 + increment*c); 177 HORI_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 80 + increment*c); 178 DISP_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 96 + increment*c); 179 DISP_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 112 + increment*c); 180 } 181 182 PRINTF("NCLUSTERS = %d\n", nclusters); 183 PRINTF("NPROCS = %d\n\n", nprocs); 184 185 PRINTF("*** Starting barrier init at cycle %d ***\n", proctime()); 186 187 // barriers initialization 188 barrier_init(0, ntasks); 189 barrier_init(1, ntasks); 190 barrier_init(2, ntasks); 191 barrier_init(3, ntasks); 192 193 PRINTF("*** Completing barrier init at cycle %d ***\n", proctime()); 194 195 //////////////////////////////////////////////////////// 196 // pseudo parallel load from disk to A[c] buffers 197 // only task running on processor with (lid==0) does it 198 // nblocks/nclusters are loaded in each cluster 199 200 if ( lid == 0 ) 201 { 202 int p; 203 date = proctime(); 204 PRINTF("\n*** Starting load at cycle %d\n", date); 205 for ( p=0 ; p<nprocs ; p++ ) LOAD_START[cid][p] = date; 206 207 if( ioc_read(nblocks*cid/nclusters, 208 A[cid] , 209 nblocks/nclusters) ) 210 { 211 
PRINTF("echec ioc_read\n"); 212 while(1); 213 } 214 if ( ioc_completed() ) 215 { 216 PRINTF("echec ioc_completed\n"); 217 while(1); 218 } 219 220 date = proctime(); 221 PRINTF("*** Completing load at cycle %d\n", date); 222 for ( p=0 ; p<nprocs ; p++ ) LOAD_ENDED[cid][p] = date; 223 } 224 225 barrier_wait(0); 226 227 ////////////////////////////////////////////////////////// 228 // parallel horizontal filter : 229 // B <= transpose(FH(A)) 230 // D <= A - FH(A) 231 // Each task computes (NL/ntasks) lines 232 // The image must be extended : 233 // if (z<0) TA(cid,l,z) == TA(cid,l,0) 234 // if (z>NP-1) TA(cid,l,z) == TA(cid,l,NP-1) 235 236 date = proctime(); 237 PRINTF("\n*** Starting horizontal filter at cycle %d\n", date); 238 HORI_START[cid][lid] = date; 239 240 // l = absolute line index / p = absolute pixel index 241 // first & last define which lines are handled by a given task(cid,lid) 242 243 first = (cid*nprocs + lid)*lines_per_task; 244 last = first + lines_per_task; 245 246 for ( l=first ; l<last ; l++) 247 { 248 // src_c and src_l are the cluster index and the line index for A & D 249 int src_c = l/lines_per_cluster; 250 int src_l = l%lines_per_cluster; 251 252 // We use the spécific values of the horizontal ep-filter for optimisation: 253 // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1] 254 // To minimize the number of tests, the loop on pixels is split in three domains 255 256 int sum_p = (hrange+2)*TA(src_c, src_l, 0); 257 for ( x = 1 ; x < hrange ; x++) sum_p = sum_p + TA(src_c, src_l, x); 258 259 // first domain : from 0 to hrange 260 for ( p=0 ; p<hrange+1 ; p++) 261 { 262 // dst_c and dst_p are the cluster index and the pixel index for B 263 int dst_c = p/pixels_per_cluster; 264 int dst_p = p%pixels_per_cluster; 265 sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, 0); 266 TB(dst_c, dst_p, l) = sum_p/hnorm; 267 TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm; 268 } 269 // second domain : from (hrange+1) to 
(NP-hrange-1) 270 for ( p = hrange+1 ; p < NP-hrange ; p++) 271 { 272 // dst_c and dst_p are the cluster index and the pixel index for B 273 int dst_c = p/pixels_per_cluster; 274 int dst_p = p%pixels_per_cluster; 275 sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, p-hrange-1); 276 TB(dst_c, dst_p, l) = sum_p/hnorm; 277 TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm; 278 } 279 // third domain : from (NP-hrange) to (NP-1) 280 for ( p = NP-hrange ; p < NP ; p++) 281 { 282 // dst_c and dst_p are the cluster index and the pixel index for B 283 int dst_c = p/pixels_per_cluster; 284 int dst_p = p%pixels_per_cluster; 285 sum_p = sum_p + (int)TA(src_c, src_l, NP-1) - (int)TA(src_c, src_l, p-hrange-1); 286 TB(dst_c, dst_p, l) = sum_p/hnorm; 287 TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm; 288 } 289 290 PRINTF(" - line %d computed at cycle %d\n", l, proctime()); 291 } 292 293 date = proctime(); 294 PRINTF("*** Completing horizontal filter at cycle %d\n", date); 295 HORI_ENDED[cid][lid] = date; 296 297 barrier_wait(1); 298 299 ////////////////////////////////////////////////////////// 300 // parallel vertical filter : 301 // C <= transpose(FV(B)) 302 // Each task computes (NP/ntasks) columns 303 // The image must be extended : 304 // if (l<0) TB(cid,p,x) == TB(cid,p,0) 305 // if (l>NL-1) TB(cid,p,x) == TB(cid,p,NL-1) 306 307 date = proctime(); 308 PRINTF("\n*** starting vertical filter at cycle %d\n", date); 309 VERT_START[cid][lid] = date; 310 311 // l = absolute line index / p = absolute pixel index 312 // first & last define which pixels are handled by a given task(cid,lid) 313 314 first = (cid*nprocs + lid)*pixels_per_task; 315 last = first + pixels_per_task; 316 317 for ( p=first ; p<last ; p++) 318 { 319 // src_c and src_p are the cluster index and the pixel index for B 320 int src_c = p/pixels_per_cluster; 321 int src_p = p%pixels_per_cluster; 322 323 int sum_l; 324 325 // We use the specific values of the vertical 
ep-filter 326 // To minimize the number of tests, the NL lines are split in three domains 327 328 // first domain : explicit computation for the first 18 values 329 for ( l=0 ; l<18 ; l++) 330 { 331 // dst_c and dst_l are the cluster index and the line index for C 332 int dst_c = l/lines_per_cluster; 333 int dst_l = l%lines_per_cluster; 334 335 for ( x=0, sum_l=0 ; x<35 ; x++ ) 336 { 337 sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l-17+x,0) ); 338 } 339 TC(dst_c, dst_l, p) = sum_l/vnorm; 340 } 341 // second domain 342 for ( l = 18 ; l < NL-17 ; l++ ) 343 { 344 // dst_c and dst_l are the cluster index and the line index for C 345 int dst_c = l/lines_per_cluster; 346 int dst_l = l%lines_per_cluster; 347 348 sum_l = sum_l + TB(src_c, src_p, l+4) 349 + TB(src_c, src_p, l+8) 350 + TB(src_c, src_p, l+11) 351 + TB(src_c, src_p, l+15) 352 + TB(src_c, src_p, l+17) 353 - TB(src_c, src_p, l-5) 354 - TB(src_c, src_p, l-9) 355 - TB(src_c, src_p, l-12) 356 - TB(src_c, src_p, l-16) 357 - TB(src_c, src_p, l-18); 358 TC(dst_c, dst_l, p) = sum_l/vnorm; 359 } 360 // third domain 361 for ( l = NL-17 ; l < NL ; l++ ) 362 { 363 // dst_c and dst_l are the cluster index and the line index for C 364 int dst_c = l/lines_per_cluster; 365 int dst_l = l%lines_per_cluster; 366 367 sum_l = sum_l + TB(src_c, src_p, min(l+4,NL-1)) 368 + TB(src_c, src_p, min(l+8,NL-1)) 369 + TB(src_c, src_p, min(l+11,NL-1)) 370 + TB(src_c, src_p, min(l+15,NL-1)) 371 + TB(src_c, src_p, min(l+17,NL-1)) 372 - TB(src_c, src_p, l-5) 373 - TB(src_c, src_p, l-9) 374 - TB(src_c, src_p, l-12) 375 - TB(src_c, src_p, l-16) 376 - TB(src_c, src_p, l-18); 377 TC(dst_c, dst_l, p) = sum_l/vnorm; 378 } 379 PRINTF(" - column %d computed at cycle %d\n", p, proctime()); 380 } 381 382 date = proctime(); 383 PRINTF("*** Completing vertical filter at cycle %d\n", date); 384 VERT_ENDED[cid][lid] = date; 385 386 barrier_wait(2); 387 388 //////////////////////////////////////////////////////////////// 389 // final computation and parallel 
display 390 // Z <= D + C 391 // Each processor use its private DMA channel to display 392 // the resulting image, line per line (one byte per pixel). 393 // Eah processor computes & displays (NL/ntasks) lines. 394 395 date = proctime(); 396 PRINTF("\n*** Starting display at cycle %d\n", date); 397 DISP_START[cid][lid] = date; 398 399 first = lid*lines_per_task; 400 last = first + lines_per_task; 401 402 for ( l=first ; l<last ; l++) 403 { 404 for ( p=0 ; p<NP ; p++) 405 { 406 TZ(cid,l,p) = (unsigned char)(((TD(cid,l,p) + TC(cid,l,p))>>8) & 0xFF); 407 } 408 fb_write(NP*(cid*lines_per_cluster+l), &TZ(cid,l,0), NP); 409 } 410 411 date = proctime(); 412 PRINTF("*** Completing display at cycle %d\n", date); 413 DISP_ENDED[cid][lid] = date; 414 415 barrier_wait(3); 416 417 ///////////////////////////////////////////////////////// 418 // Instrumentation (done by processor 0 in cluster 0) 419 420 if ( pid == 0 ) 421 { 422 date = proctime(); 423 PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date); 424 425 int cc, pp; 426 unsigned int min_load_start = 1000000000; 427 unsigned int max_load_start = 0; 428 unsigned int min_load_ended = 1000000000; 429 unsigned int max_load_ended = 0; 430 431 unsigned int min_hori_start = 1000000000; 432 unsigned int max_hori_start = 0; 433 unsigned int min_hori_ended = 1000000000; 434 unsigned int max_hori_ended = 0; 435 436 unsigned int min_vert_start = 1000000000; 437 unsigned int max_vert_start = 0; 438 unsigned int min_vert_ended = 1000000000; 439 unsigned int max_vert_ended = 0; 440 441 unsigned int min_disp_start = 1000000000; 442 unsigned int max_disp_start = 0; 443 unsigned int min_disp_ended = 1000000000; 444 unsigned int max_disp_ended = 0; 445 446 for ( cc=0 ; cc<nclusters ; cc++ ) 447 { 448 for ( pp=0 ; pp<nprocs ; pp++ ) 449 { 450 if ( LOAD_START[cc][pp] < min_load_start ) min_load_start = LOAD_START[cc][pp]; 451 if ( LOAD_START[cc][pp] > max_load_start ) max_load_start = LOAD_START[cc][pp]; 452 if ( LOAD_ENDED[cc][pp] 
< min_load_ended ) min_load_ended = LOAD_ENDED[cc][pp]; 453 if ( LOAD_ENDED[cc][pp] > max_load_ended ) max_load_ended = LOAD_ENDED[cc][pp]; 454 455 if ( HORI_START[cc][pp] < min_hori_start ) min_hori_start = HORI_START[cc][pp]; 456 if ( HORI_START[cc][pp] > max_hori_start ) max_hori_start = HORI_START[cc][pp]; 457 if ( HORI_ENDED[cc][pp] < min_hori_ended ) min_hori_ended = HORI_ENDED[cc][pp]; 458 if ( HORI_ENDED[cc][pp] > max_hori_ended ) max_hori_ended = HORI_ENDED[cc][pp]; 459 460 if ( VERT_START[cc][pp] < min_vert_start ) min_vert_start = VERT_START[cc][pp]; 461 if ( VERT_START[cc][pp] > max_vert_start ) max_vert_start = VERT_START[cc][pp]; 462 if ( VERT_ENDED[cc][pp] < min_vert_ended ) min_vert_ended = VERT_ENDED[cc][pp]; 463 if ( VERT_ENDED[cc][pp] > max_vert_ended ) max_vert_ended = VERT_ENDED[cc][pp]; 464 465 if ( DISP_START[cc][pp] < min_disp_start ) min_disp_start = DISP_START[cc][pp]; 466 if ( DISP_START[cc][pp] > max_disp_start ) max_disp_start = DISP_START[cc][pp]; 467 if ( DISP_ENDED[cc][pp] < min_disp_ended ) min_disp_ended = DISP_ENDED[cc][pp]; 468 if ( DISP_ENDED[cc][pp] > max_disp_ended ) max_disp_ended = DISP_ENDED[cc][pp]; 469 } 470 } 471 PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n", 472 min_load_start, max_load_start, (min_load_start+max_load_start)/2, max_load_start-min_load_start); 473 PRINTF(" - LOAD_END : min = %d / max = %d / med = %d / delta = %d\n", 474 min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, max_load_ended-min_load_ended); 475 476 PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n", 477 min_hori_start, max_hori_start, (min_hori_start+max_hori_start)/2, max_hori_start-min_hori_start); 478 PRINTF(" - HORI_END : min = %d / max = %d / med = %d / delta = %d\n", 479 min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended)/2, max_hori_ended-min_hori_ended); 480 481 PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n", 482 min_vert_start, 
max_vert_start, (min_vert_start+max_vert_start)/2, max_vert_start-min_vert_start); 483 PRINTF(" - VERT_END : min = %d / max = %d / med = %d / delta = %d\n", 484 min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended)/2, max_vert_ended-min_vert_ended); 485 486 PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", 487 min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2, max_disp_start-min_disp_start); 488 PRINTF(" - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", 489 min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2, max_disp_ended-min_disp_ended); 490 491 PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended); 492 PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended); 493 PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended); 494 495 PRINTF(" - LOAD = %d\n", max_load_ended); 496 PRINTF(" - FILTER = %d\n", max_vert_ended - max_load_ended); 497 PRINTF(" - DISPLAY = %d\n", max_disp_ended - max_vert_ended); 498 499 PRINTF("\nBEGIN LOAD_START\n"); 500 for ( cc=0 ; cc<nclusters ; cc++ ) 501 { 502 for ( pp=0 ; pp<nprocs ; pp++ ) 503 { 504 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_START[cc][pp]); 505 } 506 } 507 PRINTF("END\n"); 508 PRINTF("\nBEGIN LOAD_ENDED\n"); 509 for ( cc=0 ; cc<nclusters ; cc++ ) 510 { 511 for ( pp=0 ; pp<nprocs ; pp++ ) 512 { 513 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_ENDED[cc][pp]); 514 } 515 } 516 PRINTF("END\n"); 517 PRINTF("\nBEGIN HORI_START\n"); 518 for ( cc=0 ; cc<nclusters ; cc++ ) 519 { 520 for ( pp=0 ; pp<nprocs ; pp++ ) 521 { 522 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_START[cc][pp]); 523 } 524 } 525 PRINTF("END\n"); 526 PRINTF("\nBEGIN HORI_ENDED\n"); 527 for ( cc=0 ; cc<nclusters ; cc++ ) 528 { 529 for ( pp=0 ; pp<nprocs ; pp++ ) 530 { 531 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_ENDED[cc][pp]); 532 } 533 } 534 PRINTF("END\n"); 535 PRINTF("\nBEGIN VERT_START\n"); 
536 for ( cc=0 ; cc<nclusters ; cc++ ) 537 { 538 for ( pp=0 ; pp<nprocs ; pp++ ) 539 { 540 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_START[cc][pp]); 541 } 542 } 543 PRINTF("END\n"); 544 PRINTF("\nBEGIN VERT_ENDED\n"); 545 for ( cc=0 ; cc<nclusters ; cc++ ) 546 { 547 for ( pp=0 ; pp<nprocs ; pp++ ) 548 { 549 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_ENDED[cc][pp]); 550 } 551 } 552 PRINTF("END\n"); 553 PRINTF("\nBEGIN DISP_START\n"); 554 for ( cc=0 ; cc<nclusters ; cc++ ) 555 { 556 for ( pp=0 ; pp<nprocs ; pp++ ) 557 { 558 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_START[cc][pp]); 559 } 560 } 561 PRINTF("END\n"); 562 PRINTF("\nBEGIN DISP_ENDED\n"); 563 for ( cc=0 ; cc<nclusters ; cc++ ) 564 { 565 for ( pp=0 ; pp<nprocs ; pp++ ) 566 { 567 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_ENDED[cc][pp]); 568 } 569 } 570 PRINTF("END\n"); 571 } 572 573 while(1); 40 41 // Required when initializing an array all at once 42 static void *memcpy(void *_dst, const void *_src, unsigned int size){ 43 unsigned int *dst = _dst; 44 const unsigned int *src = _src; 45 if (! ((unsigned int)dst & 3) && ! ((unsigned int)src & 3)){ 46 while (size > 3){ 47 *dst++ = *src++; 48 size -= 4; 49 } 50 } 51 52 unsigned char *cdst = (unsigned char*)dst; 53 unsigned char *csrc = (unsigned char*)src; 54 55 while (size--){ 56 *cdst++ = *csrc++; 57 } 58 return _dst; 59 } 60 61 62 63 64 65 66 67 68 void main(){ 69 70 ////////////////////////////////// 71 // convolution kernel parameters 72 // The content of this section is 73 // Philips proprietary information. 
74 /////////////////////////////////// 75 76 int vnorm = 115; 77 int vf[35] = { 1, 1, 2, 2, 2, 78 2, 3, 3, 3, 4, 79 4, 4, 4, 5, 5, 80 5, 5, 5, 5, 5, 81 5, 5, 4, 4, 4, 82 4, 3, 3, 3, 2, 83 2, 2, 2, 1, 1 }; 84 85 int hrange = 100; 86 int hnorm = 201; 87 88 unsigned int date = 0; 89 90 int c; // cluster index for loops 91 int l; // line index for loops 92 int p; // pixel index for loops 93 int x; // filter index for loops 94 95 const unsigned int proc_id = procid(); // processor id 96 const unsigned int nlocal_procs = (int) &NB_PROCS; // number of processors per cluster 97 const unsigned int nclusters = (int) &NB_CLUSTERS; // number of clusters 98 const unsigned int local_id = proc_id % nlocal_procs; // local task id 99 const unsigned int cluster_id = proc_id / nlocal_procs; // cluster task id 100 const unsigned int base = (unsigned int) &seg_heap_base; // base address for shared buffers 101 const unsigned int increment = 0x80000000 / nclusters * 2; // cluster increment 102 const unsigned int nglobal_procs = nclusters * nlocal_procs; // number of tasks 103 const unsigned int npixels = NB_PIXELS; // Number of pixel per frame 104 const unsigned int frame_size = FRAME_SIZE; // Size of 1 frame (in bytes) 105 const unsigned int * ioc_address = (unsigned int *) &seg_ioc_base; 106 const unsigned int block_size = ioc_address[BLOCK_DEVICE_BLOCK_SIZE]; 107 const unsigned int nblocks = frame_size / block_size; // number of blocks per frame 108 109 const unsigned int lines_per_task = NL / nglobal_procs; // number of lines per task 110 const unsigned int lines_per_cluster = NL / nclusters; // number of lines per cluster 111 const unsigned int pixels_per_task = NP / nglobal_procs; // number of columns per task 112 const unsigned int pixels_per_cluster = NP / nclusters; // number of columns per cluster 113 114 int first, last; 115 116 PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", proc_id, proctime()); 117 118 //*(unsigned int *) 0x60000000 = *(unsigned int *) 
0x70000000; 119 //PRINTF("apres acces illegal\n"); 120 121 ///////////////////////// 122 // parameters checking // 123 ///////////////////////// 124 125 126 if ((nlocal_procs != 1) && (nlocal_procs != 2) && (nlocal_procs != 4)){ 127 PRINTF("NB_PROCS must be 1, 2 or 4\n"); 128 exit(); 129 } 130 131 if ((nclusters != 4) && (nclusters != 8) && (nclusters != 16) && 132 (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256)){ 133 PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n"); 134 exit(); 135 } 136 137 if (proc_id >= nglobal_procs){ 138 PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id); 139 exit(); 140 } 141 142 if (NL % nclusters != 0){ 143 PRINTF("NB_CLUSTERS must be a divider of NL"); 144 exit(); 145 } 146 147 if (NP % nclusters != 0){ 148 PRINTF("NB_CLUSTERS must be a divider of NP"); 149 exit(); 150 } 151 152 153 // Arrays of pointers on the shared, distributed buffers 154 // containing the images (sized for the worst case : 256 clusters) 155 unsigned short * A[NB_CLUSTER_MAX]; 156 int * B[NB_CLUSTER_MAX]; 157 int * C[NB_CLUSTER_MAX]; 158 int * D[NB_CLUSTER_MAX]; 159 unsigned char * Z[NB_CLUSTER_MAX]; 160 161 // Arrays of pointers on the instrumentation arrays 162 // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters) 163 // each pointer points on the base adress of an array of 4 (NPROCS max) unsigned int 164 unsigned int * LOAD_START[NB_CLUSTER_MAX]; 165 unsigned int * LOAD_END[NB_CLUSTER_MAX]; 166 unsigned int * VERT_START[NB_CLUSTER_MAX]; 167 unsigned int * VERT_END[NB_CLUSTER_MAX]; 168 unsigned int * HORI_START[NB_CLUSTER_MAX]; 169 unsigned int * HORI_END[NB_CLUSTER_MAX]; 170 unsigned int * DISP_START[NB_CLUSTER_MAX]; 171 unsigned int * DISP_END[NB_CLUSTER_MAX]; 172 173 // The shared, distributed buffers addresses are computed 174 // from the seg_heap_base value defined in the ldscript file 175 // and from the cluster increment = 4Gbytes/nclusters. 
176 // These arrays of pointers are identical and 177 // replicated in the stack of each task 178 for (c = 0; c < nclusters; c++){ 179 unsigned int offset = base + increment * c; 180 A[c] = (unsigned short *) (offset ); 181 B[c] = (int *) (offset + frame_size * 1 / nclusters); // We increment by 2 * frame_size 182 C[c] = (int *) (offset + frame_size * 3 / nclusters); // because sizeof(int) = 2*sizeof(short) 183 D[c] = (int *) (offset + frame_size * 5 / nclusters); // so an array of frame_size elements of type 184 Z[c] = (unsigned char *) (offset + frame_size * 7 / nclusters); // int can contain the equivalent of 2 frames 185 186 offset = base + increment * c + frame_size * 8 / nclusters; 187 LOAD_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 0); 188 LOAD_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 1); 189 VERT_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 2); 190 VERT_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 3); 191 HORI_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 4); 192 HORI_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 5); 193 DISP_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 6); 194 DISP_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 7); 195 } 196 197 PRINTF("NB_CLUSTERS = %d\n", nclusters); 198 PRINTF("NB_LOCAL_PROCS = %d\n", nlocal_procs); 199 PRINTF("NB_GLOBAL_PROCS = %d\n", nglobal_procs); 200 PRINTF("NB_PIXELS = %d\n", npixels); 201 PRINTF("PIXEL_SIZE = %d\n", PIXEL_SIZE); 202 PRINTF("FRAME_SIZE = %d\n", frame_size); 203 PRINTF("BLOCK_SIZE = %d\n", block_size); 204 PRINTF("NB_BLOCKS = %d\n\n", nblocks); 205 206 207 PRINTF("*** Starting barrier init at cycle %d ***\n", proctime()); 208 209 // barriers initialization 210 barrier_init(0, nglobal_procs); 211 barrier_init(1, nglobal_procs); 212 barrier_init(2, 
nglobal_procs); 213 barrier_init(3, nglobal_procs); 214 215 PRINTF("*** Completing barrier init at cycle %d ***\n", proctime()); 216 217 218 //////////////////////////////////////////////////////// 219 // pseudo parallel load from disk to A[c] buffers 220 // only task running on processor with (local_id==0) does it 221 // nblocks/nclusters are loaded in each cluster 222 //////////////////////////////////////////////////////// 223 224 if (local_id == 0){ 225 int p; 226 date = proctime(); 227 PRINTF("\n*** Starting load at cycle %d\n", date); 228 for (p = 0; p < nlocal_procs; p++){ 229 LOAD_START[cluster_id][p] = date; 230 } 231 232 if (ioc_read(nblocks*cluster_id/nclusters, A[cluster_id], nblocks/nclusters)){ 233 PRINTF("echec ioc_read\n"); 234 exit(1); 235 } 236 if (ioc_completed()){ 237 PRINTF("echec ioc_completed\n"); 238 exit(1); 239 } 240 241 date = proctime(); 242 PRINTF("*** Completing load at cycle %d\n", date); 243 for (p = 0; p < nlocal_procs; p++){ 244 LOAD_END[cluster_id][p] = date; 245 } 246 } 247 248 barrier_wait(0); 249 250 251 //////////////////////////////////////////////////////// 252 // parallel horizontal filter : 253 // B <= transpose(FH(A)) 254 // D <= A - FH(A) 255 // Each task computes (NL/nglobal_procs) lines 256 // The image must be extended : 257 // if (z<0) TA(cluster_id,l,z) == TA(cluster_id,l,0) 258 // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1) 259 //////////////////////////////////////////////////////// 260 261 date = proctime(); 262 PRINTF("\n*** Starting horizontal filter at cycle %d\n", date); 263 HORI_START[cluster_id][local_id] = date; 264 265 // l = absolute line index / p = absolute pixel index 266 // first & last define which lines are handled by a given task(cluster_id,local_id) 267 268 first = (cluster_id * nlocal_procs + local_id) * lines_per_task; 269 last = first + lines_per_task; 270 271 for (l = first; l < last; l++){ 272 // src_c and src_l are the cluster index and the line index for A & D 273 int src_c = l 
/ lines_per_cluster; 274 int src_l = l % lines_per_cluster; 275 276 // We use the specific values of the horizontal ep-filter for optimisation: 277 // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1] 278 // To minimize the number of tests, the loop on pixels is split in three domains 279 280 int sum_p = (hrange + 2) * TA(src_c, src_l, 0); 281 for (x = 1; x < hrange; x++){ 282 sum_p = sum_p + TA(src_c, src_l, x); 283 } 284 285 // first domain : from 0 to hrange 286 for (p = 0; p < hrange + 1; p++){ 287 // dst_c and dst_p are the cluster index and the pixel index for B 288 int dst_c = p / pixels_per_cluster; 289 int dst_p = p % pixels_per_cluster; 290 sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0); 291 TB(dst_c, dst_p, l) = sum_p / hnorm; 292 TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; 293 } 294 // second domain : from (hrange+1) to (NP-hrange-1) 295 for (p = hrange + 1; p < NP - hrange; p++){ 296 // dst_c and dst_p are the cluster index and the pixel index for B 297 int dst_c = p / pixels_per_cluster; 298 int dst_p = p % pixels_per_cluster; 299 sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, p - hrange - 1); 300 TB(dst_c, dst_p, l) = sum_p / hnorm; 301 TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; 302 } 303 // third domain : from (NP-hrange) to (NP-1) 304 for (p = NP - hrange; p < NP; p++){ 305 // dst_c and dst_p are the cluster index and the pixel index for B 306 int dst_c = p / pixels_per_cluster; 307 int dst_p = p % pixels_per_cluster; 308 sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) - (int) TA(src_c, src_l, p - hrange - 1); 309 TB(dst_c, dst_p, l) = sum_p / hnorm; 310 TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; 311 } 312 313 PRINTF(" - line %d computed at cycle %d\n", l, proctime()); 314 } 315 316 date = proctime(); 317 PRINTF("*** Completing horizontal filter at cycle %d\n", date); 318 HORI_END[cluster_id][local_id] = date; 319 320 
barrier_wait(1); 321 322 323 ////////////////////////////////////////////////////////// 324 // parallel vertical filter : 325 // C <= transpose(FV(B)) 326 // Each task computes (NP/nglobal_procs) columns 327 // The image must be extended : 328 // if (l<0) TB(cluster_id,p,x) == TB(cluster_id,p,0) 329 // if (l>NL-1) TB(cluster_id,p,x) == TB(cluster_id,p,NL-1) 330 ////////////////////////////////////////////////////////// 331 332 date = proctime(); 333 PRINTF("\n*** starting vertical filter at cycle %d\n", date); 334 VERT_START[cluster_id][local_id] = date; 335 336 // l = absolute line index / p = absolute pixel index 337 // first & last define which pixels are handled by a given task(cluster_id,local_id) 338 339 first = (cluster_id * nlocal_procs + local_id) * pixels_per_task; 340 last = first + pixels_per_task; 341 342 for (p = first; p < last; p++){ 343 // src_c and src_p are the cluster index and the pixel index for B 344 int src_c = p / pixels_per_cluster; 345 int src_p = p % pixels_per_cluster; 346 347 int sum_l; 348 349 // We use the specific values of the vertical ep-filter 350 // To minimize the number of tests, the NL lines are split in three domains 351 352 // first domain : explicit computation for the first 18 values 353 for (l = 0; l < 18; l++){ 354 // dst_c and dst_l are the cluster index and the line index for C 355 int dst_c = l / lines_per_cluster; 356 int dst_l = l % lines_per_cluster; 357 358 for (x = 0, sum_l = 0; x < 35; x++){ 359 sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l - 17 + x,0) ); 360 } 361 TC(dst_c, dst_l, p) = sum_l / vnorm; 362 } 363 // second domain 364 for (l = 18; l < NL - 17; l++){ 365 // dst_c and dst_l are the cluster index and the line index for C 366 int dst_c = l / lines_per_cluster; 367 int dst_l = l % lines_per_cluster; 368 369 sum_l = sum_l + TB(src_c, src_p, l + 4) 370 + TB(src_c, src_p, l + 8) 371 + TB(src_c, src_p, l + 11) 372 + TB(src_c, src_p, l + 15) 373 + TB(src_c, src_p, l + 17) 374 - TB(src_c, src_p, l - 5) 375 
- TB(src_c, src_p, l - 9) 376 - TB(src_c, src_p, l - 12) 377 - TB(src_c, src_p, l - 16) 378 - TB(src_c, src_p, l - 18); 379 TC(dst_c, dst_l, p) = sum_l / vnorm; 380 } 381 // third domain 382 for (l = NL - 17; l < NL; l++){ 383 // dst_c and dst_l are the cluster index and the line index for C 384 int dst_c = l / lines_per_cluster; 385 int dst_l = l % lines_per_cluster; 386 387 sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1)) 388 + TB(src_c, src_p, min(l + 8, NL - 1)) 389 + TB(src_c, src_p, min(l + 11, NL - 1)) 390 + TB(src_c, src_p, min(l + 15, NL - 1)) 391 + TB(src_c, src_p, min(l + 17, NL - 1)) 392 - TB(src_c, src_p, l - 5) 393 - TB(src_c, src_p, l - 9) 394 - TB(src_c, src_p, l - 12) 395 - TB(src_c, src_p, l - 16) 396 - TB(src_c, src_p, l - 18); 397 TC(dst_c, dst_l, p) = sum_l / vnorm; 398 } 399 PRINTF(" - column %d computed at cycle %d\n", p, proctime()); 400 } 401 402 date = proctime(); 403 PRINTF("*** Completing vertical filter at cycle %d\n", date); 404 VERT_END[cluster_id][local_id] = date; 405 406 barrier_wait(2); 407 408 409 //////////////////////////////////////////////////////////////// 410 // final computation and parallel display 411 // Z <= D + C 412 // Each processor uses its private DMA channel to display 413 // the resulting image, line per line (one byte per pixel). 414 // Each processor computes & displays (NL/nglobal_procs) lines. 
415 //////////////////////////////////////////////////////////////// 416 417 date = proctime(); 418 PRINTF("\n*** Starting display at cycle %d\n", date); 419 DISP_START[cluster_id][local_id] = date; 420 421 first = local_id * lines_per_task; 422 last = first + lines_per_task; 423 424 for (l = first; l < last; l++){ 425 for (p = 0; p < NP; p++){ 426 TZ(cluster_id,l,p) = (unsigned char) (((TD(cluster_id,l,p) + TC(cluster_id,l,p)) >> 8) & 0xFF); 427 } 428 fb_sync_write(NP * (cluster_id * lines_per_cluster + l), &TZ(cluster_id,l,0), NP); 429 } 430 431 #if 0 432 for (l = first; l < last; l++){ 433 for (p = 0; p < NP; p++){ 434 TA(cluster_id, l, p) = (unsigned char) ((TA(cluster_id, l, p) >> 8) & 0xFF); 435 } 436 fb_write(NP * (cluster_id * lines_per_cluster + l), &TA(cluster_id,l,0), NP); 437 } 438 #endif 439 440 date = proctime(); 441 PRINTF("*** Completing display at cycle %d\n", date); 442 DISP_END[cluster_id][local_id] = date; 443 444 barrier_wait(3); 445 446 447 ///////////////////////////////////////////////////////// 448 // Instrumentation (done by processor 0 in cluster 0) 449 ///////////////////////////////////////////////////////// 450 451 if (proc_id == 0){ 452 date = proctime(); 453 PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date); 454 455 int cc, pp; 456 unsigned int min_load_start = INT_MAX; 457 unsigned int max_load_start = 0; 458 unsigned int min_load_ended = INT_MAX; 459 unsigned int max_load_ended = 0; 460 461 unsigned int min_hori_start = INT_MAX; 462 unsigned int max_hori_start = 0; 463 unsigned int min_hori_ended = INT_MAX; 464 unsigned int max_hori_ended = 0; 465 466 unsigned int min_vert_start = INT_MAX; 467 unsigned int max_vert_start = 0; 468 unsigned int min_vert_ended = INT_MAX; 469 unsigned int max_vert_ended = 0; 470 471 unsigned int min_disp_start = INT_MAX; 472 unsigned int max_disp_start = 0; 473 unsigned int min_disp_ended = INT_MAX; 474 unsigned int max_disp_ended = 0; 475 476 for (cc = 0; cc < nclusters; cc++){ 477 for 
(pp = 0; pp < nlocal_procs; pp++ ){ 478 if (LOAD_START[cc][pp] < min_load_start){ 479 min_load_start = LOAD_START[cc][pp]; 480 } 481 if (LOAD_START[cc][pp] > max_load_start){ 482 max_load_start = LOAD_START[cc][pp]; 483 } 484 if (LOAD_END[cc][pp] < min_load_ended){ 485 min_load_ended = LOAD_END[cc][pp]; 486 } 487 if (LOAD_END[cc][pp] > max_load_ended){ 488 max_load_ended = LOAD_END[cc][pp]; 489 } 490 491 if (HORI_START[cc][pp] < min_hori_start){ 492 min_hori_start = HORI_START[cc][pp]; 493 } 494 if (HORI_START[cc][pp] > max_hori_start){ 495 max_hori_start = HORI_START[cc][pp]; 496 } 497 if (HORI_END[cc][pp] < min_hori_ended){ 498 min_hori_ended = HORI_END[cc][pp]; 499 } 500 if (HORI_END[cc][pp] > max_hori_ended){ 501 max_hori_ended = HORI_END[cc][pp]; 502 } 503 504 if (VERT_START[cc][pp] < min_vert_start){ 505 min_vert_start = VERT_START[cc][pp]; 506 } 507 if (VERT_START[cc][pp] > max_vert_start){ 508 max_vert_start = VERT_START[cc][pp]; 509 } 510 if (VERT_END[cc][pp] < min_vert_ended){ 511 min_vert_ended = VERT_END[cc][pp]; 512 } 513 if (VERT_END[cc][pp] > max_vert_ended){ 514 max_vert_ended = VERT_END[cc][pp]; 515 } 516 517 if (DISP_START[cc][pp] < min_disp_start){ 518 min_disp_start = DISP_START[cc][pp]; 519 } 520 if (DISP_START[cc][pp] > max_disp_start){ 521 max_disp_start = DISP_START[cc][pp]; 522 } 523 if (DISP_END[cc][pp] < min_disp_ended){ 524 min_disp_ended = DISP_END[cc][pp]; 525 } 526 if (DISP_END[cc][pp] > max_disp_ended){ 527 max_disp_ended = DISP_END[cc][pp]; 528 } 529 } 530 } 531 PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n", 532 min_load_start, max_load_start, (min_load_start+max_load_start) / 2, max_load_start-min_load_start); 533 PRINTF(" - LOAD_END : min = %d / max = %d / med = %d / delta = %d\n", 534 min_load_ended, max_load_ended, (min_load_ended+max_load_ended) / 2, max_load_ended-min_load_ended); 535 536 PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n", 537 min_hori_start, max_hori_start, 
(min_hori_start+max_hori_start) / 2, max_hori_start-min_hori_start); 538 PRINTF(" - HORI_END : min = %d / max = %d / med = %d / delta = %d\n", 539 min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended) / 2, max_hori_ended-min_hori_ended); 540 541 PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n", 542 min_vert_start, max_vert_start, (min_vert_start+max_vert_start) / 2, max_vert_start-min_vert_start); 543 PRINTF(" - VERT_END : min = %d / max = %d / med = %d / delta = %d\n", 544 min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended) / 2, max_vert_ended-min_vert_ended); 545 546 PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", 547 min_disp_start, max_disp_start, (min_disp_start+max_disp_start) / 2, max_disp_start-min_disp_start); 548 PRINTF(" - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", 549 min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended) / 2, max_disp_ended-min_disp_ended); 550 551 PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended); 552 PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended); 553 PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended); 554 555 PRINTF(" - LOAD = %d\n", max_load_ended); 556 PRINTF(" - FILTER = %d\n", max_vert_ended - max_load_ended); 557 PRINTF(" - DISPLAY = %d\n", max_disp_ended - max_vert_ended); 558 559 PRINTF("\nBEGIN LOAD_START\n"); 560 for (cc = 0; cc < nclusters; cc++){ 561 for (pp = 0; pp < nlocal_procs; pp++){ 562 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_START[cc][pp]); 563 } 564 } 565 PRINTF("END\n"); 566 567 PRINTF("\nBEGIN LOAD_END\n"); 568 for (cc = 0; cc < nclusters; cc++){ 569 for (pp = 0; pp < nlocal_procs; pp++){ 570 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_END[cc][pp]); 571 } 572 } 573 PRINTF("END\n"); 574 575 PRINTF("\nBEGIN HORI_START\n"); 576 for (cc = 0; cc < nclusters; cc++){ 577 for (pp = 0; pp < nlocal_procs; pp++){ 578 PRINTF("cluster= %d proc= 
%d date= %d\n", cc, pp, HORI_START[cc][pp]); 579 } 580 } 581 PRINTF("END\n"); 582 583 PRINTF("\nBEGIN HORI_END\n"); 584 for (cc = 0; cc < nclusters; cc++){ 585 for (pp = 0; pp < nlocal_procs; pp++){ 586 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_END[cc][pp]); 587 } 588 } 589 PRINTF("END\n"); 590 591 PRINTF("\nBEGIN VERT_START\n"); 592 for (cc = 0; cc < nclusters; cc++){ 593 for (pp = 0; pp < nlocal_procs; pp++){ 594 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_START[cc][pp]); 595 } 596 } 597 PRINTF("END\n"); 598 599 PRINTF("\nBEGIN VERT_END\n"); 600 for (cc = 0; cc < nclusters; cc++){ 601 for (pp = 0; pp < nlocal_procs; pp++ ){ 602 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_END[cc][pp]); 603 } 604 } 605 PRINTF("END\n"); 606 607 PRINTF("\nBEGIN DISP_START\n"); 608 for (cc = 0; cc < nclusters; cc++){ 609 for (pp = 0; pp < nlocal_procs; pp++){ 610 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_START[cc][pp]); 611 } 612 } 613 PRINTF("END\n"); 614 615 PRINTF("\nBEGIN DISP_END\n"); 616 for (cc = 0; cc < nclusters; cc++){ 617 for (pp = 0; pp < nlocal_procs; pp++){ 618 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_END[cc][pp]); 619 } 620 } 621 PRINTF("END\n"); 622 } 623 624 while(1); 574 625 575 626 } // end main() 576 627 628 // Local Variables: 629 // tab-width: 3 630 // c-basic-offset: 3 631 // c-file-offsets:((innamespace . 0)(inline-open . 0)) 632 // indent-tabs-mode: nil 633 // End: 634 635 // vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3 636 637 -
trunk/softs/soft_transpose_giet/main.c
r244 r248 9 9 #define NB_CLUSTER_MAX 256 10 10 11 #define PRINTF(...) ({ if (local_id == 0) { tty_printf(__VA_ARGS__); } }) 11 #define PRINTF(...) ({ if (proc_id == 0) { tty_printf(__VA_ARGS__); } }) 12 13 //#define DISPLAY_ONLY 12 14 13 15 /////////////////////////////////////////// … … 104 106 105 107 106 PRINTF("*** starting barrier init at cycle %d ***\n", proctime());108 PRINTF("*** Starting barrier init at cycle %d ***\n", proctime()); 107 109 108 110 // barriers initialization … … 111 113 barrier_init(2, nglobal_procs); 112 114 113 PRINTF("*** completing barrier init at cycle %d ***\n", proctime());115 PRINTF("*** Completing barrier init at cycle %d ***\n", proctime()); 114 116 115 117 // Main loop (on frames) … … 127 129 LOAD_START[cluster_id][p] = date; 128 130 } 129 tty_printf(" block_device offset : %d\n", nblocks * cluster_id / nclusters);130 131 if (ioc_read(frame * nblocks + nblocks * cluster_id / nclusters, A[cluster_id], nblocks / nclusters)){ 131 tty_printf("echec ioc_read\n");132 PRINTF("echec ioc_read\n"); 132 133 exit(); 133 134 } 134 135 if (ioc_completed()){ 135 tty_printf("echec ioc_completed\n");136 PRINTF("echec ioc_completed\n"); 136 137 exit(); 137 138 } … … 150 151 // (p,l) are the (x,y) pixel coordinates in the source frame 151 152 153 #ifndef DISPLAY_ONLY 152 154 date = proctime(); 153 155 PRINTF("\n*** Starting transpose for frame %d at cycle %d\n", frame, date); … … 173 175 PRINTF("*** Completing transpose for frame %d at cycle %d\n", frame, date); 174 176 TRSP_END[cluster_id][local_id] = date; 175 176 177 barrier_wait(1); 178 #endif 177 179 178 180 // parallel display from B[c] to frame buffer … … 184 186 185 187 unsigned int npxt = npixels / nglobal_procs; // number of pixels per proc 186 if (npixels - npxt * nglobal_procs != 0){ 187 tty_printf("*** Error line %d\n", __LINE__); 188 189 #ifndef DISPLAY_ONLY 190 if (fb_write(npxt * proc_id, B[cluster_id] + npxt * local_id, npxt)){ 191 PRINTF("[%d]: echec fb_sync_write\n", proc_id); 
188 192 exit(); 189 193 } 190 tty_printf(" npxt : %d\n", npxt); 191 192 if (fb_write(npxt * proc_id, B[cluster_id] + npxt * local_id, npxt)){ 193 tty_printf("[%d]: echec fb_sync_write\n", proc_id); 194 #else 195 if (fb_write(npxt * proc_id, A[cluster_id] + npxt * local_id, npxt)){ 196 PRINTF("[%d]: echec fb_sync_write\n", proc_id); 194 197 exit(); 195 198 } 196 197 PRINTF(" After fb_write and before fb_completed\n"); 199 #endif 198 200 199 201 if (fb_completed()){ 200 tty_printf("[%d]: echec fb_completed\n", proc_id);202 PRINTF("[%d]: echec fb_completed\n", proc_id); 201 203 exit(); 202 204 }
Note: See TracChangeset
for help on using the changeset viewer.