[248] | 1 | |
---|
| 2 | #include "limits.h" |
---|
[158] | 3 | #include "stdio.h" |
---|
| 4 | |
---|
[248] | 5 | #include "../giet_tsar/block_device.h" |
---|
| 6 | |
---|
////////////////////////////////////
// Image parameters
////////////////////////////////////

#define NB_CLUSTER_MAX 256   // upper bound on the cluster count (sizes the pointer arrays in main)
#define PIXEL_SIZE     2     // bytes per input pixel
#define NL             1024  // number of lines per frame
#define NP             1024  // number of pixels per line

#define NB_PIXELS      ((NP) * (NL))
#define FRAME_SIZE     ((NB_PIXELS) * (PIXEL_SIZE))

// Trace macro: only the task whose proc_id is 0 prints.
// A variable named proc_id must be in scope at the point of use.
// Written as do { } while (0) so that it behaves as one ordinary statement
// without relying on the GNU statement-expression extension.
#define PRINTF(...)    do { if (proc_id == 0) { tty_printf(__VA_ARGS__); } } while (0)

// Accessors for the distributed buffers: c is the cluster index,
// l a line index and p a pixel index.
// A, C, D and Z are stored line after line (NP pixels per line);
// B is stored transposed (NL lines per pixel column).
#define TA(c,l,p) (A[c][((NP) * (l)) + (p)])
#define TB(c,p,l) (B[c][((NL) * (p)) + (l)])
#define TC(c,l,p) (C[c][((NP) * (l)) + (p)])
#define TD(c,l,p) (D[c][((NP) * (l)) + (p)])
#define TZ(c,l,p) (Z[c][((NP) * (l)) + (p)])

// Warning: both macros evaluate their arguments twice,
// do not pass expressions with side effects.
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

///////////////////////////////////////////
// tricks to read parameters from ldscript
///////////////////////////////////////////

// The symbols below are only used through their ADDRESS (see main):
// the struct type is a dummy that is never completed.
struct plaf;

extern struct plaf seg_ioc_base;
extern struct plaf seg_heap_base;
extern struct plaf NB_PROCS;
extern struct plaf NB_CLUSTERS;
| 39 | |
---|
| 40 | |
---|
// Required when initializing an array all at once: the compiler may lower
// such an initializer to a call to memcpy(), so a local implementation is
// provided here.
//
// Copies size bytes from _src to _dst and returns _dst.
// The regions must not overlap (the copy always runs forward).
// When both pointers are 4-byte aligned the bulk is copied word by word,
// the remaining bytes (or everything, if unaligned) are copied byte by byte.
static void *memcpy(void *_dst, const void *_src, unsigned int size){
   unsigned int *dst = _dst;
   const unsigned int *src = _src;

   // Only the two low address bits matter here; unsigned long avoids a
   // size-mismatched pointer cast on hosts where pointers are 64 bits wide.
   if (!((((unsigned long) dst) | ((unsigned long) src)) & 3)){
      while (size > 3){
         *dst++ = *src++;
         size -= 4;
      }
   }

   // Byte tail: dst / src were advanced by the word loop (if it ran)
   unsigned char *cdst = (unsigned char *) dst;
   const unsigned char *csrc = (const unsigned char *) src;

   while (size--){
      *cdst++ = *csrc++;
   }
   return _dst;
}
[158] | 60 | |
---|
| 61 | |
---|
| 62 | |
---|
| 63 | |
---|
| 64 | |
---|
| 65 | |
---|
| 66 | |
---|
[170] | 67 | |
---|
[248] | 68 | void main(){ |
---|
[170] | 69 | |
---|
[248] | 70 | ////////////////////////////////// |
---|
| 71 | // convolution kernel parameters |
---|
| 72 | // The content of this section is |
---|
| 73 | // Philips proprietary information. |
---|
| 74 | /////////////////////////////////// |
---|
[158] | 75 | |
---|
[248] | 76 | int vnorm = 115; |
---|
| 77 | int vf[35] = { 1, 1, 2, 2, 2, |
---|
| 78 | 2, 3, 3, 3, 4, |
---|
| 79 | 4, 4, 4, 5, 5, |
---|
| 80 | 5, 5, 5, 5, 5, |
---|
| 81 | 5, 5, 4, 4, 4, |
---|
| 82 | 4, 3, 3, 3, 2, |
---|
| 83 | 2, 2, 2, 1, 1 }; |
---|
[158] | 84 | |
---|
[248] | 85 | int hrange = 100; |
---|
| 86 | int hnorm = 201; |
---|
[158] | 87 | |
---|
[248] | 88 | unsigned int date = 0; |
---|
[158] | 89 | |
---|
[248] | 90 | int c; // cluster index for loops |
---|
| 91 | int l; // line index for loops |
---|
| 92 | int p; // pixel index for loops |
---|
| 93 | int x; // filter index for loops |
---|
[158] | 94 | |
---|
[248] | 95 | const unsigned int proc_id = procid(); // processor id |
---|
| 96 | const unsigned int nlocal_procs = (int) &NB_PROCS; // number of processors per cluster |
---|
| 97 | const unsigned int nclusters = (int) &NB_CLUSTERS; // number of clusters |
---|
| 98 | const unsigned int local_id = proc_id % nlocal_procs; // local task id |
---|
| 99 | const unsigned int cluster_id = proc_id / nlocal_procs; // cluster task id |
---|
| 100 | const unsigned int base = (unsigned int) &seg_heap_base; // base address for shared buffers |
---|
| 101 | const unsigned int increment = 0x80000000 / nclusters * 2; // cluster increment |
---|
| 102 | const unsigned int nglobal_procs = nclusters * nlocal_procs; // number of tasks |
---|
| 103 | const unsigned int npixels = NB_PIXELS; // Number of pixel per frame |
---|
| 104 | const unsigned int frame_size = FRAME_SIZE; // Size of 1 frame (in bytes) |
---|
| 105 | const unsigned int * ioc_address = (unsigned int *) &seg_ioc_base; |
---|
| 106 | const unsigned int block_size = ioc_address[BLOCK_DEVICE_BLOCK_SIZE]; |
---|
| 107 | const unsigned int nblocks = frame_size / block_size; // number of blocks per frame |
---|
[158] | 108 | |
---|
[248] | 109 | const unsigned int lines_per_task = NL / nglobal_procs; // number of lines per task |
---|
| 110 | const unsigned int lines_per_cluster = NL / nclusters; // number of lines per cluster |
---|
| 111 | const unsigned int pixels_per_task = NP / nglobal_procs; // number of columns per task |
---|
| 112 | const unsigned int pixels_per_cluster = NP / nclusters; // number of columns per cluster |
---|
[158] | 113 | |
---|
[248] | 114 | int first, last; |
---|
[158] | 115 | |
---|
[248] | 116 | PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", proc_id, proctime()); |
---|
[158] | 117 | |
---|
| 118 | |
---|
[248] | 119 | ///////////////////////// |
---|
| 120 | // parameters checking // |
---|
| 121 | ///////////////////////// |
---|
| 122 | |
---|
[158] | 123 | |
---|
[248] | 124 | if ((nlocal_procs != 1) && (nlocal_procs != 2) && (nlocal_procs != 4)){ |
---|
| 125 | PRINTF("NB_PROCS must be 1, 2 or 4\n"); |
---|
| 126 | exit(); |
---|
| 127 | } |
---|
[158] | 128 | |
---|
[252] | 129 | //////////////////////////////////////////////////////////////////////// |
---|
| 130 | // Warning: NB_CLUSTERS must be at least 4 because of the heap size; // |
---|
| 131 | // if there are less clusters, the heap mixes with the stack // |
---|
| 132 | // (the total heap size must be at least 0x01000000) // |
---|
| 133 | //////////////////////////////////////////////////////////////////////// |
---|
[248] | 134 | if ((nclusters != 4) && (nclusters != 8) && (nclusters != 16) && |
---|
| 135 | (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256)){ |
---|
| 136 | PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n"); |
---|
| 137 | exit(); |
---|
| 138 | } |
---|
[158] | 139 | |
---|
[248] | 140 | if (proc_id >= nglobal_procs){ |
---|
| 141 | PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id); |
---|
| 142 | exit(); |
---|
| 143 | } |
---|
[158] | 144 | |
---|
[248] | 145 | if (NL % nclusters != 0){ |
---|
| 146 | PRINTF("NB_CLUSTERS must be a divider of NL"); |
---|
| 147 | exit(); |
---|
| 148 | } |
---|
[158] | 149 | |
---|
[248] | 150 | if (NP % nclusters != 0){ |
---|
| 151 | PRINTF("NB_CLUSTERS must be a divider of NP"); |
---|
| 152 | exit(); |
---|
| 153 | } |
---|
[158] | 154 | |
---|
| 155 | |
---|
[248] | 156 | // Arrays of pointers on the shared, distributed buffers |
---|
| 157 | // containing the images (sized for the worst case : 256 clusters) |
---|
| 158 | unsigned short * A[NB_CLUSTER_MAX]; |
---|
| 159 | int * B[NB_CLUSTER_MAX]; |
---|
| 160 | int * C[NB_CLUSTER_MAX]; |
---|
| 161 | int * D[NB_CLUSTER_MAX]; |
---|
| 162 | unsigned char * Z[NB_CLUSTER_MAX]; |
---|
[158] | 163 | |
---|
[248] | 164 | // Arrays of pointers on the instrumentation arrays |
---|
| 165 | // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters) |
---|
| 166 | // each pointer points on the base adress of an array of 4 (NPROCS max) unsigned int |
---|
| 167 | unsigned int * LOAD_START[NB_CLUSTER_MAX]; |
---|
| 168 | unsigned int * LOAD_END[NB_CLUSTER_MAX]; |
---|
| 169 | unsigned int * VERT_START[NB_CLUSTER_MAX]; |
---|
| 170 | unsigned int * VERT_END[NB_CLUSTER_MAX]; |
---|
| 171 | unsigned int * HORI_START[NB_CLUSTER_MAX]; |
---|
| 172 | unsigned int * HORI_END[NB_CLUSTER_MAX]; |
---|
| 173 | unsigned int * DISP_START[NB_CLUSTER_MAX]; |
---|
| 174 | unsigned int * DISP_END[NB_CLUSTER_MAX]; |
---|
[158] | 175 | |
---|
[248] | 176 | // The shared, distributed buffers addresses are computed |
---|
| 177 | // from the seg_heap_base value defined in the ldscript file |
---|
| 178 | // and from the cluster increment = 4Gbytes/nclusters. |
---|
| 179 | // These arrays of pointers are identical and |
---|
| 180 | // replicated in the stack of each task |
---|
| 181 | for (c = 0; c < nclusters; c++){ |
---|
| 182 | unsigned int offset = base + increment * c; |
---|
[252] | 183 | A[c] = (unsigned short *) (offset); |
---|
[248] | 184 | B[c] = (int *) (offset + frame_size * 1 / nclusters); // We increment by 2 * frame_size |
---|
| 185 | C[c] = (int *) (offset + frame_size * 3 / nclusters); // because sizeof(int) = 2*sizeof(short) |
---|
| 186 | D[c] = (int *) (offset + frame_size * 5 / nclusters); // so an array of frame_size elements of type |
---|
| 187 | Z[c] = (unsigned char *) (offset + frame_size * 7 / nclusters); // int can contain the equivalent of 2 frames |
---|
[158] | 188 | |
---|
[248] | 189 | offset = base + increment * c + frame_size * 8 / nclusters; |
---|
| 190 | LOAD_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 0); |
---|
| 191 | LOAD_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 1); |
---|
| 192 | VERT_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 2); |
---|
| 193 | VERT_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 3); |
---|
| 194 | HORI_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 4); |
---|
| 195 | HORI_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 5); |
---|
| 196 | DISP_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 6); |
---|
| 197 | DISP_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 7); |
---|
| 198 | } |
---|
[158] | 199 | |
---|
[248] | 200 | PRINTF("NB_CLUSTERS = %d\n", nclusters); |
---|
| 201 | PRINTF("NB_LOCAL_PROCS = %d\n", nlocal_procs); |
---|
| 202 | PRINTF("NB_GLOBAL_PROCS = %d\n", nglobal_procs); |
---|
| 203 | PRINTF("NB_PIXELS = %d\n", npixels); |
---|
| 204 | PRINTF("PIXEL_SIZE = %d\n", PIXEL_SIZE); |
---|
| 205 | PRINTF("FRAME_SIZE = %d\n", frame_size); |
---|
| 206 | PRINTF("BLOCK_SIZE = %d\n", block_size); |
---|
| 207 | PRINTF("NB_BLOCKS = %d\n\n", nblocks); |
---|
[158] | 208 | |
---|
| 209 | |
---|
[248] | 210 | PRINTF("*** Starting barrier init at cycle %d ***\n", proctime()); |
---|
[158] | 211 | |
---|
[248] | 212 | // barriers initialization |
---|
| 213 | barrier_init(0, nglobal_procs); |
---|
| 214 | barrier_init(1, nglobal_procs); |
---|
| 215 | barrier_init(2, nglobal_procs); |
---|
| 216 | barrier_init(3, nglobal_procs); |
---|
[158] | 217 | |
---|
[248] | 218 | PRINTF("*** Completing barrier init at cycle %d ***\n", proctime()); |
---|
[158] | 219 | |
---|
| 220 | |
---|
[248] | 221 | //////////////////////////////////////////////////////// |
---|
| 222 | // pseudo parallel load from disk to A[c] buffers |
---|
| 223 | // only task running on processor with (local_id==0) does it |
---|
| 224 | // nblocks/nclusters are loaded in each cluster |
---|
| 225 | //////////////////////////////////////////////////////// |
---|
[158] | 226 | |
---|
[248] | 227 | if (local_id == 0){ |
---|
| 228 | int p; |
---|
| 229 | date = proctime(); |
---|
| 230 | PRINTF("\n*** Starting load at cycle %d\n", date); |
---|
| 231 | for (p = 0; p < nlocal_procs; p++){ |
---|
| 232 | LOAD_START[cluster_id][p] = date; |
---|
| 233 | } |
---|
[158] | 234 | |
---|
[248] | 235 | if (ioc_read(nblocks*cluster_id/nclusters, A[cluster_id], nblocks/nclusters)){ |
---|
| 236 | PRINTF("echec ioc_read\n"); |
---|
| 237 | exit(1); |
---|
| 238 | } |
---|
| 239 | if (ioc_completed()){ |
---|
| 240 | PRINTF("echec ioc_completed\n"); |
---|
| 241 | exit(1); |
---|
| 242 | } |
---|
[158] | 243 | |
---|
[248] | 244 | date = proctime(); |
---|
| 245 | PRINTF("*** Completing load at cycle %d\n", date); |
---|
| 246 | for (p = 0; p < nlocal_procs; p++){ |
---|
| 247 | LOAD_END[cluster_id][p] = date; |
---|
| 248 | } |
---|
| 249 | } |
---|
[158] | 250 | |
---|
[248] | 251 | barrier_wait(0); |
---|
[158] | 252 | |
---|
| 253 | |
---|
[248] | 254 | //////////////////////////////////////////////////////// |
---|
| 255 | // parallel horizontal filter : |
---|
| 256 | // B <= transpose(FH(A)) |
---|
| 257 | // D <= A - FH(A) |
---|
| 258 | // Each task computes (NL/nglobal_procs) lines |
---|
| 259 | // The image must be extended : |
---|
| 260 | // if (z<0) TA(cluster_id,l,z) == TA(cluster_id,l,0) |
---|
| 261 | // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1) |
---|
| 262 | //////////////////////////////////////////////////////// |
---|
[158] | 263 | |
---|
[248] | 264 | date = proctime(); |
---|
| 265 | PRINTF("\n*** Starting horizontal filter at cycle %d\n", date); |
---|
| 266 | HORI_START[cluster_id][local_id] = date; |
---|
[158] | 267 | |
---|
[248] | 268 | // l = absolute line index / p = absolute pixel index |
---|
| 269 | // first & last define which lines are handled by a given task(cluster_id,local_id) |
---|
[158] | 270 | |
---|
[248] | 271 | first = (cluster_id * nlocal_procs + local_id) * lines_per_task; |
---|
| 272 | last = first + lines_per_task; |
---|
[162] | 273 | |
---|
[248] | 274 | for (l = first; l < last; l++){ |
---|
| 275 | // src_c and src_l are the cluster index and the line index for A & D |
---|
| 276 | int src_c = l / lines_per_cluster; |
---|
| 277 | int src_l = l % lines_per_cluster; |
---|
[158] | 278 | |
---|
[248] | 279 | // We use the specific values of the horizontal ep-filter for optimisation: |
---|
| 280 | // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1] |
---|
| 281 | // To minimize the number of tests, the loop on pixels is split in three domains |
---|
[158] | 282 | |
---|
[248] | 283 | int sum_p = (hrange + 2) * TA(src_c, src_l, 0); |
---|
| 284 | for (x = 1; x < hrange; x++){ |
---|
| 285 | sum_p = sum_p + TA(src_c, src_l, x); |
---|
| 286 | } |
---|
[170] | 287 | |
---|
[248] | 288 | // first domain : from 0 to hrange |
---|
| 289 | for (p = 0; p < hrange + 1; p++){ |
---|
| 290 | // dst_c and dst_p are the cluster index and the pixel index for B |
---|
| 291 | int dst_c = p / pixels_per_cluster; |
---|
| 292 | int dst_p = p % pixels_per_cluster; |
---|
| 293 | sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0); |
---|
| 294 | TB(dst_c, dst_p, l) = sum_p / hnorm; |
---|
| 295 | TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; |
---|
| 296 | } |
---|
| 297 | // second domain : from (hrange+1) to (NP-hrange-1) |
---|
| 298 | for (p = hrange + 1; p < NP - hrange; p++){ |
---|
| 299 | // dst_c and dst_p are the cluster index and the pixel index for B |
---|
| 300 | int dst_c = p / pixels_per_cluster; |
---|
| 301 | int dst_p = p % pixels_per_cluster; |
---|
| 302 | sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, p - hrange - 1); |
---|
| 303 | TB(dst_c, dst_p, l) = sum_p / hnorm; |
---|
| 304 | TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; |
---|
| 305 | } |
---|
| 306 | // third domain : from (NP-hrange) to (NP-1) |
---|
| 307 | for (p = NP - hrange; p < NP; p++){ |
---|
| 308 | // dst_c and dst_p are the cluster index and the pixel index for B |
---|
| 309 | int dst_c = p / pixels_per_cluster; |
---|
| 310 | int dst_p = p % pixels_per_cluster; |
---|
| 311 | sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) - (int) TA(src_c, src_l, p - hrange - 1); |
---|
| 312 | TB(dst_c, dst_p, l) = sum_p / hnorm; |
---|
| 313 | TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; |
---|
| 314 | } |
---|
[170] | 315 | |
---|
[248] | 316 | PRINTF(" - line %d computed at cycle %d\n", l, proctime()); |
---|
| 317 | } |
---|
[170] | 318 | |
---|
[248] | 319 | date = proctime(); |
---|
| 320 | PRINTF("*** Completing horizontal filter at cycle %d\n", date); |
---|
| 321 | HORI_END[cluster_id][local_id] = date; |
---|
[170] | 322 | |
---|
[248] | 323 | barrier_wait(1); |
---|
[170] | 324 | |
---|
| 325 | |
---|
[248] | 326 | ////////////////////////////////////////////////////////// |
---|
| 327 | // parallel vertical filter : |
---|
| 328 | // C <= transpose(FV(B)) |
---|
| 329 | // Each task computes (NP/nglobal_procs) columns |
---|
| 330 | // The image must be extended : |
---|
| 331 | // if (l<0) TB(cluster_id,p,x) == TB(cluster_id,p,0) |
---|
| 332 | // if (l>NL-1) TB(cluster_id,p,x) == TB(cluster_id,p,NL-1) |
---|
| 333 | ////////////////////////////////////////////////////////// |
---|
[174] | 334 | |
---|
[248] | 335 | date = proctime(); |
---|
| 336 | PRINTF("\n*** starting vertical filter at cycle %d\n", date); |
---|
| 337 | VERT_START[cluster_id][local_id] = date; |
---|
[170] | 338 | |
---|
[248] | 339 | // l = absolute line index / p = absolute pixel index |
---|
| 340 | // first & last define which pixels are handled by a given task(cluster_id,local_id) |
---|
[170] | 341 | |
---|
[248] | 342 | first = (cluster_id * nlocal_procs + local_id) * pixels_per_task; |
---|
| 343 | last = first + pixels_per_task; |
---|
[170] | 344 | |
---|
[248] | 345 | for (p = first; p < last; p++){ |
---|
| 346 | // src_c and src_p are the cluster index and the pixel index for B |
---|
| 347 | int src_c = p / pixels_per_cluster; |
---|
| 348 | int src_p = p % pixels_per_cluster; |
---|
[170] | 349 | |
---|
[248] | 350 | int sum_l; |
---|
[170] | 351 | |
---|
[248] | 352 | // We use the specific values of the vertical ep-filter |
---|
| 353 | // To minimize the number of tests, the NL lines are split in three domains |
---|
[170] | 354 | |
---|
[248] | 355 | // first domain : explicit computation for the first 18 values |
---|
| 356 | for (l = 0; l < 18; l++){ |
---|
| 357 | // dst_c and dst_l are the cluster index and the line index for C |
---|
| 358 | int dst_c = l / lines_per_cluster; |
---|
| 359 | int dst_l = l % lines_per_cluster; |
---|
[170] | 360 | |
---|
[248] | 361 | for (x = 0, sum_l = 0; x < 35; x++){ |
---|
| 362 | sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l - 17 + x,0) ); |
---|
| 363 | } |
---|
| 364 | TC(dst_c, dst_l, p) = sum_l / vnorm; |
---|
| 365 | } |
---|
| 366 | // second domain |
---|
| 367 | for (l = 18; l < NL - 17; l++){ |
---|
| 368 | // dst_c and dst_l are the cluster index and the line index for C |
---|
| 369 | int dst_c = l / lines_per_cluster; |
---|
| 370 | int dst_l = l % lines_per_cluster; |
---|
[170] | 371 | |
---|
[248] | 372 | sum_l = sum_l + TB(src_c, src_p, l + 4) |
---|
| 373 | + TB(src_c, src_p, l + 8) |
---|
| 374 | + TB(src_c, src_p, l + 11) |
---|
| 375 | + TB(src_c, src_p, l + 15) |
---|
| 376 | + TB(src_c, src_p, l + 17) |
---|
| 377 | - TB(src_c, src_p, l - 5) |
---|
| 378 | - TB(src_c, src_p, l - 9) |
---|
| 379 | - TB(src_c, src_p, l - 12) |
---|
| 380 | - TB(src_c, src_p, l - 16) |
---|
| 381 | - TB(src_c, src_p, l - 18); |
---|
| 382 | TC(dst_c, dst_l, p) = sum_l / vnorm; |
---|
| 383 | } |
---|
| 384 | // third domain |
---|
| 385 | for (l = NL - 17; l < NL; l++){ |
---|
| 386 | // dst_c and dst_l are the cluster index and the line index for C |
---|
| 387 | int dst_c = l / lines_per_cluster; |
---|
| 388 | int dst_l = l % lines_per_cluster; |
---|
[174] | 389 | |
---|
[248] | 390 | sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1)) |
---|
| 391 | + TB(src_c, src_p, min(l + 8, NL - 1)) |
---|
| 392 | + TB(src_c, src_p, min(l + 11, NL - 1)) |
---|
| 393 | + TB(src_c, src_p, min(l + 15, NL - 1)) |
---|
| 394 | + TB(src_c, src_p, min(l + 17, NL - 1)) |
---|
| 395 | - TB(src_c, src_p, l - 5) |
---|
| 396 | - TB(src_c, src_p, l - 9) |
---|
| 397 | - TB(src_c, src_p, l - 12) |
---|
| 398 | - TB(src_c, src_p, l - 16) |
---|
| 399 | - TB(src_c, src_p, l - 18); |
---|
| 400 | TC(dst_c, dst_l, p) = sum_l / vnorm; |
---|
| 401 | } |
---|
| 402 | PRINTF(" - column %d computed at cycle %d\n", p, proctime()); |
---|
| 403 | } |
---|
| 404 | |
---|
| 405 | date = proctime(); |
---|
| 406 | PRINTF("*** Completing vertical filter at cycle %d\n", date); |
---|
| 407 | VERT_END[cluster_id][local_id] = date; |
---|
| 408 | |
---|
| 409 | barrier_wait(2); |
---|
| 410 | |
---|
| 411 | |
---|
| 412 | //////////////////////////////////////////////////////////////// |
---|
| 413 | // final computation and parallel display |
---|
| 414 | // Z <= D + C |
---|
| 415 | // Each processor use its private DMA channel to display |
---|
| 416 | // the resulting image, line per line (one byte per pixel). |
---|
| 417 | // Eah processor computes & displays (NL/nglobal_procs) lines. |
---|
| 418 | //////////////////////////////////////////////////////////////// |
---|
| 419 | |
---|
| 420 | date = proctime(); |
---|
| 421 | PRINTF("\n*** Starting display at cycle %d\n", date); |
---|
| 422 | DISP_START[cluster_id][local_id] = date; |
---|
| 423 | |
---|
| 424 | first = local_id * lines_per_task; |
---|
| 425 | last = first + lines_per_task; |
---|
| 426 | |
---|
| 427 | for (l = first; l < last; l++){ |
---|
| 428 | for (p = 0; p < NP; p++){ |
---|
| 429 | TZ(cluster_id,l,p) = (unsigned char) (((TD(cluster_id,l,p) + TC(cluster_id,l,p)) >> 8) & 0xFF); |
---|
| 430 | } |
---|
[252] | 431 | fb_write(NP * (cluster_id * lines_per_cluster + l), &TZ(cluster_id,l,0), NP); |
---|
[248] | 432 | } |
---|
| 433 | |
---|
| 434 | date = proctime(); |
---|
| 435 | PRINTF("*** Completing display at cycle %d\n", date); |
---|
| 436 | DISP_END[cluster_id][local_id] = date; |
---|
| 437 | |
---|
| 438 | barrier_wait(3); |
---|
| 439 | |
---|
| 440 | |
---|
| 441 | ///////////////////////////////////////////////////////// |
---|
| 442 | // Instrumentation (done by processor 0 in cluster 0) |
---|
| 443 | ///////////////////////////////////////////////////////// |
---|
| 444 | |
---|
| 445 | if (proc_id == 0){ |
---|
| 446 | date = proctime(); |
---|
| 447 | PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date); |
---|
| 448 | |
---|
| 449 | int cc, pp; |
---|
| 450 | unsigned int min_load_start = INT_MAX; |
---|
| 451 | unsigned int max_load_start = 0; |
---|
| 452 | unsigned int min_load_ended = INT_MAX; |
---|
| 453 | unsigned int max_load_ended = 0; |
---|
| 454 | |
---|
| 455 | unsigned int min_hori_start = INT_MAX; |
---|
| 456 | unsigned int max_hori_start = 0; |
---|
| 457 | unsigned int min_hori_ended = INT_MAX; |
---|
| 458 | unsigned int max_hori_ended = 0; |
---|
| 459 | |
---|
| 460 | unsigned int min_vert_start = INT_MAX; |
---|
| 461 | unsigned int max_vert_start = 0; |
---|
| 462 | unsigned int min_vert_ended = INT_MAX; |
---|
| 463 | unsigned int max_vert_ended = 0; |
---|
| 464 | |
---|
| 465 | unsigned int min_disp_start = INT_MAX; |
---|
| 466 | unsigned int max_disp_start = 0; |
---|
| 467 | unsigned int min_disp_ended = INT_MAX; |
---|
| 468 | unsigned int max_disp_ended = 0; |
---|
| 469 | |
---|
| 470 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 471 | for (pp = 0; pp < nlocal_procs; pp++ ){ |
---|
| 472 | if (LOAD_START[cc][pp] < min_load_start){ |
---|
| 473 | min_load_start = LOAD_START[cc][pp]; |
---|
[174] | 474 | } |
---|
[248] | 475 | if (LOAD_START[cc][pp] > max_load_start){ |
---|
| 476 | max_load_start = LOAD_START[cc][pp]; |
---|
[174] | 477 | } |
---|
[248] | 478 | if (LOAD_END[cc][pp] < min_load_ended){ |
---|
| 479 | min_load_ended = LOAD_END[cc][pp]; |
---|
[174] | 480 | } |
---|
[248] | 481 | if (LOAD_END[cc][pp] > max_load_ended){ |
---|
| 482 | max_load_ended = LOAD_END[cc][pp]; |
---|
[174] | 483 | } |
---|
[248] | 484 | |
---|
| 485 | if (HORI_START[cc][pp] < min_hori_start){ |
---|
| 486 | min_hori_start = HORI_START[cc][pp]; |
---|
[174] | 487 | } |
---|
[248] | 488 | if (HORI_START[cc][pp] > max_hori_start){ |
---|
| 489 | max_hori_start = HORI_START[cc][pp]; |
---|
[174] | 490 | } |
---|
[248] | 491 | if (HORI_END[cc][pp] < min_hori_ended){ |
---|
| 492 | min_hori_ended = HORI_END[cc][pp]; |
---|
[174] | 493 | } |
---|
[248] | 494 | if (HORI_END[cc][pp] > max_hori_ended){ |
---|
| 495 | max_hori_ended = HORI_END[cc][pp]; |
---|
[174] | 496 | } |
---|
| 497 | |
---|
[248] | 498 | if (VERT_START[cc][pp] < min_vert_start){ |
---|
| 499 | min_vert_start = VERT_START[cc][pp]; |
---|
| 500 | } |
---|
| 501 | if (VERT_START[cc][pp] > max_vert_start){ |
---|
| 502 | max_vert_start = VERT_START[cc][pp]; |
---|
| 503 | } |
---|
| 504 | if (VERT_END[cc][pp] < min_vert_ended){ |
---|
| 505 | min_vert_ended = VERT_END[cc][pp]; |
---|
| 506 | } |
---|
| 507 | if (VERT_END[cc][pp] > max_vert_ended){ |
---|
| 508 | max_vert_ended = VERT_END[cc][pp]; |
---|
| 509 | } |
---|
[158] | 510 | |
---|
[248] | 511 | if (DISP_START[cc][pp] < min_disp_start){ |
---|
| 512 | min_disp_start = DISP_START[cc][pp]; |
---|
| 513 | } |
---|
| 514 | if (DISP_START[cc][pp] > max_disp_start){ |
---|
| 515 | max_disp_start = DISP_START[cc][pp]; |
---|
| 516 | } |
---|
| 517 | if (DISP_END[cc][pp] < min_disp_ended){ |
---|
| 518 | min_disp_ended = DISP_END[cc][pp]; |
---|
| 519 | } |
---|
| 520 | if (DISP_END[cc][pp] > max_disp_ended){ |
---|
| 521 | max_disp_ended = DISP_END[cc][pp]; |
---|
| 522 | } |
---|
| 523 | } |
---|
| 524 | } |
---|
| 525 | PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 526 | min_load_start, max_load_start, (min_load_start+max_load_start) / 2, max_load_start-min_load_start); |
---|
| 527 | PRINTF(" - LOAD_END : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 528 | min_load_ended, max_load_ended, (min_load_ended+max_load_ended) / 2, max_load_ended-min_load_ended); |
---|
| 529 | |
---|
| 530 | PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 531 | min_hori_start, max_hori_start, (min_hori_start+max_hori_start) / 2, max_hori_start-min_hori_start); |
---|
| 532 | PRINTF(" - HORI_END : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 533 | min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended) / 2, max_hori_ended-min_hori_ended); |
---|
| 534 | |
---|
| 535 | PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 536 | min_vert_start, max_vert_start, (min_vert_start+max_vert_start) / 2, max_vert_start-min_vert_start); |
---|
| 537 | PRINTF(" - VERT_END : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 538 | min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended) / 2, max_vert_ended-min_vert_ended); |
---|
| 539 | |
---|
| 540 | PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 541 | min_disp_start, max_disp_start, (min_disp_start+max_disp_start) / 2, max_disp_start-min_disp_start); |
---|
| 542 | PRINTF(" - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", |
---|
| 543 | min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended) / 2, max_disp_ended-min_disp_ended); |
---|
| 544 | |
---|
| 545 | PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended); |
---|
| 546 | PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended); |
---|
| 547 | PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended); |
---|
| 548 | |
---|
| 549 | PRINTF(" - LOAD = %d\n", max_load_ended); |
---|
| 550 | PRINTF(" - FILTER = %d\n", max_vert_ended - max_load_ended); |
---|
| 551 | PRINTF(" - DISPLAY = %d\n", max_disp_ended - max_vert_ended); |
---|
| 552 | |
---|
| 553 | PRINTF("\nBEGIN LOAD_START\n"); |
---|
| 554 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 555 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 556 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_START[cc][pp]); |
---|
| 557 | } |
---|
| 558 | } |
---|
| 559 | PRINTF("END\n"); |
---|
| 560 | |
---|
| 561 | PRINTF("\nBEGIN LOAD_END\n"); |
---|
| 562 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 563 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 564 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_END[cc][pp]); |
---|
| 565 | } |
---|
| 566 | } |
---|
| 567 | PRINTF("END\n"); |
---|
| 568 | |
---|
| 569 | PRINTF("\nBEGIN HORI_START\n"); |
---|
| 570 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 571 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 572 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_START[cc][pp]); |
---|
| 573 | } |
---|
| 574 | } |
---|
| 575 | PRINTF("END\n"); |
---|
| 576 | |
---|
| 577 | PRINTF("\nBEGIN HORI_END\n"); |
---|
| 578 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 579 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 580 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_END[cc][pp]); |
---|
| 581 | } |
---|
| 582 | } |
---|
| 583 | PRINTF("END\n"); |
---|
| 584 | |
---|
| 585 | PRINTF("\nBEGIN VERT_START\n"); |
---|
| 586 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 587 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 588 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_START[cc][pp]); |
---|
| 589 | } |
---|
| 590 | } |
---|
| 591 | PRINTF("END\n"); |
---|
| 592 | |
---|
| 593 | PRINTF("\nBEGIN VERT_END\n"); |
---|
| 594 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 595 | for (pp = 0; pp < nlocal_procs; pp++ ){ |
---|
| 596 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_END[cc][pp]); |
---|
| 597 | } |
---|
| 598 | } |
---|
| 599 | PRINTF("END\n"); |
---|
| 600 | |
---|
| 601 | PRINTF("\nBEGIN DISP_START\n"); |
---|
| 602 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 603 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 604 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_START[cc][pp]); |
---|
| 605 | } |
---|
| 606 | } |
---|
| 607 | PRINTF("END\n"); |
---|
| 608 | |
---|
| 609 | PRINTF("\nBEGIN DISP_END\n"); |
---|
| 610 | for (cc = 0; cc < nclusters; cc++){ |
---|
| 611 | for (pp = 0; pp < nlocal_procs; pp++){ |
---|
| 612 | PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_END[cc][pp]); |
---|
| 613 | } |
---|
| 614 | } |
---|
| 615 | PRINTF("END\n"); |
---|
| 616 | } |
---|
| 617 | |
---|
| 618 | while(1); |
---|
| 619 | |
---|
[158] | 620 | } // end main() |
---|
| 621 | |
---|
[248] | 622 | // Local Variables: |
---|
| 623 | // tab-width: 3 |
---|
| 624 | // c-basic-offset: 3 |
---|
| 625 | // c-file-offsets:((innamespace . 0)(inline-open . 0)) |
---|
| 626 | // indent-tabs-mode: nil |
---|
| 627 | // End: |
---|
| 628 | |
---|
| 629 | // vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3 |
---|
| 630 | |
---|
| 631 | |
---|