Context Navigation

← Previous Change
Next Change →

Changeset 598 for soft

Timestamp:

Jul 9, 2015, 2:11:17 PM (11 years ago)

Author:

guerin

Message:

ocean: fix app broken by r589

Location:

soft/giet_vm

Files:

: 11 edited

Makefile (modified) (2 diffs)
applications/ocean/giet_utils.C (modified) (1 diff)
applications/ocean/jacobcalc.C (modified) (1 diff)
applications/ocean/jacobcalc2.C (modified) (1 diff)
applications/ocean/laplacalc.C (modified) (1 diff)
applications/ocean/linkup.C (modified) (1 diff)
applications/ocean/main.C (modified) (1 diff)
applications/ocean/multi.C (modified) (1 diff)
applications/ocean/slave1.C (modified) (1 diff)
applications/ocean/slave2.C (modified) (1 diff)
applications/ocean/subblock.C (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

soft/giet_vm/Makefile

-                      r595
+                      r598
 ### ocean  application compilation
 applications/ocean/appli.elf: build/libs/libmath.a  build/libs/libuser.a | $(DISK_IMAGE)
         $(MAKE) -C applications/ocean
+        cd applications/ocean && $(MAKE) && cd ../..
         mmd -o -i $(DISK_IMAGE) ::/build/ocean || true
         mcopy -o -i $(DISK_IMAGE) $@ ::/build/ocean
 …
         $(MAKE) -C applications/dhrystone   clean
         $(MAKE) -C applications/gameoflife  clean
         $(MAKE) -C applications/ocean       clean
+        cd applications/ocean && $(MAKE) clean && cd ../..
         $(MAKE) -C applications/router      clean
         $(MAKE) -C applications/shell       clean

soft/giet_vm/applications/ocean/giet_utils.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/* Definitions of the (simplified) standard functions used by ocean on GIET */
+#include <stdarg.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <stdlib.h>
+EXTERN_ENV
+#include "decs.h"
+#include "giet_utils.h"
+FILE * stdout = "";  /* NOTE(review): stdout and stderr are initialised from string literals, not stream objects; presumably giet_utils.h defines FILE so that they act as message prefixes - confirm */
+FILE *stderr = "STDERR : ";
+/* main_* symbols: only declared here (extern), so they are defined in another
+ * translation unit.  thread() below copies their values into the variables of
+ * the same name without the main_ prefix.
+ */ extern double ****main_q_multi;
+extern double ****main_rhs_multi;
+extern double ****main_psi;
+extern double ****main_psim;
+extern double ***main_psium;
+extern double ***main_psilm;
+extern double ***main_psib;
+extern double ***main_ga;
+extern double ***main_gb;
+extern double ***main_oldga;
+extern double ***main_oldgb;
+extern double ****main_work1;
+extern double ***main_work2;
+extern double ***main_work3;
+extern double ****main_work4;
+extern double ****main_work5;
+extern double ***main_work6;
+extern double ****main_work7;
+extern long *main_imx;
+extern long *main_jmx;
+extern double *main_lev_res;
+extern double *main_lev_tol;
+extern double *main_i_int_coeff;
+extern double *main_j_int_coeff;
+extern long *main_xpts_per_proc;
+extern long *main_ypts_per_proc;
+extern long main_xprocs;
+extern long main_yprocs;
+extern long main_numlev;
+extern double main_eig2;
+extern long main_im;
+extern long main_jm;
+/* Variables carrying the section attribute are placed in the section named
+ * "seg_ldata"; the comment above thread() describes that segment as
+ * replicated.  thread() assigns every one of them except f.
+ */ double ****work1 __attribute__ ((section("seg_ldata")));
+double ***work2 __attribute__ ((section("seg_ldata")));
+double ***work3 __attribute__ ((section("seg_ldata")));
+double ****work4 __attribute__ ((section("seg_ldata")));
+double ****work5 __attribute__ ((section("seg_ldata")));
+double ***work6 __attribute__ ((section("seg_ldata")));
+double ****work7 __attribute__ ((section("seg_ldata")));
+double ****psi __attribute__ ((section("seg_ldata")));
+double ****psim __attribute__ ((section("seg_ldata")));
+double ***psium __attribute__ ((section("seg_ldata")));
+double ***psilm __attribute__ ((section("seg_ldata")));
+double ***psib __attribute__ ((section("seg_ldata")));
+double ***ga __attribute__ ((section("seg_ldata")));
+double ***gb __attribute__ ((section("seg_ldata")));
+double ***oldga __attribute__ ((section("seg_ldata")));
+double ***oldgb __attribute__ ((section("seg_ldata")));
+double ****q_multi __attribute__ ((section("seg_ldata")));
+double ****rhs_multi __attribute__ ((section("seg_ldata")));
+long *imx __attribute__ ((section("seg_ldata")));
+long *jmx __attribute__ ((section("seg_ldata")));
+double *f __attribute__ ((section("seg_ldata")));  /* not assigned anywhere in the code shown here */
+struct Global_Private *gp;  /* no section attribute, unlike its neighbours; thread() indexes it by thread id */
+double *lev_res __attribute__ ((section("seg_ldata")));
+double *lev_tol __attribute__ ((section("seg_ldata")));
+double *i_int_coeff __attribute__ ((section("seg_ldata")));
+double *j_int_coeff __attribute__ ((section("seg_ldata")));
+long *xpts_per_proc __attribute__ ((section("seg_ldata")));
+long *ypts_per_proc __attribute__ ((section("seg_ldata")));
+long xprocs __attribute__ ((section("seg_ldata")));
+long yprocs __attribute__ ((section("seg_ldata")));
+long numlev __attribute__ ((section("seg_ldata")));
+double eig2 __attribute__ ((section("seg_ldata")));
+long im __attribute__ ((section("seg_ldata")));
+long jm __attribute__ ((section("seg_ldata")));
+unsigned int nclusters_x __attribute__ ((section("seg_ldata")));  /* the three values below are filled by giet_procs_number() in thread() */
+unsigned int nclusters_y __attribute__ ((section("seg_ldata")));
+unsigned int procs_per_cluster __attribute__ ((section("seg_ldata")));
+volatile long heap_inited = 0;  /* turn counter polled in thread(): a thread proceeds when it equals its id, then adds procs_per_cluster */
+volatile int run_threads = 0;  /* thread() spins until this reads 1; it is not written in the code shown here (presumably set by main - confirm) */
+// Entry point for every thread except main (declared with the constructor attribute).
+//   Waits for main's allocations and initialisation, then copies the read-only tables into the replicated ldata segment.
+//   Some read-write tables are replicated too, but only partially: just their pointer levels are copied.
+__attribute__ ((constructor)) void thread()
+{
+    unsigned long size;
+    long id = (long) giet_thread_id();
+    unsigned int cx, cy, lp;
+    giet_proc_xyp(&cx, &cy, &lp);  /* fills cx, cy, lp; presumably cluster x, cluster y and local processor index, going by the message format below - confirm */
+    giet_shr_printf("Thread %d (%d:%d.%d) waiting\n", id, cx, cy, lp);
+    if (lp == 0) {  /* only the thread with lp == 0 performs the allocations below */
+        giet_procs_number(&nclusters_x, &nclusters_y, &procs_per_cluster);
+        heap_init(cx, cy);
+        while (heap_inited != id) {  /* spin until the shared counter equals this thread's id */
+            asm volatile ("nop\r\n");
+        }
+        heap_inited += procs_per_cluster;  /* advance the counter to id + procs_per_cluster, the value another waiting thread compares against */
+        size = nprocs * sizeof(double ***);  /* top level of the four-level arrays: one pointer per process (nprocs is declared outside this block) */
+        rhs_multi = (double ****) G_MALLOC(size, id);
+        q_multi = (double ****) G_MALLOC(size, id);
+        psi = (double ****) G_MALLOC(size, id);
+        psim = (double ****) G_MALLOC(size, id);
+        work1 = (double ****) G_MALLOC(size, id);
+        work4 = (double ****) G_MALLOC(size, id);
+        work5 = (double ****) G_MALLOC(size, id);
+        work7 = (double ****) G_MALLOC(size, id);
+        size = nprocs * sizeof(double **);  /* top level of the three-level arrays */
+        psium = (double ***) G_MALLOC(size, id);
+        psilm = (double ***) G_MALLOC(size, id);
+        psib = (double ***) G_MALLOC(size, id);
+        ga = (double ***) G_MALLOC(size, id);
+        gb = (double ***) G_MALLOC(size, id);
+        oldga = (double ***) G_MALLOC(size, id);
+        oldgb = (double ***) G_MALLOC(size, id);
+        work2 = (double ***) G_MALLOC(size, id);
+        work3 = (double ***) G_MALLOC(size, id);
+        work6 = (double ***) G_MALLOC(size, id);
+    }
+    while (run_threads != 1) {  /* every thread, whatever its lp, spins here until run_threads reads 1 */
+        asm volatile ("nop\r\n");
+    }
+    *gp[id].lpid = lp;  /* stores lp through the lpid pointer of this thread's gp entry */
+    if (lp == 0) {
+        int i, j, k;
+        xprocs = main_xprocs;  /* scalar parameters: plain copies of the main_* values */
+        yprocs = main_yprocs;
+        numlev = main_numlev;
+        eig2 = main_eig2;
+        im = main_im;
+        jm = main_jm;
+        size = numlev * sizeof(long);  /* per-level tables: allocate numlev entries, filled by the loop below */
+        imx = (long *) G_MALLOC(size, id);
+        jmx = (long *) G_MALLOC(size, id);
+        xpts_per_proc = (long *) G_MALLOC(size, id);
+        ypts_per_proc = (long *) G_MALLOC(size, id);
+        size = numlev * sizeof(double);
+        lev_res = (double *) G_MALLOC(size, id);
+        lev_tol = (double *) G_MALLOC(size, id);
+        i_int_coeff = (double *) G_MALLOC(size, id);
+        j_int_coeff = (double *) G_MALLOC(size, id);
+        for(i=0;i<numlev;i++) {
+            imx[i] = main_imx[i];
+            jmx[i] = main_jmx[i];
+            lev_res[i] = main_lev_res[i];
+            lev_tol[i] = main_lev_tol[i];
+            i_int_coeff[i] = main_i_int_coeff[i];
+            j_int_coeff[i] = main_j_int_coeff[i];
+            xpts_per_proc[i] = main_xpts_per_proc[i];
+            ypts_per_proc[i] = main_ypts_per_proc[i];
+        }
+        size = numlev * sizeof(double **);
+        for (i = 0; i < nprocs; i++) {
+            q_multi[i] = (double ***) G_MALLOC(size, id);  /* q_multi and rhs_multi get two freshly allocated pointer levels ... */
+            rhs_multi[i] = (double ***) G_MALLOC(size, id);
+            for (j = 0; j < numlev; j++) {
+                rhs_multi[i][j] = (double **) G_MALLOC(((imx[j] - 2) / yprocs + 2) * sizeof(double *), id);
+                q_multi[i][j] = (double **) G_MALLOC(((imx[j] - 2) / yprocs + 2) * sizeof(double *), id);
+                for (k = 0; k < ((imx[j] - 2) / yprocs + 2); k++) {
+                    q_multi[i][j][k] = main_q_multi[i][j][k];  /* ... whose row pointers are copied from main's arrays, so the row data is shared, not duplicated */
+                    rhs_multi[i][j][k] = main_rhs_multi[i][j][k];
+                }
+            }
+            work1[i] = main_work1[i];  /* every other array: only the per-process top-level pointer is copied */
+            work2[i] = main_work2[i];
+            work3[i] = main_work3[i];
+            work4[i] = main_work4[i];
+            work5[i] = main_work5[i];
+            work6[i] = main_work6[i];
+            work7[i] = main_work7[i];
+            psi[i] = main_psi[i];
+            psim[i] = main_psim[i];
+            psium[i] = main_psium[i];
+            psilm[i] = main_psilm[i];
+            psib[i] = main_psib[i];
+            ga[i] = main_ga[i];
+            gb[i] = main_gb[i];
+            oldga[i] = main_oldga[i];
+            oldgb[i] = main_oldgb[i];
+        }
+    }
+    giet_shr_printf("Thread %d launched\n", id);
+    slave(&id);  /* slave() is defined outside this block; it receives the address of the local id */
+    BARRIER(bars->barrier, nprocs)  /* macro invocation; the original carries no trailing semicolon */
+    giet_exit("done.");
+}
+const char *optarg;  /* never assigned in the code shown here; presumably defined so that code referencing optarg still links - confirm */
+/* Stub getopt(): ignores all three arguments and always returns -1, the
+ * value POSIX getopt() returns once the options are exhausted, so a caller
+ * looping on it never sees an option.
+ */ int getopt(int argc, char *const *argv, const char *optstring)
+{
+    return -1;
+}
+// Stores in *cx / *cy the coordinates of the cluster for thread tid: cid = tid / procs_per_cluster, cx = cid / nclusters_y, cy = cid % nclusters_y.
+//   If tid == -1, stores instead the next cluster of a round-robin walk kept in the static x/y; that mode calls giet_exit() unless giet_thread_id() is 0.
+void clusterXY(int tid, unsigned int *cx, unsigned int *cy)
+{
+    unsigned int cid;
+    static unsigned int x = 0, y = 0;  /* round-robin cursor, kept across calls */
+    cid = tid / procs_per_cluster;  /* evaluated even when tid == -1, but only read in the tid != -1 branch */
+    if (tid != -1) {
+        *cx = (cid / nclusters_y);
+        *cy = (cid % nclusters_y);
+        return;
+    }
+    if (giet_thread_id() != 0) {
+        giet_exit("pseudo-random mapped malloc : thread 0 only");
+    }
+    x++;  /* the cursor is advanced before it is reported: x first, then y when x wraps at nclusters_x */
+    if (x == nclusters_x) {
+        x = 0;
+        y++;
+        if (y == nclusters_y) {
+            y = 0;
+        }
+    }
+    *cx = x;
+    *cy = y;
+}
+/* Allocates s (presumably a size in bytes - confirm against remote_malloc)
+ * in the cluster that clusterXY() selects for tid, and returns the pointer.
+ * Passing tid == -1 selects clusterXY()'s round-robin mode.  The result is
+ * checked with giet_assert(ptr != 0, "Malloc failed") before being returned.
+ */ void *ocean_malloc(unsigned long s, int tid)
+{
+    void *ptr;
+    unsigned int x, y;
+    clusterXY(tid, &x, &y);
+    ptr = remote_malloc(s, x, y);
+    giet_assert (ptr != 0, "Malloc failed");
+    return ptr;
+}
+/* Replacement for exit(): reports termination through giet_exit(), with the
+ * message "Done (status != 0)" for a non-zero status and "Done (ok)" for
+ * zero.  The numeric status value itself is not passed on.
+ */ void exit(int status)
+{
+    if (status) {
+        giet_exit("Done (status != 0)");
+    } else {
+        giet_exit("Done (ok)");
+    }
+}

soft/giet_vm/applications/ocean/jacobcalc.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/* Does the arakawa jacobian calculation (of the x and y matrices,
+   putting the results in the z matrix) for a subblock.  */
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include "decs.h"
+/* Computes z[pid] from x[pid] and y[pid] over rows firstrow..lastrow and
+ * columns firstcol..lastcol (the Arakawa Jacobian, per the file comment).
+ *
+ *   x, y, z   arrays indexed [process][row][column]
+ *   pid       index of the calling process in x, y, z and gp[]
+ *   firstrow, lastrow, firstcol, lastcol   inclusive bounds of the region
+ *
+ * A gp[pid].neighbors[] entry of -1 is treated as "no neighbour in that
+ * direction"; any other value is used as a process index.  Besides writing
+ * z[pid], the function overwrites border cells (rows 0 and im-1, columns 0
+ * and jm-1) of x[pid] and y[pid].  Order of work:
+ *   1. zero the corners of z that lie on two neighbour-less sides;
+ *   2. fill the corners of x and y from diagonal, then side neighbours;
+ *   3. copy border rows/columns of y, then x, from the four side neighbours;
+ *   4. evaluate the eight-term stencil into z, scaled by factjacob;
+ *   5. zero the borders of z on every neighbour-less side.
+ * im, jm, gp and factjacob are declared outside this function.
+ */ void jacobcalc(double ***x, double ***y, double ***z, long pid, long firstrow, long lastrow, long firstcol, long lastcol)
+{
+    double f1;
+    double f2;
+    double f3;
+    double f4;
+    double f5;
+    double f6;
+    double f7;
+    double f8;
+    long iindex;
+    long indexp1;
+    long indexm1;
+    long im1;
+    long ip1;
+    long i;
+    long j;
+    long jj;
+    double **t2a;
+    double **t2b;
+    double **t2c;
+    double *t1a;
+    double *t1b;
+    double *t1c;
+    double *t1d;
+    double *t1e;
+    double *t1f;
+    double *t1g;
+    t2a = (double **) z[pid];  /* step 1: corners of z */
+    if ((gp[pid].neighbors[UP] == -1) && (gp[pid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[pid].neighbors[DOWN] == -1) && (gp[pid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[pid].neighbors[UP] == -1) && (gp[pid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 0.0;
+    }
+    if ((gp[pid].neighbors[DOWN] == -1) && (gp[pid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 0.0;
+    }
+    t2a = (double **) x[pid];  /* step 2a: corners of x from the diagonal neighbours (their rows 1 / im-2, columns 1 / jm-2) */
+    jj = gp[pid].neighbors[UPLEFT];
+    if (jj != -1) {
+        t2a[0][0] = x[jj][im - 2][jm - 2];
+    }
+    jj = gp[pid].neighbors[UPRIGHT];
+    if (jj != -1) {
+        t2a[0][jm - 1] = x[jj][im - 2][1];
+    }
+    jj = gp[pid].neighbors[DOWNLEFT];
+    if (jj != -1) {
+        t2a[im - 1][0] = x[jj][1][jm - 2];
+    }
+    jj = gp[pid].neighbors[DOWNRIGHT];
+    if (jj != -1) {
+        t2a[im - 1][jm - 1] = x[jj][1][1];
+    }
+    t2a = (double **) y[pid];  /* same diagonal-neighbour corner fill, now for y */
+    jj = gp[pid].neighbors[UPLEFT];
+    if (jj != -1) {
+        t2a[0][0] = y[jj][im - 2][jm - 2];
+    }
+    jj = gp[pid].neighbors[UPRIGHT];
+    if (jj != -1) {
+        t2a[0][jm - 1] = y[jj][im - 2][1];
+    }
+    jj = gp[pid].neighbors[DOWNLEFT];
+    if (jj != -1) {
+        t2a[im - 1][0] = y[jj][1][jm - 2];
+    }
+    jj = gp[pid].neighbors[DOWNRIGHT];
+    if (jj != -1) {
+        t2a[im - 1][jm - 1] = y[jj][1][1];
+    }
+    t2a = (double **) x[pid];  /* step 2b: corners of x when a side neighbour is missing; the else-if chain takes only the first missing side, tested in the order UP, DOWN, LEFT, RIGHT */
+    if (gp[pid].neighbors[UP] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[0][0] = x[jj][0][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][0] = x[jj][1][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[0][jm - 1] = x[jj][0][1];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][jm - 1] = x[jj][1][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[DOWN] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[im - 1][0] = x[jj][im - 1][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][0] = x[jj][im - 2][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = x[jj][im - 1][1];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][jm - 1] = x[jj][im - 2][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[LEFT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][0] = x[jj][im - 2][0];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][0] = x[jj][1][0];
+        }
+    } else if (gp[pid].neighbors[RIGHT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][jm - 1] = x[jj][im - 2][jm - 1];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = x[jj][1][jm - 1];
+        }
+    }
+    t2a = (double **) y[pid];  /* same missing-side corner fill for y; t2a keeps pointing at y[pid] for the border copies that follow */
+    if (gp[pid].neighbors[UP] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[0][0] = y[jj][0][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][0] = y[jj][1][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[0][jm - 1] = y[jj][0][1];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][jm - 1] = y[jj][1][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[DOWN] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[im - 1][0] = y[jj][im - 1][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][0] = y[jj][im - 2][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = y[jj][im - 1][1];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][jm - 1] = y[jj][im - 2][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[LEFT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][0] = y[jj][im - 2][0];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][0] = y[jj][1][0];
+        }
+    } else if (gp[pid].neighbors[RIGHT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][jm - 1] = y[jj][im - 2][jm - 1];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = y[jj][1][jm - 1];
+        }
+    }
+    j = gp[pid].neighbors[UP];  /* step 3: border rows and columns of y (t2a is still y[pid]) */
+    if (j != -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) y[j][im - 2];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[DOWN];
+    if (j != -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) y[j][1];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[LEFT];
+    if (j != -1) {
+        t2b = (double **) y[j];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][0] = t2b[i][jm - 2];
+        }
+    }
+    j = gp[pid].neighbors[RIGHT];
+    if (j != -1) {
+        t2b = (double **) y[j];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][jm - 1] = t2b[i][1];
+        }
+    }
+    t2a = (double **) x[pid];  /* step 3 continued: the same border copies for x */
+    j = gp[pid].neighbors[UP];
+    if (j != -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) x[j][im - 2];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[DOWN];
+    if (j != -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) x[j][1];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[LEFT];
+    if (j != -1) {
+        t2b = (double **) x[j];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][0] = t2b[i][jm - 2];
+        }
+    }
+    j = gp[pid].neighbors[RIGHT];
+    if (j != -1) {
+        t2b = (double **) x[j];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][jm - 1] = t2b[i][1];
+        }
+    }
+    t2a = (double **) x[pid];  /* step 4: stencil; t2a = x, t2b = y, t2c = z for this process */
+    t2b = (double **) y[pid];
+    t2c = (double **) z[pid];
+    for (i = firstrow; i <= lastrow; i++) {
+        ip1 = i + 1;
+        im1 = i - 1;
+        t1a = (double *) t2a[i];  /* row pointers: t1a/t1f/t1g = x rows i, i+1, i-1; t1b/t1d/t1e = y rows i, i+1, i-1; t1c = z row i */
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        t1d = (double *) t2b[ip1];
+        t1e = (double *) t2b[im1];
+        t1f = (double *) t2a[ip1];
+        t1g = (double *) t2a[im1];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            indexp1 = iindex + 1;
+            indexm1 = iindex - 1;
+            f1 = (t1b[indexm1] + t1d[indexm1] - t1b[indexp1] - t1d[indexp1]) * (t1f[iindex] - t1a[iindex]);
+            f2 = (t1e[indexm1] + t1b[indexm1] - t1e[indexp1] - t1b[indexp1]) * (t1a[iindex] - t1g[iindex]);
+            f3 = (t1d[iindex] + t1d[indexp1] - t1e[iindex] - t1e[indexp1]) * (t1a[indexp1] - t1a[iindex]);
+            f4 = (t1d[indexm1] + t1d[iindex] - t1e[indexm1] - t1e[iindex]) * (t1a[iindex] - t1a[indexm1]);
+            f5 = (t1d[iindex] - t1b[indexp1]) * (t1f[indexp1] - t1a[iindex]);
+            f6 = (t1b[indexm1] - t1e[iindex]) * (t1a[iindex] - t1g[indexm1]);
+            f7 = (t1b[indexp1] - t1e[iindex]) * (t1g[indexp1] - t1a[iindex]);
+            f8 = (t1d[iindex] - t1b[indexm1]) * (t1a[iindex] - t1f[indexm1]);
+            t1c[iindex] = factjacob * (f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8);
+        }
+    }
+    if (gp[pid].neighbors[UP] == -1) {  /* step 5: zero the z border on each side that has no neighbour */
+        t1c = (double *) t2c[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1c[j] = 0.0;
+        }
+    }
+    if (gp[pid].neighbors[DOWN] == -1) {
+        t1c = (double *) t2c[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1c[j] = 0.0;
+        }
+    }
+    if (gp[pid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2c[j][0] = 0.0;
+        }
+    }
+    if (gp[pid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2c[j][jm - 1] = 0.0;
+        }
+    }
+}

soft/giet_vm/applications/ocean/jacobcalc2.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/* Does the arakawa jacobian calculation (of the x and y matrices,
+   putting the results in the z matrix) for a subblock. */
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include "decs.h"
+/* Variant of the Jacobian calculation for arrays with one extra index level:
+ * computes z[pid][psiindex] from x[pid][psiindex] and y[pid][psiindex] over
+ * rows firstrow..lastrow and columns firstcol..lastcol.
+ *
+ *   x, y, z   arrays indexed [process][psiindex][row][column]
+ *   psiindex  second-level index applied to every access of x, y and z
+ *   pid       index of the calling process in x, y, z and gp[]
+ *   firstrow, lastrow, firstcol, lastcol   inclusive bounds of the region
+ *
+ * A gp[pid].neighbors[] entry of -1 is treated as "no neighbour in that
+ * direction"; any other value is used as a process index.  Besides writing
+ * z, the function overwrites border cells (rows 0 and im-1, columns 0 and
+ * jm-1) of x[pid][psiindex] and y[pid][psiindex].  Order of work:
+ *   1. zero the corners of z that lie on two neighbour-less sides;
+ *   2. fill the corners of x and y from diagonal, then side neighbours;
+ *   3. copy border rows/columns of y, then x, from the four side neighbours;
+ *   4. evaluate the eight-term stencil into z, scaled by factjacob;
+ *   5. zero the borders of z on every neighbour-less side.
+ * im, jm, gp and factjacob are declared outside this function.
+ */ void jacobcalc2(double ****x, double ****y, double ****z, long psiindex, long pid, long firstrow, long lastrow, long firstcol, long lastcol)
+{
+    double f1;
+    double f2;
+    double f3;
+    double f4;
+    double f5;
+    double f6;
+    double f7;
+    double f8;
+    long iindex;
+    long indexp1;
+    long indexm1;
+    long im1;
+    long ip1;
+    long i;
+    long j;
+    long jj;
+    double **t2a;
+    double **t2b;
+    double **t2c;
+    double *t1a;
+    double *t1b;
+    double *t1c;
+    double *t1d;
+    double *t1e;
+    double *t1f;
+    double *t1g;
+    t2a = z[pid][psiindex];  /* step 1: corners of z */
+    if ((gp[pid].neighbors[UP] == -1) && (gp[pid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[pid].neighbors[DOWN] == -1) && (gp[pid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[pid].neighbors[UP] == -1) && (gp[pid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 0.0;
+    }
+    if ((gp[pid].neighbors[DOWN] == -1) && (gp[pid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 0.0;
+    }
+    t2a = x[pid][psiindex];  /* step 2a: corners of x from the diagonal neighbours (their rows 1 / im-2, columns 1 / jm-2) */
+    jj = gp[pid].neighbors[UPLEFT];
+    if (jj != -1) {
+        t2a[0][0] = x[jj][psiindex][im - 2][jm - 2];
+    }
+    jj = gp[pid].neighbors[UPRIGHT];
+    if (jj != -1) {
+        t2a[0][jm - 1] = x[jj][psiindex][im - 2][1];
+    }
+    jj = gp[pid].neighbors[DOWNLEFT];
+    if (jj != -1) {
+        t2a[im - 1][0] = x[jj][psiindex][1][jm - 2];
+    }
+    jj = gp[pid].neighbors[DOWNRIGHT];
+    if (jj != -1) {
+        t2a[im - 1][jm - 1] = x[jj][psiindex][1][1];
+    }
+    t2a = y[pid][psiindex];  /* same diagonal-neighbour corner fill, now for y */
+    jj = gp[pid].neighbors[UPLEFT];
+    if (jj != -1) {
+        t2a[0][0] = y[jj][psiindex][im - 2][jm - 2];
+    }
+    jj = gp[pid].neighbors[UPRIGHT];
+    if (jj != -1) {
+        t2a[0][jm - 1] = y[jj][psiindex][im - 2][1];
+    }
+    jj = gp[pid].neighbors[DOWNLEFT];
+    if (jj != -1) {
+        t2a[im - 1][0] = y[jj][psiindex][1][jm - 2];
+    }
+    jj = gp[pid].neighbors[DOWNRIGHT];
+    if (jj != -1) {
+        t2a[im - 1][jm - 1] = y[jj][psiindex][1][1];
+    }
+    t2a = x[pid][psiindex];  /* step 2b: corners of x when a side neighbour is missing; the else-if chain takes only the first missing side, tested in the order UP, DOWN, LEFT, RIGHT */
+    if (gp[pid].neighbors[UP] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[0][0] = x[jj][psiindex][0][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][0] = x[jj][psiindex][1][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[0][jm - 1] = x[jj][psiindex][0][1];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][jm - 1] = x[jj][psiindex][1][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[DOWN] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[im - 1][0] = x[jj][psiindex][im - 1][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][0] = x[jj][psiindex][im - 2][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = x[jj][psiindex][im - 1][1];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][jm - 1] = x[jj][psiindex][im - 2][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[LEFT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][0] = x[jj][psiindex][im - 2][0];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][0] = x[jj][psiindex][1][0];
+        }
+    } else if (gp[pid].neighbors[RIGHT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][jm - 1] = x[jj][psiindex][im - 2][jm - 1];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = x[jj][psiindex][1][jm - 1];
+        }
+    }
+    t2a = y[pid][psiindex];  /* same missing-side corner fill for y */
+    if (gp[pid].neighbors[UP] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[0][0] = y[jj][psiindex][0][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][0] = y[jj][psiindex][1][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[0][jm - 1] = y[jj][psiindex][0][1];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][jm - 1] = y[jj][psiindex][1][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[DOWN] == -1) {
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[im - 1][0] = y[jj][psiindex][im - 1][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][0] = y[jj][psiindex][im - 2][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = y[jj][psiindex][im - 1][1];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][jm - 1] = y[jj][psiindex][im - 2][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[LEFT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][0] = y[jj][psiindex][im - 2][0];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][0] = y[jj][psiindex][1][0];
+        }
+    } else if (gp[pid].neighbors[RIGHT] == -1) {
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][jm - 1] = y[jj][psiindex][im - 2][jm - 1];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = y[jj][psiindex][1][jm - 1];
+        }
+    }
+    t2a = y[pid][psiindex];  /* step 3: border rows and columns of y */
+    j = gp[pid].neighbors[UP];
+    if (j != -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) y[j][psiindex][im - 2];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[DOWN];
+    if (j != -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) y[j][psiindex][1];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[LEFT];
+    if (j != -1) {
+        t2b = y[j][psiindex];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][0] = t2b[i][jm - 2];
+        }
+    }
+    j = gp[pid].neighbors[RIGHT];
+    if (j != -1) {
+        t2b = y[j][psiindex];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][jm - 1] = t2b[i][1];
+        }
+    }
+    t2a = x[pid][psiindex];  /* step 3 continued: the same border copies for x */
+    j = gp[pid].neighbors[UP];
+    if (j != -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) x[j][psiindex][im - 2];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[DOWN];
+    if (j != -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) x[j][psiindex][1];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[LEFT];
+    if (j != -1) {
+        t2b = x[j][psiindex];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][0] = t2b[i][jm - 2];
+        }
+    }
+    j = gp[pid].neighbors[RIGHT];
+    if (j != -1) {
+        t2b = x[j][psiindex];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][jm - 1] = t2b[i][1];
+        }
+    }
+    t2a = x[pid][psiindex];  /* step 4: stencil; t2a = x, t2b = y, t2c = z for this process and psiindex */
+    t2b = y[pid][psiindex];
+    t2c = z[pid][psiindex];
+    for (i = firstrow; i <= lastrow; i++) {
+        ip1 = i + 1;
+        im1 = i - 1;
+        t1a = (double *) t2a[i];  /* row pointers: t1a/t1f/t1g = x rows i, i+1, i-1; t1b/t1d/t1e = y rows i, i+1, i-1; t1c = z row i */
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        t1d = (double *) t2b[ip1];
+        t1e = (double *) t2b[im1];
+        t1f = (double *) t2a[ip1];
+        t1g = (double *) t2a[im1];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            indexp1 = iindex + 1;
+            indexm1 = iindex - 1;
+            f1 = (t1b[indexm1] + t1d[indexm1] - t1b[indexp1] - t1d[indexp1]) * (t1f[iindex] - t1a[iindex]);
+            f2 = (t1e[indexm1] + t1b[indexm1] - t1e[indexp1] - t1b[indexp1]) * (t1a[iindex] - t1g[iindex]);
+            f3 = (t1d[iindex] + t1d[indexp1] - t1e[iindex] - t1e[indexp1]) * (t1a[indexp1] - t1a[iindex]);
+            f4 = (t1d[indexm1] + t1d[iindex] - t1e[indexm1] - t1e[iindex]) * (t1a[iindex] - t1a[indexm1]);
+            f5 = (t1d[iindex] - t1b[indexp1]) * (t1f[indexp1] - t1a[iindex]);
+            f6 = (t1b[indexm1] - t1e[iindex]) * (t1a[iindex] - t1g[indexm1]);
+            f7 = (t1b[indexp1] - t1e[iindex]) * (t1g[indexp1] - t1a[iindex]);
+            f8 = (t1d[iindex] - t1b[indexm1]) * (t1a[iindex] - t1f[indexm1]);
+            t1c[iindex] = factjacob * (f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8);
+        }
+    }
+    if (gp[pid].neighbors[UP] == -1) {  /* step 5: zero the z border on each side that has no neighbour */
+        t1c = (double *) t2c[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1c[j] = 0.0;
+        }
+    }
+    if (gp[pid].neighbors[DOWN] == -1) {
+        t1c = (double *) t2c[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1c[j] = 0.0;
+        }
+    }
+    if (gp[pid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2c[j][0] = 0.0;
+        }
+    }
+    if (gp[pid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2c[j][jm - 1] = 0.0;
+        }
+    }
+}

soft/giet_vm/applications/ocean/laplacalc.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/* Performs the laplacian calculation for a subblock */
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include "decs.h"
+/*
+ * Computes the five-point Laplacian of the grid x[procid][psiindex] and
+ * stores it, scaled by factlap, in z[procid][psiindex].
+ *
+ *   procid             index of the calling processor in gp, x and z
+ *   x                  input grids, indexed [processor][psiindex][row][col]
+ *   z                  output grids, same indexing as x
+ *   psiindex           selects which grid of each processor is used
+ *   firstrow, lastrow  inclusive range of rows to compute
+ *   firstcol, lastcol  inclusive range of columns to compute
+ *
+ * Before the computation the border rows and columns of the local input
+ * grid are refreshed from every existing neighbor. Where a neighbor entry
+ * is -1 the matching border of the output grid is set to zero instead.
+ */
+void laplacalc(long procid, double ****x, double ****z, long psiindex, long firstrow, long lastrow, long firstcol, long lastcol)
+{
+    long nbr;
+    long row;
+    long col;
+    double **in;
+    double **out;
+    double **nbr_in;
+    double *dst;
+    double *src;
+    double *cur;
+    double *next;
+    double *prev;
+    in = x[procid][psiindex];
+    /* Row 0 receives row im - 2 of the UP neighbor. */
+    nbr = gp[procid].neighbors[UP];
+    if (nbr != -1) {
+        dst = in[0];
+        src = x[nbr][psiindex][im - 2];
+        for (col = 1; col <= lastcol; col++) {
+            dst[col] = src[col];
+        }
+    }
+    /* Row im - 1 receives row 1 of the DOWN neighbor. */
+    nbr = gp[procid].neighbors[DOWN];
+    if (nbr != -1) {
+        dst = in[im - 1];
+        src = x[nbr][psiindex][1];
+        for (col = 1; col <= lastcol; col++) {
+            dst[col] = src[col];
+        }
+    }
+    /* Column 0 receives column jm - 2 of the LEFT neighbor. */
+    nbr = gp[procid].neighbors[LEFT];
+    if (nbr != -1) {
+        nbr_in = x[nbr][psiindex];
+        for (row = 1; row <= lastrow; row++) {
+            in[row][0] = nbr_in[row][jm - 2];
+        }
+    }
+    /* Column jm - 1 receives column 1 of the RIGHT neighbor. */
+    nbr = gp[procid].neighbors[RIGHT];
+    if (nbr != -1) {
+        nbr_in = x[nbr][psiindex];
+        for (row = 1; row <= lastrow; row++) {
+            in[row][jm - 1] = nbr_in[row][1];
+        }
+    }
+    /* Stencil over the requested range; the order of the additions is kept
+       as is so that the floating point results do not change. */
+    out = z[procid][psiindex];
+    for (row = firstrow; row <= lastrow; row++) {
+        cur = in[row];
+        dst = out[row];
+        next = in[row + 1];
+        prev = in[row - 1];
+        for (col = firstcol; col <= lastcol; col++) {
+            dst[col] = factlap * (next[col] + prev[col] + cur[col + 1] + cur[col - 1] - 4. * cur[col]);
+        }
+    }
+    /* Sides without a neighbor get a zero border in the output grid. */
+    if (gp[procid].neighbors[UP] == -1) {
+        dst = out[0];
+        for (col = firstcol; col <= lastcol; col++) {
+            dst[col] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        dst = out[im - 1];
+        for (col = firstcol; col <= lastcol; col++) {
+            dst[col] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (row = firstrow; row <= lastrow; row++) {
+            out[row][0] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (row = firstrow; row <= lastrow; row++) {
+            out[row][jm - 1] = 0.0;
+        }
+    }
+}

soft/giet_vm/applications/ocean/linkup.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/* Set all the pointers to the proper locations for the q_multi and
+   rhs_multi data structures */
+EXTERN_ENV
+#include "decs.h"
+/*
+ * Runs linkup on every grid of every processor, first the grids that have
+ * a single layer per processor and then both layers of the grids that have
+ * two, and finally calls link_multi for the q_multi and rhs_multi arrays.
+ */
+void link_all()
+{
+    long proc;
+    long layer;
+    for (proc = 0; proc < nprocs; proc++) {
+        /* one grid per processor */
+        linkup(psium[proc]);
+        linkup(psilm[proc]);
+        linkup(psib[proc]);
+        linkup(ga[proc]);
+        linkup(gb[proc]);
+        linkup(work2[proc]);
+        linkup(work3[proc]);
+        linkup(work6[proc]);
+        linkup(tauz[proc]);
+        linkup(oldga[proc]);
+        linkup(oldgb[proc]);
+        /* two grids per processor */
+        for (layer = 0; layer < 2; layer++) {
+            linkup(psi[proc][layer]);
+            linkup(psim[proc][layer]);
+            linkup(work1[proc][layer]);
+            linkup(work4[proc][layer]);
+            linkup(work5[proc][layer]);
+            linkup(work7[proc][layer]);
+            linkup(temparray[proc][layer]);
+        }
+    }
+    link_multi();
+}
+/*
+ * Fills in the row pointer table at the start of one grid block.
+ *
+ * row_ptr points to a block made of a table of row pointers followed
+ * directly by the row data. Entry r of the table is set to the address of
+ * row r inside the data area. The number of rows and the row length are
+ * derived from the globals im, jm, yprocs and xprocs.
+ */
+void linkup(double **row_ptr)
+{
+    long ncols = (jm - 2) / xprocs + 2;
+    long nrows = (im - 2) / yprocs + 2;
+    /* the data area begins right after the nrows table entries */
+    double *data = (double *) (row_ptr + nrows);
+    long r;
+    for (r = 0; r < nrows; r++) {
+        row_ptr[r] = data + r * ncols;
+    }
+}
+/*
+ * Size in bytes of the block one processor owns at grid level lev: the
+ * table of row pointers plus the row data that follows it.
+ */
+static unsigned long multi_level_bytes(long lev)
+{
+    long nrows = (imx[lev] - 2) / yprocs + 2;
+    long ncols = (jmx[lev] - 2) / xprocs + 2;
+    return nrows * ncols * sizeof(double) + nrows * sizeof(double *);
+}
+/*
+ * Lays out one multigrid array inside the single memory region that grids
+ * points to. From the start of the region the layout is:
+ *   - one pointer per processor, plus one padding pointer if nprocs is odd
+ *   - for each processor: one pointer per level, plus one padding pointer
+ *     if numlev is odd, then for each level a table of row pointers
+ *     followed by the row data
+ * The padding keeps the data double word aligned, as in the original code.
+ * The region is assumed to be large enough; nothing is allocated here.
+ * Addresses are computed in unsigned long, which is assumed to be wide
+ * enough to hold a pointer on the target.
+ */
+static void link_multi_array(double ****grids)
+{
+    long proc;
+    long lev;
+    long r;
+    long nrows;
+    long ncols;
+    double **rows;
+    double *data;
+    unsigned long addr;
+    unsigned long hdr_bytes;
+    unsigned long proc_bytes;
+    /* size of the level table of one processor, padding included */
+    hdr_bytes = numlev * sizeof(double **);
+    if (numlev % 2 == 1) {
+        hdr_bytes += sizeof(double **);
+    }
+    /* size of everything one processor owns */
+    proc_bytes = hdr_bytes;
+    for (lev = 0; lev < numlev; lev++) {
+        proc_bytes += multi_level_bytes(lev);
+    }
+    /* skip the processor table, then hand out one region per processor */
+    addr = ((unsigned long) grids + nprocs * sizeof(double ***));
+    if (nprocs % 2 == 1) {
+        addr += sizeof(double ***);
+    }
+    for (proc = 0; proc < nprocs; proc++) {
+        grids[proc] = (double ***) addr;
+        addr += proc_bytes;
+    }
+    /* inside each processor region, hand out one block per level */
+    for (proc = 0; proc < nprocs; proc++) {
+        addr = (unsigned long) grids[proc];
+        addr += hdr_bytes;
+        for (lev = 0; lev < numlev; lev++) {
+            grids[proc][lev] = (double **) addr;
+            addr += multi_level_bytes(lev);
+        }
+    }
+    /* inside each block, point every row entry at its row of data */
+    for (lev = 0; lev < numlev; lev++) {
+        ncols = (jmx[lev] - 2) / xprocs + 2;
+        nrows = (imx[lev] - 2) / yprocs + 2;
+        for (proc = 0; proc < nprocs; proc++) {
+            rows = grids[proc][lev];
+            data = (double *) (rows + nrows);
+            for (r = 0; r < nrows; r++) {
+                rows[r] = data + r * ncols;
+            }
+        }
+    }
+}
+/*
+ * Sets up all internal pointers of the q_multi and rhs_multi arrays.
+ * Both arrays share the same layout, so the work is done by one helper
+ * that is applied to each of them in turn, q_multi first.
+ */
+void link_multi()
+{
+    link_multi_array(q_multi);
+    link_multi_array(rhs_multi);
+}

soft/giet_vm/applications/ocean/main.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/*************************************************************************/
+/*                                                                       */
+/*  SPLASH Ocean Code                                                    */
+/*                                                                       */
+/*  This application studies the role of eddy and boundary currents in   */
+/*  influencing large-scale ocean movements.  This implementation uses   */
+/*  dynamically allocated four-dimensional arrays for grid data storage. */
+/*                                                                       */
+/*  Command line options:                                                */
+/*                                                                       */
+/*     -mM : Simulate MxM ocean. M must be (power of 2) +2.              */
+/*     -nN : N = number of threads. N must be power of 2.                */
+/*     -eE : E = error tolerance for iterative relaxation.               */
+/*     -rR : R = distance between grid points in meters.                 */
+/*     -tT : T = timestep in seconds.                                    */
+/*     -s  : Print timing statistics.                                    */
+/*     -o  : Print out relaxation residual values.                       */
+/*     -h  : Print out command line options.                             */
+/*                                                                       */
+/*  Default: OCEAN -m514 -n4 -e1e-7 -r20000.0 -t28800.0                  */
+/*                                                                       */
+/*  NOTE: This code works under both the FORK and SPROC models.          */
+/*                                                                       */
+/*************************************************************************/
+MAIN_ENV
+#define DEFAULT_M        514    /* default grid size in each dimension, see option -m */
+#define DEFAULT_N        4      /* default number of threads, see option -n */
+#define DEFAULT_E        1e-7   /* default error tolerance, see option -e */
+#define DEFAULT_T    28800.0    /* default timestep in seconds, see option -t */
+#define DEFAULT_R    20000.0    /* default distance between grid points in meters, see option -r */
+#define UP               0      /* UP to DOWNRIGHT: indices into the neighbors array of a gp entry */
+#define DOWN             1
+#define LEFT             2
+#define RIGHT            3
+#define UPLEFT           4
+#define UPRIGHT          5
+#define DOWNLEFT         6
+#define DOWNRIGHT        7
+#define PAGE_SIZE     4096      /* size in bytes of the pad buffer of each gp entry */
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "decs.h"
+struct multi_struct *multi;
+struct global_struct *global;
+struct locks_struct *locks;
+struct bars_struct *bars;
+struct Global_Private *main_gp;    /* nprocs + 1 entries are allocated in main */
+double ****main_psi;               /* main also stores each main_X pointer in the matching name X */
+double ****main_psim;
+double ***main_psium;
+double ***main_psilm;
+double ***main_psib;
+double ***main_ga;
+double ***main_gb;
+double ****main_work1;
+double ***main_work2;
+double ***main_work3;
+double ****main_work4;
+double ****main_work5;
+double ***main_work6;
+double ****main_work7;
+double ***main_oldga;
+double ***main_oldgb;
+double ****main_q_multi;
+double ****main_rhs_multi;
+double ****temparray;
+double ***tauz;
+long *main_imx;
+long *main_jmx;
+long nprocs = DEFAULT_N;           /* number of threads, set by option -n */
+const double h1 = 1000.0;
+const double h3 = 4000.0;
+const double h = 5000.0;
+const double lf = -5.12e11;
+double res = DEFAULT_R;            /* distance between grid points in meters, set by option -r */
+double dtau = DEFAULT_T;           /* timestep in seconds, set by option -t */
+const double f0 = 8.3e-5;
+const double beta = 2.0e-11;
+const double gpr = 0.02;
+double ysca;                       /* set in main to jmm1 times res */
+long oim;                          /* copy of im taken in main before im is reduced to the per-processor size */
+long jmm1;                         /* jm minus 1, set in main */
+double tolerance = DEFAULT_E;      /* error tolerance, set by option -e */
+const double pi = 3.141592653589793;
+const double t0 = 0.5e-4;
+const double outday0 = 1.0;
+const double outday1 = 2.0;
+const double outday2 = 2.0;
+const double outday3 = 2.0;
+const double maxwork = 10000.0;
+double factjacob;                  /* set in main to -1 / [12 * res * res] */
+double factlap;                    /* set in main to 1 / [res * res] */
+//TODO: replicate these:
+double *main_lev_res;
+double *main_lev_tol;
+double *main_i_int_coeff;
+double *main_j_int_coeff;
+long *main_xpts_per_proc;
+long *main_ypts_per_proc;
+long main_xprocs;
+long main_yprocs;
+long main_numlev;
+double main_eig2;
+long main_im = DEFAULT_M;
+long main_jm;
+long minlevel;
+long do_stats = 1;                 /* toggled by option -s */
+long do_output = 0;                /* toggled by option -o */
+long *ids_procs;                   /* main sets ids_procs[i] to i for each of the nprocs threads */
+__attribute__ ((constructor)) int main(int argc, char *argv[])
+{
+    long i;
+    long j;
+    long k;
+    long x_part;
+    long y_part;
+    long d_size;
+    long itemp;
+    long jtemp;
+    double procsqrt;
+    long temp = 0;
+    double min_total;
+    double max_total;
+    double avg_total;
+    double avg_wait;
+    double max_wait;
+    double min_wait;
+    double min_multi;
+    double max_multi;
+    double avg_multi;
+    double min_frac;
+    double max_frac;
+    double avg_frac;
+    long imax_wait;
+    long imin_wait;
+    long ch;
+    unsigned long long computeend;
+    unsigned long long start;
+    im = main_im;
+    CLOCK(start);
+    while ((ch = getopt(argc, argv, "m:n:e:r:t:soh")) != -1) {
+        switch (ch) {
+        case 'm':
+            im = atoi(optarg);
+            if (log_2(im - 2) == -1) {
+                printerr("Grid must be ((power of 2)+2) in each dimension\n");
+                exit(-1);
+            }
+            break;
+        case 'n':
+            nprocs = atoi(optarg);
+            if (nprocs < 1) {
+                printerr("N must be >= 1\n");
+                exit(-1);
+            }
+            if (log_2(nprocs) == -1) {
+                printerr("N must be a power of 2\n");
+                exit(-1);
+            }
+            break;
+        case 'e':
+            tolerance = atof(optarg);
+            break;
+        case 'r':
+            res = atof(optarg);
+            break;
+        case 't':
+            dtau = atof(optarg);
+            break;
+        case 's':
+            do_stats = !do_stats;
+            break;
+        case 'o':
+            do_output = !do_output;
+            break;
+        case 'h':
+            printf("Usage: ocean <options>\n\n");
+            printf("options:\n");
+            printf("  -mM : Simulate MxM ocean.  M must be (power of 2) + 2 (default = %d).\n", DEFAULT_M);
+            printf("  -nN : N = number of threads. N must be power of 2 (default = %d).\n", DEFAULT_N);
+            printf("  -eE : E = error tolerance for iterative relaxation (default = %f).\n", DEFAULT_E);
+            printf("  -rR : R = distance between grid points in meters (default = %f).\n", DEFAULT_R);
+            printf("  -tT : T = timestep in seconds (default = %f).\n", DEFAULT_T);
+            printf("  -s  : Print timing statistics.\n");
+            printf("  -o  : Print out relaxation residual values.\n");
+            printf("  -h  : Print out command line options.\n\n");
+            exit(0);
+            break;
+        }
+    }
+    MAIN_INITENV
+    jm = im;
+    printf("\n");
+    printf("Ocean simulation with W-cycle multigrid solver\n");
+    printf("    Processors                         : %1ld\n", nprocs);
+    printf("    Grid size                          : %1ld x %1ld\n", im, jm);
+    printf("    Grid resolution (meters)           : %0.2f\n", res);
+    printf("    Time between relaxations (seconds) : %0.0f\n", dtau);
+    printf("    Error tolerance                    : %0.7g\n", tolerance);
+    printf("\n");
+    xprocs = 0;
+    yprocs = 0;
+    procsqrt = sqrt((double) nprocs);
+    j = (long) procsqrt;
+    while ((xprocs == 0) && (j > 0)) {
+        k = nprocs / j;
+        if (k * j == nprocs) {
+            if (k > j) {
+                xprocs = j;
+                yprocs = k;
+            } else {
+                xprocs = k;
+                yprocs = j;
+            }
+        }
+        j--;
+    }
+    if (xprocs == 0) {
+        printerr("Could not find factors for subblocking\n");
+        exit(-1);
+    }
+    minlevel = 0;
+    itemp = 1;
+    jtemp = 1;
+    numlev = 0;
+    minlevel = 0;
+    while (itemp < (im - 2)) {
+        itemp = itemp * 2;
+        jtemp = jtemp * 2;
+        if ((itemp / yprocs > 1) && (jtemp / xprocs > 1)) {
+            numlev++;
+        }
+    }
+    if (numlev == 0) {
+        printerr("Must have at least 2 grid points per processor in each dimension\n");
+        exit(-1);
+    }
+    main_imx = (long *) G_MALLOC(numlev * sizeof(long), 0);
+    main_jmx = (long *) G_MALLOC(numlev * sizeof(long), 0);
+    main_lev_res = (double *) G_MALLOC(numlev * sizeof(double), 0);
+    main_lev_tol = (double *) G_MALLOC(numlev * sizeof(double), 0);
+    main_i_int_coeff = (double *) G_MALLOC(numlev * sizeof(double), 0);
+    main_j_int_coeff = (double *) G_MALLOC(numlev * sizeof(double), 0);
+    main_xpts_per_proc = (long *) G_MALLOC(numlev * sizeof(long), 0);
+    main_ypts_per_proc = (long *) G_MALLOC(numlev * sizeof(long), 0);
+    ids_procs = (long *) G_MALLOC(nprocs * sizeof(long), 0);
+    imx = main_imx;
+    jmx = main_jmx;
+    lev_res = main_lev_res;
+    lev_tol = main_lev_tol;
+    i_int_coeff = main_i_int_coeff;
+    j_int_coeff = main_j_int_coeff;
+    xpts_per_proc = main_xpts_per_proc;
+    ypts_per_proc = main_ypts_per_proc;
+    for (i = 0; i < nprocs; i++) {
+        ids_procs[i] = i;
+    }
+    imx[numlev - 1] = im;
+    jmx[numlev - 1] = jm;
+    lev_res[numlev - 1] = res;
+    lev_tol[numlev - 1] = tolerance;
+    for (i = numlev - 2; i >= 0; i--) {
+        imx[i] = ((imx[i + 1] - 2) / 2) + 2;
+        jmx[i] = ((jmx[i + 1] - 2) / 2) + 2;
+        lev_res[i] = lev_res[i + 1] * 2;
+    }
+    for (i = 0; i < numlev; i++) {
+        xpts_per_proc[i] = (jmx[i] - 2) / xprocs;
+        ypts_per_proc[i] = (imx[i] - 2) / yprocs;
+    }
+    for (i = numlev - 1; i >= 0; i--) {
+        if ((xpts_per_proc[i] < 2) || (ypts_per_proc[i] < 2)) {
+            minlevel = i + 1;
+            break;
+        }
+    }
+    for (i = 0; i < numlev; i++) {
+        temp += imx[i];
+    }
+    temp = 0;
+    j = 0;
+    for (k = 0; k < numlev; k++) {
+        for (i = 0; i < imx[k]; i++) {
+            j++;
+            temp += jmx[k];
+        }
+    }
+    d_size = nprocs * sizeof(double ***);
+    main_psi = (double ****) G_MALLOC(d_size, 0);
+    main_psim = (double ****) G_MALLOC(d_size, 0);
+    main_work1 = (double ****) G_MALLOC(d_size, 0);
+    main_work4 = (double ****) G_MALLOC(d_size, 0);
+    main_work5 = (double ****) G_MALLOC(d_size, 0);
+    main_work7 = (double ****) G_MALLOC(d_size, 0);
+    temparray = (double ****) G_MALLOC(d_size, -1);
+    psi = main_psi;
+    psim = main_psim;
+    work1 = main_work1;
+    work4 = main_work4;
+    work5 = main_work5;
+    work7 = main_work7;
+    d_size = 2 * sizeof(double **);
+    for (i = 0; i < nprocs; i++) {
+        psi[i] = (double ***) G_MALLOC(d_size, i);
+        psim[i] = (double ***) G_MALLOC(d_size, i);
+        work1[i] = (double ***) G_MALLOC(d_size, i);
+        work4[i] = (double ***) G_MALLOC(d_size, i);
+        work5[i] = (double ***) G_MALLOC(d_size, i);
+        work7[i] = (double ***) G_MALLOC(d_size, i);
+        temparray[i] = (double ***) G_MALLOC(d_size, i);
+    }
+    d_size = nprocs * sizeof(double **);
+    main_psium = (double ***) G_MALLOC(d_size, 0);
+    main_psilm = (double ***) G_MALLOC(d_size, 0);
+    main_psib = (double ***) G_MALLOC(d_size, 0);
+    main_ga = (double ***) G_MALLOC(d_size, 0);
+    main_gb = (double ***) G_MALLOC(d_size, 0);
+    main_work2 = (double ***) G_MALLOC(d_size, 0);
+    main_work3 = (double ***) G_MALLOC(d_size, 0);
+    main_work6 = (double ***) G_MALLOC(d_size, 0);
+    tauz = (double ***) G_MALLOC(d_size, 0);
+    main_oldga = (double ***) G_MALLOC(d_size, 0);
+    main_oldgb = (double ***) G_MALLOC(d_size, 0);
+    psium = main_psium;
+    psilm = main_psilm;
+    psib = main_psib;
+    ga = main_ga;
+    gb = main_gb;
+    work2 = main_work2;
+    work3 = main_work3;
+    work6 = main_work6;
+    oldga = main_oldga;
+    oldgb = main_oldgb;
+    main_gp = (struct Global_Private *) G_MALLOC((nprocs + 1) * sizeof(struct Global_Private), -1);
+    gp = main_gp;
+    for (i = 0; i < nprocs; i++) {
+        gp[i].pad = (char *) G_MALLOC(PAGE_SIZE * sizeof(char), i);
+        gp[i].rel_num_x = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].rel_num_y = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].eist = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].ejst = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].oist = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].ojst = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].rlist = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].rljst = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].rlien = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].rljen = (long *) G_MALLOC(numlev * sizeof(long), i);
+        gp[i].neighbors = (long *) G_MALLOC(8 * sizeof(long), i);
+        gp[i].rownum = (long *) G_MALLOC(sizeof(long), i);
+        gp[i].colnum = (long *) G_MALLOC(sizeof(long), i);
+        gp[i].lpid = (long *) G_MALLOC(sizeof(long), i);
+        gp[i].multi_time = (double *) G_MALLOC(sizeof(double), i);
+        gp[i].total_time = (double *) G_MALLOC(sizeof(double), i);
+        gp[i].sync_time = (double *) G_MALLOC(sizeof(double), i);
+        gp[i].process_time = (double *) G_MALLOC(sizeof(double), i);
+        gp[i].step_start = (double *) G_MALLOC(sizeof(double), i);
+        gp[i].steps_time = (double *) G_MALLOC(10 * sizeof(double), i);
+        *gp[i].multi_time = 0;
+        *gp[i].total_time = 0;
+        *gp[i].sync_time = 0;
+        *gp[i].process_time = 0;
+        *gp[i].lpid = i;
+    }
+    subblock();
+    x_part = (jm - 2) / xprocs + 2;
+    y_part = (im - 2) / yprocs + 2;
+    d_size = x_part * y_part * sizeof(double) + y_part * sizeof(double *);
+    global = (struct global_struct *) G_MALLOC(sizeof(struct global_struct), -1);
+    for (i = 0; i < nprocs; i++) {
+        psi[i][0] = (double **) G_MALLOC(d_size, i);
+        psi[i][1] = (double **) G_MALLOC(d_size, i);
+        psim[i][0] = (double **) G_MALLOC(d_size, i);
+        psim[i][1] = (double **) G_MALLOC(d_size, i);
+        psium[i] = (double **) G_MALLOC(d_size, i);
+        psilm[i] = (double **) G_MALLOC(d_size, i);
+        psib[i] = (double **) G_MALLOC(d_size, i);
+        ga[i] = (double **) G_MALLOC(d_size, i);
+        gb[i] = (double **) G_MALLOC(d_size, i);
+        work1[i][0] = (double **) G_MALLOC(d_size, i);
+        work1[i][1] = (double **) G_MALLOC(d_size, i);
+        work2[i] = (double **) G_MALLOC(d_size, i);
+        work3[i] = (double **) G_MALLOC(d_size, i);
+        work4[i][0] = (double **) G_MALLOC(d_size, i);
+        work4[i][1] = (double **) G_MALLOC(d_size, i);
+        work5[i][0] = (double **) G_MALLOC(d_size, i);
+        work5[i][1] = (double **) G_MALLOC(d_size, i);
+        work6[i] = (double **) G_MALLOC(d_size, i);
+        work7[i][0] = (double **) G_MALLOC(d_size, i);
+        work7[i][1] = (double **) G_MALLOC(d_size, i);
+        temparray[i][0] = (double **) G_MALLOC(d_size, i);
+        temparray[i][1] = (double **) G_MALLOC(d_size, i);
+        tauz[i] = (double **) G_MALLOC(d_size, i);
+        oldga[i] = (double **) G_MALLOC(d_size, i);
+        oldgb[i] = (double **) G_MALLOC(d_size, i);
+    }
+    oim = im;
+    //f = (double *) G_MALLOC(oim*sizeof(double), 0);
+    multi = (struct multi_struct *) G_MALLOC(sizeof(struct multi_struct), -1);
+    d_size = numlev * sizeof(double **);
+    if (numlev % 2 == 1) {      /* To make sure that the actual data
+                                   starts double word aligned, add an extra
+                                   pointer */
+        d_size += sizeof(double **);
+    }
+    for (i = 0; i < numlev; i++) {
+        d_size += ((imx[i] - 2) / yprocs + 2) * ((jmx[i] - 2) / xprocs + 2) * sizeof(double) + ((imx[i] - 2) / yprocs + 2) * sizeof(double *);
+    }
+    d_size *= nprocs;
+    if (nprocs % 2 == 1) {      /* To make sure that the actual data
+                                   starts double word aligned, add an extra
+                                   pointer */
+        d_size += sizeof(double ***);
+    }
+    d_size += nprocs * sizeof(double ***);
+    main_q_multi = (double ****) G_MALLOC(d_size, -1);
+    main_rhs_multi = (double ****) G_MALLOC(d_size, -1);
+    q_multi = main_q_multi;
+    rhs_multi = main_rhs_multi;
+    locks = (struct locks_struct *) G_MALLOC(sizeof(struct locks_struct), -1);
+    bars = (struct bars_struct *) G_MALLOC(sizeof(struct bars_struct), -1);
+    LOCKINIT(locks->idlock)
+    LOCKINIT(locks->psiailock)
+    LOCKINIT(locks->psibilock)
+    LOCKINIT(locks->donelock)
+    LOCKINIT(locks->error_lock)
+    LOCKINIT(locks->bar_lock)
+#if defined(MULTIPLE_BARRIERS)
+    BARINIT(bars->iteration, nprocs)
+    BARINIT(bars->gsudn, nprocs)
+    BARINIT(bars->p_setup, nprocs)
+    BARINIT(bars->p_redph, nprocs)
+    BARINIT(bars->p_soln, nprocs)
+    BARINIT(bars->p_subph, nprocs)
+    BARINIT(bars->sl_prini, nprocs)
+    BARINIT(bars->sl_psini, nprocs)
+    BARINIT(bars->sl_onetime, nprocs)
+    BARINIT(bars->sl_phase_1, nprocs)
+    BARINIT(bars->sl_phase_2, nprocs)
+    BARINIT(bars->sl_phase_3, nprocs)
+    BARINIT(bars->sl_phase_4, nprocs)
+    BARINIT(bars->sl_phase_5, nprocs)
+    BARINIT(bars->sl_phase_6, nprocs)
+    BARINIT(bars->sl_phase_7, nprocs)
+    BARINIT(bars->sl_phase_8, nprocs)
+    BARINIT(bars->sl_phase_9, nprocs)
+    BARINIT(bars->sl_phase_10, nprocs)
+    BARINIT(bars->error_barrier, nprocs)
+#else
+    BARINIT(bars->barrier, nprocs)
+#endif
+    link_all();
+    multi->err_multi = 0.0;
+    i_int_coeff[0] = 0.0;
+    j_int_coeff[0] = 0.0;
+    for (i = 0; i < numlev; i++) {
+        i_int_coeff[i] = 1.0 / (imx[i] - 1);
+        j_int_coeff[i] = 1.0 / (jmx[i] - 1);
+    }
+    /*
+       initialize constants and variables
+       id is a global shared variable that has fetch-and-add operations
+       performed on it by processes to obtain their pids.
+     */
+    //global->id = 0;
+    global->trackstart = 0;
+    global->psibi = 0.0;
+    factjacob = -1. / (12. * res * res);
+    factlap = 1. / (res * res);
+    eig2 = -h * f0 * f0 / (h1 * h3 * gpr);
+    jmm1 = jm - 1;
+    ysca = ((double) jmm1) * res;
+    im = (imx[numlev - 1] - 2) / yprocs + 2;
+    jm = (jmx[numlev - 1] - 2) / xprocs + 2;
+    main_im = im;
+    main_jm = jm;
+    main_numlev = numlev;
+    main_xprocs = xprocs;
+    main_yprocs = yprocs;
+    main_eig2 = eig2;
+    if (do_output) {
+        printf("              MULTIGRID OUTPUTS\n");
+    }
+    CREATE(slave, nprocs);
+    WAIT_FOR_END(nprocs);
+    CLOCK(computeend);
+    printf("\n");
+    printf("                PROCESS STATISTICS\n");
+    printf("                  Total          Multigrid         Multigrid\n");
+    printf(" Proc             Time             Time            Fraction\n");
+    printf("    0   %15.0f    %15.0f        %10.3f\n", (*gp[0].total_time), (*gp[0].multi_time), (*gp[0].multi_time) / (*gp[0].total_time));
+    if (do_stats) {
+        double phase_time;
+        min_total = max_total = avg_total = (*gp[0].total_time);
+        min_multi = max_multi = avg_multi = (*gp[0].multi_time);
+        min_frac = max_frac = avg_frac = (*gp[0].multi_time) / (*gp[0].total_time);
+        avg_wait = *gp[0].sync_time;
+        max_wait = *gp[0].sync_time;
+        min_wait = *gp[0].sync_time;
+        imax_wait = 0;
+        imin_wait = 0;
+        for (i = 1; i < nprocs; i++) {
+            if ((*gp[i].total_time) > max_total) {
+                max_total = (*gp[i].total_time);
+            }
+            if ((*gp[i].total_time) < min_total) {
+                min_total = (*gp[i].total_time);
+            }
+            if ((*gp[i].multi_time) > max_multi) {
+                max_multi = (*gp[i].multi_time);
+            }
+            if ((*gp[i].multi_time) < min_multi) {
+                min_multi = (*gp[i].multi_time);
+            }
+            if ((*gp[i].multi_time) / (*gp[i].total_time) > max_frac) {
+                max_frac = (*gp[i].multi_time) / (*gp[i].total_time);
+            }
+            if ((*gp[i].multi_time) / (*gp[i].total_time) < min_frac) {
+                min_frac = (*gp[i].multi_time) / (*gp[i].total_time);
+            }
+            avg_total += (*gp[i].total_time);
+            avg_multi += (*gp[i].multi_time);
+            avg_frac += (*gp[i].multi_time) / (*gp[i].total_time);
+            avg_wait += (*gp[i].sync_time);
+            if (max_wait < (*gp[i].sync_time)) {
+                max_wait = (*gp[i].sync_time);
+                imax_wait = i;
+            }
+            if (min_wait > (*gp[i].sync_time)) {
+                min_wait = (*gp[i].sync_time);
+                imin_wait = i;
+            }
+        }
+        avg_total = avg_total / nprocs;
+        avg_multi = avg_multi / nprocs;
+        avg_frac = avg_frac / nprocs;
+        avg_wait = avg_wait / nprocs;
+        for (i = 1; i < nprocs; i++) {
+            printf("  %3ld   %15.0f    %15.0f        %10.3f\n", i, (*gp[i].total_time), (*gp[i].multi_time), (*gp[i].multi_time) / (*gp[i].total_time));
+        }
+        printf("  Avg   %15.0f    %15.0f        %10.3f\n", avg_total, avg_multi, avg_frac);
+        printf("  Min   %15.0f    %15.0f        %10.3f\n", min_total, min_multi, min_frac);
+        printf("  Max   %15.0f    %15.0f        %10.3f\n", max_total, max_multi, max_frac);
+        printf("\n\n                  Sync\n");
+        printf(" Proc      Time        Fraction\n");
+        for (i = 0; i < nprocs; i++) {
+            printf("  %ld        %u      %f\n", i, (unsigned int)*gp[i].sync_time, *gp[i].sync_time / ((long)(*gp[i].total_time)));
+        }
+        printf("  Avg   %f   %f\n", avg_wait, (double) avg_wait / (long) (computeend - global->trackstart));
+        printf("  Min   %f   %f\n", min_wait, (double) min_wait / (long) (*gp[imin_wait].total_time));
+        printf("  Max   %f   %f\n", max_wait, (double) max_wait / (long) (*gp[imax_wait].total_time));
+        printf("\nPhases Avg :\n\n");
+        for (i = 0; i < 10; i++) {
+            phase_time = 0;
+            for (j = 0; j < nprocs; j++) {
+                phase_time += gp[j].steps_time[i];
+            }
+            phase_time /= (double) nprocs;
+            printf("  %d = %f (fraction %f)\n", i + 1, phase_time, phase_time / (long) (computeend - global->trackstart));
+        }
+    }
+    printf("\n");
+    global->starttime = start;
+    printf("                       TIMING INFORMATION\n");
+    printf("[NPROCS]           : %16ld\n", nprocs);
+    printf("[START1]           : %16llu\n", global->starttime);
+    printf("[START2]           : %16llu\n", global->trackstart);
+    printf("[END]              : %16llu\n", computeend);
+    printf("[TOTAL]            : %16llu\n", computeend - global->starttime);    // With init
+    printf("[PARALLEL_COMPUTE] : %16llu\n", computeend - global->trackstart);   // Without init
+    printf("(excludes first timestep)\n");
+    printf("\n");
+    MAIN_END
+}
+/*
+ * Return the exponent e such that number == 2^e, or -1 when number is not
+ * an exact power of two (this includes zero and negative values).
+ * At most 50 doublings are tried, so powers of two above 2^50 also
+ * yield -1.
+ */
+long log_2(long number)
+{
+    long cumulative = 1;
+    long out = 0;
+    while ((cumulative < number) && (out < 50)) {
+        /* Doubling would step past number (2 * cumulative > number), so
+           number is not a power of two.  Testing with a subtraction
+           keeps cumulative * 2 from overflowing a long. */
+        if (cumulative > number - cumulative) {
+            return (-1);
+        }
+        cumulative = cumulative * 2;
+        out++;
+    }
+    if (cumulative == number) {
+        return (out);
+    }
+    return (-1);
+}
+void printerr(char *s)
+{   /* Report an error: passes s to fprintf on stderr, framed as
+     * "ERROR: " + s + newline.  Returns nothing. */
+    fprintf(stderr, "ERROR: %s\n", s);
+}
+// Local Variables:
+// tab-width: 4
+// c-basic-offset: 4
+// c-file-offsets:((innamespace . 0)(inline-open . 0))
+// indent-tabs-mode: nil
+// End:
+// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=4:softtabstop=4

soft/giet_vm/applications/ocean/multi.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/* Shared memory implementation of the multigrid method
+   Implementation uses red-black gauss-seidel relaxation
+   iterations, w cycles, and the method of half-injection for
+   residual computation. */
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "decs.h"
+/* perform multigrid (w cycles)
+ *
+ * Each pass of the loop does one red and one black relaxation sweep on
+ * the current level k, combines the per-process errors into
+ * multi->err_multi, and then decides what to do next:
+ *   - error still >= lev_tol[k] and shrinking slowly: move to the coarser
+ *     level k - 1 (rescal + putz);
+ *   - error below lev_tol[k] on a coarse level: move to the finer level
+ *     k + 1 (intadd);
+ *   - error below lev_tol[k] on the finest level (numlev - 1): stop.
+ *
+ * my_id: index of the calling process; compared with MASTER and handed to
+ *        the helpers as my_num.
+ *
+ * Calls exit(-1) when the accumulated work wu exceeds maxwork.
+ * NOTE(review): the BARRIER calls use nprocs as participant count, so
+ * presumably all nprocs processes run multig together - confirm against
+ * the callers.
+ */
+void multig(long my_id)
+{
+    long iter;
+    double wu;
+    double errp;
+    long m;
+    long flag;
+    long k;
+    long my_num;
+    double wmax;
+    double local_err;
+    double red_local_err;
+    double black_local_err;
+    double g_error;
+    flag = 0;                       /* becomes 1 when the finest level converges */
+    iter = 0;                       /* counts passes through the loop below */
+    m = numlev - 1;                 /* index of the finest level */
+    wmax = maxwork;                 /* work budget; exceeding it aborts */
+    my_num = my_id;
+    wu = 0.0;                       /* work spent so far */
+    k = m;                          /* current level; start on the finest */
+    g_error = 1.0e30;               /* placeholder until an error has been measured */
+    while (!flag) {
+        errp = g_error;             /* error left by the previous pass */
+        iter++;
+        if (my_num == MASTER) {
+            multi->err_multi = 0.0; /* reset the shared maximum for this pass */
+        }
+/* barrier to make sure all procs have finished intadd or rescal   */
+/* before proceeding with relaxation                               */
+#if defined(MULTIPLE_BARRIERS)
+        BARRIER(bars->error_barrier, nprocs)
+#else
+        BARRIER(bars->barrier, nprocs)
+#endif
+        copy_black(k, my_num);
+        relax(k, &red_local_err, RED_ITER, my_num);
+/* barrier to make sure all red computations have been performed   */
+#if defined(MULTIPLE_BARRIERS)
+        BARRIER(bars->error_barrier, nprocs)
+#else
+        BARRIER(bars->barrier, nprocs)
+#endif
+        copy_red(k, my_num);
+        relax(k, &black_local_err, BLACK_ITER, my_num);
+/* compute max local error from red_local_err and black_local_err  */
+        if (red_local_err > black_local_err) {
+            local_err = red_local_err;
+        } else {
+            local_err = black_local_err;
+        }
+/* update the global error if necessary                         */
+        LOCK(locks->error_lock)
+            if (local_err > multi->err_multi) {
+            multi->err_multi = local_err;
+        }
+        UNLOCK(locks->error_lock)
+/* a single relaxation sweep at the finest level is one unit of    */
+/* work; a sweep on level k therefore adds 4^(k - m)               */
+        wu += pow((double) 4.0, (double) k - m);
+/* barrier to make sure all processors have checked local error    */
+#if defined(MULTIPLE_BARRIERS)
+        BARRIER(bars->error_barrier, nprocs)
+#else
+        BARRIER(bars->barrier, nprocs)
+#endif
+        g_error = multi->err_multi;
+/* barrier to make sure master does not cycle back to top of loop  */
+/* and reset multi->err_multi before we read it and decide what    */
+/* to do                                                           */
+#if defined(MULTIPLE_BARRIERS)
+        BARRIER(bars->error_barrier, nprocs)
+#else
+        BARRIER(bars->barrier, nprocs)
+#endif
+        if (g_error >= lev_tol[k]) {
+            if (wu > wmax) {
+/* max work exceeded                                               */
+                fprintf(stderr, "ERROR: Maximum work limit %0.5f exceeded\n", wmax);
+                exit(-1);
+            } else {
+/* if we have not converged: go coarser only when a coarser level  */
+/* is allowed (k != 0, k > minlevel) and the error kept at least   */
+/* 60% of its previous value                                       */
+                if ((k != 0) && (g_error / errp >= 0.6) && (k > minlevel)) {
+/* if need to go to coarser grid                                   */
+                    copy_borders(k, my_num);
+                    copy_rhs_borders(k, my_num);
+/* This bar is needed because the routine rescal uses the neighbor's
+   border points to compute s4.  We must ensure that the neighbor's
+   border points have been written before we try computing the new
+   rescal values                                                   */
+#if defined(MULTIPLE_BARRIERS)
+                    BARRIER(bars->error_barrier, nprocs)
+#else
+                    BARRIER(bars->barrier, nprocs)
+#endif
+                    rescal(k, my_num);
+/* transfer residual to rhs of coarser grid                        */
+                    lev_tol[k - 1] = 0.3 * g_error; /* tolerance for the coarser level */
+                    k = k - 1;
+                    putz(k, my_num);
+/* make initial guess on coarser grid zero                         */
+                    g_error = 1.0e30;
+                }   /* otherwise: stay on level k and relax again */
+            }
+        } else {
+/* if we have converged at this level                              */
+            if (k == m) {
+/* if finest grid, we are done                                     */
+                flag = 1;
+            } else {
+/* else go to next finest grid                                     */
+                copy_borders(k, my_num);
+                intadd(k, my_num);
+/* changes the grid values at the finer level.  rhs at finer level */
+/* remains what it already is                                      */
+                k++;
+                g_error = 1.0e30;
+            }
+        }
+    }
+    if (do_output) {
+        if (my_num == MASTER) {
+            printf("iter %ld, level %ld, residual norm %12.8e, work = %7.3f\n", iter, k, multi->err_multi, wu);
+        }
+    }
+}
+/* Helper for relax: visit rows row0, row0 + 2, ... (< row_end) and, in
+   each of them, columns col0, col0 + 2, ... (< col_end).  Every visited
+   point is replaced by (left + right + up + down - rhs) / factor and
+   *maxerr is raised to the largest absolute change seen.               */
+static void relax_sweep(double **q, double **rhs, long row0, long row_end,
+                        long col0, long col_end, double factor, double *maxerr)
+{
+    long r;
+    long c;
+    for (r = row0; r < row_end; r += 2) {
+        double *cur = (double *) q[r];
+        double *src = (double *) rhs[r];
+        double *above = (double *) q[r - 1];
+        double *below = (double *) q[r + 1];
+        for (c = col0; c < col_end; c += 2) {
+            double sum = cur[c + 1] + cur[c - 1] + above[c] + below[c] - src[c];
+            double updated = sum / factor;
+            double change = cur[c] - updated;
+            cur[c] = updated;
+            if (fabs(change) > *maxerr) {
+                *maxerr = fabs(change);
+            }
+        }
+    }
+}
+/* perform red or black iteration (not both)
+ *
+ * k:      level whose grid q_multi[my_num][k] is relaxed in place, using
+ *         rhs_multi[my_num][k] as right-hand side.
+ * err:    receives the largest absolute change made to any point.
+ * color:  RED_ITER or BLACK_ITER; any other value leaves the grid
+ *         untouched and stores 0.0 in *err.
+ * my_num: index of the calling process.
+ */
+void relax(long k, double *err, long color, long my_num)
+{
+    double spacing;
+    double factor;
+    double worst;
+    double **q;
+    double **rhs;
+    long even_row;
+    long even_col;
+    long odd_row;
+    long odd_col;
+    long row_end;
+    long col_end;
+    *err = 0.0;
+    spacing = lev_res[k];
+/* points whose sum of row and col index is even do a red iteration, */
+/* others do a black                                                 */
+    even_row = gp[my_num].eist[k];
+    even_col = gp[my_num].ejst[k];
+    odd_row = gp[my_num].oist[k];
+    odd_col = gp[my_num].ojst[k];
+    row_end = gp[my_num].rlien[k];
+    col_end = gp[my_num].rljen[k];
+    factor = 4.0 - eig2 * spacing * spacing;
+    worst = 0.0;
+    q = (double **) q_multi[my_num][k];
+    rhs = (double **) rhs_multi[my_num][k];
+    if (color == RED_ITER) {
+/* red: even rows with even columns, then odd rows with odd columns  */
+        relax_sweep(q, rhs, even_row, row_end, even_col, col_end, factor, &worst);
+        relax_sweep(q, rhs, odd_row, row_end, odd_col, col_end, factor, &worst);
+    } else if (color == BLACK_ITER) {
+/* black: even rows with odd columns, then odd rows with even columns */
+        relax_sweep(q, rhs, even_row, row_end, odd_col, col_end, factor, &worst);
+        relax_sweep(q, rhs, odd_row, row_end, even_col, col_end, factor, &worst);
+    }
+    *err = worst;
+}
+/* perform half-injection to next coarsest level
+ *
+ * For every coarse point (ic, jc) of level kf - 1 in this process's range
+ * (rows rlist..rlien-1, columns rljst..rljen-1) the routine evaluates
+ *     2.0 * (rhs - (sum of the four neighbours) + factor * q)
+ * on the level-kf grid at the fine points (if17, jf), (if17, jf - 2),
+ * (if17 - 2, jf) and (if17 - 2, jf - 2), with if17 = 2 * ic and
+ * jf = 2 * jc.  The four values s1..s4 are blended with the weights
+ * i_int_factor and j_int_factor and the result is stored in
+ * rhs_multi[my_num][kf - 1][ic][jc].
+ *
+ * kf:     index of the fine level; the coarse level is kf - 1.
+ * my_num: index of the calling process in gp, q_multi and rhs_multi.
+ *
+ * When if17 == 2 or jf == 2 the stencil needs row/column -1, which is
+ * read from the q_multi grid of the UP / LEFT neighbour process instead;
+ * if that neighbour does not exist (-1), s2, s3 and s4 are taken as 0.
+ */
+void rescal(long kf, long my_num)
+{
+    long ic;
+    long if17;
+    long jf;
+    long jc;
+    long krc;
+    long istart;
+    long iend;
+    long jstart;
+    long jend;
+    double hf;
+    double s;
+    double s1;
+    double s2;
+    double s3;
+    double s4;
+    double factor;
+    double int1;
+    double int2;
+    double i_int_factor;
+    double j_int_factor;
+    long i_off;
+    long j_off;
+    long up_proc;
+    long left_proc;
+    long im;
+    long jm;
+    double temp;
+    double temp2;
+    double **t2a;
+    double **t2b;
+    double **t2c;
+    double *t1a;
+    double *t1b;
+    double *t1c;
+    double *t1d;
+    double *t1e;
+    double *t1f;
+    double *t1g;
+    double *t1h;
+    krc = kf - 1;               /* coarse level */
+    //hc = lev_res[krc];
+    hf = lev_res[kf];
+/* offsets of this process's block, in coarse points                */
+    i_off = (*gp[my_num].rownum) * ypts_per_proc[krc];
+    j_off = (*gp[my_num].colnum) * xpts_per_proc[krc];
+    up_proc = gp[my_num].neighbors[UP];
+    left_proc = gp[my_num].neighbors[LEFT];
+/* im - 1 / jm - 1 index the neighbour row / column that stands in  */
+/* for local row / column -1                                        */
+    im = (imx[kf] - 2) / yprocs;
+    jm = (jmx[kf] - 2) / xprocs;
+    istart = gp[my_num].rlist[krc];
+    jstart = gp[my_num].rljst[krc];
+    iend = gp[my_num].rlien[krc] - 1;
+    jend = gp[my_num].rljen[krc] - 1;
+    factor = 4.0 - eig2 * hf * hf;
+    t2a = (double **) q_multi[my_num][kf];
+    t2b = (double **) rhs_multi[my_num][kf];
+    t2c = (double **) rhs_multi[my_num][krc];
+    if17 = 2 * (istart - 1);
+    for (ic = istart; ic <= iend; ic++) {
+        if17 += 2;
+        i_int_factor = (ic + i_off) * i_int_coeff[krc] * 0.5;
+        jf = 2 * (jstart - 1);
+        t1a = (double *) t2a[if17];
+        t1b = (double *) t2b[if17];
+        t1c = (double *) t2c[ic];
+        t1d = (double *) t2a[if17 - 1];
+        t1e = (double *) t2a[if17 + 1];
+        t1f = (double *) t2a[if17 - 2];
+/* Row if17 - 3 only exists locally when if17 >= 3.  For if17 == 2  */
+/* it would be t2a[-1], outside the row-pointer array; every use of */
+/* t1g below is skipped in that case, so do not load it at all.     */
+        if (if17 >= 3) {
+            t1g = (double *) t2a[if17 - 3];
+        } else {
+            t1g = NULL;
+        }
+        t1h = (double *) t2b[if17 - 2];
+        for (jc = jstart; jc <= jend; jc++) {
+            jf += 2;
+            j_int_factor = (jc + j_off) * j_int_coeff[krc] * 0.5;
+/*             method of half-injection uses 2.0 instead of 4.0 */
+/* do bilinear interpolation */
+            s = t1a[jf + 1] + t1a[jf - 1] + t1d[jf] + t1e[jf];
+            s1 = 2.0 * (t1b[jf] - s + factor * t1a[jf]);
+            if (((if17 == 2) && (gp[my_num].neighbors[UP] == -1)) || ((jf == 2) && (gp[my_num].neighbors[LEFT] == -1))) {
+/* the stencil would need a neighbour process that does not exist   */
+                s2 = 0;
+                s3 = 0;
+                s4 = 0;
+            } else if ((if17 == 2) || (jf == 2)) {
+/* first local row and/or column: fetch the missing values from the */
+/* UP / LEFT neighbour process                                      */
+                if (jf == 2) {
+                    temp = q_multi[left_proc][kf][if17][jm - 1];
+                } else {
+                    temp = t1a[jf - 3];
+                }
+                s = t1a[jf - 1] + temp + t1d[jf - 2] + t1e[jf - 2];
+                s2 = 2.0 * (t1b[jf - 2] - s + factor * t1a[jf - 2]);
+                if (if17 == 2) {
+                    temp = q_multi[up_proc][kf][im - 1][jf];
+                } else {
+                    temp = t1g[jf];
+                }
+                s = t1f[jf + 1] + t1f[jf - 1] + temp + t1d[jf];
+                s3 = 2.0 * (t1h[jf] - s + factor * t1f[jf]);
+                if (jf == 2) {
+                    temp = q_multi[left_proc][kf][if17 - 2][jm - 1];
+                } else {
+                    temp = t1f[jf - 3];
+                }
+                if (if17 == 2) {
+                    temp2 = q_multi[up_proc][kf][im - 1][jf - 2];
+                } else {
+                    temp2 = t1g[jf - 2];
+                }
+                s = t1f[jf - 1] + temp + temp2 + t1d[jf - 2];
+                s4 = 2.0 * (t1h[jf - 2] - s + factor * t1f[jf - 2]);
+            } else {
+/* all stencil points are available in the local grid               */
+                s = t1a[jf - 1] + t1a[jf - 3] + t1d[jf - 2] + t1e[jf - 2];
+                s2 = 2.0 * (t1b[jf - 2] - s + factor * t1a[jf - 2]);
+                s = t1f[jf + 1] + t1f[jf - 1] + t1g[jf] + t1d[jf];
+                s3 = 2.0 * (t1h[jf] - s + factor * t1f[jf]);
+                s = t1f[jf - 1] + t1f[jf - 3] + t1g[jf - 2] + t1d[jf - 2];
+                s4 = 2.0 * (t1h[jf - 2] - s + factor * t1f[jf - 2]);
+            }
+            int1 = j_int_factor * s4 + (1.0 - j_int_factor) * s3;
+            int2 = j_int_factor * s2 + (1.0 - j_int_factor) * s1;
+            //int_val = i_int_factor*int1+(1.0-i_int_factor)*int2;
+            t1c[jc] = i_int_factor * int1 + (1.0 - i_int_factor) * int2;
+        }
+    }
+}
+/* perform interpolation and addition to next finest grid
+ *
+ * For every coarse point (ic, jc) of level kc in this process's range
+ * (rows rlist..rlien-1, columns rljst..rljen-1) the routine adds weighted
+ * combinations of the coarse values in rows ic - 1, ic, ic + 1 and
+ * columns jc - 1, jc, jc + 1 to the four fine points (if17 - 1, jf - 1),
+ * (if17, jf - 1), (if17 - 1, jf) and (if17, jf) of level kc + 1, with
+ * if17 = 2 * ic and jf = 2 * jc.  Only q_multi[my_num][kc + 1] is
+ * modified.
+ *
+ * kc:     index of the coarse level; the fine level is kc + 1.
+ * my_num: index of the calling process in gp and q_multi.
+ */
+void intadd(long kc, long my_num)
+{
+    long ic;
+    long if17;
+    long jf;
+    long jc;
+    long kf;
+    long istart;
+    long jstart;
+    long iend;
+    long jend;
+    double int1;
+    double int2;
+    double i_int_factor1;
+    double j_int_factor1;
+    double i_int_factor2;
+    double j_int_factor2;
+    long i_off;
+    long j_off;
+    double **t2a;
+    double **t2b;
+    double *t1a;
+    double *t1b;
+    double *t1c;
+    double *t1d;
+    double *t1e;
+    kf = kc + 1;                /* fine level */
+    //hc = lev_res[kc];
+    //hf = lev_res[kf];
+    istart = gp[my_num].rlist[kc];
+    jstart = gp[my_num].rljst[kc];
+    iend = gp[my_num].rlien[kc] - 1;
+    jend = gp[my_num].rljen[kc] - 1;
+    i_off = (*gp[my_num].rownum) * ypts_per_proc[kc];   /* block offsets in coarse points */
+    j_off = (*gp[my_num].colnum) * xpts_per_proc[kc];
+    t2a = (double **) q_multi[my_num][kc];      /* coarse grid (read) */
+    t2b = (double **) q_multi[my_num][kf];      /* fine grid (updated) */
+    if17 = 2 * (istart - 1);
+    for (ic = istart; ic <= iend; ic++) {
+        if17 += 2;              /* if17 == 2 * ic */
+        i_int_factor1 = ((imx[kc] - 2) - (ic + i_off - 1)) * (i_int_coeff[kf]);    /* weight of row ic - 1 for fine row if17 - 1 */
+        i_int_factor2 = (ic + i_off) * i_int_coeff[kf];                            /* weight of row ic + 1 for fine row if17 */
+        jf = 2 * (jstart - 1);
+        t1a = (double *) t2a[ic];           /* coarse row ic */
+        t1b = (double *) t2a[ic - 1];       /* coarse row ic - 1 */
+        t1c = (double *) t2a[ic + 1];       /* coarse row ic + 1 */
+        t1d = (double *) t2b[if17];         /* fine row if17 */
+        t1e = (double *) t2b[if17 - 1];     /* fine row if17 - 1 */
+        for (jc = jstart; jc <= jend; jc++) {
+            jf += 2;            /* jf == 2 * jc */
+            j_int_factor1 = ((jmx[kc] - 2) - (jc + j_off - 1)) * (j_int_coeff[kf]);    /* weight of column jc - 1 for fine column jf - 1 */
+            j_int_factor2 = (jc + j_off) * j_int_coeff[kf];                            /* weight of column jc + 1 for fine column jf */
+            int1 = j_int_factor1 * t1a[jc - 1] + (1.0 - j_int_factor1) * t1a[jc];
+            int2 = j_int_factor1 * t1b[jc - 1] + (1.0 - j_int_factor1) * t1b[jc];
+            t1e[jf - 1] += i_int_factor1 * int2 + (1.0 - i_int_factor1) * int1;       /* fine (if17 - 1, jf - 1) */
+            int2 = j_int_factor1 * t1c[jc - 1] + (1.0 - j_int_factor1) * t1c[jc];
+            t1d[jf - 1] += i_int_factor2 * int2 + (1.0 - i_int_factor2) * int1;       /* fine (if17, jf - 1) */
+            int1 = j_int_factor2 * t1a[jc + 1] + (1.0 - j_int_factor2) * t1a[jc];
+            int2 = j_int_factor2 * t1b[jc + 1] + (1.0 - j_int_factor2) * t1b[jc];
+            t1e[jf] += i_int_factor1 * int2 + (1.0 - i_int_factor1) * int1;           /* fine (if17 - 1, jf) */
+            int2 = j_int_factor2 * t1c[jc + 1] + (1.0 - j_int_factor2) * t1c[jc];
+            t1d[jf] += i_int_factor2 * int2 + (1.0 - i_int_factor2) * int1;           /* fine (if17, jf) */
+        }
+    }
+}
+/* initialize a grid to zero in parallel
+ *
+ * Stores 0.0 in q_multi[my_num][k] for rows rlist[k]..rlien[k] and
+ * columns rljst[k]..rljen[k] of the calling process, both bounds
+ * included.
+ */
+void putz(long k, long my_num)
+{
+    double **grid = (double **) q_multi[my_num][k];
+    long row_first = gp[my_num].rlist[k];
+    long col_first = gp[my_num].rljst[k];
+    long row_last = gp[my_num].rlien[k];
+    long col_last = gp[my_num].rljen[k];
+    long row;
+    for (row = row_first; row <= row_last; row++) {
+        double *line = (double *) grid[row];
+        long col = col_first;
+        while (col <= col_last) {
+            line[col] = 0.0;
+            col++;
+        }
+    }
+}
+void copy_borders(long k, long pid)
+{   /*
+     * Refresh the border cells (row 0, row im - 1, column 0, column
+     * jm - 1) of q_multi[pid][k] from the grids of the neighbouring
+     * processes listed in gp[pid].neighbors; a neighbour entry of -1
+     * means "no neighbour" and the matching cells are left untouched.
+     *
+     * k:   level whose grid is refreshed.
+     * pid: index of the calling process.
+     *
+     * Order: the four corners from the diagonal neighbours, then corner
+     * fix-ups for processes that lack an UP, DOWN, LEFT or RIGHT
+     * neighbour, then the four edges.
+     * NOTE(review): the fix-ups form an else-if chain, so a process
+     * lacking several neighbours only runs the first matching branch -
+     * confirm this is intended.
+     */
+    long i;
+    long j;
+    long jj;
+    long im;
+    long jm;
+    long lastrow;
+    long lastcol;
+    double **t2a;
+    double **t2b;
+    double *t1a;
+    double *t1b;
+    im = (imx[k] - 2) / yprocs + 2;     /* lastrow + 2; im - 1 is the bottom border row */
+    jm = (jmx[k] - 2) / xprocs + 2;     /* lastcol + 2; jm - 1 is the right border column */
+    lastrow = (imx[k] - 2) / yprocs;    /* last row copied by the edge loops */
+    lastcol = (jmx[k] - 2) / xprocs;    /* last column copied by the edge loops */
+    t2a = (double **) q_multi[pid][k];
+    jj = gp[pid].neighbors[UPLEFT];     /* corners from the diagonal neighbours */
+    if (jj != -1) {
+        t2a[0][0] = q_multi[jj][k][im - 2][jm - 2];
+    }
+    jj = gp[pid].neighbors[UPRIGHT];
+    if (jj != -1) {
+        t2a[0][jm - 1] = q_multi[jj][k][im - 2][1];
+    }
+    jj = gp[pid].neighbors[DOWNLEFT];
+    if (jj != -1) {
+        t2a[im - 1][0] = q_multi[jj][k][1][jm - 2];
+    }
+    jj = gp[pid].neighbors[DOWNRIGHT];
+    if (jj != -1) {
+        t2a[im - 1][jm - 1] = q_multi[jj][k][1][1];
+    }
+    if (gp[pid].neighbors[UP] == -1) {  /* no UP neighbour: corners come from the side or DOWN neighbours */
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[0][0] = q_multi[jj][k][0][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][0] = q_multi[jj][k][1][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[0][jm - 1] = q_multi[jj][k][0][1];
+        } else {
+            jj = gp[pid].neighbors[DOWN];
+            if (jj != -1) {
+                t2a[im - 1][jm - 1] = q_multi[jj][k][1][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[DOWN] == -1) { /* no DOWN neighbour */
+        jj = gp[pid].neighbors[LEFT];
+        if (jj != -1) {
+            t2a[im - 1][0] = q_multi[jj][k][im - 1][jm - 2];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][0] = q_multi[jj][k][im - 2][0];
+            }
+        }
+        jj = gp[pid].neighbors[RIGHT];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = q_multi[jj][k][im - 1][1];
+        } else {
+            jj = gp[pid].neighbors[UP];
+            if (jj != -1) {
+                t2a[0][jm - 1] = q_multi[jj][k][im - 2][jm - 1];
+            }
+        }
+    } else if (gp[pid].neighbors[LEFT] == -1) { /* no LEFT neighbour: left corners from UP / DOWN */
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][0] = q_multi[jj][k][im - 2][0];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][0] = q_multi[jj][k][1][0];
+        }
+    } else if (gp[pid].neighbors[RIGHT] == -1) {    /* no RIGHT neighbour: right corners from UP / DOWN */
+        jj = gp[pid].neighbors[UP];
+        if (jj != -1) {
+            t2a[0][jm - 1] = q_multi[jj][k][im - 2][jm - 1];
+        }
+        jj = gp[pid].neighbors[DOWN];
+        if (jj != -1) {
+            t2a[im - 1][jm - 1] = q_multi[jj][k][1][jm - 1];
+        }
+    }
+    j = gp[pid].neighbors[UP];          /* top border row <- row im - 2 of the UP neighbour */
+    if (j != -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) q_multi[j][k][im - 2];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[DOWN];        /* bottom border row <- row 1 of the DOWN neighbour */
+    if (j != -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) q_multi[j][k][1];
+        for (i = 1; i <= lastcol; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[pid].neighbors[LEFT];        /* left border column <- column jm - 2 of the LEFT neighbour */
+    if (j != -1) {
+        t2b = (double **) q_multi[j][k];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][0] = t2b[i][jm - 2];
+        }
+    }
+    j = gp[pid].neighbors[RIGHT];       /* right border column <- column 1 of the RIGHT neighbour */
+    if (j != -1) {
+        t2b = (double **) q_multi[j][k];
+        for (i = 1; i <= lastrow; i++) {
+            t2a[i][jm - 1] = t2b[i][1];
+        }
+    }
+}
+/*
+ * Refresh part of the border of rhs_multi[procid][k] from neighbouring
+ * processes: the corner [0][0] from the UPLEFT neighbour, the even
+ * columns 2, 4, ... of row 0 from the UP neighbour and the even rows
+ * 2, 4, ... of column 0 from the LEFT neighbour.  A neighbour entry of -1
+ * means there is nothing to copy.
+ */
+void copy_rhs_borders(long k, long procid)
+{
+    /* last row / column copied; equal to im - 2 and jm - 2 of the
+       original formulation                                           */
+    long last_row = (imx[k] - 2) / yprocs;
+    long last_col = (jmx[k] - 2) / xprocs;
+    double **mine = (double **) rhs_multi[procid][k];
+    long nbr;
+    long idx;
+    nbr = gp[procid].neighbors[UPLEFT];
+    if (nbr != -1) {
+        mine[0][0] = rhs_multi[nbr][k][last_row][last_col];
+    }
+    nbr = gp[procid].neighbors[UP];
+    if (nbr != -1) {
+        double *dst = (double *) mine[0];
+        double *src = (double *) rhs_multi[nbr][k][last_row];
+        idx = 2;
+        while (idx <= last_col) {
+            dst[idx] = src[idx];
+            idx += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[LEFT];
+    if (nbr != -1) {
+        double **theirs = (double **) rhs_multi[nbr][k];
+        idx = 2;
+        while (idx <= last_row) {
+            mine[idx][0] = theirs[idx][last_col];
+            idx += 2;
+        }
+    }
+}
+/*
+ * Copy every second border cell of q_multi[procid][k] from the four
+ * direct neighbours: even columns of the top row (from UP), odd columns
+ * of the bottom row (from DOWN), even rows of the left column (from
+ * LEFT) and odd rows of the right column (from RIGHT).  Neighbours equal
+ * to -1 are skipped.
+ */
+void copy_red(long k, long procid)
+{
+    /* nrows == im - 2 and ncols == jm - 2 of the original formulation */
+    long nrows = (imx[k] - 2) / yprocs;
+    long ncols = (jmx[k] - 2) / xprocs;
+    double **own = (double **) q_multi[procid][k];
+    double **other;
+    double *dst;
+    double *src;
+    long nbr;
+    long n;
+    nbr = gp[procid].neighbors[UP];
+    if (nbr != -1) {
+        dst = (double *) own[0];
+        src = (double *) q_multi[nbr][k][nrows];
+        n = 2;
+        while (n <= ncols) {
+            dst[n] = src[n];
+            n += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[DOWN];
+    if (nbr != -1) {
+        dst = (double *) own[nrows + 1];
+        src = (double *) q_multi[nbr][k][1];
+        n = 1;
+        while (n <= ncols) {
+            dst[n] = src[n];
+            n += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[LEFT];
+    if (nbr != -1) {
+        other = (double **) q_multi[nbr][k];
+        n = 2;
+        while (n <= nrows) {
+            own[n][0] = other[n][ncols];
+            n += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[RIGHT];
+    if (nbr != -1) {
+        other = (double **) q_multi[nbr][k];
+        n = 1;
+        while (n <= nrows) {
+            own[n][ncols + 1] = other[n][1];
+            n += 2;
+        }
+    }
+}
+/*
+ * Copy every second border cell of q_multi[procid][k] from the four
+ * direct neighbours: odd columns of the top row (from UP), even columns
+ * of the bottom row (from DOWN), odd rows of the left column (from LEFT)
+ * and even rows of the right column (from RIGHT).  Neighbours equal to -1
+ * are skipped.
+ */
+void copy_black(long k, long procid)
+{
+    /* nrows == im - 2 and ncols == jm - 2 of the original formulation */
+    long nrows = (imx[k] - 2) / yprocs;
+    long ncols = (jmx[k] - 2) / xprocs;
+    double **own = (double **) q_multi[procid][k];
+    double **other;
+    double *dst;
+    double *src;
+    long nbr;
+    long n;
+    nbr = gp[procid].neighbors[UP];
+    if (nbr != -1) {
+        dst = (double *) own[0];
+        src = (double *) q_multi[nbr][k][nrows];
+        n = 1;
+        while (n <= ncols) {
+            dst[n] = src[n];
+            n += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[DOWN];
+    if (nbr != -1) {
+        dst = (double *) own[nrows + 1];
+        src = (double *) q_multi[nbr][k][1];
+        n = 2;
+        while (n <= ncols) {
+            dst[n] = src[n];
+            n += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[LEFT];
+    if (nbr != -1) {
+        other = (double **) q_multi[nbr][k];
+        n = 1;
+        while (n <= nrows) {
+            own[n][0] = other[n][ncols];
+            n += 2;
+        }
+    }
+    nbr = gp[procid].neighbors[RIGHT];
+    if (nbr != -1) {
+        other = (double **) q_multi[nbr][k];
+        n = 2;
+        while (n <= nrows) {
+            own[n][ncols + 1] = other[n][1];
+            n += 2;
+        }
+    }
+}

soft/giet_vm/applications/ocean/slave1.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/*    ****************
+      subroutine slave
+      ****************  */
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "decs.h"
+void slave(long *ptr_procid)
+{  /* Per-process worker of the ocean simulation.
+      ptr_procid: points to this process's id; it is read once into procid.
+      Returns nothing: all results are written into arrays and structures
+      declared outside this function (psib, psim, psi, tauz, global, ...).
+      Order of work: zero oldga/oldgb; initialise psium, psilm and psib;
+      build rhs_multi and q_multi from psib and call multig(); add this
+      process's weighted psib sum to global->psibi; zero psim and psi;
+      fill tauz; then run the time-stepping loop (one slave2() call per
+      step) until iday >= outday3. */
+    long i;
+    long j;
+    long nstep;  /* number of completed time steps (private to this process) */
+    long iindex;
+    long iday;  /* integer part of day; assigned in the time loop before it is tested */
+    double ysca1;
+    double y;
+    double factor;
+    double sintemp;
+    double curlt;
+    double ressqr;
+    long istart;
+    long iend;
+    long jstart;
+    long jend;
+    long ist;
+    long ien;
+    long jst;
+    long jen;
+    double fac;
+    long dayflag = 0;  /* set to 1 in a step where day > outday0 */
+    long dhourflag = 0;  /* set to 1 in a step where dhour >= 86400.0 */
+    long endflag = 0;  /* set to 1 once iday >= outday3; ends the outer loop */
+    long firstrow;
+    long lastrow;
+    long numrows;
+    long firstcol;
+    long lastcol;
+    long numcols;
+    long psiindex;
+    double psibipriv;  /* this process's private partial sum of psib */
+    double ttime;
+    double dhour;
+    double day;
+    long procid;
+    long j_off = 0;  /* column offset used when filling tauz; set to colnum * numcols below */
+    unsigned long t1;  /* receives the value produced by CLOCK() */
+    double **t2a;  /* t2a, t2b: scratch pointers to a 2-D array (table of row pointers) */
+    double **t2b;
+    double *t1a;  /* t1a..t1d: scratch pointers to a single row */
+    double *t1b;
+    double *t1c;
+    double *t1d;
+    /*
+       Disabled code: the id used to be taken from a shared counter under
+       locks->idlock; it is now read from *ptr_procid (next statement).
+       LOCK(locks->idlock)
+       procid = global->id;
+       global->id = global->id+1;
+       UNLOCK(locks->idlock)
+     */
+    procid = *ptr_procid;
+    ressqr = lev_res[numlev - 1] * lev_res[numlev - 1];  /* used below to scale psib into rhs_multi and in fac */
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_prini, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/* POSSIBLE ENHANCEMENT:  Here is where one might pin processes to
+   processors to avoid migration. */
+/* POSSIBLE ENHANCEMENT:  Here is where one might distribute
+   data structures across physically distributed memories as
+   desired.
+   One way to do this is as follows.  The function allocate(START,SIZE,I)
+   is assumed to place all addresses x such that
+   (START <= x < START+SIZE) on node I.
+   long d_size;
+   unsigned long g_size;
+   unsigned long mg_size;
+   if (procid == MASTER) {
+     g_size = ((jmx[numlev-1]-2)/xprocs+2)*((imx[numlev-1]-2)/yprocs+2)*sizeof(double) +
+              ((imx[numlev-1]-2)/yprocs+2)*sizeof(double *);
+     mg_size = numlev*sizeof(double **);
+     for (i=0;i<numlev;i++) {
+       mg_size+=((imx[i]-2)/yprocs+2)*((jmx[i]-2)/xprocs+2)*sizeof(double)+
+                ((imx[i]-2)/yprocs+2)*sizeof(double *);
+     }
+     for (i= 0;i<nprocs;i++) {
+       d_size = 2*sizeof(double **);
+       allocate((unsigned long) psi[i],d_size,i);
+       allocate((unsigned long) psim[i],d_size,i);
+       allocate((unsigned long) work1[i],d_size,i);
+       allocate((unsigned long) work4[i],d_size,i);
+       allocate((unsigned long) work5[i],d_size,i);
+       allocate((unsigned long) work7[i],d_size,i);
+       allocate((unsigned long) temparray[i],d_size,i);
+       allocate((unsigned long) psi[i][0],g_size,i);
+       allocate((unsigned long) psi[i][1],g_size,i);
+       allocate((unsigned long) psim[i][0],g_size,i);
+       allocate((unsigned long) psim[i][1],g_size,i);
+       allocate((unsigned long) psium[i],g_size,i);
+       allocate((unsigned long) psilm[i],g_size,i);
+       allocate((unsigned long) psib[i],g_size,i);
+       allocate((unsigned long) ga[i],g_size,i);
+       allocate((unsigned long) gb[i],g_size,i);
+       allocate((unsigned long) work1[i][0],g_size,i);
+       allocate((unsigned long) work1[i][1],g_size,i);
+       allocate((unsigned long) work2[i],g_size,i);
+       allocate((unsigned long) work3[i],g_size,i);
+       allocate((unsigned long) work4[i][0],g_size,i);
+       allocate((unsigned long) work4[i][1],g_size,i);
+       allocate((unsigned long) work5[i][0],g_size,i);
+       allocate((unsigned long) work5[i][1],g_size,i);
+       allocate((unsigned long) work6[i],g_size,i);
+       allocate((unsigned long) work7[i][0],g_size,i);
+       allocate((unsigned long) work7[i][1],g_size,i);
+       allocate((unsigned long) temparray[i][0],g_size,i);
+       allocate((unsigned long) temparray[i][1],g_size,i);
+       allocate((unsigned long) tauz[i],g_size,i);
+       allocate((unsigned long) oldga[i],g_size,i);
+       allocate((unsigned long) oldgb[i],g_size,i);
+       d_size = numlev * sizeof(long);
+       allocate((unsigned long) gp[i].rel_num_x,d_size,i);
+       allocate((unsigned long) gp[i].rel_num_y,d_size,i);
+       allocate((unsigned long) gp[i].eist,d_size,i);
+       allocate((unsigned long) gp[i].ejst,d_size,i);
+       allocate((unsigned long) gp[i].oist,d_size,i);
+       allocate((unsigned long) gp[i].ojst,d_size,i);
+       allocate((unsigned long) gp[i].rlist,d_size,i);
+       allocate((unsigned long) gp[i].rljst,d_size,i);
+       allocate((unsigned long) gp[i].rlien,d_size,i);
+       allocate((unsigned long) gp[i].rljen,d_size,i);
+       allocate((unsigned long) q_multi[i],mg_size,i);
+       allocate((unsigned long) rhs_multi[i],mg_size,i);
+       allocate((unsigned long) &(gp[i]),sizeof(struct Global_Private),i);
+     }
+   }
+*/
+    t2a = (double **) oldga[procid];  /* zero all im x jm elements of this process's oldga and oldgb */
+    t2b = (double **) oldgb[procid];
+    for (i = 0; i < im; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (j = 0; j < jm; j++) {
+            t1a[j] = 0.0;
+            t1b[j] = 0.0;
+        }
+    }
+    firstcol = 1;  /* interior of the local sub-grid: rows firstrow..lastrow, columns firstcol..lastcol;
+                      rows 0 and im - 1 and columns 0 and jm - 1 are treated separately below */
+    lastcol = firstcol + gp[procid].rel_num_x[numlev - 1] - 1;
+    firstrow = 1;
+    lastrow = firstrow + gp[procid].rel_num_y[numlev - 1] - 1;
+    numcols = gp[procid].rel_num_x[numlev - 1];
+    numrows = gp[procid].rel_num_y[numlev - 1];
+    j_off = (*gp[procid].colnum) * numcols;
+    /*
+       Disabled code (psinum is not declared in this function):
+       if (procid > nprocs/2) {
+       psinum = 2;
+       } else {
+       psinum = 1;
+       }
+     */
+/* every process gets its own copy of the timing variables to avoid
+   contention at shared memory locations.  here, these variables
+   are initialized.  */
+    ttime = 0.0;
+    dhour = 0.0;
+    nstep = 0;
+    day = 0.0;
+    ysca1 = 0.5 * ysca;
+    if (*gp[procid].lpid == MASTER) {  /* only a process whose *lpid equals MASTER allocates and fills f */
+        f = (double *) G_MALLOC(oim * sizeof(double), procid);  /* NOTE(review): result is not checked before use */
+        t1a = (double *) f;
+        for (iindex = 0; iindex <= jmx[numlev - 1] - 1; iindex++) {  /* NOTE(review): oim elements are allocated but
+                                                                      jmx[numlev - 1] are written; confirm oim >= jmx[numlev - 1] */
+            y = ((double) iindex) * res;
+            t1a[iindex] = f0 + beta * (y - ysca1);
+        }
+    }
+    t2a = (double **) psium[procid];  /* psium: set to 0.0.  A neighbors[] entry of -1 is the "no neighbour
+                                         process on that side" value (elsewhere the entry is used as a process
+                                         index), so this process then also writes that edge row/column and any
+                                         corner where two such edges meet; the interior is always written */
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 0.0;
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {  /* here j is a row index */
+            t2a[j][0] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = 0.0;
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = 0.0;
+        }
+    }
+    t2a = (double **) psilm[procid];  /* psilm: set to 0.0 with the same corner / edge / interior pattern */
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 0.0;
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = 0.0;
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = 0.0;
+        }
+    }
+    t2a = (double **) psib[procid];  /* psib: corners and edges written by this process get 1.0, the interior gets 0.0 */
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 1.0;
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 1.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 1.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 1.0;
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 1.0;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 1.0;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = 1.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = 1.0;
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = 0.0;
+        }
+    }
+/* wait until all processes have completed the above initialization  */
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_prini, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/* compute psib array (one-time computation) and integrate into psibi.
+   ist..ien and jst..jen keep the interior index range; istart..iend and
+   jstart..jend are widened to 0 and/or im - 1, jm - 1 on every side where
+   this process has no neighbour. */
+        istart = 1;
+    iend = istart + gp[procid].rel_num_y[numlev - 1] - 1;
+    jstart = 1;
+    jend = jstart + gp[procid].rel_num_x[numlev - 1] - 1;
+    ist = istart;
+    ien = iend;
+    jst = jstart;
+    jen = jend;
+    if (gp[procid].neighbors[UP] == -1) {
+        istart = 0;
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        jstart = 0;
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        iend = im - 1;
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        jend = jm - 1;
+    }
+    t2a = (double **) rhs_multi[procid][numlev - 1];  /* rhs_multi at level numlev - 1 = psib * ressqr over the widened range */
+    t2b = (double **) psib[procid];
+    for (i = istart; i <= iend; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (j = jstart; j <= jend; j++) {
+            t1a[j] = t1b[j] * ressqr;
+        }
+    }
+    t2a = (double **) q_multi[procid][numlev - 1];  /* copy psib's neighbour-less edges into q_multi at level numlev - 1 */
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        for (j = jstart; j <= jend; j++) {
+            t1a[j] = t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        for (j = jstart; j <= jend; j++) {
+            t1a[j] = t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (i = istart; i <= iend; i++) {
+            t2a[i][0] = t2b[i][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (i = istart; i <= iend; i++) {
+            t2a[i][jm - 1] = t2b[i][jm - 1];
+        }
+    }
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_psini, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+    t2a = (double **) psib[procid];  /* copy psib border values from neighbouring processes: row 0 from the UP
+                                        neighbour's row im - 2, row im - 1 from the DOWN neighbour's row 1,
+                                        column 0 from the LEFT neighbour's column jm - 2, column jm - 1 from
+                                        the RIGHT neighbour's column 1.  In this part j holds the neighbour's
+                                        process id and i is the running index. */
+    j = gp[procid].neighbors[UP];
+    if (j != -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) psib[j][im - 2];
+        for (i = 1; i < jm - 1; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[procid].neighbors[DOWN];
+    if (j != -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) psib[j][1];
+        for (i = 1; i < jm - 1; i++) {
+            t1a[i] = t1b[i];
+        }
+    }
+    j = gp[procid].neighbors[LEFT];
+    if (j != -1) {
+        t2b = (double **) psib[j];
+        for (i = 1; i < im - 1; i++) {
+            t2a[i][0] = t2b[i][jm - 2];
+        }
+    }
+    j = gp[procid].neighbors[RIGHT];
+    if (j != -1) {
+        t2b = (double **) psib[j];
+        for (i = 1; i < im - 1; i++) {
+            t2a[i][jm - 1] = t2b[i][1];
+        }
+    }
+    t2a = (double **) q_multi[procid][numlev - 1];  /* interior of q_multi = fac * (sum of the four neighbouring psib
+                                                       values - ressqr * centre value), before multig() is called */
+    t2b = (double **) psib[procid];
+    fac = 1.0 / (4.0 - ressqr * eig2);
+    for (i = ist; i <= ien; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2b[i - 1];
+        t1d = (double *) t2b[i + 1];
+        for (j = jst; j <= jen; j++) {
+            t1a[j] = fac * (t1d[j] + t1c[j] + t1b[j + 1] + t1b[j - 1] - ressqr * t1b[j]);
+        }
+    }
+    multig(procid);  /* defined elsewhere; presumably the multigrid solve working on q_multi / rhs_multi - confirm */
+    for (i = istart; i <= iend; i++) {  /* copy q_multi (t2a) back into psib (t2b) over the widened range */
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (j = jstart; j <= jend; j++) {
+            t1b[j] = t1a[j];
+        }
+    }
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_prini, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/* update the local running sum psibipriv by summing all the resulting
+   values in that process's share of the psib matrix.
+   Weights used below: 0.25 for a corner, 0.5 for an edge element,
+   1.0 for an interior element.  */
+    t2a = (double **) psib[procid];
+    psibipriv = 0.0;
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        psibipriv = psibipriv + 0.25 * (t2a[0][0]);
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        psibipriv = psibipriv + 0.25 * (t2a[0][jm - 1]);
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        psibipriv = psibipriv + 0.25 * (t2a[im - 1][0]);
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        psibipriv = psibipriv + 0.25 * (t2a[im - 1][jm - 1]);
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            psibipriv = psibipriv + 0.5 * t1a[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            psibipriv = psibipriv + 0.5 * t1a[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            psibipriv = psibipriv + 0.5 * t2a[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            psibipriv = psibipriv + 0.5 * t2a[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            psibipriv = psibipriv + t1a[iindex];
+        }
+    }
+/* update the shared variable psibi by summing all the psibiprivs
+   of the individual processes into it.  note that this combined
+   private and shared sum method avoids accessing the shared
+   variable psibi once for every element of the matrix.  */
+    LOCK(locks->psibilock);
+    global->psibi = global->psibi + psibipriv;
+    UNLOCK(locks->psibilock);
+/* initialize psim matrices: every process sets its own share of both
+   psim[procid][0] and psim[procid][1] to 0.0.
+   NOTE(review): this comment used to describe splitting the processes
+   between the two psim matrices; the loop below does not do that.  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) psim[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = 0.0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = 0.0;
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = 0.0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = 0.0;
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            t1a = (double *) t2a[0];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = 0.0;
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            t1a = (double *) t2a[im - 1];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = 0.0;
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = 0.0;
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = 0.0;
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = 0.0;
+            }
+        }
+    }
+/* initialize psi matrices the same way  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) psi[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = 0.0;
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = 0.0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = 0.0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = 0.0;
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            t1a = (double *) t2a[0];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = 0.0;
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            t1a = (double *) t2a[im - 1];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = 0.0;
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = 0.0;
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = 0.0;
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = 0.0;
+            }
+        }
+    }
+/* compute input curl of wind stress: every written tauz element is
+   factor * sin(pi * (column + j_off) * res / ysca1), except column 0
+   (left edge and left corners), which is set to 0.0 */
+    t2a = (double **) tauz[procid];
+    ysca1 = .5 * ysca;  /* same value as assigned to ysca1 earlier in this function */
+    factor = -t0 * pi / ysca1;
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        sintemp = pi * ((double) jm - 1 + j_off) * res / ysca1;
+        sintemp = sin(sintemp);
+        t2a[0][jm - 1] = factor * sintemp;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        sintemp = pi * ((double) jm - 1 + j_off) * res / ysca1;
+        sintemp = sin(sintemp);
+        t2a[im - 1][jm - 1] = factor * sintemp;
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            sintemp = pi * ((double) j + j_off) * res / ysca1;
+            sintemp = sin(sintemp);
+            curlt = factor * sintemp;
+            t1a[j] = curlt;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            sintemp = pi * ((double) j + j_off) * res / ysca1;
+            sintemp = sin(sintemp);
+            curlt = factor * sintemp;
+            t1a[j] = curlt;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        sintemp = pi * ((double) jm - 1 + j_off) * res / ysca1;  /* the same value is used for every row of the right edge */
+        sintemp = sin(sintemp);
+        curlt = factor * sintemp;
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = curlt;
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            sintemp = pi * ((double) iindex + j_off) * res / ysca1;
+            sintemp = sin(sintemp);
+            curlt = factor * sintemp;
+            t1a[iindex] = curlt;
+        }
+    }
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_onetime, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/***************************************************************
+ one-time stuff over at this point
+ ***************************************************************/
+    while (!endflag) {  /* outer loop: ends once iday >= outday3 (tested at the bottom) */
+        while ((!dayflag) || (!dhourflag)) {  /* inner loop: one time step per pass, until a step sets both flags */
+            dayflag = 0;
+            dhourflag = 0;
+            if (nstep == 1) {  /* nstep is 0 on the first pass, so the timers are (re)started after one completed step */
+                for (i = 0; i < 10; i++) {
+                    gp[procid].steps_time[i] = 0;
+                }
+                if (procid == MASTER) {
+                    CLOCK(global->trackstart)
+                }
+                if ((procid == MASTER) || (do_stats)) {
+                    CLOCK(t1);
+                    (*gp[procid].total_time) = t1;  /* start value; turned into an elapsed time at the end of slave() */
+                    (*gp[procid].multi_time) = 0;
+                }
+/* POSSIBLE ENHANCEMENT:  Here is where one might reset the
+   statistics that one is measuring about the parallel execution */
+            }
+            slave2(procid, firstrow, lastrow, numrows, firstcol, lastcol, numcols);  /* performs one time step */
+/* update time and step number
+   note that these time and step variables are private i.e. every
+   process has its own copy and keeps track of its own time  */
+            ttime = ttime + dtau;
+            nstep = nstep + 1;
+            day = ttime / 86400.0;
+            if (day > ((double) outday0)) {  /* dhour only accumulates once day has passed outday0 */
+                dayflag = 1;
+                iday = (long) day;
+                dhour = dhour + dtau;
+                if (dhour >= 86400.0) {
+                    dhourflag = 1;
+                }
+            }
+        }
+        dhour = 0.0;  /* the flags keep their values, so the inner loop is not re-entered if endflag stays 0;
+                         NOTE(review): confirm that is intended */
+        t2a = (double **) psium[procid];  /* update values of psium array to psium + psim[procid][0] */
+        t2b = (double **) psim[procid][0];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = t2a[0][0] + t2b[0][0];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = t2a[im - 1][0] + t2b[im - 1][0];
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = t2a[0][jm - 1] + t2b[0][jm - 1];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = t2a[im - 1][jm - 1] + t2b[im - 1][jm - 1];
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            t1a = (double *) t2a[0];
+            t1b = (double *) t2b[0];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = t1a[j] + t1b[j];
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            t1a = (double *) t2a[im - 1];
+            t1b = (double *) t2b[im - 1];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = t1a[j] + t1b[j];
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = t2a[j][0] + t2b[j][0];
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = t2a[j][jm - 1] + t2b[j][jm - 1];
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            t1b = (double *) t2b[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = t1a[iindex] + t1b[iindex];
+            }
+        }
+/* update values of psilm array to psilm + psim[procid][1]
+   (this comment used to say psim[2], presumably counting from 1)  */
+        t2a = (double **) psilm[procid];
+        t2b = (double **) psim[procid][1];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = t2a[0][0] + t2b[0][0];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = t2a[im - 1][0] + t2b[im - 1][0];
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = t2a[0][jm - 1] + t2b[0][jm - 1];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = t2a[im - 1][jm - 1] + t2b[im - 1][jm - 1];
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            t1a = (double *) t2a[0];
+            t1b = (double *) t2b[0];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = t1a[j] + t1b[j];
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            t1a = (double *) t2a[im - 1];
+            t1b = (double *) t2b[im - 1];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = t1a[j] + t1b[j];
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = t2a[j][0] + t2b[j][0];
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = t2a[j][jm - 1] + t2b[j][jm - 1];
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            t1b = (double *) t2b[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = t1a[iindex] + t1b[iindex];
+            }
+        }
+        if (iday >= (long) outday3) {
+            endflag = 1;
+        }
+    }
+    if ((procid == MASTER) || (do_stats)) {  /* total_time becomes the clock difference since the value stored at nstep == 1 */
+        CLOCK(t1);
+        (*gp[procid].total_time) = t1 - (*gp[procid].total_time);
+    }
+}

soft/giet_vm/applications/ocean/slave2.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+/*    ****************
+      subroutine slave2
+      ****************  */
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "decs.h"
+void slave2(long procid, long firstrow, long lastrow, long numrows, long firstcol, long lastcol, long numcols)
+{
+    long i;
+    long j;
+    long iindex;
+    double hh1;
+    double hh3;
+    double hinv;
+    double h1inv;
+    long istart;
+    long iend;
+    long jstart;
+    long jend;
+    long ist;
+    long ien;
+    long jst;
+    long jen;
+    double ressqr;
+    double psiaipriv;
+    double f4;
+    double timst;
+    long psiindex;
+    long i_off;
+    long j_off;
+    long multi_start;
+    long multi_end;
+    double **t2a;
+    double **t2b;
+    double **t2c;
+    double **t2d;
+    double **t2e;
+    double **t2f;
+    double **t2g;
+    double **t2h;
+    double *t1a;
+    double *t1b;
+    double *t1c;
+    double *t1d;
+    double *t1e;
+    double *t1f;
+    double *t1g;
+    double *t1h;
+    ressqr = lev_res[numlev - 1] * lev_res[numlev - 1];
+    i_off = (*gp[procid].rownum) * numrows;
+    j_off = (*gp[procid].colnum) * numcols;
+    START_PHASE(procid, 1);
+/*   ***************************************************************
+          f i r s t     p h a s e   (of timestep calculation)
+     ***************************************************************/
+    t2a = (double **) ga[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 0.0;
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = 0.0;
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = 0.0;
+        }
+    }
+    t2a = (double **) gb[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = 0.0;
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = 0.0;
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = 0.0;
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = 0.0;
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = 0.0;
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = 0.0;
+        }
+    }
+/* put the laplacian of psi{1,3} in work1{1,2}
+   note that psi(i,j,2) represents the psi3 array in
+   the original equations  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) work1[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = 0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = 0;
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = 0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = 0;
+        }
+        laplacalc(procid, psi, work1, psiindex, firstrow, lastrow, firstcol, lastcol);
+    }
+/* set values of work2 array to psi1 - psi3   */
+    t2a = (double **) work2[procid];
+    t2b = (double **) psi[procid][0];
+    t2c = (double **) psi[procid][1];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = t2b[0][0] - t2c[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = t2b[im - 1][0] - t2c[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = t2b[0][jm - 1] - t2c[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = t2b[im - 1][jm - 1] - t2c[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        t1c = (double *) t2c[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1b[j] - t1c[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        t1c = (double *) t2c[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1b[j] - t1c[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = t2b[j][0] - t2c[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = t2b[j][jm - 1] - t2c[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = t1b[iindex] - t1c[iindex];
+        }
+    }
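+/* set values of work3 array to h3/h * psi1 + h1/h * psi3
+   NOTE(review): the statements below read work3's own previous contents
+   (t2a) where this formula says psi1 (t2b); confirm which is intended  */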
+    t2a = (double **) work3[procid];
+    hh3 = h3 / h;
+    hh1 = h1 / h;
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = hh3 * t2a[0][0] + hh1 * t2c[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = hh3 * t2a[im - 1][0] + hh1 * t2c[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = hh3 * t2a[0][jm - 1] + hh1 * t2c[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = hh3 * t2a[im - 1][jm - 1] + hh1 * t2c[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        for (j = firstcol; j <= lastcol; j++) {
+            t2a[0][j] = hh3 * t2a[0][j] + hh1 * t2c[0][j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        for (j = firstcol; j <= lastcol; j++) {
+            t2a[im - 1][j] = hh3 * t2a[im - 1][j] + hh1 * t2c[im - 1][j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = hh3 * t2a[j][0] + hh1 * t2c[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = hh3 * t2a[j][jm - 1] + hh1 * t2c[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1c = (double *) t2c[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = hh3 * t1a[iindex] + hh1 * t1c[iindex];
+        }
+    }
+/* set values of temparray{1,3} to psi{1,3}  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) temparray[procid][psiindex];
+        t2b = (double **) psi[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = t2b[0][0];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = t2b[im - 1][0];
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = t2b[0][jm - 1];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = t2b[im - 1][jm - 1];
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            for (j = firstcol; j <= lastcol; j++) {
+                t2a[0][j] = t2b[0][j];
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            for (j = firstcol; j <= lastcol; j++) {
+                t2a[im - 1][j] = t2b[im - 1][j];
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = t2b[j][0];
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = t2b[j][jm - 1];
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            t1b = (double *) t2b[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = t1b[iindex];
+            }
+        }
+    }
+    END_PHASE(procid, 1);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_1, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*     *******************************************************
+              s e c o n d   p h a s e
+       *******************************************************
+   set values of psi{1,3} to psim{1,3}   */
+    START_PHASE(procid, 2);
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) psi[procid][psiindex];
+        t2b = (double **) psim[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = t2b[0][0];
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = t2b[0][jm - 1];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = t2b[im - 1][0];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = t2b[im - 1][jm - 1];
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            for (j = firstcol; j <= lastcol; j++) {
+                t2a[0][j] = t2b[0][j];
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            for (j = firstcol; j <= lastcol; j++) {
+                t2a[im - 1][j] = t2b[im - 1][j];
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = t2b[j][0];
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = t2b[j][jm - 1];
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            t1b = (double *) t2b[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = t1b[iindex];
+            }
+        }
+    }
+/* put the laplacian of the psim array
+   into the work7 array; first part of a three-laplacian
+   calculation to compute the friction terms  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) work7[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = 0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = 0;
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = 0;
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = 0;
+        }
+        laplacalc(procid, psim, work7, psiindex, firstrow, lastrow, firstcol, lastcol);
+    }
+/* to the values of the work1{1,2} arrays obtained from the
+   laplacians of psi{1,3} in the previous phase, add to the
+   elements of every column the corresponding value in the
+   one-dimensional f array  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) work1[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = t2a[0][0] + f[0];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = t2a[im - 1][0] + f[0];
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = t2a[0][jm - 1] + f[jmx[numlev - 1] - 1];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = t2a[im - 1][jm - 1] + f[jmx[numlev - 1] - 1];
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            for (j = firstcol; j <= lastcol; j++) {
+                t2a[0][j] = t2a[0][j] + f[j + j_off];
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            for (j = firstcol; j <= lastcol; j++) {
+                t2a[im - 1][j] = t2a[im - 1][j] + f[j + j_off];
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = t2a[j][0] + f[j + i_off];
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = t2a[j][jm - 1] + f[j + i_off];
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = t1a[iindex] + f[iindex + j_off];
+            }
+        }
+    }
+    END_PHASE(procid, 2);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_2, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*      *******************************************************
+                 t h i r d   p h a s e
+        *******************************************************
+   put the jacobian of the work1{1,2} and psi{1,3} arrays
+   (the latter currently in temparray) in the work5{1,2} arrays  */
+    START_PHASE(procid, 3);
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        jacobcalc2(work1, temparray, work5, psiindex, procid, firstrow, lastrow, firstcol, lastcol);
+    }
+/* set values of psim{1,3} to temparray{1,3}  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        t2a = (double **) psim[procid][psiindex];
+        t2b = (double **) temparray[procid][psiindex];
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[0][0] = t2b[0][0];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+            t2a[im - 1][0] = t2b[im - 1][0];
+        }
+        if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[0][jm - 1] = t2b[0][jm - 1];
+        }
+        if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+            t2a[im - 1][jm - 1] = t2b[im - 1][jm - 1];
+        }
+        if (gp[procid].neighbors[UP] == -1) {
+            t1a = (double *) t2a[0];
+            t1b = (double *) t2b[0];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = t1b[j];
+            }
+        }
+        if (gp[procid].neighbors[DOWN] == -1) {
+            t1a = (double *) t2a[im - 1];
+            t1b = (double *) t2b[im - 1];
+            for (j = firstcol; j <= lastcol; j++) {
+                t1a[j] = t1b[j];
+            }
+        }
+        if (gp[procid].neighbors[LEFT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][0] = t2b[j][0];
+            }
+        }
+        if (gp[procid].neighbors[RIGHT] == -1) {
+            for (j = firstrow; j <= lastrow; j++) {
+                t2a[j][jm - 1] = t2b[j][jm - 1];
+            }
+        }
+        for (i = firstrow; i <= lastrow; i++) {
+            t1a = (double *) t2a[i];
+            t1b = (double *) t2b[i];
+            for (iindex = firstcol; iindex <= lastcol; iindex++) {
+                t1a[iindex] = t1b[iindex];
+            }
+        }
+    }
+/* put the laplacian of the work7{1,2} arrays in the work4{1,2}
+   arrays; second step in the three-laplacian friction calculation  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        laplacalc(procid, work7, work4, psiindex, firstrow, lastrow, firstcol, lastcol);
+    }
+    END_PHASE(procid, 3);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_3, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*     *******************************************************
+                f o u r t h   p h a s e
+       *******************************************************
+   put the jacobian of the work2 and work3 arrays in the work6
+   array  */
+    START_PHASE(procid, 4);
+    jacobcalc(work2, work3, work6, procid, firstrow, lastrow, firstcol, lastcol);
+/* put the laplacian of the work4{1,2} arrays in the work7{1,2}
+   arrays; third step in the three-laplacian friction calculation  */
+    for (psiindex = 0; psiindex <= 1; psiindex++) {
+        laplacalc(procid, work4, work7, psiindex, firstrow, lastrow, firstcol, lastcol);
+    }
+    END_PHASE(procid, 4);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_4, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*     *******************************************************
+                f i f t h   p h a s e
+       *******************************************************
+   use the values of the work5, work6 and work7 arrays
+   computed in the previous phases to compute the
+   ga and gb arrays   */
+    START_PHASE(procid, 5);
+    hinv = 1.0 / h;
+    h1inv = 1.0 / h1;
+    t2a = (double **) ga[procid];
+    t2b = (double **) gb[procid];
+    t2c = (double **) work5[procid][0];
+    t2d = (double **) work5[procid][1];
+    t2e = (double **) work7[procid][0];
+    t2f = (double **) work7[procid][1];
+    t2g = (double **) work6[procid];
+    t2h = (double **) tauz[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = t2c[0][0] - t2d[0][0] + eig2 * t2g[0][0] + h1inv * t2h[0][0] + lf * t2e[0][0] - lf * t2f[0][0];
+        t2b[0][0] = hh1 * t2c[0][0] + hh3 * t2d[0][0] + hinv * t2h[0][0] + lf * hh1 * t2e[0][0] + lf * hh3 * t2f[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = t2c[im - 1][0] - t2d[im - 1][0] + eig2 * t2g[im - 1][0] + h1inv * t2h[im - 1][0] + lf * t2e[im - 1][0] - lf * t2f[im - 1][0];
+        t2b[im - 1][0] = hh1 * t2c[im - 1][0] + hh3 * t2d[im - 1][0] + hinv * t2h[im - 1][0] + lf * hh1 * t2e[im - 1][0] + lf * hh3 * t2f[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = t2c[0][jm - 1] - t2d[0][jm - 1] + eig2 * t2g[0][jm - 1] + h1inv * t2h[0][jm - 1] + lf * t2e[0][jm - 1] - lf * t2f[0][jm - 1];
+        t2b[0][jm - 1] = hh1 * t2c[0][jm - 1] + hh3 * t2d[0][jm - 1] + hinv * t2h[0][jm - 1] + lf * hh1 * t2e[0][jm - 1] + lf * hh3 * t2f[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = t2c[im - 1][jm - 1] - t2d[im - 1][jm - 1] + eig2 * t2g[im - 1][jm - 1] + h1inv * t2h[im - 1][jm - 1] + lf * t2e[im - 1][jm - 1] - lf * t2f[im - 1][jm - 1];
+        t2b[im - 1][jm - 1] = hh1 * t2c[im - 1][jm - 1] + hh3 * t2d[im - 1][jm - 1] + hinv * t2h[im - 1][jm - 1] + lf * hh1 * t2e[im - 1][jm - 1] + lf * hh3 * t2f[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        t1c = (double *) t2c[0];
+        t1d = (double *) t2d[0];
+        t1e = (double *) t2e[0];
+        t1f = (double *) t2f[0];
+        t1g = (double *) t2g[0];
+        t1h = (double *) t2h[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1c[j] - t1d[j] + eig2 * t1g[j] + h1inv * t1h[j] + lf * t1e[j] - lf * t1f[j];
+            t1b[j] = hh1 * t1c[j] + hh3 * t1d[j] + hinv * t1h[j] + lf * hh1 * t1e[j] + lf * hh3 * t1f[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        t1c = (double *) t2c[im - 1];
+        t1d = (double *) t2d[im - 1];
+        t1e = (double *) t2e[im - 1];
+        t1f = (double *) t2f[im - 1];
+        t1g = (double *) t2g[im - 1];
+        t1h = (double *) t2h[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1c[j] - t1d[j] + eig2 * t1g[j] + h1inv * t1h[j] + lf * t1e[j] - lf * t1f[j];
+            t1b[j] = hh1 * t1c[j] + hh3 * t1d[j] + hinv * t1h[j] + lf * hh1 * t1e[j] + lf * hh3 * t1f[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = t2c[j][0] - t2d[j][0] + eig2 * t2g[j][0] + h1inv * t2h[j][0] + lf * t2e[j][0] - lf * t2f[j][0];
+            t2b[j][0] = hh1 * t2c[j][0] + hh3 * t2d[j][0] + hinv * t2h[j][0] + lf * hh1 * t2e[j][0] + lf * hh3 * t2f[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = t2c[j][jm - 1] - t2d[j][jm - 1] + eig2 * t2g[j][jm - 1] + h1inv * t2h[j][jm - 1] + lf * t2e[j][jm - 1] - lf * t2f[j][jm - 1];
+            t2b[j][jm - 1] = hh1 * t2c[j][jm - 1] + hh3 * t2d[j][jm - 1] + hinv * t2h[j][jm - 1] + lf * hh1 * t2e[j][jm - 1] + lf * hh3 * t2f[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        t1d = (double *) t2d[i];
+        t1e = (double *) t2e[i];
+        t1f = (double *) t2f[i];
+        t1g = (double *) t2g[i];
+        t1h = (double *) t2h[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = t1c[iindex] - t1d[iindex] + eig2 * t1g[iindex] + h1inv * t1h[iindex] + lf * t1e[iindex] - lf * t1f[iindex];
+            t1b[iindex] = hh1 * t1c[iindex] + hh3 * t1d[iindex] + hinv * t1h[iindex] + lf * hh1 * t1e[iindex] + lf * hh3 * t1f[iindex];
+        }
+    }
+    END_PHASE(procid, 5);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_5, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*     *******************************************************
+               s i x t h   p h a s e
+       *******************************************************  */
+    START_PHASE(procid, 6);
+    istart = 1;
+    iend = istart + gp[procid].rel_num_y[numlev - 1] - 1;
+    jstart = 1;
+    jend = jstart + gp[procid].rel_num_x[numlev - 1] - 1;
+    ist = istart;
+    ien = iend;
+    jst = jstart;
+    jen = jend;
+    if (gp[procid].neighbors[UP] == -1) {
+        istart = 0;
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        jstart = 0;
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        iend = im - 1;
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        jend = jm - 1;
+    }
+    t2a = (double **) rhs_multi[procid][numlev - 1];
+    t2b = (double **) ga[procid];
+    t2c = (double **) oldga[procid];
+    t2d = (double **) q_multi[procid][numlev - 1];
+    for (i = istart; i <= iend; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (j = jstart; j <= jend; j++) {
+            t1a[j] = t1b[j] * ressqr;
+        }
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1d = (double *) t2d[0];
+        t1b = (double *) t2b[0];
+        for (j = jstart; j <= jend; j++) {
+            t1d[j] = t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1d = (double *) t2d[im - 1];
+        t1b = (double *) t2b[im - 1];
+        for (j = jstart; j <= jend; j++) {
+            t1d[j] = t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (i = istart; i <= iend; i++) {
+            t2d[i][0] = t2b[i][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (i = istart; i <= iend; i++) {
+            t2d[i][jm - 1] = t2b[i][jm - 1];
+        }
+    }
+    //fac = 1.0 / (4.0 - ressqr*eig2);
+    for (i = ist; i <= ien; i++) {
+        t1d = (double *) t2d[i];
+        t1c = (double *) t2c[i];
+        for (j = jst; j <= jen; j++) {
+            t1d[j] = t1c[j];
+        }
+    }
+    if ((procid == MASTER) || (do_stats)) {
+        CLOCK(multi_start);
+    }
+    multig(procid);
+    if ((procid == MASTER) || (do_stats)) {
+        CLOCK(multi_end);
+        (*gp[procid].multi_time) += (multi_end - multi_start);
+    }
+/* the shared sum variable psiai is initialized to 0 at
+   every time-step  */
+    if (procid == MASTER) {
+        global->psiai = 0.0;
+    }
+/*  copy the solution for use as initial guess in next time-step  */
+    for (i = istart; i <= iend; i++) {
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        t1d = (double *) t2d[i];
+        for (j = jstart; j <= jend; j++) {
+            t1b[j] = t1d[j];
+            t1c[j] = t1d[j];
+        }
+    }
+    END_PHASE(procid, 6);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_6, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*     *******************************************************
+                s e v e n t h   p h a s e
+       *******************************************************
+   every process computes the running sum for its assigned portion
+   in a private variable psiaipriv   */
+    START_PHASE(procid, 7);
+    psiaipriv = 0.0;
+    t2a = (double **) ga[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        psiaipriv = psiaipriv + 0.25 * (t2a[0][0]);
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        psiaipriv = psiaipriv + 0.25 * (t2a[0][jm - 1]);
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        psiaipriv = psiaipriv + 0.25 * (t2a[im - 1][0]);
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        psiaipriv = psiaipriv + 0.25 * (t2a[im - 1][jm - 1]);
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            psiaipriv = psiaipriv + 0.5 * t1a[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            psiaipriv = psiaipriv + 0.5 * t1a[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            psiaipriv = psiaipriv + 0.5 * t2a[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            psiaipriv = psiaipriv + 0.5 * t2a[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            psiaipriv = psiaipriv + t1a[iindex];
+        }
+    }
+/* after computing its private sum, every process adds that to the
+   shared running sum psiai  */
+    LOCK(locks->psiailock)
+    global->psiai = global->psiai + psiaipriv;
+    UNLOCK(locks->psiailock)
+    END_PHASE(procid, 7);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_7, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*      *******************************************************
+                e i g h t h   p h a s e
+        *******************************************************
+   augment ga(i,j) with [-psiai/psibi]*psib(i,j) */
+    START_PHASE(procid, 8);
+    f4 = (-global->psiai) /(global->psibi);
+    t2a = (double **) ga[procid];
+    t2b = (double **) psib[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = t2a[0][0] + f4 * t2b[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = t2a[im - 1][0] + f4 * t2b[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = t2a[0][jm - 1] + f4 * t2b[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = t2a[im - 1][jm - 1] + f4 * t2b[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1a[j] + f4 * t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1a[j] + f4 * t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = t2a[j][0] + f4 * t2b[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = t2a[j][jm - 1] + f4 * t2b[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = t1a[iindex] + f4 * t1b[iindex];
+        }
+    }
+    t2a = (double **) rhs_multi[procid][numlev - 1];
+    t2b = (double **) gb[procid];
+    t2c = (double **) oldgb[procid];
+    t2d = (double **) q_multi[procid][numlev - 1];
+    for (i = istart; i <= iend; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (j = jstart; j <= jend; j++) {
+            t1a[j] = t1b[j] * ressqr;
+        }
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1d = (double *) t2d[0];
+        t1b = (double *) t2b[0];
+        for (j = jstart; j <= jend; j++) {
+            t1d[j] = t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1d = (double *) t2d[im - 1];
+        t1b = (double *) t2b[im - 1];
+        for (j = jstart; j <= jend; j++) {
+            t1d[j] = t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (i = istart; i <= iend; i++) {
+            t2d[i][0] = t2b[i][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (i = istart; i <= iend; i++) {
+            t2d[i][jm - 1] = t2b[i][jm - 1];
+        }
+    }
+    //fac = 1.0 / (4.0 - ressqr*eig2);
+    for (i = ist; i <= ien; i++) {
+        t1d = (double *) t2d[i];
+        t1c = (double *) t2c[i];
+        for (j = jst; j <= jen; j++) {
+            t1d[j] = t1c[j];
+        }
+    }
+    if ((procid == MASTER) || (do_stats)) {
+        CLOCK(multi_start);
+    }
+    multig(procid);
+    if ((procid == MASTER) || (do_stats)) {
+        CLOCK(multi_end);
+        (*gp[procid].multi_time) += (multi_end - multi_start);
+    }
+    for (i = istart; i <= iend; i++) {
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        t1d = (double *) t2d[i];
+        for (j = jstart; j <= jend; j++) {
+            t1b[j] = t1d[j];
+            t1c[j] = t1d[j];
+        }
+    }
+    END_PHASE(procid, 8);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_8, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*      *******************************************************
+                n i n t h   p h a s e
+        *******************************************************
+   put appropriate linear combinations of ga and gb in work2 and work3;
+   note that here (as in most cases) the constant multipliers are made
+   private variables; the specific order in which things are done is
+   chosen in order to hopefully reuse things brought into the cache;
+   note that here again we choose to have all processes share the work
+   on both matrices despite the fact that the work done per element
+   is the same, because the operand matrices are the same in both cases */
+    START_PHASE(procid, 9);
+    t2a = (double **) ga[procid];
+    t2b = (double **) gb[procid];
+    t2c = (double **) work2[procid];
+    t2d = (double **) work3[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2c[0][0] = t2b[0][0] - hh1 * t2a[0][0];
+        t2d[0][0] = t2b[0][0] + hh3 * t2a[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2c[im - 1][0] = t2b[im - 1][0] - hh1 * t2a[im - 1][0];
+        t2d[im - 1][0] = t2b[im - 1][0] + hh3 * t2a[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2c[0][jm - 1] = t2b[0][jm - 1] - hh1 * t2a[0][jm - 1];
+        t2d[0][jm - 1] = t2b[0][jm - 1] + hh3 * t2a[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2c[im - 1][jm - 1] = t2b[im - 1][jm - 1] - hh1 * t2a[im - 1][jm - 1];
+        t2d[im - 1][jm - 1] = t2b[im - 1][jm - 1] + hh3 * t2a[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        t1c = (double *) t2c[0];
+        t1d = (double *) t2d[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1d[j] = t1b[j] + hh3 * t1a[j];
+            t1c[j] = t1b[j] - hh1 * t1a[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        t1c = (double *) t2c[im - 1];
+        t1d = (double *) t2d[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1d[j] = t1b[j] + hh3 * t1a[j];
+            t1c[j] = t1b[j] - hh1 * t1a[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2d[j][0] = t2b[j][0] + hh3 * t2a[j][0];
+            t2c[j][0] = t2b[j][0] - hh1 * t2a[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2d[j][jm - 1] = t2b[j][jm - 1] + hh3 * t2a[j][jm - 1];
+            t2c[j][jm - 1] = t2b[j][jm - 1] - hh1 * t2a[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        t1c = (double *) t2c[i];
+        t1d = (double *) t2d[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1d[iindex] = t1b[iindex] + hh3 * t1a[iindex];
+            t1c[iindex] = t1b[iindex] - hh1 * t1a[iindex];
+        }
+    }
+    END_PHASE(procid, 9);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_9, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+/*      *******************************************************
+                t e n t h    p h a s e
+        *******************************************************/
+    START_PHASE(procid, 10);
+    timst = 2 * dtau;
+/* update the psi{1,3} matrices: add 2*dtau*work3 to psi1 and 2*dtau*work2 to psi3 */
+    t2a = (double **) psi[procid][0];
+    t2b = (double **) work3[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = t2a[0][0] + timst * t2b[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = t2a[im - 1][0] + timst * t2b[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = t2a[0][jm - 1] + timst * t2b[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = t2a[im - 1][jm - 1] + timst * t2b[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1a[j] + timst * t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1a[j] + timst * t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = t2a[j][0] + timst * t2b[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = t2a[j][jm - 1] + timst * t2b[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = t1a[iindex] + timst * t1b[iindex];
+        }
+    }
+    t2a = (double **) psi[procid][1];
+    t2b = (double **) work2[procid];
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[0][0] = t2a[0][0] + timst * t2b[0][0];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[LEFT] == -1)) {
+        t2a[im - 1][0] = t2a[im - 1][0] + timst * t2b[im - 1][0];
+    }
+    if ((gp[procid].neighbors[UP] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[0][jm - 1] = t2a[0][jm - 1] + timst * t2b[0][jm - 1];
+    }
+    if ((gp[procid].neighbors[DOWN] == -1) && (gp[procid].neighbors[RIGHT] == -1)) {
+        t2a[im - 1][jm - 1] = t2a[im - 1][jm - 1] + timst * t2b[im - 1][jm - 1];
+    }
+    if (gp[procid].neighbors[UP] == -1) {
+        t1a = (double *) t2a[0];
+        t1b = (double *) t2b[0];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1a[j] + timst * t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[DOWN] == -1) {
+        t1a = (double *) t2a[im - 1];
+        t1b = (double *) t2b[im - 1];
+        for (j = firstcol; j <= lastcol; j++) {
+            t1a[j] = t1a[j] + timst * t1b[j];
+        }
+    }
+    if (gp[procid].neighbors[LEFT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][0] = t2a[j][0] + timst * t2b[j][0];
+        }
+    }
+    if (gp[procid].neighbors[RIGHT] == -1) {
+        for (j = firstrow; j <= lastrow; j++) {
+            t2a[j][jm - 1] = t2a[j][jm - 1] + timst * t2b[j][jm - 1];
+        }
+    }
+    for (i = firstrow; i <= lastrow; i++) {
+        t1a = (double *) t2a[i];
+        t1b = (double *) t2b[i];
+        for (iindex = firstcol; iindex <= lastcol; iindex++) {
+            t1a[iindex] = t1a[iindex] + timst * t1b[iindex];
+        }
+    }
+    END_PHASE(procid, 10);
+#if defined(MULTIPLE_BARRIERS)
+    BARRIER(bars->sl_phase_10, nprocs)
+#else
+    BARRIER(bars->barrier, nprocs)
+#endif
+}

soft/giet_vm/applications/ocean/subblock.C

-                      r589
+                      r598
+#line 115 "/Users/alain/soc/giet_vm/applications/ocean/null_macros/c.m4.null.GIET"
+/*************************************************************************/
+/*                                                                       */
+/*  Copyright (c) 1994 Stanford University                               */
+/*                                                                       */
+/*  All rights reserved.                                                 */
+/*                                                                       */
+/*  Permission is given to use, copy, and modify this software for any   */
+/*  non-commercial purpose as long as this copyright notice is not       */
+/*  removed.  All other uses, including redistribution in whole or in    */
+/*  part, are forbidden without prior written permission.                */
+/*                                                                       */
+/*  This software is provided with absolutely no warranty and no         */
+/*  support.                                                             */
+/*                                                                       */
+/*************************************************************************/
+EXTERN_ENV
+#include <stdio.h>
+#include <math.h>
+#include "decs.h"
+/*
+ * subblock: fill in the per-processor decomposition data stored in gp[].
+ *
+ * For every level, each processor receives (jmx - 2) / xprocs points in x
+ * and (imx - 2) / yprocs points in y (integer division; any remainder is
+ * not handed out).  The start/end indices derived from those counts are
+ * stored next, then the eight neighbor ids (-1 when there is no neighbor
+ * in that direction), and finally the row/column position of each
+ * processor, with ids laid out row by row, xprocs ids per row.
+ */
+void subblock()
+{
+    long level;
+    long row;
+    long col;
+    long pid;
+    long share_x;
+    long share_y;
+    long vert;
+
+    /* Per-level point counts: the same share goes to every processor. */
+    for (level = 0; level < numlev; level++) {
+        share_x = (jmx[level] - 2) / xprocs;
+        share_y = (imx[level] - 2) / yprocs;
+        for (row = 0; row < yprocs; row++) {
+            for (col = 0; col < xprocs; col++) {
+                pid = row * xprocs + col;
+                gp[pid].rel_num_x[level] = share_x;
+                gp[pid].rel_num_y[level] = share_y;
+            }
+        }
+    }
+
+    /* Start and end indices in each direction, all starting at 1. */
+    for (pid = 0; pid < nprocs; pid++) {
+        for (level = 0; level < numlev; level++) {
+            gp[pid].rlist[level] = 1;
+            gp[pid].rljst[level] = 1;
+
+            gp[pid].rlien[level] = gp[pid].rlist[level] + gp[pid].rel_num_y[level];
+            gp[pid].rljen[level] = gp[pid].rljst[level] + gp[pid].rel_num_x[level];
+
+            gp[pid].oist[level] = gp[pid].rlist[level];
+            gp[pid].eist[level] = gp[pid].rlist[level] + 1;
+            gp[pid].ojst[level] = gp[pid].rljst[level];
+            gp[pid].ejst[level] = gp[pid].rljst[level] + 1;
+        }
+    }
+
+    /* Neighbor ids; -1 marks a direction in which there is no neighbor. */
+    for (pid = 0; pid < nprocs; pid++) {
+        col = pid % xprocs;
+
+        gp[pid].neighbors[LEFT] = (col > 0) ? pid - 1 : -1;
+        gp[pid].neighbors[RIGHT] = (col < (xprocs - 1)) ? pid + 1 : -1;
+        gp[pid].neighbors[UP] = (pid >= xprocs) ? pid - xprocs : -1;
+        gp[pid].neighbors[DOWN] = (pid < nprocs - xprocs) ? pid + xprocs : -1;
+
+        /* Diagonals are the left/right neighbors of the UP/DOWN neighbor. */
+        vert = gp[pid].neighbors[UP];
+        gp[pid].neighbors[UPLEFT] =
+            ((vert != -1) && ((vert % xprocs) > 0)) ? vert - 1 : -1;
+        gp[pid].neighbors[UPRIGHT] =
+            ((vert != -1) && ((vert % xprocs) < (xprocs - 1))) ? vert + 1 : -1;
+
+        vert = gp[pid].neighbors[DOWN];
+        gp[pid].neighbors[DOWNLEFT] =
+            ((vert != -1) && ((vert % xprocs) > 0)) ? vert - 1 : -1;
+        gp[pid].neighbors[DOWNRIGHT] =
+            ((vert != -1) && ((vert % xprocs) < (xprocs - 1))) ? vert + 1 : -1;
+    }
+
+    /* Row and column of each processor in the xprocs-wide layout. */
+    for (pid = 0; pid < nprocs; pid++) {
+        (*gp[pid].rownum) = pid / xprocs;
+        (*gp[pid].colnum) = pid % xprocs;
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 598 for soft

Legend:

Download in other formats: