// -*- C++ -*-
//
// Copyright (C) 1998, 1999, 2000, 2002  Los Alamos National Laboratory,
// Copyright (C) 1998, 1999, 2000, 2002  CodeSourcery, LLC
//
// This file is part of FreePOOMA.
//
// FreePOOMA is free software; you can redistribute it and/or modify it
// under the terms of the Expat license.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Expat
// license for more details.
//
// You should have received a copy of the Expat license along with
// FreePOOMA; see the file LICENSE.
//

//-----------------------------------------------------------------------------
// Class MpiSpmd<Dim>
//
// Function mpiSpmdSetup();
//-----------------------------------------------------------------------------

#ifndef POOMA_BENCHMARKS_MESSAGING_MPISPMD_H
#define POOMA_BENCHMARKS_MESSAGING_MPISPMD_H

// Include files:
#include "Utilities/Benchmark.h"
#include "Utilities/Clock.h"
#include <stdlib.h>


//-----------------------------------------------------------------------------
// MpiSpmd class definition.
//-----------------------------------------------------------------------------

// Data-array container class; do best thing for 1D, 2D & 3D:
// Primary template is declared only; a specialization exists for each
// supported dimensionality (1, 2 and 3).
template <int D> struct DataArrays;

// 1D: the two arrays are addressed as a[i].
template<> struct DataArrays<1>
{
  double *a_m;
  double *b_m;
};

// 2D: the two arrays are addressed as a[i][j].
template<> struct DataArrays<2>
{
  double **a_m;
  double **b_m;
};

// 3D: the two arrays are addressed as a[i][j][k].
template<> struct DataArrays<3>
{
  double ***a_m;
  double ***b_m;
};



// General template:

// Benchmark implementation that times a guard-layer exchange between the
// patches of a Dim-dimensional domain decomposition, using hand-written MPI
// calls. Each process owns one patch; its position in the patch array is
// derived from its MPI rank. The Dim-dependent member functions are supplied
// as specializations for Dim = 1, 2 and 3.
template<int Dim>
class MpiSpmd : public Implementation
{
public:

  // Constructors:

  // tiling      - number of patches in each of the Dim dimensions (Dim values
  //               are read).
  // guards      - guard-layer width, in elements, on every side of a patch.
  // messageOnly - specifies whether to time only the MPI send and receive
  //               calls, or whether to include the packing and unpacking
  //               to/from buffers.
  MpiSpmd(const int *tiling, const int guards, const bool messageOnly = false)
    : a_m(0), b_m(0), a1(0), b1(0), a2(0), b2(0), a3(0), b3(0),
      initialized_m(false), check_m(0.0), checkElement_m(0),
      patchLength_m(0), totalPatchLength_m(0), totalPatchElems_m(0),
      guards_m(guards), bufferSizes_m(0), guardLayerIn_m(0), time_m(0.0),
      messageOnly_m(messageOnly), npe_m(0), pe_m(0) {

    // Puke and die on unimplemented Dim values:
    if (Dim > 3) {
      std::cout << "Specialization for MpiSpmd<" << Dim 
                << "> not implemented; quit." << std::endl;
      PInsist(false, "...quitting by calling PInsist.");
    }

    for (int d = 0; d < Dim; d++) {
      patches_m[d] = tiling[d];
      // No member function assigns real offsets, yet initializeArrays()
      // reads them; give them a defined value.
      globalOffset_m[d] = 0;
    }

    // Tags for different directions start 1000 apart; every exchange
    // increments each of them by one.
    for (int d = 0; d < 2*Dim; d++) {
      tags_m[d] = 1000*d;
      guardLayerOut_m[d] = 0;
    }

    // Duplicate the MPI_COMM_WORLD communicator, to avoid conflict with
    // other users of MPI_COMM_WORLD:
    MPI_Comm_dup(MPI_COMM_WORLD, &mpiSpmdComm_m);

    // Cache total number of processes, and caller's process ID (PE):
    npe_m = npe();
    pe_m = pe();

    // Cache domain decomposition information, including neighbor lists:

    procLevel(); // Set PE's levels in the array of subdomains.
    neighbors(); // Set neighboring PEs in guard-layer exchange
  }

  // Releases the data arrays, index tables and message buffers.
  // NOTE(review): mpiSpmdComm_m is not freed here; whether MPI is still
  // active when this object is destroyed is not visible from this file.
  virtual ~MpiSpmd() { releaseStorage(); }

  // This is an MPI SPMD benchmark:
  const char *type() const { return "MPI SPMD"; }
  const char *qualification() const { return "MB/s"; }

  // (Re)allocates and fills all storage for a problem with n interior
  // elements per dimension per patch. May be called repeatedly; storage from
  // the previous call is released first.
  void initialize(int n) {

    // Free the previous problem's storage while the old extents
    // (totalPatchLength_m) are still available for walking the index tables:
    releaseStorage();

    // Save the problem size (number of elements per dimension per patch):
    patchLength_m = n;
    totalPatchLength_m = n + 2*guards_m;

    // Number of elements per patch, including guard elements:
    totalPatchElems_m = 1;
    for (int d = 0; d < Dim; ++d) {
      totalPatchElems_m *= totalPatchLength_m;
    }

    // Allocate data arrays:
    a_m = new double[totalPatchElems_m];
    b_m = new double[totalPatchElems_m];

    // Allocate the send/receive buffers; one face of the patch interior,
    // guards_m elements thick:
    bufferSizes_m = guards_m;
    for (int d = 1; d < Dim; d++) { bufferSizes_m *= patchLength_m; }
    guardLayerIn_m = new double[bufferSizes_m];
    for (int d = 0; d < 2*Dim; d++) {
      guardLayerOut_m[d] = new double[bufferSizes_m];
    }

    // Zero everything out, including guard elements:
    for (int i = 0; i < totalPatchElems_m; i++) {
      a_m[i] = 0.0;
      b_m[i] = 0.0;
    }

    // Initialize array element values; bounce to dimensionality-dependent
    // code:
    initializeArrays();
    
    // Save the index of the array element for value-checking:
    checkElement_m = guards_m + n/2;

    initialized_m = true;
  }

  void run();

  // Dimensionality-specialized array-value initializer:
  void initializeArrays();

  // Dimensionality-specialized guard-layer send-receive; only the overload
  // matching Dim is defined:
  void guardLayerExchange(double *a, double &messageTime);
  void guardLayerExchange(double **a, double &messageTime);
  void guardLayerExchange(double ***a, double &messageTime);

  // Dimensionality-specialized domain-decomposition initializers

  void procLevel(); // Set PE's levels in the array of subdomains

  void neighbors(); // Set neighboring PEs in guard-layer exchange

  // Query methods:

  // Level in dimension d of the processor (subdomain) array:
  int procLevel(int d) { return procLevel_m[d]; }

  // Calling processor's MPI rank:
  int pe() {
    int p;
    MPI_Comm_rank(mpiSpmdComm_m, &p);
    return p;
  }

  // Total number of processes:
  int npe() {
    int n;
    MPI_Comm_size(mpiSpmdComm_m, &n);
    return n;
  }

  // Number of subdomains in dimension d:
  int patches(int d) { return patches_m[d]; }

  // Return value for checking result of benchmark run (0 until run() has
  // been called):
  double resultCheck() const { return check_m; }

  // Returns patchLength_m raised to the power Dim, i.e. the number of
  // interior elements in one patch:
  double flopCount() const { 
    double fc = 1;
    double flopsPerDimensionInKernel = 1;
    for (int d = 0; d < Dim; d++) {
      fc *= patchLength_m*flopsPerDimensionInKernel;
    }
    return fc;
  }

  // Returns the number of bytes this process sends per exchange: 2*Dim
  // messages of bufferSizes_m doubles each:
  double opCount() const { 
    double oc = 2.0*Dim*bufferSizes_m * sizeof(double);
    return oc;
  }

  // Override virtual function to indicate we do internal timings
  bool internalClockCalls() const { return true; }

  // Override virtual function to return internal timing result
  double internalTimingResult() const { return time_m; }

private:

  // The object owns raw memory, so copying is disabled (declared, never
  // defined):
  MpiSpmd(const MpiSpmd &);
  MpiSpmd &operator=(const MpiSpmd &);

  // Frees everything initialize() and initializeArrays() allocated. Does
  // nothing unless a completed initialize() call is outstanding. Must run
  // before totalPatchLength_m changes, because the 3D index tables hold one
  // row table per leading index.
  void releaseStorage() {
    if (!initialized_m) return;

    if (Dim == 2) {
      delete [] a2;
      delete [] b2;
    } else if (Dim == 3) {
      for (int i = 0; i < totalPatchLength_m; i++) {
        delete [] a3[i];
        delete [] b3[i];
      }
      delete [] a3;
      delete [] b3;
    }
    // For Dim == 1, a1/b1 merely alias a_m/b_m.
    a1 = 0; b1 = 0;
    a2 = 0; b2 = 0;
    a3 = 0; b3 = 0;

    delete [] a_m;
    delete [] b_m;
    a_m = 0;
    b_m = 0;

    for (int d = 0; d < 2*Dim; d++) {
      delete [] guardLayerOut_m[d];
      guardLayerOut_m[d] = 0;
    }
    delete [] guardLayerIn_m;
    guardLayerIn_m = 0;

    initialized_m = false;
  }

  // Data arrays (flat storage, guard elements included).
  double *a_m;
  double *b_m;
  // Dimension-specific views onto a_m/b_m, set up by initializeArrays():
  double * a1; double * b1;
  double ** a2; double ** b2;
  double *** a3; double *** b3;
//   DataArrays<Dim> dataArrays_m;

  // True while storage from a completed initialize() call is held:
  bool initialized_m;

  // Problem check value:
  double check_m;

  // Array element for checked value:
  int checkElement_m;

  // Problem size (number of elements per dimension per patch):
  int patchLength_m;

  // Number of elements per dimension per patch, including guard elements:
  int totalPatchLength_m;

  // Number of elements per patch, including guard elements:
  int totalPatchElems_m;

  // Number of guard elements (symmetric all ways):
  int guards_m;

  // Size of guard-layer send/receive buffer:
  int bufferSizes_m;

  // Guard-layer send/receive buffers:
  double *guardLayerOut_m[2*Dim];
  double *guardLayerIn_m;

  // Tags for sends/receives:
  int tags_m[2*Dim];

  // Internal timing variable
  double time_m;

  // Whether or not to exclude packing data into and copying data out of the
  // send-buffers (guardLayerOut_m and guardLayerIn_m) when timing
  // guardLayerExchange(). Default = false.
  bool messageOnly_m;

  // an MPI communicator for this object to use, to avoid interfering
  // with other MPI usage.
  MPI_Comm mpiSpmdComm_m;

  // Domain decomposition:

  // Total number of processes:
  int npe_m;

  // Local process index (PE):
  int pe_m;

  // Number of patches per dimension:
  int patches_m[Dim];

  // Offsets in global index space of starting-element of subdomain-arrays:
  int globalOffset_m[Dim];

  // PE's levels in the array of subdomains:
  int procLevel_m[Dim];

  // Neighboring processor list:
  int neighbors_m[2*Dim];
};


// ------------------------------------------------------------
// Specializations on Dim for MpiSpmd<Dim>::initializeArrays():
// ------------------------------------------------------------

// 1D: points a1/b1 at the flat storage and fills the interior of a1 with
// offset + local index; the interior of b1 is set to zero. Guard elements
// are left untouched.
template<>
inline void MpiSpmd<1>::initializeArrays()
{
  // Offset in global linearized index space. globalOffset_m has exactly one
  // entry for Dim == 1, so only element 0 may be read.
  const int gO = globalOffset_m[0];

  // In one dimension the flat arrays can be indexed directly.
  a1 = a_m;
  b1 = b_m;

  const int beginIndex = guards_m;
  const int endIndex = beginIndex + patchLength_m;

  for (int i = beginIndex; i < endIndex; i++) {
    a1[i] = gO + i;
    b1[i] = 0;
  }
}

// 2D: builds row-pointer tables a2/b2 over the flat storage so elements can
// be addressed as a2[i][j], then fills the interior of a2 with
// offset + i + j; the interior of b2 is set to zero. Guard elements are left
// untouched.
template<>
inline void MpiSpmd<2>::initializeArrays()
{
  int gO = 0; // Offset in global linearized index space.
  for (int d = 0; d < 2; d++) gO += globalOffset_m[d];

  // initialized_m is still set at this point only when row tables from an
  // earlier initialize() call have not been released yet; free them so
  // that re-initialization does not leak.
  if (initialized_m) {
    delete [] a2;
    delete [] b2;
  }

  const int rowLength = totalPatchLength_m;
  a2 = new double *[rowLength];
  b2 = new double *[rowLength];

  // Row i starts i*rowLength elements into the flat array.
  for (int i = 0; i < rowLength; i++) {
    a2[i] = a_m + i*rowLength;
    b2[i] = b_m + i*rowLength;
  }

  const int beginIndex = guards_m;
  const int endIndex = beginIndex + patchLength_m;

  for (int i = beginIndex; i < endIndex; i++) {
    for (int j = beginIndex; j < endIndex; j++) {
      a2[i][j] = gO + i + j;
      b2[i][j] = 0.0;
    }
  }
}

// 3D: builds two-level pointer tables a3/b3 over the flat storage so
// elements can be addressed as a3[i][j][k], then fills the interior of a3
// with offset + i + j + k; the interior of b3 is set to zero. Guard elements
// are left untouched.
// NOTE(review): tables left over from an earlier call are not released in
// this function; the extent they were built with is no longer known here.
template<>
inline void MpiSpmd<3>::initializeArrays()
{
  // Offset in global linearized index space: the sum over all three
  // dimensions, matching what the 1D and 2D versions do for theirs.
  int gO = 0;
  for (int d = 0; d < 3; d++) gO += globalOffset_m[d];

  const int rowLength = totalPatchLength_m;
  const int planeLength = rowLength*rowLength;

  a3 = new double **[rowLength];
  b3 = new double **[rowLength];
  for (int i = 0; i < rowLength; i++) {
    a3[i] = new double*[rowLength];
    b3[i] = new double*[rowLength];

    // Row (i, j) starts i*planeLength + j*rowLength elements into the flat
    // array.
    for (int j = 0; j < rowLength; j++) {
      a3[i][j] = a_m + i*planeLength + j*rowLength;
      b3[i][j] = b_m + i*planeLength + j*rowLength;
    }
  }

  const int beginIndex = guards_m;
  const int endIndex = beginIndex + patchLength_m;

  for (int i = beginIndex; i < endIndex; i++) {
    for (int j = beginIndex; j < endIndex; j++) {
      for (int k = beginIndex; k < endIndex; k++) {
        a3[i][j][k] = gO + i + j + k;
        b3[i][j][k] = 0.0;
      }
    }
  }
}


// -----------------------------------------------------
// Specializations on Dim for MpiSpmd<Dim>::procLevel():
// -----------------------------------------------------

// 1D: the process's level is its rank modulo the number of patches.
template<>
inline void MpiSpmd<1>::procLevel()
{
  procLevel_m[0] = pe_m % patches_m[0];
}

// 2D: ranks are laid out with dimension 0 varying fastest; split the rank
// into its level along each dimension.
template<>
inline void MpiSpmd<2>::procLevel()
{
  const int nx = patches_m[0];
  const int ny = patches_m[1];

  procLevel_m[0] = pe_m % nx;
  procLevel_m[1] = (pe_m/nx) % ny;
}

// 3D: ranks are laid out with dimension 0 varying fastest, then dimension 1,
// then dimension 2; split the rank into its level along each dimension.
// The level along dimension 2 is not reduced modulo patches_m[2].
template<>
inline void MpiSpmd<3>::procLevel()
{
  const int nx = patches_m[0];
  const int ny = patches_m[1];

  procLevel_m[0] = pe_m % nx;
  procLevel_m[1] = (pe_m/nx) % ny;
  procLevel_m[2] = pe_m/(nx*ny);
}


// -----------------------------------------------------
// Specializations on Dim for MpiSpmd<Dim>::neighbors():
// Note: periodic boundary conditions hardwired.
// -----------------------------------------------------

// 1D: stores the ranks of the west (lower level) and east (higher level)
// neighbors in neighbors_m[0] and neighbors_m[1]. The ends of the patch
// array wrap around (periodic boundaries).
template<>
inline void MpiSpmd<1>::neighbors()
{
  const int here = procLevel_m[0];
  const int last = patches(0) - 1;

  // West:
  neighbors_m[0] = (here == 0) ? last : here - 1;

  // East:
  neighbors_m[1] = (here == last) ? 0 : here + 1;
}

// 2D: stores the ranks of the west, east, south and north neighbors in
// neighbors_m[0..3]. West/east step along dimension 0, south/north along
// dimension 1; the edges of the patch array wrap around (periodic
// boundaries). A rank is level0 + level1*patches(0).
template<>
inline void MpiSpmd<2>::neighbors()
{
  const int x = procLevel_m[0];
  const int y = procLevel_m[1];
  const int nx = patches(0);
  const int lastX = nx - 1;
  const int lastY = patches(1) - 1;

  const int west  = (x == 0)     ? lastX : x - 1;
  const int east  = (x == lastX) ? 0     : x + 1;
  const int south = (y == 0)     ? lastY : y - 1;
  const int north = (y == lastY) ? 0     : y + 1;

  neighbors_m[0] = west + y*nx;
  neighbors_m[1] = east + y*nx;
  neighbors_m[2] = x + south*nx;
  neighbors_m[3] = x + north*nx;
}

// 3D: stores the ranks of the west, east, south, north, down and up
// neighbors in neighbors_m[0..5]. West/east step along dimension 0,
// south/north along dimension 1, down/up along dimension 2; the edges of the
// patch array wrap around (periodic boundaries). A rank is
// level0 + level1*patches(0) + level2*patches(0)*patches(1).
template<>
inline void MpiSpmd<3>::neighbors()
{
  const int x = procLevel_m[0];
  const int y = procLevel_m[1];
  const int z = procLevel_m[2];

  const int nx = patches(0);
  const int ny = patches(1);
  const int lastX = nx - 1;
  const int lastY = ny - 1;
  const int lastZ = patches(2) - 1;

  const int west  = (x == 0)     ? lastX : x - 1;
  const int east  = (x == lastX) ? 0     : x + 1;
  const int south = (y == 0)     ? lastY : y - 1;
  const int north = (y == lastY) ? 0     : y + 1;
  const int down  = (z == 0)     ? lastZ : z - 1;
  const int up    = (z == lastZ) ? 0     : z + 1;

  neighbors_m[0] = west + y*nx + z*nx*ny;
  neighbors_m[1] = east + y*nx + z*nx*ny;
  neighbors_m[2] = x + south*nx + z*nx*ny;
  neighbors_m[3] = x + north*nx + z*nx*ny;
  neighbors_m[4] = x + y*nx + down*nx*ny;
  neighbors_m[5] = x + y*nx + up*nx*ny;
}


// ----------------------------------------------------
// Specializations on Dim for MpiSpmd<Dim>::run():
// ----------------------------------------------------

// The specialization used below must be declared before its first use.
template<>
void MpiSpmd<1>::guardLayerExchange(double *a, double &messageTime);

// 1D benchmark kernel: doubles the interior of a1, copies it into b1, then
// performs one timed guard-layer exchange on b1. Stores the reduced time in
// time_m and one interior value of b1 in check_m.
template<>
inline void MpiSpmd<1>::run()
{
  const int beginIndex = guards_m;
  const int endIndex = guards_m + patchLength_m;
  for (int i = beginIndex; i < endIndex; i++) {
    a1[i] *= 2.0;
    b1[i] = a1[i];
  }

  // Guard-layer exchange; use slowest PE's time to compute speed:

  MPI_Barrier(mpiSpmdComm_m);
  double messageTime = 0.0;
  const double startTime = Pooma::Clock::value();

  guardLayerExchange(b1, messageTime);

  MPI_Barrier(mpiSpmdComm_m);
  const double exchangeTime = Pooma::Clock::value() - startTime;

  // Either time only the MPI send/receive calls, or also include the
  // buffer-copying:
  double localTime = messageOnly_m ? messageTime : exchangeTime;

  // The maximum is delivered to rank 0 only; every other rank keeps 0.
  double maxExchangeTime = 0;
  MPI_Reduce(&localTime, &maxExchangeTime, 1, MPI_DOUBLE, MPI_MAX, 0,
             mpiSpmdComm_m);

  time_m = maxExchangeTime;

  // Save a result value for checking:
  check_m = b1[checkElement_m];
}

// The specialization used below must be declared before its first use.
template<>
void MpiSpmd<2>::guardLayerExchange(double **a, double &messageTime);

// 2D benchmark kernel: doubles the interior of a2, copies it into b2, then
// performs one timed guard-layer exchange on b2. Stores the reduced time in
// time_m and one interior value of b2 in check_m.
template<>
inline void MpiSpmd<2>::run()
{
  const int beginIndex = guards_m;
  const int endIndex = guards_m + patchLength_m;
  for (int i = beginIndex; i < endIndex; i++) {
    for (int j = beginIndex; j < endIndex; j++) {
      a2[i][j] *= 2.0;
      b2[i][j] = a2[i][j];
    }
  }

  // Guard-layer exchange; use slowest PE's time to compute speed:

  MPI_Barrier(mpiSpmdComm_m);
  const double startTime = Pooma::Clock::value();
  double messageTime = 0.0;

  guardLayerExchange(b2, messageTime);

  MPI_Barrier(mpiSpmdComm_m);
  const double exchangeTime = Pooma::Clock::value() - startTime;

  // Either time only the MPI send/receive calls, or also include the
  // buffer-copying:
  double localTime = messageOnly_m ? messageTime : exchangeTime;

  // The maximum is delivered to rank 0 only; every other rank keeps 0.
  double maxExchangeTime = 0;
  MPI_Reduce(&localTime, &maxExchangeTime, 1, MPI_DOUBLE, MPI_MAX, 0,
             mpiSpmdComm_m);

  time_m = maxExchangeTime;

  // Save a result value for checking:
  check_m = b2[checkElement_m][checkElement_m];
}

// The specialization used below must be declared before its first use.
template<>
void MpiSpmd<3>::guardLayerExchange(double ***a, double &messageTime);

// 3D benchmark kernel: doubles the interior of a3, copies it into b3, then
// performs one timed guard-layer exchange on b3. Stores the reduced time in
// time_m and one interior value of b3 in check_m.
template<>
inline void MpiSpmd<3>::run()
{
  const int beginIndex = guards_m;
  const int endIndex = guards_m + patchLength_m;
  for (int k = beginIndex; k < endIndex; k++) {
    for (int j = beginIndex; j < endIndex; j++) {
      for (int i = beginIndex; i < endIndex; i++) {
        a3[k][j][i] *= 2.0;
        b3[k][j][i] = a3[k][j][i];
      }
    }
  }

  // Guard-layer exchange; use slowest PE's time to compute speed:

  MPI_Barrier(mpiSpmdComm_m);
  const double startTime = Pooma::Clock::value();
  double messageTime = 0.0;

  guardLayerExchange(b3, messageTime);

  MPI_Barrier(mpiSpmdComm_m);
  const double exchangeTime = Pooma::Clock::value() - startTime;

  // Either time only the MPI send/receive calls, or also include the
  // buffer-copying:
  double localTime = messageOnly_m ? messageTime : exchangeTime;

  // The maximum is delivered to rank 0 only; every other rank keeps 0.
  double maxExchangeTime = 0;
  MPI_Reduce(&localTime, &maxExchangeTime, 1, MPI_DOUBLE, MPI_MAX, 0,
             mpiSpmdComm_m);

  time_m = maxExchangeTime;

  // Save a result value for checking:
  check_m = b3[checkElement_m][checkElement_m][checkElement_m];
}


// ------------------------------------------------------------
// Specializations on Dim for MpiSpmd<Dim>::guardLayerExchange():
// ------------------------------------------------------------

// 1D guard-layer exchange on array a (guard elements included).
//
// Sends the first guards_m interior elements to the west neighbor and the
// last guards_m interior elements to the east neighbor, then receives the two
// incoming messages in whatever order they arrive and copies each into the
// guard elements on the side it came from.
//
// messageTime is reset to 0 on entry. When messageOnly_m is set it
// accumulates the time spent inside the MPI calls, and on rank 0 it is
// replaced by the maximum over all processes.
template<>
inline void MpiSpmd<1>::guardLayerExchange(double *a, double &messageTime)
{
  const int nDirs = 2;
  MPI_Status status;
  MPI_Request requests[nDirs];

  // For optional buffer-copy-excluding timings done in here:
  double startTime = 0.0;
  messageTime = 0.0;

  // Interior elements occupy [interiorBegin, interiorEnd):
  const int interiorBegin = guards_m;
  const int interiorEnd = guards_m + patchLength_m;

  // Pack and send: direction 0 is West (low end), direction 1 is East
  // (high end).
  for (int dir = 0; dir < nDirs; dir++) {
    const int beginIndex =
      (dir == 0) ? interiorBegin : interiorEnd - guards_m;
    for (int g = 0; g < guards_m; g++) {
      guardLayerOut_m[dir][g] = a[beginIndex + g];
    }
    tags_m[dir]++;

    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Isend(guardLayerOut_m[dir], bufferSizes_m, MPI_DOUBLE,
              neighbors_m[dir], tags_m[dir], mpiSpmdComm_m, &requests[dir]);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;
  }

  // Receive all messages, in any order, and extract the data:

  for (int messagesReceived = 0; messagesReceived < nDirs;
       messagesReceived++) {

    // (Blocking) receive of any incoming message:

    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Recv(guardLayerIn_m, bufferSizes_m, MPI_DOUBLE, MPI_ANY_SOURCE, 
             MPI_ANY_TAG, mpiSpmdComm_m, &status);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;

    // Determine which direction it came from:
#ifdef POOMA_BENCHMARKS_MESSAGING_MPISTUBS
    // With the stubs the tag is not read from the status; assume the
    // messages arrive in send order.
    const int fromTag = tags_m[messagesReceived];
#else
    const int fromTag = status.MPI_TAG;
#endif // POOMA_BENCHMARKS_MESSAGING_MPISTUBS

    // Copy the data into the corresponding guard elements. A message tagged
    // tags_m[0] was sent westward by the east neighbor, so it fills the east
    // guard elements; one tagged tags_m[1] fills the west guard elements.
    int beginIndex;
    if (fromTag == tags_m[0]) { // From the East:
      beginIndex = interiorEnd;
    } else if (fromTag == tags_m[1]) { // From the West:
      beginIndex = 0;
    } else { // Invalid tag:
      MPI_Finalize();
      PInsist(false, "Invalid tag. Quitting by calling PInsist.");
      continue;
    }

    for (int g = 0; g < guards_m; g++) {
      a[beginIndex + g] = guardLayerIn_m[g];
    }
  }

#ifndef POOMA_BENCHMARKS_MESSAGING_MPISTUBS
  // Complete the nonblocking sends so their requests are released and the
  // send buffers may be reused by the next exchange. Skipped for the stubs,
  // which are only known to provide the calls already used above.
  for (int dir = 0; dir < nDirs; dir++) {
    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Wait(&requests[dir], &status);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;
  }
#endif // POOMA_BENCHMARKS_MESSAGING_MPISTUBS

  if (messageOnly_m) {
    double maxMessageTime = messageTime;
    MPI_Reduce(&messageTime, &maxMessageTime, 1, MPI_DOUBLE, MPI_MAX, 0,
               mpiSpmdComm_m);
    // MPI_Reduce delivers the result on the root only; other ranks keep
    // their own measurement.
    if (pe_m == 0) messageTime = maxMessageTime;
  }

}

// 2D guard-layer exchange on array a (guard elements included, addressed as
// a[i][j]).
//
// Directions are numbered 0 = West, 1 = East (low/high end of index i) and
// 2 = South, 3 = North (low/high end of index j). For each direction the
// guards_m-thick slab of interior elements at that edge is packed (i outer,
// j inner) and sent to the corresponding neighbor. The four incoming
// messages are then received in whatever order they arrive and each is
// copied into the guard elements on the side it came from.
//
// messageTime is reset to 0 on entry. When messageOnly_m is set it
// accumulates the time spent inside the MPI calls, and on rank 0 it is
// replaced by the maximum over all processes.
template<>
inline void MpiSpmd<2>::guardLayerExchange(double **a, double &messageTime)
{
  const int nDirs = 4;
  MPI_Status status;
  MPI_Request requests[nDirs];

  // For optional buffer-copy-excluding timings done in here:
  double startTime = 0.0;
  messageTime = 0.0;

  // Interior elements occupy [interiorBegin, interiorEnd) in each index;
  // non-"active" indices span all of them:
  const int interiorBegin = guards_m;
  const int interiorEnd = guards_m + patchLength_m;

  // Pack and send one slab per direction:
  for (int dir = 0; dir < nDirs; dir++) {
    const int axis = dir/2;
    int begin[2] = { interiorBegin, interiorBegin };
    int end[2] = { interiorEnd, interiorEnd };

    // Even directions take the first guards_m interior elements along the
    // active axis, odd directions the last guards_m:
    begin[axis] = (dir % 2 == 0) ? interiorBegin : interiorEnd - guards_m;
    end[axis] = begin[axis] + guards_m;

    int ijk = 0;
    for (int i = begin[0]; i < end[0]; i++) {
      for (int j = begin[1]; j < end[1]; j++) {
        guardLayerOut_m[dir][ijk] = a[i][j];
        ijk++;
      }
    }
    tags_m[dir]++;

    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Isend(guardLayerOut_m[dir], bufferSizes_m, MPI_DOUBLE,
              neighbors_m[dir], tags_m[dir], mpiSpmdComm_m, &requests[dir]);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;
  }

  // Receive all messages, in any order, and extract the data:

  for (int messagesReceived = 0; messagesReceived < nDirs;
       messagesReceived++) {

    // (Blocking) receive of any incoming message:

    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Recv(guardLayerIn_m, bufferSizes_m, MPI_DOUBLE, MPI_ANY_SOURCE, 
             MPI_ANY_TAG, mpiSpmdComm_m, &status);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;

    // Determine which send direction the tag belongs to:
    const int fromTag = status.MPI_TAG;
    int dir = -1;
    for (int d = 0; d < nDirs; d++) {
      if (fromTag == tags_m[d]) dir = d;
    }

    if (dir < 0) { // Invalid tag:
      MPI_Finalize();
      PInsist(false, "Invalid tag. Quitting by calling PInsist.");
      continue;
    }

    // Copy the data into the corresponding guard elements. A message sent
    // toward the low end of an axis (even direction) comes from the
    // neighbor on the high side, so it fills the high-side guard elements;
    // one sent toward the high end fills the low-side guard elements.
    const int axis = dir/2;
    int begin[2] = { interiorBegin, interiorBegin };
    int end[2] = { interiorEnd, interiorEnd };
    begin[axis] = (dir % 2 == 0) ? interiorEnd : 0;
    end[axis] = begin[axis] + guards_m;

    int ijk = 0;
    for (int i = begin[0]; i < end[0]; i++) {
      for (int j = begin[1]; j < end[1]; j++) {
        a[i][j] = guardLayerIn_m[ijk];
        ijk++;
      }
    }
  }

#ifndef POOMA_BENCHMARKS_MESSAGING_MPISTUBS
  // Complete the nonblocking sends so their requests are released and the
  // send buffers may be reused by the next exchange. Skipped for the stubs,
  // which are only known to provide the calls already used above.
  for (int dir = 0; dir < nDirs; dir++) {
    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Wait(&requests[dir], &status);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;
  }
#endif // POOMA_BENCHMARKS_MESSAGING_MPISTUBS

  if (messageOnly_m) {
    double maxMessageTime = messageTime;
    MPI_Reduce(&messageTime, &maxMessageTime, 1, MPI_DOUBLE, MPI_MAX, 0,
               mpiSpmdComm_m);
    // MPI_Reduce delivers the result on the root only; other ranks keep
    // their own measurement.
    if (pe_m == 0) messageTime = maxMessageTime;
  }

}

// 3D guard-layer exchange on array a (guard elements included, addressed as
// a[i][j][k]).
//
// Directions are numbered 0 = West, 1 = East (low/high end of index i),
// 2 = South, 3 = North (low/high end of index j) and 4 = Below, 5 = Above
// (low/high end of index k). For each direction the guards_m-thick slab of
// interior elements at that face is packed (i outer, j middle, k inner) and
// sent to neighbors_m[direction]. The six incoming messages are then
// received in whatever order they arrive and each is copied into the guard
// elements on the side it came from.
//
// messageTime is reset to 0 on entry. When messageOnly_m is set it
// accumulates the time spent inside the MPI calls, and on rank 0 it is
// replaced by the maximum over all processes.
template<>
inline void MpiSpmd<3>::guardLayerExchange(double ***a, double &messageTime)
{
  const int nDirs = 6;
  MPI_Status status;
  MPI_Request requests[nDirs];

  // For optional buffer-copy-excluding timings done in here:
  double startTime = 0.0;
  messageTime = 0.0;

  // Interior elements occupy [interiorBegin, interiorEnd) in each index;
  // non-"active" indices span all of them:
  const int interiorBegin = guards_m;
  const int interiorEnd = guards_m + patchLength_m;

  // Pack and send one slab per direction:
  for (int dir = 0; dir < nDirs; dir++) {
    const int axis = dir/2;
    int begin[3] = { interiorBegin, interiorBegin, interiorBegin };
    int end[3] = { interiorEnd, interiorEnd, interiorEnd };

    // Even directions take the first guards_m interior elements along the
    // active axis, odd directions the last guards_m:
    begin[axis] = (dir % 2 == 0) ? interiorBegin : interiorEnd - guards_m;
    end[axis] = begin[axis] + guards_m;

    int ijk = 0;
    for (int i = begin[0]; i < end[0]; i++) {
      for (int j = begin[1]; j < end[1]; j++) {
        for (int k = begin[2]; k < end[2]; k++) {
          guardLayerOut_m[dir][ijk] = a[i][j][k];
          ijk++;
        }
      }
    }
    tags_m[dir]++;

    if (messageOnly_m) startTime = Pooma::Clock::value();

    // Each slab goes to the neighbor in its own direction, including
    // Below (neighbors_m[4]) and Above (neighbors_m[5]).
    MPI_Isend(guardLayerOut_m[dir], bufferSizes_m, MPI_DOUBLE,
              neighbors_m[dir], tags_m[dir], mpiSpmdComm_m, &requests[dir]);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;
  }

  // Receive all messages, in any order, and extract the data:

  for (int messagesReceived = 0; messagesReceived < nDirs;
       messagesReceived++) {

    // (Blocking) receive of any incoming message:

    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Recv(guardLayerIn_m, bufferSizes_m, MPI_DOUBLE, MPI_ANY_SOURCE, 
             MPI_ANY_TAG, mpiSpmdComm_m, &status);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;

    // Determine which send direction the tag belongs to:
    const int fromTag = status.MPI_TAG;
    int dir = -1;
    for (int d = 0; d < nDirs; d++) {
      if (fromTag == tags_m[d]) dir = d;
    }

    if (dir < 0) { // Invalid tag:
      MPI_Finalize();
      PInsist(false, "Invalid tag. Quitting by calling PInsist.");
      continue;
    }

    // Copy the data into the corresponding guard elements. A message sent
    // toward the low end of an axis (even direction) comes from the
    // neighbor on the high side, so it fills the high-side guard elements;
    // one sent toward the high end fills the low-side guard elements.
    const int axis = dir/2;
    int begin[3] = { interiorBegin, interiorBegin, interiorBegin };
    int end[3] = { interiorEnd, interiorEnd, interiorEnd };
    begin[axis] = (dir % 2 == 0) ? interiorEnd : 0;
    end[axis] = begin[axis] + guards_m;

    int ijk = 0;
    for (int i = begin[0]; i < end[0]; i++) {
      for (int j = begin[1]; j < end[1]; j++) {
        for (int k = begin[2]; k < end[2]; k++) {
          a[i][j][k] = guardLayerIn_m[ijk];
          ijk++;
        }
      }
    }
  }

#ifndef POOMA_BENCHMARKS_MESSAGING_MPISTUBS
  // Complete the nonblocking sends so their requests are released and the
  // send buffers may be reused by the next exchange. Skipped for the stubs,
  // which are only known to provide the calls already used above.
  for (int dir = 0; dir < nDirs; dir++) {
    if (messageOnly_m) startTime = Pooma::Clock::value();

    MPI_Wait(&requests[dir], &status);

    if (messageOnly_m) messageTime += Pooma::Clock::value() - startTime;
  }
#endif // POOMA_BENCHMARKS_MESSAGING_MPISTUBS

  if (messageOnly_m) {
    double maxMessageTime = messageTime;
    MPI_Reduce(&messageTime, &maxMessageTime, 1, MPI_DOUBLE, MPI_MAX, 0,
               mpiSpmdComm_m);
    // MPI_Reduce delivers the result on the root only; other ranks keep
    // their own measurement.
    if (pe_m == 0) messageTime = maxMessageTime;
  }

}

#endif // POOMA_BENCHMARKS_MESSAGING_MPISPMD_H

// ACL:rcsinfo
// ----------------------------------------------------------------------
// $RCSfile: MpiSpmd.h,v $   $Author: richard $
// $Revision: 1.13 $   $Date: 2004/11/01 18:15:15 $
// ----------------------------------------------------------------------
// ACL:rcsinfo
