// -*- C++ -*-
//
// Copyright (C) 1998, 1999, 2000, 2002  Los Alamos National Laboratory,
// Copyright (C) 1998, 1999, 2000, 2002  CodeSourcery, LLC
//
// This file is part of FreePOOMA.
//
// FreePOOMA is free software; you can redistribute it and/or modify it
// under the terms of the Expat license.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Expat
// license for more details.
//
// You should have received a copy of the Expat license along with
// FreePOOMA; see the file LICENSE.
//

//-----------------------------------------------------------------------------
// Accumulate.h shows how to take advantage of parallelism in a simple
// loop-based accumulator.
// $Id: Accumulate.h,v 1.13 2004/11/01 18:15:43 richard Exp $
//-----------------------------------------------------------------------------

#ifndef ACCUMULATE_H
#define ACCUMULATE_H

#if POOMA_THREADS

#include <Pthread.h>

// Forward declarations of the class templates used by the functions below,
// so this header does not have to pull in their full definitions itself.
template<int D, class T, class E> class ConstArray;
template<int D> class UniformGridLayout;

//----------------------------------------------------------------------
// The guts of the accumulation algorithm.
// Specialized here for dimension 1, 2 and 3.
// Can't call these 'accumulate' because it would be ambiguous.
//----------------------------------------------------------------------

// One-dimensional kernel.
// Adds up x(i) for every i in the inclusive range [x.first(0), x.last(0)]
// and returns the total.  T has to be constructible from 0 and support +=.
template<class T, class E>
inline T accumulateWithLoop(
    const ConstArray<1,T,E> & x
){
    T total = 0;

    // Inclusive index bounds of the single dimension.
    const int lo0 = x.first(0);
    const int hi0 = x.last(0);

    int i = lo0;
    while (i <= hi0)
    {
        total += x(i);
        ++i;
    }

    return total;
}

// Two-dimensional kernel.
// Adds up x(i,j) over the inclusive index box
// [first(0),last(0)] x [first(1),last(1)] and returns the total.
// Dimension 0 is walked by the innermost loop.
template<class T, class E>
inline T accumulateWithLoop(
    const ConstArray<2,T,E> & x
){
    T total = 0;

    // Inclusive lower and upper index bounds per dimension.
    const int lo0 = x.first(0);
    const int lo1 = x.first(1);
    const int hi0 = x.last(0);
    const int hi1 = x.last(1);

    for (int j = lo1; j <= hi1; ++j)
        for (int i = lo0; i <= hi0; ++i)
            total += x(i,j);

    return total;
}

// Three-dimensional kernel.
// Adds up x(i0,i1,i2) over the inclusive index box
// [first(0),last(0)] x [first(1),last(1)] x [first(2),last(2)] and
// returns the total.  Dimension 0 is walked by the innermost loop.
// T has to be constructible from 0 and support +=.
template<class T, class E>
inline T accumulateWithLoop(
    const ConstArray<3,T,E> & x
){
    T sum = 0;

    // Inclusive lower and upper index bounds per dimension.
    const int f0 = x.first(0);
    const int f1 = x.first(1);
    const int f2 = x.first(2);
    const int l0 = x.last(0);
    const int l1 = x.last(1);
    const int l2 = x.last(2);

    for (int i2 = f2; i2 <= l2; ++i2)
    {
        for (int i1 = f1; i1 <= l1; ++i1)
        {
            for (int i0 = f0; i0 <= l0; ++i0)
            {
                // All three indices are required here; dropping i2
                // would not address the 3D element being visited.
                sum += x(i0,i1,i2);
            }
        }
    }
    return sum;
}

//----------------------------------------------------------------------
// The user interface for accumulate.
// Bricks just call the dimension specialized versions.
//----------------------------------------------------------------------

// Brick engine entry point.
// Waits for outstanding background work, then sums the array with the
// dimension-specific loop and returns that sum.
template<int D, class T>
T accumulate(
    const ConstArray<D,T,Brick> & x
){
    // Nothing may still be running in the background when we read x.
    Pooma::blockAndEvaluate();

    // Hand the array to the loop kernel for its dimension.
    T total = accumulateWithLoop(x);
    return total;
}

// BrickView engine entry point.
// Same procedure as for Brick: wait for background work, then let the
// dimension-specific loop produce the sum.
template<int D1, class T, int D2, bool S>
T accumulate(
    const ConstArray<D1,T,BrickView<D2,S> > & x
){
    // Nothing may still be running in the background when we read x.
    Pooma::blockAndEvaluate();

    // Hand the view to the loop kernel for its dimension.
    T total = accumulateWithLoop(x);
    return total;
}

//----------------------------------------------------------------------
// class ResultHolder<T>
//
// A class which holds the result of a calculation in such
// a way that you don't have to worry about how it got it.
// That is handled in subclasses.
//----------------------------------------------------------------------

template<class T>
class ResultHolder
{
  public:
    // Value-initialize the result so that get() never reads an
    // indeterminate value, even if no subclass has stored anything yet
    // (built-in types start out as zero).
    ResultHolder()
      : result()
    {}

    // Virtual so that a subclass can be deleted through a
    // ResultHolder<T> pointer.
    virtual ~ResultHolder()
    {}

    // Read-only access to the stored result.
    // Const-qualified so it can also be called on a const holder.
    const T& get() const
    {
        return result;
    }

  protected:
    // The value itself; subclasses are the ones that fill it in.
    T result;
};

//----------------------------------------------------------------------
// class ArrayAccumulator<T,ArrayType>
//
// A specific type of calculation that returns using a ResultHolder.
// This holds an array of arbitrary type and accumulates the sum
// into the result.
//----------------------------------------------------------------------

template<class T, class ArrayType>
class ArrayAccumulator : public ResultHolder<T>
{
  public:
    // Remember my type.
    typedef ArrayAccumulator<T,ArrayType> This_t;

    // Let the member data destroy itself.
    virtual ~ArrayAccumulator()
    {}

    // A static function that will be run in a thread.
    //
    // x must be a This_t* that was converted to void*.  The function
    // accumulates the stored array into the inherited result and then
    // hands the object back as a ResultHolder<T>* (converted to void*).
    //
    // The conversion to the base class is done here, where the full type
    // is known, because the code that joins the thread casts the returned
    // pointer to ResultHolder<T>*.  A void* is only guaranteed to convert
    // back to the exact pointer type it was made from.
    static void *threadAccumulate(
        void * x
    ){
        This_t *y = static_cast<This_t*>(x);
        y->result = accumulate(y->array);

        // Derived-to-base conversion first, then the implicit
        // conversion to void* on return.
        ResultHolder<T> *holder = y;
        return holder;
    }

    // Construct with a const ref to an array.
    // Just remember the array; no work is done until threadAccumulate
    // is called on this object.
    ArrayAccumulator(
        const ArrayType & a
    ) : array(a)
    {}

  private:
    // Store the array by value since the one passed in could be
    // a temporary.
    ArrayType array;
};

//----------------------------------------------------------------------
// void spawn_accumulate(Pthread_t&, const ArrayType&)
//
// Spawns a thread that runs an ArrayAccumulator.
//----------------------------------------------------------------------

// Starts one thread that accumulates the array a.
//   id  receives the id of the new thread.
//   a   is copied into a heap-allocated ArrayAccumulator, which is passed
//       to the thread as its data.
// The accumulator is not freed here: the thread function returns the
// pointer it was given, so it is up to whoever joins the thread to
// delete the object.
template<class ArrayType>
inline void
spawn_accumulate(
    Pthread_t &       id,
    const ArrayType & a
){
  // The element type of the array picks the accumulator instantiation.
  typedef typename ArrayType::Element_t Element;
  typedef ArrayAccumulator<Element,ArrayType> Worker;

  // Thread data: an accumulator holding its own copy of the array.
  Worker * job = new Worker(a);

  // Run Worker::threadAccumulate on that data in a new thread.
  Pthread_create(&id, NULL, Worker::threadAccumulate, job);
}

//----------------------------------------------------------------------
// Multipatch version.
// Loop over patches and accumulate each patch.
//----------------------------------------------------------------------

template<int D, class T>
T accumulate(
    const ConstArray<D,T,MultiPatch<UniformTag,Brick> > & x
)
{
    // Make sure there's nothing going on in the background.
    Pooma::blockAndEvaluate();

    // Get the UniformGridLayout from the array.
    const UniformGridLayout<2>& layout = x.engine().layout();

    // Find the number of patches.  We'll have one thread per patch.
    int patches = layout.size();

    // An array of thread ids.
    Pthread_t *ids = new Pthread_t[patches];

    // Loop over patches.
    typename UniformGridLayout<2>::const_iterator i =
      x.engine().layout().begin();
    typename UniformGridLayout<2>::const_iterator e =
      x.engine().layout().end();
    int c=0;
    while (i!=e)
    {
        // Spawn a thread for each patch.
        spawn_accumulate(ids[c],x(*i));
        ++i;
        ++c;
    }

    // Wait for all the threads to finish.
    // Get the sum from each, and accumulate that
    // in this thread.
    T sum = 0;
    for (int j=0; j<c; ++j)
    {
        // Wait for a given thread to finish.
        void * v;
        Pthread_join(ids[j],&v);

        // Get the result of the sum for that thread.
        // We don't need to know the array type for this.
        ResultHolder<T>* s = static_cast<ResultHolder<T>*>(v);
        sum += s->get();

        // Delete the data structure passed to the thread.
        delete s;
    }

    // Return the full sum.
    return sum;
}

//----------------------------------------------------------------------
// General engine version.
// If we don't know anything about the engine, at least get the right answer.
//----------------------------------------------------------------------

// Fallback entry point for any engine type without a more specific
// overload: wait for background work, then sum with the plain
// dimension-specific loop.
template<int D, class T, class E>
T accumulate(
    const ConstArray<D,T,E> & x
){
    // Nothing may still be running in the background when we read x.
    Pooma::blockAndEvaluate();

    // Hand the array to the loop kernel for its dimension.
    T total = accumulateWithLoop(x);
    return total;
}

#endif // POOMA_THREADS

#endif // ACCUMULATE_H

// ACL:rcsinfo
// ----------------------------------------------------------------------
// $RCSfile: Accumulate.h,v $   $Author: richard $
// $Revision: 1.13 $   $Date: 2004/11/01 18:15:43 $
// ----------------------------------------------------------------------
// ACL:rcsinfo
