📄 eval.cc

📁 数值计算工具库,C语言编写的,可以直接调用.
💻 CC
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/*
 * $Id: arrayeval.cc,v 1.2 1998/03/14 00:04:47 tveldhui Exp $
 *
 * $Log: arrayeval.cc,v $
 * Revision 1.2  1998/03/14 00:04:47  tveldhui
 * 0.2-alpha-05
 *
 * Revision 1.1  1998/02/25 20:04:01  tveldhui
 * Initial revision
 *
 */

#ifndef BZ_ARRAYEVAL_CC
#define BZ_ARRAYEVAL_CC

#ifndef BZ_ARRAY_H
 #error <blitz/array/eval.cc> must be included via <blitz/array.h>
#endif

BZ_NAMESPACE(blitz)

/*
 * Assign an expression to an array.  For performance reasons, there are
 * several traversal mechanisms:
 *
 * - Index traversal scans through the destination array in storage order.
 *   The expression is evaluated using a TinyVector<int,N> operand.  This
 *   version is used only when there are index placeholders in the expression
 *   (see <blitz/indexexpr.h>)
 * - Stack traversal also scans through the destination array in storage
 *   order.  However, push/pop stack iterators are used.
 * - Fast traversal follows a Hilbert (or other) space-filling curve to
 *   improve cache reuse for stencilling operations.  Currently, the
 *   space filling curves must be generated by calling 
 *   generateFastTraversalOrder(TinyVector<int,N_dimensions>).
 * - 2D tiled traversal follows a tiled traversal, to improve cache reuse
 *   for 2D stencils.  Space filling curves have too much overhead to use
 *   in two-dimensions.
 *
 * _bz_tryFastTraversal is a helper class.  Fast traversals are only
 * attempted if the expression looks like a stencil -- it's at least
 * three-dimensional, has at least six array operands, and there are
 * no index placeholders in the expression.  These are all things which
 * can be checked at compile time, so the if()/else() syntax has been
 * replaced with this class template.
 */

// Fast traversals require <set> from the ISO/ANSI C++ standard library
#ifdef BZ_HAVE_STD

template<_bz_bool canTryFastTraversal>
struct _bz_tryFastTraversal {
    template<class T_numtype, int N_rank, class T_expr, class T_update>
    static _bz_bool tryFast(Array<T_numtype,N_rank>& array, 
        BZ_ETPARM(T_expr) expr, T_update)
    {
        return _bz_false;
    }
};

template<>
struct _bz_tryFastTraversal<_bz_true> {
    template<class T_numtype, int N_rank, class T_expr, class T_update>
    static _bz_bool tryFast(Array<T_numtype,N_rank>& array, 
        BZ_ETPARM(T_expr) expr, T_update)
    {
        // See if there's an appropriate space filling curve available.
        // Currently fast traversals use an N-1 dimensional curve.  The
        // Nth dimension column corresponding to each point on the curve
        // is traversed in the normal fashion.
        TraversalOrderCollection<N_rank-1> traversals;
        TinyVector<int, N_rank - 1> traversalGridSize;

        for (int i=0; i < N_rank - 1; ++i)
            traversalGridSize[i] = array.length(array.ordering(i+1));

#ifdef BZ_DEBUG_TRAVERSE
cout << "traversalGridSize = " << traversalGridSize << endl;
cout.flush();
#endif

        const TraversalOrder<N_rank-1>* order =
            traversals.find(traversalGridSize);

        if (order)
        {
#ifdef BZ_DEBUG_TRAVERSE
    cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
         << ", " << N_rank << ">: Using stack traversal" << endl;
#endif
            // A curve was available -- use fast traversal.
            array.evaluateWithFastTraversal(*order, expr, T_update());
            return _bz_true;
        }

        return _bz_false;
    }
};

#endif // BZ_HAVE_STD

template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>& 
Array<T_numtype, N_rank>::evaluate(T_expr expr, 
    T_update)
{
    // Check that all arrays have the same shape
#ifdef BZ_DEBUG
    if (!expr.shapeCheck(shape()))
    {
      if (assertFailMode == _bz_false)
      {
        cerr << "[Blitz++] Shape check failed: Module " << __FILE__
             << " line " << __LINE__ << endl
             << "          Expression: ";
        prettyPrintFormat format(_bz_true);   // Use terse formatting
        string str;
        expr.prettyPrint(str, format);
        cerr << str << endl ;
      }

#if 0
// Shape dumping is broken by change to using string for prettyPrint
             << "          Shapes: " << shape() << " = ";
        prettyPrintFormat format2;
        format2.setDumpArrayShapesMode();
        expr.prettyPrint(cerr, format2);
        cerr << endl;
#endif
        BZ_PRE_FAIL;
    }
#endif

    BZPRECHECK(expr.shapeCheck(shape()),
        "Shape check failed." << endl << "Expression:");

    BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0), 
        "Assigned rank " << T_expr::rank << " expression to rank " 
        << N_rank << " array.");

    /*
     * Check that the arrays are not empty (e.g. length 0 arrays)
     * This fixes a bug found by Peter Bienstman, 6/16/99, where
     * Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i);
     * went into an infinite loop.
     */

    if (numElements() == 0)
        return *this;

#ifdef BZ_DEBUG_TRAVERSE
cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
     << endl; cout.flush();
#endif

    // Tau profiling code.  Provide Tau with a pretty-printed version of
    // the expression.
    // NEEDS_WORK-- use a static initializer somehow.

#ifdef BZ_TAU_PROFILING
    static string exprDescription;
    if (!exprDescription.length())   // faked static initializer
    {
        exprDescription = "A";
        prettyPrintFormat format(_bz_true);   // Terse mode on
        format.nextArrayOperandSymbol();
        T_update::prettyPrint(exprDescription);
        expr.prettyPrint(exprDescription, format);
    }
    TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
#endif

    // Determine which evaluation mechanism to use 
    if (T_expr::numIndexPlaceholders > 0)
    {
        // The expression involves index placeholders, so have to
        // use index traversal rather than stack traversal.

        if (N_rank == 1)
            return evaluateWithIndexTraversal1(expr, T_update());
        else
            return evaluateWithIndexTraversalN(expr, T_update());
    }
    else {

        // If this expression looks like an array stencil, then attempt to
        // use a fast traversal order.
        // Fast traversals require <set> from the ISO/ANSI C++ standard
        // library.

#ifdef BZ_HAVE_STD

        enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
            && (T_expr::numIndexPlaceholders == 0) };

        if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
            return *this;

#endif

#ifdef BZ_ARRAY_2D_STENCIL_TILING
        // Does this look like a 2-dimensional stencil on a largeish
        // array?

        if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
        {
            // Use a heuristic to determine whether a tiled traversal
            // is desirable.  First, estimate how much L1 cache is needed 
            // to achieve a high hit rate using the stack traversal.
            // Try to err on the side of using tiled traversal even when
            // it isn't strictly needed.

            // Assumptions:
            //    Stencil width 3
            //    3 arrays involved in stencil
            //    Uniform data type in arrays (all T_numtype)
            
            int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
            if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
                return evaluateWithTiled2DTraversal(expr, T_update());
        }

#endif

        // If fast traversal isn't available or appropriate, then just
        // do a stack traversal.
        if (N_rank == 1)
            return evaluateWithStackTraversal1(expr, T_update());
        else
            return evaluateWithStackTraversalN(expr, T_update());
    }
}

template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithStackTraversal1(
    T_expr expr, T_update)
{
#ifdef BZ_DEBUG_TRAVERSE
    BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
         << ", " << N_rank << ">: Using stack traversal");
#endif
    FastArrayIterator<T_numtype, N_rank> iter(*this);
    iter.loadStride(firstRank);
    expr.loadStride(firstRank);

    _bz_bool useUnitStride = iter.isUnitStride(firstRank)
          && expr.isUnitStride(firstRank);

#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
    int commonStride = expr.suggestStride(firstRank);
    if (iter.suggestStride(firstRank) > commonStride)
        commonStride = iter.suggestStride(firstRank);
    bool useCommonStride = iter.isStride(firstRank,commonStride)
        && expr.isStride(firstRank,commonStride);

 #ifdef BZ_DEBUG_TRAVERSE
    BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
        << "    commonStride = " << commonStride << " useCommonStride = "
        << useCommonStride);
 #endif
#else
    int commonStride = 1;
    bool useCommonStride = _bz_false;
#endif

    const T_numtype * last = iter.data() + length(firstRank) 
        * stride(firstRank);

    if (useUnitStride || useCommonStride)
    {
#ifdef BZ_USE_FAST_READ_ARRAY_EXPR

#ifdef BZ_DEBUG_TRAVERSE
    BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
#endif
        int ubound = length(firstRank) * commonStride;
        T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());

        if (commonStride == 1)
        {
 #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
            for (int i=0; i < ubound; ++i)
                T_update::update(data[i], expr.fastRead(i));
 #else
            int n1 = ubound & 3;
            int i = 0;
            for (; i < n1; ++i)
                T_update::update(data[i], expr.fastRead(i));
           
            for (; i < ubound; i += 4)
            {
#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
                T_update::update(data[i], expr.fastRead(i));
                T_update::update(data[i+1], expr.fastRead(i+1));
                T_update::update(data[i+2], expr.fastRead(i+2));
                T_update::update(data[i+3], expr.fastRead(i+3));
#else
                int t1 = i+1;
                int t2 = i+2;
                int t3 = i+3;

                _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;

                tmp1 = expr.fastRead(i);
                tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
                tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
                tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));

                T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
                T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
                T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
                T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
#endif
            }
 #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL

        }
 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
        else {

  #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
            for (int i=0; i < ubound; i += commonStride)
                T_update::update(data[i], expr.fastRead(i));
  #else
            int n1 = (length(firstRank) & 3) * commonStride;

            int i = 0;
            for (; i < n1; i += commonStride)
                T_update::update(data[i], expr.fastRead(i));

            int strideInc = 4 * commonStride;
            for (; i < ubound; i += strideInc)
            {
                T_update::update(data[i], expr.fastRead(i));
                int i2 = i + commonStride;
                T_update::update(data[i2], expr.fastRead(i2));
                int i3 = i + 2 * commonStride;
                T_update::update(data[i3], expr.fastRead(i3));
                int i4 = i + 3 * commonStride;
                T_update::update(data[i4], expr.fastRead(i4));
            }
  #endif  // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
        }
 #endif  // BZ_ARRAY_EXPR_USE_COMMON_STRIDE

#else   // ! BZ_USE_FAST_READ_ARRAY_EXPR

#ifdef BZ_DEBUG_TRAVERSE
    BZ_DEBUG_MESSAGE("Common stride, no fast read");
#endif
        while (iter.data() != last)
        {
            T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
            iter.advance(commonStride);
            expr.advance(commonStride);
        }
#endif
    }
    else {
        while (iter.data() != last)
        {
            T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
            iter.advance();
            expr.advance();
        }
    }

    return *this;
}

template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithStackTraversalN(
    T_expr expr, T_update)
{
    /*
     * A stack traversal replaces the usual nested loops:
     *
     * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
     *   for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
     *     for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
     *       A(i,j,k) = 0;
     *
     * with a stack data structure.  The stack allows this single
     * routine to replace any number of nested loops.
     *
     * For each dimension (loop), these quantities are needed:
     * - a pointer to the first element encountered in the loop
     * - the stride associated with the dimension/loop
     * - a pointer to the last element encountered in the loop
     *
     * The basic idea is that entering each loop is a "push" onto the
     * stack, and exiting each loop is a "pop".  In practice, this
     * routine treats accesses the stack in a random-access way,
     * which confuses the picture a bit.  But conceptually, that's
     * what is going on.
     */

    /*
     * ordering(0) gives the dimension associated with the smallest
     * stride (usually; the exceptions have to do with subarrays and
     * are uninteresting).  We call this dimension maxRank; it will
     * become the innermost "loop".
     *
     * Ordering the loops from ordering(N_rank-1) down to
     * ordering(0) ensures that the largest stride is associated
     * with the outermost loop, and the smallest stride with the
     * innermost.  This is critical for good performance on
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -