📄 eval.cc

📁 著名的数学计算类库
💻 CC
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
#ifndef BZ_ARRAYEVAL_CC#define BZ_ARRAYEVAL_CC#ifndef BZ_ARRAY_H #error <blitz/array/eval.cc> must be included via <blitz/array.h>#endifBZ_NAMESPACE(blitz)/* * Assign an expression to an array.  For performance reasons, there are * several traversal mechanisms: * * - Index traversal scans through the destination array in storage order. *   The expression is evaluated using a TinyVector<int,N> operand.  This *   version is used only when there are index placeholders in the expression *   (see <blitz/indexexpr.h>) * - Stack traversal also scans through the destination array in storage *   order.  However, push/pop stack iterators are used. * - Fast traversal follows a Hilbert (or other) space-filling curve to *   improve cache reuse for stencilling operations.  Currently, the *   space filling curves must be generated by calling  *   generateFastTraversalOrder(TinyVector<int,N_dimensions>). * - 2D tiled traversal follows a tiled traversal, to improve cache reuse *   for 2D stencils.  Space filling curves have too much overhead to use *   in two-dimensions. * * _bz_tryFastTraversal is a helper class.  Fast traversals are only * attempted if the expression looks like a stencil -- it's at least * three-dimensional, has at least six array operands, and there are * no index placeholders in the expression.  These are all things which * can be checked at compile time, so the if()/else() syntax has been * replaced with this class template. */// Fast traversals require <set> from the ISO/ANSI C++ standard library#ifdef BZ_HAVE_STD#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSALtemplate<bool canTryFastTraversal>struct _bz_tryFastTraversal {    template<typename T_numtype, int N_rank, typename T_expr, typename T_update>    static bool tryFast(Array<T_numtype,N_rank>& array,         BZ_ETPARM(T_expr) expr, T_update)    {        return false;    }};template<>struct _bz_tryFastTraversal<true> {    template<typename T_numtype, int N_rank, typename T_expr, typename T_update>    static bool tryFast(Array<T_numtype,N_rank>& array,         BZ_ETPARM(T_expr) expr, T_update)    {        // See if there's an appropriate space filling curve available.        // Currently fast traversals use an N-1 dimensional curve.  The        // Nth dimension column corresponding to each point on the curve        // is traversed in the normal fashion.        TraversalOrderCollection<N_rank-1> traversals;        TinyVector<int, N_rank - 1> traversalGridSize;        for (int i=0; i < N_rank - 1; ++i)            traversalGridSize[i] = array.length(array.ordering(i+1));#ifdef BZ_DEBUG_TRAVERSEcout << "traversalGridSize = " << traversalGridSize << endl;cout.flush();#endif        const TraversalOrder<N_rank-1>* order =            traversals.find(traversalGridSize);        if (order)        {#ifdef BZ_DEBUG_TRAVERSE    cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)         << ", " << N_rank << ">: Using stack traversal" << endl;#endif            // A curve was available -- use fast traversal.            array.evaluateWithFastTraversal(*order, expr, T_update());            return true;        }        return false;    }};#endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL#endif // BZ_HAVE_STDtemplate<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>& Array<T_numtype, N_rank>::evaluate(T_expr expr,     T_update){    // Check that all arrays have the same shape#ifdef BZ_DEBUG    if (!expr.shapeCheck(shape()))    {      if (assertFailMode == false)      {        cerr << "[Blitz++] Shape check failed: Module " << __FILE__             << " line " << __LINE__ << endl             << "          Expression: ";        prettyPrintFormat format(true);   // Use terse formatting        BZ_STD_SCOPE(string) str;        expr.prettyPrint(str, format);        cerr << str << endl ;      }#if 0// Shape dumping is broken by change to using string for prettyPrint             << "          Shapes: " << shape() << " = ";        prettyPrintFormat format2;        format2.setDumpArrayShapesMode();        expr.prettyPrint(cerr, format2);        cerr << endl;#endif        BZ_PRE_FAIL;    }#endif    BZPRECHECK(expr.shapeCheck(shape()),        "Shape check failed." << endl << "Expression:");    BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0),         "Assigned rank " << T_expr::rank << " expression to rank "         << N_rank << " array.");    /*     * Check that the arrays are not empty (e.g. length 0 arrays)     * This fixes a bug found by Peter Bienstman, 6/16/99, where     * Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i);     * went into an infinite loop.     */    if (numElements() == 0)        return *this;#ifdef BZ_DEBUG_TRAVERSE    cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders         << endl;     cout.flush();#endif    // Tau profiling code.  Provide Tau with a pretty-printed version of    // the expression.    // NEEDS_WORK-- use a static initializer somehow.#ifdef BZ_TAU_PROFILING    static BZ_STD_SCOPE(string) exprDescription;    if (!exprDescription.length())   // faked static initializer    {        exprDescription = "A";        prettyPrintFormat format(true);   // Terse mode on        format.nextArrayOperandSymbol();        T_update::prettyPrint(exprDescription);        expr.prettyPrint(exprDescription, format);    }    TAU_PROFILE(" ", exprDescription, TAU_BLITZ);#endif    // Determine which evaluation mechanism to use     if (T_expr::numIndexPlaceholders > 0)    {        // The expression involves index placeholders, so have to        // use index traversal rather than stack traversal.        if (N_rank == 1)            return evaluateWithIndexTraversal1(expr, T_update());        else            return evaluateWithIndexTraversalN(expr, T_update());    }    else {        // If this expression looks like an array stencil, then attempt to        // use a fast traversal order.        // Fast traversals require <set> from the ISO/ANSI C++ standard        // library.#ifdef BZ_HAVE_STD#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL        enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)            && (T_expr::numIndexPlaceholders == 0) };        if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))            return *this;#endif#endif#ifdef BZ_ARRAY_2D_STENCIL_TILING        // Does this look like a 2-dimensional stencil on a largeish        // array?        if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))        {            // Use a heuristic to determine whether a tiled traversal            // is desirable.  First, estimate how much L1 cache is needed             // to achieve a high hit rate using the stack traversal.            // Try to err on the side of using tiled traversal even when            // it isn't strictly needed.            // Assumptions:            //    Stencil width 3            //    3 arrays involved in stencil            //    Uniform data type in arrays (all T_numtype)                        int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));            if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)                return evaluateWithTiled2DTraversal(expr, T_update());        }#endif        // If fast traversal isn't available or appropriate, then just        // do a stack traversal.        if (N_rank == 1)            return evaluateWithStackTraversal1(expr, T_update());        else            return evaluateWithStackTraversalN(expr, T_update());    }}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithStackTraversal1(    T_expr expr, T_update){#ifdef BZ_DEBUG_TRAVERSE    BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)         << ", " << N_rank << ">: Using stack traversal");#endif    FastArrayIterator<T_numtype, N_rank> iter(*this);    iter.loadStride(firstRank);    expr.loadStride(firstRank);    bool useUnitStride = iter.isUnitStride(firstRank)          && expr.isUnitStride(firstRank);#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE    int commonStride = expr.suggestStride(firstRank);    if (iter.suggestStride(firstRank) > commonStride)        commonStride = iter.suggestStride(firstRank);    bool useCommonStride = iter.isStride(firstRank,commonStride)        && expr.isStride(firstRank,commonStride); #ifdef BZ_DEBUG_TRAVERSE    BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl        << "    commonStride = " << commonStride << " useCommonStride = "        << useCommonStride); #endif#else    int commonStride = 1;    bool useCommonStride = false;#endif    const T_numtype * last = iter.data() + length(firstRank)         * stride(firstRank);    if (useUnitStride || useCommonStride)    {#ifdef BZ_USE_FAST_READ_ARRAY_EXPR#ifdef BZ_DEBUG_TRAVERSE    BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");#endif        int ubound = length(firstRank) * commonStride;        T_numtype* restrict data = const_cast<T_numtype*>(iter.data());        if (commonStride == 1)        { #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL            for (int i=0; i < ubound; ++i)                T_update::update(*data++, expr.fastRead(i)); #else            int n1 = ubound & 3;            int i = 0;            for (; i < n1; ++i)                T_update::update(*data++, expr.fastRead(i));                       for (; i < ubound; i += 4)            {#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS                T_update::update(*data++, expr.fastRead(i));                T_update::update(*data++, expr.fastRead(i+1));                T_update::update(*data++, expr.fastRead(i+2));                T_update::update(*data++, expr.fastRead(i+3));#else                const int t1 = i+1;                const int t2 = i+2;                const int t3 = i+3;                _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;                tmp1 = expr.fastRead(i);                tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));                tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));                tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));                T_update::update(*data++, tmp1);                T_update::update(*data++, tmp2);                T_update::update(*data++, tmp3);                T_update::update(*data++, tmp4);#endif            } #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL        } #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE        else {  #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL            for (int i=0; i != ubound; i += commonStride)                T_update::update(data[i], expr.fastRead(i));  #else            int n1 = (length(firstRank) & 3) * commonStride;            int i = 0;            for (; i != n1; i += commonStride)                T_update::update(data[i], expr.fastRead(i));            int strideInc = 4 * commonStride;            for (; i != ubound; i += strideInc)            {                T_update::update(data[i], expr.fastRead(i));                int i2 = i + commonStride;                T_update::update(data[i2], expr.fastRead(i2));                int i3 = i + 2 * commonStride;                T_update::update(data[i3], expr.fastRead(i3));                int i4 = i + 3 * commonStride;                T_update::update(data[i4], expr.fastRead(i4));            }  #endif  // BZ_ARRAY_STACK_TRAVERSAL_UNROLL        } #endif  // BZ_ARRAY_EXPR_USE_COMMON_STRIDE#else   // ! BZ_USE_FAST_READ_ARRAY_EXPR#ifdef BZ_DEBUG_TRAVERSE    BZ_DEBUG_MESSAGE("Common stride, no fast read");#endif        while (iter.data() != last)        {            T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);            iter.advance(commonStride);            expr.advance(commonStride);        }#endif    }    else {        while (iter.data() != last)        {            T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);            iter.advance();            expr.advance();        }    }    return *this;}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithStackTraversalN(    T_expr expr, T_update){    /*     * A stack traversal replaces the usual nested loops:     *     * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)     *   for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)     *     for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)     *       A(i,j,k) = 0;     *     * with a stack data structure.  The stack allows this single     * routine to replace any number of nested loops.     *     * For each dimension (loop), these quantities are needed:     * - a pointer to the first element encountered in the loop     * - the stride associated with the dimension/loop     * - a pointer to the last element encountered in the loop     *     * The basic idea is that entering each loop is a "push" onto the     * stack, and exiting each loop is a "pop".  In practice, this     * routine treats accesses the stack in a random-access way,     * which confuses the picture a bit.  But conceptually, that's     * what is going on.     */    /*     * ordering(0) gives the dimension associated with the smallest     * stride (usually; the exceptions have to do with subarrays and     * are uninteresting).  We call this dimension maxRank; it will     * become the innermost "loop".     *     * Ordering the loops from ordering(N_rank-1) down to     * ordering(0) ensures that the largest stride is associated     * with the outermost loop, and the smallest stride with the     * innermost.  This is critical for good performance on     * cached machines.     */    const int maxRank = ordering(0);    // const int secondLastRank = ordering(1);    // Create an iterator for the array receiving the result
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -