📄 eval.cc

📁 数值计算工具库,C语言编写的,可以直接调用.
💻 CC
📖 第 1 页 / 共 3 页
字号:
     * cached machines.
     */

    const int maxRank = ordering(0);
    const int secondLastRank = ordering(1);

    // Create an iterator for the array receiving the result
    FastArrayIterator<T_numtype, N_rank> iter(*this);

    // Set the initial stack configuration by pushing the pointer
    // to the first element of the array onto the stack N times.

    int i;
    for (i=1; i < N_rank; ++i)
    {
        iter.push(i);
        expr.push(i);
    }

    // Load the strides associated with the innermost loop.
    iter.loadStride(maxRank);
    expr.loadStride(maxRank);

    /* 
     * Is the stride in the innermost loop equal to 1?  If so,
     * we might take advantage of this and generate more
     * efficient code.
     */
    _bz_bool useUnitStride = iter.isUnitStride(maxRank)
                          && expr.isUnitStride(maxRank);

    /*
     * Do all array operands share a common stride in the innermost
     * loop?  If so, we can generate more efficient code (but only
     * if this optimization has been enabled).
     */
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
    int commonStride = expr.suggestStride(maxRank);
    if (iter.suggestStride(maxRank) > commonStride)
        commonStride = iter.suggestStride(maxRank);
    bool useCommonStride = iter.isStride(maxRank,commonStride)
        && expr.isStride(maxRank,commonStride);

#ifdef BZ_DEBUG_TRAVERSE
    BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
        << "commonStride = " << commonStride << " useCommonStride = "
        << useCommonStride);
#endif

#else
    int commonStride = 1;
    bool useCommonStride = _bz_false;
#endif

    /*
     * The "last" array contains a pointer to the last element
     * encountered in each "loop".
     */
    const T_numtype* last[N_rank];

    // Set up the initial state of the "last" array
    for (i=1; i < N_rank; ++i)
        last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));

    int lastLength = length(maxRank);
    int firstNoncollapsedLoop = 1;

#ifdef BZ_COLLAPSE_LOOPS

    /*
     * This bit of code handles collapsing loops.  When possible,
     * the N nested loops are converted into a single loop (basically,
     * the N-dimensional array is treated as a long vector).
     * This is important for cases where the length of the innermost
     * loop is very small, for example a 100x100x3 array.
     * If this code can't collapse all the loops into a single loop,
     * it will collapse as many loops as possible starting from the
     * innermost and working out.
     */

    // Collapse loops when possible
    for (i=1; i < N_rank; ++i)
    {
        // Figure out which pair of loops we are considering combining.
        int outerLoopRank = ordering(i);
        int innerLoopRank = ordering(i-1);

        /*
         * The canCollapse() routines look at the strides and extents
         * of the loops, and determine if they can be combined into
         * one loop.
         */

        if (canCollapse(outerLoopRank,innerLoopRank) 
          && expr.canCollapse(outerLoopRank,innerLoopRank))
        {
#ifdef BZ_DEBUG_TRAVERSE
            cout << "Collapsing " << outerLoopRank << " and " 
                 << innerLoopRank << endl;
#endif
            lastLength *= length(outerLoopRank);
            firstNoncollapsedLoop = i+1;
        }
//        else  
//            break;
    }
#endif // BZ_COLLAPSE_LOOPS

    /*
     * Now we actually perform the loops.  This while loop contains
     * two parts: first, the innermost loop is performed.  Then we
     * exit the loop, and pop our way down the stack until we find
     * a loop that isn't completed.  We then restart the inner loops
     * and push them onto the stack.
     */

    while (true) {

        /*
         * This bit of code handles the innermost loop.  It would look
         * a lot simpler if it weren't for unit stride and common stride
         * optimizations; these clutter up the code with multiple versions.
         */

        if ((useUnitStride) || (useCommonStride))
        {
            T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data()) 
                + lastLength;

#ifdef BZ_USE_FAST_READ_ARRAY_EXPR

            /*
             * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
             * be taken out.  This was put in place while the unit stride/
             * common stride optimizations were being implemented and
             * tested.
             */

            // Calculate the end of the innermost loop
            int ubound = lastLength * commonStride;

            /*
             * This is a real kludge.  I didn't want to have to write
             * a const and non-const version of FastArrayIterator, so I use a
             * const iterator and cast away const.  This could
             * probably be avoided with some trick, but the whole routine
             * is ugly, so why bother.
             */

            T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());

            /*
             * BZ_NEEDS_WORK-- need to implement optional unrolling.
             */
            if (commonStride == 1)
            {
                for (int i=0; i < ubound; ++i)
                    T_update::update(data[i], expr.fastRead(i));
            }
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
            else {
                for (int i=0; i < ubound; i += commonStride)
                    T_update::update(data[i], expr.fastRead(i));
            }
#endif
            /*
             * Tidy up for the fact that we haven't actually been
             * incrementing the iterators in the innermost loop, by
             * faking it afterward.
             */
            iter.advance(lastLength * commonStride);
            expr.advance(lastLength * commonStride);
#else        
            // !BZ_USE_FAST_READ_ARRAY_EXPR
            // This bit of code not really needed; should remove at some
            // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR 

            while (iter.data() != end) 
            {
                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
                iter.advance(commonStride);
                expr.advance(commonStride);
            }
#endif
        }
        else {
            /*
             * We don't have a unit stride or common stride in the innermost
             * loop.  This is going to hurt performance.  Luckily 95% of
             * the time, we hit the cases above.
             */
            T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
                + lastLength * stride(maxRank);

            while (iter.data() != end)
            {
                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
                iter.advance();
                expr.advance();
            }
        }


        /*
         * We just finished the innermost loop.  Now we pop our way down
         * the stack, until we hit a loop that hasn't completed yet.
         */ 
        int j = firstNoncollapsedLoop;
        for (; j < N_rank; ++j)
        {
            // Get the next loop
            int r = ordering(j);

            // Pop-- this restores the data pointers to the first element
            // encountered in the loop.
            iter.pop(j);
            expr.pop(j);

            // Load the stride associated with this loop, and increment
            // once.
            iter.loadStride(r);
            expr.loadStride(r);
            iter.advance();
            expr.advance();

            // If we aren't at the end of this loop, then stop popping.
            if (iter.data() != last[j])
                break;
        }

        // Are we completely done?
        if (j == N_rank)
            break;

        // No, so push all the inner loops back onto the stack.
        for (; j >= firstNoncollapsedLoop; --j)
        {
            int r2 = ordering(j-1);
            iter.push(j);
            expr.push(j);
            last[j-1] = iter.data() + length(r2) * stride(r2);
        }

        // Load the stride for the innermost loop again.
        iter.loadStride(maxRank);
        expr.loadStride(maxRank);
    }

    return *this;
}

template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(
    T_expr expr, T_update)
{
    TinyVector<int,N_rank> index;

    if (stride(firstRank) == 1)
    {
        T_numtype * _bz_restrict iter = data_;
        int last = ubound(firstRank);

        for (index[0] = lbound(firstRank); index[0] <= last;
            ++index[0])
        {
            iter[index[0]] = T_numtype(expr(index));
        }
    }
    else {
        FastArrayIterator<T_numtype, N_rank> iter(*this);
        iter.loadStride(0);
        int last = ubound(firstRank);

        for (index[0] = lbound(firstRank); index[0] <= last;
            ++index[0])
        {
            T_update::update(*const_cast<T_numtype*>(iter.data()), 
                expr(index));
            iter.advance();
        }
    }

    return *this;
}

template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(
    T_expr expr, T_update)
{
    // Do a stack-type traversal for the destination array and use
    // index traversal for the source expression
   
    const int maxRank = ordering(0);
    const int secondLastRank = ordering(1);

#ifdef BZ_DEBUG_TRAVERSE
cout << "Index traversal: N_rank = " << N_rank << endl;
cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
     << endl;
cout.flush();
#endif

    FastArrayIterator<T_numtype, N_rank> iter(*this);
    for (int i=1; i < N_rank; ++i)
        iter.push(ordering(i));

    iter.loadStride(maxRank);

    TinyVector<int,N_rank> index, last;

    index = storage_.base();
    last = storage_.base() + length_;

    int lastLength = length(maxRank);

    while (true) {

        for (index[maxRank] = base(maxRank); 
             index[maxRank] < last[maxRank]; 
             ++index[maxRank])
        {
#ifdef BZ_DEBUG_TRAVERSE
#if 0
    cout << "(" << index[0] << "," << index[1] << ") " << endl;
    cout.flush();
#endif
#endif

            T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
            iter.advance();
        }

        int j = 1;
        for (; j < N_rank; ++j)
        {
            iter.pop(ordering(j));
            iter.loadStride(ordering(j));
            iter.advance();

            index[ordering(j-1)] = base(ordering(j-1));
            ++index[ordering(j)];
            if (index[ordering(j)] != last[ordering(j)])
                break;
        }

        if (j == N_rank)
            break;

        for (; j > 0; --j)
        {
            iter.push(ordering(j));
        }
        iter.loadStride(maxRank);
    }

    return *this; 
}

// Fast traversals require <set> from the ISO/ANSI C++ standard library

#ifdef BZ_HAVE_STD

template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithFastTraversal(
    const TraversalOrder<N_rank - 1>& order, 
    T_expr expr,
    T_update)
{
    const int maxRank = ordering(0);
    const int secondLastRank = ordering(1);

#ifdef BZ_DEBUG_TRAVERSE
cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
     << endl;
#endif

    FastArrayIterator<T_numtype, N_rank> iter(*this);
    iter.push(0);
    expr.push(0);

    _bz_bool useUnitStride = iter.isUnitStride(maxRank) 
                          && expr.isUnitStride(maxRank);

#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
    int commonStride = expr.suggestStride(maxRank);
    if (iter.suggestStride(maxRank) > commonStride)
        commonStride = iter.suggestStride(maxRank);
    bool useCommonStride = iter.isStride(maxRank,commonStride)
        && expr.isStride(maxRank,commonStride);
#else
    int commonStride = 1;
    bool useCommonStride = _bz_false;
#endif

    int lastLength = length(maxRank);

    for (int i=0; i < order.length(); ++i)
    {
        iter.pop(0);
        expr.pop(0);

#ifdef BZ_DEBUG_TRAVERSE
    cerr << "Traversing: " << order[i] << endl;
#endif
        // Position the iterator at the start of the next column       
        for (int j=1; j < N_rank; ++j)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -