📄 eval.cc

📁 著名的数学计算类库
💻 CC
📖 第 1 页 / 共 3 页
字号:
    FastArrayIterator<T_numtype, N_rank> iter(*this);    // Set the initial stack configuration by pushing the pointer    // to the first element of the array onto the stack N times.    int i;    for (i=1; i < N_rank; ++i)    {        iter.push(i);        expr.push(i);    }    // Load the strides associated with the innermost loop.    iter.loadStride(maxRank);    expr.loadStride(maxRank);    /*      * Is the stride in the innermost loop equal to 1?  If so,     * we might take advantage of this and generate more     * efficient code.     */    bool useUnitStride = iter.isUnitStride(maxRank)                          && expr.isUnitStride(maxRank);    /*     * Do all array operands share a common stride in the innermost     * loop?  If so, we can generate more efficient code (but only     * if this optimization has been enabled).     */#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE    int commonStride = expr.suggestStride(maxRank);    if (iter.suggestStride(maxRank) > commonStride)        commonStride = iter.suggestStride(maxRank);    bool useCommonStride = iter.isStride(maxRank,commonStride)        && expr.isStride(maxRank,commonStride);#ifdef BZ_DEBUG_TRAVERSE    BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl        << "commonStride = " << commonStride << " useCommonStride = "        << useCommonStride);#endif#else    int commonStride = 1;    bool useCommonStride = false;#endif    /*     * The "last" array contains a pointer to the last element     * encountered in each "loop".     */    const T_numtype* last[N_rank];    // Set up the initial state of the "last" array    for (i=1; i < N_rank; ++i)        last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));    int lastLength = length(maxRank);    int firstNoncollapsedLoop = 1;#ifdef BZ_COLLAPSE_LOOPS    /*     * This bit of code handles collapsing loops.  When possible,     * the N nested loops are converted into a single loop (basically,     * the N-dimensional array is treated as a long vector).     * This is important for cases where the length of the innermost     * loop is very small, for example a 100x100x3 array.     * If this code can't collapse all the loops into a single loop,     * it will collapse as many loops as possible starting from the     * innermost and working out.     */    // Collapse loops when possible    for (i=1; i < N_rank; ++i)    {        // Figure out which pair of loops we are considering combining.        int outerLoopRank = ordering(i);        int innerLoopRank = ordering(i-1);        /*         * The canCollapse() routines look at the strides and extents         * of the loops, and determine if they can be combined into         * one loop.         */        if (canCollapse(outerLoopRank,innerLoopRank)           && expr.canCollapse(outerLoopRank,innerLoopRank))        {#ifdef BZ_DEBUG_TRAVERSE            cout << "Collapsing " << outerLoopRank << " and "                  << innerLoopRank << endl;#endif            lastLength *= length(outerLoopRank);            firstNoncollapsedLoop = i+1;        }        else              break;    }#endif // BZ_COLLAPSE_LOOPS    /*     * Now we actually perform the loops.  This while loop contains     * two parts: first, the innermost loop is performed.  Then we     * exit the loop, and pop our way down the stack until we find     * a loop that isn't completed.  We then restart the inner loops     * and push them onto the stack.     */    while (true) {        /*         * This bit of code handles the innermost loop.  It would look         * a lot simpler if it weren't for unit stride and common stride         * optimizations; these clutter up the code with multiple versions.         */        if ((useUnitStride) || (useCommonStride))        {#ifdef BZ_USE_FAST_READ_ARRAY_EXPR            /*             * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably             * be taken out.  This was put in place while the unit stride/             * common stride optimizations were being implemented and             * tested.             */            // Calculate the end of the innermost loop            int ubound = lastLength * commonStride;            /*             * This is a real kludge.  I didn't want to have to write             * a const and non-const version of FastArrayIterator, so I use a             * const iterator and cast away const.  This could             * probably be avoided with some trick, but the whole routine             * is ugly, so why bother.             */            T_numtype* restrict data = const_cast<T_numtype*>(iter.data());            /*             * BZ_NEEDS_WORK-- need to implement optional unrolling.             */            if (commonStride == 1)            {                for (int i=0; i < ubound; ++i)                    T_update::update(*data++, expr.fastRead(i));            }#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE            else {                for (int i=0; i != ubound; i += commonStride)                    T_update::update(data[i], expr.fastRead(i));            }#endif            /*             * Tidy up for the fact that we haven't actually been             * incrementing the iterators in the innermost loop, by             * faking it afterward.             */            iter.advance(lastLength * commonStride);            expr.advance(lastLength * commonStride);#else                    // !BZ_USE_FAST_READ_ARRAY_EXPR            // This bit of code not really needed; should remove at some            // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR             T_numtype * restrict end = const_cast<T_numtype*>(iter.data())                 + lastLength;            while (iter.data() != end)             {                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);                iter.advance(commonStride);                expr.advance(commonStride);            }#endif        }        else {            /*             * We don't have a unit stride or common stride in the innermost             * loop.  This is going to hurt performance.  Luckily 95% of             * the time, we hit the cases above.             */            T_numtype * restrict end = const_cast<T_numtype*>(iter.data())                + lastLength * stride(maxRank);            while (iter.data() != end)            {                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);                iter.advance();                expr.advance();            }        }        /*         * We just finished the innermost loop.  Now we pop our way down         * the stack, until we hit a loop that hasn't completed yet.         */         int j = firstNoncollapsedLoop;        for (; j < N_rank; ++j)        {            // Get the next loop            int r = ordering(j);            // Pop-- this restores the data pointers to the first element            // encountered in the loop.            iter.pop(j);            expr.pop(j);            // Load the stride associated with this loop, and increment            // once.            iter.loadStride(r);            expr.loadStride(r);            iter.advance();            expr.advance();            // If we aren't at the end of this loop, then stop popping.            if (iter.data() != last[j])                break;        }        // Are we completely done?        if (j == N_rank)            break;        // No, so push all the inner loops back onto the stack.        for (; j >= firstNoncollapsedLoop; --j)        {            int r2 = ordering(j-1);            iter.push(j);            expr.push(j);            last[j-1] = iter.data() + length(r2) * stride(r2);        }        // Load the stride for the innermost loop again.        iter.loadStride(maxRank);        expr.loadStride(maxRank);    }    return *this;}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(    T_expr expr, T_update){    TinyVector<int,N_rank> index;    if (stride(firstRank) == 1)    {        T_numtype * restrict iter = data_ + lbound(firstRank);        int last = ubound(firstRank);        for (index[0] = lbound(firstRank); index[0] <= last;            ++index[0])        {            T_update::update(*iter++, expr(index));        }    }    else {        FastArrayIterator<T_numtype, N_rank> iter(*this);        iter.loadStride(0);        int last = ubound(firstRank);        for (index[0] = lbound(firstRank); index[0] <= last;            ++index[0])        {            T_update::update(*const_cast<T_numtype*>(iter.data()),                 expr(index));            iter.advance();        }    }    return *this;}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(    T_expr expr, T_update){    // Do a stack-type traversal for the destination array and use    // index traversal for the source expression       const int maxRank = ordering(0);#ifdef BZ_DEBUG_TRAVERSE    const int secondLastRank = ordering(1);    cout << "Index traversal: N_rank = " << N_rank << endl;    cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank         << endl;    cout.flush();#endif    FastArrayIterator<T_numtype, N_rank> iter(*this);    for (int i=1; i < N_rank; ++i)        iter.push(ordering(i));    iter.loadStride(maxRank);    TinyVector<int,N_rank> index, last;    index = storage_.base();    for (int i=0; i < N_rank; ++i)      last(i) = storage_.base(i) + length_(i);    // int lastLength = length(maxRank);    while (true) {        for (index[maxRank] = base(maxRank);              index[maxRank] < last[maxRank];              ++index[maxRank])        {#ifdef BZ_DEBUG_TRAVERSE#if 0    cout << "(" << index[0] << "," << index[1] << ") " << endl;    cout.flush();#endif#endif            T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));            iter.advance();        }        int j = 1;        for (; j < N_rank; ++j)        {            iter.pop(ordering(j));            iter.loadStride(ordering(j));            iter.advance();            index[ordering(j-1)] = base(ordering(j-1));            ++index[ordering(j)];            if (index[ordering(j)] != last[ordering(j)])                break;        }        if (j == N_rank)            break;        for (; j > 0; --j)        {            iter.push(ordering(j));        }        iter.loadStride(maxRank);    }    return *this; }// Fast traversals require <set> from the ISO/ANSI C++ standard library#ifdef BZ_HAVE_STD#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSALtemplate<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithFastTraversal(    const TraversalOrder<N_rank - 1>& order,     T_expr expr,    T_update){    const int maxRank = ordering(0);#ifdef BZ_DEBUG_TRAVERSE    const int secondLastRank = ordering(1);    cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank         << endl;#endif    FastArrayIterator<T_numtype, N_rank> iter(*this);    iter.push(0);    expr.push(0);    bool useUnitStride = iter.isUnitStride(maxRank)                           && expr.isUnitStride(maxRank);#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE    int commonStride = expr.suggestStride(maxRank);    if (iter.suggestStride(maxRank) > commonStride)        commonStride = iter.suggestStride(maxRank);    bool useCommonStride = iter.isStride(maxRank,commonStride)        && expr.isStride(maxRank,commonStride);#else    int commonStride = 1;    bool useCommonStride = false;#endif    int lastLength = length(maxRank);    for (int i=0; i < order.length(); ++i)    {        iter.pop(0);        expr.pop(0);#ifdef BZ_DEBUG_TRAVERSE    cerr << "Traversing: " << order[i] << endl;#endif        // Position the iterator at the start of the next column               for (int j=1; j < N_rank; ++j)        {            iter.loadStride(ordering(j));            expr.loadStride(ordering(j));
💿 文件大小 3117 K
👤 上传用户 evenxyz
📂 所属分类数学计算
🏷️ 相关标签

#计算
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -