📄 eval.cc
字号:
FastArrayIterator<T_numtype, N_rank> iter(*this); // Set the initial stack configuration by pushing the pointer // to the first element of the array onto the stack N times. int i; for (i=1; i < N_rank; ++i) { iter.push(i); expr.push(i); } // Load the strides associated with the innermost loop. iter.loadStride(maxRank); expr.loadStride(maxRank); /* * Is the stride in the innermost loop equal to 1? If so, * we might take advantage of this and generate more * efficient code. */ bool useUnitStride = iter.isUnitStride(maxRank) && expr.isUnitStride(maxRank); /* * Do all array operands share a common stride in the innermost * loop? If so, we can generate more efficient code (but only * if this optimization has been enabled). */#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE int commonStride = expr.suggestStride(maxRank); if (iter.suggestStride(maxRank) > commonStride) commonStride = iter.suggestStride(maxRank); bool useCommonStride = iter.isStride(maxRank,commonStride) && expr.isStride(maxRank,commonStride);#ifdef BZ_DEBUG_TRAVERSE BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl << "commonStride = " << commonStride << " useCommonStride = " << useCommonStride);#endif#else int commonStride = 1; bool useCommonStride = false;#endif /* * The "last" array contains a pointer to the last element * encountered in each "loop". */ const T_numtype* last[N_rank]; // Set up the initial state of the "last" array for (i=1; i < N_rank; ++i) last[i] = iter.data() + length(ordering(i)) * stride(ordering(i)); int lastLength = length(maxRank); int firstNoncollapsedLoop = 1;#ifdef BZ_COLLAPSE_LOOPS /* * This bit of code handles collapsing loops. When possible, * the N nested loops are converted into a single loop (basically, * the N-dimensional array is treated as a long vector). * This is important for cases where the length of the innermost * loop is very small, for example a 100x100x3 array. * If this code can't collapse all the loops into a single loop, * it will collapse as many loops as possible starting from the * innermost and working out. */ // Collapse loops when possible for (i=1; i < N_rank; ++i) { // Figure out which pair of loops we are considering combining. int outerLoopRank = ordering(i); int innerLoopRank = ordering(i-1); /* * The canCollapse() routines look at the strides and extents * of the loops, and determine if they can be combined into * one loop. */ if (canCollapse(outerLoopRank,innerLoopRank) && expr.canCollapse(outerLoopRank,innerLoopRank)) {#ifdef BZ_DEBUG_TRAVERSE cout << "Collapsing " << outerLoopRank << " and " << innerLoopRank << endl;#endif lastLength *= length(outerLoopRank); firstNoncollapsedLoop = i+1; } else break; }#endif // BZ_COLLAPSE_LOOPS /* * Now we actually perform the loops. This while loop contains * two parts: first, the innermost loop is performed. Then we * exit the loop, and pop our way down the stack until we find * a loop that isn't completed. We then restart the inner loops * and push them onto the stack. */ while (true) { /* * This bit of code handles the innermost loop. It would look * a lot simpler if it weren't for unit stride and common stride * optimizations; these clutter up the code with multiple versions. */ if ((useUnitStride) || (useCommonStride)) {#ifdef BZ_USE_FAST_READ_ARRAY_EXPR /* * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably * be taken out. This was put in place while the unit stride/ * common stride optimizations were being implemented and * tested. */ // Calculate the end of the innermost loop int ubound = lastLength * commonStride; /* * This is a real kludge. I didn't want to have to write * a const and non-const version of FastArrayIterator, so I use a * const iterator and cast away const. This could * probably be avoided with some trick, but the whole routine * is ugly, so why bother. */ T_numtype* restrict data = const_cast<T_numtype*>(iter.data()); /* * BZ_NEEDS_WORK-- need to implement optional unrolling. */ if (commonStride == 1) { for (int i=0; i < ubound; ++i) T_update::update(*data++, expr.fastRead(i)); }#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE else { for (int i=0; i != ubound; i += commonStride) T_update::update(data[i], expr.fastRead(i)); }#endif /* * Tidy up for the fact that we haven't actually been * incrementing the iterators in the innermost loop, by * faking it afterward. */ iter.advance(lastLength * commonStride); expr.advance(lastLength * commonStride);#else // !BZ_USE_FAST_READ_ARRAY_EXPR // This bit of code not really needed; should remove at some // point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR T_numtype * restrict end = const_cast<T_numtype*>(iter.data()) + lastLength; while (iter.data() != end) { T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); iter.advance(commonStride); expr.advance(commonStride); }#endif } else { /* * We don't have a unit stride or common stride in the innermost * loop. This is going to hurt performance. Luckily 95% of * the time, we hit the cases above. */ T_numtype * restrict end = const_cast<T_numtype*>(iter.data()) + lastLength * stride(maxRank); while (iter.data() != end) { T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); iter.advance(); expr.advance(); } } /* * We just finished the innermost loop. Now we pop our way down * the stack, until we hit a loop that hasn't completed yet. */ int j = firstNoncollapsedLoop; for (; j < N_rank; ++j) { // Get the next loop int r = ordering(j); // Pop-- this restores the data pointers to the first element // encountered in the loop. iter.pop(j); expr.pop(j); // Load the stride associated with this loop, and increment // once. iter.loadStride(r); expr.loadStride(r); iter.advance(); expr.advance(); // If we aren't at the end of this loop, then stop popping. if (iter.data() != last[j]) break; } // Are we completely done? if (j == N_rank) break; // No, so push all the inner loops back onto the stack. for (; j >= firstNoncollapsedLoop; --j) { int r2 = ordering(j-1); iter.push(j); expr.push(j); last[j-1] = iter.data() + length(r2) * stride(r2); } // Load the stride for the innermost loop again. iter.loadStride(maxRank); expr.loadStride(maxRank); } return *this;}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithIndexTraversal1( T_expr expr, T_update){ TinyVector<int,N_rank> index; if (stride(firstRank) == 1) { T_numtype * restrict iter = data_ + lbound(firstRank); int last = ubound(firstRank); for (index[0] = lbound(firstRank); index[0] <= last; ++index[0]) { T_update::update(*iter++, expr(index)); } } else { FastArrayIterator<T_numtype, N_rank> iter(*this); iter.loadStride(0); int last = ubound(firstRank); for (index[0] = lbound(firstRank); index[0] <= last; ++index[0]) { T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index)); iter.advance(); } } return *this;}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithIndexTraversalN( T_expr expr, T_update){ // Do a stack-type traversal for the destination array and use // index traversal for the source expression const int maxRank = ordering(0);#ifdef BZ_DEBUG_TRAVERSE const int secondLastRank = ordering(1); cout << "Index traversal: N_rank = " << N_rank << endl; cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank << endl; cout.flush();#endif FastArrayIterator<T_numtype, N_rank> iter(*this); for (int i=1; i < N_rank; ++i) iter.push(ordering(i)); iter.loadStride(maxRank); TinyVector<int,N_rank> index, last; index = storage_.base(); for (int i=0; i < N_rank; ++i) last(i) = storage_.base(i) + length_(i); // int lastLength = length(maxRank); while (true) { for (index[maxRank] = base(maxRank); index[maxRank] < last[maxRank]; ++index[maxRank]) {#ifdef BZ_DEBUG_TRAVERSE#if 0 cout << "(" << index[0] << "," << index[1] << ") " << endl; cout.flush();#endif#endif T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index)); iter.advance(); } int j = 1; for (; j < N_rank; ++j) { iter.pop(ordering(j)); iter.loadStride(ordering(j)); iter.advance(); index[ordering(j-1)] = base(ordering(j-1)); ++index[ordering(j)]; if (index[ordering(j)] != last[ordering(j)]) break; } if (j == N_rank) break; for (; j > 0; --j) { iter.push(ordering(j)); } iter.loadStride(maxRank); } return *this; }// Fast traversals require <set> from the ISO/ANSI C++ standard library#ifdef BZ_HAVE_STD#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSALtemplate<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithFastTraversal( const TraversalOrder<N_rank - 1>& order, T_expr expr, T_update){ const int maxRank = ordering(0);#ifdef BZ_DEBUG_TRAVERSE const int secondLastRank = ordering(1); cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank << endl;#endif FastArrayIterator<T_numtype, N_rank> iter(*this); iter.push(0); expr.push(0); bool useUnitStride = iter.isUnitStride(maxRank) && expr.isUnitStride(maxRank);#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE int commonStride = expr.suggestStride(maxRank); if (iter.suggestStride(maxRank) > commonStride) commonStride = iter.suggestStride(maxRank); bool useCommonStride = iter.isStride(maxRank,commonStride) && expr.isStride(maxRank,commonStride);#else int commonStride = 1; bool useCommonStride = false;#endif int lastLength = length(maxRank); for (int i=0; i < order.length(); ++i) { iter.pop(0); expr.pop(0);#ifdef BZ_DEBUG_TRAVERSE cerr << "Traversing: " << order[i] << endl;#endif // Position the iterator at the start of the next column for (int j=1; j < N_rank; ++j) { iter.loadStride(ordering(j)); expr.loadStride(ordering(j));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -