📄 eval.cc
字号:
* cached machines.
*/
const int maxRank = ordering(0);
const int secondLastRank = ordering(1);
// Create an iterator for the array receiving the result
FastArrayIterator<T_numtype, N_rank> iter(*this);
// Set the initial stack configuration by pushing the pointer
// to the first element of the array onto the stack N times.
int i;
for (i=1; i < N_rank; ++i)
{
iter.push(i);
expr.push(i);
}
// Load the strides associated with the innermost loop.
iter.loadStride(maxRank);
expr.loadStride(maxRank);
/*
* Is the stride in the innermost loop equal to 1? If so,
* we might take advantage of this and generate more
* efficient code.
*/
_bz_bool useUnitStride = iter.isUnitStride(maxRank)
&& expr.isUnitStride(maxRank);
/*
* Do all array operands share a common stride in the innermost
* loop? If so, we can generate more efficient code (but only
* if this optimization has been enabled).
*/
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
int commonStride = expr.suggestStride(maxRank);
if (iter.suggestStride(maxRank) > commonStride)
commonStride = iter.suggestStride(maxRank);
bool useCommonStride = iter.isStride(maxRank,commonStride)
&& expr.isStride(maxRank,commonStride);
#ifdef BZ_DEBUG_TRAVERSE
BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
<< "commonStride = " << commonStride << " useCommonStride = "
<< useCommonStride);
#endif
#else
int commonStride = 1;
bool useCommonStride = _bz_false;
#endif
/*
* The "last" array contains a pointer to the last element
* encountered in each "loop".
*/
const T_numtype* last[N_rank];
// Set up the initial state of the "last" array
for (i=1; i < N_rank; ++i)
last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
int lastLength = length(maxRank);
int firstNoncollapsedLoop = 1;
#ifdef BZ_COLLAPSE_LOOPS
/*
* This bit of code handles collapsing loops. When possible,
* the N nested loops are converted into a single loop (basically,
* the N-dimensional array is treated as a long vector).
* This is important for cases where the length of the innermost
* loop is very small, for example a 100x100x3 array.
* If this code can't collapse all the loops into a single loop,
* it will collapse as many loops as possible starting from the
* innermost and working out.
*/
// Collapse loops when possible
for (i=1; i < N_rank; ++i)
{
// Figure out which pair of loops we are considering combining.
int outerLoopRank = ordering(i);
int innerLoopRank = ordering(i-1);
/*
* The canCollapse() routines look at the strides and extents
* of the loops, and determine if they can be combined into
* one loop.
*/
if (canCollapse(outerLoopRank,innerLoopRank)
&& expr.canCollapse(outerLoopRank,innerLoopRank))
{
#ifdef BZ_DEBUG_TRAVERSE
cout << "Collapsing " << outerLoopRank << " and "
<< innerLoopRank << endl;
#endif
lastLength *= length(outerLoopRank);
firstNoncollapsedLoop = i+1;
}
// else
// break;
}
#endif // BZ_COLLAPSE_LOOPS
/*
* Now we actually perform the loops. This while loop contains
* two parts: first, the innermost loop is performed. Then we
* exit the loop, and pop our way down the stack until we find
* a loop that isn't completed. We then restart the inner loops
* and push them onto the stack.
*/
while (true) {
/*
* This bit of code handles the innermost loop. It would look
* a lot simpler if it weren't for unit stride and common stride
* optimizations; these clutter up the code with multiple versions.
*/
if ((useUnitStride) || (useCommonStride))
{
T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
+ lastLength;
#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
/*
* The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
* be taken out. This was put in place while the unit stride/
* common stride optimizations were being implemented and
* tested.
*/
// Calculate the end of the innermost loop
int ubound = lastLength * commonStride;
/*
* This is a real kludge. I didn't want to have to write
* a const and non-const version of FastArrayIterator, so I use a
* const iterator and cast away const. This could
* probably be avoided with some trick, but the whole routine
* is ugly, so why bother.
*/
T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
/*
* BZ_NEEDS_WORK-- need to implement optional unrolling.
*/
if (commonStride == 1)
{
for (int i=0; i < ubound; ++i)
T_update::update(data[i], expr.fastRead(i));
}
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
else {
for (int i=0; i < ubound; i += commonStride)
T_update::update(data[i], expr.fastRead(i));
}
#endif
/*
* Tidy up for the fact that we haven't actually been
* incrementing the iterators in the innermost loop, by
* faking it afterward.
*/
iter.advance(lastLength * commonStride);
expr.advance(lastLength * commonStride);
#else
// !BZ_USE_FAST_READ_ARRAY_EXPR
// This bit of code not really needed; should remove at some
// point, along with the test for BZ_USE_FAST_READ_ARRAY_EXPR
while (iter.data() != end)
{
T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
iter.advance(commonStride);
expr.advance(commonStride);
}
#endif
}
else {
/*
* We don't have a unit stride or common stride in the innermost
* loop. This is going to hurt performance. Luckily 95% of
* the time, we hit the cases above.
*/
T_numtype * _bz_restrict end = const_cast<T_numtype*>(iter.data())
+ lastLength * stride(maxRank);
while (iter.data() != end)
{
T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
iter.advance();
expr.advance();
}
}
/*
* We just finished the innermost loop. Now we pop our way down
* the stack, until we hit a loop that hasn't completed yet.
*/
int j = firstNoncollapsedLoop;
for (; j < N_rank; ++j)
{
// Get the next loop
int r = ordering(j);
// Pop-- this restores the data pointers to the first element
// encountered in the loop.
iter.pop(j);
expr.pop(j);
// Load the stride associated with this loop, and increment
// once.
iter.loadStride(r);
expr.loadStride(r);
iter.advance();
expr.advance();
// If we aren't at the end of this loop, then stop popping.
if (iter.data() != last[j])
break;
}
// Are we completely done?
if (j == N_rank)
break;
// No, so push all the inner loops back onto the stack.
for (; j >= firstNoncollapsedLoop; --j)
{
int r2 = ordering(j-1);
iter.push(j);
expr.push(j);
last[j-1] = iter.data() + length(r2) * stride(r2);
}
// Load the stride for the innermost loop again.
iter.loadStride(maxRank);
expr.loadStride(maxRank);
}
return *this;
}
template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithIndexTraversal1(
T_expr expr, T_update)
{
TinyVector<int,N_rank> index;
if (stride(firstRank) == 1)
{
T_numtype * _bz_restrict iter = data_;
int last = ubound(firstRank);
for (index[0] = lbound(firstRank); index[0] <= last;
++index[0])
{
iter[index[0]] = T_numtype(expr(index));
}
}
else {
FastArrayIterator<T_numtype, N_rank> iter(*this);
iter.loadStride(0);
int last = ubound(firstRank);
for (index[0] = lbound(firstRank); index[0] <= last;
++index[0])
{
T_update::update(*const_cast<T_numtype*>(iter.data()),
expr(index));
iter.advance();
}
}
return *this;
}
template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithIndexTraversalN(
T_expr expr, T_update)
{
// Do a stack-type traversal for the destination array and use
// index traversal for the source expression
const int maxRank = ordering(0);
const int secondLastRank = ordering(1);
#ifdef BZ_DEBUG_TRAVERSE
cout << "Index traversal: N_rank = " << N_rank << endl;
cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
<< endl;
cout.flush();
#endif
FastArrayIterator<T_numtype, N_rank> iter(*this);
for (int i=1; i < N_rank; ++i)
iter.push(ordering(i));
iter.loadStride(maxRank);
TinyVector<int,N_rank> index, last;
index = storage_.base();
last = storage_.base() + length_;
int lastLength = length(maxRank);
while (true) {
for (index[maxRank] = base(maxRank);
index[maxRank] < last[maxRank];
++index[maxRank])
{
#ifdef BZ_DEBUG_TRAVERSE
#if 0
cout << "(" << index[0] << "," << index[1] << ") " << endl;
cout.flush();
#endif
#endif
T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
iter.advance();
}
int j = 1;
for (; j < N_rank; ++j)
{
iter.pop(ordering(j));
iter.loadStride(ordering(j));
iter.advance();
index[ordering(j-1)] = base(ordering(j-1));
++index[ordering(j)];
if (index[ordering(j)] != last[ordering(j)])
break;
}
if (j == N_rank)
break;
for (; j > 0; --j)
{
iter.push(ordering(j));
}
iter.loadStride(maxRank);
}
return *this;
}
// Fast traversals require <set> from the ISO/ANSI C++ standard library
#ifdef BZ_HAVE_STD
template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithFastTraversal(
const TraversalOrder<N_rank - 1>& order,
T_expr expr,
T_update)
{
const int maxRank = ordering(0);
const int secondLastRank = ordering(1);
#ifdef BZ_DEBUG_TRAVERSE
cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
<< endl;
#endif
FastArrayIterator<T_numtype, N_rank> iter(*this);
iter.push(0);
expr.push(0);
_bz_bool useUnitStride = iter.isUnitStride(maxRank)
&& expr.isUnitStride(maxRank);
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
int commonStride = expr.suggestStride(maxRank);
if (iter.suggestStride(maxRank) > commonStride)
commonStride = iter.suggestStride(maxRank);
bool useCommonStride = iter.isStride(maxRank,commonStride)
&& expr.isStride(maxRank,commonStride);
#else
int commonStride = 1;
bool useCommonStride = _bz_false;
#endif
int lastLength = length(maxRank);
for (int i=0; i < order.length(); ++i)
{
iter.pop(0);
expr.pop(0);
#ifdef BZ_DEBUG_TRAVERSE
cerr << "Traversing: " << order[i] << endl;
#endif
// Position the iterator at the start of the next column
for (int j=1; j < N_rank; ++j)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -