📄 eval.cc
字号:
/*
* $Id: arrayeval.cc,v 1.2 1998/03/14 00:04:47 tveldhui Exp $
*
* $Log: arrayeval.cc,v $
* Revision 1.2 1998/03/14 00:04:47 tveldhui
* 0.2-alpha-05
*
* Revision 1.1 1998/02/25 20:04:01 tveldhui
* Initial revision
*
*/
#ifndef BZ_ARRAYEVAL_CC
#define BZ_ARRAYEVAL_CC
#ifndef BZ_ARRAY_H
#error <blitz/array/eval.cc> must be included via <blitz/array.h>
#endif
BZ_NAMESPACE(blitz)
/*
* Assign an expression to an array. For performance reasons, there are
* several traversal mechanisms:
*
* - Index traversal scans through the destination array in storage order.
* The expression is evaluated using a TinyVector<int,N> operand. This
* version is used only when there are index placeholders in the expression
* (see <blitz/indexexpr.h>)
* - Stack traversal also scans through the destination array in storage
* order. However, push/pop stack iterators are used.
* - Fast traversal follows a Hilbert (or other) space-filling curve to
* improve cache reuse for stencilling operations. Currently, the
* space filling curves must be generated by calling
* generateFastTraversalOrder(TinyVector<int,N_dimensions>).
* - 2D tiled traversal follows a tiled traversal, to improve cache reuse
* for 2D stencils. Space filling curves have too much overhead to use
* in two-dimensions.
*
* _bz_tryFastTraversal is a helper class. Fast traversals are only
* attempted if the expression looks like a stencil -- it's at least
* three-dimensional, has at least six array operands, and there are
* no index placeholders in the expression. These are all things which
* can be checked at compile time, so the if()/else() syntax has been
* replaced with this class template.
*/
// Fast traversals require <set> from the ISO/ANSI C++ standard library
#ifdef BZ_HAVE_STD
template<_bz_bool canTryFastTraversal>
struct _bz_tryFastTraversal {
template<class T_numtype, int N_rank, class T_expr, class T_update>
static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
BZ_ETPARM(T_expr) expr, T_update)
{
return _bz_false;
}
};
template<>
struct _bz_tryFastTraversal<_bz_true> {
template<class T_numtype, int N_rank, class T_expr, class T_update>
static _bz_bool tryFast(Array<T_numtype,N_rank>& array,
BZ_ETPARM(T_expr) expr, T_update)
{
// See if there's an appropriate space filling curve available.
// Currently fast traversals use an N-1 dimensional curve. The
// Nth dimension column corresponding to each point on the curve
// is traversed in the normal fashion.
TraversalOrderCollection<N_rank-1> traversals;
TinyVector<int, N_rank - 1> traversalGridSize;
for (int i=0; i < N_rank - 1; ++i)
traversalGridSize[i] = array.length(array.ordering(i+1));
#ifdef BZ_DEBUG_TRAVERSE
cout << "traversalGridSize = " << traversalGridSize << endl;
cout.flush();
#endif
const TraversalOrder<N_rank-1>* order =
traversals.find(traversalGridSize);
if (order)
{
#ifdef BZ_DEBUG_TRAVERSE
cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
<< ", " << N_rank << ">: Using stack traversal" << endl;
#endif
// A curve was available -- use fast traversal.
array.evaluateWithFastTraversal(*order, expr, T_update());
return _bz_true;
}
return _bz_false;
}
};
#endif // BZ_HAVE_STD
template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluate(T_expr expr,
T_update)
{
// Check that all arrays have the same shape
#ifdef BZ_DEBUG
if (!expr.shapeCheck(shape()))
{
if (assertFailMode == _bz_false)
{
cerr << "[Blitz++] Shape check failed: Module " << __FILE__
<< " line " << __LINE__ << endl
<< " Expression: ";
prettyPrintFormat format(_bz_true); // Use terse formatting
string str;
expr.prettyPrint(str, format);
cerr << str << endl ;
}
#if 0
// Shape dumping is broken by change to using string for prettyPrint
<< " Shapes: " << shape() << " = ";
prettyPrintFormat format2;
format2.setDumpArrayShapesMode();
expr.prettyPrint(cerr, format2);
cerr << endl;
#endif
BZ_PRE_FAIL;
}
#endif
BZPRECHECK(expr.shapeCheck(shape()),
"Shape check failed." << endl << "Expression:");
BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0),
"Assigned rank " << T_expr::rank << " expression to rank "
<< N_rank << " array.");
/*
* Check that the arrays are not empty (e.g. length 0 arrays)
* This fixes a bug found by Peter Bienstman, 6/16/99, where
* Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i);
* went into an infinite loop.
*/
if (numElements() == 0)
return *this;
#ifdef BZ_DEBUG_TRAVERSE
cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders
<< endl; cout.flush();
#endif
// Tau profiling code. Provide Tau with a pretty-printed version of
// the expression.
// NEEDS_WORK-- use a static initializer somehow.
#ifdef BZ_TAU_PROFILING
static string exprDescription;
if (!exprDescription.length()) // faked static initializer
{
exprDescription = "A";
prettyPrintFormat format(_bz_true); // Terse mode on
format.nextArrayOperandSymbol();
T_update::prettyPrint(exprDescription);
expr.prettyPrint(exprDescription, format);
}
TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
#endif
// Determine which evaluation mechanism to use
if (T_expr::numIndexPlaceholders > 0)
{
// The expression involves index placeholders, so have to
// use index traversal rather than stack traversal.
if (N_rank == 1)
return evaluateWithIndexTraversal1(expr, T_update());
else
return evaluateWithIndexTraversalN(expr, T_update());
}
else {
// If this expression looks like an array stencil, then attempt to
// use a fast traversal order.
// Fast traversals require <set> from the ISO/ANSI C++ standard
// library.
#ifdef BZ_HAVE_STD
enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
&& (T_expr::numIndexPlaceholders == 0) };
if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update()))
return *this;
#endif
#ifdef BZ_ARRAY_2D_STENCIL_TILING
// Does this look like a 2-dimensional stencil on a largeish
// array?
if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
{
// Use a heuristic to determine whether a tiled traversal
// is desirable. First, estimate how much L1 cache is needed
// to achieve a high hit rate using the stack traversal.
// Try to err on the side of using tiled traversal even when
// it isn't strictly needed.
// Assumptions:
// Stencil width 3
// 3 arrays involved in stencil
// Uniform data type in arrays (all T_numtype)
int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0));
if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE)
return evaluateWithTiled2DTraversal(expr, T_update());
}
#endif
// If fast traversal isn't available or appropriate, then just
// do a stack traversal.
if (N_rank == 1)
return evaluateWithStackTraversal1(expr, T_update());
else
return evaluateWithStackTraversalN(expr, T_update());
}
}
template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithStackTraversal1(
T_expr expr, T_update)
{
#ifdef BZ_DEBUG_TRAVERSE
BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
<< ", " << N_rank << ">: Using stack traversal");
#endif
FastArrayIterator<T_numtype, N_rank> iter(*this);
iter.loadStride(firstRank);
expr.loadStride(firstRank);
_bz_bool useUnitStride = iter.isUnitStride(firstRank)
&& expr.isUnitStride(firstRank);
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
int commonStride = expr.suggestStride(firstRank);
if (iter.suggestStride(firstRank) > commonStride)
commonStride = iter.suggestStride(firstRank);
bool useCommonStride = iter.isStride(firstRank,commonStride)
&& expr.isStride(firstRank,commonStride);
#ifdef BZ_DEBUG_TRAVERSE
BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl
<< " commonStride = " << commonStride << " useCommonStride = "
<< useCommonStride);
#endif
#else
int commonStride = 1;
bool useCommonStride = _bz_false;
#endif
const T_numtype * last = iter.data() + length(firstRank)
* stride(firstRank);
if (useUnitStride || useCommonStride)
{
#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
#ifdef BZ_DEBUG_TRAVERSE
BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");
#endif
int ubound = length(firstRank) * commonStride;
T_numtype* _bz_restrict data = const_cast<T_numtype*>(iter.data());
if (commonStride == 1)
{
#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
for (int i=0; i < ubound; ++i)
T_update::update(data[i], expr.fastRead(i));
#else
int n1 = ubound & 3;
int i = 0;
for (; i < n1; ++i)
T_update::update(data[i], expr.fastRead(i));
for (; i < ubound; i += 4)
{
#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS
T_update::update(data[i], expr.fastRead(i));
T_update::update(data[i+1], expr.fastRead(i+1));
T_update::update(data[i+2], expr.fastRead(i+2));
T_update::update(data[i+3], expr.fastRead(i+3));
#else
int t1 = i+1;
int t2 = i+2;
int t3 = i+3;
_bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4;
tmp1 = expr.fastRead(i);
tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1));
tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2));
tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3));
T_update::update(data[i], BZ_NO_PROPAGATE(tmp1));
T_update::update(data[BZ_NO_PROPAGATE(t1)], tmp2);
T_update::update(data[BZ_NO_PROPAGATE(t2)], tmp3);
T_update::update(data[BZ_NO_PROPAGATE(t3)], tmp4);
#endif
}
#endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
}
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
else {
#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
for (int i=0; i < ubound; i += commonStride)
T_update::update(data[i], expr.fastRead(i));
#else
int n1 = (length(firstRank) & 3) * commonStride;
int i = 0;
for (; i < n1; i += commonStride)
T_update::update(data[i], expr.fastRead(i));
int strideInc = 4 * commonStride;
for (; i < ubound; i += strideInc)
{
T_update::update(data[i], expr.fastRead(i));
int i2 = i + commonStride;
T_update::update(data[i2], expr.fastRead(i2));
int i3 = i + 2 * commonStride;
T_update::update(data[i3], expr.fastRead(i3));
int i4 = i + 3 * commonStride;
T_update::update(data[i4], expr.fastRead(i4));
}
#endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
}
#endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
#else // ! BZ_USE_FAST_READ_ARRAY_EXPR
#ifdef BZ_DEBUG_TRAVERSE
BZ_DEBUG_MESSAGE("Common stride, no fast read");
#endif
while (iter.data() != last)
{
T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
iter.advance(commonStride);
expr.advance(commonStride);
}
#endif
}
else {
while (iter.data() != last)
{
T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
iter.advance();
expr.advance();
}
}
return *this;
}
template<class T_numtype, int N_rank> template<class T_expr, class T_update>
inline Array<T_numtype, N_rank>&
Array<T_numtype, N_rank>::evaluateWithStackTraversalN(
T_expr expr, T_update)
{
/*
* A stack traversal replaces the usual nested loops:
*
* for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
* for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
* for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
* A(i,j,k) = 0;
*
* with a stack data structure. The stack allows this single
* routine to replace any number of nested loops.
*
* For each dimension (loop), these quantities are needed:
* - a pointer to the first element encountered in the loop
* - the stride associated with the dimension/loop
* - a pointer to the last element encountered in the loop
*
* The basic idea is that entering each loop is a "push" onto the
* stack, and exiting each loop is a "pop". In practice, this
* routine treats accesses the stack in a random-access way,
* which confuses the picture a bit. But conceptually, that's
* what is going on.
*/
/*
* ordering(0) gives the dimension associated with the smallest
* stride (usually; the exceptions have to do with subarrays and
* are uninteresting). We call this dimension maxRank; it will
* become the innermost "loop".
*
* Ordering the loops from ordering(N_rank-1) down to
* ordering(0) ensures that the largest stride is associated
* with the outermost loop, and the smallest stride with the
* innermost. This is critical for good performance on
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -