📄 blas.hpp
字号:
#include <algorithm>using namespace std;namespace ebl {#if USING_STL_ITERS == 0template<class T> void idx_clear(Idx<T> &inp) { IdxIter<T> pinp; idx_aloop1_on(pinp,inp) { *pinp = 0; }}template<class T> void idx_fill(Idx<T> &inp, T v) { IdxIter<T> pinp; idx_aloop1_on(pinp,inp) { *pinp = v; }}template<class T> void idx_minus(Idx<T> &inp, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = - *pinp; }}template<class T> void idx_inv(Idx<T> &inp, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = 1 / *pinp; }}template<class T> void idx_sub(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { IdxIter<T> pi1, pi2; IdxIter<T> pout; idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = *pi1 - *pi2; }}template<class T> void idx_add(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { IdxIter<T> pi1, pi2; IdxIter<T> pout; idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = *pi1 + *pi2; }}template<class T> void idx_mul(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { IdxIter<T> pi1, pi2; IdxIter<T> pout; idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = (*pi1) * (*pi2); }}template<class T> void idx_addc(Idx<T> &inp, T c, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = *pinp + c; }}template<class T> void idx_addcacc(Idx<T> &inp, T c, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout += *pinp + c; }}template<class T> void idx_dotc(Idx<T> &inp, T c, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = *pinp * c; }}template<class T> void idx_dotcacc(Idx<T> &inp, T c, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout += *pinp * c; }}template<class T> void idx_signdotc(Idx<T> &inp, T c, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = (*pinp<0)?-c:c; }}template<class T> void idx_signdotcacc(Idx<T> &inp, T c, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout += (*pinp<0)?-c:c; }}template<class T> void idx_subsquare(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { IdxIter<T> pi1; IdxIter<T> pi2; IdxIter<T> pout; idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { T d = *pi1 - *pi2; *pout += d*d; }}// not very efficient. There must be a more parallel way of doing thistemplate<class T> void idx_lincomb(Idx<T> &i1, T k1, Idx<T> &i2, T k2, Idx<T> &out) { IdxIter<T> pi1; IdxIter<T> pi2; IdxIter<T> pout; idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = k1*(*pi1) + k2*(*pi2); }}template<class T> void idx_tanh(Idx<T> &inp, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(tanh((double)*pinp)); }}template<class T> void idx_dtanh(Idx<T> &inp, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(dtanh((double)*pinp)); }}template<class T> void idx_stdsigmoid(Idx<T> &inp, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(stdsigmoid((double)*pinp)); }}template<class T> void idx_dstdsigmoid(Idx<T> &inp, Idx<T> &out) { IdxIter<T> pinp; IdxIter<T> pout; idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(dstdsigmoid((double)*pinp)); }}template<class T> void idx_abs(Idx<T>& inp, Idx<T>& out) { IdxIter<T> pinp(inp); IdxIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(fabs((double)*pinp)); }}// there is a much faster and parallel way// of doing this using a tree.template<class T> T idx_sum(Idx<T> &inp, T *out) { T z = 0; IdxIter<T> pinp; idx_aloop1_on(pinp,inp) { z += *pinp; } if (out != NULL) { *out += z; return *out; } return z;}// there is a much faster and parallel way// of doing this using a tree.template<class T> T idx_sumsqr(Idx<T> &inp) { T z = 0; IdxIter<T> pinp; idx_aloop1_on(pinp,inp) { z += (*pinp)*(*pinp); } return z;}////////////////////////////////////////////////////////////////////////#else// generic copy for two different types.//template<class T1, class T2> void idx_copy(Idx<T1> &src, Idx<T2> &dst) {//// IdxIter<T1> isrc;//// IdxIter<T2> idst;//// idx_aloop2_on(isrc, src, idst, dst) { *idst = (T2)(*isrc); }// {idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }}//}//// generic copy for two different types.//template<class T1, class T2> void idx_copy(Idx<T1> &src, Idx<T2> &dst) {//// IdxIter<T1> isrc;//// IdxIter<T2> idst;//// idx_aloop2_on(isrc, src, idst, dst) { *idst = (T2)(*isrc); }// {idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }}//}template<class T1, class T2> void idx_copy(Idx<T1> &src, Idx<T2> &dst){ { idx_aloop2(isrc, src, T1, idst, dst, T2) { *idst = (T2)(*isrc); }}}// generic copy for the same type.template<class T> void idx_copy(Idx<T> &src, Idx<T> &dst) { intg N1=src.nelements(); intg N2 =dst.nelements(); if (N1 != N2) { ylerror("idx_op: Idxs have different number of elements\n"); } if ( (src.order() == 0) && (dst.order() == 0) ) { *(dst.idx_ptr()) = *(src.idx_ptr()); } else if ( src.contiguousp() && dst.contiguousp() ) { /* they are both contiguous: call the stride 1 routine */ memcpy(dst.idx_ptr(), src.idx_ptr(), N1 * sizeof(T)); } else { /* else, they don't have the same structure: do it "by hand". This is slower */ {idx_aloop2(isrc, src, T, idst, dst, T) { *idst = *isrc; }} }}template<class T> void idx_clear(Idx<T> &inp) { ScalarIter<T> pinp(inp); idx_aloop1_on(pinp,inp) { *pinp = 0; }}template<class T> void idx_fill(Idx<T> &inp, T v) { ScalarIter<T> pinp(inp); idx_aloop1_on(pinp,inp) { *pinp = v; }}template<class T> void idx_minus(Idx<T> &inp, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = - *pinp; }}template<class T> void idx_inv(Idx<T> &inp, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = 1 / *pinp; }}template<class T> void idx_sub(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { ScalarIter<T> pi1(i1), pi2(i2); ScalarIter<T> pout(out); idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = *pi1 - *pi2; }}template<class T> void idx_add(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { ScalarIter<T> pi1(i1), pi2(i2); ScalarIter<T> pout(out); idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = *pi1 + *pi2; }}template<class T> void idx_mul(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { ScalarIter<T> pi1(i1), pi2(i2); ScalarIter<T> pout(out); idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = (*pi1) * (*pi2); }}template<class T> void idx_addc(Idx<T> &inp, T c, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = *pinp + c; }}template<class T> void idx_addcacc(Idx<T> &inp, T c, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout += *pinp + c; }}template<class T> void idx_dotc(Idx<T> &inp, T c, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = *pinp * c; }}template<class T> void idx_dotcacc(Idx<T> &inp, T c, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout += *pinp * c; }}template<class T> void idx_signdotc(Idx<T> &inp, T c, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (*pinp<0)?-c:c; }}template<class T> void idx_signdotcacc(Idx<T> &inp, T c, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout += (*pinp<0)?-c:c; }}template<class T> void idx_subsquare(Idx<T> &i1, Idx<T> &i2, Idx<T> &out) { ScalarIter<T> pi1; ScalarIter<T> pi2; ScalarIter<T> pout(out); idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { T d = *pi1 - *pi2; *pout += d*d; }}// not very efficient. There must be a more parallel way of doing thistemplate<class T> void idx_lincomb(Idx<T> &i1, T k1, Idx<T> &i2, T k2, Idx<T> &out) { ScalarIter<T> pi1(i1); ScalarIter<T> pi2(i2); ScalarIter<T> pout(out); idx_aloop3_on(pi1,i1,pi2,i2,pout,out) { *pout = k1*(*pi1) + k2*(*pi2); }}template<class T> void idx_tanh(Idx<T> &inp, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(tanh((double)*pinp)); }}template<class T> void idx_dtanh(Idx<T> &inp, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(dtanh((double)*pinp)); }}template<class T> void idx_stdsigmoid(Idx<T> &inp, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(stdsigmoid((double)*pinp)); }}template<class T> void idx_dstdsigmoid(Idx<T> &inp, Idx<T> &out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(dstdsigmoid((double)*pinp)); }}template<class T> void idx_abs(Idx<T>& inp, Idx<T>& out) { ScalarIter<T> pinp(inp); ScalarIter<T> pout(out); idx_aloop2_on(pinp,inp,pout,out) { *pout = (T)(fabs((double)*pinp)); }}// there is a much faster and parallel way// of doing this using a tree.template<class T> T idx_sum(Idx<T> &inp, T *out) { T z = 0; ScalarIter<T> pinp(inp); idx_aloop1_on(pinp,inp) { z += *pinp; } if (out != NULL) { *out += z; return *out; } return z;}// there is a much faster and parallel way// of doing this using a tree.template<class T> T idx_sumsqr(Idx<T> &inp) { T z = 0; ScalarIter<T> pinp(inp); idx_aloop1_on(pinp,inp) { z += (*pinp)*(*pinp); } return z;}#endif////////////////////////////////////////////////////////////////////////// Functions without iteratorstemplate<class T> T idx_sumacc(Idx<T> &inp, Idx<T> &acc) { // acc must be of order 0. if (acc.order() != 0) ylerror("expecting an Idx0 as output"); return idx_sum(inp, acc.ptr());}template<class T> void rev_idx2 (Idx<T> &m) { if (m.order() != 2) ylerror("Expecting Idx of order 2"); T tmp, *p = m.ptr(); intg size = m.dim(0) * m.dim(1); intg i; for (i = 0; i < size/2; i++) { tmp = p[i]; p[i] = p[size-i-1]; p[size-i-1] = tmp; }}template<class T> void rev_idx2_tr (Idx<T> &m, Idx<T> &n) { if ((m.order() != 2) || (n.order() != 2)) ylerror("Expecting Idx of order 2"); T *p = m.ptr(); T *q = n.ptr(); intg size = m.dim(0) * m.dim(1); intg i; for (i=0; i < size; i++) { q[i] = p[size-i-1]; }}// TODO-0 write specialized blas version in cpptemplate<class T> void idx_m4dotm2(Idx<T> &i1, Idx<T> &i2, Idx<T> &o1) { idx_checkorder3(i1, 4, i2, 2, o1, 2); T *c1, *c1_2; T *c2, *c2_0; T *c1_0, *c1_1; T *ker; intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0); intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1); intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1); T *d1_0, *d1, f; intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0); intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1); intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); c1_0 = i1.idx_ptr(); ker = i2.idx_ptr(); d1_0 = o1.idx_ptr(); for (i=0; i<imax; i++) { c1_1 = c1_0; d1 = d1_0; for (j=0; j<jmax; j++) { f = 0; c1_2 = c1_1; c2_0 = ker; for (k=0; k<kmax; k++) { c1 = c1_2; c2 = c2_0; for (l=0; l<lmax; l++) { f += (*c1)*(*c2); c1 += c1_m3; c2 += c2_m1; } c1_2 += c1_m2; c2_0 += c2_m0; } *d1 = f; d1 += d1_m1; c1_1 += c1_m1; } d1_0 += d1_m0; c1_0 += c1_m0; }}// TODO-0 write specialized blas version in cpptemplate<class T> void idx_m4dotm2acc(Idx<T> &i1, Idx<T> &i2, Idx<T> &o1) { idx_checkorder3(i1, 4, i2, 2, o1, 2); T *c1, *c1_2; T *c2, *c2_0; T *c1_0, *c1_1; T *ker; intg c1_m2 = (i1).mod(2), c2_m0 = (i2).mod(0); intg c1_m3 = (i1).mod(3), c2_m1 = (i2).mod(1); intg k,l, kmax = (i2).dim(0), lmax = (i2).dim(1); T *d1_0, *d1, f; intg c1_m0 = (i1).mod(0), d1_m0 = (o1).mod(0); intg c1_m1 = (i1).mod(1), d1_m1 = (o1).mod(1); intg i,j, imax = (o1).dim(0), jmax = (o1).dim(1); c1_0 = i1.idx_ptr();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -