📄 cxmatmul.cpp
字号:
tmat = cvMat( d_size.height, d_size.width, type, buffer );
D = &tmat;
}
if( (d_size.width == 1 || len == 1) && !(flags & CV_GEMM_B_T) && CV_IS_MAT_CONT(B->type) )
{
b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
flags |= CV_GEMM_B_T;
}
if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
{
blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
}
if( blas_func )
{
const char* transa = flags & CV_GEMM_A_T ? "t" : "n";
const char* transb = flags & CV_GEMM_B_T ? "t" : "n";
int lda, ldb, ldd;
if( C->data.ptr )
{
if( C->data.ptr != D->data.ptr )
{
if( !(flags & CV_GEMM_C_T) )
cvCopy( C, D );
else
cvTranspose( C, D );
}
}
if( CV_MAT_DEPTH(type) == CV_32F )
{
CvComplex32f _alpha, _beta;
lda = A->step/sizeof(float);
ldb = b_step/sizeof(float);
ldd = D->step/sizeof(float);
_alpha.re = (float)alpha;
_alpha.im = 0;
_beta.re = C->data.ptr ? (float)beta : 0;
_beta.im = 0;
if( CV_MAT_CN(type) == 2 )
lda /= 2, ldb /= 2, ldd /= 2;
blas_func( transb, transa, &d_size.width, &d_size.height, &len,
&_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
&_beta, D->data.ptr, &ldd );
}
else
{
CvComplex64f _alpha, _beta;
lda = A->step/sizeof(double);
ldb = b_step/sizeof(double);
ldd = D->step/sizeof(double);
_alpha.re = alpha;
_alpha.im = 0;
_beta.re = C->data.ptr ? beta : 0;
_beta.im = 0;
if( CV_MAT_CN(type) == 2 )
lda /= 2, ldb /= 2, ldd /= 2;
blas_func( transb, transa, &d_size.width, &d_size.height, &len,
&_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
&_beta, D->data.ptr, &ldd );
}
}
else if( d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2 || len <= 10 ||
d_size.width <= block_lin_size && d_size.height <= block_lin_size && len <= block_lin_size )
{
single_mul_func( A->data.ptr, A->step, B->data.ptr, b_step,
C->data.ptr, C->step, D->data.ptr, D->step,
a_size, d_size, alpha, beta, flags );
}
else
{
int is_a_t = flags & CV_GEMM_A_T;
int is_b_t = flags & CV_GEMM_B_T;
int elem_size = CV_ELEM_SIZE(type);
int dk0_1, dk0_2;
int a_buf_size = 0, b_buf_size, d_buf_size;
uchar* a_buf = 0;
uchar* b_buf = 0;
uchar* d_buf = 0;
int i, j, k, di = 0, dj = 0, dk = 0;
int dm0, dn0, dk0;
int a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
CvGEMMBlockMulFunc block_mul_func = (CvGEMMBlockMulFunc)block_mul_tab.fn_2d[type];
CvGEMMStoreFunc store_func = (CvGEMMStoreFunc)store_tab.fn_2d[type];
assert( block_mul_func && store_func );
if( !is_a_t )
a_step0 = A->step, a_step1 = elem_size;
else
a_step0 = elem_size, a_step1 = A->step;
if( !is_b_t )
b_step0 = b_step, b_step1 = elem_size;
else
b_step0 = elem_size, b_step1 = b_step;
if( !C->data.ptr )
{
c_step0 = c_step1 = 0;
flags &= ~CV_GEMM_C_T;
}
else if( !(flags & CV_GEMM_C_T) )
c_step0 = C->step, c_step1 = elem_size;
else
c_step0 = elem_size, c_step1 = C->step;
dm0 = MIN( block_lin_size, d_size.height );
dn0 = MIN( block_lin_size, d_size.width );
dk0_1 = block_size / dm0;
dk0_2 = block_size / dn0;
dk0 = MAX( dk0_1, dk0_2 );
dk0 = MIN( dk0, len );
if( dk0*dm0 > block_size )
dm0 = block_size / dk0;
if( dk0*dn0 > block_size )
dn0 = block_size / dk0;
dk0_1 = (dn0+dn0/8+2) & -2;
b_buf_size = (dk0+dk0/8+1)*dk0_1*elem_size;
d_buf_size = (dk0+dk0/8+1)*dk0_1*work_elem_size;
if( is_a_t )
{
a_buf_size = (dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
flags &= ~CV_GEMM_A_T;
}
CV_CALL( block_buffer = (uchar*)cvAlloc(a_buf_size + b_buf_size + d_buf_size));
d_buf = block_buffer;
b_buf = d_buf + d_buf_size;
if( is_a_t )
a_buf = b_buf + b_buf_size;
for( i = 0; i < d_size.height; i += di )
{
di = dm0;
if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
di = d_size.height - i;
for( j = 0; j < d_size.width; j += dj )
{
uchar* _d = D->data.ptr + i*D->step + j*elem_size;
const uchar* _c = C->data.ptr + i*c_step0 + j*c_step1;
int _d_step = D->step;
dj = dn0;
if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
dj = d_size.width - j;
flags &= 15;
if( dk0 < len )
{
_d = d_buf;
_d_step = dj*work_elem_size;
}
for( k = 0; k < len; k += dk )
{
const uchar* _a = A->data.ptr + i*a_step0 + k*a_step1;
int _a_step = A->step;
const uchar* _b = B->data.ptr + k*b_step0 + j*b_step1;
int _b_step = b_step;
CvSize a_bl_size;
dk = dk0;
if( k + dk >= len || 8*(k + dk) + dk > 8*len )
dk = len - k;
if( !is_a_t )
a_bl_size.width = dk, a_bl_size.height = di;
else
a_bl_size.width = di, a_bl_size.height = dk;
if( a_buf && is_a_t )
{
int t;
_a_step = dk*elem_size;
icvGEMM_TransposeBlock( _a, A->step, a_buf, _a_step, a_bl_size, elem_size );
CV_SWAP( a_bl_size.width, a_bl_size.height, t );
_a = a_buf;
}
if( dj < d_size.width )
{
CvSize b_size;
if( !is_b_t )
b_size.width = dj, b_size.height = dk;
else
b_size.width = dk, b_size.height = dj;
_b_step = b_size.width*elem_size;
icvGEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
_b = b_buf;
}
if( dk0 < len )
block_mul_func( _a, _a_step, _b, _b_step, _d, _d_step,
a_bl_size, cvSize(dj,di), flags );
else
single_mul_func( _a, _a_step, _b, _b_step, _c, C->step, _d, _d_step,
a_bl_size, cvSize(dj,di), alpha, beta, flags );
flags |= 16;
}
if( dk0 < len )
store_func( _c, C->step, _d, _d_step, D->data.ptr + i*D->step + j*elem_size,
D->step, cvSize(dj,di), alpha, beta, flags );
}
}
}
if( D0 != D )
CV_CALL( cvCopy( D, D0 ));
}
__END__;
if( buffer && !local_alloc )
cvFree( &buffer );
if( block_buffer )
cvFree( &block_buffer );
}
/****************************************************************************************\
* cvTransform *
\****************************************************************************************/
#define ICV_DEF_TRANSFORM_CASE_C1( arrtype, temptype, _ld_, \
_cast_macro1_, _cast_macro2_ ) \
{ \
for( i = 0; i < size.width; i++, dst += dst_cn ) \
{ \
const double* _mat = mat; \
double v0 = _ld_(src[i]); \
for( k = 0; k < dst_cn; k++, _mat += 2 ) \
{ \
temptype t0 = _cast_macro1_(_mat[0]*v0 + _mat[1]); \
dst[k] = _cast_macro2_(t0); \
} \
} \
src += size.width;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -