📄 strmm.cu
字号:
fast_strmm_l_lo_nt_main_sw, fast_strmm_l_up_nt_main_sw, fast_strmm_r_lo_tr_main_unit_sw, fast_strmm_r_up_tr_main_unit_sw, fast_strmm_r_lo_nt_main_unit_sw, fast_strmm_r_up_nt_main_unit_sw, fast_strmm_l_lo_tr_main_unit_sw, fast_strmm_l_up_tr_main_unit_sw, fast_strmm_l_lo_nt_main_unit_sw, fast_strmm_l_up_nt_main_unit_sw, fast_strmm_r_lo_tr_main_alpha0_sw, fast_strmm_r_up_tr_main_alpha0_sw, fast_strmm_r_lo_nt_main_alpha0_sw, fast_strmm_r_up_nt_main_alpha0_sw, fast_strmm_l_lo_tr_main_alpha0_sw, fast_strmm_l_up_tr_main_alpha0_sw, fast_strmm_l_lo_nt_main_alpha0_sw, fast_strmm_l_up_nt_main_alpha0_sw, fast_strmm_r_lo_tr_main_unit_alpha0_sw, fast_strmm_r_up_tr_main_unit_alpha0_sw, fast_strmm_r_lo_nt_main_unit_alpha0_sw, fast_strmm_r_up_nt_main_unit_alpha0_sw, fast_strmm_l_lo_tr_main_unit_alpha0_sw, fast_strmm_l_up_tr_main_unit_alpha0_sw, fast_strmm_l_lo_nt_main_unit_alpha0_sw, fast_strmm_l_up_nt_main_unit_alpha0_sw, strmm_r_lo_tr_main_fulltile_sw, strmm_r_up_tr_main_fulltile_sw, strmm_r_lo_nt_main_fulltile_sw, strmm_r_up_nt_main_fulltile_sw, strmm_l_lo_tr_main_fulltile_sw, strmm_l_up_tr_main_fulltile_sw, strmm_l_lo_nt_main_fulltile_sw, strmm_l_up_nt_main_fulltile_sw, strmm_r_lo_tr_main_unit_fulltile_sw, strmm_r_up_tr_main_unit_fulltile_sw, strmm_r_lo_nt_main_unit_fulltile_sw, strmm_r_up_nt_main_unit_fulltile_sw, strmm_l_lo_tr_main_unit_fulltile_sw, strmm_l_up_tr_main_unit_fulltile_sw, strmm_l_lo_nt_main_unit_fulltile_sw, strmm_l_up_nt_main_unit_fulltile_sw, strmm_r_lo_tr_main_alpha0_fulltile_sw, strmm_r_up_tr_main_alpha0_fulltile_sw, strmm_r_lo_nt_main_alpha0_fulltile_sw, strmm_r_up_nt_main_alpha0_fulltile_sw, strmm_l_lo_tr_main_alpha0_fulltile_sw, strmm_l_up_tr_main_alpha0_fulltile_sw, strmm_l_lo_nt_main_alpha0_fulltile_sw, strmm_l_up_nt_main_alpha0_fulltile_sw, strmm_r_lo_tr_main_unit_alpha0_fulltile_sw, strmm_r_up_tr_main_unit_alpha0_fulltile_sw, strmm_r_lo_nt_main_unit_alpha0_fulltile_sw, strmm_r_up_nt_main_unit_alpha0_fulltile_sw, 
strmm_l_lo_tr_main_unit_alpha0_fulltile_sw, strmm_l_up_tr_main_unit_alpha0_fulltile_sw,
    strmm_l_lo_nt_main_unit_alpha0_fulltile_sw, strmm_l_up_nt_main_unit_alpha0_fulltile_sw,
    fast_strmm_r_lo_tr_main_fulltile_sw, fast_strmm_r_up_tr_main_fulltile_sw,
    fast_strmm_r_lo_nt_main_fulltile_sw, fast_strmm_r_up_nt_main_fulltile_sw,
    fast_strmm_l_lo_tr_main_fulltile_sw, fast_strmm_l_up_tr_main_fulltile_sw,
    fast_strmm_l_lo_nt_main_fulltile_sw, fast_strmm_l_up_nt_main_fulltile_sw,
    fast_strmm_r_lo_tr_main_unit_fulltile_sw, fast_strmm_r_up_tr_main_unit_fulltile_sw,
    fast_strmm_r_lo_nt_main_unit_fulltile_sw, fast_strmm_r_up_nt_main_unit_fulltile_sw,
    fast_strmm_l_lo_tr_main_unit_fulltile_sw, fast_strmm_l_up_tr_main_unit_fulltile_sw,
    fast_strmm_l_lo_nt_main_unit_fulltile_sw, fast_strmm_l_up_nt_main_unit_fulltile_sw,
    fast_strmm_r_lo_tr_main_alpha0_fulltile_sw, fast_strmm_r_up_tr_main_alpha0_fulltile_sw,
    fast_strmm_r_lo_nt_main_alpha0_fulltile_sw, fast_strmm_r_up_nt_main_alpha0_fulltile_sw,
    fast_strmm_l_lo_tr_main_alpha0_fulltile_sw, fast_strmm_l_up_tr_main_alpha0_fulltile_sw,
    fast_strmm_l_lo_nt_main_alpha0_fulltile_sw, fast_strmm_l_up_nt_main_alpha0_fulltile_sw,
    fast_strmm_r_lo_tr_main_unit_alpha0_fulltile_sw, fast_strmm_r_up_tr_main_unit_alpha0_fulltile_sw,
    fast_strmm_r_lo_nt_main_unit_alpha0_fulltile_sw, fast_strmm_r_up_nt_main_unit_alpha0_fulltile_sw,
    fast_strmm_l_lo_tr_main_unit_alpha0_fulltile_sw, fast_strmm_l_up_tr_main_unit_alpha0_fulltile_sw,
    fast_strmm_l_lo_nt_main_unit_alpha0_fulltile_sw, fast_strmm_l_up_nt_main_unit_alpha0_fulltile_sw};

/*
 * strmm_hw: table of 128 entries of type pf, each initialized with the name
 * of one "_hw" STRMM variant.
 *
 * Entry order, as laid out by the initializer list below:
 *   +1   "up" instead of "lo"
 *   +2   "nt" instead of "tr"
 *   +4   "l"  instead of "r"
 *   +8   "_unit" variants
 *   +16  "_alpha0" variants
 *   +32  "fast_" variants
 *   +64  "_fulltile" variants
 *
 * NOTE(review): presumably indexed by funcIdx in cublasStrmm; the index
 * computation is not visible in this part of the file -- confirm it matches
 * this layout before reordering or inserting entries.
 */
static pf strmm_hw[128] = {
    /* [0..7] main */
    strmm_r_lo_tr_main_hw, strmm_r_up_tr_main_hw,
    strmm_r_lo_nt_main_hw, strmm_r_up_nt_main_hw,
    strmm_l_lo_tr_main_hw, strmm_l_up_tr_main_hw,
    strmm_l_lo_nt_main_hw, strmm_l_up_nt_main_hw,
    /* [8..15] unit */
    strmm_r_lo_tr_main_unit_hw, strmm_r_up_tr_main_unit_hw,
    strmm_r_lo_nt_main_unit_hw, strmm_r_up_nt_main_unit_hw,
    strmm_l_lo_tr_main_unit_hw, strmm_l_up_tr_main_unit_hw,
    strmm_l_lo_nt_main_unit_hw, strmm_l_up_nt_main_unit_hw,
    /* [16..23] alpha0 */
    strmm_r_lo_tr_main_alpha0_hw, strmm_r_up_tr_main_alpha0_hw,
    strmm_r_lo_nt_main_alpha0_hw, strmm_r_up_nt_main_alpha0_hw,
    strmm_l_lo_tr_main_alpha0_hw, strmm_l_up_tr_main_alpha0_hw,
    strmm_l_lo_nt_main_alpha0_hw, strmm_l_up_nt_main_alpha0_hw,
    /* [24..31] unit + alpha0 */
    strmm_r_lo_tr_main_unit_alpha0_hw, strmm_r_up_tr_main_unit_alpha0_hw,
    strmm_r_lo_nt_main_unit_alpha0_hw, strmm_r_up_nt_main_unit_alpha0_hw,
    strmm_l_lo_tr_main_unit_alpha0_hw, strmm_l_up_tr_main_unit_alpha0_hw,
    strmm_l_lo_nt_main_unit_alpha0_hw, strmm_l_up_nt_main_unit_alpha0_hw,
    /* [32..39] fast */
    fast_strmm_r_lo_tr_main_hw, fast_strmm_r_up_tr_main_hw,
    fast_strmm_r_lo_nt_main_hw, fast_strmm_r_up_nt_main_hw,
    fast_strmm_l_lo_tr_main_hw, fast_strmm_l_up_tr_main_hw,
    fast_strmm_l_lo_nt_main_hw, fast_strmm_l_up_nt_main_hw,
    /* [40..47] fast + unit */
    fast_strmm_r_lo_tr_main_unit_hw, fast_strmm_r_up_tr_main_unit_hw,
    fast_strmm_r_lo_nt_main_unit_hw, fast_strmm_r_up_nt_main_unit_hw,
    fast_strmm_l_lo_tr_main_unit_hw, fast_strmm_l_up_tr_main_unit_hw,
    fast_strmm_l_lo_nt_main_unit_hw, fast_strmm_l_up_nt_main_unit_hw,
    /* [48..55] fast + alpha0 */
    fast_strmm_r_lo_tr_main_alpha0_hw, fast_strmm_r_up_tr_main_alpha0_hw,
    fast_strmm_r_lo_nt_main_alpha0_hw, fast_strmm_r_up_nt_main_alpha0_hw,
    fast_strmm_l_lo_tr_main_alpha0_hw, fast_strmm_l_up_tr_main_alpha0_hw,
    fast_strmm_l_lo_nt_main_alpha0_hw, fast_strmm_l_up_nt_main_alpha0_hw,
    /* [56..63] fast + unit + alpha0 */
    fast_strmm_r_lo_tr_main_unit_alpha0_hw, fast_strmm_r_up_tr_main_unit_alpha0_hw,
    fast_strmm_r_lo_nt_main_unit_alpha0_hw, fast_strmm_r_up_nt_main_unit_alpha0_hw,
    fast_strmm_l_lo_tr_main_unit_alpha0_hw, fast_strmm_l_up_tr_main_unit_alpha0_hw,
    fast_strmm_l_lo_nt_main_unit_alpha0_hw, fast_strmm_l_up_nt_main_unit_alpha0_hw,
    /* [64..71] fulltile */
    strmm_r_lo_tr_main_fulltile_hw, strmm_r_up_tr_main_fulltile_hw,
    strmm_r_lo_nt_main_fulltile_hw, strmm_r_up_nt_main_fulltile_hw,
    strmm_l_lo_tr_main_fulltile_hw, strmm_l_up_tr_main_fulltile_hw,
    strmm_l_lo_nt_main_fulltile_hw, strmm_l_up_nt_main_fulltile_hw,
    /* [72..79] unit + fulltile */
    strmm_r_lo_tr_main_unit_fulltile_hw, strmm_r_up_tr_main_unit_fulltile_hw,
    strmm_r_lo_nt_main_unit_fulltile_hw, strmm_r_up_nt_main_unit_fulltile_hw,
    strmm_l_lo_tr_main_unit_fulltile_hw, strmm_l_up_tr_main_unit_fulltile_hw,
    strmm_l_lo_nt_main_unit_fulltile_hw, strmm_l_up_nt_main_unit_fulltile_hw,
    /* [80..87] alpha0 + fulltile */
    strmm_r_lo_tr_main_alpha0_fulltile_hw, strmm_r_up_tr_main_alpha0_fulltile_hw,
    strmm_r_lo_nt_main_alpha0_fulltile_hw, strmm_r_up_nt_main_alpha0_fulltile_hw,
    strmm_l_lo_tr_main_alpha0_fulltile_hw, strmm_l_up_tr_main_alpha0_fulltile_hw,
    strmm_l_lo_nt_main_alpha0_fulltile_hw, strmm_l_up_nt_main_alpha0_fulltile_hw,
    /* [88..95] unit + alpha0 + fulltile */
    strmm_r_lo_tr_main_unit_alpha0_fulltile_hw, strmm_r_up_tr_main_unit_alpha0_fulltile_hw,
    strmm_r_lo_nt_main_unit_alpha0_fulltile_hw, strmm_r_up_nt_main_unit_alpha0_fulltile_hw,
    strmm_l_lo_tr_main_unit_alpha0_fulltile_hw, strmm_l_up_tr_main_unit_alpha0_fulltile_hw,
    strmm_l_lo_nt_main_unit_alpha0_fulltile_hw, strmm_l_up_nt_main_unit_alpha0_fulltile_hw,
    /* [96..103] fast + fulltile */
    fast_strmm_r_lo_tr_main_fulltile_hw, fast_strmm_r_up_tr_main_fulltile_hw,
    fast_strmm_r_lo_nt_main_fulltile_hw, fast_strmm_r_up_nt_main_fulltile_hw,
    fast_strmm_l_lo_tr_main_fulltile_hw, fast_strmm_l_up_tr_main_fulltile_hw,
    fast_strmm_l_lo_nt_main_fulltile_hw, fast_strmm_l_up_nt_main_fulltile_hw,
    /* [104..111] fast + unit + fulltile */
    fast_strmm_r_lo_tr_main_unit_fulltile_hw, fast_strmm_r_up_tr_main_unit_fulltile_hw,
    fast_strmm_r_lo_nt_main_unit_fulltile_hw, fast_strmm_r_up_nt_main_unit_fulltile_hw,
    fast_strmm_l_lo_tr_main_unit_fulltile_hw, fast_strmm_l_up_tr_main_unit_fulltile_hw,
    fast_strmm_l_lo_nt_main_unit_fulltile_hw, fast_strmm_l_up_nt_main_unit_fulltile_hw,
    /* [112..119] fast + alpha0 + fulltile */
    fast_strmm_r_lo_tr_main_alpha0_fulltile_hw, fast_strmm_r_up_tr_main_alpha0_fulltile_hw,
    fast_strmm_r_lo_nt_main_alpha0_fulltile_hw, fast_strmm_r_up_nt_main_alpha0_fulltile_hw,
    fast_strmm_l_lo_tr_main_alpha0_fulltile_hw, fast_strmm_l_up_tr_main_alpha0_fulltile_hw,
    fast_strmm_l_lo_nt_main_alpha0_fulltile_hw, fast_strmm_l_up_nt_main_alpha0_fulltile_hw,
    /* [120..127] fast + unit + alpha0 + fulltile */
    fast_strmm_r_lo_tr_main_unit_alpha0_fulltile_hw, fast_strmm_r_up_tr_main_unit_alpha0_fulltile_hw,
    fast_strmm_r_lo_nt_main_unit_alpha0_fulltile_hw, fast_strmm_r_up_nt_main_unit_alpha0_fulltile_hw,
    fast_strmm_l_lo_tr_main_unit_alpha0_fulltile_hw, fast_strmm_l_up_tr_main_unit_alpha0_fulltile_hw,
    fast_strmm_l_lo_nt_main_unit_alpha0_fulltile_hw, fast_strmm_l_up_nt_main_unit_alpha0_fulltile_hw};

/*
 * void
 * cublasStrmm (char side, char uplo, char transa, char diag, int m, int n,
 *              float alpha, const float *A, int lda, float *B, int ldb)
 *
 * performs one of the matrix-matrix operations
 *
 *   B = alpha * op(A) * B,  or  B = alpha * B * op(A)
 *
 * where alpha is a single-precision scalar, B is an m x n matrix composed
 * of single precision elements, and A is a unit or non-unit, upper or lower,
 * triangular matrix composed of single precision elements. op(A) is one of
 *
 *   op(A) = A  or  op(A) = transpose(A)
 *
 * Matrices A and B are stored in column major format, and lda and ldb are
 * the leading dimensions of the two-dimensional arrays that contain A and
 * B, respectively.
 *
 * Input
 * -----
 * side   specifies whether op(A) multiplies B from the left or right.
 *        If side = 'L' or 'l', then B = alpha * op(A) * B. If side =
 *        'R' or 'r', then B = alpha * B * op(A).
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo = 'U' or 'u', A is an upper triangular matrix.
 *        If uplo = 'L' or 'l', A is a lower triangular matrix.
 * transa specifies the form of op(A) to be used in the matrix
 *        multiplication. If transa = 'N' or 'n', then op(A) = A. If
 *        transa = 'T', 't', 'C', or 'c', then op(A) = transpose(A).
 * diag   specifies whether or not A is unit triangular. If diag = 'U'
 *        or 'u', A is assumed to be unit triangular. If diag = 'N' or
 *        'n', A is not assumed to be unit triangular.
 * m      the number of rows of matrix B. m must be at least zero.
 * n      the number of columns of matrix B. n must be at least zero.
 * alpha  single precision scalar multiplier applied to op(A)*B, or
 *        B*op(A), respectively. If alpha is zero no accesses are made
 *        to matrix A, and no read accesses are made to matrix B.
 * A      single precision array of dimensions (lda, k). k = m if side =
 *        'L' or 'l', k = n if side = 'R' or 'r'. If uplo = 'U' or 'u'
 *        the leading k x k upper triangular part of the array A must
 *        contain the upper triangular matrix, and the strictly lower
 *        triangular part of A is not referenced. If uplo = 'L' or 'l'
 *        the leading k x k lower triangular part of the array A must
 *        contain the lower triangular matrix, and the strictly upper
 *        triangular part of A is not referenced. When diag = 'U' or 'u'
 *        the diagonal elements of A are not referenced and are assumed
 *        to be unity.
 * lda    leading dimension of A. When side = 'L' or 'l', it must be at
 *        least max(1,m) and at least max(1,n) otherwise.
 * B      single precision array of dimensions (ldb, n). On entry, the
 *        leading m x n part of the array contains the matrix B. It is
 *        overwritten with the transformed matrix on exit.
 * ldb    leading dimension of B. It must be at least max (1, m).
 *
 * Output
 * ------
 * B      updated according to B = alpha * op(A) * B or B = alpha * B * op(A)
 *
 * Reference: http://www.netlib.org/blas/strmm.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if side, uplo, or transa is not one of the
 *                                accepted characters, if m or n < 0, or if
 *                                lda or ldb is smaller than required
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
__host__ void CUBLASAPI cublasStrmm (char side, char uplo, char transa, char diag, int m, int n, float alpha, const float *A, int lda, float *B, int ldb){ struct cublasContext *ctx = CUBLAS_GET_CTX(); struct cublasStrmmParams params; cudaError_t cudaStat; int info; int lside = toupper(side) == 'L'; int upper, notrans, unit, nrowa; int funcIdx; int useFastImul; int fullTilesOnly; int usePureHwStepper; dim3 ctaDimsHw (lside ? 
((n+BLK-1)/BLK) : ((m+BLK-1)/BLK)); dim3 ctaDimsSw (CUBLAS_STRMM_CTAS); if (!cublasInitialized (ctx)) { cublasSetError (ctx, CUBLAS_STATUS_NOT_INITIALIZED); return; } upper = toupper(uplo) == 'U'; notrans = toupper(transa) == 'N'; unit = toupper(diag) == 'U'; nrowa = (lside) ? m : n; info = 0; if ((!lside) && (toupper(side) != 'R')) { info = 1; } else if ((!upper) && (toupper(uplo) != 'L')) { info = 2; } else if ((!notrans) && (toupper(transa) != 'T') && (toupper(transa)!='C')){ info = 3; } else if ((unit) && (toupper(diag) != 'U')) { info = 4; } else if (m < 0) { info = 5; } else if (n < 0) { info = 6; } else if (lda < imax (1, nrowa)) { info = 9; } else if (ldb < imax (1, m)) { info = 11; } if (info) { cublasXerbla ("STRMM ", info); cublasSetError (ctx, CUBLAS_STATUS_INVALID_VALUE); return; } /* early out if nothing to do */ if ((m == 0) || (n == 0)) return; params.lside = lside; params.upper = upper; params.notrans = notrans; params.unit = unit; params.m = m; params.n = n; params.alpha = alpha; params.A = A; params.lda = lda; params.B = B; params.ldb = ldb;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -