📄 strsm.cu
字号:
fast_strsm_l_lo_nt_main_alpha0_hw_nu, fast_strsm_l_up_tr_main_alpha0_hw_nu, fast_strsm_l_up_nt_main_alpha0_hw_nu, fast_strsm_l_lo_tr_main_fulltile_alpha0_hw_nu, fast_strsm_l_lo_nt_main_fulltile_alpha0_hw_nu, fast_strsm_l_up_tr_main_fulltile_alpha0_hw_nu, fast_strsm_l_up_nt_main_fulltile_alpha0_hw_nu,};static pf strsm_l_sw[64] = { strsm_l_lo_tr_main_sw, strsm_l_lo_nt_main_sw, strsm_l_up_tr_main_sw, strsm_l_up_nt_main_sw, strsm_l_lo_tr_main_fulltile_sw, strsm_l_lo_nt_main_fulltile_sw, strsm_l_up_tr_main_fulltile_sw, strsm_l_up_nt_main_fulltile_sw, strsm_l_lo_tr_main_alpha0_sw, strsm_l_lo_nt_main_alpha0_sw, strsm_l_up_tr_main_alpha0_sw, strsm_l_up_nt_main_alpha0_sw, strsm_l_lo_tr_main_fulltile_alpha0_sw, strsm_l_lo_nt_main_fulltile_alpha0_sw, strsm_l_up_tr_main_fulltile_alpha0_sw, strsm_l_up_nt_main_fulltile_alpha0_sw, fast_strsm_l_lo_tr_main_sw, fast_strsm_l_lo_nt_main_sw, fast_strsm_l_up_tr_main_sw, fast_strsm_l_up_nt_main_sw, fast_strsm_l_lo_tr_main_fulltile_sw, fast_strsm_l_lo_nt_main_fulltile_sw, fast_strsm_l_up_tr_main_fulltile_sw, fast_strsm_l_up_nt_main_fulltile_sw, fast_strsm_l_lo_tr_main_alpha0_sw, fast_strsm_l_lo_nt_main_alpha0_sw, fast_strsm_l_up_tr_main_alpha0_sw, fast_strsm_l_up_nt_main_alpha0_sw, fast_strsm_l_lo_tr_main_fulltile_alpha0_sw, fast_strsm_l_lo_nt_main_fulltile_alpha0_sw, fast_strsm_l_up_tr_main_fulltile_alpha0_sw, fast_strsm_l_up_nt_main_fulltile_alpha0_sw, strsm_l_lo_tr_main_sw_nu, strsm_l_lo_nt_main_sw_nu, strsm_l_up_tr_main_sw_nu, strsm_l_up_nt_main_sw_nu, strsm_l_lo_tr_main_fulltile_sw_nu, strsm_l_lo_nt_main_fulltile_sw_nu, strsm_l_up_tr_main_fulltile_sw_nu, strsm_l_up_nt_main_fulltile_sw_nu, strsm_l_lo_tr_main_alpha0_sw_nu, strsm_l_lo_nt_main_alpha0_sw_nu, strsm_l_up_tr_main_alpha0_sw_nu, strsm_l_up_nt_main_alpha0_sw_nu, strsm_l_lo_tr_main_fulltile_alpha0_sw_nu, strsm_l_lo_nt_main_fulltile_alpha0_sw_nu, strsm_l_up_tr_main_fulltile_alpha0_sw_nu, strsm_l_up_nt_main_fulltile_alpha0_sw_nu, fast_strsm_l_lo_tr_main_sw_nu, 
fast_strsm_l_lo_nt_main_sw_nu, fast_strsm_l_up_tr_main_sw_nu, fast_strsm_l_up_nt_main_sw_nu, fast_strsm_l_lo_tr_main_fulltile_sw_nu, fast_strsm_l_lo_nt_main_fulltile_sw_nu, fast_strsm_l_up_tr_main_fulltile_sw_nu, fast_strsm_l_up_nt_main_fulltile_sw_nu, fast_strsm_l_lo_tr_main_alpha0_sw_nu, fast_strsm_l_lo_nt_main_alpha0_sw_nu, fast_strsm_l_up_tr_main_alpha0_sw_nu, fast_strsm_l_up_nt_main_alpha0_sw_nu, fast_strsm_l_lo_tr_main_fulltile_alpha0_sw_nu, fast_strsm_l_lo_nt_main_fulltile_alpha0_sw_nu, fast_strsm_l_up_tr_main_fulltile_alpha0_sw_nu, fast_strsm_l_up_nt_main_fulltile_alpha0_sw_nu,};static pf strsm_r_hw[64] = { strsm_r_lo_tr_main_hw, strsm_r_lo_nt_main_hw, strsm_r_up_tr_main_hw, strsm_r_up_nt_main_hw, strsm_r_lo_tr_main_fulltile_hw, strsm_r_lo_nt_main_fulltile_hw, strsm_r_up_tr_main_fulltile_hw, strsm_r_up_nt_main_fulltile_hw, strsm_r_lo_tr_main_alpha0_hw, strsm_r_lo_nt_main_alpha0_hw, strsm_r_up_tr_main_alpha0_hw, strsm_r_up_nt_main_alpha0_hw, strsm_r_lo_tr_main_fulltile_alpha0_hw, strsm_r_lo_nt_main_fulltile_alpha0_hw, strsm_r_up_tr_main_fulltile_alpha0_hw, strsm_r_up_nt_main_fulltile_alpha0_hw, fast_strsm_r_lo_tr_main_hw, fast_strsm_r_lo_nt_main_hw, fast_strsm_r_up_tr_main_hw, fast_strsm_r_up_nt_main_hw, fast_strsm_r_lo_tr_main_fulltile_hw, fast_strsm_r_lo_nt_main_fulltile_hw, fast_strsm_r_up_tr_main_fulltile_hw, fast_strsm_r_up_nt_main_fulltile_hw, fast_strsm_r_lo_tr_main_alpha0_hw, fast_strsm_r_lo_nt_main_alpha0_hw, fast_strsm_r_up_tr_main_alpha0_hw, fast_strsm_r_up_nt_main_alpha0_hw, fast_strsm_r_lo_tr_main_fulltile_alpha0_hw, fast_strsm_r_lo_nt_main_fulltile_alpha0_hw, fast_strsm_r_up_tr_main_fulltile_alpha0_hw, fast_strsm_r_up_nt_main_fulltile_alpha0_hw, strsm_r_lo_tr_main_hw_nu, strsm_r_lo_nt_main_hw_nu, strsm_r_up_tr_main_hw_nu, strsm_r_up_nt_main_hw_nu, strsm_r_lo_tr_main_fulltile_hw_nu, strsm_r_lo_nt_main_fulltile_hw_nu, strsm_r_up_tr_main_fulltile_hw_nu, strsm_r_up_nt_main_fulltile_hw_nu, strsm_r_lo_tr_main_alpha0_hw_nu, 
strsm_r_lo_nt_main_alpha0_hw_nu, strsm_r_up_tr_main_alpha0_hw_nu, strsm_r_up_nt_main_alpha0_hw_nu, strsm_r_lo_tr_main_fulltile_alpha0_hw_nu, strsm_r_lo_nt_main_fulltile_alpha0_hw_nu, strsm_r_up_tr_main_fulltile_alpha0_hw_nu, strsm_r_up_nt_main_fulltile_alpha0_hw_nu, fast_strsm_r_lo_tr_main_hw_nu, fast_strsm_r_lo_nt_main_hw_nu, fast_strsm_r_up_tr_main_hw_nu, fast_strsm_r_up_nt_main_hw_nu, fast_strsm_r_lo_tr_main_fulltile_hw_nu, fast_strsm_r_lo_nt_main_fulltile_hw_nu, fast_strsm_r_up_tr_main_fulltile_hw_nu, fast_strsm_r_up_nt_main_fulltile_hw_nu, fast_strsm_r_lo_tr_main_alpha0_hw_nu, fast_strsm_r_lo_nt_main_alpha0_hw_nu, fast_strsm_r_up_tr_main_alpha0_hw_nu, fast_strsm_r_up_nt_main_alpha0_hw_nu, fast_strsm_r_lo_tr_main_fulltile_alpha0_hw_nu, fast_strsm_r_lo_nt_main_fulltile_alpha0_hw_nu, fast_strsm_r_up_tr_main_fulltile_alpha0_hw_nu, fast_strsm_r_up_nt_main_fulltile_alpha0_hw_nu};static pf strsm_r_sw[64] = { strsm_r_lo_tr_main_sw, strsm_r_lo_nt_main_sw, strsm_r_up_tr_main_sw, strsm_r_up_nt_main_sw, strsm_r_lo_tr_main_fulltile_sw, strsm_r_lo_nt_main_fulltile_sw, strsm_r_up_tr_main_fulltile_sw, strsm_r_up_nt_main_fulltile_sw, strsm_r_lo_tr_main_alpha0_sw, strsm_r_lo_nt_main_alpha0_sw, strsm_r_up_tr_main_alpha0_sw, strsm_r_up_nt_main_alpha0_sw, strsm_r_lo_tr_main_fulltile_alpha0_sw, strsm_r_lo_nt_main_fulltile_alpha0_sw, strsm_r_up_tr_main_fulltile_alpha0_sw, strsm_r_up_nt_main_fulltile_alpha0_sw, fast_strsm_r_lo_tr_main_sw, fast_strsm_r_lo_nt_main_sw, fast_strsm_r_up_tr_main_sw, fast_strsm_r_up_nt_main_sw, fast_strsm_r_lo_tr_main_fulltile_sw, fast_strsm_r_lo_nt_main_fulltile_sw, fast_strsm_r_up_tr_main_fulltile_sw, fast_strsm_r_up_nt_main_fulltile_sw, fast_strsm_r_lo_tr_main_alpha0_sw, fast_strsm_r_lo_nt_main_alpha0_sw, fast_strsm_r_up_tr_main_alpha0_sw, fast_strsm_r_up_nt_main_alpha0_sw, fast_strsm_r_lo_tr_main_fulltile_alpha0_sw, fast_strsm_r_lo_nt_main_fulltile_alpha0_sw, fast_strsm_r_up_tr_main_fulltile_alpha0_sw, fast_strsm_r_up_nt_main_fulltile_alpha0_sw, 
strsm_r_lo_tr_main_sw_nu, strsm_r_lo_nt_main_sw_nu, strsm_r_up_tr_main_sw_nu, strsm_r_up_nt_main_sw_nu, strsm_r_lo_tr_main_fulltile_sw_nu, strsm_r_lo_nt_main_fulltile_sw_nu, strsm_r_up_tr_main_fulltile_sw_nu, strsm_r_up_nt_main_fulltile_sw_nu, strsm_r_lo_tr_main_alpha0_sw_nu, strsm_r_lo_nt_main_alpha0_sw_nu, strsm_r_up_tr_main_alpha0_sw_nu, strsm_r_up_nt_main_alpha0_sw_nu, strsm_r_lo_tr_main_fulltile_alpha0_sw_nu, strsm_r_lo_nt_main_fulltile_alpha0_sw_nu, strsm_r_up_tr_main_fulltile_alpha0_sw_nu, strsm_r_up_nt_main_fulltile_alpha0_sw_nu, fast_strsm_r_lo_tr_main_sw_nu, fast_strsm_r_lo_nt_main_sw_nu, fast_strsm_r_up_tr_main_sw_nu, fast_strsm_r_up_nt_main_sw_nu, fast_strsm_r_lo_tr_main_fulltile_sw_nu, fast_strsm_r_lo_nt_main_fulltile_sw_nu, fast_strsm_r_up_tr_main_fulltile_sw_nu, fast_strsm_r_up_nt_main_fulltile_sw_nu, fast_strsm_r_lo_tr_main_alpha0_sw_nu, fast_strsm_r_lo_nt_main_alpha0_sw_nu, fast_strsm_r_up_tr_main_alpha0_sw_nu, fast_strsm_r_up_nt_main_alpha0_sw_nu, fast_strsm_r_lo_tr_main_fulltile_alpha0_sw_nu, fast_strsm_r_lo_nt_main_fulltile_alpha0_sw_nu, fast_strsm_r_up_tr_main_fulltile_alpha0_sw_nu, fast_strsm_r_up_nt_main_fulltile_alpha0_sw_nu};#define BLK_LOG (5)#define BLK (1 << BLK_LOG)#define JINC (BLK * CUBLAS_STRSM_CTAS)#define IINC (BLK * CUBLAS_STRSM_CTAS)/* * void * cublasStrsm (char side, char uplo, char transa, char diag, int m, int n, * float alpha, const float *A, int lda, float *B, int ldb) * * solves one of the matrix equations * * op(A) * X = alpha * B, or X * op(A) = alpha * B, * * where alpha is a single precision scalar, and X and B are m x n matrices * that are composed of single precision elements. A is a unit or non-unit, * upper or lower triangular matrix, and op(A) is one of * * op(A) = A or op(A) = transpose(A) * * The result matrix X overwrites input matrix B; that is, on exit the result * is stored in B. 
Matrices A and B are stored in column major format, and * lda and ldb are the leading dimensions of the two-dimensional arrays that * contain A and B, respectively. * * Input * ----- * side specifies whether op(A) appears on the left or right of X as * follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B. * side = 'R' or 'r' indicates solve X * op(A) = alpha * B. * uplo specifies whether the matrix A is an upper or lower triangular * matrix as follows: uplo = 'U' or 'u' indicates A is an upper * triangular matrix. uplo = 'L' or 'l' indicates A is a lower * triangular matrix. * transa specifies the form of op(A) to be used in matrix multiplication * as follows: If transa = 'N' or 'n', then op(A) = A. If transa = * 'T', 't', 'C', or 'c', then op(A) = transpose(A). * diag specifies whether or not A is a unit triangular matrix like so: * if diag = 'U' or 'u', A is assumed to be unit triangular. If * diag = 'N' or 'n', then A is not assumed to be unit triangular. * m specifies the number of rows of B. m must be at least zero. * n specifies the number of columns of B. n must be at least zero. * alpha is a single precision scalar to be multiplied with B. When alpha is * zero, then A is not referenced and B need not be set before entry. * A is a single precision array of dimensions (lda, k), where k is * m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If * uplo = 'U' or 'u', the leading k x k upper triangular part of * the array A must contain the upper triangular matrix and the * strictly lower triangular part of A is not referenced. When * uplo = 'L' or 'l', the leading k x k lower triangular part of * the array A must contain the lower triangular matrix and the * strictly upper triangular part of A is not referenced. Note that * when diag = 'U' or 'u', the diagonal elements of A are not * referenced, and are assumed to be unity. * lda is the leading dimension of the two dimensional array containing A. 
* When side = 'L' or 'l' then lda must be at least max(1, m), when * side = 'R' or 'r' then lda must be at least max(1, n). * B is a single precision array of dimensions (ldb, n). ldb must be * at least max (1,m). The leading m x n part of the array B must * contain the right-hand side matrix B. On exit B is overwritten * by the solution matrix X. * ldb is the leading dimension of the two dimensional array containing B. * ldb must be at least max(1, m). * * Output * ------ * B contains the solution matrix X satisfying op(A) * X = alpha * B, * or X * op(A) = alpha * B * * Reference: http://www.netlib.org/blas/strsm.f * * Error status for this function can be retrieved via cublasGetError(). * * Error Status * ------------ * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized * CUBLAS_STATUS_INVALID_VALUE if m or n < 0 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU */__host__ void CUBLASAPI cublasStrsm (char side, char uplo, char transa, char diag, int m, int n, float alpha, const float *A, int lda, float *B, int ldb){ struct cublasContext *ctx = CUBLAS_GET_CTX(); struct cublasStrsmParams params; cudaError_t cudaStat; int fullTilesOnly; int funcIdx; int useFastImul; int usePureHwStepper; int info; int lside = toupper(side) == 'L'; int upper, notrans, nounit, nrowa; dim3 ctaDimsHw (lside ? ((n+BLK-1)/BLK) : ((m+BLK-1)/BLK)); dim3 ctaDimsSw (CUBLAS_STRSM_CTAS); if (!cublasInitialized (ctx)) { cublasSetError (ctx, CUBLAS_STATUS_NOT_INITIALIZED); return; } upper = toupper(uplo) == 'U'; notrans = toupper(transa) == 'N'; nounit = toupper(diag) == 'N'; nrowa = (lside) ? 
m : n; info = 0; if ((!lside) && (toupper(side) != 'R')) { info = 1; } else if ((!upper) && (toupper(uplo) != 'L')) { info = 2; } else if ((!notrans) && (toupper(transa) != 'T') && (toupper(transa)!='C')){ info = 3; } else if ((!nounit) && (toupper(diag) != 'U')) { info = 4; } else if (m < 0) { info = 5; } else if (n < 0) { info = 6; } else if (lda < imax (1, nrowa)) { info = 9; } else if (ldb < imax (1, m)) { info = 11; } if (info) { cublasXerbla ("STRSM ", info); cublasSetError (ctx, CUBLAS_STATUS_INVALID_VALUE); return; } /* early out if nothing to do */ if ((m == 0) || (n == 0)) { return; } params.lside = lside; params.upper = upper; params.notrans = notrans; params.nounit = nounit; params.m = m; params.n = n; params.alpha = alpha; params.A = A; params.lda = lda; params.B = B; params.ldb = ldb; /* choose HW-only stepping if columns in result matrix do not exceed the
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -