⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 strsm.cu

📁 NVIDIA提供的CUDA的BLAS库源码
💻 CU
📖 第 1 页 / 共 5 页
字号:
    fast_strsm_l_lo_nt_main_alpha0_hw_nu,    fast_strsm_l_up_tr_main_alpha0_hw_nu,    fast_strsm_l_up_nt_main_alpha0_hw_nu,    fast_strsm_l_lo_tr_main_fulltile_alpha0_hw_nu,    fast_strsm_l_lo_nt_main_fulltile_alpha0_hw_nu,    fast_strsm_l_up_tr_main_fulltile_alpha0_hw_nu,    fast_strsm_l_up_nt_main_fulltile_alpha0_hw_nu,};/*
 * strsm_l_sw: 64-entry table of `pf` entries for the left-side ("_l_")
 * kernels that carry the "_sw" suffix. Reading the entries in order, the
 * table index is laid out as a 6-bit field:
 *   +1  : 0 = "tr" variant,  1 = "nt" variant
 *   +2  : 0 = "lo" variant,  1 = "up" variant
 *   +4  : "fulltile" variants
 *   +8  : "alpha0" variants
 *   +16 : "fast_"-prefixed variants
 *   +32 : "_nu"-suffixed variants
 * Entry order is therefore significant; do not sort or regroup the list.
 * NOTE(review): the code that computes the index is not visible here --
 * confirm the bit assignment against it. "_sw" presumably selects the
 * non-HW-only stepping path (cublasStrsm declares usePureHwStepper);
 * verify against the caller.
 */static pf strsm_l_sw[64] = {    strsm_l_lo_tr_main_sw,    strsm_l_lo_nt_main_sw,    strsm_l_up_tr_main_sw,    strsm_l_up_nt_main_sw,    strsm_l_lo_tr_main_fulltile_sw,    strsm_l_lo_nt_main_fulltile_sw,    strsm_l_up_tr_main_fulltile_sw,    strsm_l_up_nt_main_fulltile_sw,    strsm_l_lo_tr_main_alpha0_sw,    strsm_l_lo_nt_main_alpha0_sw,    strsm_l_up_tr_main_alpha0_sw,    strsm_l_up_nt_main_alpha0_sw,    strsm_l_lo_tr_main_fulltile_alpha0_sw,    strsm_l_lo_nt_main_fulltile_alpha0_sw,    strsm_l_up_tr_main_fulltile_alpha0_sw,    strsm_l_up_nt_main_fulltile_alpha0_sw,    fast_strsm_l_lo_tr_main_sw,    fast_strsm_l_lo_nt_main_sw,    fast_strsm_l_up_tr_main_sw,    fast_strsm_l_up_nt_main_sw,    fast_strsm_l_lo_tr_main_fulltile_sw,    fast_strsm_l_lo_nt_main_fulltile_sw,    fast_strsm_l_up_tr_main_fulltile_sw,    fast_strsm_l_up_nt_main_fulltile_sw,    fast_strsm_l_lo_tr_main_alpha0_sw,    fast_strsm_l_lo_nt_main_alpha0_sw,    fast_strsm_l_up_tr_main_alpha0_sw,    fast_strsm_l_up_nt_main_alpha0_sw,    fast_strsm_l_lo_tr_main_fulltile_alpha0_sw,    fast_strsm_l_lo_nt_main_fulltile_alpha0_sw,    fast_strsm_l_up_tr_main_fulltile_alpha0_sw,    fast_strsm_l_up_nt_main_fulltile_alpha0_sw,    strsm_l_lo_tr_main_sw_nu,    strsm_l_lo_nt_main_sw_nu,    strsm_l_up_tr_main_sw_nu,    strsm_l_up_nt_main_sw_nu,    strsm_l_lo_tr_main_fulltile_sw_nu,    strsm_l_lo_nt_main_fulltile_sw_nu,    strsm_l_up_tr_main_fulltile_sw_nu,    strsm_l_up_nt_main_fulltile_sw_nu,    strsm_l_lo_tr_main_alpha0_sw_nu,    strsm_l_lo_nt_main_alpha0_sw_nu,    strsm_l_up_tr_main_alpha0_sw_nu,    strsm_l_up_nt_main_alpha0_sw_nu,    strsm_l_lo_tr_main_fulltile_alpha0_sw_nu,    
strsm_l_lo_nt_main_fulltile_alpha0_sw_nu,    strsm_l_up_tr_main_fulltile_alpha0_sw_nu,    strsm_l_up_nt_main_fulltile_alpha0_sw_nu,    fast_strsm_l_lo_tr_main_sw_nu,    fast_strsm_l_lo_nt_main_sw_nu,    fast_strsm_l_up_tr_main_sw_nu,    fast_strsm_l_up_nt_main_sw_nu,    fast_strsm_l_lo_tr_main_fulltile_sw_nu,    fast_strsm_l_lo_nt_main_fulltile_sw_nu,    fast_strsm_l_up_tr_main_fulltile_sw_nu,    fast_strsm_l_up_nt_main_fulltile_sw_nu,    fast_strsm_l_lo_tr_main_alpha0_sw_nu,    fast_strsm_l_lo_nt_main_alpha0_sw_nu,    fast_strsm_l_up_tr_main_alpha0_sw_nu,    fast_strsm_l_up_nt_main_alpha0_sw_nu,    fast_strsm_l_lo_tr_main_fulltile_alpha0_sw_nu,    fast_strsm_l_lo_nt_main_fulltile_alpha0_sw_nu,    fast_strsm_l_up_tr_main_fulltile_alpha0_sw_nu,    fast_strsm_l_up_nt_main_fulltile_alpha0_sw_nu,};/*
 * strsm_r_hw: 64-entry table of `pf` entries for the right-side ("_r_")
 * kernels that carry the "_hw" suffix. The entries follow the same order as
 * the other tables in this file: +1 tr/nt, +2 lo/up, +4 "fulltile",
 * +8 "alpha0", +16 "fast_" prefix, +32 "_nu" suffix.
 * NOTE(review): "_hw" presumably corresponds to the HW-only stepping path
 * (usePureHwStepper in cublasStrsm); the selection code is not visible
 * here -- confirm before relying on it.
 */static pf strsm_r_hw[64] = {    strsm_r_lo_tr_main_hw,    strsm_r_lo_nt_main_hw,    strsm_r_up_tr_main_hw,    strsm_r_up_nt_main_hw,    strsm_r_lo_tr_main_fulltile_hw,    strsm_r_lo_nt_main_fulltile_hw,    strsm_r_up_tr_main_fulltile_hw,    strsm_r_up_nt_main_fulltile_hw,    strsm_r_lo_tr_main_alpha0_hw,    strsm_r_lo_nt_main_alpha0_hw,    strsm_r_up_tr_main_alpha0_hw,    strsm_r_up_nt_main_alpha0_hw,    strsm_r_lo_tr_main_fulltile_alpha0_hw,    strsm_r_lo_nt_main_fulltile_alpha0_hw,    strsm_r_up_tr_main_fulltile_alpha0_hw,    strsm_r_up_nt_main_fulltile_alpha0_hw,    fast_strsm_r_lo_tr_main_hw,    fast_strsm_r_lo_nt_main_hw,    fast_strsm_r_up_tr_main_hw,    fast_strsm_r_up_nt_main_hw,    fast_strsm_r_lo_tr_main_fulltile_hw,    fast_strsm_r_lo_nt_main_fulltile_hw,    fast_strsm_r_up_tr_main_fulltile_hw,    fast_strsm_r_up_nt_main_fulltile_hw,    fast_strsm_r_lo_tr_main_alpha0_hw,    fast_strsm_r_lo_nt_main_alpha0_hw,    fast_strsm_r_up_tr_main_alpha0_hw,    fast_strsm_r_up_nt_main_alpha0_hw,    fast_strsm_r_lo_tr_main_fulltile_alpha0_hw,    fast_strsm_r_lo_nt_main_fulltile_alpha0_hw,    fast_strsm_r_up_tr_main_fulltile_alpha0_hw,    
fast_strsm_r_up_nt_main_fulltile_alpha0_hw,    strsm_r_lo_tr_main_hw_nu,    strsm_r_lo_nt_main_hw_nu,    strsm_r_up_tr_main_hw_nu,    strsm_r_up_nt_main_hw_nu,    strsm_r_lo_tr_main_fulltile_hw_nu,    strsm_r_lo_nt_main_fulltile_hw_nu,    strsm_r_up_tr_main_fulltile_hw_nu,    strsm_r_up_nt_main_fulltile_hw_nu,    strsm_r_lo_tr_main_alpha0_hw_nu,    strsm_r_lo_nt_main_alpha0_hw_nu,    strsm_r_up_tr_main_alpha0_hw_nu,    strsm_r_up_nt_main_alpha0_hw_nu,    strsm_r_lo_tr_main_fulltile_alpha0_hw_nu,    strsm_r_lo_nt_main_fulltile_alpha0_hw_nu,    strsm_r_up_tr_main_fulltile_alpha0_hw_nu,    strsm_r_up_nt_main_fulltile_alpha0_hw_nu,    fast_strsm_r_lo_tr_main_hw_nu,    fast_strsm_r_lo_nt_main_hw_nu,    fast_strsm_r_up_tr_main_hw_nu,    fast_strsm_r_up_nt_main_hw_nu,    fast_strsm_r_lo_tr_main_fulltile_hw_nu,    fast_strsm_r_lo_nt_main_fulltile_hw_nu,    fast_strsm_r_up_tr_main_fulltile_hw_nu,    fast_strsm_r_up_nt_main_fulltile_hw_nu,    fast_strsm_r_lo_tr_main_alpha0_hw_nu,    fast_strsm_r_lo_nt_main_alpha0_hw_nu,    fast_strsm_r_up_tr_main_alpha0_hw_nu,    fast_strsm_r_up_nt_main_alpha0_hw_nu,    fast_strsm_r_lo_tr_main_fulltile_alpha0_hw_nu,    fast_strsm_r_lo_nt_main_fulltile_alpha0_hw_nu,    fast_strsm_r_up_tr_main_fulltile_alpha0_hw_nu,    fast_strsm_r_up_nt_main_fulltile_alpha0_hw_nu};/*
 * strsm_r_sw: 64-entry table of `pf` entries for the right-side ("_r_")
 * kernels that carry the "_sw" suffix. Same entry order as strsm_r_hw:
 * +1 tr/nt, +2 lo/up, +4 "fulltile", +8 "alpha0", +16 "fast_" prefix,
 * +32 "_nu" suffix.
 * NOTE(review): index computation is outside this view -- confirm.
 */static pf strsm_r_sw[64] = {    strsm_r_lo_tr_main_sw,    strsm_r_lo_nt_main_sw,    strsm_r_up_tr_main_sw,    strsm_r_up_nt_main_sw,    strsm_r_lo_tr_main_fulltile_sw,    strsm_r_lo_nt_main_fulltile_sw,    strsm_r_up_tr_main_fulltile_sw,    strsm_r_up_nt_main_fulltile_sw,    strsm_r_lo_tr_main_alpha0_sw,    strsm_r_lo_nt_main_alpha0_sw,    strsm_r_up_tr_main_alpha0_sw,    strsm_r_up_nt_main_alpha0_sw,    strsm_r_lo_tr_main_fulltile_alpha0_sw,    strsm_r_lo_nt_main_fulltile_alpha0_sw,    strsm_r_up_tr_main_fulltile_alpha0_sw,    strsm_r_up_nt_main_fulltile_alpha0_sw,    fast_strsm_r_lo_tr_main_sw,    fast_strsm_r_lo_nt_main_sw,    fast_strsm_r_up_tr_main_sw,    
fast_strsm_r_up_nt_main_sw,    fast_strsm_r_lo_tr_main_fulltile_sw,    fast_strsm_r_lo_nt_main_fulltile_sw,    fast_strsm_r_up_tr_main_fulltile_sw,    fast_strsm_r_up_nt_main_fulltile_sw,    fast_strsm_r_lo_tr_main_alpha0_sw,    fast_strsm_r_lo_nt_main_alpha0_sw,    fast_strsm_r_up_tr_main_alpha0_sw,    fast_strsm_r_up_nt_main_alpha0_sw,    fast_strsm_r_lo_tr_main_fulltile_alpha0_sw,    fast_strsm_r_lo_nt_main_fulltile_alpha0_sw,    fast_strsm_r_up_tr_main_fulltile_alpha0_sw,    fast_strsm_r_up_nt_main_fulltile_alpha0_sw,    strsm_r_lo_tr_main_sw_nu,    strsm_r_lo_nt_main_sw_nu,    strsm_r_up_tr_main_sw_nu,    strsm_r_up_nt_main_sw_nu,    strsm_r_lo_tr_main_fulltile_sw_nu,    strsm_r_lo_nt_main_fulltile_sw_nu,    strsm_r_up_tr_main_fulltile_sw_nu,    strsm_r_up_nt_main_fulltile_sw_nu,    strsm_r_lo_tr_main_alpha0_sw_nu,    strsm_r_lo_nt_main_alpha0_sw_nu,    strsm_r_up_tr_main_alpha0_sw_nu,    strsm_r_up_nt_main_alpha0_sw_nu,    strsm_r_lo_tr_main_fulltile_alpha0_sw_nu,    strsm_r_lo_nt_main_fulltile_alpha0_sw_nu,    strsm_r_up_tr_main_fulltile_alpha0_sw_nu,    strsm_r_up_nt_main_fulltile_alpha0_sw_nu,    fast_strsm_r_lo_tr_main_sw_nu,    fast_strsm_r_lo_nt_main_sw_nu,    fast_strsm_r_up_tr_main_sw_nu,    fast_strsm_r_up_nt_main_sw_nu,    fast_strsm_r_lo_tr_main_fulltile_sw_nu,    fast_strsm_r_lo_nt_main_fulltile_sw_nu,    fast_strsm_r_up_tr_main_fulltile_sw_nu,    fast_strsm_r_up_nt_main_fulltile_sw_nu,    fast_strsm_r_lo_tr_main_alpha0_sw_nu,    fast_strsm_r_lo_nt_main_alpha0_sw_nu,    fast_strsm_r_up_tr_main_alpha0_sw_nu,    fast_strsm_r_up_nt_main_alpha0_sw_nu,    fast_strsm_r_lo_tr_main_fulltile_alpha0_sw_nu,    fast_strsm_r_lo_nt_main_fulltile_alpha0_sw_nu,    fast_strsm_r_up_tr_main_fulltile_alpha0_sw_nu,    fast_strsm_r_up_nt_main_fulltile_alpha0_sw_nu};/*
 * Tile-size constants. BLK_LOG is 5, so BLK = 1 << 5 = 32; cublasStrsm
 * divides n (left side) or m (right side) by BLK, rounding up, to size
 * ctaDimsHw. JINC and IINC are both BLK * CUBLAS_STRSM_CTAS
 * (CUBLAS_STRSM_CTAS is defined outside this view).
 * NOTE(review): JINC/IINC are presumably the per-pass column/row strides
 * used by the kernels -- confirm against the kernel code.
 */#define BLK_LOG             (5)#define BLK                 (1 << BLK_LOG)#define JINC                (BLK * CUBLAS_STRSM_CTAS)#define IINC                (BLK * CUBLAS_STRSM_CTAS)/* * void  * cublasStrsm 
(char side, char uplo, char transa, char diag, int m, int n,  *              float alpha, const float *A, int lda, float *B, int ldb) * * solves one of the matrix equations * *    op(A) * X = alpha * B,   or   X * op(A) = alpha * B, * * where alpha is a single precision scalar, and X and B are m x n matrices  * that are composed of single precision elements. A is a unit or non-unit, * upper or lower triangular matrix, and op(A) is one of  * *    op(A) = A  or  op(A) = transpose(A) * * The result matrix X overwrites input matrix B; that is, on exit the result  * is stored in B. Matrices A and B are stored in column major format, and * lda and ldb are the leading dimensions of the two-dimensional arrays that * contain A and B, respectively. * * Input * ----- * side   specifies whether op(A) appears on the left or right of X as *        follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B. *        side = 'R' or 'r' indicates solve X * op(A) = alpha * B. * uplo   specifies whether the matrix A is an upper or lower triangular *        matrix as follows: uplo = 'U' or 'u' indicates A is an upper *        triangular matrix. uplo = 'L' or 'l' indicates A is a lower *        triangular matrix. * transa specifies the form of op(A) to be used in matrix multiplication *        as follows: If transa = 'N' or 'n', then op(A) = A. If transa = *        'T', 't', 'C', or 'c', then op(A) = transpose(A). * diag   specifies whether or not A is a unit triangular matrix like so: *        if diag = 'U' or 'u', A is assumed to be unit triangular. If  *        diag = 'N' or 'n', then A is not assumed to be unit triangular. * m      specifies the number of rows of B. m must be at least zero. * n      specifies the number of columns of B. n must be at least zero. * alpha  is a single precision scalar to be multiplied with B. When alpha is  *        zero, then A is not referenced and B need not be set before entry. 
* A      is a single precision array of dimensions (lda, k), where k is *        m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If *        uplo = 'U' or 'u', the leading k x k upper triangular part of *        the array A must contain the upper triangular matrix and the *        strictly lower triangular matrix of A is not referenced. When *        uplo = 'L' or 'l', the leading k x k lower triangular part of *        the array A must contain the lower triangular matrix and the  *        strictly upper triangular part of A is not referenced. Note that *        when diag = 'U' or 'u', the diagonal elements of A are not *        referenced, and are assumed to be unity. * lda    is the leading dimension of the two dimensional array containing A. *        When side = 'L' or 'l' then lda must be at least max(1, m), when  *        side = 'R' or 'r' then lda must be at least max(1, n). * B      is a single precision array of dimensions (ldb, n). ldb must be *        at least max (1,m). The leading m x n part of the array B must  *        contain the right-hand side matrix B. On exit B is overwritten  *        by the solution matrix X. * ldb    is the leading dimension of the two dimensional array containing B. *        ldb must be at least max(1, m). * * Output * ------ * B      contains the solution matrix X satisfying op(A) * X = alpha * B,  *        or X * op(A) = alpha * B * * Reference: http://www.netlib.org/blas/strsm.f * * Error status for this function can be retrieved via cublasGetError(). 
* * Error Status * ------------ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized * CUBLAS_STATUS_INVALID_VALUE    if m or n < 0 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU */__host__ void CUBLASAPI cublasStrsm (char side, char uplo, char transa,                                      char diag, int m, int n, float alpha,                                     const float *A, int lda, float *B,                                     int ldb){    struct cublasContext *ctx = CUBLAS_GET_CTX();    struct cublasStrsmParams params;    cudaError_t cudaStat;    int fullTilesOnly;    int funcIdx;    int useFastImul;    int usePureHwStepper;    int info;    int lside = toupper(side)   == 'L';    int upper, notrans, nounit, nrowa;    dim3 ctaDimsHw (lside ? ((n+BLK-1)/BLK) : ((m+BLK-1)/BLK));    dim3 ctaDimsSw (CUBLAS_STRSM_CTAS);    if (!cublasInitialized (ctx)) {        cublasSetError (ctx, CUBLAS_STATUS_NOT_INITIALIZED);        return;    }    upper   = toupper(uplo)   == 'U';    notrans = toupper(transa) == 'N';    nounit  = toupper(diag)   == 'N';    nrowa = (lside) ? 
m : n;    info = 0;    if ((!lside) && (toupper(side) != 'R')) {        info = 1;    }     else if ((!upper) && (toupper(uplo) != 'L')) {        info = 2;    }    else if ((!notrans) && (toupper(transa) != 'T') && (toupper(transa)!='C')){        info = 3;    }    else if ((!nounit) && (toupper(diag) != 'U')) {        info = 4;    }    else if (m < 0) {        info = 5;    }    else if (n < 0) {        info = 6;    }    else if (lda < imax (1, nrowa)) {        info = 9;    }    else if (ldb < imax (1, m)) {        info = 11;    }    if (info) {        cublasXerbla ("STRSM ", info);        cublasSetError (ctx, CUBLAS_STATUS_INVALID_VALUE);        return;    }    /* early out if nothing to do */    if ((m == 0) || (n == 0)) {        return;    }    params.lside = lside;    params.upper = upper;    params.notrans = notrans;    params.nounit = nounit;    params.m = m;    params.n = n;    params.alpha = alpha;    params.A = A;    params.lda = lda;    params.B = B;    params.ldb = ldb;    /* choose HW-only stepping if columns in result matrix do not exceed the

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -