⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 strmm.cu

📁 NVIDIA提供的CUDA的BLAS库源码
💻 CU
📖 第 1 页 / 共 5 页
字号:
    fast_strmm_l_lo_nt_main_sw,    fast_strmm_l_up_nt_main_sw,    fast_strmm_r_lo_tr_main_unit_sw,    fast_strmm_r_up_tr_main_unit_sw,    fast_strmm_r_lo_nt_main_unit_sw,    fast_strmm_r_up_nt_main_unit_sw,    fast_strmm_l_lo_tr_main_unit_sw,    fast_strmm_l_up_tr_main_unit_sw,    fast_strmm_l_lo_nt_main_unit_sw,    fast_strmm_l_up_nt_main_unit_sw,    fast_strmm_r_lo_tr_main_alpha0_sw,    fast_strmm_r_up_tr_main_alpha0_sw,    fast_strmm_r_lo_nt_main_alpha0_sw,    fast_strmm_r_up_nt_main_alpha0_sw,    fast_strmm_l_lo_tr_main_alpha0_sw,    fast_strmm_l_up_tr_main_alpha0_sw,    fast_strmm_l_lo_nt_main_alpha0_sw,    fast_strmm_l_up_nt_main_alpha0_sw,    fast_strmm_r_lo_tr_main_unit_alpha0_sw,    fast_strmm_r_up_tr_main_unit_alpha0_sw,    fast_strmm_r_lo_nt_main_unit_alpha0_sw,    fast_strmm_r_up_nt_main_unit_alpha0_sw,    fast_strmm_l_lo_tr_main_unit_alpha0_sw,    fast_strmm_l_up_tr_main_unit_alpha0_sw,    fast_strmm_l_lo_nt_main_unit_alpha0_sw,    fast_strmm_l_up_nt_main_unit_alpha0_sw,    strmm_r_lo_tr_main_fulltile_sw,    strmm_r_up_tr_main_fulltile_sw,    strmm_r_lo_nt_main_fulltile_sw,    strmm_r_up_nt_main_fulltile_sw,    strmm_l_lo_tr_main_fulltile_sw,    strmm_l_up_tr_main_fulltile_sw,    strmm_l_lo_nt_main_fulltile_sw,    strmm_l_up_nt_main_fulltile_sw,    strmm_r_lo_tr_main_unit_fulltile_sw,    strmm_r_up_tr_main_unit_fulltile_sw,    strmm_r_lo_nt_main_unit_fulltile_sw,    strmm_r_up_nt_main_unit_fulltile_sw,    strmm_l_lo_tr_main_unit_fulltile_sw,    strmm_l_up_tr_main_unit_fulltile_sw,    strmm_l_lo_nt_main_unit_fulltile_sw,    strmm_l_up_nt_main_unit_fulltile_sw,    strmm_r_lo_tr_main_alpha0_fulltile_sw,    strmm_r_up_tr_main_alpha0_fulltile_sw,    strmm_r_lo_nt_main_alpha0_fulltile_sw,    strmm_r_up_nt_main_alpha0_fulltile_sw,    strmm_l_lo_tr_main_alpha0_fulltile_sw,    strmm_l_up_tr_main_alpha0_fulltile_sw,    strmm_l_lo_nt_main_alpha0_fulltile_sw,    strmm_l_up_nt_main_alpha0_fulltile_sw,    strmm_r_lo_tr_main_unit_alpha0_fulltile_sw,    
strmm_r_up_tr_main_unit_alpha0_fulltile_sw,    strmm_r_lo_nt_main_unit_alpha0_fulltile_sw,    strmm_r_up_nt_main_unit_alpha0_fulltile_sw,    strmm_l_lo_tr_main_unit_alpha0_fulltile_sw,    strmm_l_up_tr_main_unit_alpha0_fulltile_sw,    strmm_l_lo_nt_main_unit_alpha0_fulltile_sw,    strmm_l_up_nt_main_unit_alpha0_fulltile_sw,    fast_strmm_r_lo_tr_main_fulltile_sw,    fast_strmm_r_up_tr_main_fulltile_sw,    fast_strmm_r_lo_nt_main_fulltile_sw,    fast_strmm_r_up_nt_main_fulltile_sw,    fast_strmm_l_lo_tr_main_fulltile_sw,    fast_strmm_l_up_tr_main_fulltile_sw,    fast_strmm_l_lo_nt_main_fulltile_sw,    fast_strmm_l_up_nt_main_fulltile_sw,    fast_strmm_r_lo_tr_main_unit_fulltile_sw,    fast_strmm_r_up_tr_main_unit_fulltile_sw,    fast_strmm_r_lo_nt_main_unit_fulltile_sw,    fast_strmm_r_up_nt_main_unit_fulltile_sw,    fast_strmm_l_lo_tr_main_unit_fulltile_sw,    fast_strmm_l_up_tr_main_unit_fulltile_sw,    fast_strmm_l_lo_nt_main_unit_fulltile_sw,    fast_strmm_l_up_nt_main_unit_fulltile_sw,    fast_strmm_r_lo_tr_main_alpha0_fulltile_sw,    fast_strmm_r_up_tr_main_alpha0_fulltile_sw,    fast_strmm_r_lo_nt_main_alpha0_fulltile_sw,    fast_strmm_r_up_nt_main_alpha0_fulltile_sw,    fast_strmm_l_lo_tr_main_alpha0_fulltile_sw,    fast_strmm_l_up_tr_main_alpha0_fulltile_sw,    fast_strmm_l_lo_nt_main_alpha0_fulltile_sw,    fast_strmm_l_up_nt_main_alpha0_fulltile_sw,    fast_strmm_r_lo_tr_main_unit_alpha0_fulltile_sw,    fast_strmm_r_up_tr_main_unit_alpha0_fulltile_sw,    fast_strmm_r_lo_nt_main_unit_alpha0_fulltile_sw,    fast_strmm_r_up_nt_main_unit_alpha0_fulltile_sw,    fast_strmm_l_lo_tr_main_unit_alpha0_fulltile_sw,    fast_strmm_l_up_tr_main_unit_alpha0_fulltile_sw,    fast_strmm_l_lo_nt_main_unit_alpha0_fulltile_sw,    fast_strmm_l_up_nt_main_unit_alpha0_fulltile_sw};/*
 * strmm_hw: table of 128 function pointers (type pf) to the "_hw" STRMM
 * variants. The initializer order corresponds to an index whose bits select
 * the variant as follows (bit 0 is the least significant):
 *   bit 0 (+1):  "up" variant instead of "lo"
 *   bit 1 (+2):  "nt" variant instead of "tr"
 *   bit 2 (+4):  "l" variant instead of "r"
 *   bit 3 (+8):  "unit" variant
 *   bit 4 (+16): "alpha0" variant
 *   bit 5 (+32): "fast_" prefixed variant
 *   bit 6 (+64): "fulltile" variant
 * NOTE(review): the code that computes the index into this table is not
 * visible in this part of the file; confirm it uses this bit layout before
 * reordering, adding, or removing entries.
 */static pf strmm_hw[128] = {    strmm_r_lo_tr_main_hw,    strmm_r_up_tr_main_hw,    strmm_r_lo_nt_main_hw,    strmm_r_up_nt_main_hw,    strmm_l_lo_tr_main_hw,    strmm_l_up_tr_main_hw,    
strmm_l_lo_nt_main_hw,    strmm_l_up_nt_main_hw,    strmm_r_lo_tr_main_unit_hw,    strmm_r_up_tr_main_unit_hw,    strmm_r_lo_nt_main_unit_hw,    strmm_r_up_nt_main_unit_hw,    strmm_l_lo_tr_main_unit_hw,    strmm_l_up_tr_main_unit_hw,    strmm_l_lo_nt_main_unit_hw,    strmm_l_up_nt_main_unit_hw,    strmm_r_lo_tr_main_alpha0_hw,    strmm_r_up_tr_main_alpha0_hw,    strmm_r_lo_nt_main_alpha0_hw,    strmm_r_up_nt_main_alpha0_hw,    strmm_l_lo_tr_main_alpha0_hw,    strmm_l_up_tr_main_alpha0_hw,    strmm_l_lo_nt_main_alpha0_hw,    strmm_l_up_nt_main_alpha0_hw,    strmm_r_lo_tr_main_unit_alpha0_hw,    strmm_r_up_tr_main_unit_alpha0_hw,    strmm_r_lo_nt_main_unit_alpha0_hw,    strmm_r_up_nt_main_unit_alpha0_hw,    strmm_l_lo_tr_main_unit_alpha0_hw,    strmm_l_up_tr_main_unit_alpha0_hw,    strmm_l_lo_nt_main_unit_alpha0_hw,    strmm_l_up_nt_main_unit_alpha0_hw,    fast_strmm_r_lo_tr_main_hw,    fast_strmm_r_up_tr_main_hw,    fast_strmm_r_lo_nt_main_hw,    fast_strmm_r_up_nt_main_hw,    fast_strmm_l_lo_tr_main_hw,    fast_strmm_l_up_tr_main_hw,    fast_strmm_l_lo_nt_main_hw,    fast_strmm_l_up_nt_main_hw,    fast_strmm_r_lo_tr_main_unit_hw,    fast_strmm_r_up_tr_main_unit_hw,    fast_strmm_r_lo_nt_main_unit_hw,    fast_strmm_r_up_nt_main_unit_hw,    fast_strmm_l_lo_tr_main_unit_hw,    fast_strmm_l_up_tr_main_unit_hw,    fast_strmm_l_lo_nt_main_unit_hw,    fast_strmm_l_up_nt_main_unit_hw,    fast_strmm_r_lo_tr_main_alpha0_hw,    fast_strmm_r_up_tr_main_alpha0_hw,    fast_strmm_r_lo_nt_main_alpha0_hw,    fast_strmm_r_up_nt_main_alpha0_hw,    fast_strmm_l_lo_tr_main_alpha0_hw,    fast_strmm_l_up_tr_main_alpha0_hw,    fast_strmm_l_lo_nt_main_alpha0_hw,    fast_strmm_l_up_nt_main_alpha0_hw,    fast_strmm_r_lo_tr_main_unit_alpha0_hw,    fast_strmm_r_up_tr_main_unit_alpha0_hw,    fast_strmm_r_lo_nt_main_unit_alpha0_hw,    fast_strmm_r_up_nt_main_unit_alpha0_hw,    fast_strmm_l_lo_tr_main_unit_alpha0_hw,    fast_strmm_l_up_tr_main_unit_alpha0_hw,    
fast_strmm_l_lo_nt_main_unit_alpha0_hw,    fast_strmm_l_up_nt_main_unit_alpha0_hw,    strmm_r_lo_tr_main_fulltile_hw,    strmm_r_up_tr_main_fulltile_hw,    strmm_r_lo_nt_main_fulltile_hw,    strmm_r_up_nt_main_fulltile_hw,    strmm_l_lo_tr_main_fulltile_hw,    strmm_l_up_tr_main_fulltile_hw,    strmm_l_lo_nt_main_fulltile_hw,    strmm_l_up_nt_main_fulltile_hw,    strmm_r_lo_tr_main_unit_fulltile_hw,    strmm_r_up_tr_main_unit_fulltile_hw,    strmm_r_lo_nt_main_unit_fulltile_hw,    strmm_r_up_nt_main_unit_fulltile_hw,    strmm_l_lo_tr_main_unit_fulltile_hw,    strmm_l_up_tr_main_unit_fulltile_hw,    strmm_l_lo_nt_main_unit_fulltile_hw,    strmm_l_up_nt_main_unit_fulltile_hw,    strmm_r_lo_tr_main_alpha0_fulltile_hw,    strmm_r_up_tr_main_alpha0_fulltile_hw,    strmm_r_lo_nt_main_alpha0_fulltile_hw,    strmm_r_up_nt_main_alpha0_fulltile_hw,    strmm_l_lo_tr_main_alpha0_fulltile_hw,    strmm_l_up_tr_main_alpha0_fulltile_hw,    strmm_l_lo_nt_main_alpha0_fulltile_hw,    strmm_l_up_nt_main_alpha0_fulltile_hw,    strmm_r_lo_tr_main_unit_alpha0_fulltile_hw,    strmm_r_up_tr_main_unit_alpha0_fulltile_hw,    strmm_r_lo_nt_main_unit_alpha0_fulltile_hw,    strmm_r_up_nt_main_unit_alpha0_fulltile_hw,    strmm_l_lo_tr_main_unit_alpha0_fulltile_hw,    strmm_l_up_tr_main_unit_alpha0_fulltile_hw,    strmm_l_lo_nt_main_unit_alpha0_fulltile_hw,    strmm_l_up_nt_main_unit_alpha0_fulltile_hw,    fast_strmm_r_lo_tr_main_fulltile_hw,    fast_strmm_r_up_tr_main_fulltile_hw,    fast_strmm_r_lo_nt_main_fulltile_hw,    fast_strmm_r_up_nt_main_fulltile_hw,    fast_strmm_l_lo_tr_main_fulltile_hw,    fast_strmm_l_up_tr_main_fulltile_hw,    fast_strmm_l_lo_nt_main_fulltile_hw,    fast_strmm_l_up_nt_main_fulltile_hw,    fast_strmm_r_lo_tr_main_unit_fulltile_hw,    fast_strmm_r_up_tr_main_unit_fulltile_hw,    fast_strmm_r_lo_nt_main_unit_fulltile_hw,    fast_strmm_r_up_nt_main_unit_fulltile_hw,    fast_strmm_l_lo_tr_main_unit_fulltile_hw,    fast_strmm_l_up_tr_main_unit_fulltile_hw,    
fast_strmm_l_lo_nt_main_unit_fulltile_hw,    fast_strmm_l_up_nt_main_unit_fulltile_hw,    fast_strmm_r_lo_tr_main_alpha0_fulltile_hw,    fast_strmm_r_up_tr_main_alpha0_fulltile_hw,    fast_strmm_r_lo_nt_main_alpha0_fulltile_hw,    fast_strmm_r_up_nt_main_alpha0_fulltile_hw,    fast_strmm_l_lo_tr_main_alpha0_fulltile_hw,    fast_strmm_l_up_tr_main_alpha0_fulltile_hw,    fast_strmm_l_lo_nt_main_alpha0_fulltile_hw,    fast_strmm_l_up_nt_main_alpha0_fulltile_hw,    fast_strmm_r_lo_tr_main_unit_alpha0_fulltile_hw,    fast_strmm_r_up_tr_main_unit_alpha0_fulltile_hw,    fast_strmm_r_lo_nt_main_unit_alpha0_fulltile_hw,    fast_strmm_r_up_nt_main_unit_alpha0_fulltile_hw,    fast_strmm_l_lo_tr_main_unit_alpha0_fulltile_hw,    fast_strmm_l_up_tr_main_unit_alpha0_fulltile_hw,    fast_strmm_l_lo_nt_main_unit_alpha0_fulltile_hw,    fast_strmm_l_up_nt_main_unit_alpha0_fulltile_hw};/* * void  * cublasStrmm (char side, char uplo, char transa, char diag, int m, int n,  *              float alpha, const float *A, int lda, const float *B, int ldb) * * performs one of the matrix-matrix operations * *   B = alpha * op(A) * B,  or  B = alpha * B * op(A) * * where alpha is a single-precision scalar, B is an m x n matrix composed * of single precision elements, and A is a unit or non-unit, upper or lower,  * triangular matrix composed of single precision elements. op(A) is one of * *   op(A) = A  or  op(A) = transpose(A) * * Matrices A and B are stored in column major format, and lda and ldb are  * the leading dimensions of the two-dimensonials arrays that contain A and  * B, respectively. * * Input * ----- * side   specifies whether op(A) multiplies B from the left or right. *        If side = 'L' or 'l', then B = alpha * op(A) * B. If side = *        'R' or 'r', then B = alpha * B * op(A). * uplo   specifies whether the matrix A is an upper or lower triangular *        matrix. If uplo = 'U' or 'u', A is an upper triangular matrix. 
*        If uplo = 'L' or 'l', A is a lower triangular matrix. * transa specifies the form of op(A) to be used in the matrix  *        multiplication. If transa = 'N' or 'n', then op(A) = A. If *        transa = 'T', 't', 'C', or 'c', then op(A) = transpose(A). * diag   specifies whether or not A is unit triangular. If diag = 'U' *        or 'u', A is assumed to be unit triangular. If diag = 'N' or *        'n', A is not assumed to be unit triangular. * m      the number of rows of matrix B. m must be at least zero. * n      the number of columns of matrix B. n must be at least zero. * alpha  single precision scalar multiplier applied to op(A)*B, or *        B*op(A), respectively. If alpha is zero no accesses are made *        to matrix A, and no read accesses are made to matrix B. * A      single precision array of dimensions (lda, k). k = m if side = *        'L' or 'l', k = n if side = 'R' or 'r'. If uplo = 'U' or 'u' *        the leading k x k upper triangular part of the array A must *        contain the upper triangular matrix, and the strictly lower *        triangular part of A is not referenced. If uplo = 'L' or 'l' *        the leading k x k lower triangular part of the array A must *        contain the lower triangular matrix, and the strictly upper *        triangular part of A is not referenced. When diag = 'U' or 'u' *        the diagonal elements of A are not referenced and are assumed *        to be unity. * lda    leading dimension of A. When side = 'L' or 'l', it must be at *        least max(1,m) and at least max(1,n) otherwise * B      single precision array of dimensions (ldb, n). On entry, the  *        leading m x n part of the array contains the matrix B. It is *        overwritten with the transformed matrix on exit. * ldb    leading dimension of B. It must be at least max (1, m). 
* * Output * ------ * B      updated according to B = alpha * op(A) * B  or B = alpha * B * op(A) * * Reference: http://www.netlib.org/blas/strmm.f * * Error status for this function can be retrieved via cublasGetError(). * * Error Status * ------------ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized * CUBLAS_STATUS_INVALID_VALUE    if m or n < 0 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU */__host__ void CUBLASAPI cublasStrmm (char side, char uplo, char transa,                                     char diag, int m, int n, float alpha,                                     const float *A, int lda, float *B,                                     int ldb){    struct cublasContext *ctx = CUBLAS_GET_CTX();    struct cublasStrmmParams params;    cudaError_t cudaStat;    int info;    int lside = toupper(side) == 'L';    int upper, notrans, unit, nrowa;    int funcIdx;    int useFastImul;    int fullTilesOnly;    int usePureHwStepper;    dim3 ctaDimsHw (lside ? ((n+BLK-1)/BLK) : ((m+BLK-1)/BLK));    dim3 ctaDimsSw (CUBLAS_STRMM_CTAS);    if (!cublasInitialized (ctx)) {        cublasSetError (ctx, CUBLAS_STATUS_NOT_INITIALIZED);        return;    }    upper   = toupper(uplo)   == 'U';    notrans = toupper(transa) == 'N';    unit    = toupper(diag)   == 'U';    nrowa = (lside) ? 
m : n;    info = 0;    if ((!lside) && (toupper(side) != 'R')) {        info = 1;    }     else if ((!upper) && (toupper(uplo) != 'L')) {        info = 2;    }    else if ((!notrans) && (toupper(transa) != 'T') && (toupper(transa)!='C')){        info = 3;    }    else if ((unit) && (toupper(diag) != 'U')) {        info = 4;    }    else if (m < 0) {        info = 5;    }    else if (n < 0) {        info = 6;    }    else if (lda < imax (1, nrowa)) {        info = 9;    }    else if (ldb < imax (1, m)) {        info = 11;    }    if (info) {        cublasXerbla ("STRMM ", info);        cublasSetError (ctx, CUBLAS_STATUS_INVALID_VALUE);        return;    }    /* early out if nothing to do */    if ((m == 0) || (n == 0)) return;    params.lside = lside;    params.upper = upper;    params.notrans = notrans;    params.unit = unit;    params.m = m;    params.n = n;    params.alpha = alpha;    params.A = A;    params.lda = lda;    params.B = B;    params.ldb = ldb;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -