⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 strsm.cu

📁 Nividia提供的CUDA的BLAS库源码
💻 CU
📖 第 1 页 / 共 5 页
字号:
     * maximum CTA grid dimensions.     */    usePureHwStepper = (params.lside ? (n < (CUBLAS_CTA_MAX_DIM * BLK)) :                                       (m < (CUBLAS_CTA_MAX_DIM * BLK)));    /* We can eliminate checking for endcases if we know all tiles are fully     * populated. Important benchmark case!     */    fullTilesOnly = (((m % BLK) == 0) && ((n % BLK) == 0));    /* choose version using 24-bit multiplies if all dimensions are less than     * 2001, so we can guarantee that no multiplication result exceeds (2000 *     * 2000 * 4) < 2^24.     */    useFastImul = ((params.lda <= CUBLAS_FASTIMUL_F_MAX_DIM) &&                    (params.ldb <= CUBLAS_FASTIMUL_F_MAX_DIM) &&                    (params.m <= CUBLAS_FASTIMUL_F_MAX_DIM) &&                    (params.n <= CUBLAS_FASTIMUL_F_MAX_DIM));    funcIdx = ((params.nounit << 5) | (useFastImul << 4) |                ((params.alpha == 0.0f) << 3) | (fullTilesOnly << 2) |                (params.upper << 1) | params.notrans);    cudaStat = cudaGetLastError(); /* clear error status */    if (params.lside) {        if (usePureHwStepper) {            strsm_l_hw[funcIdx]<<<ctaDimsHw,CUBLAS_STRSM_THREAD_COUNT>>>(params);        } else {            strsm_l_sw[funcIdx]<<<ctaDimsSw,CUBLAS_STRSM_THREAD_COUNT>>>(params);        }    } else {        if (usePureHwStepper) {            strsm_r_hw[funcIdx]<<<ctaDimsHw,CUBLAS_STRSM_THREAD_COUNT>>>(params);        } else {            strsm_r_sw[funcIdx]<<<ctaDimsSw,CUBLAS_STRSM_THREAD_COUNT>>>(params);        }    }    cudaStat = cudaGetLastError(); /* check for launch error */    if (cudaStat != cudaSuccess) {        cublasSetError (ctx, CUBLAS_STATUS_EXECUTION_FAILED);    }}#if ((CUBLAS_STRSM_THREAD_COUNT<BLK))#error block dimension must be >= threadcount#endif#if ((CUBLAS_STRSM_THREAD_COUNT%BLK)!=0)#error threadcount and block dimensions do not divide evenly#endif#define A_NBR_COLS          (CUBLAS_STRSM_THREAD_COUNT/BLK)#define B_NBR_COLS          (CUBLAS_STRSM_THREAD_COUNT/BLK)#if (((BLK*BLK)%CUBLAS_STRSM_THREAD_COUNT)!=0)#error blocksize of A and B not evenly divided by threadcount!#endif#define A_ELEMS_PER_THREAD  ((BLK * BLK) / CUBLAS_STRSM_THREAD_COUNT)#define B_ELEMS_PER_THREAD  ((BLK * BLK) / CUBLAS_STRSM_THREAD_COUNT)__shared__ float AA[(BLK+1)*BLK];  // padded to avoid GRF bank conflicts__shared__ float BB[(BLK+1)*BLK];  // padded to avoid GRF bank conflicts__global__ void strsm_l_lo_nt_main_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   0#define LOWER             1#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   0#define LOWER             1#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   0#define LOWER             0#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   0#define LOWER             0#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_lo_nt_main_fulltile_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY #undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   1#define LOWER             1#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_fulltile_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   1#define LOWER             1#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_fulltile_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   1#define LOWER             0#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_fulltile_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     0#define FULL_TILES_ONLY   1#define LOWER             0#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_lo_nt_main_alpha0_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   0#define LOWER             1#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   0#define LOWER             1#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   0#define LOWER             0#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   0#define LOWER             0#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_lo_nt_main_fulltile_alpha0_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY #undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   1#define LOWER             1#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_fulltile_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   1#define LOWER             1#define TRANS             1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_fulltile_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   1#define LOWER             0#define TRANS             0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_fulltile_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT            0#define USE_MIXED_STEPPER 0#define FAST_IMUL         0#define ALPHA_IS_ZERO     1#define FULL_TILES_ONLY   1#define LOWER             0#define TRANS             1#include "strsm_l.h"

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -