📄 strsm.cu
字号:
* maximum CTA grid dimensions. */ usePureHwStepper = (params.lside ? (n < (CUBLAS_CTA_MAX_DIM * BLK)) : (m < (CUBLAS_CTA_MAX_DIM * BLK))); /* We can eliminate checking for endcases if we know all tiles are fully * populated. Important benchmark case! */ fullTilesOnly = (((m % BLK) == 0) && ((n % BLK) == 0)); /* choose version using 24-bit multiplies if all dimensions are less than * 2001, so we can guarantee that no multiplication result exceeds (2000 * * 2000 * 4) < 2^24. */ useFastImul = ((params.lda <= CUBLAS_FASTIMUL_F_MAX_DIM) && (params.ldb <= CUBLAS_FASTIMUL_F_MAX_DIM) && (params.m <= CUBLAS_FASTIMUL_F_MAX_DIM) && (params.n <= CUBLAS_FASTIMUL_F_MAX_DIM)); funcIdx = ((params.nounit << 5) | (useFastImul << 4) | ((params.alpha == 0.0f) << 3) | (fullTilesOnly << 2) | (params.upper << 1) | params.notrans); cudaStat = cudaGetLastError(); /* clear error status */ if (params.lside) { if (usePureHwStepper) { strsm_l_hw[funcIdx]<<<ctaDimsHw,CUBLAS_STRSM_THREAD_COUNT>>>(params); } else { strsm_l_sw[funcIdx]<<<ctaDimsSw,CUBLAS_STRSM_THREAD_COUNT>>>(params); } } else { if (usePureHwStepper) { strsm_r_hw[funcIdx]<<<ctaDimsHw,CUBLAS_STRSM_THREAD_COUNT>>>(params); } else { strsm_r_sw[funcIdx]<<<ctaDimsSw,CUBLAS_STRSM_THREAD_COUNT>>>(params); } } cudaStat = cudaGetLastError(); /* check for launch error */ if (cudaStat != cudaSuccess) { cublasSetError (ctx, CUBLAS_STATUS_EXECUTION_FAILED); }}#if ((CUBLAS_STRSM_THREAD_COUNT<BLK))#error block dimension must be >= threadcount#endif#if ((CUBLAS_STRSM_THREAD_COUNT%BLK)!=0)#error threadcount and block dimensions do not divide evenly#endif#define A_NBR_COLS (CUBLAS_STRSM_THREAD_COUNT/BLK)#define B_NBR_COLS (CUBLAS_STRSM_THREAD_COUNT/BLK)#if (((BLK*BLK)%CUBLAS_STRSM_THREAD_COUNT)!=0)#error blocksize of A and B not evenly divided by threadcount!#endif#define A_ELEMS_PER_THREAD ((BLK * BLK) / CUBLAS_STRSM_THREAD_COUNT)#define B_ELEMS_PER_THREAD ((BLK * BLK) / CUBLAS_STRSM_THREAD_COUNT)__shared__ float AA[(BLK+1)*BLK]; // padded to avoid GRF bank conflicts__shared__ float BB[(BLK+1)*BLK]; // padded to avoid GRF bank conflicts__global__ void strsm_l_lo_nt_main_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 0#define LOWER 1#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 0#define LOWER 1#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 0#define LOWER 0#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 0#define LOWER 0#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_lo_nt_main_fulltile_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY #undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 1#define LOWER 1#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_fulltile_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 1#define LOWER 1#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_fulltile_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 1#define LOWER 0#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_fulltile_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 0#define FULL_TILES_ONLY 1#define LOWER 0#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_lo_nt_main_alpha0_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 0#define LOWER 1#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 0#define LOWER 1#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 0#define LOWER 0#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 0#define LOWER 0#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_lo_nt_main_fulltile_alpha0_hw (struct cublasStrsmParams parms) {#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY #undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 1#define LOWER 1#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_lo_tr_main_fulltile_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 1#define LOWER 1#define TRANS 1#include "strsm_l.h"}__global__ void strsm_l_up_nt_main_fulltile_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 1#define LOWER 0#define TRANS 0#include "strsm_l.h"}__global__ void strsm_l_up_tr_main_fulltile_alpha0_hw (struct cublasStrsmParams parms){#undef NOUNIT#undef USE_MIXED_STEPPER#undef FAST_IMUL#undef ALPHA_IS_ZERO#undef FULL_TILES_ONLY#undef LOWER#undef TRANS#define NOUNIT 0#define USE_MIXED_STEPPER 0#define FAST_IMUL 0#define ALPHA_IS_ZERO 1#define FULL_TILES_ONLY 1#define LOWER 0#define TRANS 1#include "strsm_l.h"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -