📄 strmm.cu
字号:
/* choose HW-only stepping if columns in result matrix do not exceed the
     * maximum CTA grid dimensions.
     */
    usePureHwStepper = ((m < (CUBLAS_CTA_MAX_DIM * BLK)) &&
                        (n < (CUBLAS_CTA_MAX_DIM * BLK)));

    /* We can eliminate checking for endcases if we know all tiles are fully
     * populated. Important benchmark case!
     */
    fullTilesOnly = (((m % BLK) == 0) && ((n % BLK) == 0));

    /* choose version using 24-bit multiplies if all dimensions are less than
     * 2001, so we can guarantee that no multiplication result exceeds (2000 *
     * 2000 * 4) < 2^24.
     */
    useFastImul = ((params.lda <= CUBLAS_FASTIMUL_F_MAX_DIM) &&
                   (params.ldb <= CUBLAS_FASTIMUL_F_MAX_DIM) &&
                   (params.m <= CUBLAS_FASTIMUL_F_MAX_DIM) &&
                   (params.n <= CUBLAS_FASTIMUL_F_MAX_DIM));

    /* Pack the variant selectors into the dispatch-table index:
     *   bit 6: fullTilesOnly      bit 5: useFastImul
     *   bit 4: alpha == 0.0f      bit 3: params.unit
     *   bit 2: params.lside       bit 1: params.notrans
     *   bit 0: params.upper
     */
    funcIdx = ((fullTilesOnly << 6) |
               (useFastImul << 5) |
               ((params.alpha == 0.0f) << 4) |
               (params.unit << 3) |
               (params.lside << 2) |
               (params.notrans << 1) |
               params.upper);

    cudaStat = cudaGetLastError(); /* clear error status */
    if (usePureHwStepper) {
        strmm_hw[funcIdx]<<<ctaDimsHw,CUBLAS_STRMM_THREAD_COUNT>>>(params);
    } else {
        strmm_sw[funcIdx]<<<ctaDimsSw,CUBLAS_STRMM_THREAD_COUNT>>>(params);
    }
    cudaStat = cudaGetLastError(); /* check for launch error */
    if (cudaStat != cudaSuccess) {
        cublasSetError (ctx, CUBLAS_STATUS_EXECUTION_FAILED);
    }
}

__shared__ float AA[(BLK+1)*BLK]; // padded to avoid GRF bank conflicts
__shared__ float BB[(BLK+1)*BLK]; // padded to avoid GRF bank conflicts

/*
 * Kernel instantiations.
 *
 * None of the kernels below has a hand-written body. Each one resets the
 * configuration macros, defines them for exactly one variant, and then
 * textually includes strmm_l.h or strmm_r.h as its body.
 *
 * As can be read off the definitions that follow, the kernel name mirrors
 * the macro settings:
 *   "_l_" / "_r_"   body comes from strmm_l.h / strmm_r.h
 *   "up"  / "lo"    LOWER 0 / LOWER 1
 *   "nt"  / "tr"    TRANS 0 / TRANS 1
 *   "_unit"         UNIT 1   (otherwise UNIT 0)
 *   "_alpha0"       ALPHA0 1 (otherwise ALPHA0 0)
 *   "_sw"           USE_MIXED_STEPPER 1
 * All variants in this section use FULL_TILES_ONLY 0 and FAST_IMUL 0.
 */

/* ---- USE_MIXED_STEPPER 1, UNIT 0, ALPHA0 0 ---- */

__global__ void strmm_l_up_nt_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
/* The name must be spelled ALPHA0 exactly: a misspelling here would leave
 * ALPHA0 undefined after the #undef above and define a stray macro instead.
 */
#define ALPHA0 0
#define UNIT 0
#define LOWER 0
#define TRANS 0
#include "strmm_l.h"
}

__global__ void strmm_l_lo_nt_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 1
#define TRANS 0
#include "strmm_l.h"
}

__global__ void strmm_l_up_tr_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 0
#define TRANS 1
#include "strmm_l.h"
}

__global__ void strmm_l_lo_tr_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 1
#define TRANS 1
#include "strmm_l.h"
}

__global__ void strmm_r_up_nt_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 0
#define TRANS 0
#include "strmm_r.h"
}

__global__ void strmm_r_lo_nt_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 1
#define TRANS 0
#include "strmm_r.h"
}

__global__ void strmm_r_up_tr_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 0
#define TRANS 1
#include "strmm_r.h"
}

__global__ void strmm_r_lo_tr_main_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 0
#define LOWER 1
#define TRANS 1
#include "strmm_r.h"
}

/* ---- USE_MIXED_STEPPER 1, UNIT 1, ALPHA0 0 ---- */

__global__ void strmm_l_up_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 0
#define TRANS 0
#include "strmm_l.h"
}

__global__ void strmm_l_lo_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 1
#define TRANS 0
#include "strmm_l.h"
}

__global__ void strmm_l_up_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 0
#define TRANS 1
#include "strmm_l.h"
}

__global__ void strmm_l_lo_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 1
#define TRANS 1
#include "strmm_l.h"
}

__global__ void strmm_r_up_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 0
#define TRANS 0
#include "strmm_r.h"
}

__global__ void strmm_r_lo_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 1
#define TRANS 0
#include "strmm_r.h"
}

__global__ void strmm_r_up_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 0
#define TRANS 1
#include "strmm_r.h"
}

__global__ void strmm_r_lo_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 0
#define UNIT 1
#define LOWER 1
#define TRANS 1
#include "strmm_r.h"
}

/* ---- USE_MIXED_STEPPER 1, UNIT 0, ALPHA0 1 ---- */

__global__ void strmm_l_up_nt_main_alpha0_sw (struct cublasStrmmParams parms)
{
#undef USE_MIXED_STEPPER
#undef FULL_TILES_ONLY
#undef FAST_IMUL
#undef ALPHA0
#undef UNIT
#undef LOWER
#undef TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY 0
#define FAST_IMUL 0
#define ALPHA0 1
#define UNIT 0
#define LOWER 0
#define TRANS 0
#include "strmm_l.h"
}

__global__ void strmm_l_lo_nt_main_alpha0_sw (struct cublasStrmmParams parms) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -