⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 strmm.cu

📁 NVIDIA提供的CUDA的BLAS库源码
💻 CU
📖 第 1 页 / 共 5 页
字号:
    /* choose HW-only stepping if columns in result matrix do not exceed the
     * maximum CTA grid dimensions.
     */
    usePureHwStepper = ((m < (CUBLAS_CTA_MAX_DIM * BLK)) &&
                        (n < (CUBLAS_CTA_MAX_DIM * BLK)));

    /* We can eliminate checking for endcases if we know all tiles are fully
     * populated. Important benchmark case!
     */
    fullTilesOnly = (((m % BLK) == 0) && ((n % BLK) == 0));

    /* choose version using 24-bit multiplies if all dimensions are less than
     * 2001, so we can guarantee that no multiplication result exceeds (2000 *
     * 2000 * 4) < 2^24.
     */
    useFastImul = ((params.lda <= CUBLAS_FASTIMUL_F_MAX_DIM) &&
                   (params.ldb <= CUBLAS_FASTIMUL_F_MAX_DIM) &&
                   (params.m <= CUBLAS_FASTIMUL_F_MAX_DIM) &&
                   (params.n <= CUBLAS_FASTIMUL_F_MAX_DIM));

    /* Build the index into the kernel dispatch tables. Bit layout:
     *   bit 6: fullTilesOnly   bit 5: useFastImul   bit 4: alpha == 0
     *   bit 3: params.unit     bit 2: params.lside  bit 1: params.notrans
     *   bit 0: params.upper
     */
    funcIdx = ((fullTilesOnly << 6) | (useFastImul << 5) |
               ((params.alpha == 0.0f) << 4) |
               (params.unit << 3) | (params.lside << 2) |
               (params.notrans << 1) | params.upper);

    cudaStat = cudaGetLastError(); /* clear error status */
    if (usePureHwStepper) {
        strmm_hw[funcIdx]<<<ctaDimsHw,CUBLAS_STRMM_THREAD_COUNT>>>(params);
    } else {
        strmm_sw[funcIdx]<<<ctaDimsSw,CUBLAS_STRMM_THREAD_COUNT>>>(params);
    }
    cudaStat = cudaGetLastError(); /* check for launch error */
    if (cudaStat != cudaSuccess) {
        cublasSetError (ctx, CUBLAS_STATUS_EXECUTION_FAILED);
    }
}

__shared__ float AA[(BLK+1)*BLK];  // padded to avoid GRF bank conflicts
__shared__ float BB[(BLK+1)*BLK];  // padded to avoid GRF bank conflicts

/*
 * Kernel instantiations.
 *
 * Every kernel below has the same shape: it resets the seven configuration
 * macros (USE_MIXED_STEPPER, FULL_TILES_ONLY, FAST_IMUL, ALPHA0, UNIT, LOWER,
 * TRANS), defines them for one specific variant, and then textually includes
 * the shared kernel body from "strmm_l.h" or "strmm_r.h". All kernels in this
 * section use USE_MIXED_STEPPER=1, FULL_TILES_ONLY=0 and FAST_IMUL=0; they
 * differ only in ALPHA0, UNIT, LOWER, TRANS and in which header is included.
 *
 * Preprocessor directives must each start on their own line, so the
 * one-directive-per-line layout here is required, not cosmetic.
 */

/* strmm_l.h body with ALPHA0=0, UNIT=0, LOWER=0, TRANS=0 */
__global__ void strmm_l_up_nt_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
/* Must be spelled ALPHA0 (a misspelling here would leave ALPHA0 undefined
 * after the #undef above, unlike every sibling instantiation). */
#define ALPHA0            0
#define UNIT              0
#define LOWER             0
#define TRANS             0
#include "strmm_l.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=0, LOWER=1, TRANS=0 */
__global__ void strmm_l_lo_nt_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             1
#define TRANS             0
#include "strmm_l.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=0, LOWER=0, TRANS=1 */
__global__ void strmm_l_up_tr_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             0
#define TRANS             1
#include "strmm_l.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=0, LOWER=1, TRANS=1 */
__global__ void strmm_l_lo_tr_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             1
#define TRANS             1
#include "strmm_l.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=0, LOWER=0, TRANS=0 */
__global__ void strmm_r_up_nt_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             0
#define TRANS             0
#include "strmm_r.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=0, LOWER=1, TRANS=0 */
__global__ void strmm_r_lo_nt_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             1
#define TRANS             0
#include "strmm_r.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=0, LOWER=0, TRANS=1 */
__global__ void strmm_r_up_tr_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             0
#define TRANS             1
#include "strmm_r.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=0, LOWER=1, TRANS=1 */
__global__ void strmm_r_lo_tr_main_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              0
#define LOWER             1
#define TRANS             1
#include "strmm_r.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=1, LOWER=0, TRANS=0 */
__global__ void strmm_l_up_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             0
#define TRANS             0
#include "strmm_l.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=1, LOWER=1, TRANS=0 */
__global__ void strmm_l_lo_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             1
#define TRANS             0
#include "strmm_l.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=1, LOWER=0, TRANS=1 */
__global__ void strmm_l_up_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             0
#define TRANS             1
#include "strmm_l.h"
}

/* strmm_l.h body with ALPHA0=0, UNIT=1, LOWER=1, TRANS=1 */
__global__ void strmm_l_lo_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             1
#define TRANS             1
#include "strmm_l.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=1, LOWER=0, TRANS=0 */
__global__ void strmm_r_up_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             0
#define TRANS             0
#include "strmm_r.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=1, LOWER=1, TRANS=0 */
__global__ void strmm_r_lo_nt_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             1
#define TRANS             0
#include "strmm_r.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=1, LOWER=0, TRANS=1 */
__global__ void strmm_r_up_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             0
#define TRANS             1
#include "strmm_r.h"
}

/* strmm_r.h body with ALPHA0=0, UNIT=1, LOWER=1, TRANS=1 */
__global__ void strmm_r_lo_tr_main_unit_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            0
#define UNIT              1
#define LOWER             1
#define TRANS             1
#include "strmm_r.h"
}

/* strmm_l.h body with ALPHA0=1, UNIT=0, LOWER=0, TRANS=0 */
__global__ void strmm_l_up_nt_main_alpha0_sw (struct cublasStrmmParams parms)
{
#undef  USE_MIXED_STEPPER
#undef  FULL_TILES_ONLY
#undef  FAST_IMUL
#undef  ALPHA0
#undef  UNIT
#undef  LOWER
#undef  TRANS
#define USE_MIXED_STEPPER 1
#define FULL_TILES_ONLY   0
#define FAST_IMUL         0
#define ALPHA0            1
#define UNIT              0
#define LOWER             0
#define TRANS             0
#include "strmm_l.h"
}

__global__ void strmm_l_lo_nt_main_alpha0_sw (struct cublasStrmmParams parms) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -