📄 zgemm_kernel.s
字号:
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. *//*********************************************************************/#define ASSEMBLER#include "common.h"#ifdef DOUBLE#define PREFETCHSIZE (16 * 8)#else#define PREFETCHSIZE (32 * 8)#endif#define CPREFETCHSIZE 7#define CPREFETCH lfetch.excl.nt1#define M r32#define N r33#define K r34#define A r37#define B r38#define C r39#define LDC r35#define I r15#define J r16#define AOFFSET r17#define BOFFSET r18#define TEMP r19#define L r20#define C1 r21#define C2 r22#define C3 r23#define C4 r24#define C5 r25#define C6 r26#define C7 r27#define C8 r28#define PREA r8#define PREB r9#define PREC r10#define SP r12#define ARLC r29#define PR r30#define ARPFS r31#define ALPHA_R f8#define ALPHA_I f9#define AORIG loc0#define KK loc1#define KK8 loc2#define OFFSET loc3#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR)#define FCALC_A FSUB#define FCALC_B FADD#define FMA_A FNMA#define FMA_B FMA#else#define FCALC_A FADD#define FCALC_B FSUB#define FMA_A FMA#define FMA_B FNMA#endif#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NC) || defined(TC) || defined(NR) || defined(TR)#define FCALC_C FMA#define FCALC_D FNMA#else#define FCALC_C FNMA#define FCALC_D FMA#endif PROLOGUE .prologue PROFCODE { .mfi#ifdef TRMMKERNEL .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 8, 0, 0#else nop __LINE__#endif mov f64 = f0 adds r14 = 16, SP } { .mfi nop __LINE__ mov f65 = f0 adds r15 = 24, SP } ;; { .mfi ld8 LDC = [r14] mov f81 = f0 mov PR = pr } { .mfi#ifdef TRMMKERNEL ld8 OFFSET = [r15]#else nop __LINE__#endif mov f96 = f0 shr J = N, 2 } ;; { .mfi shladd LDC = LDC, ZBASE_SHIFT, r0 mov f97 = f0 mov AOFFSET = A } { .mfi nop __LINE__ mov f113 = f0#if defined(TRMMKERNEL) && !defined(LEFT) sub KK = r0, OFFSET#endif } ;; .body { .mfi nop __LINE__ mov f80 = f0 mov ARLC = ar.lc } { .mfb cmp.ge p6, p0 = 0, J mov f112 = f0 (p6) br.cond.dpnt .L050 } ;; .align 16.L010: { .mmi mov C1 = C // coffset1 = c + 0 * ldc add C2 = LDC, C // coffset2 = c + 1 * ldc shr I = M, 2 } { .mmi adds J = -1, J#if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET#else nop __LINE__#endif nop __LINE__ } ;; { .mmi shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc#ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0#else nop __LINE__#endif } { .mib cmp.eq p6, p7 = 0, I shladd C = LDC, 2, C // coffset += 8 * ldc (p6) br.cond.dpnt .L020 } ;; .align 16.L011:#if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f67 = f0 nop __LINE__ } ;;#else { .mfi shladd BOFFSET = KK8, 2, B mov f66 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 nop __LINE__ } ;;#endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0#ifndef TRMMKERNEL nop __LINE__#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK#elif defined(LEFT) adds L = 4, KK#else adds L = 4, KK#endif#endif } { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0#ifndef TRMMKERNEL adds L = 1, K#else adds L = 1, L#endif } { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 adds C5 = 4 * SIZE, C1 } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f115 = f0 adds C6 = 4 * SIZE, C2 } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f68 = f0 shr L = L, 1 } { .mfi setf.d f86 = r0 mov f69 = f0 adds C7 = 4 * SIZE, C3 } ;; { .mfi CPREFETCH [PREC], LDC mov f84 = f0 adds L = -1, L } { .mfi setf.d f87 = r0 mov f85 = f0 adds C8 = 4 * SIZE, C4 } ;; { .mfi CPREFETCH [PREC], LDC mov f100 = f0 mov ar.lc = L } { .mfi setf.d f102 = r0 mov f101 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f116 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } { .mfi setf.d f103 = r0 mov f117 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC] mov f70 = f0 nop __LINE__ } { .mmf setf.d f118 = r0 setf.d f119 = r0 mov f71 = f0 } ;; .align 16.L012:/* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA_B f65 = f32, f49, f65 // A1 * B2 nop __LINE__ } ;;/* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;;/* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;;/* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;;/* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;;/* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;;/* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;;/* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;;/* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;;/* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;;/* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;;/* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;;/* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;;/* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;;/* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;;/* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f69 = f36, f49, f69 // A5 * B2 nop __LINE__ } ;;/* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f85 = f36, f51, f85 // A5 * B4 nop __LINE__ } ;;/* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f101 = f36, f53, f101 // A5 * B6 nop __LINE__ } ;;/* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f117 = f36, f55, f117 // A5 * B8 nop __LINE__ } ;;/* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f68 = f37, f49, f68 // A6 * B2 nop __LINE__ } ;;/* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f84 = f37, f51, f84 // A6 * B4 nop __LINE__ } ;;/* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f100 = f37, f53, f100 // A6 * B6 nop __LINE__ } ;;/* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f116 = f37, f55, f116 // A6 * B8 nop __LINE__ } ;;/* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f71 = f38, f49, f71 // A7 * B2 nop __LINE__ } ;;/* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f87 = f38, f51, f87 // A7 * B4 nop __LINE__ } ;;/* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f103 = f38, f53, f103 // A7 * B6 nop __LINE__ } ;;/* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f119 = f38, f55, f119 // A7 * B8 nop __LINE__ } ;;/* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f70 = f39, f49, f70 // A8 * B2 nop __LINE__ } ;;/* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f86 = f39, f51, f86 // A8 * B4 nop __LINE__ } ;;/* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f102 = f39, f53, f102 // A8 * B6 nop __LINE__ } ;;/* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f118 = f39, f55, f118 // A8 * B8 nop __LINE__ } ;;/* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;;/* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;;/* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;;/* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;;/* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;;/* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;;/* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;;/* 40 */ { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1], SIZE#else nop __LINE__#endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C5], SIZE#else nop __LINE__#endif (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1], SIZE#else nop __LINE__#endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C5], SIZE#else nop __LINE__#endif (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;;/* 42 */ { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1], SIZE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -