📄 trsm_kernel_ln.s
字号:
/*********************************************************************/
/* */
/* Optimized BLAS libraries */
/* By Kazushige Goto <kgoto@tacc.utexas.edu> */
/* */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of */
/* profits, interruption of business, or related expenses which may */
/* arise from use of Software or Documentation, including but not */
/* limited to those resulting from defects in Software and/or */
/* Documentation, or loss or inaccuracy of data of any kind. 
*//*********************************************************************/#define ASSEMBLER#include "common.h"#ifdef DOUBLE#define PREFETCHSIZE (16 * 8)#else#define PREFETCHSIZE (32 * 8)#endif#define CPREFETCHSIZE -7#define CPREFETCH lfetch.excl.nt1#define M r32#define N r33#define K r34#define A r36#define B r37#define C r38#define LDC r39#define I r15#define J r16#define AOFFSET r17#define BOFFSET r18#define TEMP r19#define L r20#define C1 r21#define C2 r22#define C3 r23#define C4 r24#define C5 r25#define C6 r26#define C7 r27#define C8 r28#define C9 loc0#define C10 loc1#define C11 loc2#define C12 loc3#define C13 loc4#define C14 loc5#define C15 loc6#define C16 loc7#define PREA r8#define PREB r9#define PREC r10#define SP r12#define ARLC r29#define PR r30#define ARPFS r31#define ALPHA f8#define AORIG loc8#define KK loc9#define KK8 loc10#define OFFSET loc11#define AOFFSET2 loc12#define BOFFSET2 loc13 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -6 * 16, SP adds r9 = -5 * 16, SP adds SP = -6 * 16, SP } ;; { .mmi setf.sig f32 = M setf.sig f33 = K mov PR = pr } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 shr J = N, 3 } ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shladd LDC = LDC, BASE_SHIFT, r0 } ;; { .mmi stf.spill [r8] = f20 stf.spill [r9] = f21 mov AOFFSET = A } ;; .body { .mmf ld8 OFFSET = [r14] cmp.ge p6, p0 = 0, J xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 shladd C = M, BASE_SHIFT, C nop __LINE__ } ;; { .mmb shladd A = r2, BASE_SHIFT, A nop __LINE__ (p6) br.cond.dpnt .L050 } ;; .align 8.L000: { .mmf mov C1 = C add KK = M, OFFSET } { .mmi mov AORIG = A add C2 = LDC, C shladd C3 = LDC, 1, C } ;; { .mmf shladd C5 = LDC, 2, C shladd C = LDC, 3, C } { .mmf shladd C4 = LDC, 1, C2 shladd C6 = LDC, 2, C2 } ;; { .mfi shladd C7 = LDC, 2, C3 shladd C8 = LDC, 2, C4 } ;; ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f96 = f0 mov 
f104 = f0 mov f112 = f0 mov f120 = f0.L040: { .mib sub L = K, KK tbit.z p6, p0 = M, 0 (p6) br.cond.dptk .L030 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; { .mfi shladd BOFFSET = r3, 3, B sub AORIG = AORIG, r2 } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L048 } ;;.L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA 
f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ br.cloop.sptk.few .L042 } ;;.L048:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -1, KK#else adds r2 = -8, KK#endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;;#if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;;#else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f96 = f36, f96 FSUB f104 = f37, f104 FSUB f112 = f38, f112 FSUB f120 = f39, f120 ;;#endif#ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f96 = f96, f32 FMPY f72 = f72, f32 FMPY f104 = f104, f32 FMPY f80 = f80, f32 FMPY f112 = f112, f32 FMPY f88 = f88, f32 FMPY f120 = f120, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD 
[BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -1 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; adds C3 = -1 * SIZE, C3 adds C4 = -1 * SIZE, C4 adds C5 = -1 * SIZE, C5 adds C6 = -1 * SIZE, C6 adds C7 = -1 * SIZE, C7 adds C8 = -1 * SIZE, C8 ;;#endif#ifdef LT LDFD f32 = [AOFFSET] ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, -3 * SIZE } { .mfi STFD [BOFFSET2] = f120, -3 * SIZE } ;;#endif#ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD 
f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FNMA f96 = f64, f36, f96 ;; FNMA f104 = f64, f37, f104 ;; FNMA f112 = f64, f38, f112 ;; FNMA f120 = f64, f39, f120 ;; FMPY f72 = f72, f40 ;; FNMA f80 = f72, f41, f80 ;; FNMA f88 = f72, f42, f88 ;; FNMA f96 = f72, f43, f96 ;; FNMA f104 = f72, f44, f104 ;; FNMA f112 = f72, f45, f112 ;; FNMA f120 = f72, f46, f120 ;; FMPY f80 = f80, f47 ;; FNMA f88 = f80, f48, f88 ;; FNMA f96 = f80, f49, f96 ;; FNMA f104 = f80, f50, f104 ;; FNMA f112 = f80, f51, f112 ;; FNMA f120 = f80, f52, f120 ;; FMPY f88 = f88, f53 ;; FNMA f96 = f88, f54, f96 ;; FNMA f104 = f88, f55, f104 ;; FNMA f112 = f88, f56, f112 ;; FNMA f120 = f88, f57, f120 ;; FMPY f96 = f96, f58 ;; FNMA f104 = f96, f59, f104 ;; FNMA f112 = f96, f60, f112 ;; FNMA f120 = f96, f61, f120 ;; FMPY f104 = f104, f16 ;; FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 ;; FMPY f120 = f120, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;;#endif#ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = 
[BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 ;; FNMA f112 = f120, f33, f112 ;; FNMA f104 = f120, f34, f104 ;; FNMA f96 = f120, f35, f96 ;; FNMA f88 = f120, f36, f88 ;; FNMA f80 = f120, f37, f80
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -