📄 ztrsm_kernel_rt.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind. 
*//*********************************************************************/#define ASSEMBLER#include "common.h"#ifdef DOUBLE#define PREFETCHSIZE (16 * 8)#else#define PREFETCHSIZE (32 * 8)#endif#ifndef LN#define CPREFETCHSIZE 7#else#define CPREFETCHSIZE -8#endif#define CPREFETCH lfetch.excl.nt1#define M r32#define N r33#define K r34#define A r37#define B r38#define C r39#define LDC r35#define I r15#define J r16#define AOFFSET r17#define BOFFSET r18#define TEMP r19#define L r20#define C1 r21#define C2 r22#define C3 r23#define C4 r24#define C5 r25#define C6 r26#define C7 r27#define C8 r28#define PREA r8#define PREB r9#define PREC r10#define SP r12#define ARLC r29#define PR r30#define ARPFS r31#define ALPHA_R f8#define ALPHA_I f9#define AORIG loc0#define KK loc1#define KK8 loc2#define OFFSET loc3#define AOFFSET2 loc4#define BOFFSET2 loc5#ifndef CONJ#define FCALC_A FSUB#define FCALC_B FADD#define FMA_A FNMA#define FMA_B FMA#else#define FCALC_A FADD#define FCALC_B FSUB#define FMA_A FMA#define FMA_B FNMA#endif#ifndef CONJ#define FCALC_C FMA#define FCALC_D FNMA#else#define FCALC_C FNMA#define FCALC_D FMA#endif#ifndef CONJ#define FMA_C FNMA#define FMA_D FMA#define FSUB_A FSUB#else#define FMA_C FMA#define FMA_D FMS#define FSUB_A FADD#endif PROLOGUE .prologue PROFCODE { .mfi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 8, 0, 0 mov f64 = f0 adds r14 = 16, SP } { .mfi nop __LINE__ mov f65 = f0 adds r15 = 24, SP } ;; { .mfi ld8 LDC = [r14] mov f81 = f0 mov PR = pr } { .mfi ld8 OFFSET = [r15] mov f96 = f0 } ;; { .mfi shladd LDC = LDC, ZBASE_SHIFT, r0 mov f97 = f0 } { .mfi nop __LINE__ mov f113 = f0 } ;;#ifdef LN { .mmi setf.sig f32 = M setf.sig f33 = K shladd C = M, ZBASE_SHIFT, C } ;; {.mmf nop __LINE__ nop __LINE__ xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 ;; nop __LINE__ shladd A = r2, ZBASE_SHIFT, A } ;;#endif#ifdef RN sub KK = r0, OFFSET#endif#ifdef RT { .mmi setf.sig f32 = N setf.sig f33 = K nop __LINE__ } ;; { .mmi setf.sig f34 = LDC nop __LINE__ nop __LINE__ } ;; 
{ .mmf nop __LINE__ nop __LINE__ xmpy.l f33 = f32, f33 } { .mmf nop __LINE__ sub KK = N, OFFSET xmpy.l f34 = f32, f34 } ;; { .mmi getf.sig r2 = f33 getf.sig r3 = f34 } ;; shladd B = r2, ZBASE_SHIFT, B add C = r3, C#endif ;; .body { .mfi nop __LINE__ mov f80 = f0 mov ARLC = ar.lc } { .mfb mov f112 = f0 } ;; ;; shr I = M, 2 tbit.z p6, p0 = N, 0 (p6) br.cond.dpnt .L050 ;;#ifdef RT { .mmi shl r2 = K, ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, LDC nop __LINE__ } ;;#endif mov C1 = C#ifdef LN add KK = M, OFFSET#elif defined LT mov KK = OFFSET#else nop __LINE__#endif ;;#if defined(LN) || defined(RT) mov AORIG = A#else mov AOFFSET = A#endif ;;#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif ;; { .mib cmp.eq p6, p7 = 0, I#ifndef RT add C = LDC, C#else nop __LINE__#endif (p6) br.cond.dpnt .L100 } ;; .align 16.L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;;#if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;;#else { .mfi add BOFFSET = r3, B mov f66 = f0#ifdef LN sub AORIG = AORIG, r2#else nop __LINE__#endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;;#endif ;; (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = 1, L ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC] } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov ar.lc = L } { .mmi adds C5 = 4 * SIZE, C1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 ;; .align 16.L093:/* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, 
f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f34, f48, f80 // A3 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f81 = f34, f49, f81 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f36, f48, f96 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f97 = f36, f49, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f38, f48, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f38, f49, f113 // A7 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f81 = f35, f48, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f35, f49, f80 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f97 = f37, f48, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f37, f49, f96 // A6 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f113 = f39, f48, f113 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f39, f49, f112 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f42, f56, f80 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f44, f56, f96 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f46, f56, f112 // A7 * B1 nop __LINE__ } { 
.mfb nop __LINE__ (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f43, f56, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f45, f56, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f47, f56, f113 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 br.cloop.sptk.few .L093 } ;;.L098:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -4, KK#else adds r2 = -1, KK#endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B ;; #endif#if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;;#else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f74, f80 FSUB f81 = f75, f81 FSUB f96 = f88, f96 FSUB f97 = f89, f97 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;;#endif#ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds 
AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;;#endif#ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA 
f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;;#endif#ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -