symv_u.s
来自「Optimized GotoBLAS libraries」· S 代码 · 共 447 行
S
447 行
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * IA-64 kernel, preprocessed with cpp (one directive / instruction per
 * line is required for it to preprocess and assemble).
 *
 * Judging by the file name (symv_u) and the arithmetic below, this
 * appears to be the symmetric matrix-vector update y += alpha * A * x
 * reading only the upper triangle of A -- confirm against the caller.
 *
 * What the code visibly does:
 *   - Walks A in panels of four columns (A1..A4).  For panel start IS:
 *       * .L22 handles rows 0 .. IS-1 of the panel: it accumulates
 *         xsum1..4 += A(row, col) * x[row] and simultaneously updates
 *         y[row] += (alpha * x[col]) * A(row, col).
 *       * .L28 handles the 4x4 diagonal block using only entries on or
 *         above the diagonal, then adds alpha-scaled sums to y[IS..IS+3].
 *   - Returns 0 in r8.
 *
 * NOTE(review): limitations visible in this file
 *   - .L10 / .L20 are empty: X and Y are used in place whatever INCX /
 *     INCY are, and BUFFER is loaded but never referenced afterwards.
 *   - .L30 is empty: when M is not a multiple of 4 the trailing
 *     M % 4 rows/columns are not processed by this code.
 *   - f16-f23 are spilled and refilled but not otherwise used here.
 *   - LDFPD is a project macro; if it expands to a paired load with an
 *     alignment requirement, A and LDA must satisfy it -- TODO confirm.
 */

#define ASSEMBLER
#include "common.h"

#define SP	r12

/* Incoming stacked registers as named by this file. */
#define M	r32
#define A	r34
#define LDA	r35
#define X	r36
#define INCX	r37
#define Y	r38
#define INCY	r39
#define BUFFER	r33		/* overwritten on entry by a load from [SP + 16] */

/* Integer work registers. */
#define I	r14		/* trip count for .L22 (IS / 4) */
#define IS	r15		/* first row/column index of the current panel */
#define A1	r16		/* running pointers into the four panel columns */
#define A2	r17
#define A3	r18
#define A4	r19
#define NEW_X	r20
#define NEW_Y	r21
#define XX	r22		/* x read pointer inside .L22 */
#define YY	r23		/* y read pointer */
#define TEMP	r24
#define YYS	r25		/* y write pointer */

#define PREA1	loc0		/* prefetch pointers for the four columns */
#define PREA2	loc1
#define PREA3	loc2
#define PREA4	loc3

#define A11	loc4		/* defined but not referenced below */
#define A21	loc5
#define A31	loc6
#define A41	loc7

#define PREX	r8
#define PREY	r9

/* Copies of ar.lc, pr and ar.pfs taken in the prologue. */
#define ARLC	r29
#define PR	r30
#define ARPFS	r31

/* Prefetch distance, in elements. */
#ifdef DOUBLE
#define RPREFETCH	(16 * 3 +  4)
#else
#define RPREFETCH	(16 * 3 + 16)
#endif
#define PREFETCH	lfetch.nt1
#define PREFETCHW	lfetch.excl.nt1

#define alpha	f8

/* alpha * x[IS .. IS+3] for the current panel. */
#define atemp1	f6
#define atemp2	f7
#define atemp3	f10
#define atemp4	f11

/* Per-column dot-product accumulators for the current panel. */
#define xsum1	f12
#define xsum2	f13
#define xsum3	f14
#define xsum4	f15

	PROLOGUE
	.prologue
	PROFCODE
	{ .mmi
	.save	ar.pfs, ARPFS
	alloc	ARPFS = ar.pfs, 8, 16, 8, 0
	mov	ARLC  = ar.lc
	}
	;;
	mov	PR = pr
	adds	r14 = 16, SP		/* address of BUFFER, taken before SP moves */
	;;
	/* Reserve 8 * 16 bytes and spill f16-f23 there, two streams of 32-byte stride. */
	adds	r8 = -8 * 16, SP
	adds	r9 = -7 * 16, SP
	adds	SP = -8 * 16, SP
	;;
	stf.spill  [r8] = f16, 32
	stf.spill  [r9] = f17, 32
	;;
	stf.spill  [r8] = f18, 32
	stf.spill  [r9] = f19, 32
	;;
	stf.spill  [r8] = f20, 32
	stf.spill  [r9] = f21, 32
	;;
	stf.spill  [r8] = f22
	stf.spill  [r9] = f23
	.body
	;;
	ld8	BUFFER = [r14]
	;;
	/* Scale LDA / INCX / INCY by 2^BASE_SHIFT. */
	shladd	LDA  = LDA,  BASE_SHIFT, r0
	shladd	INCX = INCX, BASE_SHIFT, r0
	shladd	INCY = INCY, BASE_SHIFT, r0
	;;
	cmp.ge	p7, p0 = 0, M		/* M <= 0: nothing to do */
	;;
	(p7) br.cond.dpnt .L999
	;;
	mov	NEW_X = X
	cmp.eq	p10, p0 = SIZE, INCX
	(p10) br.cond.dptk .L10
	;;
	/* NOTE(review): no code here; both outcomes of the test reach .L10. */

.L10:
	mov	NEW_Y = Y
	cmp.eq	p10, p0 = SIZE, INCY
	(p10) br.cond.dptk .L20
	;;
	/* NOTE(review): no code here; both outcomes of the test reach .L20. */

.L20:
	mov	IS = 0
	cmp.gt	p10, p0 = 4, M		/* fewer than four columns: skip the panel loop */
	(p10) br.cond.dpnt .L30
	;;

/* ---- One panel of four columns starting at column IS ---------------- */
.L21:
	mov	A1 = A
	add	A2 = LDA, A
	;;
	shladd	A3 = LDA, 1, A
	shladd	A4 = LDA, 1, A2
	shladd	A  = LDA, 2, A		/* A advances to the next panel */
	;;
	;;
	adds	PREX  = RPREFETCH * SIZE, NEW_X
	adds	PREY  = RPREFETCH * SIZE, NEW_Y
	adds	PREA1 = RPREFETCH * SIZE, A1
	adds	PREA2 = RPREFETCH * SIZE, A2
	adds	PREA3 = RPREFETCH * SIZE, A3
	adds	PREA4 = RPREFETCH * SIZE, A4
	;;
	shladd	TEMP = IS, BASE_SHIFT, NEW_X	/* TEMP = &x[IS] */
	;;
	LDFD	atemp1 = [TEMP], 1 * SIZE
	;;
	LDFD	atemp2 = [TEMP], 1 * SIZE
	;;
	LDFD	atemp3 = [TEMP], 1 * SIZE
	;;
	LDFD	atemp4 = [TEMP], 1 * SIZE
	;;
	FMPY	atemp1 = alpha, atemp1
	FMPY	atemp2 = alpha, atemp2
	FMPY	atemp3 = alpha, atemp3
	FMPY	atemp4 = alpha, atemp4
	;;
	mov	xsum1 = f0
	mov	xsum2 = f0
	mov	xsum3 = f0
	mov	xsum4 = f0
	;;
	mov	XX  = NEW_X
	mov	YY  = NEW_Y
	mov	YYS = NEW_Y
	;;
	shr	I = IS, 2		/* number of 4-row groups above the diagonal block */
	mov	pr.rot = 0
	;;
	mov	ar.ec = 3
	cmp.eq	p16, p0 = r0, r0	/* enable the first pipeline stage */
	;;
	cmp.eq	p6, p0 = 0, I
	adds	I = -1, I
	;;
	mov	ar.lc = I
	(p6) br.cond.dpnt .L28		/* no rows above the diagonal block */
	;;

/*
 * Software-pipelined loop, four rows per iteration.
 *   (p16) loads of A (pairs), y and three x values, plus prefetches
 *   (p17) load of the fourth x value
 *   (p18) FMAs into xsum1..4 and into the four y values
 *   (p19) stores of the updated y values
 * ar.ec is 3, so the loop drains through the (p18) stage only; the last
 * group of (p19) stores is issued explicitly after the loop.
 */
	.align 16
.L22:
	{ .mmf
	(p16) LDFPD	f32, f35 = [A1], 2 * SIZE
	(p19) STFD	[YYS] = f95, 1 * SIZE
	(p18) FMA	xsum1 = f82, f34, xsum1
	}
	{ .mmf
	(p18) FMA	f94 = atemp1, f34, f94
	}
	;;
	{ .mmf
	(p17) LDFD	f90 = [XX], 1 * SIZE
	(p18) FMA	xsum2 = f82, f46, xsum2
	}
	{ .mmf
	(p18) FMA	f98 = atemp1, f37, f98
	}
	;;
	{ .mmf
	(p16) LDFPD	f44, f47 = [A2], 2 * SIZE
	(p19) STFD	[YYS] = f99, 1 * SIZE
	(p18) FMA	xsum3 = f82, f58, xsum3
	}
	{ .mmf
	(p18) FMA	f102 = atemp1, f40, f102
	}
	;;
	{ .mmf
	(p16) PREFETCHW	[PREY], 4 * SIZE
	(p16) LDFD	f92 = [YY], 1 * SIZE
	(p18) FMA	xsum4 = f82, f70, xsum4
	}
	{ .mmf
	(p18) FMA	f106 = atemp1, f43, f106
	}
	;;
	{ .mmf
	(p16) LDFPD	f56, f59 = [A3], 2 * SIZE
	(p19) STFD	[YYS] = f103, 1 * SIZE
	(p18) FMA	xsum1 = f85, f37, xsum1
	}
	{ .mmf
	(p18) FMA	f94 = atemp2, f46, f94
	}
	;;
	{ .mmf
	(p16) LDFD	f96 = [YY], 1 * SIZE
	(p18) FMA	xsum2 = f85, f49, xsum2
	}
	{ .mmf
	(p18) FMA	f98 = atemp2, f49, f98
	}
	;;
	{ .mmf
	(p16) LDFPD	f68, f71 = [A4], 2 * SIZE
	(p19) STFD	[YYS] = f107, 1 * SIZE
	(p18) FMA	xsum3 = f85, f61, xsum3
	}
	{ .mmf
	(p18) FMA	f102 = atemp2, f52, f102
	}
	;;
	{ .mmf
	(p16) LDFD	f100 = [YY], 1 * SIZE
	(p18) FMA	xsum4 = f85, f73, xsum4
	}
	{ .mmf
	(p18) FMA	f106 = atemp2, f55, f106
	}
	;;
	{ .mmf
	(p16) PREFETCH	[PREA1], 4 * SIZE
	(p16) LDFPD	f38, f41 = [A1], 2 * SIZE
	(p18) FMA	xsum1 = f88, f40, xsum1
	}
	{ .mmf
	(p18) FMA	f94 = atemp3, f58, f94
	}
	;;
	{ .mmf
	(p16) LDFD	f104 = [YY], 1 * SIZE
	(p18) FMA	xsum2 = f88, f52, xsum2
	}
	{ .mmf
	(p18) FMA	f98 = atemp3, f61, f98
	}
	;;
	{ .mmf
	(p16) PREFETCH	[PREA2], 4 * SIZE
	(p16) LDFPD	f50, f53 = [A2], 2 * SIZE
	(p18) FMA	xsum3 = f88, f64, xsum3
	}
	{ .mmf
	(p18) FMA	f102 = atemp3, f64, f102
	}
	;;
	{ .mmf
	(p16) PREFETCH	[PREX], 4 * SIZE
	(p16) LDFD	f80 = [XX], 1 * SIZE
	(p18) FMA	xsum4 = f88, f76, xsum4
	}
	{ .mmf
	(p18) FMA	f106 = atemp3, f67, f106
	}
	;;
	{ .mmf
	(p16) PREFETCH	[PREA3], 4 * SIZE
	(p16) LDFPD	f62, f65 = [A3], 2 * SIZE
	(p18) FMA	xsum1 = f91, f43, xsum1
	}
	{ .mmf
	(p18) FMA	f94 = atemp4, f70, f94
	}
	;;
	{ .mmf
	(p16) LDFD	f83 = [XX], 1 * SIZE
	(p18) FMA	xsum2 = f91, f55, xsum2
	}
	{ .mmf
	(p18) FMA	f98 = atemp4, f73, f98
	}
	;;
	{ .mmf
	(p16) PREFETCH	[PREA4], 4 * SIZE
	(p16) LDFPD	f74, f77 = [A4], 2 * SIZE
	(p18) FMA	xsum3 = f91, f67, xsum3
	}
	{ .mmf
	(p18) FMA	f102 = atemp4, f76, f102
	}
	;;
	{ .mmf
	(p16) LDFD	f86 = [XX], 1 * SIZE
	(p18) FMA	xsum4 = f91, f79, xsum4
	}
	{ .mfb
	(p18) FMA	f106 = atemp4, f79, f106
	br.ctop.sptk.few .L22
	}
	;;
	/* Flush the y values of the last row group (see the note above .L22). */
	(p19) STFD	[YYS] = f95, 1 * SIZE
	;;
	(p19) STFD	[YYS] = f99, 1 * SIZE
	;;
	(p19) STFD	[YYS] = f103, 1 * SIZE
	;;
	(p19) STFD	[YYS] = f107, 1 * SIZE
	;;
	;;

/* ---- 4x4 diagonal block of the panel -------------------------------- */
	.align 16
.L28:
	/* The sums so far are unscaled; atemp1..4 already carry alpha. */
	FMPY	xsum1 = alpha, xsum1
	FMPY	xsum2 = alpha, xsum2
	FMPY	xsum3 = alpha, xsum3
	FMPY	xsum4 = alpha, xsum4
	;;
	/* Load the block row by row: f64+4*r+c = row r of column A(c+1). */
	LDFD	f64 = [A1], 1 * SIZE
	LDFD	f65 = [A2], 1 * SIZE
	LDFD	f66 = [A3], 1 * SIZE
	LDFD	f67 = [A4], 1 * SIZE
	;;
	LDFD	f68 = [A1], 1 * SIZE
	LDFD	f69 = [A2], 1 * SIZE
	LDFD	f70 = [A3], 1 * SIZE
	LDFD	f71 = [A4], 1 * SIZE
	;;
	LDFD	f72 = [A1], 1 * SIZE
	LDFD	f73 = [A2], 1 * SIZE
	LDFD	f74 = [A3], 1 * SIZE
	LDFD	f75 = [A4], 1 * SIZE
	;;
	LDFD	f76 = [A1], 1 * SIZE
	LDFD	f77 = [A2], 1 * SIZE
	LDFD	f78 = [A3], 1 * SIZE
	LDFD	f79 = [A4], 1 * SIZE
	;;
	/*
	 * Only entries with row <= column are used (f64, f65, f66, f67, f69,
	 * f70, f71, f74, f75, f79); each off-diagonal one is used twice.
	 */
	FMA	xsum1 = atemp1, f64, xsum1
	FMA	xsum2 = atemp1, f65, xsum2
	FMA	xsum3 = atemp1, f66, xsum3
	FMA	xsum4 = atemp1, f67, xsum4
	;;
	FMA	xsum1 = atemp2, f65, xsum1
	FMA	xsum2 = atemp2, f69, xsum2
	FMA	xsum3 = atemp2, f70, xsum3
	FMA	xsum4 = atemp2, f71, xsum4
	;;
	FMA	xsum1 = atemp3, f66, xsum1
	FMA	xsum2 = atemp3, f70, xsum2
	FMA	xsum3 = atemp3, f74, xsum3
	FMA	xsum4 = atemp3, f75, xsum4
	;;
	FMA	xsum1 = atemp4, f67, xsum1
	FMA	xsum2 = atemp4, f71, xsum2
	FMA	xsum3 = atemp4, f75, xsum3
	FMA	xsum4 = atemp4, f79, xsum4
	;;
	/* y[IS .. IS+3] += xsum1..4 */
	LDFD	f36 = [YY], 1 * SIZE
	;;
	LDFD	f37 = [YY], 1 * SIZE
	;;
	LDFD	f38 = [YY], 1 * SIZE
	;;
	LDFD	f39 = [YY], 1 * SIZE
	;;
	FADD	f36 = f36, xsum1
	FADD	f37 = f37, xsum2
	FADD	f38 = f38, xsum3
	FADD	f39 = f39, xsum4
	;;
	STFD	[YYS] = f36, 1 * SIZE
	;;
	STFD	[YYS] = f37, 1 * SIZE
	;;
	STFD	[YYS] = f38, 1 * SIZE
	;;
	STFD	[YYS] = f39, 1 * SIZE
	;;
	adds	IS = 4, IS
	;;
	adds	TEMP = 4, IS
	;;
	cmp.le	p6, p0 = TEMP, M	/* another full panel of four fits */
	;;
	(p6) br.cond.dpnt .L21
	;;

/* NOTE(review): no remainder handling between .L30 and the epilogue. */
.L30:

.L990:

/* ---- Epilogue: return 0, restore f16-f23, ar.lc, pr, ar.pfs and SP --- */
.L999:
	mov	r8 = r0
	adds	r9 = 1 * 16, SP
	;;
	ldf.fill  f16 = [SP], 32
	ldf.fill  f17 = [r9], 32
	mov	ar.lc = ARLC
	;;
	ldf.fill  f18 = [SP], 32
	ldf.fill  f19 = [r9], 32
	mov	pr = PR, -1
	;;
	ldf.fill  f20 = [SP], 32
	ldf.fill  f21 = [r9], 32
	mov	ar.pfs = ARPFS
	;;
	ldf.fill  f22 = [SP], 32	/* fourth post-increment brings SP back to its entry value */
	ldf.fill  f23 = [r9]
	br.ret.sptk.many b0
	;;
	EPILOGUE
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?