📄 scal.s
字号:
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. 
*/
/*********************************************************************/

/*
 * scal kernel for IA-64 (Itanium): scale a strided vector in place,
 *
 *	x[i * INCX] = ALPHA * x[i * INCX]	for i = 0 .. N-1
 *
 * When ALPHA compares equal to zero the vector is not read at all;
 * zeros (f0) are stored instead.
 *
 * Inputs, as mapped by the defines below:
 *	f8  (ALPHA)	scale factor
 *	r32 (N)		element count; the routine returns at once if N <= 0
 *	r36 (X1)	address of the first element
 *	r37 (INCX)	stride in elements; converted to a byte stride on
 *			entry by shifting left by BASE_SHIFT
 *	r33-r35 are not referenced in this file.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LDFD, STFD, LDFPD, FMPY, SIZE and
 * BASE_SHIFT come from common.h.
 * NOTE(review): presumably they select single or double precision
 * forms according to DOUBLE, with SIZE == 1 << BASE_SHIFT - confirm
 * against common.h.
 *
 * Code layout:
 *	.L20  / .L30	ALPHA == 0: zero fill, 16 per trip, then remainder
 *	.L100		ALPHA != 0: common setup and dispatch
 *	.L110 / .L120	byte stride == SIZE: pair loads, modulo-scheduled
 *	.L310 / .L320	any other stride: scalar loads, modulo-scheduled
 *
 * ar.lc is saved in ARLC and restored on every exit path taken after it
 * has been written. PR holds a copy of the predicate register file; the
 * paths that use rotating predicates restore from it with mask -65474
 * (0x...FFFF003E), which selects p1-p5 and p16-p63.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distance in elements; it is multiplied by SIZE where used. */
#ifdef DOUBLE
#define PREFETCH_SIZE (7 * 32)
#else
#define PREFETCH_SIZE (1 * 64)
#endif

#define ALPHA	f8	/* scale factor */

#define N	r32	/* element count */
#define X1	r36	/* primary read pointer (store pointer in zero fill) */
#define INCX	r37	/* stride, in bytes after the first instruction */

#define X2	r14	/* second pointer, X1 + 4 * INCX */
#define Y1	r15	/* primary store pointer (ALPHA != 0 paths) */
#define Y2	r16	/* second store pointer, Y1 + 4 * INCX */
#define PRE1	r17	/* prefetch pointer */
#define I	r18	/* (N >> 4) - 1, loaded into ar.lc */
#define NAND15	r19	/* N & 15, remainder after the 16-wide loops */

#define INCX5	r20	/* 5 * INCX */
#define INCX16	r21	/* 16 * INCX */
#define XX	r22	/* original X1, target of the peeled first element */
#define PR	r30	/* saved predicate registers */
#define ARLC	r31	/* saved ar.lc */

	PROLOGUE
	.prologue
	PROFCODE

	/* Byte stride; p6 = ALPHA does not compare equal to 0. */
	{ .mfi
	shladd	INCX = INCX, BASE_SHIFT, r0
	fcmp.eq	p0, p6 = ALPHA, f0
	.save	ar.lc, ARLC
	mov	ARLC = ar.lc
	}
	.body
	/* p7 = (N <= 0): nothing to do. p10 = bit BASE_SHIFT of the
	   address of x is set.
	   NOTE(review): presumably this detects a start address that is
	   not aligned for the LDFPD pair loads - confirm. */
	{ .mib
	cmp.ge	p7, p0 = 0, N
	tbit.z	p0, p10 = X1, BASE_SHIFT
	(p7) br.ret.sptk.many b0
	}
	;;
	/* If p10, peel one element: load it, step X1, decrement N.
	   XX keeps the address of the peeled element for the store. */
	{ .mmi
	mov	XX = X1
	(p10) LDFD	f32 = [X1], INCX
	mov	PR = pr
	}
	{ .mmi
	shladd	INCX5 = INCX, 2, INCX
	shladd	INCX16 = INCX, 4, r0
	(p10) adds	N = -1, N
	}
	;;
	{ .mmi
	shladd	X2 = INCX, 2, X1
	nop.m 0
	mov	ar.ec = 5
	}
	{ .mmi
	and	NAND15 = 15, N
	nop.m 0
	shr	I = N, 4
	}
	;;
	/* p12 = bit 3 of N (a block of 8 remains after the main loop),
	   p9  = no remainder at all. */
	{ .mmi
	adds	I = -1, I
	nop.m 0
	tbit.z	p0, p12 = N, 3
	}
	{ .mmb
	cmp.ge	p9, p0 = 0, NAND15
	nop.m 0
	(p6) br.cond.dptk .L100		// if (alpha != 0) goto .L100
	}
	;;

	/* ---- ALPHA == 0: store zeros -------------------------------- */
	/* Zero the peeled element; p8 = (I < 0), no full block of 16. */
	{ .mmi
	(p10) STFD	[XX] = f0
	adds	PRE1 = PREFETCH_SIZE * SIZE + 32, X1
	mov	ar.lc = I
	}
	{ .mmb
	cmp.gt	p8, p0 = 0, I
	(p8) br.cond.dpnt .L30
	}
	;;
	.align 32

/* Zero 16 elements per trip. X1 covers elements 0-3 and 8-11, X2 covers
   4-7 and 12-15; the INCX5 steps hop over the other pointer's four. */
.L20:
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	lfetch.excl.nt1	[PRE1], INCX16
	add	X1 = INCX, X1
	add	X2 = INCX, X2
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	add	X1 = INCX, X1
	add	X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	add	X1 = INCX, X1
	add	X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	add	X1 = INCX5, X1
	add	X2 = INCX5, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	add	X1 = INCX, X1
	add	X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	add	X1 = INCX, X1
	add	X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmi
	add	X1 = INCX, X1
	add	X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD	[X1] = f0
	STFD	[X2] = f0
	nop.i 0
	}
	{.mmb
	add	X1 = INCX5, X1
	add	X2 = INCX5, X2
	br.cloop.sptk.few .L20
	}
	;;
	.align 16

/* Zero-fill remainder: p12 -> 8 elements (X1 and X2, four each), then
   p13 -> 4, p14 -> 2, p15 -> 1, all through X1. */
.L30:
	{ .mmi
	(p12) STFD	[X1] = f0
	(p12) STFD	[X2] = f0
	mov	ar.lc = ARLC
	}
	{ .mmb
	(p12) add	X1 = INCX, X1
	(p12) add	X2 = INCX, X2
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p12) STFD	[X1] = f0
	(p12) add	X1 = INCX, X1
	tbit.z	p0, p13 = N, 2
	}
	{ .mmi
	(p12) STFD	[X2] = f0
	(p12) add	X2 = INCX, X2
	tbit.z	p0, p14 = N, 1
	}
	;;
	{ .mmi
	(p12) STFD	[X1] = f0
	(p12) add	X1 = INCX, X1
	tbit.z	p0, p15 = N, 0
	}
	{ .mmb
	(p12) STFD	[X2] = f0
	(p12) add	X2 = INCX, X2
	nop.b 0
	}
	;;
	{ .mmb
	(p12) STFD	[X1] = f0
	(p12) add	X1 = INCX5, X1
	nop.b 0
	}
	{ .mmb
	(p12) STFD	[X2] = f0
	(p12) add	X2 = INCX5, X2
	nop.b 0
	}
	;;
	{ .mmb
	(p13) STFD	[X1] = f0
	(p13) add	X1 = INCX, X1
	nop.b 0
	}
	;;
	{ .mmb
	(p13) STFD	[X1] = f0
	(p13) add	X1 = INCX, X1
	nop.b 0
	}
	;;
	{ .mmb
	(p13) STFD	[X1] = f0
	(p13) add	X1 = INCX, X1
	nop.b 0
	}
	;;
	{ .mmb
	(p13) STFD	[X1] = f0
	(p13) add	X1 = INCX, X1
	nop.b 0
	}
	;;
	{ .mmb
	(p14) STFD	[X1] = f0
	(p14) add	X1 = INCX, X1
	nop.b 0
	}
	;;
	{ .mmb
	(p14) STFD	[X1] = f0
	(p14) add	X1 = INCX, X1
	nop.b 0
	}
	;;
	{ .mmb
	(p15) STFD	[X1] = f0
	nop.m 0
	br.ret.sptk.many b0
	}
	;;
	.align 32

	/* ---- ALPHA != 0 --------------------------------------------- */
/* Set up the store pointers, clear the rotating predicates, scale and
   store the peeled element (if any), then dispatch:
	p7 = byte stride differs from SIZE	-> .L300
	p8 = no full block of 16		-> .L120 (remainder only)
   p16 is set to start the first pipeline stage. */
.L100:
	{ .mmi
	mov	Y1 = X1
	shladd	Y2 = INCX, 2, X1
	mov	pr.rot= 0
	}
	{ .mmf
	adds	PRE1 = PREFETCH_SIZE * SIZE + 64, X1
	cmp.gt	p8, p0 = 0, I
	(p10) FMPY	f32 = ALPHA, f32
	}
	;;
	{ .mmi
	(p10) STFD	[XX] = f32
	cmp.eq	p0, p7 = SIZE, INCX
	mov	ar.lc = I
	}
	{ .mbb
	cmp.eq	p16, p0 = r0, r0
	(p7) br.cond.dpnt .L300
	(p8) br.cond.dpnt .L120
	}
	;;
	.align 32

/* Contiguous main loop, 16 elements per trip, ar.ec = 5.
	p16: eight LDFPD pair loads into rotating registers
	p20: multiply; each source register is its load target plus 4
	     (for example f32 is read as f36 four rotations later)
	p20: products of elements 0-7 (f112-f119) are stored four
	     instruction groups after they are computed
	p21: products of elements 8-15 (f6, f7, f10-f15) are stored at
	     the top of the following trip
   Y1 writes elements 0-3 and 8-11, Y2 writes 4-7 and 12-15. */
.L110:
	{ .mmf
	(p21) STFD	[Y1] = f6, 1 * SIZE
	(p21) STFD	[Y2] = f7, 1 * SIZE
	(p20) FMPY	f112 = ALPHA, f36
	}
	{ .mmf
	(p16) lfetch.excl.nt1	[PRE1], 16 * SIZE
	(p16) LDFPD	f32, f37 = [X1], 2 * SIZE
	(p20) FMPY	f113 = ALPHA, f56
	}
	;;
	{ .mmf
	(p21) STFD	[Y1] = f10, 1 * SIZE
	(p21) STFD	[Y2] = f11, 1 * SIZE
	(p20) FMPY	f114 = ALPHA, f41
	}
	{ .mfb
	(p16) LDFPD	f42, f47 = [X1], 2 * SIZE
	(p20) FMPY	f115 = ALPHA, f61
	nop.b 0
	}
	;;
	{ .mmf
	(p21) STFD	[Y1] = f12, 1 * SIZE
	(p21) STFD	[Y2] = f13, 1 * SIZE
	(p20) FMPY	f116 = ALPHA, f46
	}
	{ .mfb
	(p16) LDFPD	f52, f57 = [X1], 2 * SIZE
	(p20) FMPY	f117 = ALPHA, f66
	nop.b 0
	}
	;;
	{ .mmf
	(p21) STFD	[Y1] = f14, 5 * SIZE
	(p21) STFD	[Y2] = f15, 5 * SIZE
	(p20) FMPY	f118 = ALPHA, f51
	}
	{ .mfb
	(p16) LDFPD	f62, f67 = [X1], 2 * SIZE
	(p20) FMPY	f119 = ALPHA, f71
	nop.b 0
	}
	;;
	{ .mmf
	(p20) STFD	[Y1] = f112, 1 * SIZE
	(p20) STFD	[Y2] = f113, 1 * SIZE
	(p20) FMPY	f6 = ALPHA, f76
	}
	{ .mfb
	(p16) LDFPD	f72, f77 = [X1], 2 * SIZE
	(p20) FMPY	f7 = ALPHA, f96
	nop.b 0
	}
	;;
	{ .mmf
	(p20) STFD	[Y1] = f114, 1 * SIZE
	(p20) STFD	[Y2] = f115, 1 * SIZE
	(p20) FMPY	f10 = ALPHA, f81
	}
	{ .mfb
	(p16) LDFPD	f82, f87 = [X1], 2 * SIZE
	(p20) FMPY	f11 = ALPHA, f101
	nop.b 0
	}
	;;
	{ .mmf
	(p20) STFD	[Y1] = f116, 1 * SIZE
	(p20) STFD	[Y2] = f117, 1 * SIZE
	(p20) FMPY	f12 = ALPHA, f86
	}
	{ .mfb
	(p16) LDFPD	f92, f97 = [X1], 2 * SIZE
	(p20) FMPY	f13 = ALPHA, f106
	nop.b 0
	}
	;;
	{ .mmf
	(p20) STFD	[Y1] = f118, 5 * SIZE
	(p20) STFD	[Y2] = f119, 5 * SIZE
	(p20) FMPY	f14 = ALPHA, f91
	}
	{ .mfb
	(p16) LDFPD	f102, f107 = [X1], 2 * SIZE
	(p20) FMPY	f15 = ALPHA, f111
	br.ctop.sptk.few .L110
	}
	;;
	/* Drain: the last trip left elements 8-15 in f6, f7, f10-f15.
	   X2 is rebuilt from the advanced X1 for the remainder code. */
	{ .mmi
	STFD	[Y1] = f6, 1 * SIZE
	STFD	[Y2] = f7, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[Y1] = f10, 1 * SIZE
	STFD	[Y2] = f11, 1 * SIZE
	nop.i 0
	}
	;;
	STFD	[Y1] = f12, 1 * SIZE
	STFD	[Y2] = f13, 1 * SIZE
	shladd	X2 = INCX, 2, X1
	;;
	{ .mmi
	STFD	[Y1] = f14, 5 * SIZE
	STFD	[Y2] = f15, 5 * SIZE
	nop.i 0
	}
	;;
	.align 32

/* Contiguous remainder: restore predicates and ar.lc, return if p9,
   otherwise handle 8 (p12), 4 (p13), 2 (p14) and 1 (p15) elements. */
.L120:
	{ .mmi
	(p12) LDFPD	f32, f33 = [X1], 2 * SIZE
	(p12) LDFPD	f36, f37 = [X2], 2 * SIZE
	mov	pr = PR, -65474
	}
	;;
	{ .mmi
	(p12) LDFPD	f34, f35 = [X1]
	(p12) LDFPD	f38, f39 = [X2]
	mov	ar.lc = ARLC
	}
	{ .mib
	(p12) adds	X1 = 6 * SIZE,X1	// X1 is now 8 elements past its value at .L120
	tbit.z	p0, p13 = N, 2
	(p9) br.ret.sptk.many b0
	}
	;;
	(p13) LDFPD	f40, f41 = [X1], 2 * SIZE
	tbit.z	p0, p14 = N, 1
	;;
	(p13) LDFPD	f42, f43 = [X1], 2 * SIZE
	tbit.z	p0, p15 = N, 0
	;;
	(p14) LDFPD	f44, f45 = [X1], 2 * SIZE
	;;
	(p15) LDFD	f46 = [X1]
	;;
	(p12) FMPY	f32 = ALPHA, f32
	(p12) FMPY	f36 = ALPHA, f36
	(p12) FMPY	f33 = ALPHA, f33
	(p12) FMPY	f37 = ALPHA, f37
	(p12) FMPY	f34 = ALPHA, f34
	(p12) FMPY	f38 = ALPHA, f38
	(p12) FMPY	f35 = ALPHA, f35
	(p12) FMPY	f39 = ALPHA, f39
	;;
	(p12) STFD	[Y1] = f32, 1 * SIZE
	(p13) FMPY	f40 = ALPHA, f40
	(p12) STFD	[Y2] = f36, 1 * SIZE
	(p13) FMPY	f41 = ALPHA, f41
	;;
	(p12) STFD	[Y1] = f33, 1 * SIZE
	(p13) FMPY	f42 = ALPHA, f42
	(p12) STFD	[Y2] = f37, 1 * SIZE
	(p13) FMPY	f43 = ALPHA, f43
	;;
	(p12) STFD	[Y1] = f34, 1 * SIZE
	(p14) FMPY	f44 = ALPHA, f44
	(p12) STFD	[Y2] = f38, 1 * SIZE
	(p14) FMPY	f45 = ALPHA, f45
	;;
	(p12) STFD	[Y1] = f35, 5 * SIZE
	(p12) STFD	[Y2] = f39, 5 * SIZE
	(p15) FMPY	f46 = ALPHA, f46
	;;
	(p13) STFD	[Y1] = f40, 1 * SIZE
	;;
	(p13) STFD	[Y1] = f41, 1 * SIZE
	;;
	(p13) STFD	[Y1] = f42, 1 * SIZE
	;;
	(p13) STFD	[Y1] = f43, 1 * SIZE
	;;
	(p14) STFD	[Y1] = f44, 1 * SIZE
	;;
	(p14) STFD	[Y1] = f45, 1 * SIZE
	;;
	(p15) STFD	[Y1] = f46
	mov	pr = PR, -65474
	br.ret.sptk.many b0
	;;
	.align 32

/* General stride setup: one more pipeline stage than the contiguous
   loop (ar.ec = 6); skip the main loop if there is no full block. */
.L300:
	{ .mmi
	adds	PRE1 = PREFETCH_SIZE * SIZE + 64, X1
	nop.m 0
	mov.i	ar.ec = 6
	}
	{ .mmb
	cmp.gt	p8, p0 = 0, I
	nop.m 0
	(p8) br.cond.dpnt .L320
	}
	;;
	.align 32

/* General stride main loop, 16 elements per trip.
	p16: sixteen scalar loads, targets six registers apart
	p21: multiply; each source register is its load target plus 5
	     (f32 -> f37, f38 -> f43, ...); the products cycle through
	     f6, f7, f10-f15 and each is stored four groups later
	p22: products of elements 12-15 of the previous trip are stored
	     at the top of the loop
   Only Y1 is used as a store pointer here. */
.L310:
	{ .mmf
	(p16) lfetch.excl.nt1	[PRE1], INCX16
	(p16) LDFD	f32 = [X1], INCX
	(p21) FMPY	f6 = ALPHA, f37
	}
	{ .mmb
	(p22) STFD	[Y1] = f12
	(p22) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f38 = [X1], INCX
	(p21) FMPY	f7 = ALPHA, f43
	nop.b 0
	}
	{ .mmb
	(p22) STFD	[Y1] = f13
	(p22) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f44 = [X1], INCX
	(p21) FMPY	f10 = ALPHA, f49
	nop.b 0
	}
	{ .mmb
	(p22) STFD	[Y1] = f14
	(p22) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f50 = [X1], INCX
	(p21) FMPY	f11 = ALPHA, f55
	nop.b 0
	}
	{ .mmb
	(p22) STFD	[Y1] = f15
	(p22) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f56 = [X1], INCX
	(p21) FMPY	f12 = ALPHA, f61
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f6
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f62 = [X1], INCX
	(p21) FMPY	f13 = ALPHA, f67
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f7
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f68 = [X1], INCX
	(p21) FMPY	f14 = ALPHA, f73
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f10
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f74 = [X1], INCX
	(p21) FMPY	f15 = ALPHA, f79
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f11
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f80 = [X1], INCX
	(p21) FMPY	f6 = ALPHA, f85
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f12
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f86 = [X1], INCX
	(p21) FMPY	f7 = ALPHA, f91
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f13
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f92 = [X1], INCX
	(p21) FMPY	f10 = ALPHA, f97
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f14
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f98 = [X1], INCX
	(p21) FMPY	f11 = ALPHA, f103
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f15
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f104 = [X1], INCX
	(p21) FMPY	f12 = ALPHA, f109
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f6
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f110 = [X1], INCX
	(p21) FMPY	f13 = ALPHA, f115
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f7
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f116 = [X1], INCX
	(p21) FMPY	f14 = ALPHA, f121
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f10
	(p21) add	Y1 = INCX, Y1
	nop.b 0
	}
	;;
	{ .mfb
	(p16) LDFD	f122 = [X1], INCX
	(p21) FMPY	f15 = ALPHA, f127
	nop.b 0
	}
	{ .mmb
	(p21) STFD	[Y1] = f11
	(p21) add	Y1 = INCX, Y1
	br.ctop.sptk.few .L310
	}
	;;
	/* Drain elements 12-15 of the last trip. Y2 and X2 are rebuilt
	   from X1, which already points past the last full block. */
	STFD	[Y1] = f12
	add	Y1 = INCX, Y1
	shladd	Y2 = INCX, 2, X1
	;;
	STFD	[Y1] = f13
	add	Y1 = INCX, Y1
	shladd	X2 = INCX, 2, X1
	;;
	STFD	[Y1] = f14
	add	Y1 = INCX, Y1
	;;
	STFD	[Y1] = f15
	add	Y1 = INCX, Y1
	;;
	.align 16

/* General stride remainder: restore ar.lc and predicates, return if p9,
   otherwise handle 8 (p12, split over X1/X2 and Y1/Y2), 4 (p13),
   2 (p14) and 1 (p15) elements. */
.L320:
	{ .mmi
	(p12) LDFD	f48 = [X1], INCX
	(p12) LDFD	f52 = [X2], INCX
	mov	ar.lc = ARLC
	}
	;;
	{ .mmi
	(p12) LDFD	f49 = [X1], INCX
	(p12) LDFD	f53 = [X2], INCX
	mov	pr = PR, -65474
	}
	{ .mmb
	nop.m 0
	nop.m 0
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p12) LDFD	f50 = [X1], INCX
	(p12) LDFD	f54 = [X2], INCX
	tbit.z	p0, p13 = N, 2
	}
	;;
	{ .mmi
	(p12) LDFD	f51 = [X1], INCX5
	(p12) LDFD	f55 = [X2], INCX5
	tbit.z	p0, p14 = N, 1
	}
	;;
	(p13) LDFD	f56 = [X1], INCX
	tbit.z	p0, p15 = N, 0
	;;
	(p13) LDFD	f57 = [X1], INCX
	;;
	{ .mfi
	(p13) LDFD	f58 = [X1], INCX
	(p12) FMPY	f48 = ALPHA, f48
	}
	{ .mfi
	(p12) FMPY	f52 = ALPHA, f52
	}
	;;
	{ .mfi
	(p13) LDFD	f59 = [X1], INCX
	(p12) FMPY	f49 = ALPHA, f49
	}
	{ .mfi
	(p12) FMPY	f53 = ALPHA, f53
	}
	;;
	{ .mfi
	(p14) LDFD	f60 = [X1], INCX
	(p12) FMPY	f50 = ALPHA, f50
	}
	{ .mfi
	(p12) FMPY	f54 = ALPHA, f54
	}
	;;
	{ .mfi
	(p14) LDFD	f61 = [X1], INCX
	(p12) FMPY	f51 = ALPHA, f51
	}
	{ .mfi
	(p12) FMPY	f55 = ALPHA, f55
	}
	;;
	{ .mmf
	(p12) STFD	[Y1] = f48
	(p12) STFD	[Y2] = f52
	(p13) FMPY	f56 = ALPHA, f56
	}
	{ .mmi
	(p15) LDFD	f62 = [X1]
	(p12) add	Y1 = INCX, Y1
	(p12) add	Y2 = INCX, Y2
	}
	;;
	{ .mmf
	(p12) STFD	[Y1] = f49
	(p12) STFD	[Y2] = f53
	(p13) FMPY	f57 = ALPHA, f57
	}
	{ .mmi
	(p12) add	Y1 = INCX, Y1
	(p12) add	Y2 = INCX, Y2
	nop.i 0
	}
	;;
	{ .mmf
	(p12) STFD	[Y1] = f50
	(p12) STFD	[Y2] = f54
	(p13) FMPY	f58 = ALPHA, f58
	}
	{ .mmi
	(p12) add	Y1 = INCX, Y1
	(p12) add	Y2 = INCX, Y2
	nop.i 0
	}
	;;
	{ .mmf
	(p12) STFD	[Y1] = f51
	(p12) STFD	[Y2] = f55
	(p13) FMPY	f59 = ALPHA, f59
	}
	{ .mmi
	(p12) add	Y1 = INCX5, Y1
	(p12) add	Y2 = INCX5, Y2
	nop.i 0
	}
	;;
	{ .mfi
	(p13) STFD	[Y1] = f56
	(p14) FMPY	f60 = ALPHA, f60
	(p13) add	Y1 = INCX, Y1
	}
	;;
	{ .mfi
	(p13) STFD	[Y1] = f57
	(p14) FMPY	f61 = ALPHA, f61
	(p13) add	Y1 = INCX, Y1
	}
	;;
	{ .mfi
	(p13) STFD	[Y1] = f58
	(p15) FMPY	f62 = ALPHA, f62
	(p13) add	Y1 = INCX, Y1
	}
	;;
	{ .mmi
	(p13) STFD	[Y1] = f59
	(p13) add	Y1 = INCX, Y1
	}
	;;
	{ .mmi
	(p14) STFD	[Y1] = f60
	(p14) add	Y1 = INCX, Y1
	}
	;;
	{ .mmi
	(p14) STFD	[Y1] = f61
	(p14) add	Y1 = INCX, Y1
	}
	;;
	{ .mib
	(p15) STFD	[Y1] = f62
	mov	pr = PR, -65474
	br.ret.sptk.many b0
	}
	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -