📄 saxpy.s
字号:
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. *//*********************************************************************/#define ASSEMBLER#include "common.h"#define PREFETCHSIZE 64 * 8#define N r32#define X r36#define INCX r37#define Y r38#define INCY r39#define PRE1 r2#define PRE2 r3#define I r14#define J r15#define Y1 r16#define Y2 r17#define X1 r18#define X2 r19#define INCX16 r20#define INCY16 r21#define YYY r25#define YY r27#define XA r28#define XB r29#define PR r30#define ARLC r31 #define ALPHA f8#define ALPHA_P f9 PROLOGUE .prologue PROFCODE { .mii shladd INCX = INCX, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc tbit.nz p10, p0 = X, BASE_SHIFT } { .mfb cmp.lt p0, p6 = r0, N fcmp.eq p7, p0 = ALPHA, f0 (p6) br.ret.sptk.many b0 } ;; .body { .mmi (p10) LDFD f32 = [X], INCX shladd INCY = INCY, BASE_SHIFT, r0 mov PR = pr } { .mib (p10) adds N = -1, N mov YYY = Y (p7) br.ret.sptk.many b0 } ;; { .mmi (p10) LDFD f33 = [Y], INCY cmp.ne p13, p0 = SIZE, INCX shr XA = X, 2 } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 nop.i 0 } ;; { .mii mov Y1 = Y tbit.nz p11, p0 = Y, BASE_SHIFT shr XB = Y, 2 } ;; { .mmf and XA = 0x3f, XA and XB = 0x3f, XB (p10) FMA f32 = ALPHA, f32, f33 } ;; { .mmi sub XA = XB, XA shladd Y2 = INCY, 2, Y mov pr.rot = 0x10000 } { .mbb cmp.ne p14, p0 = SIZE, INCY (p13) br.cond.dpnt .L100 (p14) br.cond.dpnt .L100 } ;; { .mmi cmp.gt p14, p0 = r0, XA ;; and J = 15, N shr I = N, 4 } { .mfb (p14) adds XA = 64, XA fpack ALPHA_P = f8, f8 (p11) br.cond.dpnt .L30 } ;; { .mmi cmp.gt p14, p0 = 32, XA cmp.lt p15, p0 = 58, XA mov ar.ec = 3 } { .mmi and J = 31, N cmp.eq p16, p0 = r0, r0 shr I = N, 5 } ;; { .mmi cmp.eq p9, p0 = r0, J cmp.eq p7 ,p0 = 0, I adds I = -1, I } { .mbb nop.m 0 (p14) br.cond.dpnt .L20 (p15) br.cond.dpnt .L20 } ;; { .mmi (p10) STFD [YYY] = f32 adds PRE1 = PREFETCHSIZE * SIZE, X mov ar.lc = I } { .mib adds PRE2 = (PREFETCHSIZE - 24) * SIZE, Y tbit.z p0, p11 = N, 4 (p7) br.cond.dpnt .L15 } ;; .align 32.L12:/* 0 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p16) lfetch.nt1 [PRE1], 32 * SIZE (p18) fpma f12 = ALPHA_P, f46, f94 } { .mmi (p16) ldf8 f32 = [X], 2 * SIZE (p16) ldf8 f80 = [Y], 2 * SIZE } ;;/* 1 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE (p18) fpma f13 = ALPHA_P, f49, f97 } { .mmi (p16) ldf8 f35 = [X], 2 * SIZE (p16) ldf8 f83 = [Y], 2 * SIZE } ;;/* 2 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f52, f100 } { .mmi (p16) ldf8 f38 = [X], 2 * SIZE (p16) ldf8 f86 = [Y], 2 * SIZE } ;;/* 3 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f55, f103 } { .mmi (p16) ldf8 f41 = [X], 2 * SIZE (p16) ldf8 f89 = [Y], 2 * SIZE } ;;/* 4 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p18) fpma f6 = ALPHA_P, f58, f106 } { .mmi (p16) ldf8 f44 = [X], 2 * SIZE (p16) ldf8 f92 = [Y], 2 * SIZE } ;;/* 5 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p18) fpma f7 = ALPHA_P, f61, f109 } { .mmi (p16) ldf8 f47 = [X], 2 * SIZE (p16) ldf8 f95 = [Y], 2 * SIZE } ;;/* 6 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p18) fpma f10 = ALPHA_P, f64, f112 } { .mmi (p16) ldf8 f50 = [X], 2 * SIZE (p16) ldf8 f98 = [Y], 2 * SIZE } ;;/* 7 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p18) fpma f11 = ALPHA_P, f67, f115 } { .mmi (p16) ldf8 f53 = [X], 2 * SIZE (p16) ldf8 f101 = [Y], 2 * SIZE } ;;/* 8 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p18) fpma f12 = ALPHA_P, f70, f118 } { .mmi (p16) ldf8 f56 = [X], 2 * SIZE (p16) ldf8 f104 = [Y], 2 * SIZE } ;;/* 9 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p18) fpma f13 = ALPHA_P, f73, f121 } { .mmi (p16) ldf8 f59 = [X], 2 * SIZE (p16) ldf8 f107 = [Y], 2 * SIZE } ;;/* 10 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f76, f124 } { .mmi (p16) ldf8 f62 = [X], 2 * SIZE (p16) ldf8 f110 = [Y], 2 * SIZE } ;;/* 11 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f79, f127 } { .mmi (p16) ldf8 f65 = [X], 2 * SIZE (p16) ldf8 f113 = [Y], 2 * SIZE } ;;/* 12 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p17) fpma f6 = ALPHA_P, f33, f81 } { .mmi (p16) ldf8 f68 = [X], 2 * SIZE (p16) ldf8 f116 = [Y], 2 * SIZE } ;;/* 13 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p17) fpma f7 = ALPHA_P, f36, f84 } { .mmi (p16) ldf8 f71 = [X], 2 * SIZE (p16) ldf8 f119 = [Y], 2 * SIZE } ;;/* 14 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p17) fpma f10 = ALPHA_P, f39, f87 } { .mmi (p16) ldf8 f74 = [X], 2 * SIZE (p16) ldf8 f122 = [Y], 2 * SIZE } ;;/*15 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p17) fpma f11 = ALPHA_P, f42, f90 } { .mmb (p16) ldf8 f77 = [X], 2 * SIZE (p16) ldf8 f125 = [Y], 2 * SIZE br.ctop.sptk.few .L12 } ;; .align 32.L15: { .mmi (p11) ldf8 f32 = [X], 2 * SIZE (p11) ldf8 f33 = [Y], 2 * SIZE mov pr = PR, -65474 } ;; { .mmi (p11) ldf8 f34 = [X], 2 * SIZE (p11) ldf8 f35 = [Y], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p11) ldf8 f36 = [X], 2 * SIZE (p11) ldf8 f37 = [Y], 2 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p11) ldf8 f38 = [X], 2 * SIZE (p11) ldf8 f39 = [Y], 2 * SIZE tbit.z p0, p12 = N, 3 } ;; { .mmi (p11) ldf8 f40 = [X], 2 * SIZE (p11) ldf8 f41 = [Y], 2 * SIZE tbit.z p0, p13 = N, 2 } ;; { .mmi (p11) ldf8 f42 = [X], 2 * SIZE (p11) ldf8 f43 = [Y], 2 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmf (p11) ldf8 f44 = [X], 2 * SIZE (p11) ldf8 f45 = [Y], 2 * SIZE (p11) fpma f6 = ALPHA_P, f32, f33 } ;; { .mmf (p11) ldf8 f46 = [X], 2 * SIZE (p11) ldf8 f47 = [Y], 2 * SIZE (p11) fpma f7 = ALPHA_P, f34, f35 } ;; { .mmf (p12) ldf8 f48 = [X], 2 * SIZE (p12) ldf8 f49 = [Y], 2 * SIZE (p11) fpma f10 = ALPHA_P, f36, f37 } ;; { .mmi (p11) stf8 [Y1] = f6, 2 * SIZE nop.m 0 tbit.z p0, p15 = N, 0 } { .mmf (p12) ldf8 f50 = [X], 2 * SIZE (p12) ldf8 f51 = [Y], 2 * SIZE (p11) fpma f11 = ALPHA_P, f38, f39 } ;; { .mmi (p11) stf8 [Y1] = f7, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f52 = [X], 2 * SIZE (p12) ldf8 f53 = [Y], 2 * SIZE } ;; { .mmi (p11) stf8 [Y1] = f10, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f54 = [X], 2 * SIZE (p12) ldf8 f55 = [Y], 2 * SIZE (p11) fpma f12 = ALPHA_P, f40, f41 } ;; { .mmi (p11) stf8 [Y1] = f11, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f56 = [X], 2 * SIZE (p13) ldf8 f57 = [Y], 2 * SIZE (p11) fpma f13 = ALPHA_P, f42, f43 } ;; { .mmi (p11) stf8 [Y1] = f12, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f58 = [X], 2 * SIZE (p13) ldf8 f59 = [Y], 2 * SIZE (p11) fpma f14 = ALPHA_P, f44, f45 } ;; { .mmi (p11) stf8 [Y1] = f13, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p14) ldf8 f60 = [X], 2 * SIZE (p14) ldf8 f61 = [Y], 2 * SIZE (p11) fpma f15 = ALPHA_P, f46, f47 } ;; { .mmi (p11) stf8 [Y1] = f14, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p15) ldfs f62 = [X] (p15) ldfs f63 = [Y] (p12) fpma f6 = ALPHA_P, f48, f49 } ;; (p12) fpma f7 = ALPHA_P, f50, f51 (p12) fpma f10 = ALPHA_P, f52, f53 ;; (p11) stf8 [Y1] = f15, 2 * SIZE (p12) fpma f11 = ALPHA_P, f54, f55 ;; (p12) stf8 [Y1] = f6, 2 * SIZE (p13) fpma f12 = ALPHA_P, f56, f57 ;; (p12) stf8 [Y1] = f7, 2 * SIZE (p13) fpma f13 = ALPHA_P, f58, f59 ;; (p12) stf8 [Y1] = f10, 2 * SIZE (p14) fpma f14 = ALPHA_P, f60, f61 ;; (p12) stf8 [Y1] = f11, 2 * SIZE (p15) FMA f15 = ALPHA, f62, f63 ;; (p13) stf8 [Y1] = f12, 2 * SIZE ;; (p13) stf8 [Y1] = f13, 2 * SIZE ;; (p14) stf8 [Y1] = f14, 2 * SIZE ;; (p15) stfs [Y1] = f15 br.ret.sptk.many b0 ;; .align 32/* X is aligned; case 2 */.L20: { .mmi (p10) STFD [YYY] = f32 adds PRE1 = (PREFETCHSIZE - 28) * SIZE, X mov ar.lc = I } { .mib adds PRE2 = (PREFETCHSIZE + 4) * SIZE, Y tbit.z p0, p11 = N, 4 (p7) br.cond.dpnt .L25 } ;; .align 32.L22:/* 0 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p16) lfetch.nt1 [PRE1], 32 * SIZE (p18) fpma f12 = ALPHA_P, f46, f94 } { .mmi (p17) ldf8 f60 = [X], 2 * SIZE (p16) ldf8 f80 = [Y], 2 * SIZE } ;;/* 1 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE (p18) fpma f13 = ALPHA_P, f49, f97 } { .mmi (p17) ldf8 f63 = [X], 2 * SIZE (p16) ldf8 f83 = [Y], 2 * SIZE } ;;/* 2 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f52, f100 } { .mmi (p17) ldf8 f66 = [X], 2 * SIZE (p16) ldf8 f86 = [Y], 2 * SIZE } ;;/* 3 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f55, f103 } { .mmi (p17) ldf8 f69 = [X], 2 * SIZE (p16) ldf8 f89 = [Y], 2 * SIZE } ;;/* 4 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p18) fpma f6 = ALPHA_P, f58, f106 } { .mmi (p17) ldf8 f72 = [X], 2 * SIZE (p16) ldf8 f92 = [Y], 2 * SIZE } ;;/* 5 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p18) fpma f7 = ALPHA_P, f61, f109 } { .mmi (p17) ldf8 f75 = [X], 2 * SIZE (p16) ldf8 f95 = [Y], 2 * SIZE } ;;/* 6 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p18) fpma f10 = ALPHA_P, f64, f112 } { .mmi (p17) ldf8 f78 = [X], 2 * SIZE (p16) ldf8 f98 = [Y], 2 * SIZE } ;;/* 7 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p18) fpma f11 = ALPHA_P, f67, f115 } { .mmi (p16) ldf8 f32 = [X], 2 * SIZE (p16) ldf8 f101 = [Y], 2 * SIZE } ;;/* 8 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p18) fpma f12 = ALPHA_P, f70, f118 } { .mmi (p16) ldf8 f35 = [X], 2 * SIZE (p16) ldf8 f104 = [Y], 2 * SIZE } ;;/* 9 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p18) fpma f13 = ALPHA_P, f73, f121 } { .mmi (p16) ldf8 f38 = [X], 2 * SIZE (p16) ldf8 f107 = [Y], 2 * SIZE } ;;/* 10 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f76, f124 } { .mmi (p16) ldf8 f41 = [X], 2 * SIZE (p16) ldf8 f110 = [Y], 2 * SIZE } ;;/* 11 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f79, f127 } { .mmi (p16) ldf8 f44 = [X], 2 * SIZE (p16) ldf8 f113 = [Y], 2 * SIZE } ;;/* 12 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p17) fpma f6 = ALPHA_P, f33, f81 } { .mmi (p16) ldf8 f47 = [X], 2 * SIZE (p16) ldf8 f116 = [Y], 2 * SIZE } ;;/* 13 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p17) fpma f7 = ALPHA_P, f36, f84 } { .mmi (p16) ldf8 f50 = [X], 2 * SIZE (p16) ldf8 f119 = [Y], 2 * SIZE } ;;/* 14 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p17) fpma f10 = ALPHA_P, f39, f87 } { .mmi (p16) ldf8 f53 = [X], 2 * SIZE (p16) ldf8 f122 = [Y], 2 * SIZE } ;;/*15 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p17) fpma f11 = ALPHA_P, f42, f90 } { .mmb (p16) ldf8 f56 = [X], 2 * SIZE (p16) ldf8 f125 = [Y], 2 * SIZE br.ctop.sptk.few .L22 } ;; .align 32.L25: { .mmi (p11) ldf8 f32 = [X], 2 * SIZE (p11) ldf8 f33 = [Y], 2 * SIZE mov pr = PR, -65474 } ;; { .mmi (p11) ldf8 f34 = [X], 2 * SIZE (p11) ldf8 f35 = [Y], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p11) ldf8 f36 = [X], 2 * SIZE (p11) ldf8 f37 = [Y], 2 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p11) ldf8 f38 = [X], 2 * SIZE (p11) ldf8 f39 = [Y], 2 * SIZE tbit.z p0, p12 = N, 3 } ;; { .mmi (p11) ldf8 f40 = [X], 2 * SIZE (p11) ldf8 f41 = [Y], 2 * SIZE tbit.z p0, p13 = N, 2 } ;; { .mmi (p11) ldf8 f42 = [X], 2 * SIZE (p11) ldf8 f43 = [Y], 2 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmf (p11) ldf8 f44 = [X], 2 * SIZE (p11) ldf8 f45 = [Y], 2 * SIZE (p11) fpma f6 = ALPHA_P, f32, f33 } ;; { .mmf (p11) ldf8 f46 = [X], 2 * SIZE (p11) ldf8 f47 = [Y], 2 * SIZE (p11) fpma f7 = ALPHA_P, f34, f35 } ;; { .mmf (p12) ldf8 f48 = [X], 2 * SIZE (p12) ldf8 f49 = [Y], 2 * SIZE (p11) fpma f10 = ALPHA_P, f36, f37 } ;; { .mmi (p11) stf8 [Y1] = f6, 2 * SIZE nop.m 0 tbit.z p0, p15 = N, 0 } { .mmf (p12) ldf8 f50 = [X], 2 * SIZE (p12) ldf8 f51 = [Y], 2 * SIZE (p11) fpma f11 = ALPHA_P, f38, f39 } ;; { .mmi (p11) stf8 [Y1] = f7, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f52 = [X], 2 * SIZE (p12) ldf8 f53 = [Y], 2 * SIZE } ;; { .mmi (p11) stf8 [Y1] = f10, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f54 = [X], 2 * SIZE (p12) ldf8 f55 = [Y], 2 * SIZE (p11) fpma f12 = ALPHA_P, f40, f41 } ;; { .mmi (p11) stf8 [Y1] = f11, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f56 = [X], 2 * SIZE (p13) ldf8 f57 = [Y], 2 * SIZE (p11) fpma f13 = ALPHA_P, f42, f43 } ;; { .mmi (p11) stf8 [Y1] = f12, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f58 = [X], 2 * SIZE (p13) ldf8 f59 = [Y], 2 * SIZE (p11) fpma f14 = ALPHA_P, f44, f45 } ;; { .mmi (p11) stf8 [Y1] = f13, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p14) ldf8 f60 = [X], 2 * SIZE (p14) ldf8 f61 = [Y], 2 * SIZE (p11) fpma f15 = ALPHA_P, f46, f47 } ;; { .mmi (p11) stf8 [Y1] = f14, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p15) ldfs f62 = [X] (p15) ldfs f63 = [Y] (p12) fpma f6 = ALPHA_P, f48, f49 } ;; (p12) fpma f7 = ALPHA_P, f50, f51 (p12) fpma f10 = ALPHA_P, f52, f53 ;; (p11) stf8 [Y1] = f15, 2 * SIZE (p12) fpma f11 = ALPHA_P, f54, f55 ;; (p12) stf8 [Y1] = f6, 2 * SIZE (p13) fpma f12 = ALPHA_P, f56, f57 ;; (p12) stf8 [Y1] = f7, 2 * SIZE (p13) fpma f13 = ALPHA_P, f58, f59 ;; (p12) stf8 [Y1] = f10, 2 * SIZE (p14) fpma f14 = ALPHA_P, f60, f61 ;; (p12) stf8 [Y1] = f11, 2 * SIZE (p15) FMA f15 = ALPHA, f62, f63 ;; (p13) stf8 [Y1] = f12, 2 * SIZE ;; (p13) stf8 [Y1] = f13, 2 * SIZE ;; (p14) stf8 [Y1] = f14, 2 * SIZE ;; (p15) stfs [Y1] = f15
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -