📄 qscal.s
字号:
/*********************************************************************/
/* */
/* Optimized BLAS libraries */
/* By Kazushige Goto <kgoto@tacc.utexas.edu> */
/* */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of */
/* profits, interruption of business, or related expenses which may */
/* arise from use of Software or Documentation, including but not */
/* limited to those resulting from defects in Software and/or */
/* Documentation, or loss or inaccuracy of data of any kind. */
/*********************************************************************/

/*
 * qscal (IA-64): scale a strided vector in place, x[i] = ALPHA * x[i].
 *
 * Syntax : GNU as for IA-64, run through the C preprocessor.
 * Macros : PROLOGUE, EPILOGUE, PROFCODE, LDFD, STFD, FMPY, SIZE and
 *          BASE_SHIFT come from common.h and are not visible here.
 *          NOTE(review): for this q-prefixed build they presumably select
 *          the extended-precision load/store/multiply forms - confirm
 *          against common.h.
 *
 * Inputs (as read by the code below):
 *   N     (r32)  element count; the routine returns at once when N <= 0
 *   ALPHA (f8)   scale factor
 *   X1    (r38)  address of the first element
 *   INCX  (r39)  stride; shifted left by BASE_SHIFT on entry, then used
 *                directly as an address increment
 *
 * Paths:
 *   ALPHA == 0 : .L20 / .L30 store f0 (the IA-64 constant-zero register)
 *                to N elements; the vector is not read on this path.
 *   ALPHA != 0 : .L100 / .L310 / .L320 load, multiply by ALPHA and store
 *                back through the same addresses.
 *   Both paths process 16 elements per main-loop iteration and then an
 *   8 / 4 / 2 / 1 remainder selected by bits 3..0 of N.
 *
 * Saved/restored state: ar.lc (in ARLC) on every exit after it has been
 * written, and the predicate registers (in PR) on the ALPHA != 0 path,
 * which clears and rotates p16-p63.
 *
 * Registers written: r14-r22, r30, r31, f6, f7, f10-f15, f32-f127,
 * p6-p9, p12-p15, ar.ec.
 */

#define ASSEMBLER
#include "common.h"

// Byte distance factor for the prefetch pointer (multiplied by SIZE below).
#define PREFETCH_SIZE (16 * 16)

#define ALPHA f8	// scale factor
#define N r32		// element count
#define X1 r38		// primary load / zero-store pointer
#define INCX r39	// stride (scaled by BASE_SHIFT on entry)
#define X2 r14		// secondary pointer, X1 + 4 * INCX
#define Y1 r15		// primary store pointer (ALPHA != 0 path)
#define Y2 r16		// secondary store pointer, Y1 + 4 * INCX
#define PRE1 r17	// lfetch address
#define I r18		// (N >> 4) - 1, loaded into ar.lc
#define NAND15 r19	// N & 15
#define INCX5 r20	// 5 * INCX
#define INCX8 r21	// 8 * INCX
#define XX r22		// copy of X1; not read again in this file
#define PR r30		// saved predicate registers
#define ARLC r31	// saved ar.lc

	PROLOGUE
	.prologue
	PROFCODE

	// INCX <<= BASE_SHIFT; p6 = !(ALPHA == 0); save ar.lc.
	{ .mfi
	shladd INCX = INCX, BASE_SHIFT, r0
	fcmp.eq p0, p6 = ALPHA, f0
	.save ar.lc, ARLC
	mov ARLC = ar.lc
	}
	.body
	// Nothing to do when N <= 0 (ar.lc has not been modified yet).
	{ .mib
	cmp.ge p7, p0 = 0, N
	(p7) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	mov XX = X1
	mov PR = pr
	}
	// INCX5 = 4 * INCX + INCX, INCX8 = 8 * INCX.
	{ .mmi
	shladd INCX5 = INCX, 2, INCX
	shladd INCX8 = INCX, 3, r0
	}
	;;
	// X2 runs four elements ahead of X1.
	// ar.ec = 5 is overwritten with 6 at .L100 before the br.ctop loop;
	// the .L20 loop below uses br.cloop.
	{ .mmi
	shladd X2 = INCX, 2, X1
	nop.m 0
	mov ar.ec = 5
	}
	{ .mmi
	and NAND15 = 15, N
	nop.m 0
	shr I = N, 4
	}
	;;
	// I = N / 16 - 1 (loop count for ar.lc); p12 = bit 3 of N is set.
	{ .mmi
	adds I = -1, I
	nop.m 0
	tbit.z p0, p12 = N, 3
	}
	// p9 = (N & 15) == 0, i.e. there is no remainder after the main loop.
	{ .mmb
	cmp.ge p9, p0 = 0, NAND15
	nop.m 0
	(p6) br.cond.dptk .L100		// if (alpha != 0) goto .L100
	}
	;;

	/* ---------------- ALPHA == 0: fill with zeros ---------------- */

	{ .mmi
	adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1
	mov ar.lc = I
	}
	// I < 0 means N < 16: skip the 16-element loop.
	{ .mmb
	cmp.gt p8, p0 = 0, I
	(p8) br.cond.dpnt .L30
	}
	;;
	.align 32

	// Each iteration stores 16 zeros: X1 covers elements 0-3 and 8-11,
	// X2 covers 4-7 and 12-15.  After every fourth store the pointers
	// advance by INCX5 so that 3 * INCX + 5 * INCX = 8 * INCX in total.
.L20:
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	lfetch.excl.nt1 [PRE1], INCX8
	add X1 = INCX, X1
	add X2 = INCX, X2
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	add X1 = INCX, X1
	add X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	add X1 = INCX, X1
	add X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	add X1 = INCX5, X1
	add X2 = INCX5, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	lfetch.excl.nt1 [PRE1], INCX8
	add X1 = INCX, X1
	add X2 = INCX, X2
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	add X1 = INCX, X1
	add X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmi
	add X1 = INCX, X1
	add X2 = INCX, X2
	nop.i 0
	}
	;;
	{.mmi
	STFD [X1] = f0
	STFD [X2] = f0
	nop.i 0
	}
	{.mmb
	add X1 = INCX5, X1
	add X2 = INCX5, X2
	br.cloop.sptk.few .L20
	}
	;;
	.align 16

	// Zero-fill remainder.  ar.lc is restored first; p9 returns when
	// N & 15 == 0.  p12 / p13 / p14 / p15 select 8 / 4 / 2 / 1 elements.
.L30:
	{ .mmi
	(p12) STFD [X1] = f0
	(p12) STFD [X2] = f0
	mov ar.lc = ARLC
	}
	{ .mmb
	(p12) add X1 = INCX, X1
	(p12) add X2 = INCX, X2
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p12) STFD [X1] = f0
	(p12) add X1 = INCX, X1
	tbit.z p0, p13 = N, 2
	}
	{ .mmi
	(p12) STFD [X2] = f0
	(p12) add X2 = INCX, X2
	tbit.z p0, p14 = N, 1
	}
	;;
	{ .mmi
	(p12) STFD [X1] = f0
	(p12) add X1 = INCX, X1
	tbit.z p0, p15 = N, 0
	}
	{ .mmb
	(p12) STFD [X2] = f0
	(p12) add X2 = INCX, X2
	nop __LINE__
	}
	;;
	// Last store of the 8-element step: X1 skips the four elements that
	// X2 has just written.
	{ .mmb
	(p12) STFD [X1] = f0
	(p12) add X1 = INCX5, X1
	nop __LINE__
	}
	{ .mmb
	(p12) STFD [X2] = f0
	(p12) add X2 = INCX5, X2
	nop __LINE__
	}
	;;
	{ .mmb
	(p13) STFD [X1] = f0
	(p13) add X1 = INCX, X1
	nop __LINE__
	}
	;;
	{ .mmb
	(p13) STFD [X1] = f0
	(p13) add X1 = INCX, X1
	nop __LINE__
	}
	;;
	{ .mmb
	(p13) STFD [X1] = f0
	(p13) add X1 = INCX, X1
	nop __LINE__
	}
	;;
	{ .mmb
	(p13) STFD [X1] = f0
	(p13) add X1 = INCX, X1
	nop __LINE__
	}
	;;
	{ .mmb
	(p14) STFD [X1] = f0
	(p14) add X1 = INCX, X1
	nop __LINE__
	}
	;;
	{ .mmb
	(p14) STFD [X1] = f0
	(p14) add X1 = INCX, X1
	nop __LINE__
	}
	;;
	{ .mmb
	(p15) STFD [X1] = f0
	nop.m 0
	br.ret.sptk.many b0
	}
	;;
	.align 32

	/* ---------------- ALPHA != 0: x[i] *= ALPHA ---------------- */

	// Store pointers start at the load pointers; rotating predicates are
	// cleared and p16 is set to start the pipeline.
.L100:
	{ .mmi
	mov Y1 = X1
	shladd Y2 = INCX, 2, X1
	mov pr.rot= 0
	}
	;;
	{ .mmi
	mov ar.lc = I
	}
	cmp.eq p16, p0 = r0, r0
	;;
	{ .mmi
	adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1
	nop.m 0
	mov.i ar.ec = 6
	}
	// I < 0 means N < 16: go straight to the remainder code.
	{ .mmb
	cmp.gt p8, p0 = 0, I
	nop.m 0
	(p8) br.cond.dpnt .L320
	}
	;;
	.align 32

	// Software-pipelined loop, 16 elements per iteration.
	//   p16 : load 16 elements into f32, f38, ..., f122 (spacing 6)
	//   p21 : multiply the values loaded five rotations earlier
	//         (f37, f43, ..., f127) into f6, f7, f10-f15, and store the
	//         first 12 products
	//   p22 : store the last four products (f12-f15) of the previous
	//         p21 stage
	// The loop ends before the final p22 stage runs, so the four stores
	// after br.ctop drain f12-f15.
.L310:
	{ .mmf
	(p16) lfetch.excl.nt1 [PRE1], INCX8
	(p22) STFD [Y1] = f12
	(p21) FMPY f6 = ALPHA, f37
	}
	{ .mmi
	(p16) LDFD f32 = [X1], INCX
	nop __LINE__
	(p22) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p22) STFD [Y1] = f13
	(p16) LDFD f38 = [X1], INCX
	(p21) FMPY f7 = ALPHA, f43
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p22) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p22) STFD [Y1] = f14
	(p16) LDFD f44 = [X1], INCX
	(p21) FMPY f10 = ALPHA, f49
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p22) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p22) STFD [Y1] = f15
	(p16) LDFD f50 = [X1], INCX
	(p21) FMPY f11 = ALPHA, f55
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p22) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f6
	(p16) LDFD f56 = [X1], INCX
	(p21) FMPY f12 = ALPHA, f61
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p16) lfetch.excl.nt1 [PRE1], INCX8
	(p21) STFD [Y1] = f7
	(p21) FMPY f13 = ALPHA, f67
	}
	{ .mmi
	(p16) LDFD f62 = [X1], INCX
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f10
	(p16) LDFD f68 = [X1], INCX
	(p21) FMPY f14 = ALPHA, f73
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f11
	(p16) LDFD f74 = [X1], INCX
	(p21) FMPY f15 = ALPHA, f79
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f12
	(p16) LDFD f80 = [X1], INCX
	(p21) FMPY f6 = ALPHA, f85
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f13
	(p16) LDFD f86 = [X1], INCX
	(p21) FMPY f7 = ALPHA, f91
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f14
	(p16) LDFD f92 = [X1], INCX
	(p21) FMPY f10 = ALPHA, f97
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f15
	(p16) LDFD f98 = [X1], INCX
	(p21) FMPY f11 = ALPHA, f103
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f6
	(p16) LDFD f104 = [X1], INCX
	(p21) FMPY f12 = ALPHA, f109
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f7
	(p16) LDFD f110 = [X1], INCX
	(p21) FMPY f13 = ALPHA, f115
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f10
	(p16) LDFD f116 = [X1], INCX
	(p21) FMPY f14 = ALPHA, f121
	}
	{ .mmi
	nop __LINE__
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	}
	;;
	{ .mmf
	(p21) STFD [Y1] = f11
	(p16) LDFD f122 = [X1], INCX
	(p21) FMPY f15 = ALPHA, f127
	}
	{ .mmb
	nop __LINE__
	(p21) add Y1 = INCX, Y1
	br.ctop.sptk.few .L310
	}
	;;
	// Drain: store the four products still held in f12-f15, and rebuild
	// the +4 element pointers (Y2, X2) from the advanced X1 for the
	// remainder code.
	{ .mmi
	STFD [Y1] = f12
	add Y1 = INCX, Y1
	shladd Y2 = INCX, 2, X1
	}
	;;
	{ .mmi
	STFD [Y1] = f13
	add Y1 = INCX, Y1
	shladd X2 = INCX, 2, X1
	}
	;;
	{ .mmi
	STFD [Y1] = f14
	nop __LINE__
	add Y1 = INCX, Y1
	}
	;;
	{ .mmi
	STFD [Y1] = f15
	nop __LINE__
	add Y1 = INCX, Y1
	}
	;;
	.align 16

	// Multiply remainder: p12 / p13 / p14 / p15 select 8 / 4 / 2 / 1
	// elements.  Loads, multiplies and stores are interleaved by hand.
.L320:
	{ .mmi
	(p12) LDFD f48 = [X1], INCX
	(p12) LDFD f52 = [X2], INCX
	mov ar.lc = ARLC
	}
	;;
	// Mask -65474 (low bits 0x3E, all bits from 16 up) restores p1-p5
	// and p16-p63 and leaves p6-p15 untouched, so p9 and p12 keep the
	// values computed at entry.
	{ .mmi
	(p12) LDFD f49 = [X1], INCX
	(p12) LDFD f53 = [X2], INCX
	mov pr = PR, -65474
	}
	{ .mmb
	nop __LINE__
	nop __LINE__
	(p9) br.ret.sptk.many b0
	}
	;;
	{ .mmi
	(p12) LDFD f50 = [X1], INCX
	(p12) LDFD f54 = [X2], INCX
	tbit.z p0, p13 = N, 2
	}
	;;
	{ .mmi
	(p12) LDFD f51 = [X1], INCX5
	(p12) LDFD f55 = [X2], INCX5
	tbit.z p0, p14 = N, 1
	}
	;;
	(p13) LDFD f56 = [X1], INCX
	tbit.z p0, p15 = N, 0
	;;
	(p13) LDFD f57 = [X1], INCX
	;;
	{ .mmf
	(p13) LDFD f58 = [X1], INCX
	nop __LINE__
	(p12) FMPY f48 = ALPHA, f48
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p12) FMPY f52 = ALPHA, f52
	}
	;;
	{ .mmf
	(p13) LDFD f59 = [X1], INCX
	nop __LINE__
	(p12) FMPY f49 = ALPHA, f49
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p12) FMPY f53 = ALPHA, f53
	}
	;;
	{ .mmf
	(p14) LDFD f60 = [X1], INCX
	nop __LINE__
	(p12) FMPY f50 = ALPHA, f50
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p12) FMPY f54 = ALPHA, f54
	}
	;;
	{ .mmf
	(p14) LDFD f61 = [X1], INCX
	nop __LINE__
	(p12) FMPY f51 = ALPHA, f51
	}
	{ .mmf
	nop __LINE__
	nop __LINE__
	(p12) FMPY f55 = ALPHA, f55
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f48
	(p12) STFD [Y2] = f52
	(p13) FMPY f56 = ALPHA, f56
	}
	{ .mmi
	(p15) LDFD f62 = [X1]
	(p12) add Y1 = INCX, Y1
	(p12) add Y2 = INCX, Y2
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f49
	(p12) STFD [Y2] = f53
	(p13) FMPY f57 = ALPHA, f57
	}
	{ .mmi
	(p12) add Y1 = INCX, Y1
	(p12) add Y2 = INCX, Y2
	nop __LINE__
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f50
	(p12) STFD [Y2] = f54
	(p13) FMPY f58 = ALPHA, f58
	}
	{ .mmi
	(p12) add Y1 = INCX, Y1
	(p12) add Y2 = INCX, Y2
	nop __LINE__
	}
	;;
	{ .mmf
	(p12) STFD [Y1] = f51
	(p12) STFD [Y2] = f55
	(p13) FMPY f59 = ALPHA, f59
	}
	{ .mmi
	(p12) add Y1 = INCX5, Y1
	(p12) add Y2 = INCX5, Y2
	nop __LINE__
	}
	;;
	{ .mfi
	(p13) STFD [Y1] = f56
	(p14) FMPY f60 = ALPHA, f60
	(p13) add Y1 = INCX, Y1
	}
	;;
	{ .mfi
	(p13) STFD [Y1] = f57
	(p14) FMPY f61 = ALPHA, f61
	(p13) add Y1 = INCX, Y1
	}
	;;
	{ .mfi
	(p13) STFD [Y1] = f58
	(p15) FMPY f62 = ALPHA, f62
	(p13) add Y1 = INCX, Y1
	}
	;;
	{ .mmi
	(p13) STFD [Y1] = f59
	nop __LINE__
	(p13) add Y1 = INCX, Y1
	}
	;;
	{ .mmi
	(p14) STFD [Y1] = f60
	nop __LINE__
	(p14) add Y1 = INCX, Y1
	}
	;;
	{ .mmi
	(p14) STFD [Y1] = f61
	nop __LINE__
	(p14) add Y1 = INCX, Y1
	}
	;;
	// Final element; the predicate restore repeats the one at .L320.
	{ .mib
	(p15) STFD [Y1] = f62
	mov pr = PR, -65474
	br.ret.sptk.many b0
	}
	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -