📄 zscal.s
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * In-place scaling of a complex vector by a complex scalar (SPARC).
 *
 * For every complex element (re, im) addressed through X the general
 * path computes and stores back
 *
 *     re' = ALPHA_R * re - ALPHA_I * im
 *     im' = ALPHA_R * im + ALPHA_I * re
 *
 * When ALPHA_R and ALPHA_I both compare equal to FZERO, the elements
 * are overwritten with FZERO without being read first.
 *
 * Four loop variants exist: {zero fill, general scale} x {INCX equal
 * to 2 * SIZE after scaling, any other INCX}.  Each variant handles
 * N >> 2 groups of four elements in an unrolled loop and then N & 3
 * remaining elements one at a time.  Every exit clears %o0 in the
 * delay slot of `return`.
 *
 * PROLOGUE, SAVESP, EPILOGUE, LDF, STF, FMOV, FCMP, FMUL, FADD, FSUB,
 * SIZE, ZBASE_SHIFT and STACK_START are not defined in this file; they
 * presumably come from common.h.  Comments below assume LDF/STF are
 * floating-point load/store and FMUL/FADD/FSUB are the usual
 * three-operand "rs1 op rs2 -> rd" forms for the selected precision
 * -- TODO confirm against common.h.
 *
 * NOTE(review): for negative N, N >> 2 is negative (unrolled loop is
 * skipped) but N & 3 can be nonzero, in which case the tail loop would
 * run.  Presumably callers guarantee N >= 0 -- confirm.
 */

#define ASSEMBLER
#include "common.h"

/* Integer register roles. */

/* Element count. */
#define N %i0

/* X is the vector pointer, INCX the stride.  Which register holds   */
/* them depends on precision and word size; see the prologue below.  */
#if defined(DOUBLE) && !defined(__64BIT__)
#define X %i3
#define INCX %i4
#else
#define X %i5
#define INCX %i3
#endif

/* I  : loop counter.                                                */
/* XX : store pointer in the strided general loop (.LL151), trailing */
/*      the load pointer X.                                          */
/* Both are overwritten before being read anywhere in this file.     */
#define I %i1
#define XX %i2

/* Floating-point register roles.                                    */
/*   c1..c8   : values loaded from the vector                        */
/*   t1..t8   : partial products                                     */
/*   s1..s8   : results awaiting store (contiguous general loop)     */
/*   c9..c16  : extra temporaries (strided general loop)             */
/* In both precisions s1..s8 and c9..c16 name the same registers.    */
#ifdef DOUBLE
#define c1 %f0
#define c2 %f2
#define c3 %f4
#define c4 %f6
#define c5 %f8
#define c6 %f10
#define c7 %f12
#define c8 %f14
#define t1 %f16
#define t2 %f18
#define t3 %f20
#define t4 %f22
#define t5 %f24
#define t6 %f26
#define t7 %f28
#define t8 %f30
#define c9 %f32
#define c10 %f34
#define c11 %f36
#define c12 %f38
#define c13 %f40
#define c14 %f42
#define c15 %f44
#define c16 %f46
#define s1 %f32
#define s2 %f34
#define s3 %f36
#define s4 %f38
#define s5 %f40
#define s6 %f42
#define s7 %f44
#define s8 %f46
#define FZERO %f48
#define ALPHA_R %f50
#define ALPHA_I %f52
#else
#define c1 %f0
#define c2 %f1
#define c3 %f2
#define c4 %f3
#define c5 %f4
#define c6 %f5
#define c7 %f6
#define c8 %f7
#define c9 %f8
#define c10 %f9
#define c11 %f10
#define c12 %f11
#define c13 %f12
#define c14 %f13
#define c15 %f14
#define c16 %f15
#define s1 %f8
#define s2 %f9
#define s3 %f10
#define s4 %f11
#define s5 %f12
#define s6 %f13
#define s7 %f14
#define s8 %f15
#define t1 %f16
#define t2 %f17
#define t3 %f18
#define t4 %f19
#define t5 %f20
#define t6 %f21
#define t7 %f22
#define t8 %f23
#define FZERO %f24
#define ALPHA_R %f25
#define ALPHA_I %f26
#endif

/* Prefetch distance ahead of X, in units of SIZE. */
#define PREFETCHSIZE 128

	PROLOGUE
	SAVESP

/* ---- Argument setup: fill FZERO, ALPHA_R, ALPHA_I, X, INCX ------ */
#ifndef __64BIT__
	/* 32-bit build: alpha arrives in integer registers and is     */
	/* moved to floating-point registers through stack memory.     */
#ifdef DOUBLE
	/* Eight zero bytes at +8 become FZERO (%g0 always reads 0). */
	st %g0, [%fp + STACK_START + 8]
	st %g0, [%fp + STACK_START + 12]
	/* %i3:%i4 are spilled to +16/+20 and %i5 to +24.  ALPHA_I is */
	/* later loaded from +24; the word at +28 is not written      */
	/* here, so it is presumably the second half of alpha_i       */
	/* already placed there by the caller -- TODO confirm against */
	/* the ABI and STACK_START.                                   */
	st %i3, [%fp + STACK_START + 16]
	st %i4, [%fp + STACK_START + 20]
	st %i5, [%fp + STACK_START + 24]
	/* X and INCX are read from the stack; this overwrites %i3 and */
	/* %i4, which were spilled just above.                         */
	ld [%fp+ STACK_START + 32], X
	ld [%fp+ STACK_START + 36], INCX
#else
	/* Single precision: one zero word, %i3 and %i4 spilled to +16 */
	/* and +24.  X is %i5 as passed; INCX is read from +28 into    */
	/* %i3 after %i3 has been spilled.                             */
	st %g0, [%fp + STACK_START + 8]
	st %i3, [%fp + STACK_START + 16]
	st %i4, [%fp + STACK_START + 24]
	ld [%fp+ STACK_START + 28], INCX
#endif
	/* Reload the spilled words as floating-point values. */
	LDF [%fp + STACK_START + 8], FZERO
	LDF [%fp + STACK_START + 16], ALPHA_R
	LDF [%fp + STACK_START + 24], ALPHA_I
#else
	/* 64-bit build: alpha is copied from floating-point registers */
	/* (%f6/%f8 for DOUBLE, %f7/%f9 otherwise).  Only the zero     */
	/* constant goes through memory at +32.                        */
#ifdef DOUBLE
	stx %g0, [%fp + STACK_START + 32]
	FMOV %f6, ALPHA_R
	FMOV %f8, ALPHA_I
#else
	st %g0, [%fp + STACK_START + 32]
	FMOV %f7, ALPHA_R
	FMOV %f9, ALPHA_I
#endif
	/* INCX is read from the stack; X is %i5 as passed. */
	ldx [%fp + STACK_START + 56], INCX
	LDF [%fp + STACK_START + 32], FZERO
#endif

/* ---- Dispatch --------------------------------------------------- */
	/* Any nonzero alpha component selects the general path. */
	FCMP ALPHA_R, FZERO
	fbne .LL100
	/* Delay slot of a non-annulled branch: runs on both paths.    */
	/* INCX is shifted left by ZBASE_SHIFT here, presumably to     */
	/* turn an element stride into a byte stride -- TODO confirm.  */
	sll INCX, ZBASE_SHIFT, INCX
	FCMP ALPHA_I, FZERO
	fbne .LL100
	nop

	/* alpha == 0: zero fill.  INCX == 2 * SIZE selects the        */
	/* contiguous loop, anything else the strided loop at .LL50.   */
	cmp INCX, 2 * SIZE
	bne .LL50
	nop

/* ---- Zero fill, contiguous -------------------------------------- */
	/* I = N / 4 groups of four elements (eight scalars each). */
	sra N, 2, I
	cmp I, 0
	ble,pn %icc, .LL15
	nop

.LL11:
	prefetch [X + PREFETCHSIZE * SIZE], 0
	STF FZERO, [X + 0 * SIZE]
	add I, -1, I
	STF FZERO, [X + 1 * SIZE]
	/* Condition codes for the bg below are set here; nothing in  */
	/* between modifies them.                                     */
	cmp I, 0
	STF FZERO, [X + 2 * SIZE]
	STF FZERO, [X + 3 * SIZE]
	STF FZERO, [X + 4 * SIZE]
	STF FZERO, [X + 5 * SIZE]
	/* X is advanced before the last two stores, hence the        */
	/* negative offsets (-2, -1 address scalars 6 and 7).         */
	add X, 8 * SIZE, X
	STF FZERO, [X - 2 * SIZE]
	bg,pt %icc, .LL11
	/* Delay slot: final store of the group. */
	STF FZERO, [X - 1 * SIZE]

.LL15:
	/* Remaining N & 3 elements, one per pass. */
	and N, 3, I
	cmp I, 0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	STF FZERO, [X + 0 * SIZE]
	STF FZERO, [X + 1 * SIZE]
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL16
	/* Delay slot: step to the next element. */
	add X, 2 * SIZE, X

.LL19:
	return %i7 + 8
	/* Delay slot: %o0 = 0. */
	clr %o0

/* ---- Zero fill, strided ----------------------------------------- */
.LL50:
	sra N, 2, I
	cmp I, 0
	ble,pn %icc, .LL55
	nop

.LL51:
	/* Four elements per pass; X advances by INCX after each. */
	STF FZERO, [X + 0 * SIZE]
	add I, -1, I
	STF FZERO, [X + 1 * SIZE]
	add X, INCX, X
	STF FZERO, [X + 0 * SIZE]
	/* Condition codes for the bg at the end of the pass. */
	cmp I, 0
	STF FZERO, [X + 1 * SIZE]
	add X, INCX, X
	STF FZERO, [X + 0 * SIZE]
	STF FZERO, [X + 1 * SIZE]
	add X, INCX, X
	STF FZERO, [X + 0 * SIZE]
	STF FZERO, [X + 1 * SIZE]
	bg,pt %icc, .LL51
	/* Delay slot: step past the fourth element. */
	add X, INCX, X

.LL55:
	/* Remaining N & 3 elements. */
	and N, 3, I
	cmp I, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	STF FZERO, [X + 0 * SIZE]
	add I, -1, I
	STF FZERO, [X + 1 * SIZE]
	cmp I, 0
	bg,pt %icc, .LL56
	/* Delay slot: step to the next element. */
	add X, INCX, X

.LL59:
	return %i7 + 8
	/* Delay slot: %o0 = 0. */
	clr %o0

/* ---- General scale ---------------------------------------------- */
.LL100:
	/* INCX != 2 * SIZE goes to the strided loop at .LL150. */
	cmp INCX, 2 * SIZE
	bne .LL150
	/* Delay slot: runs on both paths (.LL150 recomputes I). */
	sra N, 2, I

/* ---- General scale, contiguous ---------------------------------- */
	cmp I, 0
	ble,pn %icc, .LL115
	nop

	/* Preamble: load the first group of four elements, start the */
	/* multiplies for elements 0 and 1, and already reload c1..c4 */
	/* from scalars 8..11 for the following group.                */
	/* NOTE(review): scalars 8..11 are loaded before the trip     */
	/* count is checked.  When this is the last full group they   */
	/* are never used, but the loads still touch memory past that */
	/* group; if N is a multiple of 4 that lies beyond the N      */
	/* elements handled here.  Confirm this read-ahead is         */
	/* acceptable for callers.                                    */
	LDF [X + 0 * SIZE], c1
	LDF [X + 1 * SIZE], c2
	LDF [X + 2 * SIZE], c3
	LDF [X + 3 * SIZE], c4
	LDF [X + 4 * SIZE], c5
	LDF [X + 5 * SIZE], c6
	LDF [X + 6 * SIZE], c7
	LDF [X + 7 * SIZE], c8
	/* Element 0: t1 = ar*re, t3 = ai*im, t2 = ai*re, t4 = ar*im. */
	FMUL ALPHA_R, c1, t1
	FMUL ALPHA_I, c2, t3
	FMUL ALPHA_I, c1, t2
	LDF [X + 8 * SIZE], c1
	FMUL ALPHA_R, c2, t4
	LDF [X + 9 * SIZE], c2
	/* Element 1: t5..t8, same pattern. */
	FMUL ALPHA_R, c3, t5
	/* Decrement I and set the condition codes tested by the ble  */
	/* below.                                                     */
	deccc I
	FMUL ALPHA_I, c4, t7
	/* s1 = re' of element 0. */
	FSUB t1, t3, s1
	FMUL ALPHA_I, c3, t6
	LDF [X + 10 * SIZE], c3
	FMUL ALPHA_R, c4, t8
	LDF [X + 11 * SIZE], c4
	/* s2 = im' of element 0. */
	FADD t4, t2, s2
	/* Only one group: skip the loop and drain at .LL112. */
	ble,pn %icc, .LL112
	nop

.LL111:
	/* Each pass finishes and stores the current group (scalars   */
	/* 0..7 relative to X), starts elements 0 and 1 of the next   */
	/* group from c1..c4 (scalars 8..11), and reloads c5..c8 from */
	/* scalars 12..15 and c1..c4 from scalars 16..19.             */
	/* NOTE(review): on the final pass the loads of scalars       */
	/* 16..19 are not consumed (same read-ahead as the preamble). */
	prefetch [X + PREFETCHSIZE * SIZE], 0
	/* Element 2 products; element 1 real result; store s1. */
	FMUL ALPHA_R, c5, t1
	FMUL ALPHA_I, c6, t3
	FSUB t5, t7, s3
	STF s1, [X + 0 * SIZE]
	FMUL ALPHA_I, c5, t2
	LDF [X + 12 * SIZE], c5
	FMUL ALPHA_R, c6, t4
	LDF [X + 13 * SIZE], c6
	FADD t8, t6, s4
	STF s2, [X + 1 * SIZE]
	/* Element 3 products; element 2 results. */
	FMUL ALPHA_R, c7, t5
	FMUL ALPHA_I, c8, t7
	FSUB t1, t3, s5
	STF s3, [X + 2 * SIZE]
	FMUL ALPHA_I, c7, t6
	LDF [X + 14 * SIZE], c7
	FMUL ALPHA_R, c8, t8
	LDF [X + 15 * SIZE], c8
	FADD t4, t2, s6
	STF s4, [X + 3 * SIZE]
	/* Next group, element 0 products; element 3 results. */
	FMUL ALPHA_R, c1, t1
	FMUL ALPHA_I, c2, t3
	FSUB t5, t7, s7
	STF s5, [X + 4 * SIZE]
	FMUL ALPHA_I, c1, t2
	LDF [X + 16 * SIZE], c1
	FMUL ALPHA_R, c2, t4
	LDF [X + 17 * SIZE], c2
	FADD t8, t6, s8
	STF s6, [X + 5 * SIZE]
	/* Next group, element 1 products and element 0 results. */
	FMUL ALPHA_R, c3, t5
	/* Loop counter update; tested by the bg below. */
	deccc I
	FMUL ALPHA_I, c4, t7
	FSUB t1, t3, s1
	STF s7, [X + 6 * SIZE]
	FMUL ALPHA_I, c3, t6
	LDF [X + 18 * SIZE], c3
	FMUL ALPHA_R, c4, t8
	LDF [X + 19 * SIZE], c4
	FADD t4, t2, s2
	STF s8, [X + 7 * SIZE]
	bg,pt %icc, .LL111
	/* Delay slot: advance to the next group. */
	add X, 8 * SIZE, X

.LL112:
	/* Drain: finish the group in flight.  Same arithmetic as the */
	/* loop body but without any further loads.                   */
	FMUL ALPHA_R, c5, t1
	FMUL ALPHA_I, c6, t3
	FSUB t5, t7, s3
	STF s1, [X + 0 * SIZE]
	FMUL ALPHA_I, c5, t2
	FMUL ALPHA_R, c6, t4
	FADD t8, t6, s4
	STF s2, [X + 1 * SIZE]
	FMUL ALPHA_R, c7, t5
	FMUL ALPHA_I, c8, t7
	FSUB t1, t3, s5
	STF s3, [X + 2 * SIZE]
	FMUL ALPHA_I, c7, t6
	FMUL ALPHA_R, c8, t8
	FADD t4, t2, s6
	STF s4, [X + 3 * SIZE]
	FSUB t5, t7, s7
	FADD t8, t6, s8
	STF s5, [X + 4 * SIZE]
	STF s6, [X + 5 * SIZE]
	STF s7, [X + 6 * SIZE]
	STF s8, [X + 7 * SIZE]
	add X, 8 * SIZE, X

.LL115:
	/* Remaining N & 3 elements, one per pass. */
	and N, 3, I
	cmp I, 0
	ble,a,pn %icc, .LL119
	nop

.LL116:
	LDF [X + 0 * SIZE], c1
	LDF [X + 1 * SIZE], c2
	/* c3 = ar*re, c4 = ai*re, then c1 = ai*im and c2 = ar*im     */
	/* overwrite the loaded values.                               */
	FMUL ALPHA_R, c1, c3
	FMUL ALPHA_I, c1, c4
	FMUL ALPHA_I, c2, c1
	FMUL ALPHA_R, c2, c2
	/* c1 = re', c2 = im'. */
	FSUB c3, c1, c1
	FADD c2, c4, c2
	STF c1, [X + 0 * SIZE]
	STF c2, [X + 1 * SIZE]
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL116
	/* Delay slot: step to the next element. */
	add X, 2 * SIZE, X

.LL119:
	return %i7 + 8
	/* Delay slot: %o0 = 0. */
	clr %o0

/* ---- General scale, strided ------------------------------------- */
.LL150:
	sra N, 2, I
	cmp I, 0
	ble,pn %icc, .LL155
	/* Delay slot (runs on both paths): XX starts at X.  Inside   */
	/* the loop X is the load pointer and XX the store pointer.   */
	mov X, XX

.LL151:
	/* Load four elements, stepping X by INCX after each, with    */
	/* the multiplies interleaved between the loads.              */
	/* Element k uses c(2k+1)/c(2k+2) for re/im;                  */
	/* c9..c16 hold the ar*re and ai*re products.                 */
	LDF [X + 0 * SIZE], c1
	LDF [X + 1 * SIZE], c2
	add X, INCX, X
	LDF [X + 0 * SIZE], c3
	FMUL ALPHA_R, c1, c9
	LDF [X + 1 * SIZE], c4
	FMUL ALPHA_I, c1, c10
	add X, INCX, X
	LDF [X + 0 * SIZE], c5
	FMUL ALPHA_I, c2, c1
	LDF [X + 1 * SIZE], c6
	FMUL ALPHA_R, c2, c2
	add X, INCX, X
	LDF [X + 0 * SIZE], c7
	FMUL ALPHA_R, c3, c11
	LDF [X + 1 * SIZE], c8
	FMUL ALPHA_I, c3, c12
	add X, INCX, X
	FMUL ALPHA_I, c4, c3
	FMUL ALPHA_R, c4, c4
	FMUL ALPHA_R, c5, c13
	FMUL ALPHA_I, c5, c14
	FMUL ALPHA_I, c6, c5
	FMUL ALPHA_R, c6, c6
	FMUL ALPHA_R, c7, c15
	/* Results: odd-numbered c = re', even-numbered c = im'. */
	FSUB c9, c1, c1
	FMUL ALPHA_I, c7, c16
	FADD c2, c10, c2
	FMUL ALPHA_I, c8, c7
	FSUB c11, c3, c3
	FMUL ALPHA_R, c8, c8
	FADD c4, c12, c4
	/* Store the four results through XX, stepping by INCX. */
	STF c1, [XX + 0 * SIZE]
	FSUB c13, c5, c5
	add I, -1, I
	STF c2, [XX + 1 * SIZE]
	FADD c6, c14, c6
	add XX, INCX, XX
	STF c3, [XX + 0 * SIZE]
	FSUB c15, c7, c7
	/* Condition codes for the bg at the end of the pass. */
	cmp I, 0
	STF c4, [XX + 1 * SIZE]
	FADD c8, c16, c8
	add XX, INCX, XX
	STF c5, [XX + 0 * SIZE]
	STF c6, [XX + 1 * SIZE]
	add XX, INCX, XX
	STF c7, [XX + 0 * SIZE]
	STF c8, [XX + 1 * SIZE]
	bg,pt %icc, .LL151
	/* Delay slot: step XX past the fourth element. */
	add XX, INCX, XX

.LL155:
	/* Remaining N & 3 elements.  X has already been advanced     */
	/* past every element the unrolled loop loaded, so it is used */
	/* for both load and store here.                              */
	and N, 3, I
	cmp I, 0
	ble,a,pn %icc, .LL159
	nop

.LL156:
	LDF [X + 0 * SIZE], c1
	LDF [X + 1 * SIZE], c2
	/* Same single-element arithmetic as .LL116. */
	FMUL ALPHA_R, c1, c3
	FMUL ALPHA_I, c1, c4
	FMUL ALPHA_I, c2, c1
	FMUL ALPHA_R, c2, c2
	FSUB c3, c1, c1
	FADD c2, c4, c2
	STF c1, [X + 0 * SIZE]
	STF c2, [X + 1 * SIZE]
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL156
	/* Delay slot: step to the next element. */
	add X, INCX, X

.LL159:
	return %i7 + 8
	/* Delay slot: %o0 = 0. */
	clr %o0

	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -