/* zgemv_t.s — SPARC assembly kernel: transposed complex GEMV (y := alpha*A^T*x + y), GotoBLAS optimized BLAS. */
/* NOTE(review): this copy was extracted from a web viewer; original line breaks were lost and page chrome leaked in. */
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. *//*********************************************************************/#define ASSEMBLER#include "common.h"#define P 4000#define M %i0#define N %i1#define A %i5#define LDA %i2#define X %i3#define INCX %i4 #define Y %l0#define INCY %l1#define BUFFER %l2#define I %l3#define IS %l4#define J %l5#define MIN_M %l6#define XP %l7#define A1 %o0#define A2 %o1#define A3 %o2#define A4 %o3#define X1 %o4#define Y1 %o5#define PNLDA %g1#define Y2 %o7 /* Danger? 
*/#ifdef DOUBLE#define t1 %f0#define t2 %f2#define t3 %f4#define t4 %f6#define c1 %f8#define c2 %f10#define c3 %f12#define c4 %f14#define c5 %f16#define c6 %f18#define c7 %f20#define c8 %f22#define c9 %f24#define c10 %f26#define c11 %f28#define c12 %f30#define c13 %f32#define c14 %f34#define c15 %f36#define c16 %f38#define a1 %f40#define a2 %f42#define a3 %f44#define a4 %f46#define a5 %f48#define a6 %f50#define a7 %f52#define a8 %f54#define b1 %f56#define b2 %f58#define b3 %f60#define b4 %f62#else#define t1 %f0#define t2 %f1#define t3 %f2#define t4 %f3#define c1 %f4#define c2 %f5#define c3 %f6#define c4 %f7#define c5 %f8#define c6 %f9#define c7 %f10#define c8 %f11#define c9 %f12#define c10 %f13#define c11 %f14#define c12 %f15#define c13 %f16#define c14 %f17#define c15 %f18#define c16 %f19#define a1 %f20#define a2 %f21#define a3 %f22#define a4 %f23#define a5 %f24#define a6 %f25#define a7 %f26#define a8 %f27#define b1 %f28#define b2 %f29#define b3 %f30#define b4 %f31#endif#ifndef __64BIT__#define FZERO [%fp + STACK_START + 8]#define ALPHA_R [%fp + STACK_START + 16]#ifndef DOUBLE#define ALPHA_I [%fp + STACK_START + 20]#else#define ALPHA_I [%fp + STACK_START + 24]#endif#else#define FZERO [%fp + STACK_START + 24]#define ALPHA_R [%fp + STACK_START + 32]#define ALPHA_I [%fp + STACK_START + 40]#endif#ifdef DOUBLE#define PREFETCHSIZE 18#else#define PREFETCHSIZE 36#endif PROLOGUE SAVESP nop#ifndef __64BIT__#ifdef DOUBLE st %g0, [%fp + STACK_START + 8] st %g0, [%fp + STACK_START + 12] st %i3, [%fp + STACK_START + 16] /* ALPHA_R */ st %i4, [%fp + STACK_START + 20] st %i5, [%fp + STACK_START + 24] /* ALPHA_I */ ld [%fp + STACK_START + 32], A ld [%fp + STACK_START + 36], LDA ld [%fp + STACK_START + 40], X ld [%fp + STACK_START + 44], INCX ld [%fp + STACK_START + 48], Y ld [%fp + STACK_START + 52], INCY ld [%fp + STACK_START + 56], BUFFER#else st %g0, [%fp + STACK_START + 8] st %i3, [%fp + STACK_START + 16] /* ALPHA_R */ st %i4, [%fp + STACK_START + 20] /* ALPHA_I */ ld [%fp + 
STACK_START + 28], LDA ld [%fp + STACK_START + 32], X ld [%fp + STACK_START + 36], INCX ld [%fp + STACK_START + 40], Y ld [%fp + STACK_START + 44], INCY ld [%fp + STACK_START + 48], BUFFER#endif#else#ifdef DOUBLE stx %g0, FZERO std %f6, ALPHA_R std %f8, ALPHA_I#else st %g0, FZERO st %f7, ALPHA_R st %f9, ALPHA_I#endif ldx [%fp+ STACK_START + 56], LDA ldx [%fp+ STACK_START + 64], X ldx [%fp+ STACK_START + 72], INCX ldx [%fp+ STACK_START + 80], Y ldx [%fp+ STACK_START + 88], INCY ldx [%fp+ STACK_START + 96], BUFFER#endif clr IS mov P, I sll LDA, ZBASE_SHIFT, LDA sll I, ZBASE_SHIFT, I smul LDA, N, PNLDA sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY sub I, PNLDA, PNLDA.LL10: sll IS, ZBASE_SHIFT, I sub M, IS, MIN_M mov P, J cmp MIN_M, J nop movg %icc, J, MIN_M nop cmp INCX, 2 * SIZE beq .LL100 add X, I, XP sra MIN_M, 2, I mov BUFFER, XP cmp I, 0 ble,pn %icc, .LL15 mov BUFFER, Y1.LL11: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X LDF [X + 0 * SIZE], a7 LDF [X + 1 * SIZE], a8 add X, INCX, X STF a1, [Y1 + 0 * SIZE] add I, -1, I STF a2, [Y1 + 1 * SIZE] cmp I, 0 STF a3, [Y1 + 2 * SIZE] STF a4, [Y1 + 3 * SIZE] STF a5, [Y1 + 4 * SIZE] STF a6, [Y1 + 5 * SIZE] STF a7, [Y1 + 6 * SIZE] STF a8, [Y1 + 7 * SIZE] bg,pn %icc, .LL11 add Y1, 8 * SIZE, Y1.LL15: and MIN_M, 3, I cmp I, 0 ble,pn %icc, .LL100 nop.LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X add I, -1, I cmp I, 0 nop STF a1, [Y1 + 0 * SIZE] STF a2, [Y1 + 1 * SIZE] bg,pn %icc, .LL16 add Y1, 2 * SIZE, Y1.LL100: sra N, 2, J cmp J, 0 ble %icc, .LL200 mov Y, Y1.LL110: LDF FZERO, t1 FMOV t1, c1 sra MIN_M, 2, I FMOV t1, c2 add A, LDA, A2 FMOV t1, c3 mov A, A1 FMOV t1, c4 add A2, LDA, A3 FMOV t1, c5 FMOV t1, c6 FMOV t1, c7 FMOV t1, c8 FMOV t1, c9 FMOV t1, c10 FMOV t1, c11 FMOV t1, c12 FMOV t1, c13 FMOV t1, c14 FMOV t1, c15 FMOV t1, c16 add A3, LDA, A4 FMOV t1, t2 mov 
XP, X1 FMOV t1, t3 add A4, LDA, A cmp I, 0 ble %icc, .LL115 FMOV t1, t4 LDF [A1 + 0 * SIZE], a1 nop LDF [A1 + 1 * SIZE], a2 add A1, 2 * SIZE, A1 LDF [A2 + 0 * SIZE], a3 LDF [A2 + 1 * SIZE], a4 add A2, 2 * SIZE, A2 LDF [A3 + 0 * SIZE], a5 LDF [A3 + 1 * SIZE], a6 add A3, 2 * SIZE, A3 LDF [A4 + 0 * SIZE], a7 LDF [A4 + 1 * SIZE], a8 add A4, 2 * SIZE, A4 LDF [X1 + 0 * SIZE], b1 nop LDF [X1 + 1 * SIZE], b2 nop LDF [X1 + 2 * SIZE], b3 add X1, 4 * SIZE, X1 deccc I ble .LL112 prefetch [Y1 + 7 * SIZE], 2#ifndef XCONJ#define FADDX FADD#else#define FADDX FSUB#endif.LL111: FADD c13, t1, c13 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a1, b1, t1 nop FADDX c14, t2, c14 nop FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 LDF [X1 - 1 * SIZE], b4 FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 0 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 1 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, b2, t2 LDF [A4 + 0 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c12, t4, c12 nop FMUL a8, b2, t4 LDF [A4 + 1 * SIZE], a8 FADD c13, t1, c13 nop FMUL a1, b3, t1 prefetch [A2 + PREFETCHSIZE * SIZE], 1 FADDX c14, t2, c14 nop FMUL a1, b4, t2 LDF [A1 + 2 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b3, t3 LDF [X1 + 1 * SIZE], b2 FADD c16, t4, c16 nop FMUL a2, b4, t4 LDF [A1 + 3 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b3, t1 nop FADDX c2, t2, c2 nop FMUL a3, b4, t2 LDF [A2 + 2 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b3, t3 nop FADD c4, t4, c4 nop FMUL a4, b4, t4 LDF [A2 + 3 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b3, t1 nop FADDX c6, t2, c6 nop FMUL a5, b4, 
t2 LDF [A3 + 2 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b3, t3 nop FADD c8, t4, c8 nop FMUL a6, b4, t4 LDF [A3 + 3 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b3, t1 nop FADDX c10, t2, c10 nop FMUL a7, b4, t2 LDF [A4 + 2 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b3, t3 LDF [X1 + 2 * SIZE], b3 FADD c12, t4, c12 nop FMUL a8, b4, t4 LDF [A4 + 3 * SIZE], a8 FADD c13, t1, c13 prefetch [A3 + PREFETCHSIZE * SIZE], 1 FMUL a1, b1, t1 nop FADDX c14, t2, c14 nop FMUL a1, b2, t2 LDF [A1 + 4 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 LDF [X1 + 3 * SIZE], b4 FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 5 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 4 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 5 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 4 * SIZE], a5 FADD c7, t3, c7 deccc I FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 5 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, b2, t2 LDF [A4 + 4 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 LDF [X1 + 4 * SIZE], b1 FADD c12, t4, c12 nop FMUL a8, b2, t4 LDF [A4 + 5 * SIZE], a8 FADD c13, t1, c13 prefetch [A4 + PREFETCHSIZE * SIZE], 1 FMUL a1, b3, t1 nop FADDX c14, t2, c14 nop FMUL a1, b4, t2 LDF [A1 + 6 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b3, t3 LDF [X1 + 5 * SIZE], b2 FADD c16, t4, c16 nop FMUL a2, b4, t4 LDF [A1 + 7 * SIZE], a2 FADD c1, t1, c1 add A1, 8 * SIZE, A1 FMUL a3, b3, t1 nop FADDX c2, t2, c2 nop FMUL a3, b4, t2 LDF [A2 + 6 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b3, t3 nop FADD c4, t4, c4 nop FMUL a4, b4, t4 LDF [A2 + 7 * SIZE], a4 FADD c5, t1, c5 add A2, 8 * SIZE, A2 FMUL a5, b3, t1 nop FADDX c6, t2, c6 nop FMUL a5, b4, t2 LDF [A3 + 6 * SIZE], a5 FADD c7, t3, c7 add A4, 8 * SIZE, A4 FMUL a6, b3, t3 nop FADD c8, t4, c8 nop FMUL a6, b4, t4 LDF [A3 + 7 * SIZE], a6 FADD c9, t1, c9 add A3, 8 * 
SIZE, A3 FMUL a7, b3, t1 nop FADDX c10, t2, c10 add X1, 8 * SIZE, X1 FMUL a7, b4, t2 LDF [A4 - 2 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b3, t3 LDF [X1 - 2 * SIZE], b3 FADD c12, t4, c12 FMUL a8, b4, t4 bg,pn %icc, .LL111 LDF [A4 - 1 * SIZE], a8.LL112: FADD c13, t1, c13 nop FMUL a1, b1, t1 LDF [X1 - 1 * SIZE], b4 FADDX c14, t2, c14 nop FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 LDF [X1 - 1 * SIZE], b4 FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 0 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 1 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, b2, t2 LDF [A4 + 0 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c12, t4, c12 nop FMUL a8, b2, t4 LDF [A4 + 1 * SIZE], a8 FADD c13, t1, c13 nop FMUL a1, b3, t1 LDF [X1 + 1 * SIZE], b2 FADDX c14, t2, c14 nop FMUL a1, b4, t2 LDF [A1 + 2 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b3, t3 nop FADD c16, t4, c16 nop FMUL a2, b4, t4 LDF [A1 + 3 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b3, t1 nop FADDX c2, t2, c2 nop FMUL a3, b4, t2 LDF [A2 + 2 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b3, t3 nop FADD c4, t4, c4 nop FMUL a4, b4, t4 LDF [A2 + 3 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b3, t1 nop FADDX c6, t2, c6 nop FMUL a5, b4, t2 LDF [A3 + 2 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b3, t3 nop FADD c8, t4, c8 nop FMUL a6, b4, t4 LDF [A3 + 3 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b3, t1 nop FADDX c10, t2, c10 nop FMUL a7, b4, t2 LDF [A4 + 2 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b3, t3 LDF [X1 + 2 * SIZE], b3 FADD c12, t4, c12 nop FMUL a8, b4, t4 LDF [A4 + 3 * SIZE], a8 FADD 
c13, t1, c13 nop FMUL a1, b1, t1 LDF [X1 + 3 * SIZE], b4 FADDX c14, t2, c14 add X1, 4 * SIZE, X1 FMUL a1, b2, t2 LDF [A1 + 4 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 nop FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 5 * SIZE], a2 FADD c1, t1, c1 add A1, 6 * SIZE, A1 FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 4 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 5 * SIZE], a4 FADD c5, t1, c5
/* NOTE(review): source is TRUNCATED at this point by the extraction — it ends mid-instruction
 * inside the .LL112 loop tail ("FADD c5, t1, c5"). The remainder of the .LL112/.LL11x cleanup
 * code, the single-column .LL200 path, the alpha scaling / store-back of the accumulated
 * results to Y, and the EPILOGUE are missing from this copy. Recover the complete file from
 * the upstream GotoBLAS/OpenBLAS distribution before attempting to assemble or modify it.
 * (The trailing lines removed here were website keyboard-shortcut help text, not source code.)
 */