📄 trsm_kernel_rt.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*//*********************************************************************/#define ASSEMBLER#include "common.h"#define M %i0#define N %i1#define K %i2#if defined(DOUBLE) && !defined(__64BIT__)#define A %i5#define B %i4#else#define A %i4#define B %i5#endif#define C %o4#define LDC %o5#define AO %l0#define BO %l1#define I %l2#define J %l3#define L %l4#define C1 %o0#define C2 %o1#define C3 %o2#define C4 %o3#define OFFSET %l5#define KK %l6#define TEMP1 %l7#define TEMP2 %i3#define AORIG %g1#ifdef DOUBLE#define c01 %f0#define c02 %f2#define c03 %f4#define c04 %f6#define c05 %f8#define c06 %f10#define c07 %f12#define c08 %f14#define c09 %f16#define c10 %f18#define c11 %f20#define c12 %f22#define c13 %f24#define c14 %f26#define c15 %f28#define c16 %f30#define t1 %f32#define t2 %f34#define t3 %f36#define t4 %f38#define a1 %f40#define a2 %f42#define a3 %f44#define a4 %f46#define a5 %f58#define b1 %f48#define b2 %f50#define b3 %f52#define b4 %f54#define b5 %f56#define FZERO %f60#define ALPHA %f62#else#define c01 %f0#define c02 %f1#define c03 %f2#define c04 %f3#define c05 %f4#define c06 %f5#define c07 %f6#define c08 %f7#define c09 %f8#define c10 %f9#define c11 %f10#define c12 %f11#define c13 %f12#define c14 %f13#define c15 %f14#define c16 %f15#define t1 %f16#define t2 %f17#define t3 %f18#define t4 %f19#define a1 %f20#define a2 %f21#define a3 %f22#define a4 %f23#define a5 %f31#define b1 %f24#define b2 %f25#define b3 %f26#define b4 %f27#define b5 %f28#define FZERO %f29#define ALPHA %f30#endif#define APREFETCHSIZE 40#define BPREFETCHSIZE 40#define APREFETCH_CATEGORY 0#define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP nop#ifndef __64BIT__#ifdef DOUBLE st %g0, [%fp + STACK_START + 8] st %g0, [%fp + STACK_START + 12] st %i3, [%fp + STACK_START + 16] /* ALPHA */ st %i4, [%fp + STACK_START + 20] ld [%fp + STACK_START + 28], B ld [%fp + STACK_START + 32], C ld [%fp + STACK_START + 36], LDC ld [%fp + STACK_START + 40], OFFSET#else st %g0, [%fp + STACK_START + 8] st %i3, [%fp + STACK_START + 16] 
/* ALPHA */ ld [%fp + STACK_START + 28], C ld [%fp + STACK_START + 32], LDC ld [%fp + STACK_START + 36], OFFSET#endif LDF [%fp + STACK_START + 8], FZERO LDF [%fp + STACK_START + 16], ALPHA#else#ifdef DOUBLE stx %g0, [%fp + STACK_START + 32] FMOV %f6, ALPHA nop ldd [%fp + STACK_START + 32], FZERO#else st %g0, [%fp + STACK_START + 32] FMOV %f7, ALPHA nop ld [%fp + STACK_START + 32], FZERO#endif ldx [%fp+ STACK_START + 56], C ldx [%fp+ STACK_START + 64], LDC ldx [%fp+ STACK_START + 72], OFFSET#endif sll LDC, BASE_SHIFT, LDC#ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C#endif#ifdef RN neg OFFSET, KK#endif#ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK#endif and N, 1, J cmp J, 0 ble,pn %icc, .LL100 nop#ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C#endif mov C, C1#ifdef LN add M, OFFSET, KK#endif#ifdef LT mov OFFSET, KK#endif#if defined(LN) || defined(RT) mov A, AORIG#else mov A, AO#endif#ifndef RT add C, LDC, C#endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL250 nop.LL221:#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL225 prefetch [C1 + 4 * SIZE], 2.LL222: FADD c01, t1, c01 add BO, 4 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 add L, -1, L FMUL a3, b1, t3 LDF [AO + 6 * SIZE], 
a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FADD c01, t1, c01 cmp L, 0 FMUL a1, b2, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [AO + 9 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b2, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 11 * SIZE], a4 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 12 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 13 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [AO + 14 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b3, t4 LDF [AO + 15 * SIZE], a4 LDF [BO + 2 * SIZE], b3 FADD c01, t1, c01 FMUL a1, b4, t1 LDF [AO + 16 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b4, t2 LDF [AO + 17 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 18 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 19 * SIZE], a4 add AO, 16 * SIZE, AO bg,pt %icc, .LL222 LDF [BO + 3 * SIZE], b4.LL225:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL229 nop.LL226: FADD c01, t1, c01 add BO, 1 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 add L, -1, L FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 cmp L, 0 FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO bg,pt %icc, .LL226 LDF [BO + 0 * SIZE], b1.LL229: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04#if defined(LN) || defined(RT)#ifdef LN sub KK, 4, TEMP1#else sub KK, 1, TEMP1#endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 
FSUB a4, c04, c04#endif#ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01#endif#ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04#endif#ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04#endif#ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04#endif#ifdef LN add C1, -4 * SIZE, C1#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 4 * SIZE, C1#endif#ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, 
AO add BO, TEMP1, BO#endif#ifdef LT add KK, 4, KK#endif#ifdef LN sub KK, 4, KK#endif add I, -1, I cmp I, 0 bg,pt %icc, .LL221 nop.LL250: and M, 2, I cmp I, 0 ble,pn %icc, .LL270 nop#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL255 nop.LL252: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 FADD c03, t3, c03 cmp L, 0 FMUL a3, b2, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 9 * SIZE], a2 LDF [BO + 6 * SIZE], b3 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL252 add BO, 4 * SIZE, BO.LL255:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL259 nop.LL256: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 2 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 FMUL a2, b1, t2 LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add AO, 2 * SIZE, AO bg,pt %icc, .LL256 add BO, 1 * SIZE, BO.LL259: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02#if defined(LN) || defined(RT)#ifdef LN sub KK, 2, TEMP1#else sub KK, 1, TEMP1#endif 
sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02#endif#ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01#endif#ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02#endif#ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02#endif#ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02#endif#ifdef LN add C1, -2 * SIZE, C1#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 2 * SIZE, C1#endif#ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 2, KK#endif#ifdef LN sub KK, 2, KK#endif.LL270: and M, 1, I cmp I, 0 ble,pn %icc, .LL299 nop#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 0 + BASE_SHIFT, TEMP1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -