📄 ztrsm_kernel_ln.s
字号:
/*********************************************************************/
/* */
/* Optimized BLAS libraries */
/* By Kazushige Goto <kgoto@tacc.utexas.edu> */
/* */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of */
/* profits, interruption of business, or related expenses which may */
/* arise from use of Software or Documentation, including but not */
/* limited to those resulting from defects in Software and/or */
/* Documentation, or loss or inaccuracy of data of any kind. 
*//*********************************************************************/

/*
 * Symbolic names for the SPARC registers used by this kernel, followed by
 * the first few instructions of the kernel entry.  Everything the kernel
 * does is expressed through these names, so this block is the key to
 * reading the code that follows.
 */
#define ASSEMBLER
#include "common.h"

/*
 * Integer inputs.  Loop trip counts are derived from these further down
 * (N >> 1 -> J, M & 1 and M >> 1 -> I, K / KK based counts -> L).
 * NOTE(review): presumably the matrix dimensions of the TRSM update;
 * confirm against the caller.
 */
#define M %i0
#define N %i1
#define K %i2

/*
 * Base addresses of the three operands.  Depending on the build they are
 * either used straight from the incoming registers or reloaded from the
 * stack frame by the prologue.
 */
#define A %i5
#define B %i3
#define C %i4

/* Stride between C1 and C2; scaled by ZBASE_SHIFT right after the prologue. */
#define LDC %o0

/* Running read pointers into A and B (targets of the LDF loads). */
#define AO %o1
#define BO %o2

/*
 * Loop counters:
 *   I - row-block counter (M & 1 for the single-row pass, M >> 1 after it)
 *   J - column-pair counter (N >> 1)
 *   L - inner-loop trip count (count >> 2 for the unrolled loop, count & 3
 *       for the remainder loop)
 */
#define I %o3
#define J %o4
#define L %o5

/* Output pointers for the current column pair: C1 = C, C2 = C + LDC. */
#define C1 %l0
#define C2 %l1

/*
 * OFFSET is loaded from the stack frame by the prologue; KK is derived from
 * it per variant (LN: M + OFFSET, LT: OFFSET, RN: -OFFSET, RT: N - OFFSET)
 * and determines how much of K each inner loop covers.
 */
#define OFFSET %l2
#define KK %l3

/* Scratch registers for address and count arithmetic. */
#define TEMP1 %l4
#define TEMP2 %l5

/* LN/RT only: copy of A from which AO is recomputed for each block. */
#define AORIG %l6

/*
 * Floating-point register map.  The DOUBLE build assigns even-numbered %f
 * registers; the other build assigns consecutive ones.
 *   c01-c16 : accumulators (cleared from FZERO, updated via FADD1..FADD4)
 *   t1-t4   : FMUL products waiting to be accumulated
 *   a1-a5   : values loaded from AO
 *   b1-b5   : values loaded from BO
 *   FZERO   : constant zero, loaded from a zeroed stack slot in the prologue
 */
#ifdef DOUBLE
#define c01 %f0
#define c02 %f2
#define c03 %f4
#define c04 %f6
#define c05 %f8
#define c06 %f10
#define c07 %f12
#define c08 %f14
#define c09 %f16
#define c10 %f18
#define c11 %f20
#define c12 %f22
#define c13 %f24
#define c14 %f26
#define c15 %f28
#define c16 %f30
#define t1 %f32
#define t2 %f34
#define t3 %f36
#define t4 %f38
#define a1 %f40
#define a2 %f42
#define a3 %f44
#define a4 %f46
#define a5 %f62
#define b1 %f48
#define b2 %f50
#define b3 %f52
#define b4 %f54
#define b5 %f56
#define FZERO %f58
#else
#define c01 %f0
#define c02 %f1
#define c03 %f2
#define c04 %f3
#define c05 %f4
#define c06 %f5
#define c07 %f6
#define c08 %f7
#define c09 %f8
#define c10 %f9
#define c11 %f10
#define c12 %f11
#define c13 %f12
#define c14 %f13
#define c15 %f14
#define c16 %f15
#define t1 %f16
#define t2 %f17
#define t3 %f18
#define t4 %f19
#define a1 %f20
#define a2 %f21
#define a3 %f22
#define a4 %f23
#define a5 %f31
#define b1 %f24
#define b2 %f25
#define b3 %f26
#define b4 %f27
#define b5 %f28
#define FZERO %f29
#endif

/*
 * Extra temporaries t5-t8 share registers with accumulators c13-c16, so a
 * code path may use one set or the other but not both at once.
 */
#define t5 c13
#define t6 c14
#define t7 c15
#define t8 c16

/*
 * FADD1..FADD4 expand to FADD or FSUB and are used wherever FMUL products
 * are folded into the accumulators.  Which of them subtract depends on CONJ
 * and on the variant being built (LN/LT versus RN/RT).
 * NOTE(review): presumably this selects the signs of the complex
 * multiply-accumulate for the conjugated and non-conjugated cases; confirm.
 * With CONJ defined and none of LN/LT/RN/RT defined, FADD1..FADD4 are left
 * undefined.
 */
#ifndef CONJ
#define FADD1 FADD
#define FADD2 FADD
#define FADD3 FADD
#define FADD4 FSUB
#else
#if defined(LN) || defined(LT)
#define FADD1 FADD
#define FADD2 FSUB
#define FADD3 FADD
#define FADD4 FADD
#endif
#if defined(RN) || defined(RT)
#define FADD1 FADD
#define FADD2 FADD
#define FADD3 FSUB
#define FADD4 FADD
#endif
#endif

/*
 * Prefetch tuning: distance ahead of AO / BO, in units of SIZE, and the
 * function code passed as the second operand of the prefetch instruction.
 */
#define APREFETCHSIZE 40
#define BPREFETCHSIZE 40
#define APREFETCH_CATEGORY 0
#define BPREFETCH_CATEGORY 0

/*
 * Kernel entry.  PROLOGUE, SAVESP and STACK_START are not defined in this
 * file; presumably they come from common.h.
 */
	PROLOGUE
	SAVESP

#ifndef __64BIT__
#ifdef DOUBLE
/*
 * 32-bit DOUBLE build: write zero words at STACK_START + 8 and + 12, then
 * store %i3 and %i4 into the frame.  The final store below continues on the
 * next source line.
 */
	st %g0, [%fp + STACK_START + 8]
	st %g0, [%fp + STACK_START + 12]
	nop
	st %i3, [%fp + STACK_START + 16]
	nop
	st %i4, [%fp + 
STACK_START + 20] nop st %i5, [%fp + STACK_START + 24] ld [%fp + STACK_START + 32], A ld [%fp + STACK_START + 36], B ld [%fp + STACK_START + 40], C ld [%fp + STACK_START + 44], LDC ld [%fp + STACK_START + 48], OFFSET ldd [%fp + STACK_START + 8], FZERO#else st %g0, [%fp + STACK_START + 8] nop st %i3, [%fp + STACK_START + 16] nop st %i4, [%fp + STACK_START + 20] ld [%fp + STACK_START + 28], B ld [%fp + STACK_START + 32], C ld [%fp + STACK_START + 36], LDC ld [%fp + STACK_START + 40], OFFSET ld [%fp + STACK_START + 8], FZERO#endif#else#ifdef DOUBLE stx %g0, [%fp + STACK_START + 32]#else st %g0, [%fp + STACK_START + 32]#endif ldx [%fp+ STACK_START + 56], B nop ldx [%fp+ STACK_START + 64], C nop ldx [%fp+ STACK_START + 72], LDC ldx [%fp+ STACK_START + 80], OFFSET LDF [%fp + STACK_START + 32], FZERO#endif sll LDC, ZBASE_SHIFT, LDC#ifdef LN smul M, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add A, TEMP1, A sll M, ZBASE_SHIFT, TEMP1 add C, TEMP1, C#endif#ifdef RN neg OFFSET, KK#endif#ifdef RT smul N, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK#endif sra N, 1, J cmp J, 0 ble,pn %icc, .LL100 nop.LL11:#ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B add LDC, LDC, TEMP1 sub C, TEMP1, C#endif mov C, C1 add C, LDC, C2#ifdef LN add M, OFFSET, KK#endif#ifdef LT mov OFFSET, KK#endif#if defined(LN) || defined(RT) mov A, AORIG#else mov A, AO#endif#ifndef RT add C2, LDC, C#endif and M, 1, I cmp I, 0 ble,pn %icc, .LL50 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 0 + ZBASE_SHIFT, TEMP1 sll KK, 1 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif FMOV FZERO, c02 FMOV FZERO, t1 FMOV FZERO, c04 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * 
SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07.LL52: FADD2 c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD4 c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD2 c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD4 c04, t2, c04 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * 
SIZE], a4.LL55:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL59 nop.LL56: FADD2 c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD4 c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2.LL59:#if defined(LN) || defined(RT)#ifdef LN sub KK, 1, TEMP1#else sub KK, 2, TEMP1#endif sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO#endif FADD2 c02, t1, c02 FADD4 c04, t2, c04 FADD2 c06, t3, c06 FADD4 c08, t4, c08 FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06#endif#ifdef LN LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06#endif#ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06#endif#ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * 
SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t3 FMUL a4, c01, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FADD3 c05, t3, c05 FADD4 c06, t4, c06 FMUL b1, c05, t1 FMUL b2, c06, t2 FMUL b1, c06, t3 FMUL b2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06#endif#ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c05, t1 FMUL a2, c06, t2 FMUL a1, c06, t3 FMUL a2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a4, c06, t3 FMUL a4, c05, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD3 c01, t3, c01 FADD4 c02, t4, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02#endif#ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2#endif#if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE]#else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE]#endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4#ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2#endif#ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO#endif#ifdef LT add KK, 1, KK#endif#ifdef LN sub KK, 1, KK#endif.LL50: sra M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop.LL21:#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, 
TEMP1 sra TEMP1, 2, L cmp L, 0#endif FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 FMOV FZERO, c01 FMOV FZERO, c02 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c03 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c04 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c05 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c06 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c07 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c09 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c10 LDF [BO + 4 * SIZE], b5 FMOV FZERO, c11 LDF [AO + 4 * SIZE], a5 FMOV FZERO, c12#ifdef LN prefetch [C1 - 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 - 3 * SIZE], 3 FMOV FZERO, c14#else prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c14#endif FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16.LL22: FADD2 c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD4 c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD2 c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -