📄 trsm_kernel_lt.s
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*     By Kazushige Goto <kgoto@tacc.utexas.edu>                     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind. 
*//*********************************************************************/#define ASSEMBLER#include "common.h"#define M %i0#define N %i1#define K %i2#if defined(DOUBLE) && !defined(__64BIT__)#define A %i5#define B %i4#else#define A %i4#define B %i5#endif#define C %o4#define LDC %o5#define AO %l0#define BO %l1#define I %l2#define J %l3#define L %l4#define C1 %o0#define C2 %o1#define C3 %o2#define C4 %o3#define OFFSET %l5#define KK %l6#define TEMP1 %l7#define TEMP2 %i3#define AORIG %g1#ifdef DOUBLE#define c01 %f0#define c02 %f2#define c03 %f4#define c04 %f6#define c05 %f8#define c06 %f10#define c07 %f12#define c08 %f14#define c09 %f16#define c10 %f18#define c11 %f20#define c12 %f22#define c13 %f24#define c14 %f26#define c15 %f28#define c16 %f30#define t1 %f32#define t2 %f34#define t3 %f36#define t4 %f38#define a1 %f40#define a2 %f42#define a3 %f44#define a4 %f46#define a5 %f58#define b1 %f48#define b2 %f50#define b3 %f52#define b4 %f54#define b5 %f56#define FZERO %f60#else#define c01 %f0#define c02 %f1#define c03 %f2#define c04 %f3#define c05 %f4#define c06 %f5#define c07 %f6#define c08 %f7#define c09 %f8#define c10 %f9#define c11 %f10#define c12 %f11#define c13 %f12#define c14 %f13#define c15 %f14#define c16 %f15#define t1 %f16#define t2 %f17#define t3 %f18#define t4 %f19#define a1 %f20#define a2 %f21#define a3 %f22#define a4 %f23#define a5 %f31#define b1 %f24#define b2 %f25#define b3 %f26#define b4 %f27#define b5 %f28#define FZERO %f29#endif PROLOGUE SAVESP nop#ifndef __64BIT__#ifdef DOUBLE st %g0, [%fp + STACK_START + 8] st %g0, [%fp + STACK_START + 12] ld [%fp + STACK_START + 28], B ld [%fp + STACK_START + 32], C ld [%fp + STACK_START + 36], LDC ld [%fp + STACK_START + 40], OFFSET#else st %g0, [%fp + STACK_START + 8] ld [%fp + STACK_START + 28], C ld [%fp + STACK_START + 32], LDC ld [%fp + STACK_START + 36], OFFSET#endif LDF [%fp + STACK_START + 8], FZERO#else#ifdef DOUBLE stx %g0, [%fp + STACK_START + 32] nop ldd [%fp + STACK_START + 32], FZERO#else st %g0, [%fp 
+ STACK_START + 32] nop ld [%fp + STACK_START + 32], FZERO#endif ldx [%fp+ STACK_START + 56], C ldx [%fp+ STACK_START + 64], LDC ldx [%fp+ STACK_START + 72], OFFSET#endif sll LDC, BASE_SHIFT, LDC#ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C#endif#ifdef RN neg OFFSET, KK#endif#ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK#endif sra N, 2, J cmp J, 0 ble,pn %icc, .LL100 nop.LL11:#ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 2, TEMP1 sub C, TEMP1, C#endif add C, LDC, C2 FMOV FZERO, t1 nop mov C, C1 add C2, LDC, C3 FMOV FZERO, t2 sra M, 2, I add C3, LDC, C4 FMOV FZERO, t3#ifdef LN add M, OFFSET, KK#endif#ifdef LT mov OFFSET, KK#endif#if defined(LN) || defined(RT) mov A, AORIG#else mov A, AO#endif cmp I, 0#ifndef RT add C4, LDC, C#endif FMOV FZERO, t4 ble,pn %icc, .LL50 FMOV FZERO, c01.LL21: FMOV FZERO, c02 FMOV FZERO, c03#if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0#else#ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG#endif sll KK, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0#endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c04 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c05 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c06 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c07 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c08 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c09 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c10 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c11 LDF [BO + 4 * SIZE], b5 /* ***** */ LDF [AO + 4 * SIZE], a5 /* ***** */ prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C3 + 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 + 3 * SIZE], 3 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16#define APREFETCHSIZE 40#define BPREFETCHSIZE 40#define APREFETCH_CATEGORY 0#define BPREFETCH_CATEGORY 0.LL22: FADD c04, t1, c04 prefetch [AO + 
APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD c08, t2, c08 nop FMUL a5, b2, t2 nop FADD c12, t3, c12 nop FMUL a5, b3, t3 nop FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c08, t2, c08 nop FMUL a1, b2, t2 nop FADD c12, t3, c12 nop FMUL a1, b3, t3 nop FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD 
c01, t1, c01 nop FMUL a2, b1, t1 nop#ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY#else nop#endif FADD c05, t2, c05 nop FMUL a2, b2, t2 FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 nop FADD c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c06, t2, c06#ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY#else nop#endif FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD c08, t2, c08 FMUL a5, b2, t2 FADD c12, t3, c12 FMUL a5, b3, t3 FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4.LL25:#if defined(LT) || defined(RN) and KK, 3, L#else and TEMP1, 3, L#endif cmp L, 0 ble,a,pn %icc, .LL29 nop.LL26: FADD c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 
FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4.LL29:#if defined(LN) || defined(RT) sub KK, 4, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO#endif FADD c04, t1, c04 FADD c08, t2, c08 FADD c12, t3, c12 FADD c16, t4, c16#if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c09, c09 FSUB a4, c13, c13 FSUB b1, c02, c02 FSUB b2, c06, c06 FSUB b3, c10, c10 FSUB b4, c14, c14 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c03, c03 FSUB a2, c07, c07 FSUB a3, c11, c11 FSUB a4, c15, c15 FSUB b1, c04, c04 FSUB b2, c08, c08 FSUB b3, c12, c12 FSUB b4, c16, c16#else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 
* SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16#endif#ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 FMUL a2, c04, t1 FMUL a2, c08, t2 FMUL a2, c12, t3 FMUL a2, c16, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c04, t1 FMUL a3, c08, t2 FMUL a3, c12, t3 FMUL a3, c16, t4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -