📄 trsm_kernel_hummer_rt.s
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h" #define ALPHA 0#define FZERO 8#define M r3#define N r4#define K r5#ifdef linux#define A r6#define B r7#define C r8#define LDC r9#define OFFSET r10#endif#define TEMP r11#define AORIG r12#define KK r14#define INCM1 r15#define INCM4 r16#define INCM2 r17#define INC2 r19#define INC r20#define INC4 r21#define I r22#define J r23#define AO r24#define BO r25#define AO2 r26#define BO2 r27 #define CO1 r28#define CO2 r29#define CO3 r30#define CO4 r31#ifndef NEEDPARAM#define A1 f16#define A2 f17#define A3 f18#define A4 f19#define A5 f20#define A6 f21#define A7 f22#define A8 f23#define A9 f24#define A10 f25#define B1 f26#define B2 f27#define B3 f28#define B4 f29#define B5 f30#define B6 f31#define AP B6 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) # dummy li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f1, -8(SP) slwi LDC, LDC, BASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM2, -2 * SIZE li INCM4, -4 * SIZE addi C, C, - 1 * SIZE#ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0#endif#ifdef RN neg KK, OFFSET#endif#ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, 
r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET#endif andi. J, N, 1 beq .L50#ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC#endif mr CO1, C#ifdef LN add KK, M, OFFSET#endif#ifdef LT mr KK, OFFSET#endif#if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE#else addi AO, A, -2 * SIZE#endif#ifndef RT add C, CO1, LDC#endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L100 .align 4.L91:#if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L94#else#ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0#endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L94#endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L93 .align 4.L92: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B2, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 bdnz+ .L92 .align 4.L93: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 
fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B2, A1, f0 fxcpmadd f1, B2, A2, f1 fxcpmadd f2, B2, A3, f2 fxcpmadd f3, B2, A4, f3 fxcsmadd f0, B2, A5, f0 fxcsmadd f1, B2, A6, f1 fxcsmadd f2, B2, A7, f2 fxcsmadd f3, B2, A8, f3 .align 4.L94:#if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L98#else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L98#endif LFDX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdz- .L97 .align 4.L96: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFDX B1, BO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdnz+ .L96 .align 4.L97: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 fxcpmadd f2, B1, A3, f2 fxcpmadd f3, B1, A4, f3 .align 4.L98:#if defined(LN) || defined(RT)#ifdef LN subi r0, KK, 8#else subi r0, KK, 1#endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE#endif#if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3#else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3#endif#ifdef LN fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 63) * SIZE(AO) LFD A2, (2 + 62) * SIZE(AO) LFD A3, (2 + 61) * SIZE(AO) LFD A4, (2 + 60) * SIZE(AO) LFD A5, (2 + 59) * SIZE(AO) LFD A6, (2 + 58) * SIZE(AO) LFD A7, (2 + 57) * SIZE(AO) LFD A8, (2 + 56) * SIZE(AO) fmul f7, A1, f7 fnmsub f3, A2, f7, f3 fnmsub f6, A3, f7, f6 fnmsub f2, A4, f7, f2 fnmsub f5, A5, f7, f5 fnmsub f1, A6, f7, f1 
fnmsub f4, A7, f7, f4 fnmsub f0, A8, f7, f0 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 53) * SIZE(AO) LFD A3, (2 + 52) * SIZE(AO) LFD A4, (2 + 51) * SIZE(AO) LFD A5, (2 + 50) * SIZE(AO) LFD A6, (2 + 49) * SIZE(AO) LFD A7, (2 + 48) * SIZE(AO) fmul f3, A1, f3 fnmsub f6, A2, f3, f6 fnmsub f2, A3, f3, f2 fnmsub f5, A4, f3, f5 fnmsub f1, A5, f3, f1 fnmsub f4, A6, f3, f4 fnmsub f0, A7, f3, f0 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 44) * SIZE(AO) LFD A3, (2 + 43) * SIZE(AO) LFD A4, (2 + 42) * SIZE(AO) LFD A5, (2 + 41) * SIZE(AO) LFD A6, (2 + 40) * SIZE(AO) fmul f6, A1, f6 fnmsub f2, A2, f6, f2 fnmsub f5, A3, f6, f5 fnmsub f1, A4, f6, f1 fnmsub f4, A5, f6, f4 fnmsub f0, A6, f6, f0 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 35) * SIZE(AO) LFD A3, (2 + 34) * SIZE(AO) LFD A4, (2 + 33) * SIZE(AO) LFD A5, (2 + 32) * SIZE(AO) fmul f2, A1, f2 fnmsub f5, A2, f2, f5 fnmsub f1, A3, f2, f1 fnmsub f4, A4, f2, f4 fnmsub f0, A5, f2, f0 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 26) * SIZE(AO) LFD A3, (2 + 25) * SIZE(AO) LFD A4, (2 + 24) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 17) * SIZE(AO) LFD A3, (2 + 16) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 8) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif#ifdef LT fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) LFD A5, (2 + 4) * SIZE(AO) LFD A6, (2 + 5) * SIZE(AO) LFD A7, (2 + 6) * SIZE(AO) LFD A8, (2 + 7) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 fnmsub f2, A5, f0, f2 fnmsub f6, A6, f0, f6 fnmsub f3, A7, f0, f3 fnmsub f7, A8, f0, f7 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 10) * SIZE(AO) LFD A3, (2 + 11) * SIZE(AO) LFD A4, (2 
+ 12) * SIZE(AO) LFD A5, (2 + 13) * SIZE(AO) LFD A6, (2 + 14) * SIZE(AO) LFD A7, (2 + 15) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 fnmsub f2, A4, f4, f2 fnmsub f6, A5, f4, f6 fnmsub f3, A6, f4, f3 fnmsub f7, A7, f4, f7 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 19) * SIZE(AO) LFD A3, (2 + 20) * SIZE(AO) LFD A4, (2 + 21) * SIZE(AO) LFD A5, (2 + 22) * SIZE(AO) LFD A6, (2 + 23) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 fnmsub f2, A3, f1, f2 fnmsub f6, A4, f1, f6 fnmsub f3, A5, f1, f3 fnmsub f7, A6, f1, f7 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 28) * SIZE(AO) LFD A3, (2 + 29) * SIZE(AO) LFD A4, (2 + 30) * SIZE(AO) LFD A5, (2 + 31) * SIZE(AO) fmul f5, A1, f5 fnmsub f2, A2, f5, f2 fnmsub f6, A3, f5, f6 fnmsub f3, A4, f5, f3 fnmsub f7, A5, f5, f7 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 37) * SIZE(AO) LFD A3, (2 + 38) * SIZE(AO) LFD A4, (2 + 39) * SIZE(AO) fmul f2, A1, f2 fnmsub f6, A2, f2, f6 fnmsub f3, A3, f2, f3 fnmsub f7, A4, f2, f7 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 46) * SIZE(AO) LFD A3, (2 + 47) * SIZE(AO) fmul f6, A1, f6 fnmsub f3, A2, f6, f3 fnmsub f7, A3, f6, f7 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 55) * SIZE(AO) fmul f3, A1, f3 fnmsub f7, A2, f3, f7 LFD A1, (2 + 63) * SIZE(AO) fmul f7, A1, f7 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif#ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3#endif#ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3#endif#ifdef LN subi CO1, CO1, 8 * SIZE#endif#if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC#else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, 
INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC#endif#ifdef LN subi CO1, CO1, 8 * SIZE#endif#ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0#endif#if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP#endif#ifdef LT addi KK, KK, 8#endif#ifdef LN subi KK, KK, 8#endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L91 .align 4.L100: andi. I, M, 4 beq .L110#if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L104#else#ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0#endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L104#endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L103 .align 4.L102: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B3, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B3, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX A5, AO, INC2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -