📄 ztrsm_kernel_hummer_lt.s
字号:
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. *//*********************************************************************/#define ASSEMBLER#include "common.h" #undef ZERO#define ALPHA 0#define FZERO 16#define M r3#define N r4#define K r5#ifdef linux#define A r6#define B r7#define C r8#define LDC r9#define OFFSET r10#endif#define TEMP r11#define AORIG r12#define KK r14#define INCM1 r15#define INCM3 r16#define INCM5 r17#define INCM7 r18#define INC2 r19#define INC r20#define INC4 r21#define I r22#define J r23#define AO r24#define BO r25#define AO2 r26#define BO2 r27 #define CO1 r28#define CO2 r29#define ZERO r31#ifndef NEEDPARAM#define A1 f16#define A2 f17#define A3 f18#define A4 f19#define A5 f20#define A6 f21#define A7 f22#define A8 f23#define A9 f24#define A10 f25#define B1 f26#define B2 f27#define B3 f28#define B4 f29#define B5 f30#define B6 f31#define AP B6#ifndef CONJ#define FXCPMADD fxcpmadd#define FXCSMADD fxcxnpma#else#if defined(LN) || defined(LT)#define FXCPMADD fxcpnsma#define FXCSMADD fxcxma#else#define FXCPMADD fxcpmadd#define FXCSMADD fxcxnsma#endif#endif#ifndef CONJ#define FXCXNPMA fxcxnpma#define FXCXNSMA fxcxnsma#else#define FXCXNPMA fxcxnsma#define FXCXNSMA fxcxnpma#endif PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f2, -8(SP) stfdu f1, -8(SP) slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0#endif#ifdef RN neg KK, OFFSET#endif#ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET#endif srawi. J, N, 1 ble .L50 .align 4.L10:#ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0#endif mr CO1, C add CO2, C, LDC#ifdef LN add KK, M, OFFSET#endif#ifdef LT mr KK, OFFSET#endif#if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE#else addi AO, A, -4 * SIZE#endif#ifndef RT add C, CO2, LDC#endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L20 .align 4.L11:#if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 srawi. r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14#else#ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0#endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14#endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4.L12:## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A1, AO, INC4 FXCSMADD f15, B2, A4, f15 nop## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 LFPDUX B1, BO, INC4 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A3, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 LFPDUX A5, AO, INC4 FXCSMADD f15, B4, A4, f15 nop## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 LFPDUX B3, BO, INC4 FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A6, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A7, AO, INC4 FXCSMADD f15, B2, A4, f15 nop## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A9, f12 LFPDUX B5, BO, INC4 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A8, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 LFPDUX A2, AO2, INC4 FXCSMADD f7, B6, A4, f7 LFPDUX A9, AO, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 bdnz+ .L12 .align 4.L13:## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 nop FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 nop FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 nop FXCSMADD f12, B4, A9, f12 nop FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 nop FXCSMADD f7, B6, A4, f7 nop FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop .align 4.L14:#if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18#else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L18#endif.L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4.L16: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 LFPDUX A2, AO, INC4 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 LFPDUX A10, BO, INC4 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4.L17: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4.L18: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 fpadd f2, f2, f6 fpadd f10, f10, f14 fpadd f3, f3, f7 fpadd f11, f11, f15#if defined(LN) || defined(RT)#ifdef LN subi r0, KK, 4#else subi r0, KK, 2#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -