📄 gemv_hummer_n.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h"#define M r3#define N r4#define A r6#define LDA r7#define X r8#define INCX r9#define Y r10#define INCY r5#define I r11#define J r12#define INCY2 r24#define A1 r25#define A2 r26#define A3 r27#define A4 r28#define YL r29#define YS r30#define INC2 r31#define yl1 f0#define yl2 f2#define yl3 f3#define yl4 f4#define ys1 f5#define ys2 f6#define ys3 f7#define ys4 f8#define yl5 f27#define ys5 f28#define alpha1 f9#define alpha2 f10#define a1 f11#define a2 f12#define a3 f13#define a4 f14#define a5 f15#define a6 f16#define a7 f17#define a8 f18#define a9 f19#define a10 f20#define a11 f21#define a12 f22#define a13 f23#define a14 f24#define a15 f25#define a16 f26#define alpha f1 PROLOGUE PROFCODE li r0, -16 lwz INCY, 8(SP) stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT fsmfp alpha, alpha cmpwi cr0, M, 0 ble- .L999 cmpwi cr0, N, 0 ble- .L999 add INCY2, INCY, INCY li INC2, 2 * SIZE sub X, X, INCX andi. r0, A, 2 * SIZE - 1# bne .L100# All cases for aligned A, even LDA cmpwi cr0, INCY, SIZE bne .L70 andi. r0, Y, 2 * SIZE - 1 bne .L40# A : aligned LDA : even Y : Unit Aligned sub A, A, INC2 sub Y, Y, INCY2 srawi. 
J, N, 2 ble .L20 .align 4.L11: LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A3, A2, LDA LFSDUX alpha1, X, INCX LFDUX alpha2, X, INCX add A4, A3, LDA add A, A4, LDA mr YL, Y LFSDUX alpha2, X, INCX fpmul alpha1, alpha, alpha1 mr YS, Y srawi. r0, M, 3 mtspr CTR, r0 fpmul alpha2, alpha, alpha2 ble .L15 LFPDUX yl1, YL, INCY2 LFPDUX yl2, YL, INCY2 LFPDUX yl3, YL, INCY2 LFPDUX yl4, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a10, A2, INC2 LFPDUX a14, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a11, A3, INC2 LFPDUX a15, A3, INC2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a8, A4, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a12, A4, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a16, A4, INC2 fxcpmadd ys4, alpha1, a13, yl4 bdz .L13 .align 4.L12: LFPDUX yl1, YL, INCY2 fxcsmadd ys1, alpha1, a2, ys1 LFPDUX a1, A1, INC2 fxcsmadd ys2, alpha1, a6, ys2 LFPDUX a5, A1, INC2 fxcsmadd ys3, alpha1, a10, ys3 LFPDUX a9, A1, INC2 fxcsmadd ys4, alpha1, a14, ys4 LFPDUX a13, A1, INC2 LFPDUX yl2, YL, INCY2 fxcpmadd ys1, alpha2, a3, ys1 LFPDUX a2, A2, INC2 fxcpmadd ys2, alpha2, a7, ys2 LFPDUX a6, A2, INC2 fxcpmadd ys3, alpha2, a11, ys3 LFPDUX a10, A2, INC2 fxcpmadd ys4, alpha2, a15, ys4 LFPDUX a14, A2, INC2 LFPDUX yl3, YL, INCY2 fxcsmadd ys1, alpha2, a4, ys1 LFPDUX a3, A3, INC2 fxcsmadd ys2, alpha2, a8, ys2 LFPDUX a7, A3, INC2 fxcsmadd ys3, alpha2, a12, ys3 LFPDUX a11, A3, INC2 fxcsmadd ys4, alpha2, a16, ys4 LFPDUX a15, A3, INC2 LFPDUX yl4, YL, INCY2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a8, A4, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a12, A4, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a16, A4, INC2 fxcpmadd ys4, alpha1, a13, yl4 bdnz .L12 .align 4.L13: fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcsmadd ys3, alpha1, a10, ys3 fxcsmadd ys4, alpha1, a14, 
ys4 fxcpmadd ys1, alpha2, a3, ys1 fxcpmadd ys2, alpha2, a7, ys2 fxcpmadd ys3, alpha2, a11, ys3 fxcpmadd ys4, alpha2, a15, ys4 fxcsmadd ys1, alpha2, a4, ys1 fxcsmadd ys2, alpha2, a8, ys2 fxcsmadd ys3, alpha2, a12, ys3 fxcsmadd ys4, alpha2, a16, ys4 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 .align 4.L15: andi. r0, M, 7 ble .L19 andi. r0, M, 4 ble .L17 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a4, A4, INC2 LFPDUX a8, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcpmadd ys1, alpha2, a3, ys1 fxcpmadd ys2, alpha2, a7, ys2 fxcsmadd ys1, alpha2, a4, ys1 fxcsmadd ys2, alpha2, a8, ys2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 .align 4.L17: andi. r0, M, 2 ble .L18 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys1, alpha2, a3, ys1 fxcsmadd ys1, alpha2, a4, ys1 STFPDUX ys1, YS, INCY2 .align 4.L18: andi. r0, M, 1 ble .L19 LFDUX yl1, YL, INCY2 LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 LFDUX a3, A3, INC2 LFDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys1, alpha2, a3, ys1 fxcsmadd ys1, alpha2, a4, ys1 STFDUX ys1, YS, INCY2 .align 4.L19: addi J, J, -1 cmpi cr0, 0, J, 0 bgt .L11 .align 4 .L20: andi. J, N, 2 ble .L30 LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A, A2, LDA LFSDUX alpha1, X, INCX mr YL, Y mr YS, Y fpmul alpha1, alpha, alpha1 srawi. 
r0, M, 3 mtspr CTR, r0 ble .L25 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX yl3, YL, INCY2 LFPDUX a9, A1, INC2 LFPDUX yl4, YL, INCY2 LFPDUX a13, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a10, A2, INC2 LFPDUX a14, A2, INC2 bdz .L23 .align 4.L22: fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a1, A1, INC2 LFPDUX yl1, YL, INCY2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a5, A1, INC2 LFPDUX yl2, YL, INCY2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a9, A1, INC2 LFPDUX yl3, YL, INCY2 fxcpmadd ys4, alpha1, a13, yl4 LFPDUX a13, A1, INC2 LFPDUX yl4, YL, INCY2 fxcsmadd ys1, alpha1, a2, ys1 LFPDUX a2, A2, INC2 fxcsmadd ys2, alpha1, a6, ys2 LFPDUX a6, A2, INC2 fxcsmadd ys3, alpha1, a10, ys3 LFPDUX a10, A2, INC2 fxcsmadd ys4, alpha1, a14, ys4 LFPDUX a14, A2, INC2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 bdnz .L22 .align 4.L23: fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcpmadd ys3, alpha1, a9, yl3 fxcpmadd ys4, alpha1, a13, yl4 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcsmadd ys3, alpha1, a10, ys3 fxcsmadd ys4, alpha1, a14, ys4 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 .align 4.L25: andi. r0, M, 7 ble .L30 andi. r0, M, 4 ble .L27 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX a6, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys2, alpha1, a5, yl2 fxcsmadd ys2, alpha1, a6, ys2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 .align 4.L27: andi. r0, M, 2 ble .L28 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 STFPDUX ys1, YS, INCY2 .align 4.L28: andi. r0, M, 1 ble .L30 LFDUX yl1, YL, INCY2 LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 STFDUX ys1, YS, INCY2 .align 4.L30: andi. 
J, N, 1 ble .L999 LFDUX alpha1, X, INCX mr A1, A mr YL, Y mr YS, Y fmul alpha1, alpha, alpha1 srawi. r0, M, 3 mtspr CTR, r0 ble .L35 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX yl3, YL, INCY2 LFPDUX a9, A1, INC2 LFPDUX yl4, YL, INCY2 LFPDUX a13, A1, INC2 bdz .L33 .align 4.L32: fxcpmadd ys1, alpha1, a1, yl1 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX yl3, YL, INCY2 LFPDUX a9, A1, INC2 fxcpmadd ys4, alpha1, a13, yl4 LFPDUX yl4, YL, INCY2 LFPDUX a13, A1, INC2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 bdnz .L32 .align 4.L33: fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcpmadd ys3, alpha1, a9, yl3 fxcpmadd ys4, alpha1, a13, yl4 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 .align 4.L35: andi. r0, M, 7 ble .L999 andi. r0, M, 4 ble .L37 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 .align 4.L37: andi. r0, M, 2 ble .L38 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 fxcpmadd ys1, alpha1, a1, yl1 STFPDUX ys1, YS, INCY2 .align 4.L38:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -