⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_hummer_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h"#define M	r3#define	N	r4#define A	r6#define LDA	r7#define X	r8#define	INCX	r9#define	Y	r10#define	INCY	r5#define I	r11#define	J	r12#define INCY2	r24#define A1	r25#define A2	r26#define A3	r27#define A4	r28#define YL	r29#define	YS	r30#define INC2	r31#define yl1 f0#define yl2 f2#define yl3 f3#define yl4 f4#define ys1 f5#define ys2 f6#define ys3 f7#define ys4 f8#define	yl5 f27#define ys5 f28#define alpha1 f9#define alpha2 f10#define a1     f11#define a2     f12#define a3     f13#define a4     f14#define a5     f15#define a6     f16#define a7     f17#define a8     f18#define a9     f19#define a10    f20#define a11    f21#define a12    f22#define a13    f23#define a14    f24#define a15    f25#define a16    f26#define alpha  f1	PROLOGUE	PROFCODE	li	r0, -16	lwz	INCY,      8(SP)	stfpdux	f14, SP, r0	stfpdux	f15, SP, r0	stfpdux	f16, SP, r0	stfpdux	f17, SP, r0	stfpdux	f18, SP, r0	stfpdux	f19, SP, r0	stfpdux	f20, SP, r0	stfpdux	f21, SP, r0	stfpdux	f22, SP, r0	stfpdux	f23, SP, r0	stfpdux	f24, SP, r0	stfpdux	f25, SP, r0	stfpdux	f26, SP, r0	stfpdux	f27, SP, r0	stfpdux	f28, SP, r0	stfpdux	f29, SP, r0	stfpdux	f30, SP, r0	stfpdux	f31, SP, r0		stwu	r31,  -4(SP)	stwu	r30,  -4(SP)	stwu	r29,  -4(SP)	stwu	r28,  -4(SP)	stwu	r27,  -4(SP)	stwu	r26,  -4(SP)	stwu	r25,  -4(SP)	stwu	r24,  -4(SP)	stwu	r23,  -4(SP)	stwu	r22,  -4(SP)	stwu	r21,  -4(SP)	stwu	r20,  -4(SP)	stwu	r19,  -4(SP)	stwu	r18,  -4(SP)	stwu	r17,  -4(SP)	stwu	r16,  -4(SP)	slwi	LDA,  LDA,  BASE_SHIFT	slwi	INCX, INCX, BASE_SHIFT	slwi	INCY, INCY, BASE_SHIFT	fsmfp	alpha, alpha	cmpwi	cr0, M, 0	ble-	.L999	cmpwi	cr0, N, 0	ble-	.L999	add	INCY2, INCY, INCY	li	INC2, 2 * SIZE	sub	X, X, INCX	andi.	r0, A,  2 * SIZE - 1#	bne	.L100# All cases for aligned A, even LDA	cmpwi	cr0, INCY,  SIZE	bne	.L70	andi.	r0, Y,  2 * SIZE - 1	bne	.L40# A : aligned  LDA : even  Y : Unit Aligned	sub	A, A, INC2	sub	Y, Y, INCY2	srawi.	
J, N, 2	ble	.L20	.align 4.L11:	LFDUX	alpha1, X, INCX	mr	A1, A	add	A2, A,  LDA	add	A3, A2, LDA	LFSDUX	alpha1, X, INCX	LFDUX	alpha2, X, INCX	add	A4, A3, LDA	add	A,  A4, LDA	mr	YL, Y	LFSDUX	alpha2, X, INCX	fpmul	alpha1, alpha, alpha1	mr	YS, Y	srawi.	r0,  M, 3	mtspr	CTR, r0	fpmul	alpha2, alpha, alpha2	ble	.L15	LFPDUX	yl1, YL, INCY2	LFPDUX	yl2, YL, INCY2	LFPDUX	yl3, YL, INCY2	LFPDUX	yl4, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	a5,  A1, INC2	LFPDUX	a9,  A1, INC2	LFPDUX	a13, A1, INC2	LFPDUX	a2,  A2, INC2	LFPDUX	a6,  A2, INC2	LFPDUX	a10, A2, INC2	LFPDUX	a14, A2, INC2	LFPDUX	a3,  A3, INC2	LFPDUX	a7,  A3, INC2	LFPDUX	a11, A3, INC2	LFPDUX	a15, A3, INC2	LFPDUX	a4,  A4, INC2	fxcpmadd  ys1, alpha1, a1,  yl1	LFPDUX	a8,  A4, INC2	fxcpmadd  ys2, alpha1, a5,  yl2	LFPDUX	a12, A4, INC2	fxcpmadd  ys3, alpha1, a9,  yl3	LFPDUX	a16, A4, INC2	fxcpmadd  ys4, alpha1, a13, yl4	bdz	.L13	.align 4.L12:	LFPDUX	yl1, YL, INCY2	fxcsmadd  ys1, alpha1, a2,  ys1	LFPDUX	a1,  A1, INC2	fxcsmadd  ys2, alpha1, a6,  ys2	LFPDUX	a5,  A1, INC2	fxcsmadd  ys3, alpha1, a10, ys3	LFPDUX	a9,  A1, INC2	fxcsmadd  ys4, alpha1, a14, ys4	LFPDUX	a13, A1, INC2	LFPDUX	yl2, YL, INCY2	fxcpmadd  ys1, alpha2, a3,  ys1	LFPDUX	a2,  A2, INC2	fxcpmadd  ys2, alpha2, a7,  ys2	LFPDUX	a6,  A2, INC2	fxcpmadd  ys3, alpha2, a11, ys3	LFPDUX	a10, A2, INC2	fxcpmadd  ys4, alpha2, a15, ys4	LFPDUX	a14, A2, INC2	LFPDUX	yl3, YL, INCY2	fxcsmadd  ys1, alpha2, a4,  ys1	LFPDUX	a3,  A3, INC2	fxcsmadd  ys2, alpha2, a8,  ys2	LFPDUX	a7,  A3, INC2	fxcsmadd  ys3, alpha2, a12, ys3	LFPDUX	a11, A3, INC2	fxcsmadd  ys4, alpha2, a16, ys4	LFPDUX	a15, A3, INC2	LFPDUX	yl4, YL, INCY2	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	STFPDUX	ys3, YS, INCY2	STFPDUX	ys4, YS, INCY2	LFPDUX	a4,  A4, INC2	fxcpmadd  ys1, alpha1, a1,  yl1	LFPDUX	a8,  A4, INC2	fxcpmadd  ys2, alpha1, a5,  yl2	LFPDUX	a12, A4, INC2	fxcpmadd  ys3, alpha1, a9,  yl3	LFPDUX	a16, A4, INC2	fxcpmadd  ys4, alpha1, a13, yl4	bdnz	.L12	.align 4.L13:	fxcsmadd  ys1, alpha1, a2,  ys1	fxcsmadd  ys2, alpha1, a6,  ys2	
fxcsmadd  ys3, alpha1, a10, ys3	fxcsmadd  ys4, alpha1, a14, ys4	fxcpmadd  ys1, alpha2, a3,  ys1	fxcpmadd  ys2, alpha2, a7,  ys2	fxcpmadd  ys3, alpha2, a11, ys3	fxcpmadd  ys4, alpha2, a15, ys4	fxcsmadd  ys1, alpha2, a4,  ys1	fxcsmadd  ys2, alpha2, a8,  ys2	fxcsmadd  ys3, alpha2, a12, ys3	fxcsmadd  ys4, alpha2, a16, ys4	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	STFPDUX	ys3, YS, INCY2	STFPDUX	ys4, YS, INCY2	.align 4.L15:	andi.	r0, M, 7	ble	.L19	andi.	r0, M, 4	ble	.L17	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	yl2, YL, INCY2	LFPDUX	a5,  A1, INC2	LFPDUX	a2,  A2, INC2	LFPDUX	a6,  A2, INC2	LFPDUX	a3,  A3, INC2	LFPDUX	a7,  A3, INC2	LFPDUX	a4,  A4, INC2	LFPDUX	a8,  A4, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcpmadd  ys2, alpha1, a5, yl2	fxcsmadd  ys1, alpha1, a2, ys1	fxcsmadd  ys2, alpha1, a6, ys2	fxcpmadd  ys1, alpha2, a3, ys1	fxcpmadd  ys2, alpha2, a7, ys2	fxcsmadd  ys1, alpha2, a4, ys1	fxcsmadd  ys2, alpha2, a8, ys2	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	.align 4.L17:	andi.	r0, M, 2	ble	.L18	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	a2,  A2, INC2	LFPDUX	a3,  A3, INC2	LFPDUX	a4,  A4, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcsmadd  ys1, alpha1, a2, ys1	fxcpmadd  ys1, alpha2, a3, ys1	fxcsmadd  ys1, alpha2, a4, ys1	STFPDUX	ys1, YS, INCY2	.align 4.L18:	andi.	r0, M, 1	ble	.L19	LFDUX	yl1, YL, INCY2	LFDUX	a1,  A1, INC2	LFDUX	a2,  A2, INC2	LFDUX	a3,  A3, INC2	LFDUX	a4,  A4, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcsmadd  ys1, alpha1, a2, ys1	fxcpmadd  ys1, alpha2, a3, ys1	fxcsmadd  ys1, alpha2, a4, ys1	STFDUX	ys1, YS, INCY2	.align 4.L19:	addi	J, J, -1	cmpi	cr0, 0, J, 0	bgt	.L11	.align 4	.L20:	andi.	J, N, 2	ble	.L30	LFDUX	alpha1, X, INCX	mr	A1, A	add	A2, A,  LDA	add	A,  A2, LDA	LFSDUX	alpha1, X, INCX	mr	YL, Y	mr	YS, Y	fpmul	alpha1, alpha, alpha1	srawi.	
r0,  M, 3	mtspr	CTR, r0	ble	.L25	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	yl2, YL, INCY2	LFPDUX	a5,  A1, INC2	LFPDUX	yl3, YL, INCY2	LFPDUX	a9,  A1, INC2	LFPDUX	yl4, YL, INCY2	LFPDUX	a13, A1, INC2	LFPDUX	a2,  A2, INC2	LFPDUX	a6,  A2, INC2	LFPDUX	a10, A2, INC2	LFPDUX	a14, A2, INC2	bdz	.L23	.align 4.L22:	fxcpmadd  ys1, alpha1, a1,  yl1	LFPDUX	a1,  A1, INC2	LFPDUX	yl1, YL, INCY2	fxcpmadd  ys2, alpha1, a5,  yl2	LFPDUX	a5,  A1, INC2	LFPDUX	yl2, YL, INCY2	fxcpmadd  ys3, alpha1, a9,  yl3	LFPDUX	a9,  A1, INC2	LFPDUX	yl3, YL, INCY2	fxcpmadd  ys4, alpha1, a13, yl4	LFPDUX	a13, A1, INC2	LFPDUX	yl4, YL, INCY2	fxcsmadd  ys1, alpha1, a2,  ys1	LFPDUX	a2,  A2, INC2	fxcsmadd  ys2, alpha1, a6,  ys2	LFPDUX	a6,  A2, INC2	fxcsmadd  ys3, alpha1, a10, ys3	LFPDUX	a10, A2, INC2	fxcsmadd  ys4, alpha1, a14, ys4	LFPDUX	a14, A2, INC2	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	STFPDUX	ys3, YS, INCY2	STFPDUX	ys4, YS, INCY2	bdnz	.L22	.align 4.L23:	fxcpmadd  ys1, alpha1, a1,  yl1	fxcpmadd  ys2, alpha1, a5,  yl2	fxcpmadd  ys3, alpha1, a9,  yl3	fxcpmadd  ys4, alpha1, a13, yl4	fxcsmadd  ys1, alpha1, a2,  ys1	fxcsmadd  ys2, alpha1, a6,  ys2	fxcsmadd  ys3, alpha1, a10, ys3	fxcsmadd  ys4, alpha1, a14, ys4	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	STFPDUX	ys3, YS, INCY2	STFPDUX	ys4, YS, INCY2	.align 4.L25:	andi.	r0, M, 7	ble	.L30	andi.	r0, M, 4	ble	.L27	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	a2,  A2, INC2	LFPDUX	yl2, YL, INCY2	LFPDUX	a5,  A1, INC2	LFPDUX	a6,  A2, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcsmadd  ys1, alpha1, a2, ys1	fxcpmadd  ys2, alpha1, a5, yl2	fxcsmadd  ys2, alpha1, a6, ys2	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	.align 4.L27:	andi.	r0, M, 2	ble	.L28	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	a2,  A2, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcsmadd  ys1, alpha1, a2, ys1	STFPDUX	ys1, YS, INCY2	.align 4.L28:	andi.	
r0, M, 1	ble	.L30	LFDUX	yl1, YL, INCY2	LFDUX	a1,  A1, INC2	LFDUX	a2,  A2, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcsmadd  ys1, alpha1, a2, ys1	STFDUX	ys1, YS, INCY2	.align 4.L30:	andi.	J, N, 1	ble	.L999	LFDUX	alpha1, X, INCX	mr	A1, A	mr	YL, Y	mr	YS, Y	fmul	alpha1, alpha, alpha1	srawi.	r0,  M, 3	mtspr	CTR, r0	ble	.L35	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	yl2, YL, INCY2	LFPDUX	a5,  A1, INC2	LFPDUX	yl3, YL, INCY2	LFPDUX	a9,  A1, INC2	LFPDUX	yl4, YL, INCY2	LFPDUX	a13, A1, INC2	bdz	.L33	.align 4.L32:	fxcpmadd  ys1, alpha1, a1,  yl1	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	fxcpmadd  ys2, alpha1, a5,  yl2	LFPDUX	yl2, YL, INCY2	LFPDUX	a5,  A1, INC2	fxcpmadd  ys3, alpha1, a9,  yl3	LFPDUX	yl3, YL, INCY2	LFPDUX	a9,  A1, INC2	fxcpmadd  ys4, alpha1, a13, yl4	LFPDUX	yl4, YL, INCY2	LFPDUX	a13, A1, INC2	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	STFPDUX	ys3, YS, INCY2	STFPDUX	ys4, YS, INCY2	bdnz	.L32	.align 4.L33:	fxcpmadd  ys1, alpha1, a1,  yl1	fxcpmadd  ys2, alpha1, a5,  yl2	fxcpmadd  ys3, alpha1, a9,  yl3	fxcpmadd  ys4, alpha1, a13, yl4	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	STFPDUX	ys3, YS, INCY2	STFPDUX	ys4, YS, INCY2	.align 4.L35:	andi.	r0, M, 7	ble	.L999	andi.	r0, M, 4	ble	.L37	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	LFPDUX	yl2, YL, INCY2	LFPDUX	a5,  A1, INC2	fxcpmadd  ys1, alpha1, a1, yl1	fxcpmadd  ys2, alpha1, a5, yl2	STFPDUX	ys1, YS, INCY2	STFPDUX	ys2, YS, INCY2	.align 4.L37:	andi.	r0, M, 2	ble	.L38	LFPDUX	yl1, YL, INCY2	LFPDUX	a1,  A1, INC2	fxcpmadd  ys1, alpha1, a1, yl1	STFPDUX	ys1, YS, INCY2	.align 4.L38:

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -