⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemv_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Preprocessor preamble for the routine that follows PROLOGUE.
 * Each directive must sit on its own line: the C preprocessor only
 * recognizes a directive whose '#' is the first token on a line.
 */

#define ASSEMBLER
#include "common.h"

/* Offset (in elements, scaled by SIZE at the use site) applied to the
   A1/A2 column pointers by the prefetch instructions in the inner loop. */
#ifdef DOUBLE
#define PREFETCHSIZE 44
#else
#define PREFETCHSIZE 88
#endif

/* Integer registers: arguments as received / reloaded in the prologue. */
#define M	%i0
#define N	%i1
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4

#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2

/* Loop counters: I counts inner-loop trips, J counts outer-loop trips. */
#define I	%l3
#define J	%l5

/* Working pointers into columns of A. */
#define A1	%o0
#define A2	%o1
#define A3	%o2
#define A4	%o3

/* Y1 walks the output vector; YY keeps its start (Y itself or BUFFER). */
#define Y1	%l4
#define YY	%l6

/*
 * Floating-point register map.
 *   t1..t4  : product temporaries
 *   y1..y8  : accumulators loaded from / stored back to [Y1 + ...]
 *   a1..a16 : matrix elements loaded from [A1 + ...] / [A2 + ...]
 *   x1..x4  : the two alpha-scaled complex entries of x
 *
 * FZERO, ALPHA_R and ALPHA_I intentionally share registers with
 * a14, a15 and a16: they are reloaded from their stack slots before
 * each use, so they do not need registers of their own.
 */
#ifdef DOUBLE
#define t1	%f0
#define t2	%f2
#define t3	%f4
#define t4	%f6

#define y1	%f8
#define y2	%f10
#define y3	%f12
#define y4	%f14
#define y5	%f16
#define y6	%f18
#define y7	%f20
#define y8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38

#define a9	%f40
#define a10	%f42
#define a11	%f44
#define a12	%f46
#define a13	%f48
#define a14	%f50
#define a15	%f52
#define a16	%f54

#define x1	%f56
#define x2	%f58
#define x3	%f60
#define x4	%f62

#define FZERO	%f50
#define ALPHA_R	%f52
#define ALPHA_I	%f54
#else
#define t1	%f0
#define t2	%f1
#define t3	%f2
#define t4	%f3

#define y1	%f4
#define y2	%f5
#define y3	%f6
#define y4	%f7
#define y5	%f8
#define y6	%f9
#define y7	%f10
#define y8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19

#define a9	%f20
#define a10	%f21
#define a11	%f22
#define a12	%f23
#define a13	%f24
#define a14	%f25
#define a15	%f26
#define a16	%f27

#define x1	%f28
#define x2	%f29
#define x3	%f30
#define x4	%f31

#define FZERO	%f25
#define ALPHA_R	%f26
#define ALPHA_I	%f27
#endif

/* Frame-pointer-relative slots from which the zero constant and the two
   halves of alpha are reloaded with LDF.  The offsets depend on the ABI
   width and, for ALPHA_I in the 32-bit case, on the element size. */
#ifndef __64BIT__
#define STACK_FZERO	[%fp + STACK_START +  8]
#define STACK_ALPHA_R	[%fp + STACK_START + 16]
#ifndef DOUBLE
#define STACK_ALPHA_I	[%fp + STACK_START + 20]
#else
#define STACK_ALPHA_I	[%fp + STACK_START + 24]
#endif
#else
#define STACK_FZERO	[%fp + STACK_START + 24]
#define STACK_ALPHA_R	[%fp + STACK_START + 32]
#define STACK_ALPHA_I	[%fp + STACK_START + 40]
#endif

/* FSUBX / FADDX are the add/subtract pair used for the cross terms in
   the inner loop; building with CONJ defined swaps the two. */
#ifndef CONJ
#define FSUBX	FSUB
#define FADDX	FADD
#else
#define FSUBX	FADD
#define FADDX	FSUB
#endif

	PROLOGUE
SAVESP	nop#ifndef __64BIT__#ifdef DOUBLE	st	%g0, [%fp + STACK_START +  8]	st	%g0, [%fp + STACK_START + 12]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA_R */	st	%i4, [%fp + STACK_START + 20]	st	%i5, [%fp + STACK_START + 24]   /* ALPHA_I */	ld	[%fp + STACK_START + 32], A	ld	[%fp + STACK_START + 36], LDA	ld	[%fp + STACK_START + 40], X	ld	[%fp + STACK_START + 44], INCX	ld	[%fp + STACK_START + 48], Y	ld	[%fp + STACK_START + 52], INCY	ld	[%fp + STACK_START + 56], BUFFER#else	st	%g0, [%fp + STACK_START +  8]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA_R */	st	%i4, [%fp + STACK_START + 20]   /* ALPHA_I */	ld	[%fp + STACK_START + 28], LDA	ld	[%fp + STACK_START + 32], X	ld	[%fp + STACK_START + 36], INCX	ld	[%fp + STACK_START + 40], Y	ld	[%fp + STACK_START + 44], INCY	ld	[%fp + STACK_START + 48], BUFFER#endif#else#ifdef DOUBLE	stx	%g0, STACK_FZERO	std	%f6, STACK_ALPHA_R	std	%f8, STACK_ALPHA_I#else	st	%g0, STACK_FZERO	st	%f7, STACK_ALPHA_R	st	%f9, STACK_ALPHA_I#endif	ldx	[%fp+  STACK_START + 56], LDA	ldx	[%fp+  STACK_START + 64], X	ldx	[%fp+  STACK_START + 72], INCX	ldx	[%fp+  STACK_START + 80], Y	ldx	[%fp+  STACK_START + 88], INCY	ldx	[%fp+  STACK_START + 96], BUFFER#endif	sll	LDA, ZBASE_SHIFT, LDA	cmp	M, 0	ble	%icc, .LL999	sll	INCX, ZBASE_SHIFT, INCX	cmp	N, 0	ble	%icc, .LL999	sll	INCY, ZBASE_SHIFT, INCY	cmp	INCY, 2 * SIZE	be	%icc, .LL20	mov	Y, YY	add	M, 3, J	sra	J, 2, J	LDF	STACK_FZERO, FZERO	mov	BUFFER, YY	mov	BUFFER, Y1.LL01:	STF	FZERO, [Y1 +  0 * SIZE]	nop	STF	FZERO, [Y1 +  1 * SIZE]	STF	FZERO, [Y1 +  2 * SIZE]	STF	FZERO, [Y1 +  3 * SIZE]	STF	FZERO, [Y1 +  4 * SIZE]	nop	STF	FZERO, [Y1 +  5 * SIZE]	deccc	J	STF	FZERO, [Y1 +  6 * SIZE]	nop	STF	FZERO, [Y1 +  7 * SIZE]	bg,pn	%icc, .LL01	add	Y1, 8 * SIZE, Y1	.LL20:	sra	N, 1, J	cmp	J, 0	ble,pn	%icc, .LL30	nop.LL21:	mov	YY, Y1	mov	A,  A1	LDF	STACK_ALPHA_R, ALPHA_R	LDF	STACK_ALPHA_I, ALPHA_I	add	A,  LDA, A2	add	A2, LDA, A	LDF	[X + 0 * SIZE], x1	LDF	[X + 1 * SIZE], x2	add	X, INCX, X	LDF	[X + 0 * SIZE], x3	LDF	[X + 1 * SIZE], x4	add	
X, INCX, X	FMUL	ALPHA_R, x1, a1	FMUL	ALPHA_I, x2, a4	FMUL	ALPHA_I, x1, a2	FMUL	ALPHA_R, x2, a3	FMUL	ALPHA_R, x3, a5	FMUL	ALPHA_I, x4, a8	FMUL	ALPHA_I, x3, a6	FMUL	ALPHA_R, x4, a7#ifndef XCONJ	FSUB	a1, a4, x1	FADD	a2, a3, x2	FSUB	a5, a8, x3	FADD	a6, a7, x4#else	FADD	a1, a4, x1	FSUB	a2, a3, x2	FADD	a5, a8, x3	FSUB	a6, a7, x4#endif	sra	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL27	nop	LDF	[A1 + 0 * SIZE], a1	LDF	[A1 + 1 * SIZE], a2	LDF	[A1 + 2 * SIZE], a3	LDF	[A1 + 3 * SIZE], a4	LDF	[A1 + 4 * SIZE], a9	LDF	[A1 + 5 * SIZE], a10	LDF	[A1 + 6 * SIZE], a11	LDF	[A1 + 7 * SIZE], a12	LDF	[A2 + 0 * SIZE], a5	LDF	[A2 + 1 * SIZE], a6	LDF	[A2 + 2 * SIZE], a7	LDF	[A2 + 3 * SIZE], a8	LDF	[A2 + 4 * SIZE], a13	LDF	[A2 + 5 * SIZE], a14	LDF	[A2 + 6 * SIZE], a15	LDF	[A2 + 7 * SIZE], a16	LDF	[Y1 + 0 * SIZE], y1	LDF	[Y1 + 1 * SIZE], y2	LDF	[Y1 + 2 * SIZE], y3	FMUL	a1, x1, t1	deccc	I	FMUL	a1, x2, t2	LDF	[A1 +  8 * SIZE], a1	FMUL	a3, x1, t3	FMUL	a3, x2, t4	ble,pn	%icc, .LL26	LDF	[A1 + 10 * SIZE], a3	FADD	y1, t1, y1	LDF	[Y1 + 3 * SIZE], y4	FMUL	a2, x2, t1	FADD	y2, t2, y2	FMUL	a2, x1, t2	LDF	[A1 +  9 * SIZE], a2	FADD	y3, t3, y3	LDF	[Y1 + 4 * SIZE], y5	FMUL	a4, x2, t3	FADD	y4, t4, y4	FMUL	a4, x1, t4	LDF	[A1 + 11 * SIZE], a4	FSUBX	y1, t1, y1	LDF	[Y1 + 5 * SIZE], y6	FMUL	a5, x3, t1	FADDX	y2, t2, y2	FMUL	a5, x4, t2	LDF	[A2 +  8 * SIZE], a5	FSUBX	y3, t3, y3	LDF	[Y1 + 6 * SIZE], y7	FMUL	a7, x3, t3	FADDX	y4, t4, y4	FMUL	a7, x4, t4	LDF	[A2 + 10 * SIZE], a7	FADD	y1, t1, y1	LDF	[Y1 + 7 * SIZE], y8	FMUL	a6, x4, t1	FADD	y2, t2, y2	FMUL	a6, x3, t2	LDF	[A2 +  9 * SIZE], a6	FADD	y3, t3, y3	FMUL	a8, x4, t3	FADD	y4, t4, y4	FMUL	a8, x3, t4	LDF	[A2 + 11 * SIZE], a8	FSUBX	y1, t1, y1	FMUL	a9,  x1, t1	FADDX	y2, t2, y2	FMUL	a9,  x2, t2	LDF	[A1 + 12 * SIZE], a9	FSUBX	y3, t3, y3	deccc	I	FMUL	a11, x1, t3	FADDX	y4, t4, y4	FMUL	a11, x2, t4	ble,pn	%icc, .LL23	LDF	[A1 + 14 * SIZE], a11.LL22:	FADD	y5, t1, y5	prefetch  [A1 +  PREFETCHSIZE * SIZE], 1	FMUL	a10, x2, t1	LDF	[Y1 + 7 * SIZE], y8	FADD	y6, t2, y6	FMUL	a10, x1, t2	LDF	[A1 + 
13 * SIZE], a10	FADD	y7, t3, y7	FMUL	a12, x2, t3	STF	y1, [Y1 +  0 * SIZE]	FADD	y8, t4, y8	FMUL	a12, x1, t4	LDF	[A1 + 15 * SIZE], a12	FSUBX	y5, t1, y5	FMUL	a13, x3, t1	STF	y2, [Y1 +  1 * SIZE]	FADDX	y6, t2, y6	FMUL	a13, x4, t2	LDF	[A2 + 12 * SIZE], a13	FSUBX	y7, t3, y7	FMUL	a15, x3, t3	STF	y3, [Y1 +  2 * SIZE]	FADDX	y8, t4, y8	FMUL	a15, x4, t4	LDF	[A2 + 14 * SIZE], a15	FADD	y5, t1, y5	FMUL	a14, x4, t1	STF	y4, [Y1 +  3 * SIZE]	FADD	y6, t2, y6	FMUL	a14, x3, t2	LDF	[A2 + 13 * SIZE], a14	FADD	y7, t3, y7	FMUL	a16, x4, t3	LDF	[Y1 +  8 * SIZE], y1	FADD	y8, t4, y8	FMUL	a16, x3, t4	LDF	[A2 + 15 * SIZE], a16	FSUBX	y5, t1, y5	FMUL	a1, x1, t1	LDF	[Y1 +  9 * SIZE], y2	FADDX	y6, t2, y6	FMUL	a1, x2, t2	LDF	[A1 + 16 * SIZE], a1	FSUBX	y7, t3, y7	FMUL	a3, x1, t3	LDF	[Y1 + 10 * SIZE], y3	FADDX	y8, t4, y8	FMUL	a3, x2, t4	LDF	[A1 + 18 * SIZE], a3	FADD	y1, t1, y1	prefetch  [A2 +  PREFETCHSIZE * SIZE], 1	FMUL	a2, x2, t1	LDF	[Y1 + 11 * SIZE], y4	FADD	y2, t2, y2	FMUL	a2, x1, t2	LDF	[A1 + 17 * SIZE], a2	FADD	y3, t3, y3	FMUL	a4, x2, t3	STF	y5, [Y1 +  4 * SIZE]	FADD	y4, t4, y4	FMUL	a4, x1, t4	LDF	[A1 + 19 * SIZE], a4	FSUBX	y1, t1, y1	FMUL	a5, x3, t1	STF	y6, [Y1 +  5 * SIZE]	FADDX	y2, t2, y2	FMUL	a5, x4, t2	LDF	[A2 + 16 * SIZE], a5	FSUBX	y3, t3, y3	FMUL	a7, x3, t3	STF	y7, [Y1 +  6 * SIZE]	FADDX	y4, t4, y4	deccc	I	FMUL	a7, x4, t4	LDF	[A2 + 18 * SIZE], a7	FADD	y1, t1, y1	FMUL	a6, x4, t1	STF	y8, [Y1 +  7 * SIZE]	FADD	y2, t2, y2	FMUL	a6, x3, t2	LDF	[A2 + 17 * SIZE], a6	FADD	y3, t3, y3	add	A1, 8 * SIZE, A1	FMUL	a8, x4, t3	LDF	[Y1 + 12 * SIZE], y5	FADD	y4, t4, y4	FMUL	a8, x3, t4	LDF	[A2 + 19 * SIZE], a8	FSUBX	y1, t1, y1	add	A2, 8 * SIZE, A2	FMUL	a9,  x1, t1	LDF	[Y1 + 13 * SIZE], y6	FADDX	y2, t2, y2	add	Y1, 8 * SIZE, Y1	FMUL	a9,  x2, t2	LDF	[A1 + 12 * SIZE], a9	FSUBX	y3, t3, y3	FMUL	a11, x1, t3	LDF	[Y1 +  6 * SIZE], y7	FADDX	y4, t4, y4	FMUL	a11, x2, t4	bg,pn	%icc, .LL22	LDF	[A1 + 14 * SIZE], a11.LL23:	FADD	y5, t1, y5	FMUL	a10, x2, t1	LDF	[Y1 + 7 * SIZE], y8	FADD	y6, t2, y6	FMUL	a10, x1, t2	LDF	[A1 + 
13 * SIZE], a10	FADD	y7, t3, y7	FMUL	a12, x2, t3	STF	y1, [Y1 +  0 * SIZE]	FADD	y8, t4, y8	FMUL	a12, x1, t4	LDF	[A1 + 15 * SIZE], a12	FSUBX	y5, t1, y5	FMUL	a13, x3, t1	STF	y2, [Y1 +  1 * SIZE]	FADDX	y6, t2, y6	FMUL	a13, x4, t2	LDF	[A2 + 12 * SIZE], a13	FSUBX	y7, t3, y7	FMUL	a15, x3, t3	STF	y3, [Y1 +  2 * SIZE]	FADDX	y8, t4, y8	FMUL	a15, x4, t4	LDF	[A2 + 14 * SIZE], a15	FADD	y5, t1, y5	FMUL	a14, x4, t1	STF	y4, [Y1 +  3 * SIZE]	FADD	y6, t2, y6	FMUL	a14, x3, t2	LDF	[A2 + 13 * SIZE], a14	FADD	y7, t3, y7	FMUL	a16, x4, t3	LDF	[Y1 +  8 * SIZE], y1	FADD	y8, t4, y8	FMUL	a16, x3, t4	LDF	[A2 + 15 * SIZE], a16	FSUBX	y5, t1, y5	add	A1, 8 * SIZE, A1	FMUL	a1, x1, t1	LDF	[Y1 +  9 * SIZE], y2	FADDX	y6, t2, y6	add	A2, 8 * SIZE, A2	FMUL	a1, x2, t2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -